--- old/usr/src/uts/common/os/zone.c
+++ new/usr/src/uts/common/os/zone.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2016, Joyent Inc.
25 25 */
26 26
27 27 /*
28 28 * Zones
29 29 *
30 30 * A zone is a named collection of processes, namespace constraints,
31 31 * and other system resources which comprise a secure and manageable
32 32 * application containment facility.
33 33 *
34 34 * Zones (represented by the reference counted zone_t) are tracked in
35 35 * the kernel in the zonehash. Elsewhere in the kernel, Zone IDs
36 36 * (zoneid_t) are used to track zone association. Zone IDs are
37 37 * dynamically generated when the zone is created; if a persistent
38 38 * identifier is needed (core files, accounting logs, audit trail,
39 39 * etc.), the zone name should be used.
40 40 *
41 41 *
42 42 * Global Zone:
43 43 *
44 44 * The global zone (zoneid 0) is automatically associated with all
45 45 * system resources that have not been bound to a user-created zone.
46 46 * This means that even systems where zones are not in active use
47 47 * have a global zone, and all processes, mounts, etc. are
48 48 * associated with that zone. The global zone is generally
49 49 * unconstrained in terms of privileges and access, though the usual
50 50 * credential and privilege based restrictions apply.
51 51 *
52 52 *
53 53 * Zone States:
54 54 *
55 55 * The states a zone may be in, and the transitions between them, are
56 56 * as follows:
57 57 *
58 58 * ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
59 59 * initialized zone is added to the list of active zones on the system but
60 60 * isn't accessible.
61 61 *
62 62 * ZONE_IS_INITIALIZED: initialization is complete except that the ZSD
63 63 * callbacks have not yet run. It is not possible to enter the zone, but
64 64 * attributes can be retrieved.
65 65 *
66 66 * ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
67 67 * ready. The zone is made visible after the ZSD constructor callbacks are
68 68 * executed. A zone remains in this state until it transitions into
69 69 * the ZONE_IS_BOOTING state as a result of a call to zone_boot().
70 70 *
71 71 * ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
72 72 * init. Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
73 73 * state.
74 74 *
75 75 * ZONE_IS_RUNNING: The zone is open for business: zsched has
76 76 * successfully started init. A zone remains in this state until
77 77 * zone_shutdown() is called.
78 78 *
79 79 * ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
80 80 * killing all processes running in the zone. The zone remains
81 81 * in this state until there are no more user processes running in the zone.
82 82 * zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
83 83 * Since zone_shutdown() is restartable, it may be called successfully
84 84 * multiple times for the same zone_t. Setting of the zone's state to
85 85 * ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
86 86 * the zone's status without worrying about it being a moving target.
87 87 *
88 88 * ZONE_IS_EMPTY: zone_shutdown() has been called, and there
89 89 * are no more user processes in the zone. The zone remains in this
90 90 * state until there are no more kernel threads associated with the
91 91 * zone. zone_create(), zone_enter(), and zone_destroy() on this zone will
92 92 * fail.
93 93 *
94 94 * ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
95 95 * have exited. zone_shutdown() returns. Henceforth it is not possible to
96 96 * join the zone or create kernel threads therein.
97 97 *
98 98 * ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
99 99 * remains in this state until zsched exits. Calls to zone_find_by_*()
100 100 * return NULL from now on.
101 101 *
102 102 * ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0). There are no
103 103 * processes or threads doing work on behalf of the zone. The zone is
104 104 * removed from the list of active zones. zone_destroy() returns, and
105 105 * the zone can be recreated.
106 106 *
107 107 * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
108 108 * callbacks are executed, and all memory associated with the zone is
109 109 * freed.
110 110 *
111 111 * Threads can wait for the zone to enter a requested state by using
112 112 * zone_status_wait() or zone_status_timedwait() with the desired
113 113 * state passed in as an argument. Zone state transitions are
114 114 * uni-directional; it is not possible to move back to an earlier state.
115 115 *
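
For instance, a caller that must not proceed until every kernel thread working on behalf of the zone has exited can wait for ZONE_IS_DOWN. A minimal sketch using the zone_status_wait() interface described above (surrounding context and error handling omitted):

    /*
     * Sketch: block until the zone reaches ZONE_IS_DOWN (or any later
     * state, since transitions are one-way).  A reference is taken so
     * the zone_t cannot be freed while we sleep.
     */
    zone_hold(zone);
    zone_status_wait(zone, ZONE_IS_DOWN);
    ASSERT(zone_status_get(zone) >= ZONE_IS_DOWN);
    zone_rele(zone);
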
116 116 *
117 117 * Zone-Specific Data:
118 118 *
119 119 * Subsystems needing to maintain zone-specific data can store that
120 120 * data using the ZSD mechanism. This provides a zone-specific data
121 121 * store, similar to thread-specific data (see pthread_getspecific(3C)
122 122 * or the TSD code in uts/common/disp/thread.c). Also, ZSD can be used
123 123 * to register callbacks to be invoked when a zone is created, shut
124 124 * down, or destroyed. This can be used to initialize zone-specific
125 125 * data for new zones and to clean up when zones go away.
126 126 *
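
A subsystem would typically register its per-zone state as shown below; a hedged sketch in which my_zsd_key, my_state_t, and my_subsys_init are hypothetical names, while zone_key_create() and its callback signatures are those declared later in this file:

    static zone_key_t my_zsd_key;           /* hypothetical key */

    static void *
    my_zsd_create(zoneid_t zoneid)
    {
            /* Runs for each new zone; KM_SLEEP is permitted here. */
            return (kmem_zalloc(sizeof (my_state_t), KM_SLEEP));
    }

    static void
    my_zsd_destroy(zoneid_t zoneid, void *data)
    {
            /* Runs as the zone is destroyed; data may be NULL. */
            if (data != NULL)
                    kmem_free(data, sizeof (my_state_t));
    }

    void
    my_subsys_init(void)
    {
            /* No shutdown callback is needed for this example. */
            zone_key_create(&my_zsd_key, my_zsd_create, NULL,
                my_zsd_destroy);
    }
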
127 127 *
128 128 * Data Structures:
129 129 *
130 130 * The per-zone structure (zone_t) is reference counted, and freed
131 131 * when all references are released. zone_hold and zone_rele can be
132 132 * used to adjust the reference count. In addition, reference counts
133 133 * associated with the cred_t structure are tracked separately using
134 134 * zone_cred_hold and zone_cred_rele.
135 135 *
136 136 * Pointers to active zone_t's are stored in two hash tables; one
137 137 * for searching by id, the other for searching by name. Lookups
138 138 * can be performed on either basis, using zone_find_by_id and
139 139 * zone_find_by_name. Both return zone_t pointers with the zone
140 140 * held, so zone_rele should be called when the pointer is no longer
141 141 * needed. Zones can also be searched by path; zone_find_by_path
142 142 * returns the zone with which a path name is associated (global
143 143 * zone if the path is not within some other zone's file system
144 144 * hierarchy). This currently requires iterating through each zone,
145 145 * so it is slower than an id or name search via a hash table.
146 146 *
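
A lookup therefore always pairs the find with a release, e.g. (sketch; do_work() is a hypothetical consumer):

    zone_t *zone;

    if ((zone = zone_find_by_id(zoneid)) == NULL)
            return (EINVAL);        /* no such visible zone */
    /* The zone is returned held, so it cannot go away under us. */
    do_work(zone);
    zone_rele(zone);
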
147 147 *
148 148 * Locking:
149 149 *
150 150 * zonehash_lock: This is a top-level global lock used to protect the
151 151 * zone hash tables and lists. Zones cannot be created or destroyed
152 152 * while this lock is held.
153 153 * zone_status_lock: This is a global lock protecting zone state.
154 154 * Zones cannot change state while this lock is held. It also
155 155 * protects the list of kernel threads associated with a zone.
156 156 * zone_lock: This is a per-zone lock used to protect several fields of
157 157 * the zone_t (see <sys/zone.h> for details). In addition, holding
158 158 * this lock means that the zone cannot go away.
159 159 * zone_nlwps_lock: This is a per-zone lock used to protect the fields
160 160 * related to the zone.max-lwps rctl.
161 161 * zone_mem_lock: This is a per-zone lock used to protect the fields
162 162 * related to the zone.max-locked-memory and zone.max-swap rctls.
163 163 * zone_rctl_lock: This is a per-zone lock used to protect other rctls,
164 164 * currently just zone.max-lofi.
165 165 * zsd_key_lock: This is a global lock protecting the key state for ZSD.
166 166 * zone_deathrow_lock: This is a global lock protecting the "deathrow"
167 167 * list (a list of zones in the ZONE_IS_DEAD state).
168 168 *
169 169 * Ordering requirements:
170 170 * pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
171 171 * zone_lock --> zsd_key_lock --> pidlock --> p_lock
172 172 *
173 173 * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
174 174 * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
175 175 * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
176 176 *
177 177 * Blocking memory allocations are permitted while holding any of the
178 178 * zone locks.
179 179 *
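
Concretely, code needing both the hash lock and a zone's own lock must acquire them in the documented order; a sketch:

    /*
     * zonehash_lock is taken before zone_lock, per the ordering
     * above; acquiring them in the opposite order risks deadlock.
     */
    mutex_enter(&zonehash_lock);
    mutex_enter(&zone->zone_lock);
    /* ... examine or update zone state ... */
    mutex_exit(&zone->zone_lock);
    mutex_exit(&zonehash_lock);
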
180 180 *
181 181 * System Call Interface:
182 182 *
183 183 * The zone subsystem can be managed and queried from user level with
184 184 * the following system calls (all subcodes of the primary "zone"
185 185 * system call):
186 186 * - zone_create: creates a zone with selected attributes (name,
187 187 * root path, privileges, resource controls, ZFS datasets)
188 188 * - zone_enter: allows the current process to enter a zone
189 189 * - zone_getattr: reports attributes of a zone
190 190 * - zone_setattr: set attributes of a zone
191 191 * - zone_boot: set 'init' running for the zone
192 192 * - zone_list: lists all zones active in the system
193 193 * - zone_lookup: looks up zone id based on name
194 194 * - zone_shutdown: initiates shutdown process (see states above)
195 195 * - zone_destroy: completes shutdown process (see states above)
196 196 *
197 197 */
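
From user level these subcodes are reached through libc wrappers; for example, the documented getzoneidbyname(3C) maps onto the lookup path (a minimal sketch; the zone name is hypothetical):

    #include <zone.h>
    #include <stdio.h>

    int
    main(void)
    {
            zoneid_t id = getzoneidbyname("myzone");

            if (id == -1)
                    perror("getzoneidbyname");
            else
                    (void) printf("zone id: %d\n", (int)id);
            return (0);
    }
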
198 198
199 199 #include <sys/priv_impl.h>
200 200 #include <sys/cred.h>
201 201 #include <c2/audit.h>
202 202 #include <sys/debug.h>
203 203 #include <sys/file.h>
204 204 #include <sys/kmem.h>
205 205 #include <sys/kstat.h>
206 206 #include <sys/mutex.h>
207 207 #include <sys/note.h>
208 208 #include <sys/pathname.h>
209 209 #include <sys/proc.h>
210 210 #include <sys/project.h>
211 211 #include <sys/sysevent.h>
212 212 #include <sys/task.h>
213 213 #include <sys/systm.h>
214 214 #include <sys/types.h>
215 215 #include <sys/utsname.h>
216 216 #include <sys/vnode.h>
217 217 #include <sys/vfs.h>
218 218 #include <sys/systeminfo.h>
219 219 #include <sys/policy.h>
220 220 #include <sys/cred_impl.h>
221 221 #include <sys/contract_impl.h>
222 222 #include <sys/contract/process_impl.h>
223 223 #include <sys/class.h>
224 224 #include <sys/pool.h>
225 225 #include <sys/pool_pset.h>
226 226 #include <sys/pset.h>
227 227 #include <sys/strlog.h>
228 228 #include <sys/sysmacros.h>
229 229 #include <sys/callb.h>
230 230 #include <sys/vmparam.h>
231 231 #include <sys/corectl.h>
232 232 #include <sys/ipc_impl.h>
233 233 #include <sys/klpd.h>
234 234
235 235 #include <sys/door.h>
236 236 #include <sys/cpuvar.h>
237 237 #include <sys/sdt.h>
238 238
239 239 #include <sys/uadmin.h>
240 240 #include <sys/session.h>
241 241 #include <sys/cmn_err.h>
242 242 #include <sys/modhash.h>
243 243 #include <sys/sunddi.h>
244 244 #include <sys/nvpair.h>
245 245 #include <sys/rctl.h>
246 246 #include <sys/fss.h>
247 247 #include <sys/brand.h>
248 248 #include <sys/zone.h>
249 249 #include <net/if.h>
250 250 #include <sys/cpucaps.h>
251 251 #include <vm/seg.h>
252 252 #include <sys/mac.h>
253 253 #include <sys/rt.h>
254 254 #include <sys/fx.h>
255 255
256 256 /*
257 257 * This constant specifies the number of seconds that threads waiting for
258 258 * subsystems to release a zone's general-purpose references will wait before
259 259 * they log the zone's reference counts. The constant's value shouldn't
260 260 * be so small that reference counts are unnecessarily reported for zones
261 261 * whose references are slowly released. On the other hand, it shouldn't be so
262 262 * large that users reboot their systems out of frustration over hung zones
263 263 * before the system logs the zones' reference counts.
264 264 */
265 265 #define ZONE_DESTROY_TIMEOUT_SECS 60
266 266
267 267 /* List of data link IDs which are accessible from the zone */
268 268 typedef struct zone_dl {
269 269 datalink_id_t zdl_id;
270 270 nvlist_t *zdl_net;
271 271 list_node_t zdl_linkage;
272 272 } zone_dl_t;
273 273
274 274 /*
275 275 * cv used to signal that all references to the zone have been released. This
276 276 * needs to be global since there may be multiple waiters, and the first to
277 277 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
278 278 */
279 279 static kcondvar_t zone_destroy_cv;
280 280 /*
281 281 * Lock used to serialize access to zone_cv. This could have been per-zone,
282 282 * but then we'd need another lock for zone_destroy_cv, and why bother?
283 283 */
284 284 static kmutex_t zone_status_lock;
285 285
286 286 /*
287 287 * ZSD-related global variables.
288 288 */
289 289 static kmutex_t zsd_key_lock; /* protects the following two */
290 290 /*
291 291 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
292 292 */
293 293 static zone_key_t zsd_keyval = 0;
294 294 /*
295 295 * Global list of registered keys. We use this when a new zone is created.
296 296 */
297 297 static list_t zsd_registered_keys;
298 298
299 299 int zone_hash_size = 256;
300 300 static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
301 301 static kmutex_t zonehash_lock;
302 302 static uint_t zonecount;
303 303 static id_space_t *zoneid_space;
304 304
305 305 /*
306 306 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
307 307 * kernel proper runs, and which manages all other zones.
308 308 *
309 309 * Although not declared as static, the variable "zone0" should not be used
310 310 * except by code that needs to reference the global zone early in boot,
311 311 * before it is fully initialized. All other consumers should use
312 312 * 'global_zone'.
313 313 */
314 314 zone_t zone0;
315 315 zone_t *global_zone = NULL; /* Set when the global zone is initialized */
316 316
317 317 /*
318 318 * List of active zones, protected by zonehash_lock.
319 319 */
320 320 static list_t zone_active;
321 321
322 322 /*
323 323 * List of destroyed zones that still have outstanding cred references.
324 324 * Used for debugging. Uses a separate lock to avoid lock ordering
325 325 * problems in zone_free.
326 326 */
327 327 static list_t zone_deathrow;
328 328 static kmutex_t zone_deathrow_lock;
329 329
330 330 /* number of zones is limited by virtual interface limit in IP */
331 331 uint_t maxzones = 8192;
332 332
333 333 /* Event channel used to send zone state change notifications */
334 334 evchan_t *zone_event_chan;
335 335
336 336 /*
337 337 * This table holds the mapping from kernel zone states to
338 338 * states visible in the state notification API.
339 339 * The idea is that we only expose "obvious" states and
340 340 * do not expose states which are just implementation details.
341 341 */
342 342 const char *zone_status_table[] = {
343 343 ZONE_EVENT_UNINITIALIZED, /* uninitialized */
344 344 ZONE_EVENT_INITIALIZED, /* initialized */
345 345 ZONE_EVENT_READY, /* ready */
346 346 ZONE_EVENT_READY, /* booting */
347 347 ZONE_EVENT_RUNNING, /* running */
348 348 ZONE_EVENT_SHUTTING_DOWN, /* shutting_down */
349 349 ZONE_EVENT_SHUTTING_DOWN, /* empty */
350 350 ZONE_EVENT_SHUTTING_DOWN, /* down */
351 351 ZONE_EVENT_SHUTTING_DOWN, /* dying */
352 352 ZONE_EVENT_UNINITIALIZED, /* dead */
353 353 };
354 354
355 355 /*
356 356 * This array contains the names of the subsystems listed in zone_ref_subsys_t
357 357 * (see sys/zone.h).
358 358 */
359 359 static char *zone_ref_subsys_names[] = {
360 360 "NFS", /* ZONE_REF_NFS */
361 361 "NFSv4", /* ZONE_REF_NFSV4 */
362 362 "SMBFS", /* ZONE_REF_SMBFS */
363 363 "MNTFS", /* ZONE_REF_MNTFS */
364 364 "LOFI", /* ZONE_REF_LOFI */
365 365 "VFS", /* ZONE_REF_VFS */
366 366 "IPC" /* ZONE_REF_IPC */
367 367 };
368 368
369 369 /*
370 370 * This isn't static so lint doesn't complain.
371 371 */
372 372 rctl_hndl_t rc_zone_cpu_shares;
373 373 rctl_hndl_t rc_zone_locked_mem;
374 374 rctl_hndl_t rc_zone_max_swap;
375 375 rctl_hndl_t rc_zone_phys_mem;
376 376 rctl_hndl_t rc_zone_max_lofi;
377 377 rctl_hndl_t rc_zone_cpu_cap;
378 -rctl_hndl_t rc_zone_cpu_baseline;
379 -rctl_hndl_t rc_zone_cpu_burst_time;
380 378 rctl_hndl_t rc_zone_zfs_io_pri;
381 379 rctl_hndl_t rc_zone_nlwps;
382 380 rctl_hndl_t rc_zone_nprocs;
383 381 rctl_hndl_t rc_zone_shmmax;
384 382 rctl_hndl_t rc_zone_shmmni;
385 383 rctl_hndl_t rc_zone_semmni;
386 384 rctl_hndl_t rc_zone_msgmni;
387 385
388 386 const char * const zone_default_initname = "/sbin/init";
389 387 static char * const zone_prefix = "/zone/";
390 388 static int zone_shutdown(zoneid_t zoneid);
391 389 static int zone_add_datalink(zoneid_t, datalink_id_t);
392 390 static int zone_remove_datalink(zoneid_t, datalink_id_t);
393 391 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
394 392 static int zone_set_network(zoneid_t, zone_net_data_t *);
395 393 static int zone_get_network(zoneid_t, zone_net_data_t *);
396 394
397 395 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
398 396
399 397 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
400 398 static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
401 399 static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
402 400 static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
403 401 zone_key_t);
404 402 static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
405 403 static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
406 404 kmutex_t *);
407 405 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
408 406 kmutex_t *);
409 407
410 408 /*
411 409 * Bump this number when you alter the zone syscall interfaces; this is
412 410 * because we need to have support for previous API versions in libc
413 411 * to support patching; libc calls into the kernel to determine this number.
414 412 *
415 413 * Version 1 of the API is the version originally shipped with Solaris 10
416 414 * Version 2 alters the zone_create system call in order to support more
417 415 * arguments by moving the args into a structure; and to do better
418 416 * error reporting when zone_create() fails.
419 417 * Version 3 alters the zone_create system call in order to support the
420 418 * import of ZFS datasets to zones.
421 419 * Version 4 alters the zone_create system call in order to support
422 420 * Trusted Extensions.
423 421 * Version 5 alters the zone_boot system call, and converts its old
424 422 * bootargs parameter to be set by the zone_setattr API instead.
425 423 * Version 6 adds the flag argument to zone_create.
426 - * Version 7 adds the requested zoneid to zone_create.
427 424 */
428 -static const int ZONE_SYSCALL_API_VERSION = 7;
425 +static const int ZONE_SYSCALL_API_VERSION = 6;
429 426
430 427 /*
431 428 * Certain filesystems (such as NFS and autofs) need to know which zone
432 429 * the mount is being placed in. Because of this, we need to be able to
433 430 * ensure that a zone isn't in the process of being created/destroyed such
434 431 * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
435 432 * it gets added the list of mounted zones, it ends up on the wrong zone's
436 433 * mount list. Since a zone can't reside on an NFS file system, we don't
437 434 * have to worry about the zonepath itself.
438 435 *
439 436 * The following functions: block_mounts()/resume_mounts() and
440 437 * mount_in_progress()/mount_completed() are used by zones and the VFS
441 438 * layer (respectively) to synchronize zone state transitions and new
442 439 * mounts within a zone. This synchronization is on a per-zone basis, so
443 440 * activity for one zone will not interfere with activity for another zone.
444 441 *
445 442 * The semantics are like a reader-reader lock such that there may
446 443 * either be multiple mounts (or zone state transitions, if that weren't
447 444 * serialized by zonehash_lock) in progress at the same time, but not
448 445 * both.
449 446 *
450 447 * We use cv's so the user can ctrl-C out of the operation if it's
451 448 * taking too long.
452 449 *
453 450 * The semantics are such that there is unfair bias towards the
454 451 * "current" operation. This means that zone halt may starve if
455 452 * there is a rapid succession of new mounts coming in to the zone.
456 453 */
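
The intended calling pattern, schematically (not a verbatim excerpt; vfsp, vp, uap, and cr stand in for the VFS layer's actual arguments):

    /* VFS side: bracket the actual mount. */
    mount_in_progress(zp);
    error = VFS_MOUNT(vfsp, vp, uap, cr);
    mount_completed(zp);

    /* Zone side: bracket a state transition such as shutdown. */
    if (block_mounts(zp) == 0)
            return (EINTR);         /* interrupted by a signal */
    /* ... transition zone state; no new mounts can start ... */
    resume_mounts(zp);
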
457 454 /*
458 455 * Prevent new mounts from progressing to the point of calling
459 456 * VFS_MOUNT(). If there are already mounts in this "region", wait for
460 457 * them to complete.
461 458 */
462 459 static int
463 460 block_mounts(zone_t *zp)
464 461 {
465 462 int retval = 0;
466 463
467 464 /*
468 465 * Since it may block for a long time, block_mounts() shouldn't be
469 466 * called with zonehash_lock held.
470 467 */
471 468 ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
472 469 mutex_enter(&zp->zone_mount_lock);
473 470 while (zp->zone_mounts_in_progress > 0) {
474 471 if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
475 472 goto signaled;
476 473 }
477 474 /*
478 475 * A negative value of mounts_in_progress indicates that mounts
479 476 * have been blocked by (-mounts_in_progress) different callers
480 477 * (remotely possible if two threads enter zone_shutdown at the same
481 478 * time).
482 479 */
483 480 zp->zone_mounts_in_progress--;
484 481 retval = 1;
485 482 signaled:
486 483 mutex_exit(&zp->zone_mount_lock);
487 484 return (retval);
488 485 }
489 486
490 487 /*
491 488 * The VFS layer may progress with new mounts as far as we're concerned.
492 489 * Allow them to progress if we were the last obstacle.
493 490 */
494 491 static void
495 492 resume_mounts(zone_t *zp)
496 493 {
497 494 mutex_enter(&zp->zone_mount_lock);
498 495 if (++zp->zone_mounts_in_progress == 0)
499 496 cv_broadcast(&zp->zone_mount_cv);
500 497 mutex_exit(&zp->zone_mount_lock);
501 498 }
502 499
503 500 /*
504 501 * The VFS layer is busy with a mount; this zone should wait until all
505 502 * of its mounts are completed to progress.
506 503 */
507 504 void
508 505 mount_in_progress(zone_t *zp)
509 506 {
510 507 mutex_enter(&zp->zone_mount_lock);
511 508 while (zp->zone_mounts_in_progress < 0)
512 509 cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
513 510 zp->zone_mounts_in_progress++;
514 511 mutex_exit(&zp->zone_mount_lock);
515 512 }
516 513
517 514 /*
518 515 * VFS is done with one mount; wake up any waiting block_mounts()
519 516 * callers if this is the last mount.
520 517 */
521 518 void
522 519 mount_completed(zone_t *zp)
523 520 {
524 521 mutex_enter(&zp->zone_mount_lock);
525 522 if (--zp->zone_mounts_in_progress == 0)
526 523 cv_broadcast(&zp->zone_mount_cv);
527 524 mutex_exit(&zp->zone_mount_lock);
528 525 }
529 526
530 527 /*
531 528 * ZSD routines.
532 529 *
533 530 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
534 531 * defined by the pthread_key_create() and related interfaces.
535 532 *
536 533 * Kernel subsystems may register one or more data items and/or
537 534 * callbacks to be executed when a zone is created, shutdown, or
538 535 * destroyed.
539 536 *
540 537 * Unlike the thread counterpart, destructor callbacks will be executed
541 538 * even if the data pointer is NULL and/or there are no constructor
542 539 * callbacks, so it is the responsibility of such callbacks to check for
543 540 * NULL data values if necessary.
544 541 *
545 542 * The locking strategy and overall picture is as follows:
546 543 *
547 544 * When someone calls zone_key_create(), a template ZSD entry is added to the
548 545 * global list "zsd_registered_keys", protected by zsd_key_lock. While
549 546 * holding that lock all the existing zones are marked as
550 547 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
551 548 * zone_zsd list (protected by zone_lock). The global list is updated first
552 549 * (under zsd_key_lock) to make sure that newly created zones use the
553 550 * most recent list of keys. Then under zonehash_lock we walk the zones
554 551 * and mark them. Similar locking is used in zone_key_delete().
555 552 *
556 553 * The actual create, shutdown, and destroy callbacks are invoked without
557 554 * holding any lock. The zsd_flags are used to track completion of the
558 555 * operations, so that by the time zone_key_create (and zone_create) or
559 556 * zone_key_delete (and zone_destroy) returns, all the necessary callbacks
560 557 * have run.
561 558 *
562 559 * When new zones are created, constructor callbacks for all registered ZSD
563 560 * entries will be called. That also uses the above two phases of marking
564 561 * what needs to be done, and then running the callbacks without holding
565 562 * any locks.
566 563 *
567 564 * The framework does not provide any locking around zone_getspecific() and
568 565 * zone_setspecific() apart from that needed for internal consistency, so
569 566 * callers interested in atomic "test-and-set" semantics will need to provide
570 567 * their own locking.
571 568 */
572 569
573 570 /*
574 571 * Helper function to find the zsd_entry associated with the key in the
575 572 * given list.
576 573 */
577 574 static struct zsd_entry *
578 575 zsd_find(list_t *l, zone_key_t key)
579 576 {
580 577 struct zsd_entry *zsd;
581 578
582 579 for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
583 580 if (zsd->zsd_key == key) {
584 581 return (zsd);
585 582 }
586 583 }
587 584 return (NULL);
588 585 }
589 586
590 587 /*
591 588 * Helper function to find the zsd_entry associated with the key in the
592 589 * given list. Move it to the front of the list.
593 590 */
594 591 static struct zsd_entry *
595 592 zsd_find_mru(list_t *l, zone_key_t key)
596 593 {
597 594 struct zsd_entry *zsd;
598 595
599 596 for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
600 597 if (zsd->zsd_key == key) {
601 598 /*
602 599 * Move to head of list to keep list in MRU order.
603 600 */
604 601 if (zsd != list_head(l)) {
605 602 list_remove(l, zsd);
606 603 list_insert_head(l, zsd);
607 604 }
608 605 return (zsd);
609 606 }
610 607 }
611 608 return (NULL);
612 609 }
613 610
614 611 void
615 612 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
616 613 void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
617 614 {
618 615 struct zsd_entry *zsdp;
619 616 struct zsd_entry *t;
620 617 struct zone *zone;
621 618 zone_key_t key;
622 619
623 620 zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
624 621 zsdp->zsd_data = NULL;
625 622 zsdp->zsd_create = create;
626 623 zsdp->zsd_shutdown = shutdown;
627 624 zsdp->zsd_destroy = destroy;
628 625
629 626 /*
630 627 * Insert in global list of callbacks. Makes future zone creations
631 628 * see it.
632 629 */
633 630 mutex_enter(&zsd_key_lock);
634 631 key = zsdp->zsd_key = ++zsd_keyval;
635 632 ASSERT(zsd_keyval != 0);
636 633 list_insert_tail(&zsd_registered_keys, zsdp);
637 634 mutex_exit(&zsd_key_lock);
638 635
639 636 /*
640 637 * Insert for all existing zones and mark them as needing
641 638 * a create callback.
642 639 */
643 640 mutex_enter(&zonehash_lock); /* stop the world */
644 641 for (zone = list_head(&zone_active); zone != NULL;
645 642 zone = list_next(&zone_active, zone)) {
646 643 zone_status_t status;
647 644
648 645 mutex_enter(&zone->zone_lock);
649 646
650 647 /* Skip zones that are on the way down or not yet up */
651 648 status = zone_status_get(zone);
652 649 if (status >= ZONE_IS_DOWN ||
653 650 status == ZONE_IS_UNINITIALIZED) {
654 651 mutex_exit(&zone->zone_lock);
655 652 continue;
656 653 }
657 654
658 655 t = zsd_find_mru(&zone->zone_zsd, key);
659 656 if (t != NULL) {
660 657 /*
661 658 * A zone_zsd_configure() already inserted it after
662 659 * we dropped zsd_key_lock above.
663 660 */
664 661 mutex_exit(&zone->zone_lock);
665 662 continue;
666 663 }
667 664 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
668 665 t->zsd_key = key;
669 666 t->zsd_create = create;
670 667 t->zsd_shutdown = shutdown;
671 668 t->zsd_destroy = destroy;
672 669 if (create != NULL) {
673 670 t->zsd_flags = ZSD_CREATE_NEEDED;
674 671 DTRACE_PROBE2(zsd__create__needed,
675 672 zone_t *, zone, zone_key_t, key);
676 673 }
677 674 list_insert_tail(&zone->zone_zsd, t);
678 675 mutex_exit(&zone->zone_lock);
679 676 }
680 677 mutex_exit(&zonehash_lock);
681 678
682 679 if (create != NULL) {
683 680 /* Now call the create callback for this key */
684 681 zsd_apply_all_zones(zsd_apply_create, key);
685 682 }
686 683 /*
687 684 * It is safe for consumers to use the key now; make it
688 685 * globally visible. Specifically, zone_getspecific() will
689 686 * always successfully return the zone specific data associated
690 687 * with the key.
691 688 */
692 689 *keyp = key;
693 690
694 691 }
695 692
696 693 /*
697 694 * Function called when a module is being unloaded, or otherwise wishes
698 695 * to unregister its ZSD key and callbacks.
699 696 *
700 697 * Remove from the global list and determine the functions that need to
701 698 * be called under a global lock. Then call the functions without
702 699 * holding any locks. Finally free up the zone_zsd entries. (The apply
703 700 * functions need to access the zone_zsd entries to find zsd_data etc.)
704 701 */
705 702 int
706 703 zone_key_delete(zone_key_t key)
707 704 {
708 705 struct zsd_entry *zsdp = NULL;
709 706 zone_t *zone;
710 707
711 708 mutex_enter(&zsd_key_lock);
712 709 zsdp = zsd_find_mru(&zsd_registered_keys, key);
713 710 if (zsdp == NULL) {
714 711 mutex_exit(&zsd_key_lock);
715 712 return (-1);
716 713 }
717 714 list_remove(&zsd_registered_keys, zsdp);
718 715 mutex_exit(&zsd_key_lock);
719 716
720 717 mutex_enter(&zonehash_lock);
721 718 for (zone = list_head(&zone_active); zone != NULL;
722 719 zone = list_next(&zone_active, zone)) {
723 720 struct zsd_entry *del;
724 721
725 722 mutex_enter(&zone->zone_lock);
726 723 del = zsd_find_mru(&zone->zone_zsd, key);
727 724 if (del == NULL) {
728 725 /*
729 726 * Somebody else got here first, e.g. the zone going
730 727 * away.
731 728 */
732 729 mutex_exit(&zone->zone_lock);
733 730 continue;
734 731 }
735 732 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
736 733 ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
737 734 if (del->zsd_shutdown != NULL &&
738 735 (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
739 736 del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
740 737 DTRACE_PROBE2(zsd__shutdown__needed,
741 738 zone_t *, zone, zone_key_t, key);
742 739 }
743 740 if (del->zsd_destroy != NULL &&
744 741 (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
745 742 del->zsd_flags |= ZSD_DESTROY_NEEDED;
746 743 DTRACE_PROBE2(zsd__destroy__needed,
747 744 zone_t *, zone, zone_key_t, key);
748 745 }
749 746 mutex_exit(&zone->zone_lock);
750 747 }
751 748 mutex_exit(&zonehash_lock);
752 749 kmem_free(zsdp, sizeof (*zsdp));
753 750
754 751 /* Now call the shutdown and destroy callback for this key */
755 752 zsd_apply_all_zones(zsd_apply_shutdown, key);
756 753 zsd_apply_all_zones(zsd_apply_destroy, key);
757 754
758 755 /* Now we can free up the zsdp structures in each zone */
759 756 mutex_enter(&zonehash_lock);
760 757 for (zone = list_head(&zone_active); zone != NULL;
761 758 zone = list_next(&zone_active, zone)) {
762 759 struct zsd_entry *del;
763 760
764 761 mutex_enter(&zone->zone_lock);
765 762 del = zsd_find(&zone->zone_zsd, key);
766 763 if (del != NULL) {
767 764 list_remove(&zone->zone_zsd, del);
768 765 ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
769 766 kmem_free(del, sizeof (*del));
770 767 }
771 768 mutex_exit(&zone->zone_lock);
772 769 }
773 770 mutex_exit(&zonehash_lock);
774 771
775 772 return (0);
776 773 }
777 774
778 775 /*
779 776 * ZSD counterpart of pthread_setspecific().
780 777 *
781 778 * Since all zsd callbacks, including those with no create function,
782 779 * have an entry in zone_zsd, if the key is registered it is part of
783 780 * the zone_zsd list.
784 781 * Return an error if the key wasn't registered.
785 782 */
786 783 int
787 784 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
788 785 {
789 786 struct zsd_entry *t;
790 787
791 788 mutex_enter(&zone->zone_lock);
792 789 t = zsd_find_mru(&zone->zone_zsd, key);
793 790 if (t != NULL) {
794 791 /*
795 792 * Replace old value with new
796 793 */
797 794 t->zsd_data = (void *)data;
798 795 mutex_exit(&zone->zone_lock);
799 796 return (0);
800 797 }
801 798 mutex_exit(&zone->zone_lock);
802 799 return (-1);
803 800 }
804 801
805 802 /*
806 803 * ZSD counterpart of pthread_getspecific().
807 804 */
808 805 void *
809 806 zone_getspecific(zone_key_t key, zone_t *zone)
810 807 {
811 808 struct zsd_entry *t;
812 809 void *data;
813 810
814 811 mutex_enter(&zone->zone_lock);
815 812 t = zsd_find_mru(&zone->zone_zsd, key);
816 813 data = (t == NULL ? NULL : t->zsd_data);
817 814 mutex_exit(&zone->zone_lock);
818 815 return (data);
819 816 }
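
Usage mirrors pthread_getspecific(3C); e.g., a subsystem can fetch its state for the current process's zone (sketch; my_zsd_key, my_state_t, and st_count are the hypothetical names from the ZSD example above):

    my_state_t *sp = zone_getspecific(my_zsd_key, curproc->p_zone);

    /*
     * sp may be NULL if the key has no create callback and
     * zone_setspecific() was never called for this zone.
     */
    if (sp != NULL)
            sp->st_count++;         /* hypothetical field */
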
820 817
821 818 /*
822 819 * Function used to initialize a zone's list of ZSD callbacks and data
823 820 * when the zone is being created. The callbacks are initialized from
824 821 * the template list (zsd_registered_keys). The constructor callback is
825 822 * executed later (once the zone exists and with locks dropped).
826 823 */
827 824 static void
828 825 zone_zsd_configure(zone_t *zone)
829 826 {
830 827 struct zsd_entry *zsdp;
831 828 struct zsd_entry *t;
832 829
833 830 ASSERT(MUTEX_HELD(&zonehash_lock));
834 831 ASSERT(list_head(&zone->zone_zsd) == NULL);
835 832 mutex_enter(&zone->zone_lock);
836 833 mutex_enter(&zsd_key_lock);
837 834 for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
838 835 zsdp = list_next(&zsd_registered_keys, zsdp)) {
839 836 /*
840 837 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
841 838 * should not have added anything to it.
842 839 */
843 840 ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
844 841
845 842 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
846 843 t->zsd_key = zsdp->zsd_key;
847 844 t->zsd_create = zsdp->zsd_create;
848 845 t->zsd_shutdown = zsdp->zsd_shutdown;
849 846 t->zsd_destroy = zsdp->zsd_destroy;
850 847 if (zsdp->zsd_create != NULL) {
851 848 t->zsd_flags = ZSD_CREATE_NEEDED;
852 849 DTRACE_PROBE2(zsd__create__needed,
853 850 zone_t *, zone, zone_key_t, zsdp->zsd_key);
854 851 }
855 852 list_insert_tail(&zone->zone_zsd, t);
856 853 }
857 854 mutex_exit(&zsd_key_lock);
858 855 mutex_exit(&zone->zone_lock);
859 856 }
860 857
861 858 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
862 859
863 860 /*
864 861 * Helper function to execute shutdown or destructor callbacks.
865 862 */
866 863 static void
867 864 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
868 865 {
869 866 struct zsd_entry *t;
870 867
871 868 ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
872 869 ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
873 870 ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
874 871
875 872 /*
876 873 * Run the callback solely based on what is registered for the zone
877 874 * in zone_zsd. The global list can change independently of this
878 875 * as keys are registered and unregistered and we don't register new
879 876 * callbacks for a zone that is in the process of going away.
880 877 */
881 878 mutex_enter(&zone->zone_lock);
882 879 for (t = list_head(&zone->zone_zsd); t != NULL;
883 880 t = list_next(&zone->zone_zsd, t)) {
884 881 zone_key_t key = t->zsd_key;
885 882
886 883 /* Skip if no callbacks registered */
887 884
888 885 if (ct == ZSD_SHUTDOWN) {
889 886 if (t->zsd_shutdown != NULL &&
890 887 (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
891 888 t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
892 889 DTRACE_PROBE2(zsd__shutdown__needed,
893 890 zone_t *, zone, zone_key_t, key);
894 891 }
895 892 } else {
896 893 if (t->zsd_destroy != NULL &&
897 894 (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
898 895 t->zsd_flags |= ZSD_DESTROY_NEEDED;
899 896 DTRACE_PROBE2(zsd__destroy__needed,
900 897 zone_t *, zone, zone_key_t, key);
901 898 }
902 899 }
903 900 }
904 901 mutex_exit(&zone->zone_lock);
905 902
906 903 /* Now call the shutdown and destroy callback for this key */
907 904 zsd_apply_all_keys(zsd_apply_shutdown, zone);
908 905 zsd_apply_all_keys(zsd_apply_destroy, zone);
909 906
910 907 }
911 908
912 909 /*
913 910 * Called when the zone is going away; free ZSD-related memory, and
914 911 * destroy the zone_zsd list.
915 912 */
916 913 static void
917 914 zone_free_zsd(zone_t *zone)
918 915 {
919 916 struct zsd_entry *t, *next;
920 917
921 918 /*
922 919 * Free all the zsd_entry's we had on this zone.
923 920 */
924 921 mutex_enter(&zone->zone_lock);
925 922 for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
926 923 next = list_next(&zone->zone_zsd, t);
927 924 list_remove(&zone->zone_zsd, t);
928 925 ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
929 926 kmem_free(t, sizeof (*t));
930 927 }
931 928 list_destroy(&zone->zone_zsd);
932 929 mutex_exit(&zone->zone_lock);
933 930
934 931 }
935 932
936 933 /*
937 934 * Apply a function to all zones for particular key value.
938 935 *
939 936 * The applyfn has to drop zonehash_lock if it does some work, and
940 937 * then reacquire it before it returns.
941 938 * When the lock is dropped we don't follow list_next even
942 939 * if it is possible to do so without any hazards. This is
943 940 * because we want the design to allow for the list of zones
944 941 * to change in any arbitrary way during the time the
945 942 * lock was dropped.
946 943 *
947 944 * It is safe to restart the loop at list_head since the applyfn
948 945 * changes the zsd_flags as it does work, so a subsequent
949 946 * pass through will have no effect in applyfn, hence the loop will terminate
950 947 * in at worst O(N^2).
951 948 */
952 949 static void
953 950 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
954 951 {
955 952 zone_t *zone;
956 953
957 954 mutex_enter(&zonehash_lock);
958 955 zone = list_head(&zone_active);
959 956 while (zone != NULL) {
960 957 if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
961 958 /* Lock dropped - restart at head */
962 959 zone = list_head(&zone_active);
963 960 } else {
964 961 zone = list_next(&zone_active, zone);
965 962 }
966 963 }
967 964 mutex_exit(&zonehash_lock);
968 965 }
969 966
970 967 /*
971 968 * Apply a function to all keys for a particular zone.
972 969 *
973 970 * The applyfn has to drop zonehash_lock if it does some work, and
974 971 * then reacquire it before it returns.
975 972 * When the lock is dropped we don't follow list_next even
976 973 * if it is possible to do so without any hazards. This is
977 974 * because we want the design to allow for the list of zsd callbacks
978 975 * to change in any arbitrary way during the time the
979 976 * lock was dropped.
980 977 *
981 978 * It is safe to restart the loop at list_head since the applyfn
982 979 * changes the zsd_flags as it does work, so a subsequent
983 980 * pass through will have no effect in applyfn, hence the loop will terminate
984 981 * in at worst O(N^2).
985 982 */
986 983 static void
987 984 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
988 985 {
989 986 struct zsd_entry *t;
990 987
991 988 mutex_enter(&zone->zone_lock);
992 989 t = list_head(&zone->zone_zsd);
993 990 while (t != NULL) {
994 991 if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
995 992 /* Lock dropped - restart at head */
996 993 t = list_head(&zone->zone_zsd);
997 994 } else {
998 995 t = list_next(&zone->zone_zsd, t);
999 996 }
1000 997 }
1001 998 mutex_exit(&zone->zone_lock);
1002 999 }
1003 1000
1004 1001 /*
1005 1002 * Call the create function for the zone and key if CREATE_NEEDED
1006 1003 * is set.
1007 1004 * If some other thread gets here first and sets CREATE_INPROGRESS, then
1008 1005 * we wait for that thread to complete so that we can ensure that
1009 1006 * all the callbacks are done when we've looped over all zones/keys.
1010 1007 *
1011 1008 * When we call the create function, we drop the global lock held by the
1012 1009 * caller, and return true to tell the caller it needs to re-evaluate the
1013 1010 * state.
1014 1011 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1015 1012 * remains held on exit.
1016 1013 */
1017 1014 static boolean_t
1018 1015 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1019 1016 zone_t *zone, zone_key_t key)
1020 1017 {
1021 1018 void *result;
1022 1019 struct zsd_entry *t;
1023 1020 boolean_t dropped;
1024 1021
1025 1022 if (lockp != NULL) {
1026 1023 ASSERT(MUTEX_HELD(lockp));
1027 1024 }
1028 1025 if (zone_lock_held) {
1029 1026 ASSERT(MUTEX_HELD(&zone->zone_lock));
1030 1027 } else {
1031 1028 mutex_enter(&zone->zone_lock);
1032 1029 }
1033 1030
1034 1031 t = zsd_find(&zone->zone_zsd, key);
1035 1032 if (t == NULL) {
1036 1033 /*
1037 1034 * Somebody else got here first, e.g. the zone going
1038 1035 * away.
1039 1036 */
1040 1037 if (!zone_lock_held)
1041 1038 mutex_exit(&zone->zone_lock);
1042 1039 return (B_FALSE);
1043 1040 }
1044 1041 dropped = B_FALSE;
1045 1042 if (zsd_wait_for_inprogress(zone, t, lockp))
1046 1043 dropped = B_TRUE;
1047 1044
1048 1045 if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1049 1046 t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1050 1047 t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1051 1048 DTRACE_PROBE2(zsd__create__inprogress,
1052 1049 zone_t *, zone, zone_key_t, key);
1053 1050 mutex_exit(&zone->zone_lock);
1054 1051 if (lockp != NULL)
1055 1052 mutex_exit(lockp);
1056 1053
1057 1054 dropped = B_TRUE;
1058 1055 ASSERT(t->zsd_create != NULL);
1059 1056 DTRACE_PROBE2(zsd__create__start,
1060 1057 zone_t *, zone, zone_key_t, key);
1061 1058
1062 1059 result = (*t->zsd_create)(zone->zone_id);
1063 1060
1064 1061 DTRACE_PROBE2(zsd__create__end,
1065 1062 zone_t *, zone, void *, result);
1066 1063
1067 1064 ASSERT(result != NULL);
1068 1065 if (lockp != NULL)
1069 1066 mutex_enter(lockp);
1070 1067 mutex_enter(&zone->zone_lock);
1071 1068 t->zsd_data = result;
1072 1069 t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1073 1070 t->zsd_flags |= ZSD_CREATE_COMPLETED;
1074 1071 cv_broadcast(&t->zsd_cv);
1075 1072 DTRACE_PROBE2(zsd__create__completed,
1076 1073 zone_t *, zone, zone_key_t, key);
1077 1074 }
1078 1075 if (!zone_lock_held)
1079 1076 mutex_exit(&zone->zone_lock);
1080 1077 return (dropped);
1081 1078 }
1082 1079
1083 1080 /*
1084 1081 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1085 1082 * is set.
1086 1083 * If some other thread gets here first and sets *_INPROGRESS, then
1087 1084 * we wait for that thread to complete so that we can ensure that
1088 1085 * all the callbacks are done when we've looped over all zones/keys.
1089 1086 *
1090 1087 * When we call the shutdown function, we drop the global lock held by the
1091 1088 * caller, and return true to tell the caller it needs to re-evaluate the
1092 1089 * state.
1093 1090 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1094 1091 * remains held on exit.
1095 1092 */
1096 1093 static boolean_t
1097 1094 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1098 1095 zone_t *zone, zone_key_t key)
1099 1096 {
1100 1097 struct zsd_entry *t;
1101 1098 void *data;
1102 1099 boolean_t dropped;
1103 1100
1104 1101 if (lockp != NULL) {
1105 1102 ASSERT(MUTEX_HELD(lockp));
1106 1103 }
1107 1104 if (zone_lock_held) {
1108 1105 ASSERT(MUTEX_HELD(&zone->zone_lock));
1109 1106 } else {
1110 1107 mutex_enter(&zone->zone_lock);
1111 1108 }
1112 1109
1113 1110 t = zsd_find(&zone->zone_zsd, key);
1114 1111 if (t == NULL) {
1115 1112 /*
1116 1113 * Somebody else got here first, e.g. the zone going
1117 1114 * away.
1118 1115 */
1119 1116 if (!zone_lock_held)
1120 1117 mutex_exit(&zone->zone_lock);
1121 1118 return (B_FALSE);
1122 1119 }
1123 1120 dropped = B_FALSE;
1124 1121 if (zsd_wait_for_creator(zone, t, lockp))
1125 1122 dropped = B_TRUE;
1126 1123
1127 1124 if (zsd_wait_for_inprogress(zone, t, lockp))
1128 1125 dropped = B_TRUE;
1129 1126
1130 1127 if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1131 1128 t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1132 1129 t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1133 1130 DTRACE_PROBE2(zsd__shutdown__inprogress,
1134 1131 zone_t *, zone, zone_key_t, key);
1135 1132 mutex_exit(&zone->zone_lock);
1136 1133 if (lockp != NULL)
1137 1134 mutex_exit(lockp);
1138 1135 dropped = B_TRUE;
1139 1136
1140 1137 ASSERT(t->zsd_shutdown != NULL);
1141 1138 data = t->zsd_data;
1142 1139
1143 1140 DTRACE_PROBE2(zsd__shutdown__start,
1144 1141 zone_t *, zone, zone_key_t, key);
1145 1142
1146 1143 (t->zsd_shutdown)(zone->zone_id, data);
1147 1144 DTRACE_PROBE2(zsd__shutdown__end,
1148 1145 zone_t *, zone, zone_key_t, key);
1149 1146
1150 1147 if (lockp != NULL)
1151 1148 mutex_enter(lockp);
1152 1149 mutex_enter(&zone->zone_lock);
1153 1150 t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1154 1151 t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1155 1152 cv_broadcast(&t->zsd_cv);
1156 1153 DTRACE_PROBE2(zsd__shutdown__completed,
1157 1154 zone_t *, zone, zone_key_t, key);
1158 1155 }
1159 1156 if (!zone_lock_held)
1160 1157 mutex_exit(&zone->zone_lock);
1161 1158 return (dropped);
1162 1159 }
1163 1160
1164 1161 /*
1165 1162 * Call the destroy function for the zone and key if DESTROY_NEEDED
1166 1163 * is set.
1167 1164 * If some other thread gets here first and sets *_INPROGRESS, then
1168 1165 * we wait for that thread to complete so that we can ensure that
1169 1166 * all the callbacks are done when we've looped over all zones/keys.
1170 1167 *
1171 1168 * When we call the destroy function, we drop the global lock held by the
1172 1169 * caller, and return true to tell the caller it needs to re-evaluate the
1173 1170 * state.
1174 1171 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1175 1172 * remains held on exit.
1176 1173 */
1177 1174 static boolean_t
1178 1175 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1179 1176 zone_t *zone, zone_key_t key)
1180 1177 {
1181 1178 struct zsd_entry *t;
1182 1179 void *data;
1183 1180 boolean_t dropped;
1184 1181
1185 1182 if (lockp != NULL) {
1186 1183 ASSERT(MUTEX_HELD(lockp));
1187 1184 }
1188 1185 if (zone_lock_held) {
1189 1186 ASSERT(MUTEX_HELD(&zone->zone_lock));
1190 1187 } else {
1191 1188 mutex_enter(&zone->zone_lock);
1192 1189 }
1193 1190
1194 1191 t = zsd_find(&zone->zone_zsd, key);
1195 1192 if (t == NULL) {
1196 1193 /*
1197 1194 * Somebody else got here first, e.g. the zone going
1198 1195 * away.
1199 1196 */
1200 1197 if (!zone_lock_held)
1201 1198 mutex_exit(&zone->zone_lock);
1202 1199 return (B_FALSE);
1203 1200 }
1204 1201 dropped = B_FALSE;
1205 1202 if (zsd_wait_for_creator(zone, t, lockp))
1206 1203 dropped = B_TRUE;
1207 1204
1208 1205 if (zsd_wait_for_inprogress(zone, t, lockp))
1209 1206 dropped = B_TRUE;
1210 1207
1211 1208 if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1212 1209 t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1213 1210 t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1214 1211 DTRACE_PROBE2(zsd__destroy__inprogress,
1215 1212 zone_t *, zone, zone_key_t, key);
1216 1213 mutex_exit(&zone->zone_lock);
1217 1214 if (lockp != NULL)
1218 1215 mutex_exit(lockp);
1219 1216 dropped = B_TRUE;
1220 1217
1221 1218 ASSERT(t->zsd_destroy != NULL);
1222 1219 data = t->zsd_data;
1223 1220 DTRACE_PROBE2(zsd__destroy__start,
1224 1221 zone_t *, zone, zone_key_t, key);
1225 1222
1226 1223 (t->zsd_destroy)(zone->zone_id, data);
1227 1224 DTRACE_PROBE2(zsd__destroy__end,
1228 1225 zone_t *, zone, zone_key_t, key);
1229 1226
1230 1227 if (lockp != NULL)
1231 1228 mutex_enter(lockp);
1232 1229 mutex_enter(&zone->zone_lock);
1233 1230 t->zsd_data = NULL;
1234 1231 t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1235 1232 t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1236 1233 cv_broadcast(&t->zsd_cv);
1237 1234 DTRACE_PROBE2(zsd__destroy__completed,
1238 1235 zone_t *, zone, zone_key_t, key);
1239 1236 }
1240 1237 if (!zone_lock_held)
1241 1238 mutex_exit(&zone->zone_lock);
1242 1239 return (dropped);
1243 1240 }
1244 1241
1245 1242 /*
1246 1243 * Wait for any CREATE_NEEDED flag to be cleared.
1247 1244 * Returns true if lockp was temporarily dropped while waiting.
1248 1245 */
1249 1246 static boolean_t
1250 1247 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1251 1248 {
1252 1249 boolean_t dropped = B_FALSE;
1253 1250
1254 1251 while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1255 1252 DTRACE_PROBE2(zsd__wait__for__creator,
1256 1253 zone_t *, zone, struct zsd_entry *, t);
1257 1254 if (lockp != NULL) {
1258 1255 dropped = B_TRUE;
1259 1256 mutex_exit(lockp);
1260 1257 }
1261 1258 cv_wait(&t->zsd_cv, &zone->zone_lock);
1262 1259 if (lockp != NULL) {
1263 1260 /* First drop zone_lock to preserve order */
1264 1261 mutex_exit(&zone->zone_lock);
1265 1262 mutex_enter(lockp);
1266 1263 mutex_enter(&zone->zone_lock);
1267 1264 }
1268 1265 }
1269 1266 return (dropped);
1270 1267 }
1271 1268
1272 1269 /*
1273 1270 * Wait for any INPROGRESS flag to be cleared.
1274 1271 * Returns true if lockp was temporarily dropped while waiting.
1275 1272 */
1276 1273 static boolean_t
1277 1274 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1278 1275 {
1279 1276 boolean_t dropped = B_FALSE;
1280 1277
1281 1278 while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1282 1279 DTRACE_PROBE2(zsd__wait__for__inprogress,
1283 1280 zone_t *, zone, struct zsd_entry *, t);
1284 1281 if (lockp != NULL) {
1285 1282 dropped = B_TRUE;
1286 1283 mutex_exit(lockp);
1287 1284 }
1288 1285 cv_wait(&t->zsd_cv, &zone->zone_lock);
1289 1286 if (lockp != NULL) {
1290 1287 /* First drop zone_lock to preserve order */
1291 1288 mutex_exit(&zone->zone_lock);
1292 1289 mutex_enter(lockp);
1293 1290 mutex_enter(&zone->zone_lock);
1294 1291 }
1295 1292 }
1296 1293 return (dropped);
1297 1294 }
1298 1295
1299 1296 /*
1300 1297 * Frees memory associated with the zone dataset list.
1301 1298 */
1302 1299 static void
1303 1300 zone_free_datasets(zone_t *zone)
1304 1301 {
1305 1302 zone_dataset_t *t, *next;
1306 1303
1307 1304 for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1308 1305 next = list_next(&zone->zone_datasets, t);
1309 1306 list_remove(&zone->zone_datasets, t);
1310 1307 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1311 1308 kmem_free(t, sizeof (*t));
1312 1309 }
1313 1310 list_destroy(&zone->zone_datasets);
1314 1311 }
1315 1312
1316 1313 /*
1317 1314 * zone.cpu-shares resource control support.
1318 1315 */
1319 1316 /*ARGSUSED*/
1320 1317 static rctl_qty_t
1321 1318 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1322 1319 {
1323 1320 ASSERT(MUTEX_HELD(&p->p_lock));
1324 1321 return (p->p_zone->zone_shares);
1325 1322 }
1326 1323
1327 1324 /*ARGSUSED*/
1328 1325 static int
1329 1326 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1330 1327 rctl_qty_t nv)
1331 1328 {
1332 1329 ASSERT(MUTEX_HELD(&p->p_lock));
1333 1330 ASSERT(e->rcep_t == RCENTITY_ZONE);
1334 1331 if (e->rcep_p.zone == NULL)
1335 1332 return (0);
1336 1333
1337 1334 e->rcep_p.zone->zone_shares = nv;
1338 1335 return (0);
1339 1336 }
1340 1337
1341 1338 static rctl_ops_t zone_cpu_shares_ops = {
1342 1339 rcop_no_action,
1343 1340 zone_cpu_shares_usage,
1344 1341 zone_cpu_shares_set,
1345 1342 rcop_no_test
1346 1343 };
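
This ops vector is handed to the rctl framework when the control is registered during zone initialization; schematically (a sketch modeled on the zone_init() registration, which lies outside this hunk):

    rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
        RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
        RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
        FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
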
1347 1344
1348 1345 /*
1349 1346 * zone.cpu-cap resource control support.
1350 1347 */
1351 1348 /*ARGSUSED*/
1352 1349 static rctl_qty_t
1353 1350 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1354 1351 {
1355 1352 ASSERT(MUTEX_HELD(&p->p_lock));
1356 1353 return (cpucaps_zone_get(p->p_zone));
1357 1354 }
1358 1355
1359 1356 /*ARGSUSED*/
1360 1357 static int
1361 1358 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1362 1359 rctl_qty_t nv)
1363 1360 {
1364 1361 zone_t *zone = e->rcep_p.zone;
1365 1362
1366 1363 ASSERT(MUTEX_HELD(&p->p_lock));
1367 1364 ASSERT(e->rcep_t == RCENTITY_ZONE);
1368 1365
1369 1366 if (zone == NULL)
1370 1367 return (0);
1371 1368
1372 1369 /*
1373 1370 * set cap to the new value.
1374 1371 */
1375 1372 return (cpucaps_zone_set(zone, nv));
1376 1373 }
1377 1374
1378 1375 static rctl_ops_t zone_cpu_cap_ops = {
1379 1376 rcop_no_action,
1380 1377 zone_cpu_cap_get,
1381 1378 zone_cpu_cap_set,
1382 1379 rcop_no_test
1383 1380 };
1384 1381
1385 -/*ARGSUSED*/
1386 -static rctl_qty_t
1387 -zone_cpu_base_get(rctl_t *rctl, struct proc *p)
1388 -{
1389 - ASSERT(MUTEX_HELD(&p->p_lock));
1390 - return (cpucaps_zone_get_base(p->p_zone));
1391 -}
1392 -
1393 1382 /*
1394 - * The zone cpu base is used to set the baseline CPU for the zone
1395 - * so we can track when the zone is bursting.
1396 - */
1397 -/*ARGSUSED*/
1398 -static int
1399 -zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1400 - rctl_qty_t nv)
1401 -{
1402 - zone_t *zone = e->rcep_p.zone;
1403 -
1404 - ASSERT(MUTEX_HELD(&p->p_lock));
1405 - ASSERT(e->rcep_t == RCENTITY_ZONE);
1406 -
1407 - if (zone == NULL)
1408 - return (0);
1409 -
1410 - return (cpucaps_zone_set_base(zone, nv));
1411 -}
1412 -
1413 -static rctl_ops_t zone_cpu_base_ops = {
1414 - rcop_no_action,
1415 - zone_cpu_base_get,
1416 - zone_cpu_base_set,
1417 - rcop_no_test
1418 -};
1419 -
1420 -/*ARGSUSED*/
1421 -static rctl_qty_t
1422 -zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
1423 -{
1424 - ASSERT(MUTEX_HELD(&p->p_lock));
1425 - return (cpucaps_zone_get_burst_time(p->p_zone));
1426 -}
1427 -
1428 -/*
1429 - * The zone cpu burst time is used to set the amount of time CPU(s) can be
1430 - * bursting for the zone.
1431 - */
1432 -/*ARGSUSED*/
1433 -static int
1434 -zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1435 - rctl_qty_t nv)
1436 -{
1437 - zone_t *zone = e->rcep_p.zone;
1438 -
1439 - ASSERT(MUTEX_HELD(&p->p_lock));
1440 - ASSERT(e->rcep_t == RCENTITY_ZONE);
1441 -
1442 - if (zone == NULL)
1443 - return (0);
1444 -
1445 - return (cpucaps_zone_set_burst_time(zone, nv));
1446 -}
1447 -
1448 -static rctl_ops_t zone_cpu_burst_time_ops = {
1449 - rcop_no_action,
1450 - zone_cpu_burst_time_get,
1451 - zone_cpu_burst_time_set,
1452 - rcop_no_test
1453 -};
1454 -
1455 -/*
1456 1383 * zone.zfs-io-pri resource control support (IO priority).
1457 1384 */
1458 1385 /*ARGSUSED*/
1459 1386 static rctl_qty_t
1460 1387 zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
1461 1388 {
1462 1389 ASSERT(MUTEX_HELD(&p->p_lock));
1463 1390 return (p->p_zone->zone_zfs_io_pri);
1464 1391 }
1465 1392
1466 1393 /*ARGSUSED*/
1467 1394 static int
1468 1395 zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1469 1396 rctl_qty_t nv)
1470 1397 {
1471 1398 zone_t *zone = e->rcep_p.zone;
1472 1399
1473 1400 ASSERT(MUTEX_HELD(&p->p_lock));
1474 1401 ASSERT(e->rcep_t == RCENTITY_ZONE);
1475 1402
1476 1403 if (zone == NULL)
1477 1404 return (0);
1478 1405
1479 1406 /*
1480 1407 * set priority to the new value.
1481 1408 */
1482 1409 zone->zone_zfs_io_pri = nv;
1483 1410 return (0);
1484 1411 }
1485 1412
1486 1413 static rctl_ops_t zone_zfs_io_pri_ops = {
1487 1414 rcop_no_action,
1488 1415 zone_zfs_io_pri_get,
1489 1416 zone_zfs_io_pri_set,
1490 1417 rcop_no_test
1491 1418 };
1492 1419
1493 1420 /*ARGSUSED*/
1494 1421 static rctl_qty_t
1495 1422 zone_lwps_usage(rctl_t *r, proc_t *p)
1496 1423 {
1497 1424 rctl_qty_t nlwps;
1498 1425 zone_t *zone = p->p_zone;
1499 1426
1500 1427 ASSERT(MUTEX_HELD(&p->p_lock));
1501 1428
1502 1429 mutex_enter(&zone->zone_nlwps_lock);
1503 1430 nlwps = zone->zone_nlwps;
1504 1431 mutex_exit(&zone->zone_nlwps_lock);
1505 1432
1506 1433 return (nlwps);
1507 1434 }
1508 1435
1509 1436 /*ARGSUSED*/
1510 1437 static int
1511 1438 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1512 1439 rctl_qty_t incr, uint_t flags)
1513 1440 {
1514 1441 rctl_qty_t nlwps;
1515 1442
1516 1443 ASSERT(MUTEX_HELD(&p->p_lock));
1517 1444 ASSERT(e->rcep_t == RCENTITY_ZONE);
1518 1445 if (e->rcep_p.zone == NULL)
1519 1446 return (0);
1520 1447 ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1521 1448 nlwps = e->rcep_p.zone->zone_nlwps;
1522 1449
1523 1450 if (nlwps + incr > rcntl->rcv_value)
1524 1451 return (1);
1525 1452
1526 1453 return (0);
1527 1454 }
1528 1455
1529 1456 /*ARGSUSED*/
1530 1457 static int
1531 1458 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1532 1459 {
1533 1460 ASSERT(MUTEX_HELD(&p->p_lock));
1534 1461 ASSERT(e->rcep_t == RCENTITY_ZONE);
1535 1462 if (e->rcep_p.zone == NULL)
1536 1463 return (0);
1537 1464 e->rcep_p.zone->zone_nlwps_ctl = nv;
1538 1465 return (0);
1539 1466 }
1540 1467
1541 1468 static rctl_ops_t zone_lwps_ops = {
1542 1469 rcop_no_action,
1543 1470 zone_lwps_usage,
1544 1471 zone_lwps_set,
1545 1472 zone_lwps_test,
1546 1473 };
1547 1474
1548 1475 /*ARGSUSED*/
1549 1476 static rctl_qty_t
1550 1477 zone_procs_usage(rctl_t *r, proc_t *p)
1551 1478 {
1552 1479 rctl_qty_t nprocs;
1553 1480 zone_t *zone = p->p_zone;
1554 1481
1555 1482 ASSERT(MUTEX_HELD(&p->p_lock));
1556 1483
1557 1484 mutex_enter(&zone->zone_nlwps_lock);
1558 1485 nprocs = zone->zone_nprocs;
1559 1486 mutex_exit(&zone->zone_nlwps_lock);
1560 1487
1561 1488 return (nprocs);
1562 1489 }
1563 1490
1564 1491 /*ARGSUSED*/
1565 1492 static int
1566 1493 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1567 1494 rctl_qty_t incr, uint_t flags)
1568 1495 {
1569 1496 rctl_qty_t nprocs;
1570 1497
1571 1498 ASSERT(MUTEX_HELD(&p->p_lock));
1572 1499 ASSERT(e->rcep_t == RCENTITY_ZONE);
1573 1500 if (e->rcep_p.zone == NULL)
1574 1501 return (0);
1575 1502 ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1576 1503 nprocs = e->rcep_p.zone->zone_nprocs;
1577 1504
1578 1505 if (nprocs + incr > rcntl->rcv_value)
1579 1506 return (1);
1580 1507
1581 1508 return (0);
1582 1509 }
1583 1510
1584 1511 /*ARGSUSED*/
1585 1512 static int
1586 1513 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1587 1514 {
1588 1515 ASSERT(MUTEX_HELD(&p->p_lock));
1589 1516 ASSERT(e->rcep_t == RCENTITY_ZONE);
1590 1517 if (e->rcep_p.zone == NULL)
1591 1518 return (0);
1592 1519 e->rcep_p.zone->zone_nprocs_ctl = nv;
1593 1520 return (0);
1594 1521 }
1595 1522
1596 1523 static rctl_ops_t zone_procs_ops = {
1597 1524 rcop_no_action,
1598 1525 zone_procs_usage,
1599 1526 zone_procs_set,
1600 1527 zone_procs_test,
1601 1528 };
1602 1529
1603 1530 /*ARGSUSED*/
1604 1531 static rctl_qty_t
1605 1532 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1606 1533 {
1607 1534 ASSERT(MUTEX_HELD(&p->p_lock));
1608 1535 return (p->p_zone->zone_shmmax);
1609 1536 }
1610 1537
1611 1538 /*ARGSUSED*/
1612 1539 static int
1613 1540 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1614 1541 rctl_qty_t incr, uint_t flags)
1615 1542 {
1616 1543 rctl_qty_t v;
1617 1544 ASSERT(MUTEX_HELD(&p->p_lock));
1618 1545 ASSERT(e->rcep_t == RCENTITY_ZONE);
1619 1546 v = e->rcep_p.zone->zone_shmmax + incr;
1620 1547 if (v > rval->rcv_value)
1621 1548 return (1);
1622 1549 return (0);
1623 1550 }
1624 1551
1625 1552 static rctl_ops_t zone_shmmax_ops = {
1626 1553 rcop_no_action,
1627 1554 zone_shmmax_usage,
1628 1555 rcop_no_set,
1629 1556 zone_shmmax_test
1630 1557 };
1631 1558
1632 1559 /*ARGSUSED*/
1633 1560 static rctl_qty_t
1634 1561 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1635 1562 {
1636 1563 ASSERT(MUTEX_HELD(&p->p_lock));
1637 1564 return (p->p_zone->zone_ipc.ipcq_shmmni);
1638 1565 }
1639 1566
1640 1567 /*ARGSUSED*/
1641 1568 static int
1642 1569 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1643 1570 rctl_qty_t incr, uint_t flags)
1644 1571 {
1645 1572 rctl_qty_t v;
1646 1573 ASSERT(MUTEX_HELD(&p->p_lock));
1647 1574 ASSERT(e->rcep_t == RCENTITY_ZONE);
1648 1575 v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1649 1576 if (v > rval->rcv_value)
1650 1577 return (1);
1651 1578 return (0);
1652 1579 }
1653 1580
1654 1581 static rctl_ops_t zone_shmmni_ops = {
1655 1582 rcop_no_action,
1656 1583 zone_shmmni_usage,
1657 1584 rcop_no_set,
1658 1585 zone_shmmni_test
1659 1586 };
1660 1587
1661 1588 /*ARGSUSED*/
1662 1589 static rctl_qty_t
1663 1590 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1664 1591 {
1665 1592 ASSERT(MUTEX_HELD(&p->p_lock));
1666 1593 return (p->p_zone->zone_ipc.ipcq_semmni);
1667 1594 }
1668 1595
1669 1596 /*ARGSUSED*/
1670 1597 static int
1671 1598 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1672 1599 rctl_qty_t incr, uint_t flags)
1673 1600 {
1674 1601 rctl_qty_t v;
1675 1602 ASSERT(MUTEX_HELD(&p->p_lock));
1676 1603 ASSERT(e->rcep_t == RCENTITY_ZONE);
1677 1604 v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1678 1605 if (v > rval->rcv_value)
1679 1606 return (1);
1680 1607 return (0);
1681 1608 }
1682 1609
1683 1610 static rctl_ops_t zone_semmni_ops = {
1684 1611 rcop_no_action,
1685 1612 zone_semmni_usage,
1686 1613 rcop_no_set,
1687 1614 zone_semmni_test
1688 1615 };
1689 1616
1690 1617 /*ARGSUSED*/
1691 1618 static rctl_qty_t
1692 1619 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1693 1620 {
1694 1621 ASSERT(MUTEX_HELD(&p->p_lock));
1695 1622 return (p->p_zone->zone_ipc.ipcq_msgmni);
1696 1623 }
1697 1624
1698 1625 /*ARGSUSED*/
1699 1626 static int
1700 1627 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1701 1628 rctl_qty_t incr, uint_t flags)
1702 1629 {
1703 1630 rctl_qty_t v;
1704 1631 ASSERT(MUTEX_HELD(&p->p_lock));
1705 1632 ASSERT(e->rcep_t == RCENTITY_ZONE);
1706 1633 v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1707 1634 if (v > rval->rcv_value)
1708 1635 return (1);
1709 1636 return (0);
1710 1637 }
1711 1638
1712 1639 static rctl_ops_t zone_msgmni_ops = {
1713 1640 rcop_no_action,
1714 1641 zone_msgmni_usage,
1715 1642 rcop_no_set,
1716 1643 zone_msgmni_test
1717 1644 };
1718 1645
1719 1646 /*ARGSUSED*/
1720 1647 static rctl_qty_t
1721 1648 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1722 1649 {
1723 1650 rctl_qty_t q;
1724 1651 ASSERT(MUTEX_HELD(&p->p_lock));
1725 1652 mutex_enter(&p->p_zone->zone_mem_lock);
1726 1653 q = p->p_zone->zone_locked_mem;
1727 1654 mutex_exit(&p->p_zone->zone_mem_lock);
1728 1655 return (q);
1729 1656 }
1730 1657
1731 1658 /*ARGSUSED*/
1732 1659 static int
1733 1660 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1734 1661 rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1735 1662 {
1736 1663 rctl_qty_t q;
1737 1664 zone_t *z;
1738 1665
1739 1666 z = e->rcep_p.zone;
1740 1667 ASSERT(MUTEX_HELD(&p->p_lock));
1741 1668 ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1742 1669 q = z->zone_locked_mem;
1743 1670 if (q + incr > rcntl->rcv_value)
1744 1671 return (1);
1745 1672 return (0);
1746 1673 }
1747 1674
1748 1675 /*ARGSUSED*/
1749 1676 static int
1750 1677 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1751 1678 rctl_qty_t nv)
1752 1679 {
1753 1680 ASSERT(MUTEX_HELD(&p->p_lock));
1754 1681 ASSERT(e->rcep_t == RCENTITY_ZONE);
1755 1682 if (e->rcep_p.zone == NULL)
1756 1683 return (0);
1757 1684 e->rcep_p.zone->zone_locked_mem_ctl = nv;
1758 1685 return (0);
1759 1686 }
1760 1687
1761 1688 static rctl_ops_t zone_locked_mem_ops = {
1762 1689 rcop_no_action,
1763 1690 zone_locked_mem_usage,
1764 1691 zone_locked_mem_set,
1765 1692 zone_locked_mem_test
1766 1693 };
1767 1694
1768 1695 /*ARGSUSED*/
1769 1696 static rctl_qty_t
1770 1697 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1771 1698 {
1772 1699 rctl_qty_t q;
1773 1700 zone_t *z = p->p_zone;
1774 1701
1775 1702 ASSERT(MUTEX_HELD(&p->p_lock));
1776 1703 mutex_enter(&z->zone_mem_lock);
1777 1704 q = z->zone_max_swap;
1778 1705 mutex_exit(&z->zone_mem_lock);
1779 1706 return (q);
1780 1707 }
1781 1708
1782 1709 /*ARGSUSED*/
1783 1710 static int
1784 1711 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1785 1712 rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1786 1713 {
1787 1714 rctl_qty_t q;
1788 1715 zone_t *z;
1789 1716
1790 1717 z = e->rcep_p.zone;
1791 1718 ASSERT(MUTEX_HELD(&p->p_lock));
1792 1719 ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1793 1720 q = z->zone_max_swap;
1794 1721 if (q + incr > rcntl->rcv_value)
1795 1722 return (1);
1796 1723 return (0);
1797 1724 }
1798 1725
1799 1726 /*ARGSUSED*/
1800 1727 static int
1801 1728 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1802 1729 rctl_qty_t nv)
1803 1730 {
1804 1731 ASSERT(MUTEX_HELD(&p->p_lock));
1805 1732 ASSERT(e->rcep_t == RCENTITY_ZONE);
1806 1733 if (e->rcep_p.zone == NULL)
1807 1734 return (0);
1808 1735 e->rcep_p.zone->zone_max_swap_ctl = nv;
1809 1736 return (0);
1810 1737 }
1811 1738
1812 1739 static rctl_ops_t zone_max_swap_ops = {
1813 1740 rcop_no_action,
1814 1741 zone_max_swap_usage,
1815 1742 zone_max_swap_set,
1816 1743 zone_max_swap_test
1817 1744 };
1818 1745
1819 1746 /*ARGSUSED*/
1820 1747 static rctl_qty_t
1821 1748 zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
1822 1749 {
1823 1750 rctl_qty_t q;
1824 1751 zone_t *z = p->p_zone;
1825 1752
1826 1753 ASSERT(MUTEX_HELD(&p->p_lock));
1827 1754 /* No additional lock because not enforced in the kernel */
1828 1755 q = z->zone_phys_mem;
1829 1756 return (q);
1830 1757 }
1831 1758
1832 1759 /*ARGSUSED*/
1833 1760 static int
1834 1761 zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1835 1762 rctl_qty_t nv)
1836 1763 {
1837 1764 ASSERT(MUTEX_HELD(&p->p_lock));
1838 1765 ASSERT(e->rcep_t == RCENTITY_ZONE);
1839 1766 if (e->rcep_p.zone == NULL)
1840 1767 return (0);
1841 1768 e->rcep_p.zone->zone_phys_mem_ctl = nv;
1842 1769 return (0);
1843 1770 }
1844 1771
1845 1772 static rctl_ops_t zone_phys_mem_ops = {
1846 1773 rcop_no_action,
1847 1774 zone_phys_mem_usage,
1848 1775 zone_phys_mem_set,
1849 1776 rcop_no_test
1850 1777 };
1851 1778
1852 1779 /*ARGSUSED*/
1853 1780 static rctl_qty_t
1854 1781 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1855 1782 {
1856 1783 rctl_qty_t q;
1857 1784 zone_t *z = p->p_zone;
1858 1785
1859 1786 ASSERT(MUTEX_HELD(&p->p_lock));
1860 1787 mutex_enter(&z->zone_rctl_lock);
1861 1788 q = z->zone_max_lofi;
1862 1789 mutex_exit(&z->zone_rctl_lock);
1863 1790 return (q);
1864 1791 }
1865 1792
1866 1793 /*ARGSUSED*/
1867 1794 static int
1868 1795 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1869 1796 rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1870 1797 {
1871 1798 rctl_qty_t q;
1872 1799 zone_t *z;
1873 1800
1874 1801 z = e->rcep_p.zone;
1875 1802 ASSERT(MUTEX_HELD(&p->p_lock));
1876 1803 ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1877 1804 q = z->zone_max_lofi;
1878 1805 if (q + incr > rcntl->rcv_value)
1879 1806 return (1);
1880 1807 return (0);
1881 1808 }
1882 1809
1883 1810 /*ARGSUSED*/
1884 1811 static int
1885 1812 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1886 1813 rctl_qty_t nv)
1887 1814 {
1888 1815 ASSERT(MUTEX_HELD(&p->p_lock));
1889 1816 ASSERT(e->rcep_t == RCENTITY_ZONE);
1890 1817 if (e->rcep_p.zone == NULL)
1891 1818 return (0);
1892 1819 e->rcep_p.zone->zone_max_lofi_ctl = nv;
1893 1820 return (0);
1894 1821 }
1895 1822
1896 1823 static rctl_ops_t zone_max_lofi_ops = {
1897 1824 rcop_no_action,
1898 1825 zone_max_lofi_usage,
1899 1826 zone_max_lofi_set,
1900 1827 zone_max_lofi_test
1901 1828 };
1902 1829
1903 1830 /*
1904 1831 * Helper function to brand the zone with a unique ID.
1905 1832 */
1906 1833 static void
1907 1834 zone_uniqid(zone_t *zone)
1908 1835 {
1909 1836 static uint64_t uniqid = 0;
1910 1837
1911 1838 ASSERT(MUTEX_HELD(&zonehash_lock));
1912 1839 zone->zone_uniqid = uniqid++;
1913 1840 }
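Because zone IDs come from an ID space and can be recycled once a zone is destroyed, code that must detect reuse compares this unique ID rather than the zoneid_t. A minimal user-level sketch of the same slot-plus-generation idiom (all names below are illustrative, not kernel interfaces):

    #include <stdint.h>

    typedef struct handle {
            int             h_slot; /* recyclable, like zoneid_t */
            uint64_t        h_gen;  /* monotonic, like zone_uniqid */
    } handle_t;

    /*
     * Slot equality alone cannot prove two handles name the same
     * instance, since the slot may have been reused in between.
     */
    static int
    handle_same_instance(const handle_t *a, const handle_t *b)
    {
            return (a->h_slot == b->h_slot && a->h_gen == b->h_gen);
    }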
1914 1841
1915 1842 /*
1916 1843 * Returns a held pointer to the "kcred" for the specified zone.
1917 1844 */
1918 1845 struct cred *
1919 1846 zone_get_kcred(zoneid_t zoneid)
1920 1847 {
1921 1848 zone_t *zone;
1922 1849 cred_t *cr;
1923 1850
1924 1851 if ((zone = zone_find_by_id(zoneid)) == NULL)
1925 1852 return (NULL);
1926 1853 cr = zone->zone_kcred;
1927 1854 crhold(cr);
1928 1855 zone_rele(zone);
1929 1856 return (cr);
1930 1857 }
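Note the ordering above: crhold() pins the cred before zone_rele() drops the reference that made the lookup safe, so the returned pointer cannot be torn down underneath the caller. The same idiom in a generic, hedged sketch (types and helpers are illustrative):

    #include <stdint.h>
    #include <stdlib.h>

    typedef struct obj { uint32_t o_ref; } obj_t;
    typedef struct container { uint32_t c_ref; obj_t *c_obj; } container_t;

    /* Toy hold/rele; a real implementation would use atomics or a lock. */
    static void obj_hold(obj_t *o) { o->o_ref++; }
    static void container_rele(container_t *c) { if (--c->c_ref == 0) free(c); }

    static obj_t *
    container_get_obj(container_t *c)       /* mirrors zone_get_kcred() */
    {
            obj_t *o = c->c_obj;

            obj_hold(o);            /* like crhold(cr) */
            container_rele(c);      /* like zone_rele(zone) */
            return (o);
    }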
1931 1858
1932 1859 static int
1933 1860 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1934 1861 {
1935 1862 zone_t *zone = ksp->ks_private;
1936 1863 zone_kstat_t *zk = ksp->ks_data;
1937 1864
1938 1865 if (rw == KSTAT_WRITE)
1939 1866 return (EACCES);
1940 1867
1941 1868 zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1942 1869 zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1943 1870 return (0);
1944 1871 }
1945 1872
1946 1873 static int
1947 1874 zone_physmem_kstat_update(kstat_t *ksp, int rw)
1948 1875 {
1949 1876 zone_t *zone = ksp->ks_private;
1950 1877 zone_kstat_t *zk = ksp->ks_data;
1951 1878
1952 1879 if (rw == KSTAT_WRITE)
1953 1880 return (EACCES);
1954 1881
1955 1882 zk->zk_usage.value.ui64 = zone->zone_phys_mem;
1956 1883 zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
1957 1884 return (0);
1958 1885 }
1959 1886
1960 1887 static int
1961 1888 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1962 1889 {
1963 1890 zone_t *zone = ksp->ks_private;
1964 1891 zone_kstat_t *zk = ksp->ks_data;
1965 1892
1966 1893 if (rw == KSTAT_WRITE)
1967 1894 return (EACCES);
1968 1895
1969 1896 zk->zk_usage.value.ui64 = zone->zone_nprocs;
1970 1897 zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1971 1898 return (0);
1972 1899 }
1973 1900
1974 1901 static int
1975 1902 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1976 1903 {
1977 1904 zone_t *zone = ksp->ks_private;
1978 1905 zone_kstat_t *zk = ksp->ks_data;
1979 1906
1980 1907 if (rw == KSTAT_WRITE)
1981 1908 return (EACCES);
1982 1909
1983 1910 zk->zk_usage.value.ui64 = zone->zone_max_swap;
1984 1911 zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1985 1912 return (0);
1986 1913 }
1987 1914
1988 1915 static kstat_t *
1989 -zone_rctl_kstat_create_common(zone_t *zone, char *name,
1916 +zone_kstat_create_common(zone_t *zone, char *name,
1990 1917 int (*updatefunc) (kstat_t *, int))
1991 1918 {
1992 1919 kstat_t *ksp;
1993 1920 zone_kstat_t *zk;
1994 1921
1995 1922 ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1996 1923 sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1997 1924 KSTAT_FLAG_VIRTUAL);
1998 1925
1999 1926 if (ksp == NULL)
2000 1927 return (NULL);
2001 1928
2002 1929 zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
2003 1930 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2004 1931 kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
2005 1932 kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
2006 1933 kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
2007 1934 kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
2008 1935 ksp->ks_update = updatefunc;
2009 1936 ksp->ks_private = zone;
2010 1937 kstat_install(ksp);
2011 1938 return (ksp);
2012 1939 }
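KSTAT_FLAG_VIRTUAL means the kstat framework does not allocate ks_data; the caller supplies it (and must free it at delete time), which is also why ks_data_size is grown by hand to cover the zonename string. A minimal sketch of the same pattern against the public kstat DDI (module, instance, and statistic names are illustrative):

    #include <sys/kstat.h>
    #include <sys/kmem.h>

    typedef struct my_kstat {
            kstat_named_t mk_usage;
            kstat_named_t mk_value;
    } my_kstat_t;

    static kstat_t *
    my_kstat_create(void)
    {
            kstat_t *ksp;
            my_kstat_t *mk;

            ksp = kstat_create("mymod", 0, "mystats", "misc",
                KSTAT_TYPE_NAMED,
                sizeof (my_kstat_t) / sizeof (kstat_named_t),
                KSTAT_FLAG_VIRTUAL);
            if (ksp == NULL)
                    return (NULL);

            /* Virtual kstat: the caller owns (and later frees) ks_data. */
            mk = ksp->ks_data = kmem_zalloc(sizeof (my_kstat_t), KM_SLEEP);
            kstat_named_init(&mk->mk_usage, "usage", KSTAT_DATA_UINT64);
            kstat_named_init(&mk->mk_value, "value", KSTAT_DATA_UINT64);
            kstat_install(ksp);
            return (ksp);
    }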
2013 1940
2014 1941 static int
2015 1942 zone_vfs_kstat_update(kstat_t *ksp, int rw)
2016 1943 {
2017 1944 zone_t *zone = ksp->ks_private;
2018 1945 zone_vfs_kstat_t *zvp = ksp->ks_data;
2019 1946 kstat_io_t *kiop = &zone->zone_vfs_rwstats;
2020 1947
2021 1948 if (rw == KSTAT_WRITE)
2022 1949 return (EACCES);
2023 1950
2024 1951 /*
2025 1952 * Extract the VFS statistics from the kstat_io_t structure used by
2026 1953 * kstat_runq_enter() and related functions. Since the slow ops
2027 1954 * counters are updated directly by the VFS layer, there's no need to
2028 1955 * copy those statistics here.
2029 1956 *
2030 1957 * Note that kstat_runq_enter() and the related functions use
2031 1958 * gethrtime_unscaled(), so scale the time here.
2032 1959 */
2033 1960 zvp->zv_nread.value.ui64 = kiop->nread;
2034 1961 zvp->zv_reads.value.ui64 = kiop->reads;
2035 1962 zvp->zv_rtime.value.ui64 = kiop->rtime;
2036 1963 zvp->zv_rcnt.value.ui64 = kiop->rcnt;
2037 1964 zvp->zv_rlentime.value.ui64 = kiop->rlentime;
2038 1965 zvp->zv_nwritten.value.ui64 = kiop->nwritten;
2039 1966 zvp->zv_writes.value.ui64 = kiop->writes;
2040 1967 zvp->zv_wtime.value.ui64 = kiop->wtime;
2041 1968 zvp->zv_wcnt.value.ui64 = kiop->wcnt;
2042 1969 zvp->zv_wlentime.value.ui64 = kiop->wlentime;
2043 1970
2044 1971 scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
2045 1972 scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
2046 1973 scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
2047 1974 scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
2048 1975
2049 1976 return (0);
2050 1977 }
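The scaled rtime/wtime and rlentime/wlentime values are cumulative, so consumers derive rates and latencies from deltas between snapshots; for example, the mean read service time over an interval is Δrtime / Δreads. A hedged consumer-side helper (sampling machinery omitted):

    #include <stdint.h>

    /*
     * Given two snapshots of the scaled "rtime" (ns) and "reads"
     * counters, return the mean per-read service time in ns.
     */
    static uint64_t
    avg_read_latency_ns(uint64_t rtime0, uint64_t reads0,
        uint64_t rtime1, uint64_t reads1)
    {
            uint64_t dreads = reads1 - reads0;

            return (dreads == 0 ? 0 : (rtime1 - rtime0) / dreads);
    }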
2051 1978
2052 1979 static kstat_t *
2053 1980 zone_vfs_kstat_create(zone_t *zone)
2054 1981 {
2055 1982 kstat_t *ksp;
2056 1983 zone_vfs_kstat_t *zvp;
2057 1984
2058 1985 if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
2059 1986 zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
2060 1987 sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
2061 1988 KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2062 1989 return (NULL);
2063 1990
2064 1991 if (zone->zone_id != GLOBAL_ZONEID)
2065 1992 kstat_zone_add(ksp, GLOBAL_ZONEID);
2066 1993
2067 1994 zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
2068 1995 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2069 1996 ksp->ks_lock = &zone->zone_vfs_lock;
2070 1997 zone->zone_vfs_stats = zvp;
2071 1998
2072 1999 /* The kstat "name" field is not large enough for a full zonename */
2073 2000 kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
2074 2001 kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
2075 2002 kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
2076 2003 kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
2077 2004 kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
2078 2005 kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
2079 2006 kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
2080 2007 kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
2081 2008 kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
2082 2009 kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
2083 2010 kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
2084 2011 kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
2085 2012 kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
2086 2013 kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
2087 2014 kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
2088 2015 kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
2089 2016 kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
2090 2017 kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
2091 2018
2092 2019 ksp->ks_update = zone_vfs_kstat_update;
2093 2020 ksp->ks_private = zone;
2094 2021
2095 2022 kstat_install(ksp);
2096 2023 return (ksp);
2097 2024 }
2098 2025
2099 2026 static int
2100 2027 zone_zfs_kstat_update(kstat_t *ksp, int rw)
2101 2028 {
2102 2029 zone_t *zone = ksp->ks_private;
2103 2030 zone_zfs_kstat_t *zzp = ksp->ks_data;
2104 2031 kstat_io_t *kiop = &zone->zone_zfs_rwstats;
2105 2032
2106 2033 if (rw == KSTAT_WRITE)
2107 2034 return (EACCES);
2108 2035
2109 2036 /*
2110 2037 * Extract the ZFS statistics from the kstat_io_t structure used by
2111 2038 * kstat_runq_enter() and related functions. Since the I/O throttle
2112 2039 * counters are updated directly by the ZFS layer, there's no need to
2113 2040 * copy those statistics here.
2114 2041 *
2115 2042 * Note that kstat_runq_enter() and the related functions use
2116 2043 * gethrtime_unscaled(), so scale the time here.
2117 2044 */
2118 2045 zzp->zz_nread.value.ui64 = kiop->nread;
2119 2046 zzp->zz_reads.value.ui64 = kiop->reads;
2120 2047 zzp->zz_rtime.value.ui64 = kiop->rtime;
2121 2048 zzp->zz_rlentime.value.ui64 = kiop->rlentime;
2122 2049 zzp->zz_nwritten.value.ui64 = kiop->nwritten;
2123 2050 zzp->zz_writes.value.ui64 = kiop->writes;
2124 2051
2125 2052 scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
2126 2053 scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
2127 2054
2128 2055 return (0);
2129 2056 }
2130 2057
2131 2058 static kstat_t *
2132 2059 zone_zfs_kstat_create(zone_t *zone)
2133 2060 {
2134 2061 kstat_t *ksp;
2135 2062 zone_zfs_kstat_t *zzp;
2136 2063
2137 2064 if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
2138 2065 zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
2139 2066 sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
2140 2067 KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2141 2068 return (NULL);
2142 2069
2143 2070 if (zone->zone_id != GLOBAL_ZONEID)
2144 2071 kstat_zone_add(ksp, GLOBAL_ZONEID);
2145 2072
2146 2073 zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
2147 2074 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2148 2075 ksp->ks_lock = &zone->zone_zfs_lock;
2149 2076 zone->zone_zfs_stats = zzp;
2150 2077
2151 2078 /* The kstat "name" field is not large enough for a full zonename */
2152 2079 kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
2153 2080 kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
2154 2081 kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
2155 2082 kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
2156 2083 kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
2157 2084 kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
2158 2085 kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
2159 2086 kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
2160 2087 kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
2161 2088
2162 2089 ksp->ks_update = zone_zfs_kstat_update;
2163 2090 ksp->ks_private = zone;
2164 2091
2165 2092 kstat_install(ksp);
2166 2093 return (ksp);
2167 2094 }
2168 2095
2169 2096 static int
2170 2097 zone_mcap_kstat_update(kstat_t *ksp, int rw)
2171 2098 {
2172 2099 zone_t *zone = ksp->ks_private;
2173 2100 zone_mcap_kstat_t *zmp = ksp->ks_data;
2174 2101
2175 2102 if (rw == KSTAT_WRITE)
2176 2103 return (EACCES);
2177 2104
2178 2105 zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
2179 2106 zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
2180 2107 zmp->zm_swap.value.ui64 = zone->zone_max_swap;
2181 2108 zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
2182 2109 zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
2183 2110 zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
2184 2111 zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
2185 2112 zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
2186 2113 zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
2187 2114 zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
2188 2115 zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
2189 2116 zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
2190 2117 zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
2191 2118
2192 2119 return (0);
2193 2120 }
2194 2121
2195 2122 static kstat_t *
2196 2123 zone_mcap_kstat_create(zone_t *zone)
2197 2124 {
2198 2125 kstat_t *ksp;
2199 2126 zone_mcap_kstat_t *zmp;
2200 2127
2201 2128 if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2202 2129 zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2203 2130 sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2204 2131 KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2205 2132 return (NULL);
2206 2133
2207 2134 if (zone->zone_id != GLOBAL_ZONEID)
2208 2135 kstat_zone_add(ksp, GLOBAL_ZONEID);
2209 2136
2210 2137 zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2211 2138 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2212 2139 ksp->ks_lock = &zone->zone_mcap_lock;
2213 2140 zone->zone_mcap_stats = zmp;
2214 2141
2215 2142 /* The kstat "name" field is not large enough for a full zonename */
2216 2143 kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2217 2144 kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2218 2146 kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2219 2147 kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2220 2148 kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2221 2149 kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2222 2150 kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2223 2151 kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2224 2152 kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2225 2153 kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2226 2154 kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2227 2155 kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2228 2156 kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2229 2157 KSTAT_DATA_UINT64);
2230 2158 kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
2231 2159 KSTAT_DATA_UINT64);
2232 2160 kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
2233 2161 KSTAT_DATA_UINT64);
2234 2162
2235 2163 ksp->ks_update = zone_mcap_kstat_update;
2236 2164 ksp->ks_private = zone;
2237 2165
2238 2166 kstat_install(ksp);
2239 2167 return (ksp);
2240 2168 }
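These are the kstats that the user-land capping code and observability tools read back; the module is "memory_cap" and the kstat name is the (possibly truncated) zone name. A small libkstat reader sketch (link with -lkstat), assuming a zone named "myzone" and eliding most error handling:

    #include <kstat.h>
    #include <stdio.h>

    int
    main(void)
    {
            kstat_ctl_t *kc;
            kstat_t *ksp;
            kstat_named_t *kn;

            if ((kc = kstat_open()) == NULL)
                    return (1);
            /* -1 matches any instance; the instance is the zone ID. */
            ksp = kstat_lookup(kc, "memory_cap", -1, "myzone");
            if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1 &&
                (kn = kstat_data_lookup(ksp, "rss")) != NULL)
                    (void) printf("rss: %llu\n",
                        (unsigned long long)kn->value.ui64);
            (void) kstat_close(kc);
            return (0);
    }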
2241 2169
2242 2170 static int
2243 2171 zone_misc_kstat_update(kstat_t *ksp, int rw)
2244 2172 {
2245 2173 zone_t *zone = ksp->ks_private;
2246 2174 zone_misc_kstat_t *zmp = ksp->ks_data;
2247 2175 hrtime_t tmp;
2248 2176
2249 2177 if (rw == KSTAT_WRITE)
2250 2178 return (EACCES);
2251 2179
2252 2180 tmp = zone->zone_utime;
2253 2181 scalehrtime(&tmp);
2254 2182 zmp->zm_utime.value.ui64 = tmp;
2255 2183 tmp = zone->zone_stime;
2256 2184 scalehrtime(&tmp);
2257 2185 zmp->zm_stime.value.ui64 = tmp;
2258 2186 tmp = zone->zone_wtime;
2259 2187 scalehrtime(&tmp);
2260 2188 zmp->zm_wtime.value.ui64 = tmp;
2261 2189
2262 2190 zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
2263 2191 zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
2264 2192 zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
2265 2193
2266 2194 zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
2267 2195 zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
2268 2196 zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
2269 2197 zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
2270 2198
2271 - zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim;
2272 -
2273 2199 zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
2274 2200
2275 2201 zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
2276 2202 zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
2277 2203
2278 2204 return (0);
2279 2205 }
2280 2206
2281 2207 static kstat_t *
2282 2208 zone_misc_kstat_create(zone_t *zone)
2283 2209 {
2284 2210 kstat_t *ksp;
2285 2211 zone_misc_kstat_t *zmp;
2286 2212
2287 2213 if ((ksp = kstat_create_zone("zones", zone->zone_id,
2288 2214 zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
2289 2215 sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
2290 2216 KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2291 2217 return (NULL);
2292 2218
2293 2219 if (zone->zone_id != GLOBAL_ZONEID)
2294 2220 kstat_zone_add(ksp, GLOBAL_ZONEID);
2295 2221
2296 2222 zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
2297 2223 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2298 2224 ksp->ks_lock = &zone->zone_misc_lock;
2299 2225 zone->zone_misc_stats = zmp;
2300 2226
2301 2227 /* The kstat "name" field is not large enough for a full zonename */
2302 2228 kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2303 2229 kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2304 2230 kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
2305 2231 kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
2306 2232 kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
2307 2233 kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
2308 2234 kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
2309 2235 kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
2310 2236 KSTAT_DATA_UINT32);
2311 2237 kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
2312 2238 kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
2313 2239 KSTAT_DATA_UINT32);
2314 2240 kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
2315 2241 kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2316 - kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim",
2317 - KSTAT_DATA_UINT32);
2318 2242 kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2319 2243 KSTAT_DATA_UINT32);
2320 2244 kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2321 2245 kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2322 2246
2323 2247 ksp->ks_update = zone_misc_kstat_update;
2324 2248 ksp->ks_private = zone;
2325 2249
2326 2250 kstat_install(ksp);
2327 2251 return (ksp);
2328 2252 }
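The avenrun values exported here follow the system-wide fixed-point load-average convention, so consumers scale the raw ui32 before display. A hedged conversion, assuming the customary 8-bit fixed point (FSCALE == 256, per sys/param.h):

    #include <stdint.h>

    static double
    avenrun_to_load(uint32_t raw)
    {
            return ((double)raw / 256.0);   /* assumes FSCALE == 256 */
    }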
2329 2253
2330 2254 static void
2331 2255 zone_kstat_create(zone_t *zone)
2332 2256 {
2333 - zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
2257 + zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2334 2258 "lockedmem", zone_lockedmem_kstat_update);
2335 - zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
2259 + zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2336 2260 "swapresv", zone_swapresv_kstat_update);
2337 - zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
2261 + zone->zone_physmem_kstat = zone_kstat_create_common(zone,
2338 2262 "physicalmem", zone_physmem_kstat_update);
2339 - zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
2263 + zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2340 2264 "nprocs", zone_nprocs_kstat_update);
2341 2265
2342 2266 if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
2343 2267 zone->zone_vfs_stats = kmem_zalloc(
2344 2268 sizeof (zone_vfs_kstat_t), KM_SLEEP);
2345 2269 }
2346 2270
2347 - if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
2348 - zone->zone_zfs_stats = kmem_zalloc(
2349 - sizeof (zone_zfs_kstat_t), KM_SLEEP);
2350 - }
2351 -
2352 2271 if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2353 2272 zone->zone_mcap_stats = kmem_zalloc(
2354 2273 sizeof (zone_mcap_kstat_t), KM_SLEEP);
2355 2274 }
2356 2275
2357 2276 if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2358 2277 zone->zone_misc_stats = kmem_zalloc(
2359 2278 sizeof (zone_misc_kstat_t), KM_SLEEP);
2360 2279 }
2361 2281 }
2362 2282
2363 2283 static void
2364 2284 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2365 2285 {
2366 2286 void *data;
2367 2287
2368 2288 if (*pkstat != NULL) {
2369 2289 data = (*pkstat)->ks_data;
2370 2290 kstat_delete(*pkstat);
2371 2291 kmem_free(data, datasz);
2372 2292 *pkstat = NULL;
2373 2293 }
2374 2294 }
2375 2295
2376 2296 static void
2377 2297 zone_kstat_delete(zone_t *zone)
2378 2298 {
2379 2299 zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2380 2300 sizeof (zone_kstat_t));
2381 2301 zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2382 2302 sizeof (zone_kstat_t));
2383 2303 zone_kstat_delete_common(&zone->zone_physmem_kstat,
2384 2304 sizeof (zone_kstat_t));
2385 2305 zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2386 2306 sizeof (zone_kstat_t));
2387 2307
2388 2308 zone_kstat_delete_common(&zone->zone_vfs_ksp,
2389 2309 sizeof (zone_vfs_kstat_t));
2390 - zone_kstat_delete_common(&zone->zone_zfs_ksp,
2391 - sizeof (zone_zfs_kstat_t));
2392 2310 zone_kstat_delete_common(&zone->zone_mcap_ksp,
2393 2311 sizeof (zone_mcap_kstat_t));
2394 2312 zone_kstat_delete_common(&zone->zone_misc_ksp,
2395 2313 sizeof (zone_misc_kstat_t));
2396 2315 }
2397 2316
2398 2317 /*
2399 2318 * Called very early on in boot to initialize the ZSD list so that
2400 2319 * zone_key_create() can be called before zone_init(). It also initializes
2401 2320 * portions of zone0 which may be used before zone_init() is called. The
2402 2321 * variable "global_zone" will be set when zone0 is fully initialized by
2403 2322 * zone_init().
2404 2323 */
2405 2324 void
2406 2325 zone_zsd_init(void)
2407 2326 {
2408 2327 mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2409 2328 mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2410 2329 list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2411 2330 offsetof(struct zsd_entry, zsd_linkage));
2412 2331 list_create(&zone_active, sizeof (zone_t),
2413 2332 offsetof(zone_t, zone_linkage));
2414 2333 list_create(&zone_deathrow, sizeof (zone_t),
2415 2334 offsetof(zone_t, zone_linkage));
2416 2335
2417 2336 mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2418 2337 mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2419 2338 mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2420 2339 zone0.zone_shares = 1;
2421 2340 zone0.zone_nlwps = 0;
2422 2341 zone0.zone_nlwps_ctl = INT_MAX;
2423 2342 zone0.zone_nprocs = 0;
2424 2343 zone0.zone_nprocs_ctl = INT_MAX;
2425 2344 zone0.zone_locked_mem = 0;
2426 2345 zone0.zone_locked_mem_ctl = UINT64_MAX;
2427 2346 ASSERT(zone0.zone_max_swap == 0);
2428 2347 zone0.zone_max_swap_ctl = UINT64_MAX;
2429 2348 zone0.zone_phys_mem = 0;
2430 2349 zone0.zone_phys_mem_ctl = UINT64_MAX;
2431 2350 zone0.zone_max_lofi = 0;
2432 2351 zone0.zone_max_lofi_ctl = UINT64_MAX;
2433 2352 zone0.zone_shmmax = 0;
2434 2353 zone0.zone_ipc.ipcq_shmmni = 0;
2435 2354 zone0.zone_ipc.ipcq_semmni = 0;
2436 2355 zone0.zone_ipc.ipcq_msgmni = 0;
2437 2356 zone0.zone_name = GLOBAL_ZONENAME;
2438 2357 zone0.zone_nodename = utsname.nodename;
2439 2358 zone0.zone_domain = srpc_domain;
2440 2359 zone0.zone_hostid = HW_INVALID_HOSTID;
2441 2360 zone0.zone_fs_allowed = NULL;
2442 2361 zone0.zone_ref = 1;
2443 2362 zone0.zone_id = GLOBAL_ZONEID;
2444 2363 zone0.zone_status = ZONE_IS_RUNNING;
2445 2364 zone0.zone_rootpath = "/";
2446 2365 zone0.zone_rootpathlen = 2;
2447 2366 zone0.zone_psetid = ZONE_PS_INVAL;
2448 2367 zone0.zone_ncpus = 0;
2449 2368 zone0.zone_ncpus_online = 0;
2450 2369 zone0.zone_proc_initpid = 1;
2451 2370 zone0.zone_initname = initname;
2452 2371 zone0.zone_lockedmem_kstat = NULL;
2453 2372 zone0.zone_swapresv_kstat = NULL;
2454 2373 zone0.zone_physmem_kstat = NULL;
2455 2374 zone0.zone_nprocs_kstat = NULL;
2456 2375 zone0.zone_zfs_io_pri = 1;
2376 +
2457 2377 zone0.zone_stime = 0;
2458 2378 zone0.zone_utime = 0;
2459 2379 zone0.zone_wtime = 0;
2460 2380
2461 2381 list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2462 2382 offsetof(zone_ref_t, zref_linkage));
2463 2383 list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2464 2384 offsetof(struct zsd_entry, zsd_linkage));
2465 2385 list_insert_head(&zone_active, &zone0);
2466 2386
2467 2387 /*
2468 2388 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2469 2389 * to anything meaningful. It is assigned to be 'rootdir' in
2470 2390 * vfs_mountroot().
2471 2391 */
2472 2392 zone0.zone_rootvp = NULL;
2473 2393 zone0.zone_vfslist = NULL;
2474 2394 zone0.zone_bootargs = initargs;
2475 2395 zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2476 2396 /*
2477 2397 * The global zone has all privileges
2478 2398 */
2479 2399 priv_fillset(zone0.zone_privset);
2480 2400 /*
2481 2401 * Add p0 to the global zone
2482 2402 */
2483 2403 zone0.zone_zsched = &p0;
2484 2404 p0.p_zone = &zone0;
2485 2405 }
2486 2406
2487 2407 /*
2488 2408 * Compute a hash value based on the contents of the label and the DOI. The
2489 2409 * hash algorithm is somewhat arbitrary, but is based on the observation that
2490 2410 * humans will likely pick labels that differ by amounts that work out to be
2491 2411 * multiples of the number of hash chains, and thus stirring in some primes
2492 2412 * should help.
2493 2413 */
2494 2414 static uint_t
2495 2415 hash_bylabel(void *hdata, mod_hash_key_t key)
2496 2416 {
2497 2417 const ts_label_t *lab = (ts_label_t *)key;
2498 2418 const uint32_t *up, *ue;
2499 2419 uint_t hash;
2500 2420 int i;
2501 2421
2502 2422 _NOTE(ARGUNUSED(hdata));
2503 2423
2504 2424 hash = lab->tsl_doi + (lab->tsl_doi << 1);
2505 2425 /* we depend on alignment of label, but not representation */
2506 2426 up = (const uint32_t *)&lab->tsl_label;
2507 2427 ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2508 2428 i = 1;
2509 2429 while (up < ue) {
2510 2430 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2511 2431 hash += *up + (*up << ((i % 16) + 1));
2512 2432 up++;
2513 2433 i++;
2514 2434 }
2515 2435 return (hash);
2516 2436 }
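The same stirring idiom, lifted out as a standalone sketch over an arbitrary word-aligned buffer (purely illustrative; the kernel routine above is authoritative):

    #include <stdint.h>
    #include <stddef.h>

    static unsigned int
    stir_hash(const uint32_t *buf, size_t nwords, uint32_t seed)
    {
            unsigned int hash = seed + (seed << 1); /* seed * 3 */
            int i = 1;

            while (nwords-- != 0) {
                    /* 2^n + 1, 1 <= n <= 16, as a source of many primes */
                    hash += *buf + (*buf << ((i % 16) + 1));
                    buf++;
                    i++;
            }
            return (hash);
    }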
2517 2437
2518 2438 /*
2519 2439 * All that mod_hash cares about here is zero (equal) versus non-zero (not
2520 2440 * equal). This may need to be changed if less than / greater than is ever
2521 2441 * needed.
2522 2442 */
2523 2443 static int
2524 2444 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2525 2445 {
2526 2446 ts_label_t *lab1 = (ts_label_t *)key1;
2527 2447 ts_label_t *lab2 = (ts_label_t *)key2;
2528 2448
2529 2449 return (label_equal(lab1, lab2) ? 0 : 1);
2530 2450 }
2531 2451
2532 2452 /*
2533 2453 * Called by main() to initialize the zones framework.
2534 2454 */
2535 2455 void
2536 2456 zone_init(void)
2537 2457 {
2538 2458 rctl_dict_entry_t *rde;
2539 2459 rctl_val_t *dval;
2540 2460 rctl_set_t *set;
2541 2461 rctl_alloc_gp_t *gp;
2542 2462 rctl_entity_p_t e;
2543 2463 int res;
2544 2464
2545 2465 ASSERT(curproc == &p0);
2546 2466
2547 2467 /*
2548 2468 * Create ID space for zone IDs. ID 0 is reserved for the
2549 2469 * global zone.
2550 2470 */
2551 2471 zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2552 2472
2553 2473 /*
2554 2474 * Initialize generic zone resource controls, if any.
2555 2475 */
2556 2476 rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2557 2477 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2558 2478 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2559 2479 FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2560 2480
2561 2481 rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2562 2482 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2563 2483 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2564 2484 RCTL_GLOBAL_INFINITE,
2565 2485 MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2566 2486
2567 - rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
2568 - RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2569 - RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2570 - MAXCAP, MAXCAP, &zone_cpu_base_ops);
2571 -
2572 - rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
2573 - RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2574 - RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2575 - INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
2576 -
2577 2487 rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2578 2488 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2579 2489 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2580 2490 16384, 16384, &zone_zfs_io_pri_ops);
2581 2491
2582 2492 rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2583 2493 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2584 2494 INT_MAX, INT_MAX, &zone_lwps_ops);
2585 2495
2586 2496 rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2587 2497 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2588 2498 INT_MAX, INT_MAX, &zone_procs_ops);
2589 2499
2590 2500 /*
2591 2501 * System V IPC resource controls
2592 2502 */
2593 2503 rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2594 2504 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2595 2505 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2596 2506
2597 2507 rc_zone_semmni = rctl_register("zone.max-sem-ids",
2598 2508 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2599 2509 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2600 2510
2601 2511 rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2602 2512 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2603 2513 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2604 2514
2605 2515 rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2606 2516 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2607 2517 RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2608 2518
2609 2519 /*
2610 2520 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
2611 2521 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2612 2522 */
2613 2523 dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2614 2524 bzero(dval, sizeof (rctl_val_t));
2615 2525 dval->rcv_value = 1;
2616 2526 dval->rcv_privilege = RCPRIV_PRIVILEGED;
2617 2527 dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2618 2528 dval->rcv_action_recip_pid = -1;
2619 2529
2620 2530 rde = rctl_dict_lookup("zone.cpu-shares");
2621 2531 (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2622 2532
2623 - /*
2624 - * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
2625 - * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'.
2626 - */
2627 - dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2628 - bzero(dval, sizeof (rctl_val_t));
2629 - dval->rcv_value = 1;
2630 - dval->rcv_privilege = RCPRIV_PRIVILEGED;
2631 - dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2632 - dval->rcv_action_recip_pid = -1;
2633 -
2634 - rde = rctl_dict_lookup("zone.zfs-io-priority");
2635 - (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2636 -
2637 2533 rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2638 2534 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2639 2535 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2640 2536 &zone_locked_mem_ops);
2641 2537
2642 2538 rc_zone_max_swap = rctl_register("zone.max-swap",
2643 2539 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2644 2540 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2645 2541 &zone_max_swap_ops);
2646 2542
2647 2543 rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2648 2544 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2649 2545 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2650 2546 &zone_phys_mem_ops);
2651 2547
2652 2548 rc_zone_max_lofi = rctl_register("zone.max-lofi",
2653 2549 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2654 2550 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2655 2551 &zone_max_lofi_ops);
2656 2552
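Once registered, all of these controls are administered with the standard rctl tooling; for example, a privileged administrator could cap a zone's LWP count with something like "prctl -n zone.max-lwps -v 2000 -r -i zone myzone" (zone name illustrative).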
2657 2553 /*
2658 2554 * Initialize the ``global zone''.
2659 2555 */
2660 2556 set = rctl_set_create();
2661 2557 gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2662 2558 mutex_enter(&p0.p_lock);
2663 2559 e.rcep_p.zone = &zone0;
2664 2560 e.rcep_t = RCENTITY_ZONE;
2665 2561 zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2666 2562 gp);
2667 2563
2668 2564 zone0.zone_nlwps = p0.p_lwpcnt;
2669 2565 zone0.zone_nprocs = 1;
2670 2566 zone0.zone_ntasks = 1;
2671 2567 mutex_exit(&p0.p_lock);
2672 2568 zone0.zone_restart_init = B_TRUE;
2673 2569 zone0.zone_reboot_on_init_exit = B_FALSE;
2674 2570 zone0.zone_init_status = -1;
2675 2571 zone0.zone_brand = &native_brand;
2676 2572 rctl_prealloc_destroy(gp);
2677 2573 /*
2678 2574 * pool_default hasn't been initialized yet, so we let pool_init()
2679 2575 * take care of making sure the global zone is in the default pool.
2680 2576 */
2681 2577
2682 2578 /*
2683 2579 * Initialize global zone kstats
2684 2580 */
2685 2581 zone_kstat_create(&zone0);
2686 2582
2687 2583 /*
2688 2584 * Initialize zone label.
2689 2585 * mlp are initialized when tnzonecfg is loaded.
2690 2586 */
2691 2587 zone0.zone_slabel = l_admin_low;
2692 2588 rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2693 2589 label_hold(l_admin_low);
2694 2590
2695 2591 /*
2696 2592 * Initialise the lock for the database structure used by mntfs.
2697 2593 */
2698 2594 rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2699 2595
2700 2596 mutex_enter(&zonehash_lock);
2701 2597 zone_uniqid(&zone0);
2702 2598 ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2703 2599
2704 2600 zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2705 2601 mod_hash_null_valdtor);
2706 2602 zonehashbyname = mod_hash_create_strhash("zone_by_name",
2707 2603 zone_hash_size, mod_hash_null_valdtor);
2708 2604 /*
2709 2605 * maintain zonehashbylabel only for labeled systems
2710 2606 */
2711 2607 if (is_system_labeled())
2712 2608 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2713 2609 zone_hash_size, mod_hash_null_keydtor,
2714 2610 mod_hash_null_valdtor, hash_bylabel, NULL,
2715 2611 hash_labelkey_cmp, KM_SLEEP);
2716 2612 zonecount = 1;
2717 2613
2718 2614 (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2719 2615 (mod_hash_val_t)&zone0);
2720 2616 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2721 2617 (mod_hash_val_t)&zone0);
2722 2618 if (is_system_labeled()) {
2723 2619 zone0.zone_flags |= ZF_HASHED_LABEL;
2724 2620 (void) mod_hash_insert(zonehashbylabel,
2725 2621 (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2726 2622 }
2727 2623 mutex_exit(&zonehash_lock);
2728 2624
2729 2625 /*
2730 2626 * We avoid setting zone_kcred until now, since kcred is initialized
2731 2627 * sometime after zone_zsd_init() and before zone_init().
2732 2628 */
2733 2629 zone0.zone_kcred = kcred;
2734 2630 /*
2735 2631 * The global zone is fully initialized (except for zone_rootvp which
2736 2632 * will be set when the root filesystem is mounted).
2737 2633 */
2738 2634 global_zone = &zone0;
2739 2635
2740 2636 /*
2741 2637 * Setup an event channel to send zone status change notifications on
2742 2638 */
2743 2639 res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2744 2640 EVCH_CREAT);
2745 2641
2746 2642 if (res)
2747 2643 panic("Sysevent_evc_bind failed during zone setup.\n");
2748 2644
2749 2645 }
2750 2646
2751 2647 static void
2752 2648 zone_free(zone_t *zone)
2753 2649 {
2754 2650 zone_dl_t *zdl;
2755 2651
2756 2652 ASSERT(zone != global_zone);
2757 2653 ASSERT(zone->zone_ntasks == 0);
2758 2654 ASSERT(zone->zone_nlwps == 0);
2759 2655 ASSERT(zone->zone_nprocs == 0);
2760 2656 ASSERT(zone->zone_cred_ref == 0);
2761 2657 ASSERT(zone->zone_kcred == NULL);
2762 2658 ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2763 2659 zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2764 2660 ASSERT(list_is_empty(&zone->zone_ref_list));
2765 2661
2766 2662 /*
2767 2663 * Remove any zone caps.
2768 2664 */
2769 2665 cpucaps_zone_remove(zone);
2770 2666
2771 2667 ASSERT(zone->zone_cpucap == NULL);
2772 2668
2773 2669 /* remove from deathrow list */
2774 2670 if (zone_status_get(zone) == ZONE_IS_DEAD) {
2775 2671 ASSERT(zone->zone_ref == 0);
2776 2672 mutex_enter(&zone_deathrow_lock);
2777 2673 list_remove(&zone_deathrow, zone);
2778 2674 mutex_exit(&zone_deathrow_lock);
2779 2675 }
2780 2676
2781 2677 list_destroy(&zone->zone_ref_list);
2782 2678 zone_free_zsd(zone);
2783 2679 zone_free_datasets(zone);
2784 2680
2785 2681 /*
2786 2682 * While dlmgmtd should have removed all of these, it could have left
2787 2683 * something behind or crashed. In which case it's not safe for us to
2788 2684 * assume that the list is empty which list_destroy() will ASSERT. We
2789 2685 * clean up for our userland comrades which may have crashed, or worse,
2790 2686 * been disabled by SMF.
2791 2687 */
2792 2688 while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
2793 2689 if (zdl->zdl_net != NULL)
2794 2690 nvlist_free(zdl->zdl_net);
2795 2691 kmem_free(zdl, sizeof (zone_dl_t));
2796 2692 }
2797 2693 list_destroy(&zone->zone_dl_list);
2798 2694
2799 2695 if (zone->zone_rootvp != NULL)
2800 2696 VN_RELE(zone->zone_rootvp);
2801 2697 if (zone->zone_rootpath)
2802 2698 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2803 2699 if (zone->zone_name != NULL)
2804 2700 kmem_free(zone->zone_name, ZONENAME_MAX);
2805 2701 if (zone->zone_slabel != NULL)
2806 2702 label_rele(zone->zone_slabel);
2807 2703 if (zone->zone_nodename != NULL)
2808 2704 kmem_free(zone->zone_nodename, _SYS_NMLN);
2809 2705 if (zone->zone_domain != NULL)
2810 2706 kmem_free(zone->zone_domain, _SYS_NMLN);
2811 2707 if (zone->zone_privset != NULL)
2812 2708 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2813 2709 if (zone->zone_rctls != NULL)
2814 2710 rctl_set_free(zone->zone_rctls);
2815 2711 if (zone->zone_bootargs != NULL)
2816 2712 strfree(zone->zone_bootargs);
2817 2713 if (zone->zone_initname != NULL)
2818 2714 strfree(zone->zone_initname);
2819 2715 if (zone->zone_fs_allowed != NULL)
2820 2716 strfree(zone->zone_fs_allowed);
2821 2717 if (zone->zone_pfexecd != NULL)
2822 2718 klpd_freelist(&zone->zone_pfexecd);
2823 2719 id_free(zoneid_space, zone->zone_id);
2824 2720 mutex_destroy(&zone->zone_lock);
2825 2721 cv_destroy(&zone->zone_cv);
2826 2722 rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2827 2723 rw_destroy(&zone->zone_mntfs_db_lock);
2828 2724 kmem_free(zone, sizeof (zone_t));
2829 2725 }
2830 2726
2831 2727 /*
2832 2728 * See block comment at the top of this file for information about zone
2833 2729 * status values.
2834 2730 */
2835 2731 /*
2836 2732 * Convenience function for setting zone status.
2837 2733 */
2838 2734 static void
2839 2735 zone_status_set(zone_t *zone, zone_status_t status)
2840 2736 {
2841 - timestruc_t now;
2842 - uint64_t t;
2843 2737
2844 2738 nvlist_t *nvl = NULL;
2845 2739 ASSERT(MUTEX_HELD(&zone_status_lock));
2846 2740 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2847 2741 status >= zone_status_get(zone));
2848 2742
2849 - /* Current time since Jan 1 1970 but consumers expect NS */
2850 - gethrestime(&now);
2851 - t = (now.tv_sec * NANOSEC) + now.tv_nsec;
2852 -
2853 2743 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2854 2744 nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2855 2745 nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2856 2746 zone_status_table[status]) ||
2857 2747 nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2858 2748 zone_status_table[zone->zone_status]) ||
2859 2749 nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2860 - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
2750 + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2861 2751 sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2862 2752 ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2863 2753 #ifdef DEBUG
2864 2754 (void) printf(
2865 2755 "Failed to allocate and send zone state change event.\n");
2866 2756 #endif
2867 2757 }
2868 2758 nvlist_free(nvl);
2869 2759
2870 2760 zone->zone_status = status;
2871 2761
2872 2762 cv_broadcast(&zone->zone_cv);
2873 2763 }
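User-land consumers can watch these transitions by subscribing to the same channel. A hedged sketch against the libsysevent evc interfaces, reusing the channel and nvlist key macros from sys/zone.h that appear above; the subscriber ID "zwatch" is illustrative and error handling is elided:

    #include <libsysevent.h>
    #include <libnvpair.h>
    #include <sys/zone.h>
    #include <stdio.h>

    static int
    zone_event_handler(sysevent_t *ev, void *cookie)
    {
            nvlist_t *nvl = NULL;
            char *zname, *newstate;

            if (sysevent_get_attr_list(ev, &nvl) == 0 &&
                nvlist_lookup_string(nvl, ZONE_CB_NAME, &zname) == 0 &&
                nvlist_lookup_string(nvl, ZONE_CB_NEWSTATE, &newstate) == 0)
                    (void) printf("%s -> %s\n", zname, newstate);
            nvlist_free(nvl);
            return (0);
    }

    static int
    watch_zone_events(evchan_t **chp)
    {
            if (sysevent_evc_bind(ZONE_EVENT_CHANNEL, chp, 0) != 0)
                    return (-1);
            return (sysevent_evc_subscribe(*chp, "zwatch",
                ZONE_EVENT_STATUS_CLASS, zone_event_handler, NULL, 0));
    }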
2874 2764
2875 2765 /*
2876 2766 * Public function to retrieve the zone status. The zone status may
2877 2767 * change after it is retrieved.
2878 2768 */
2879 2769 zone_status_t
2880 2770 zone_status_get(zone_t *zone)
2881 2771 {
2882 2772 return (zone->zone_status);
2883 2773 }
2884 2774
2885 2775 static int
2886 2776 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2887 2777 {
2888 2778 char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2889 2779 int err = 0;
2890 2780
2891 2781 ASSERT(zone != global_zone);
2892 2782 if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2893 2783 goto done; /* EFAULT or ENAMETOOLONG */
2894 2784
2895 2785 if (zone->zone_bootargs != NULL)
2896 2786 strfree(zone->zone_bootargs);
2897 2787
2898 2788 zone->zone_bootargs = strdup(buf);
2899 2789
2900 2790 done:
2901 2791 kmem_free(buf, BOOTARGS_MAX);
2902 2792 return (err);
2903 2793 }
2904 2794
2905 2795 static int
2906 2796 zone_set_brand(zone_t *zone, const char *brand)
2907 2797 {
2908 2798 struct brand_attr *attrp;
2909 2799 brand_t *bp;
2910 2800
2911 2801 attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2912 2802 if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2913 2803 kmem_free(attrp, sizeof (struct brand_attr));
2914 2804 return (EFAULT);
2915 2805 }
2916 2806
2917 2807 bp = brand_register_zone(attrp);
2918 2808 kmem_free(attrp, sizeof (struct brand_attr));
2919 2809 if (bp == NULL)
2920 2810 return (EINVAL);
2921 2811
2922 2812 /*
2923 2813 	 * This is the only place where a zone can change its brand.
2924 2814 * We already need to hold zone_status_lock to check the zone
2925 2815 * status, so we'll just use that lock to serialize zone
2926 2816 * branding requests as well.
2927 2817 */
2928 2818 mutex_enter(&zone_status_lock);
2929 2819
2930 2820 /* Re-Branding is not allowed and the zone can't be booted yet */
2931 2821 if ((ZONE_IS_BRANDED(zone)) ||
2932 2822 (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2933 2823 mutex_exit(&zone_status_lock);
2934 2824 brand_unregister_zone(bp);
2935 2825 return (EINVAL);
2936 2826 }
2937 2827
2938 2828 /*
2939 2829 * Set up the brand specific data.
2940 2830 * Note that it's possible that the hook has to drop the
2941 2831 	 * zone_status_lock and reacquire it before returning so we can't
2942 2832 * assume the lock has been held the entire time.
2943 2833 */
2944 2834 zone->zone_brand = bp;
2945 2835 ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
2946 2836
2947 2837 mutex_exit(&zone_status_lock);
2948 2838 return (0);
2949 2839 }
2950 2840
2951 2841 static int
2952 2842 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2953 2843 {
2954 2844 char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2955 2845 int err = 0;
2956 2846
2957 2847 ASSERT(zone != global_zone);
2958 2848 if ((err = copyinstr(zone_fs_allowed, buf,
2959 2849 ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2960 2850 goto done;
2961 2851
2962 2852 if (zone->zone_fs_allowed != NULL)
2963 2853 strfree(zone->zone_fs_allowed);
2964 2854
2965 2855 zone->zone_fs_allowed = strdup(buf);
2966 2856
2967 2857 done:
2968 2858 kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2969 2859 return (err);
2970 2860 }
2971 2861
2972 2862 static int
2973 2863 zone_set_initname(zone_t *zone, const char *zone_initname)
2974 2864 {
2975 2865 char initname[INITNAME_SZ];
2976 2866 size_t len;
2977 2867 int err = 0;
2978 2868
2979 2869 ASSERT(zone != global_zone);
2980 2870 if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2981 2871 return (err); /* EFAULT or ENAMETOOLONG */
2982 2872
2983 2873 if (zone->zone_initname != NULL)
2984 2874 strfree(zone->zone_initname);
2985 2875
2986 2876 zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2987 2877 (void) strcpy(zone->zone_initname, initname);
2988 2878 return (0);
2989 2879 }
2990 2880
2991 2881 /*
2992 2882 * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
2993 2883 * to provide the physical memory capping kstats. Since physical memory
2994 2884 * capping is currently implemented in userland, that code uses the setattr
2995 2885  * entry point to increment the kstats. We simply increment nover on
2996 2886  * each setattr call, and add the caller's input value to
2997 2887  * zone_mcap_pagedout on each call.
2998 2888 */
2999 2889 /*ARGSUSED*/
3000 2890 static int
3001 2891 zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
3002 2892 {
3003 2893 zone->zone_mcap_nover++;
3004 2894
3005 2895 return (0);
3006 2896 }
3007 2897
3008 2898 static int
3009 2899 zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
3010 2900 {
3011 2901 uint64_t pageout;
3012 2902 int err;
3013 2903
3014 2904 if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
3015 2905 zone->zone_mcap_pagedout += pageout;
3016 2906
3017 2907 return (err);
3018 2908 }
3019 2909
3020 2910 /*
3021 2911 * The zone_set_page_fault_delay function is used to set the number of usecs
3022 2912 * to throttle page faults. This is normally 0 but can be set to a non-0 value
3023 2913  * by the user-land memory capping code when the zone is over its physical
3024 2914 * memory cap.
3025 2915 */
3026 2916 static int
3027 2917 zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
3028 2918 {
3029 2919 uint32_t dusec;
3030 2920 int err;
3031 2921
3032 2922 if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
3033 2923 zone->zone_pg_flt_delay = dusec;
3034 2924
3035 2925 return (err);
3036 2926 }
3037 2927
3038 2928 /*
3039 2929 * The zone_set_rss function is used to set the zone's RSS when we do the
3040 2930 * fast, approximate calculation in user-land.
3041 2931 */
3042 2932 static int
3043 2933 zone_set_rss(zone_t *zone, const uint64_t *prss)
3044 2934 {
3045 2935 uint64_t rss;
3046 2936 int err;
3047 2937
3048 2938 if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
3049 2939 zone->zone_phys_mem = rss;
3050 2940
3051 2941 return (err);
3052 2942 }
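For context, the user-land half of this protocol drives the hooks above through zone_setattr(2). A hedged sketch; ZONE_ATTR_RSS is assumed for illustration (the actual attribute constants live in sys/zone.h and are not visible in this hunk):

    #include <sys/types.h>
    #include <zone.h>

    static int
    publish_rss(zoneid_t zid, uint64_t rss)
    {
            /* ZONE_ATTR_RSS is illustrative; see sys/zone.h. */
            return (zone_setattr(zid, ZONE_ATTR_RSS, &rss, sizeof (rss)));
    }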
3053 2943
3054 2944 static int
3055 2945 zone_set_sched_class(zone_t *zone, const char *new_class)
3056 2946 {
3057 2947 char sched_class[PC_CLNMSZ];
3058 2948 id_t classid;
3059 2949 int err;
3060 2950
3061 2951 ASSERT(zone != global_zone);
3062 2952 if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
3063 2953 return (err); /* EFAULT or ENAMETOOLONG */
3064 2954
3065 2955 if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
3066 2956 return (set_errno(EINVAL));
3067 2957 zone->zone_defaultcid = classid;
3068 2958 ASSERT(zone->zone_defaultcid > 0 &&
3069 2959 zone->zone_defaultcid < loaded_classes);
3070 2960
3071 2961 return (0);
3072 2962 }
3073 2963
3074 2964 /*
3075 2965 * Block indefinitely waiting for (zone_status >= status)
3076 2966 */
3077 2967 void
3078 2968 zone_status_wait(zone_t *zone, zone_status_t status)
3079 2969 {
3080 2970 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3081 2971
3082 2972 mutex_enter(&zone_status_lock);
3083 2973 while (zone->zone_status < status) {
3084 2974 cv_wait(&zone->zone_cv, &zone_status_lock);
3085 2975 }
3086 2976 mutex_exit(&zone_status_lock);
3087 2977 }
3088 2978
3089 2979 /*
3090 2980 * Private CPR-safe version of zone_status_wait().
3091 2981 */
3092 2982 static void
3093 2983 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
3094 2984 {
3095 2985 callb_cpr_t cprinfo;
3096 2986
3097 2987 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3098 2988
3099 2989 CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
3100 2990 str);
3101 2991 mutex_enter(&zone_status_lock);
3102 2992 while (zone->zone_status < status) {
3103 2993 CALLB_CPR_SAFE_BEGIN(&cprinfo);
3104 2994 cv_wait(&zone->zone_cv, &zone_status_lock);
3105 2995 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
3106 2996 }
3107 2997 /*
3108 2998 * zone_status_lock is implicitly released by the following.
3109 2999 */
3110 3000 CALLB_CPR_EXIT(&cprinfo);
3111 3001 }
3112 3002
3113 3003 /*
3114 3004 * Block until zone enters requested state or signal is received. Return (0)
3115 3005 * if signaled, non-zero otherwise.
3116 3006 */
3117 3007 int
3118 3008 zone_status_wait_sig(zone_t *zone, zone_status_t status)
3119 3009 {
3120 3010 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3121 3011
3122 3012 mutex_enter(&zone_status_lock);
3123 3013 while (zone->zone_status < status) {
3124 3014 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
3125 3015 mutex_exit(&zone_status_lock);
3126 3016 return (0);
3127 3017 }
3128 3018 }
3129 3019 mutex_exit(&zone_status_lock);
3130 3020 return (1);
3131 3021 }
3132 3022
3133 3023 /*
3134 3024 * Block until the zone enters the requested state or the timeout expires,
3135 3025 * whichever happens first. Return (-1) if operation timed out, time remaining
3136 3026 * otherwise.
3137 3027 */
3138 3028 clock_t
3139 3029 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
3140 3030 {
3141 3031 clock_t timeleft = 0;
3142 3032
3143 3033 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3144 3034
3145 3035 mutex_enter(&zone_status_lock);
3146 3036 while (zone->zone_status < status && timeleft != -1) {
3147 3037 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
3148 3038 }
3149 3039 mutex_exit(&zone_status_lock);
3150 3040 return (timeleft);
3151 3041 }
3152 3042
3153 3043 /*
3154 3044 * Block until the zone enters the requested state, the current process is
3155 3045 * signaled, or the timeout expires, whichever happens first. Return (-1) if
3156 3046 * operation timed out, 0 if signaled, time remaining otherwise.
3157 3047 */
3158 3048 clock_t
3159 3049 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
3160 3050 {
3161 3051 clock_t timeleft = tim - ddi_get_lbolt();
3162 3052
3163 3053 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3164 3054
3165 3055 mutex_enter(&zone_status_lock);
3166 3056 while (zone->zone_status < status) {
3167 3057 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
3168 3058 tim);
3169 3059 if (timeleft <= 0)
3170 3060 break;
3171 3061 }
3172 3062 mutex_exit(&zone_status_lock);
3173 3063 return (timeleft);
3174 3064 }
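
A minimal caller sketch for these wait routines (hypothetical, for illustration; assumes SEC_TO_TICK() from <sys/time.h>): these functions take an absolute deadline in lbolt ticks, so a bounded, interruptible wait for a zone to come up looks like:

	/* Sketch only: wait up to ~5s for the zone to reach ZONE_IS_RUNNING. */
	clock_t deadline = ddi_get_lbolt() + SEC_TO_TICK(5);

	if (zone_status_timedwait_sig(zone, deadline, ZONE_IS_RUNNING) <= 0)
		return (EINTR);	/* gave up: timed out (-1) or signaled (0) */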
3175 3065
3176 3066 /*
3177 3067 * Zones have two reference counts: one for references from credential
3178 3068 * structures (zone_cred_ref), and one (zone_ref) for everything else.
3179 3069 * This is so we can allow a zone to be rebooted while there are still
3180 3070 * outstanding cred references, since certain drivers cache dblks (which
3181 3071 * implicitly results in cached creds). We wait for zone_ref to drop to
3182 3072 * 0 (actually 1), but not zone_cred_ref. The zone structure itself is
3183 3073 * later freed when the zone_cred_ref drops to 0, though nothing other
3184 3074 * than the zone id and privilege set should be accessed once the zone
3185 3075 * is "dead".
3186 3076 *
3187 3077 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
3188 3078 * to force halt/reboot to block waiting for the zone_cred_ref to drop
3189 3079 * to 0. This can be useful to flush out other sources of cached creds
3190 3080 * that may be less innocuous than the driver case.
3191 3081 *
3192 3082 * Zones also provide a tracked reference counting mechanism in which zone
3193 3083 * references are represented by "crumbs" (zone_ref structures). Crumbs help
3194 3084 * debuggers determine the sources of leaked zone references. See
3195 3085 * zone_hold_ref() and zone_rele_ref() below for more information.
3196 3086 */
3197 3087
3198 3088 int zone_wait_for_cred = 0;
3199 3089
3200 3090 static void
3201 3091 zone_hold_locked(zone_t *z)
3202 3092 {
3203 3093 ASSERT(MUTEX_HELD(&z->zone_lock));
3204 3094 z->zone_ref++;
3205 3095 ASSERT(z->zone_ref != 0);
3206 3096 }
3207 3097
3208 3098 /*
3209 3099 * Increment the specified zone's reference count. The zone's zone_t structure
3210 3100 * will not be freed as long as the zone's reference count is nonzero.
3211 3101 * Decrement the zone's reference count via zone_rele().
3212 3102 *
3213 3103 * NOTE: This function should only be used to hold zones for short periods of
3214 3104 * time. Use zone_hold_ref() if the zone must be held for a long time.
3215 3105 */
3216 3106 void
3217 3107 zone_hold(zone_t *z)
3218 3108 {
3219 3109 mutex_enter(&z->zone_lock);
3220 3110 zone_hold_locked(z);
3221 3111 mutex_exit(&z->zone_lock);
3222 3112 }
3223 3113
3224 3114 /*
3225 3115 * If the non-cred ref count drops to 1 and either the cred ref count
3226 3116 * is 0 or we aren't waiting for cred references, the zone is ready to
3227 3117 * be destroyed.
3228 3118 */
3229 3119 #define ZONE_IS_UNREF(zone) ((zone)->zone_ref == 1 && \
3230 3120 (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
3231 3121
3232 3122 /*
3233 3123 * Common zone reference release function invoked by zone_rele() and
3234 3124 * zone_rele_ref(). If subsys is ZONE_REF_NUM_SUBSYS, then the specified
3235 3125 * zone's subsystem-specific reference counters are not affected by the
3236 3126 * release. If ref is not NULL, then the zone_ref_t to which it refers is
3237 3127 * removed from the specified zone's reference list. ref must be non-NULL iff
3238 3128 * subsys is not ZONE_REF_NUM_SUBSYS.
3239 3129 */
3240 3130 static void
3241 3131 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
3242 3132 {
3243 3133 boolean_t wakeup;
3244 3134
3245 3135 mutex_enter(&z->zone_lock);
3246 3136 ASSERT(z->zone_ref != 0);
3247 3137 z->zone_ref--;
3248 3138 if (subsys != ZONE_REF_NUM_SUBSYS) {
3249 3139 ASSERT(z->zone_subsys_ref[subsys] != 0);
3250 3140 z->zone_subsys_ref[subsys]--;
3251 3141 list_remove(&z->zone_ref_list, ref);
3252 3142 }
3253 3143 if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
3254 3144 /* no more refs, free the structure */
3255 3145 mutex_exit(&z->zone_lock);
3256 3146 zone_free(z);
3257 3147 return;
3258 3148 }
3259 3149 /* signal zone_destroy so the zone can finish halting */
3260 3150 wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
3261 3151 mutex_exit(&z->zone_lock);
3262 3152
3263 3153 if (wakeup) {
3264 3154 /*
3265 3155 * Grabbing zonehash_lock here effectively synchronizes with
3266 3156 * zone_destroy() to avoid missed signals.
3267 3157 */
3268 3158 mutex_enter(&zonehash_lock);
3269 3159 cv_broadcast(&zone_destroy_cv);
3270 3160 mutex_exit(&zonehash_lock);
3271 3161 }
3272 3162 }
3273 3163
3274 3164 /*
3275 3165 * Decrement the specified zone's reference count. The specified zone will
3276 3166 * cease to exist after this function returns if the reference count drops to
3277 3167 * zero. This function should be paired with zone_hold().
3278 3168 */
3279 3169 void
3280 3170 zone_rele(zone_t *z)
3281 3171 {
3282 3172 zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
3283 3173 }
3284 3174
3285 3175 /*
3286 3176 * Initialize a zone reference structure. This function must be invoked for
3287 3177 * a reference structure before the structure is passed to zone_hold_ref().
3288 3178 */
3289 3179 void
3290 3180 zone_init_ref(zone_ref_t *ref)
3291 3181 {
3292 3182 ref->zref_zone = NULL;
3293 3183 list_link_init(&ref->zref_linkage);
3294 3184 }
3295 3185
3296 3186 /*
3297 3187 * Acquire a reference to zone z. The caller must specify the
3298 3188 * zone_ref_subsys_t constant associated with its subsystem. The specified
3299 3189 * zone_ref_t structure will represent a reference to the specified zone. Use
3300 3190 * zone_rele_ref() to release the reference.
3301 3191 *
3302 3192 * The referenced zone_t structure will not be freed as long as the zone_t's
3303 3193 * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
3304 3194 * references.
3305 3195 *
3306 3196 * NOTE: The zone_ref_t structure must be initialized before it is used.
3307 3197 * See zone_init_ref() above.
3308 3198 */
3309 3199 void
3310 3200 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
3311 3201 {
3312 3202 ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
3313 3203
3314 3204 /*
3315 3205 * Prevent consumers from reusing a reference structure before
3316 3206 * releasing it.
3317 3207 */
3318 3208 VERIFY(ref->zref_zone == NULL);
3319 3209
3320 3210 ref->zref_zone = z;
3321 3211 mutex_enter(&z->zone_lock);
3322 3212 zone_hold_locked(z);
3323 3213 z->zone_subsys_ref[subsys]++;
3324 3214 ASSERT(z->zone_subsys_ref[subsys] != 0);
3325 3215 list_insert_head(&z->zone_ref_list, ref);
3326 3216 mutex_exit(&z->zone_lock);
3327 3217 }
3328 3218
3329 3219 /*
3330 3220 * Release the zone reference represented by the specified zone_ref_t.
3331 3221 * The reference is invalid after it's released; however, the zone_ref_t
3332 3222 * structure can be reused without having to invoke zone_init_ref().
3333 3223 * subsys should be the same value that was passed to zone_hold_ref()
3334 3224 * when the reference was acquired.
3335 3225 */
3336 3226 void
3337 3227 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
3338 3228 {
3339 3229 zone_rele_common(ref->zref_zone, ref, subsys);
3340 3230
3341 3231 /*
3342 3232 * Set the zone_ref_t's zref_zone field to NULL to generate panics
3343 3233 * when consumers dereference the reference. This helps us catch
3344 3234 * consumers who use released references. Furthermore, this lets
3345 3235 * consumers reuse the zone_ref_t structure without having to
3346 3236 * invoke zone_init_ref().
3347 3237 */
3348 3238 ref->zref_zone = NULL;
3349 3239 }
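
A subsystem-side sketch of the tracked ("crumb") reference API above, assuming one of the existing zone_ref_subsys_t constants (ZONE_REF_NFS here); the structure must be initialized once before its first hold:

	zone_ref_t zref;

	zone_init_ref(&zref);			/* once, before first use */
	zone_hold_ref(zone, &zref, ZONE_REF_NFS);
	/* ... long-lived use of zone ... */
	zone_rele_ref(&zref, ZONE_REF_NFS);	/* zref is reusable afterwards */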
3350 3240
3351 3241 void
3352 3242 zone_cred_hold(zone_t *z)
3353 3243 {
3354 3244 mutex_enter(&z->zone_lock);
3355 3245 z->zone_cred_ref++;
3356 3246 ASSERT(z->zone_cred_ref != 0);
3357 3247 mutex_exit(&z->zone_lock);
3358 3248 }
3359 3249
3360 3250 void
3361 3251 zone_cred_rele(zone_t *z)
3362 3252 {
3363 3253 boolean_t wakeup;
3364 3254
3365 3255 mutex_enter(&z->zone_lock);
3366 3256 ASSERT(z->zone_cred_ref != 0);
3367 3257 z->zone_cred_ref--;
3368 3258 if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
3369 3259 /* no more refs, free the structure */
3370 3260 mutex_exit(&z->zone_lock);
3371 3261 zone_free(z);
3372 3262 return;
3373 3263 }
3374 3264 /*
3375 3265 * If zone_destroy is waiting for the cred references to drain
3376 3266 * out, and they have, signal it.
3377 3267 */
3378 3268 wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
3379 3269 zone_status_get(z) >= ZONE_IS_DEAD);
3380 3270 mutex_exit(&z->zone_lock);
3381 3271
3382 3272 if (wakeup) {
3383 3273 /*
3384 3274 * Grabbing zonehash_lock here effectively synchronizes with
3385 3275 * zone_destroy() to avoid missed signals.
3386 3276 */
3387 3277 mutex_enter(&zonehash_lock);
3388 3278 cv_broadcast(&zone_destroy_cv);
3389 3279 mutex_exit(&zonehash_lock);
3390 3280 }
3391 3281 }
3392 3282
3393 3283 void
3394 3284 zone_task_hold(zone_t *z)
3395 3285 {
3396 3286 mutex_enter(&z->zone_lock);
3397 3287 z->zone_ntasks++;
3398 3288 ASSERT(z->zone_ntasks != 0);
3399 3289 mutex_exit(&z->zone_lock);
3400 3290 }
3401 3291
3402 3292 void
3403 3293 zone_task_rele(zone_t *zone)
3404 3294 {
3405 3295 uint_t refcnt;
3406 3296
3407 3297 mutex_enter(&zone->zone_lock);
3408 3298 ASSERT(zone->zone_ntasks != 0);
3409 3299 refcnt = --zone->zone_ntasks;
3410 3300 if (refcnt > 1) { /* Common case */
3411 3301 mutex_exit(&zone->zone_lock);
3412 3302 return;
3413 3303 }
3414 3304 zone_hold_locked(zone); /* so we can use the zone_t later */
3415 3305 mutex_exit(&zone->zone_lock);
3416 3306 if (refcnt == 1) {
3417 3307 /*
3418 3308 * See if the zone is shutting down.
3419 3309 */
3420 3310 mutex_enter(&zone_status_lock);
3421 3311 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
3422 3312 goto out;
3423 3313 }
3424 3314
3425 3315 /*
3426 3316 * Make sure the ntasks didn't change since we
3427 3317 * dropped zone_lock.
3428 3318 */
3429 3319 mutex_enter(&zone->zone_lock);
3430 3320 if (refcnt != zone->zone_ntasks) {
3431 3321 mutex_exit(&zone->zone_lock);
3432 3322 goto out;
3433 3323 }
3434 3324 mutex_exit(&zone->zone_lock);
3435 3325
3436 3326 /*
3437 3327 * No more user processes in the zone. The zone is empty.
3438 3328 */
3439 3329 zone_status_set(zone, ZONE_IS_EMPTY);
3440 3330 goto out;
3441 3331 }
3442 3332
3443 3333 ASSERT(refcnt == 0);
3444 3334 /*
3445 3335 * zsched has exited; the zone is dead.
3446 3336 */
3447 3337 zone->zone_zsched = NULL; /* paranoia */
3448 3338 mutex_enter(&zone_status_lock);
3449 3339 zone_status_set(zone, ZONE_IS_DEAD);
3450 3340 out:
3451 3341 mutex_exit(&zone_status_lock);
3452 3342 zone_rele(zone);
3453 3343 }
3454 3344
3455 3345 zoneid_t
3456 3346 getzoneid(void)
3457 3347 {
3458 3348 return (curproc->p_zone->zone_id);
3459 3349 }
3460 3350
3461 -zoneid_t
3462 -getzonedid(void)
3463 -{
3464 - return (curproc->p_zone->zone_did);
3465 -}
3466 -
3467 3351 /*
3468 3352 * Internal versions of zone_find_by_*(). These don't zone_hold() or
3469 3353 * check the validity of a zone's state.
3470 3354 */
3471 3355 static zone_t *
3472 3356 zone_find_all_by_id(zoneid_t zoneid)
3473 3357 {
3474 3358 mod_hash_val_t hv;
3475 3359 zone_t *zone = NULL;
3476 3360
3477 3361 ASSERT(MUTEX_HELD(&zonehash_lock));
3478 3362
3479 3363 if (mod_hash_find(zonehashbyid,
3480 3364 (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3481 3365 zone = (zone_t *)hv;
3482 3366 return (zone);
3483 3367 }
3484 3368
3485 3369 static zone_t *
3486 3370 zone_find_all_by_label(const ts_label_t *label)
3487 3371 {
3488 3372 mod_hash_val_t hv;
3489 3373 zone_t *zone = NULL;
3490 3374
3491 3375 ASSERT(MUTEX_HELD(&zonehash_lock));
3492 3376
3493 3377 /*
3494 3378 * zonehashbylabel is not maintained for unlabeled systems
3495 3379 */
3496 3380 if (!is_system_labeled())
3497 3381 return (NULL);
3498 3382 if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3499 3383 zone = (zone_t *)hv;
3500 3384 return (zone);
3501 3385 }
3502 3386
3503 3387 static zone_t *
3504 3388 zone_find_all_by_name(char *name)
3505 3389 {
3506 3390 mod_hash_val_t hv;
3507 3391 zone_t *zone = NULL;
3508 3392
3509 3393 ASSERT(MUTEX_HELD(&zonehash_lock));
3510 3394
3511 3395 if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3512 3396 zone = (zone_t *)hv;
3513 3397 return (zone);
3514 3398 }
3515 3399
3516 3400 /*
3517 3401 * Public interface for looking up a zone by zoneid. Only returns the zone if
3518 3402 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3519 3403 * Caller must call zone_rele() once it is done with the zone.
3520 3404 *
3521 3405 * The zone may begin the zone_destroy() sequence immediately after this
3522 3406 * function returns, but may be safely used until zone_rele() is called.
3523 3407 */
3524 3408 zone_t *
3525 3409 zone_find_by_id(zoneid_t zoneid)
3526 3410 {
3527 3411 zone_t *zone;
3528 3412 zone_status_t status;
3529 3413
3530 3414 mutex_enter(&zonehash_lock);
3531 3415 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3532 3416 mutex_exit(&zonehash_lock);
3533 3417 return (NULL);
3534 3418 }
3535 3419 status = zone_status_get(zone);
3536 3420 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3537 3421 /*
3538 3422 * For all practical purposes the zone doesn't exist.
3539 3423 */
3540 3424 mutex_exit(&zonehash_lock);
3541 3425 return (NULL);
3542 3426 }
3543 3427 zone_hold(zone);
3544 3428 mutex_exit(&zonehash_lock);
3545 3429 return (zone);
3546 3430 }
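
A caller sketch of the lookup contract (hypothetical, for illustration): every successful zone_find_by_*() call must be balanced by zone_rele(), and the held zone stays safe to use even if zone_destroy() begins in the meantime:

	zone_t *zp;

	if ((zp = zone_find_by_id(zoneid)) != NULL) {
		/* ... zp is held; safe to use until released ... */
		zone_rele(zp);
	}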
3547 3431
3548 3432 /*
3549 3433 * Similar to zone_find_by_id, but using zone label as the key.
3550 3434 */
3551 3435 zone_t *
3552 3436 zone_find_by_label(const ts_label_t *label)
3553 3437 {
3554 3438 zone_t *zone;
3555 3439 zone_status_t status;
3556 3440
3557 3441 mutex_enter(&zonehash_lock);
3558 3442 if ((zone = zone_find_all_by_label(label)) == NULL) {
3559 3443 mutex_exit(&zonehash_lock);
3560 3444 return (NULL);
3561 3445 }
3562 3446
3563 3447 status = zone_status_get(zone);
3564 3448 if (status > ZONE_IS_DOWN) {
3565 3449 /*
3566 3450 * For all practical purposes the zone doesn't exist.
3567 3451 */
3568 3452 mutex_exit(&zonehash_lock);
3569 3453 return (NULL);
3570 3454 }
3571 3455 zone_hold(zone);
3572 3456 mutex_exit(&zonehash_lock);
3573 3457 return (zone);
3574 3458 }
3575 3459
3576 3460 /*
3577 3461 * Similar to zone_find_by_id, but using zone name as the key.
3578 3462 */
3579 3463 zone_t *
3580 3464 zone_find_by_name(char *name)
3581 3465 {
3582 3466 zone_t *zone;
3583 3467 zone_status_t status;
3584 3468
3585 3469 mutex_enter(&zonehash_lock);
3586 3470 if ((zone = zone_find_all_by_name(name)) == NULL) {
3587 3471 mutex_exit(&zonehash_lock);
3588 3472 return (NULL);
3589 3473 }
3590 3474 status = zone_status_get(zone);
3591 3475 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3592 3476 /*
3593 3477 * For all practical purposes the zone doesn't exist.
3594 3478 */
3595 3479 mutex_exit(&zonehash_lock);
3596 3480 return (NULL);
3597 3481 }
3598 3482 zone_hold(zone);
3599 3483 mutex_exit(&zonehash_lock);
3600 3484 return (zone);
3601 3485 }
3602 3486
3603 3487 /*
3604 3488 * Similar to zone_find_by_id(), using the path as a key. For instance,
3605 3489 * if there is a zone "foo" rooted at /foo/root, and the path argument
3606 3490 * is "/foo/root/proc", it will return the held zone_t corresponding to
3607 3491 * zone "foo".
3608 3492 *
3609 3493 * zone_find_by_path() always returns a non-NULL value, since at the
3610 3494 * very least every path will be contained in the global zone.
3611 3495 *
3612 3496 * As with the other zone_find_by_*() functions, the caller is
3613 3497 * responsible for zone_rele()ing the return value of this function.
3614 3498 */
3615 3499 zone_t *
3616 3500 zone_find_by_path(const char *path)
3617 3501 {
3618 3502 zone_t *zone;
3619 3503 zone_t *zret = NULL;
3620 3504 zone_status_t status;
3621 3505
3622 3506 if (path == NULL) {
3623 3507 /*
3624 3508 * Call from rootconf().
3625 3509 */
3626 3510 zone_hold(global_zone);
3627 3511 return (global_zone);
3628 3512 }
3629 3513 ASSERT(*path == '/');
3630 3514 mutex_enter(&zonehash_lock);
3631 3515 for (zone = list_head(&zone_active); zone != NULL;
3632 3516 zone = list_next(&zone_active, zone)) {
3633 3517 if (ZONE_PATH_VISIBLE(path, zone))
3634 3518 zret = zone;
3635 3519 }
3636 3520 ASSERT(zret != NULL);
3637 3521 status = zone_status_get(zret);
3638 3522 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3639 3523 /*
3640 3524 * Zone practically doesn't exist.
3641 3525 */
3642 3526 zret = global_zone;
3643 3527 }
3644 3528 zone_hold(zret);
3645 3529 mutex_exit(&zonehash_lock);
3646 3530 return (zret);
3647 3531 }
3648 3532
3649 3533 /*
3650 3534 * Public interface for updating per-zone load averages. Called once per
3651 3535 * second.
3652 3536 *
3653 3537 * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3654 3538 */
3655 3539 void
3656 3540 zone_loadavg_update()
3657 3541 {
3658 3542 zone_t *zp;
3659 3543 zone_status_t status;
3660 3544 struct loadavg_s *lavg;
3661 3545 hrtime_t zone_total;
3662 3546 int i;
3663 3547 hrtime_t hr_avg;
3664 3548 int nrun;
3665 3549 static int64_t f[3] = { 135, 27, 9 };
3666 3550 int64_t q, r;
3667 3551
3668 3552 mutex_enter(&zonehash_lock);
3669 3553 for (zp = list_head(&zone_active); zp != NULL;
3670 3554 zp = list_next(&zone_active, zp)) {
3671 3555 mutex_enter(&zp->zone_lock);
3672 3556
3673 3557 /* Skip zones that are on the way down or not yet up */
3674 3558 status = zone_status_get(zp);
3675 3559 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3676 3560 /* For all practical purposes the zone doesn't exist. */
3677 3561 mutex_exit(&zp->zone_lock);
3678 3562 continue;
3679 3563 }
3680 3564
3681 3565 /*
3682 3566 * Update the 10 second moving average data in zone_loadavg.
3683 3567 */
3684 3568 lavg = &zp->zone_loadavg;
3685 3569
3686 3570 zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3687 3571 scalehrtime(&zone_total);
3688 3572
3689 3573 /* The zone_total should always be increasing. */
3690 3574 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3691 3575 zone_total - lavg->lg_total : 0;
3692 3576 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3693 3577 /* lg_total holds the prev. 1 sec. total */
3694 3578 lavg->lg_total = zone_total;
3695 3579
3696 3580 /*
3697 3581 * To simplify the calculation, we don't calculate the load avg.
3698 3582 * until the zone has been up for at least 10 seconds and our
3699 3583 * moving average is thus full.
3700 3584 */
3701 3585 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3702 3586 lavg->lg_len++;
3703 3587 mutex_exit(&zp->zone_lock);
3704 3588 continue;
3705 3589 }
3706 3590
3707 3591  		/* Now calculate the 1min, 5min and 15min load avg. */
3708 3592 hr_avg = 0;
3709 3593 for (i = 0; i < S_LOADAVG_SZ; i++)
3710 3594 hr_avg += lavg->lg_loads[i];
3711 3595 hr_avg = hr_avg / S_LOADAVG_SZ;
3712 3596 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3713 3597
3714 3598 /* Compute load avg. See comment in calcloadavg() */
3715 3599 for (i = 0; i < 3; i++) {
3716 3600 q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3717 3601 r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3718 3602 zp->zone_hp_avenrun[i] +=
3719 3603 ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3720 3604
3721 3605 /* avenrun[] can only hold 31 bits of load avg. */
3722 3606 if (zp->zone_hp_avenrun[i] <
3723 3607 ((uint64_t)1<<(31+16-FSHIFT)))
3724 3608 zp->zone_avenrun[i] = (int32_t)
3725 3609 (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3726 3610 else
3727 3611 zp->zone_avenrun[i] = 0x7fffffff;
3728 3612 }
3729 3613
3730 3614 mutex_exit(&zp->zone_lock);
3731 3615 }
3732 3616 mutex_exit(&zonehash_lock);
3733 3617 }
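
The fixed-point arithmetic above implements a first-order exponential moving average with a per-second weight of f[i]/8192, giving time constants of roughly 60, 300 and 900 seconds for f = {135, 27, 9}. A floating-point sketch (illustration only, not kernel code):

	/* avg' = avg + (nrun - avg) * f/8192, applied once per second */
	static double
	loadavg_step(double avg, double nrun, double f)
	{
		return (avg + (nrun - avg) * (f / 8192.0));
	}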
3734 3618
3735 3619 /*
3736 3620 * Get the number of cpus visible to this zone. The system-wide global
3737 3621 * 'ncpus' is returned if pools are disabled, the caller is in the
3738 3622 * global zone, or a NULL zone argument is passed in.
3739 3623 */
3740 3624 int
3741 3625 zone_ncpus_get(zone_t *zone)
3742 3626 {
3743 3627 int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3744 3628
3745 3629 return (myncpus != 0 ? myncpus : ncpus);
3746 3630 }
3747 3631
3748 3632 /*
3749 3633 * Get the number of online cpus visible to this zone. The system-wide
3750 3634 * global 'ncpus_online' is returned if pools are disabled, the caller
3751 3635 * is in the global zone, or a NULL zone argument is passed in.
3752 3636 */
3753 3637 int
3754 3638 zone_ncpus_online_get(zone_t *zone)
3755 3639 {
3756 3640 int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3757 3641
3758 3642 return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3759 3643 }
3760 3644
3761 3645 /*
3762 3646 * Return the pool to which the zone is currently bound.
3763 3647 */
3764 3648 pool_t *
3765 3649 zone_pool_get(zone_t *zone)
3766 3650 {
3767 3651 ASSERT(pool_lock_held());
3768 3652
3769 3653 return (zone->zone_pool);
3770 3654 }
3771 3655
3772 3656 /*
3773 3657 * Set the zone's pool pointer and update the zone's visibility to match
3774 3658 * the resources in the new pool.
3775 3659 */
3776 3660 void
3777 3661 zone_pool_set(zone_t *zone, pool_t *pool)
3778 3662 {
3779 3663 ASSERT(pool_lock_held());
3780 3664 ASSERT(MUTEX_HELD(&cpu_lock));
3781 3665
3782 3666 zone->zone_pool = pool;
3783 3667 zone_pset_set(zone, pool->pool_pset->pset_id);
3784 3668 }
3785 3669
3786 3670 /*
3787 3671 * Return the cached value of the id of the processor set to which the
3788 3672 * zone is currently bound. The value will be ZONE_PS_INVAL if the pools
3789 3673 * facility is disabled.
3790 3674 */
3791 3675 psetid_t
3792 3676 zone_pset_get(zone_t *zone)
3793 3677 {
3794 3678 ASSERT(MUTEX_HELD(&cpu_lock));
3795 3679
3796 3680 return (zone->zone_psetid);
3797 3681 }
3798 3682
3799 3683 /*
3800 3684 * Set the cached value of the id of the processor set to which the zone
3801 3685 * is currently bound. Also update the zone's visibility to match the
3802 3686 * resources in the new processor set.
3803 3687 */
3804 3688 void
3805 3689 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3806 3690 {
3807 3691 psetid_t oldpsetid;
3808 3692
3809 3693 ASSERT(MUTEX_HELD(&cpu_lock));
3810 3694 oldpsetid = zone_pset_get(zone);
3811 3695
3812 3696 if (oldpsetid == newpsetid)
3813 3697 return;
3814 3698 /*
3815 3699 * Global zone sees all.
3816 3700 */
3817 3701 if (zone != global_zone) {
3818 3702 zone->zone_psetid = newpsetid;
3819 3703 if (newpsetid != ZONE_PS_INVAL)
3820 3704 pool_pset_visibility_add(newpsetid, zone);
3821 3705 if (oldpsetid != ZONE_PS_INVAL)
3822 3706 pool_pset_visibility_remove(oldpsetid, zone);
3823 3707 }
3824 3708 /*
3825 3709 * Disabling pools, so we should start using the global values
3826 3710 * for ncpus and ncpus_online.
3827 3711 */
3828 3712 if (newpsetid == ZONE_PS_INVAL) {
3829 3713 zone->zone_ncpus = 0;
3830 3714 zone->zone_ncpus_online = 0;
3831 3715 }
3832 3716 }
3833 3717
3834 3718 /*
3835 3719 * Walk the list of active zones and issue the provided callback for
3836 3720 * each of them.
3837 3721 *
3838 3722 * Caller must not be holding any locks that may be acquired under
3839 3723 * zonehash_lock. See comment at the beginning of the file for a list of
3840 3724 * common locks and their interactions with zones.
3841 3725 */
3842 3726 int
3843 3727 zone_walk(int (*cb)(zone_t *, void *), void *data)
3844 3728 {
3845 3729 zone_t *zone;
3846 3730 int ret = 0;
3847 3731 zone_status_t status;
3848 3732
3849 3733 mutex_enter(&zonehash_lock);
3850 3734 for (zone = list_head(&zone_active); zone != NULL;
3851 3735 zone = list_next(&zone_active, zone)) {
3852 3736 /*
3853 3737 * Skip zones that shouldn't be externally visible.
3854 3738 */
3855 3739 status = zone_status_get(zone);
3856 3740 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3857 3741 continue;
3858 3742 /*
3859 3743 * Bail immediately if any callback invocation returns a
3860 3744 * non-zero value.
3861 3745 */
3862 3746 ret = (*cb)(zone, data);
3863 3747 if (ret != 0)
3864 3748 break;
3865 3749 }
3866 3750 mutex_exit(&zonehash_lock);
3867 3751 return (ret);
3868 3752 }
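
For illustration, a minimal (hypothetical) zone_walk() callback that counts the externally visible zones; returning non-zero from the callback stops the walk early:

	static int
	zone_count_cb(zone_t *zp, void *arg)
	{
		(*(uint_t *)arg)++;
		return (0);		/* keep walking */
	}

	/* at the call site: */
	uint_t nzones = 0;
	(void) zone_walk(zone_count_cb, &nzones);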
3869 3753
3870 3754 static int
3871 3755 zone_set_root(zone_t *zone, const char *upath)
3872 3756 {
3873 3757 vnode_t *vp;
3874 3758 int trycount;
3875 3759 int error = 0;
3876 3760 char *path;
3877 3761 struct pathname upn, pn;
3878 3762 size_t pathlen;
3879 3763
3880 3764 if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3881 3765 return (error);
3882 3766
3883 3767 pn_alloc(&pn);
3884 3768
3885 3769 /* prevent infinite loop */
3886 3770 trycount = 10;
3887 3771 for (;;) {
3888 3772 if (--trycount <= 0) {
3889 3773 error = ESTALE;
3890 3774 goto out;
3891 3775 }
3892 3776
3893 3777 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3894 3778 /*
3895 3779 * VOP_ACCESS() may cover 'vp' with a new
3896 3780 * filesystem, if 'vp' is an autoFS vnode.
3897 3781 * Get the new 'vp' if so.
3898 3782 */
3899 3783 if ((error =
3900 3784 VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3901 3785 (!vn_ismntpt(vp) ||
3902 3786 (error = traverse(&vp)) == 0)) {
3903 3787 pathlen = pn.pn_pathlen + 2;
3904 3788 path = kmem_alloc(pathlen, KM_SLEEP);
3905 3789 (void) strncpy(path, pn.pn_path,
3906 3790 pn.pn_pathlen + 1);
3907 3791 path[pathlen - 2] = '/';
3908 3792 path[pathlen - 1] = '\0';
3909 3793 pn_free(&pn);
3910 3794 pn_free(&upn);
3911 3795
3912 3796 /* Success! */
3913 3797 break;
3914 3798 }
3915 3799 VN_RELE(vp);
3916 3800 }
3917 3801 if (error != ESTALE)
3918 3802 goto out;
3919 3803 }
3920 3804
3921 3805 ASSERT(error == 0);
3922 3806 zone->zone_rootvp = vp; /* we hold a reference to vp */
3923 3807 zone->zone_rootpath = path;
3924 3808 zone->zone_rootpathlen = pathlen;
3925 3809 if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3926 3810 zone->zone_flags |= ZF_IS_SCRATCH;
3927 3811 return (0);
3928 3812
3929 3813 out:
3930 3814 pn_free(&pn);
3931 3815 pn_free(&upn);
3932 3816 return (error);
3933 3817 }
3934 3818
3935 3819 #define isalnum(c) (((c) >= '0' && (c) <= '9') || \
3936 3820 ((c) >= 'a' && (c) <= 'z') || \
3937 3821 ((c) >= 'A' && (c) <= 'Z'))
3938 3822
3939 3823 static int
3940 3824 zone_set_name(zone_t *zone, const char *uname)
3941 3825 {
3942 3826 char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3943 3827 size_t len;
3944 3828 int i, err;
3945 3829
3946 3830 if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3947 3831 kmem_free(kname, ZONENAME_MAX);
3948 3832 return (err); /* EFAULT or ENAMETOOLONG */
3949 3833 }
3950 3834
3951 3835 /* must be less than ZONENAME_MAX */
3952 3836 if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3953 3837 kmem_free(kname, ZONENAME_MAX);
3954 3838 return (EINVAL);
3955 3839 }
3956 3840
3957 3841 /*
3958 3842 * Name must start with an alphanumeric and must contain only
3959 3843 * alphanumerics, '-', '_' and '.'.
3960 3844 */
3961 3845 if (!isalnum(kname[0])) {
3962 3846 kmem_free(kname, ZONENAME_MAX);
3963 3847 return (EINVAL);
3964 3848 }
3965 3849 for (i = 1; i < len - 1; i++) {
3966 3850 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3967 3851 kname[i] != '.') {
3968 3852 kmem_free(kname, ZONENAME_MAX);
3969 3853 return (EINVAL);
3970 3854 }
3971 3855 }
3972 3856
3973 3857 zone->zone_name = kname;
3974 3858 return (0);
3975 3859 }
3976 3860
3977 3861 /*
3978 3862 * Gets the 32-bit hostid of the specified zone as an unsigned int. If 'zonep'
3979 3863 * is NULL or it points to a zone with no hostid emulation, then the machine's
3980 3864 * hostid (i.e., the global zone's hostid) is returned. This function returns
3981 3865 * zero if neither the zone nor the host machine (global zone) have hostids. It
3982 3866 * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3983 3867 * hostid and the machine's hostid is invalid.
3984 3868 */
3985 3869 uint32_t
3986 3870 zone_get_hostid(zone_t *zonep)
3987 3871 {
3988 3872 unsigned long machine_hostid;
3989 3873
3990 3874 if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3991 3875 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3992 3876 return (HW_INVALID_HOSTID);
3993 3877 return ((uint32_t)machine_hostid);
3994 3878 }
3995 3879 return (zonep->zone_hostid);
3996 3880 }
3997 3881
3998 3882 /*
3999 3883 * Similar to thread_create(), but makes sure the thread is in the appropriate
4000 3884 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
4001 3885 */
4002 3886 /*ARGSUSED*/
4003 3887 kthread_t *
4004 3888 zthread_create(
4005 3889 caddr_t stk,
4006 3890 size_t stksize,
4007 3891 void (*proc)(),
4008 3892 void *arg,
4009 3893 size_t len,
4010 3894 pri_t pri)
4011 3895 {
4012 3896 kthread_t *t;
4013 3897 zone_t *zone = curproc->p_zone;
4014 3898 proc_t *pp = zone->zone_zsched;
4015 3899
4016 3900 zone_hold(zone); /* Reference to be dropped when thread exits */
4017 3901
4018 3902 /*
4019 3903 * No-one should be trying to create threads if the zone is shutting
4020 3904 * down and there aren't any kernel threads around. See comment
4021 3905 * in zthread_exit().
4022 3906 */
4023 3907 ASSERT(!(zone->zone_kthreads == NULL &&
4024 3908 zone_status_get(zone) >= ZONE_IS_EMPTY));
4025 3909 /*
4026 3910 * Create a thread, but don't let it run until we've finished setting
4027 3911 * things up.
4028 3912 */
4029 3913 t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
4030 3914 ASSERT(t->t_forw == NULL);
4031 3915 mutex_enter(&zone_status_lock);
4032 3916 if (zone->zone_kthreads == NULL) {
4033 3917 t->t_forw = t->t_back = t;
4034 3918 } else {
4035 3919 kthread_t *tx = zone->zone_kthreads;
4036 3920
4037 3921 t->t_forw = tx;
4038 3922 t->t_back = tx->t_back;
4039 3923 tx->t_back->t_forw = t;
4040 3924 tx->t_back = t;
4041 3925 }
4042 3926 zone->zone_kthreads = t;
4043 3927 mutex_exit(&zone_status_lock);
4044 3928
4045 3929 mutex_enter(&pp->p_lock);
4046 3930 t->t_proc_flag |= TP_ZTHREAD;
4047 3931 project_rele(t->t_proj);
4048 3932 t->t_proj = project_hold(pp->p_task->tk_proj);
4049 3933
4050 3934 /*
4051 3935 * Setup complete, let it run.
4052 3936 */
4053 3937 thread_lock(t);
4054 3938 t->t_schedflag |= TS_ALLSTART;
4055 3939 setrun_locked(t);
4056 3940 thread_unlock(t);
4057 3941
4058 3942 mutex_exit(&pp->p_lock);
4059 3943
4060 3944 return (t);
4061 3945 }
4062 3946
4063 3947 /*
4064 3948 * Similar to thread_exit(). Must be called by threads created via
4065 3949  * zthread_create().
4066 3950 */
4067 3951 void
4068 3952 zthread_exit(void)
4069 3953 {
4070 3954 kthread_t *t = curthread;
4071 3955 proc_t *pp = curproc;
4072 3956 zone_t *zone = pp->p_zone;
4073 3957
4074 3958 mutex_enter(&zone_status_lock);
4075 3959
4076 3960 /*
4077 3961 * Reparent to p0
4078 3962 */
4079 3963 kpreempt_disable();
4080 3964 mutex_enter(&pp->p_lock);
4081 3965 t->t_proc_flag &= ~TP_ZTHREAD;
4082 3966 t->t_procp = &p0;
4083 3967 hat_thread_exit(t);
4084 3968 mutex_exit(&pp->p_lock);
4085 3969 kpreempt_enable();
4086 3970
4087 3971 if (t->t_back == t) {
4088 3972 ASSERT(t->t_forw == t);
4089 3973 /*
4090 3974 * If the zone is empty, once the thread count
4091 3975 * goes to zero no further kernel threads can be
4092 3976 * created. This is because if the creator is a process
4093 3977 * in the zone, then it must have exited before the zone
4094 3978 * state could be set to ZONE_IS_EMPTY.
4095 3979 * Otherwise, if the creator is a kernel thread in the
4096 3980 * zone, the thread count is non-zero.
4097 3981 *
4098 3982 * This really means that non-zone kernel threads should
4099 3983 * not create zone kernel threads.
4100 3984 */
4101 3985 zone->zone_kthreads = NULL;
4102 3986 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
4103 3987 zone_status_set(zone, ZONE_IS_DOWN);
4104 3988 /*
4105 3989 * Remove any CPU caps on this zone.
4106 3990 */
4107 3991 cpucaps_zone_remove(zone);
4108 3992 }
4109 3993 } else {
4110 3994 t->t_forw->t_back = t->t_back;
4111 3995 t->t_back->t_forw = t->t_forw;
4112 3996 if (zone->zone_kthreads == t)
4113 3997 zone->zone_kthreads = t->t_forw;
4114 3998 }
4115 3999 mutex_exit(&zone_status_lock);
4116 4000 zone_rele(zone);
4117 4001 thread_exit();
4118 4002 /* NOTREACHED */
4119 4003 }
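
A hypothetical per-zone worker sketch tying the two routines together: threads created with zthread_create() run in the zone's zsched process and must terminate via zthread_exit():

	static void
	my_zone_worker(void *arg)
	{
		/* ... per-zone work ... */
		zthread_exit();
		/* NOTREACHED */
	}

	/* from code already executing inside the target zone: */
	(void) zthread_create(NULL, 0, my_zone_worker, NULL, 0, minclsyspri);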
4120 4004
4121 4005 static void
4122 4006 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
4123 4007 {
4124 4008 vnode_t *oldvp;
4125 4009
4126 4010 /* we're going to hold a reference here to the directory */
4127 4011 VN_HOLD(vp);
4128 4012
4129 4013 /* update abs cwd/root path see c2/audit.c */
4130 4014 if (AU_AUDITING())
4131 4015 audit_chdirec(vp, vpp);
4132 4016
4133 4017 mutex_enter(&pp->p_lock);
4134 4018 oldvp = *vpp;
4135 4019 *vpp = vp;
4136 4020 mutex_exit(&pp->p_lock);
4137 4021 if (oldvp != NULL)
4138 4022 VN_RELE(oldvp);
4139 4023 }
4140 4024
4141 4025 /*
4142 4026 * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
4143 4027 */
4144 4028 static int
4145 4029 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
4146 4030 {
4147 4031 nvpair_t *nvp = NULL;
4148 4032 boolean_t priv_set = B_FALSE;
4149 4033 boolean_t limit_set = B_FALSE;
4150 4034 boolean_t action_set = B_FALSE;
4151 4035
4152 4036 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4153 4037 const char *name;
4154 4038 uint64_t ui64;
4155 4039
4156 4040 name = nvpair_name(nvp);
4157 4041 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
4158 4042 return (EINVAL);
4159 4043 (void) nvpair_value_uint64(nvp, &ui64);
4160 4044 if (strcmp(name, "privilege") == 0) {
4161 4045 /*
4162 4046 * Currently only privileged values are allowed, but
4163 4047 * this may change in the future.
4164 4048 */
4165 4049 if (ui64 != RCPRIV_PRIVILEGED)
4166 4050 return (EINVAL);
4167 4051 rv->rcv_privilege = ui64;
4168 4052 priv_set = B_TRUE;
4169 4053 } else if (strcmp(name, "limit") == 0) {
4170 4054 rv->rcv_value = ui64;
4171 4055 limit_set = B_TRUE;
4172 4056 } else if (strcmp(name, "action") == 0) {
4173 4057 if (ui64 != RCTL_LOCAL_NOACTION &&
4174 4058 ui64 != RCTL_LOCAL_DENY)
4175 4059 return (EINVAL);
4176 4060 rv->rcv_flagaction = ui64;
4177 4061 action_set = B_TRUE;
4178 4062 } else {
4179 4063 return (EINVAL);
4180 4064 }
4181 4065 }
4182 4066
4183 4067 if (!(priv_set && limit_set && action_set))
4184 4068 return (EINVAL);
4185 4069 rv->rcv_action_signal = 0;
4186 4070 rv->rcv_action_recipient = NULL;
4187 4071 rv->rcv_action_recip_pid = -1;
4188 4072 rv->rcv_firing_time = 0;
4189 4073
4190 4074 return (0);
4191 4075 }
4192 4076
4193 4077 /*
4194 4078 * Non-global zone version of start_init.
4195 4079 */
4196 4080 void
4197 4081 zone_start_init(void)
4198 4082 {
4199 4083 proc_t *p = ttoproc(curthread);
4200 4084 zone_t *z = p->p_zone;
4201 4085
4202 4086 ASSERT(!INGLOBALZONE(curproc));
4203 4087
4204 4088 /*
4205 4089 * For all purposes (ZONE_ATTR_INITPID and restart_init),
4206 4090 * storing just the pid of init is sufficient.
4207 4091 */
4208 4092 z->zone_proc_initpid = p->p_pid;
4209 4093
4210 4094 if (z->zone_setup_app_contract == B_TRUE) {
4211 4095 /*
4212 4096 * Normally a process cannot modify its own contract, but we're
4213 4097 * just starting the zone's init process and its contract is
4214 4098 * always initialized from the sys_process_tmpl template, so
4215 4099  		 * this is the simplest way to set up init's contract to kill
4216 4100 * the process if any other process in the contract exits.
4217 4101 */
4218 4102 p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
4219 4103 }
4220 4104
4221 4105 /*
4222 4106 * We maintain zone_boot_err so that we can return the cause of the
4223 4107 * failure back to the caller of the zone_boot syscall.
4224 4108 */
4225 4109 p->p_zone->zone_boot_err = start_init_common();
4226 4110
4227 4111 /*
4228 4112 * We will prevent booting zones from becoming running zones if the
4229 4113 * global zone is shutting down.
4230 4114 */
4231 4115 mutex_enter(&zone_status_lock);
4232 4116 if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
4233 4117 ZONE_IS_SHUTTING_DOWN) {
4234 4118 /*
4235 4119 * Make sure we are still in the booting state-- we could have
4236 4120 * raced and already be shutting down, or even further along.
4237 4121 */
4238 4122 if (zone_status_get(z) == ZONE_IS_BOOTING) {
4239 4123 zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
4240 4124 }
4241 4125 mutex_exit(&zone_status_lock);
4242 4126 /* It's gone bad, dispose of the process */
4243 4127 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
4244 4128 mutex_enter(&p->p_lock);
4245 4129 ASSERT(p->p_flag & SEXITLWPS);
4246 4130 lwp_exit();
4247 4131 }
4248 4132 } else {
4249 4133 id_t cid = curthread->t_cid;
4250 4134
4251 4135 if (zone_status_get(z) == ZONE_IS_BOOTING)
4252 4136 zone_status_set(z, ZONE_IS_RUNNING);
4253 4137 mutex_exit(&zone_status_lock);
4254 4138
4255 4139 mutex_enter(&class_lock);
4256 4140 ASSERT(cid < loaded_classes);
4257 4141 if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
4258 4142 z->zone_fixed_hipri) {
4259 4143 /*
4260 4144 * If the zone is using FX then by default all
4261 4145 * processes start at the lowest priority and stay
4262 4146 * there. We provide a mechanism for the zone to
4263 4147 * indicate that it should run at "high priority". In
4264 4148 * this case we setup init to run at the highest FX
4265 4149 * priority (which is one level higher than the
4266 4150 * non-fixed scheduling classes can use).
4267 4151 */
4268 4152 pcparms_t pcparms;
4269 4153
4270 4154 pcparms.pc_cid = cid;
4271 4155 ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
4272 4156 ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
4273 4157 FXMAXUPRI;
4274 4158 ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
4275 4159 FX_DOUPRILIM | FX_DOUPRI;
4276 4160
4277 4161 mutex_enter(&pidlock);
4278 4162 mutex_enter(&curproc->p_lock);
4279 4163
4280 4164 (void) parmsset(&pcparms, curthread);
4281 4165
4282 4166 mutex_exit(&curproc->p_lock);
4283 4167 mutex_exit(&pidlock);
4284 4168 } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
4285 4169 /*
4286 4170 * zsched always starts the init lwp at priority
4287 4171 * minclsyspri - 1. This priority gets set in t_pri and
4288 4172 * is invalid for RT, but RT never uses t_pri. However
4289 4173 * t_pri is used by procfs, so we always see processes
4290 4174 * within an RT zone with an invalid priority value.
4291 4175 * We fix that up now.
4292 4176 */
4293 4177 curthread->t_pri = RTGPPRIO0;
4294 4178 }
4295 4179 mutex_exit(&class_lock);
4296 4180
4297 4181 /* cause the process to return to userland. */
4298 4182 lwp_rtt();
4299 4183 }
4300 4184 }
4301 4185
4302 4186 struct zsched_arg {
4303 4187 zone_t *zone;
4304 4188 nvlist_t *nvlist;
4305 4189 };
4306 4190
4307 4191 /*
4308 4192 * Per-zone "sched" workalike. The similarity to "sched" doesn't have
4309 4193 * anything to do with scheduling, but rather with the fact that
4310 4194 * per-zone kernel threads are parented to zsched, just like regular
4311 4195 * kernel threads are parented to sched (p0).
4312 4196 *
4313 4197 * zsched is also responsible for launching init for the zone.
4314 4198 */
4315 4199 static void
4316 4200 zsched(void *arg)
4317 4201 {
4318 4202 struct zsched_arg *za = arg;
4319 4203 proc_t *pp = curproc;
4320 4204 proc_t *initp = proc_init;
4321 4205 zone_t *zone = za->zone;
4322 4206 cred_t *cr, *oldcred;
4323 4207 rctl_set_t *set;
4324 4208 rctl_alloc_gp_t *gp;
4325 4209 contract_t *ct = NULL;
4326 4210 task_t *tk, *oldtk;
4327 4211 rctl_entity_p_t e;
4328 4212 kproject_t *pj;
4329 4213
4330 4214 nvlist_t *nvl = za->nvlist;
4331 4215 nvpair_t *nvp = NULL;
4332 4216
4333 4217 bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
4334 4218 bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
4335 4219 PTOU(pp)->u_argc = 0;
4336 4220 PTOU(pp)->u_argv = NULL;
4337 4221 PTOU(pp)->u_envp = NULL;
4338 4222 PTOU(pp)->u_commpagep = NULL;
4339 4223 closeall(P_FINFO(pp));
4340 4224
4341 4225 /*
4342 4226 * We are this zone's "zsched" process. As the zone isn't generally
4343 4227 * visible yet we don't need to grab any locks before initializing its
4344 4228 * zone_proc pointer.
4345 4229 */
4346 4230 zone_hold(zone); /* this hold is released by zone_destroy() */
4347 4231 zone->zone_zsched = pp;
4348 4232 mutex_enter(&pp->p_lock);
4349 4233 pp->p_zone = zone;
4350 4234 mutex_exit(&pp->p_lock);
4351 4235
4352 4236 /*
4353 4237 * Disassociate process from its 'parent'; parent ourselves to init
4354 4238 * (pid 1) and change other values as needed.
4355 4239 */
4356 4240 sess_create();
4357 4241
4358 4242 mutex_enter(&pidlock);
4359 4243 proc_detach(pp);
4360 4244 pp->p_ppid = 1;
4361 4245 pp->p_flag |= SZONETOP;
4362 4246 pp->p_ancpid = 1;
4363 4247 pp->p_parent = initp;
4364 4248 pp->p_psibling = NULL;
4365 4249 if (initp->p_child)
4366 4250 initp->p_child->p_psibling = pp;
4367 4251 pp->p_sibling = initp->p_child;
4368 4252 initp->p_child = pp;
4369 4253
4370 4254 /* Decrement what newproc() incremented. */
4371 4255 upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
4372 4256 /*
4373 4257 * Our credentials are about to become kcred-like, so we don't care
4374 4258 * about the caller's ruid.
4375 4259 */
4376 4260 upcount_inc(crgetruid(kcred), zone->zone_id);
4377 4261 mutex_exit(&pidlock);
4378 4262
4379 4263 /*
4380 4264 * getting out of global zone, so decrement lwp and process counts
4381 4265 */
4382 4266 pj = pp->p_task->tk_proj;
4383 4267 mutex_enter(&global_zone->zone_nlwps_lock);
4384 4268 pj->kpj_nlwps -= pp->p_lwpcnt;
4385 4269 global_zone->zone_nlwps -= pp->p_lwpcnt;
4386 4270 pj->kpj_nprocs--;
4387 4271 global_zone->zone_nprocs--;
4388 4272 mutex_exit(&global_zone->zone_nlwps_lock);
4389 4273
4390 4274 /*
4391 4275 * Decrement locked memory counts on old zone and project.
4392 4276 */
4393 4277 mutex_enter(&global_zone->zone_mem_lock);
4394 4278 global_zone->zone_locked_mem -= pp->p_locked_mem;
4395 4279 pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
4396 4280 mutex_exit(&global_zone->zone_mem_lock);
4397 4281
4398 4282 /*
4399 4283 * Create and join a new task in project '0' of this zone.
4400 4284 *
4401 4285 * We don't need to call holdlwps() since we know we're the only lwp in
4402 4286 * this process.
4403 4287 *
4404 4288 * task_join() returns with p_lock held.
4405 4289 */
4406 4290 tk = task_create(0, zone);
4407 4291 mutex_enter(&cpu_lock);
4408 4292 oldtk = task_join(tk, 0);
4409 4293
4410 4294 pj = pp->p_task->tk_proj;
4411 4295
4412 4296 mutex_enter(&zone->zone_mem_lock);
4413 4297 zone->zone_locked_mem += pp->p_locked_mem;
4414 4298 pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
4415 4299 mutex_exit(&zone->zone_mem_lock);
4416 4300
4417 4301 /*
4418 4302 * add lwp and process counts to zsched's zone, and increment
4419 4303 * project's task and process count due to the task created in
4420 4304 * the above task_create.
4421 4305 */
4422 4306 mutex_enter(&zone->zone_nlwps_lock);
4423 4307 pj->kpj_nlwps += pp->p_lwpcnt;
4424 4308 pj->kpj_ntasks += 1;
4425 4309 zone->zone_nlwps += pp->p_lwpcnt;
4426 4310 pj->kpj_nprocs++;
4427 4311 zone->zone_nprocs++;
4428 4312 mutex_exit(&zone->zone_nlwps_lock);
4429 4313
4430 4314 mutex_exit(&curproc->p_lock);
4431 4315 mutex_exit(&cpu_lock);
4432 4316 task_rele(oldtk);
4433 4317
4434 4318 /*
4435 4319 * The process was created by a process in the global zone, hence the
4436 4320 * credentials are wrong. We might as well have kcred-ish credentials.
4437 4321 */
4438 4322 cr = zone->zone_kcred;
4439 4323 crhold(cr);
4440 4324 mutex_enter(&pp->p_crlock);
4441 4325 oldcred = pp->p_cred;
4442 4326 pp->p_cred = cr;
4443 4327 mutex_exit(&pp->p_crlock);
4444 4328 crfree(oldcred);
4445 4329
4446 4330 /*
4447 4331 * Hold credentials again (for thread)
4448 4332 */
4449 4333 crhold(cr);
4450 4334
4451 4335 /*
4452 4336 * p_lwpcnt can't change since this is a kernel process.
4453 4337 */
4454 4338 crset(pp, cr);
4455 4339
4456 4340 /*
4457 4341 * Chroot
4458 4342 */
4459 4343 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
4460 4344 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
4461 4345
4462 4346 /*
4463 4347 * Initialize zone's rctl set.
4464 4348 */
4465 4349 set = rctl_set_create();
4466 4350 gp = rctl_set_init_prealloc(RCENTITY_ZONE);
4467 4351 mutex_enter(&pp->p_lock);
4468 4352 e.rcep_p.zone = zone;
4469 4353 e.rcep_t = RCENTITY_ZONE;
4470 4354 zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
4471 4355 mutex_exit(&pp->p_lock);
4472 4356 rctl_prealloc_destroy(gp);
4473 4357
4474 4358 /*
4475 4359 * Apply the rctls passed in to zone_create(). This is basically a list
4476 4360 * assignment: all of the old values are removed and the new ones
4477 4361 * inserted. That is, if an empty list is passed in, all values are
4478 4362 * removed.
4479 4363 */
4480 4364 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4481 4365 rctl_dict_entry_t *rde;
4482 4366 rctl_hndl_t hndl;
4483 4367 char *name;
4484 4368 nvlist_t **nvlarray;
4485 4369 uint_t i, nelem;
4486 4370 int error; /* For ASSERT()s */
4487 4371
4488 4372 name = nvpair_name(nvp);
4489 4373 hndl = rctl_hndl_lookup(name);
4490 4374 ASSERT(hndl != -1);
4491 4375 rde = rctl_dict_lookup_hndl(hndl);
4492 4376 ASSERT(rde != NULL);
4493 4377
4494 4378 for (; /* ever */; ) {
4495 4379 rctl_val_t oval;
4496 4380
4497 4381 mutex_enter(&pp->p_lock);
4498 4382 error = rctl_local_get(hndl, NULL, &oval, pp);
4499 4383 mutex_exit(&pp->p_lock);
4500 4384 ASSERT(error == 0); /* Can't fail for RCTL_FIRST */
4501 4385 ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4502 4386 if (oval.rcv_privilege == RCPRIV_SYSTEM)
4503 4387 break;
4504 4388 mutex_enter(&pp->p_lock);
4505 4389 error = rctl_local_delete(hndl, &oval, pp);
4506 4390 mutex_exit(&pp->p_lock);
4507 4391 ASSERT(error == 0);
4508 4392 }
4509 4393 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4510 4394 ASSERT(error == 0);
4511 4395 for (i = 0; i < nelem; i++) {
4512 4396 rctl_val_t *nvalp;
4513 4397
4514 4398 nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4515 4399 error = nvlist2rctlval(nvlarray[i], nvalp);
4516 4400 ASSERT(error == 0);
4517 4401 /*
4518 4402 * rctl_local_insert can fail if the value being
4519 4403 * inserted is a duplicate; this is OK.
4520 4404 */
4521 4405 mutex_enter(&pp->p_lock);
4522 4406 if (rctl_local_insert(hndl, nvalp, pp) != 0)
4523 4407 kmem_cache_free(rctl_val_cache, nvalp);
4524 4408 mutex_exit(&pp->p_lock);
4525 4409 }
4526 4410 }
4527 4411 /*
4528 4412 * Tell the world that we're done setting up.
4529 4413 *
4530 4414 * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4531 4415 * and atomically set the zone's processor set visibility. Once
4532 4416 * we drop pool_lock() this zone will automatically get updated
4533 4417 * to reflect any future changes to the pools configuration.
4534 4418 *
4535 4419 * Note that after we drop the locks below (zonehash_lock in
4536 4420 * particular) other operations such as a zone_getattr call can
4537 4421 * now proceed and observe the zone. That is the reason for doing a
4538 4422 * state transition to the INITIALIZED state.
4539 4423 */
4540 4424 pool_lock();
4541 4425 mutex_enter(&cpu_lock);
4542 4426 mutex_enter(&zonehash_lock);
4543 4427 zone_uniqid(zone);
4544 4428 zone_zsd_configure(zone);
4545 4429 if (pool_state == POOL_ENABLED)
4546 4430 zone_pset_set(zone, pool_default->pool_pset->pset_id);
4547 4431 mutex_enter(&zone_status_lock);
4548 4432 ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4549 4433 zone_status_set(zone, ZONE_IS_INITIALIZED);
4550 4434 mutex_exit(&zone_status_lock);
4551 4435 mutex_exit(&zonehash_lock);
4552 4436 mutex_exit(&cpu_lock);
4553 4437 pool_unlock();
4554 4438
4555 4439 /* Now call the create callback for this key */
4556 4440 zsd_apply_all_keys(zsd_apply_create, zone);
4557 4441
4558 4442 /* The callbacks are complete. Mark ZONE_IS_READY */
4559 4443 mutex_enter(&zone_status_lock);
4560 4444 ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4561 4445 zone_status_set(zone, ZONE_IS_READY);
4562 4446 mutex_exit(&zone_status_lock);
4563 4447
4564 4448 /*
4565 4449 * Once we see the zone transition to the ZONE_IS_BOOTING state,
4566 4450 * we launch init, and set the state to running.
4567 4451 */
4568 4452 zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4569 4453
4570 4454 if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4571 4455 id_t cid;
4572 4456
4573 4457 /*
4574 4458 * Ok, this is a little complicated. We need to grab the
4575 4459 * zone's pool's scheduling class ID; note that by now, we
4576 4460 * are already bound to a pool if we need to be (zoneadmd
4577 4461 * will have done that to us while we're in the READY
4578 4462 * state). *But* the scheduling class for the zone's 'init'
4579 4463 * must be explicitly passed to newproc, which doesn't
4580 4464 * respect pool bindings.
4581 4465 *
4582 4466 * We hold the pool_lock across the call to newproc() to
4583 4467 * close the obvious race: the pool's scheduling class
4584 4468 * could change before we manage to create the LWP with
4585 4469 * classid 'cid'.
4586 4470 */
4587 4471 pool_lock();
4588 4472 if (zone->zone_defaultcid > 0)
4589 4473 cid = zone->zone_defaultcid;
4590 4474 else
4591 4475 cid = pool_get_class(zone->zone_pool);
4592 4476 if (cid == -1)
4593 4477 cid = defaultcid;
4594 4478
4595 4479 /*
4596 4480 * If this fails, zone_boot will ultimately fail. The
4597 4481 * state of the zone will be set to SHUTTING_DOWN-- userland
4598 4482 * will have to tear down the zone, and fail, or try again.
4599 4483 */
4600 4484 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4601 4485 minclsyspri - 1, &ct, 0)) != 0) {
4602 4486 mutex_enter(&zone_status_lock);
4603 4487 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4604 4488 mutex_exit(&zone_status_lock);
4605 4489 } else {
4606 4490 zone->zone_boot_time = gethrestime_sec();
4607 4491 }
4608 4492
4609 4493 pool_unlock();
4610 4494 }
4611 4495
4612 4496 /*
4613 4497 * Wait for zone_destroy() to be called. This is what we spend
4614 4498 * most of our life doing.
4615 4499 */
4616 4500 zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4617 4501
4618 4502 if (ct)
4619 4503 /*
4620 4504 * At this point the process contract should be empty.
4621 4505 * (Though if it isn't, it's not the end of the world.)
4622 4506 */
4623 4507 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4624 4508
4625 4509 /*
4626 4510 * Allow kcred to be freed when all referring processes
4627 4511 * (including this one) go away. We can't just do this in
4628 4512 * zone_free because we need to wait for the zone_cred_ref to
4629 4513 * drop to 0 before calling zone_free, and the existence of
4630 4514 * zone_kcred will prevent that. Thus, we call crfree here to
4631 4515 * balance the crdup in zone_create. The crhold calls earlier
4632 4516 * in zsched will be dropped when the thread and process exit.
4633 4517 */
4634 4518 crfree(zone->zone_kcred);
4635 4519 zone->zone_kcred = NULL;
4636 4520
4637 4521 exit(CLD_EXITED, 0);
4638 4522 }
4639 4523
4640 4524 /*
4641 4525 * Helper function to determine if there are any submounts of the
4642 4526 * provided path. Used to make sure the zone doesn't "inherit" any
4643 4527 * mounts from before it is created.
4644 4528 */
4645 4529 static uint_t
4646 4530 zone_mount_count(const char *rootpath)
4647 4531 {
4648 4532 vfs_t *vfsp;
4649 4533 uint_t count = 0;
4650 4534 size_t rootpathlen = strlen(rootpath);
4651 4535
4652 4536 /*
4653 4537 * Holding zonehash_lock prevents race conditions with
4654 4538 * vfs_list_add()/vfs_list_remove() since we serialize with
4655 4539 * zone_find_by_path().
4656 4540 */
4657 4541 ASSERT(MUTEX_HELD(&zonehash_lock));
4658 4542 /*
4659 4543 * The rootpath must end with a '/'
4660 4544 */
4661 4545 ASSERT(rootpath[rootpathlen - 1] == '/');
4662 4546
4663 4547 /*
4664 4548 * This intentionally does not count the rootpath itself if that
4665 4549 * happens to be a mount point.
4666 4550 */
4667 4551 vfs_list_read_lock();
4668 4552 vfsp = rootvfs;
4669 4553 do {
4670 4554 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4671 4555 rootpathlen) == 0)
4672 4556 count++;
4673 4557 vfsp = vfsp->vfs_next;
4674 4558 } while (vfsp != rootvfs);
4675 4559 vfs_list_unlock();
4676 4560 return (count);
4677 4561 }
4678 4562
4679 4563 /*
4680 4564 * Helper function to make sure that a zone created on 'rootpath'
4681 4565 * wouldn't end up containing other zones' rootpaths.
4682 4566 */
4683 4567 static boolean_t
4684 4568 zone_is_nested(const char *rootpath)
4685 4569 {
4686 4570 zone_t *zone;
4687 4571 size_t rootpathlen = strlen(rootpath);
4688 4572 size_t len;
4689 4573
4690 4574 ASSERT(MUTEX_HELD(&zonehash_lock));
4691 4575
4692 4576 /*
4693 4577 * zone_set_root() appended '/' and '\0' at the end of rootpath
4694 4578 */
4695 4579 if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4696 4580 (rootpath[1] == '/') && (rootpath[2] == '\0'))
4697 4581 return (B_TRUE);
4698 4582
4699 4583 for (zone = list_head(&zone_active); zone != NULL;
4700 4584 zone = list_next(&zone_active, zone)) {
4701 4585 if (zone == global_zone)
4702 4586 continue;
4703 4587 len = strlen(zone->zone_rootpath);
4704 4588 if (strncmp(rootpath, zone->zone_rootpath,
4705 4589 MIN(rootpathlen, len)) == 0)
4706 4590 return (B_TRUE);
4707 4591 }
4708 4592 return (B_FALSE);
4709 4593 }
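/*
 * For illustration (hypothetical paths): with an existing zone rooted
 * at "/zones/a/root/", creating a zone at "/zones/a/root/inner/" is
 * flagged, as is the reverse, since strncmp() over MIN(rootpathlen, len)
 * detects either rootpath being a prefix of the other.  The trailing
 * '/' keeps siblings such as "/zones/ab/root/" from matching.
 */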
4710 4594
4711 4595 static int
4712 4596 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4713 4597 size_t zone_privssz)
4714 4598 {
4715 4599 priv_set_t *privs;
4716 4600
4717 4601 if (zone_privssz < sizeof (priv_set_t))
4718 4602 return (ENOMEM);
4719 4603
4720 4604 privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4721 4605
4722 4606 if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4723 4607 kmem_free(privs, sizeof (priv_set_t));
4724 4608 return (EFAULT);
4725 4609 }
4726 4610
4727 4611 zone->zone_privset = privs;
4728 4612 return (0);
4729 4613 }
4730 4614
4731 4615 /*
4732 4616 * We make creative use of nvlists to pass in rctls from userland. The list is
4733 4617 * a list of the following structures:
4734 4618 *
4735 4619 * (name = rctl_name, value = nvpair_list_array)
4736 4620 *
4737 4621 * Where each element of the nvpair_list_array is of the form:
4738 4622 *
4739 4623 * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4740 4624 * (name = "limit", value = uint64_t),
4741 4625 * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4742 4626 */
4743 4627 static int
4744 4628 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4745 4629 {
4746 4630 nvpair_t *nvp = NULL;
4747 4631 nvlist_t *nvl = NULL;
4748 4632 char *kbuf;
4749 4633 int error;
4750 4634 rctl_val_t rv;
4751 4635
4752 4636 *nvlp = NULL;
4753 4637
4754 4638 if (buflen == 0)
4755 4639 return (0);
4756 4640
4757 4641 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4758 4642 return (ENOMEM);
4759 4643 if (copyin(ubuf, kbuf, buflen)) {
4760 4644 error = EFAULT;
4761 4645 goto out;
4762 4646 }
4763 4647 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4764 4648 /*
4765 4649 * On failure, nvlist_unpack() may have allocated and freed nvl
4766 4650 * while leaving the pointer set to non-NULL, so we reset it here.
4767 4651 */
4768 4652 nvl = NULL;
4769 4653 error = EINVAL;
4770 4654 goto out;
4771 4655 }
4772 4656 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4773 4657 rctl_dict_entry_t *rde;
4774 4658 rctl_hndl_t hndl;
4775 4659 nvlist_t **nvlarray;
4776 4660 uint_t i, nelem;
4777 4661 char *name;
4778 4662
4779 4663 error = EINVAL;
4780 4664 name = nvpair_name(nvp);
4781 4665 if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
4782 4666 strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
4783 4667 nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4784 4668 goto out;
4785 4669 }
4786 4670 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4787 4671 goto out;
4788 4672 }
4789 4673 rde = rctl_dict_lookup_hndl(hndl);
4790 4674 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4791 4675 ASSERT(error == 0);
4792 4676 for (i = 0; i < nelem; i++) {
4793 4677 if (error = nvlist2rctlval(nvlarray[i], &rv))
4794 4678 goto out;
4795 4679 }
4796 4680 if (rctl_invalid_value(rde, &rv)) {
4797 4681 error = EINVAL;
4798 4682 goto out;
4799 4683 }
4800 4684 }
4801 4685 error = 0;
4802 4686 *nvlp = nvl;
4803 4687 out:
4804 4688 kmem_free(kbuf, buflen);
4805 4689 if (error && nvl != NULL)
4806 4690 nvlist_free(nvl);
4807 4691 return (error);
4808 4692 }
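/*
 * A minimal userland sketch of constructing the nvlist described above
 * (assumed usage, not lifted from zoneadmd; the rctl name and values
 * are illustrative):
 *
 *	nvlist_t *nvl, *val;
 *	char *packed = NULL;
 *	size_t packed_sz;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_alloc(&val, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(val, "privilege", RCPRIV_PRIVILEGED);
 *	(void) nvlist_add_uint64(val, "limit", 100);
 *	(void) nvlist_add_uint64(val, "action", RCTL_LOCAL_DENY);
 *	(void) nvlist_add_nvlist_array(nvl, "zone.cpu-shares", &val, 1);
 *	(void) nvlist_pack(nvl, &packed, &packed_sz, NV_ENCODE_NATIVE, 0);
 *
 * The (packed, packed_sz) pair is then what zone_create() receives as
 * (rctlbuf, rctlbufsz).
 */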
4809 4693
4810 4694 int
4811 -zone_create_error(int er_error, int er_ext, int *er_out)
4812 -{
4695 +zone_create_error(int er_error, int er_ext, int *er_out) {
4813 4696 if (er_out != NULL) {
4814 4697 if (copyout(&er_ext, er_out, sizeof (int))) {
4815 4698 return (set_errno(EFAULT));
4816 4699 }
4817 4700 }
4818 4701 return (set_errno(er_error));
4819 4702 }
4820 4703
4821 4704 static int
4822 4705 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4823 4706 {
4824 4707 ts_label_t *tsl;
4825 4708 bslabel_t blab;
4826 4709
4827 4710 /* Get label from user */
4828 4711 if (copyin(lab, &blab, sizeof (blab)) != 0)
4829 4712 return (EFAULT);
4830 4713 tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4831 4714 if (tsl == NULL)
4832 4715 return (ENOMEM);
4833 4716
4834 4717 zone->zone_slabel = tsl;
4835 4718 return (0);
4836 4719 }
4837 4720
4838 4721 /*
4839 4722 * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4840 4723 */
4841 4724 static int
4842 4725 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4843 4726 {
4844 4727 char *kbuf;
4845 4728 char *dataset, *next;
4846 4729 zone_dataset_t *zd;
4847 4730 size_t len;
4848 4731
4849 4732 if (ubuf == NULL || buflen == 0)
4850 4733 return (0);
4851 4734
4852 4735 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4853 4736 return (ENOMEM);
4854 4737
4855 4738 if (copyin(ubuf, kbuf, buflen) != 0) {
4856 4739 kmem_free(kbuf, buflen);
4857 4740 return (EFAULT);
4858 4741 }
4859 4742
4860 4743 dataset = next = kbuf;
4861 4744 for (;;) {
4862 4745 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4863 4746
4864 4747 next = strchr(dataset, ',');
4865 4748
4866 4749 if (next == NULL)
4867 4750 len = strlen(dataset);
4868 4751 else
4869 4752 len = next - dataset;
4870 4753
4871 4754 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4872 4755 bcopy(dataset, zd->zd_dataset, len);
4873 4756 zd->zd_dataset[len] = '\0';
4874 4757
4875 4758 list_insert_head(&zone->zone_datasets, zd);
4876 4759
4877 4760 if (next == NULL)
4878 4761 break;
4879 4762
4880 4763 dataset = next + 1;
4881 4764 }
4882 4765
4883 4766 kmem_free(kbuf, buflen);
4884 4767 return (0);
4885 4768 }
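/*
 * For illustration (hypothetical input): a zfsbuf of "tank/a,tank/b"
 * yields two zone_dataset_t entries; head insertion leaves them in
 * reverse input order, "tank/b" then "tank/a".
 */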
4886 4769
4887 4770 /*
4888 4771 * System call to create/initialize a new zone named 'zone_name', rooted
4889 4772 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4890 4773 * and initialized with the zone-wide rctls described in 'rctlbuf', and
4891 4774 * with labeling set by 'match', 'doi', and 'label'.
4892 4775 *
4893 4776 * If extended error is non-null, we may use it to return more detailed
4894 4777 * error information.
4895 4778 */
4896 4779 static zoneid_t
4897 4780 zone_create(const char *zone_name, const char *zone_root,
4898 4781 const priv_set_t *zone_privs, size_t zone_privssz,
4899 4782 caddr_t rctlbuf, size_t rctlbufsz,
4900 4783 caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4901 4784 int match, uint32_t doi, const bslabel_t *label,
4902 - int flags, zoneid_t zone_did)
4785 + int flags)
4903 4786 {
4904 4787 struct zsched_arg zarg;
4905 4788 nvlist_t *rctls = NULL;
4906 4789 proc_t *pp = curproc;
4907 4790 zone_t *zone, *ztmp;
4908 - zoneid_t zoneid, start = GLOBAL_ZONEID;
4791 + zoneid_t zoneid;
4909 4792 int error;
4910 4793 int error2 = 0;
4911 4794 char *str;
4912 4795 cred_t *zkcr;
4913 4796 boolean_t insert_label_hash;
4914 4797
4915 4798 if (secpolicy_zone_config(CRED()) != 0)
4916 4799 return (set_errno(EPERM));
4917 4800
4918 4801 /* can't boot zone from within chroot environment */
4919 4802 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4920 4803 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4921 4804 extended_error));
4922 4805
4923 - /*
4924 - * As the first step of zone creation, we want to allocate a zoneid.
4925 - * This allocation is complicated by the fact that netstacks use the
4926 - * zoneid to determine their stackid, but netstacks themselves are
4927 - * freed asynchronously with respect to zone destruction. This means
4928 - * that a netstack reference leak (or in principle, an extraordinarily
4929 - * long netstack reference hold) could result in a zoneid being
4930 - * allocated that in fact corresponds to a stackid from an active
4931 - * (referenced) netstack -- unleashing all sorts of havoc when that
4932 - * netstack is actually (re)used. (In the abstract, we might wish a
4933 - * zoneid to not be deallocated until its last referencing netstack
4934 - * has been released, but netstacks lack a backpointer into their
4935 - * referencing zone -- and changing them to have such a pointer would
4936 - * be substantial, to put it euphemistically.) To avoid this, we
4937 - * detect this condition on allocation: if we have allocated a zoneid
4938 - * that corresponds to a netstack that's still in use, we warn about
4939 - * it (as it is much more likely to be a reference leak than an actual
4940 - * netstack reference), free it, and allocate another. That these
4941 - * identifers are allocated out of an ID space assures that we won't
4942 - * see the identifier we just allocated.
4943 - */
4944 - for (;;) {
4945 - zoneid = id_alloc(zoneid_space);
4946 -
4947 - if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4948 - break;
4949 -
4950 - id_free(zoneid_space, zoneid);
4951 -
4952 - if (start == GLOBAL_ZONEID) {
4953 - start = zoneid;
4954 - } else if (zoneid == start) {
4955 - /*
4956 - * We have managed to iterate over the entire available
4957 - * zoneid space -- there are no identifiers available,
4958 - * presumably due to some number of leaked netstack
4959 - * references. While it's in principle possible for us
4960 - * to continue to try, it seems wiser to give up at
4961 - * this point to warn and fail explicitly with a
4962 - * distinctive error.
4963 - */
4964 - cmn_err(CE_WARN, "zone_create() failed: all available "
4965 - "zone IDs have netstacks still in use");
4966 - return (set_errno(ENFILE));
4967 - }
4968 -
4969 - cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4970 - "netstack still in use", zoneid);
4971 - }
4972 -
4973 4806 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4974 - zone->zone_id = zoneid;
4975 - zone->zone_did = zone_did;
4807 + zoneid = zone->zone_id = id_alloc(zoneid_space);
4976 4808 zone->zone_status = ZONE_IS_UNINITIALIZED;
4977 4809 zone->zone_pool = pool_default;
4978 4810 zone->zone_pool_mod = gethrtime();
4979 4811 zone->zone_psetid = ZONE_PS_INVAL;
4980 4812 zone->zone_ncpus = 0;
4981 4813 zone->zone_ncpus_online = 0;
4982 4814 zone->zone_restart_init = B_TRUE;
4983 4815 zone->zone_reboot_on_init_exit = B_FALSE;
4984 4816 zone->zone_init_status = -1;
4985 4817 zone->zone_brand = &native_brand;
4986 4818 zone->zone_initname = NULL;
4987 4819 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4988 4820 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4989 4821 mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4990 4822 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4991 4823 list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4992 4824 offsetof(zone_ref_t, zref_linkage));
4993 4825 list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4994 4826 offsetof(struct zsd_entry, zsd_linkage));
4995 4827 list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4996 4828 offsetof(zone_dataset_t, zd_linkage));
4997 4829 list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4998 4830 offsetof(zone_dl_t, zdl_linkage));
4999 4831 rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
5000 4832 rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
5001 4833
5002 4834 if (flags & ZCF_NET_EXCL) {
5003 4835 zone->zone_flags |= ZF_NET_EXCL;
5004 4836 }
5005 4837
5006 4838 if ((error = zone_set_name(zone, zone_name)) != 0) {
5007 4839 zone_free(zone);
5008 4840 return (zone_create_error(error, 0, extended_error));
5009 4841 }
5010 4842
5011 4843 if ((error = zone_set_root(zone, zone_root)) != 0) {
5012 4844 zone_free(zone);
5013 4845 return (zone_create_error(error, 0, extended_error));
5014 4846 }
5015 4847 if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
5016 4848 zone_free(zone);
5017 4849 return (zone_create_error(error, 0, extended_error));
5018 4850 }
5019 4851
5020 4852 /* initialize node name to be the same as zone name */
5021 4853 zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
5022 4854 (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
5023 4855 zone->zone_nodename[_SYS_NMLN - 1] = '\0';
5024 4856
5025 4857 zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
5026 4858 zone->zone_domain[0] = '\0';
5027 4859 zone->zone_hostid = HW_INVALID_HOSTID;
5028 4860 zone->zone_shares = 1;
5029 4861 zone->zone_shmmax = 0;
5030 4862 zone->zone_ipc.ipcq_shmmni = 0;
5031 4863 zone->zone_ipc.ipcq_semmni = 0;
5032 4864 zone->zone_ipc.ipcq_msgmni = 0;
5033 4865 zone->zone_bootargs = NULL;
5034 4866 zone->zone_fs_allowed = NULL;
5035 4867 zone->zone_initname =
5036 4868 kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
5037 4869 (void) strcpy(zone->zone_initname, zone_default_initname);
5038 4870 zone->zone_nlwps = 0;
5039 4871 zone->zone_nlwps_ctl = INT_MAX;
5040 4872 zone->zone_nprocs = 0;
5041 4873 zone->zone_nprocs_ctl = INT_MAX;
5042 4874 zone->zone_locked_mem = 0;
5043 4875 zone->zone_locked_mem_ctl = UINT64_MAX;
5044 4876 zone->zone_max_swap = 0;
5045 4877 zone->zone_max_swap_ctl = UINT64_MAX;
5046 4878 zone->zone_phys_mem = 0;
5047 4879 zone->zone_phys_mem_ctl = UINT64_MAX;
5048 4880 zone->zone_max_lofi = 0;
5049 4881 zone->zone_max_lofi_ctl = UINT64_MAX;
5050 4882 zone->zone_lockedmem_kstat = NULL;
5051 4883 zone->zone_swapresv_kstat = NULL;
5052 4884 zone->zone_physmem_kstat = NULL;
5053 4885 zone->zone_zfs_io_pri = 1;
5054 4886
5055 4887 /*
5056 4888 * Zsched initializes the rctls.
5057 4889 */
5058 4890 zone->zone_rctls = NULL;
5059 4891
5060 4892 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
5061 4893 zone_free(zone);
5062 4894 return (zone_create_error(error, 0, extended_error));
5063 4895 }
5064 4896
5065 4897 if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
5066 4898 zone_free(zone);
5067 4899 return (set_errno(error));
5068 4900 }
5069 4901
5070 4902 /*
5071 4903 * Read in the trusted system parameters:
5072 4904 * match flag and sensitivity label.
5073 4905 */
5074 4906 zone->zone_match = match;
5075 4907 if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
5076 4908 /* Fail if requested to set doi to anything but system's doi */
5077 4909 if (doi != 0 && doi != default_doi) {
5078 4910 zone_free(zone);
5079 4911 return (set_errno(EINVAL));
5080 4912 }
5081 4913 /* Always apply system's doi to the zone */
5082 4914 error = zone_set_label(zone, label, default_doi);
5083 4915 if (error != 0) {
5084 4916 zone_free(zone);
5085 4917 return (set_errno(error));
5086 4918 }
5087 4919 insert_label_hash = B_TRUE;
5088 4920 } else {
5089 4921 /* all zones get an admin_low label if system is not labeled */
5090 4922 zone->zone_slabel = l_admin_low;
5091 4923 label_hold(l_admin_low);
5092 4924 insert_label_hash = B_FALSE;
5093 4925 }
5094 4926
5095 4927 /*
5096 4928 * Stop all lwps since that's what normally happens as part of fork().
5097 4929 * This needs to happen before we grab any locks to avoid deadlock
5098 4930 * (another lwp in the process could be waiting for the held lock).
5099 4931 */
5100 4932 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
5101 4933 zone_free(zone);
5102 4934 nvlist_free(rctls);
5103 4935 return (zone_create_error(error, 0, extended_error));
5104 4936 }
5105 4937
5106 4938 if (block_mounts(zone) == 0) {
5107 4939 mutex_enter(&pp->p_lock);
5108 4940 if (curthread != pp->p_agenttp)
5109 4941 continuelwps(pp);
5110 4942 mutex_exit(&pp->p_lock);
5111 4943 zone_free(zone);
5112 4944 nvlist_free(rctls);
5113 4945 return (zone_create_error(error, 0, extended_error));
5114 4946 }
5115 4947
5116 4948 /*
5117 4949 * Set up credential for kernel access. After this, any errors
5118 4950 * should go through the dance in errout rather than calling
5119 4951 * zone_free directly.
5120 4952 */
5121 4953 zone->zone_kcred = crdup(kcred);
5122 4954 crsetzone(zone->zone_kcred, zone);
5123 4955 priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
5124 4956 priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
5125 4957 priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
5126 4958 priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
5127 4959
5128 4960 mutex_enter(&zonehash_lock);
5129 4961 /*
5130 4962 * Make sure zone doesn't already exist.
5131 4963 *
5132 4964 * If the system and zone are labeled,
5133 4965 * make sure no other zone exists that has the same label.
5134 4966 */
5135 4967 if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
5136 4968 (insert_label_hash &&
5137 4969 (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
5138 4970 zone_status_t status;
5139 4971
5140 4972 status = zone_status_get(ztmp);
5141 4973 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
5142 4974 error = EEXIST;
5143 4975 else
5144 4976 error = EBUSY;
5145 4977
5146 4978 if (insert_label_hash)
5147 4979 error2 = ZE_LABELINUSE;
5148 4980
5149 4981 goto errout;
5150 4982 }
5151 4983
5152 4984 /*
5153 4985 * Don't allow zone creations which would cause one zone's rootpath to
5154 4986 * be accessible from that of another (non-global) zone.
5155 4987 */
5156 4988 if (zone_is_nested(zone->zone_rootpath)) {
5157 4989 error = EBUSY;
5158 4990 goto errout;
5159 4991 }
5160 4992
5161 4993 ASSERT(zonecount != 0); /* check for leaks */
5162 4994 if (zonecount + 1 > maxzones) {
5163 4995 error = ENOMEM;
5164 4996 goto errout;
5165 4997 }
5166 4998
5167 4999 if (zone_mount_count(zone->zone_rootpath) != 0) {
5168 5000 error = EBUSY;
5169 5001 error2 = ZE_AREMOUNTS;
5170 5002 goto errout;
5171 5003 }
5172 5004
5173 5005 /*
5174 5006 * Zone is still incomplete, but we need to drop all locks while
5175 5007 * zsched() initializes this zone's kernel process. We
5176 5008 * optimistically add the zone to the hashtable and associated
5177 5009 * lists so a parallel zone_create() doesn't try to create the
5178 5010 * same zone.
5179 5011 */
5180 5012 zonecount++;
5181 5013 (void) mod_hash_insert(zonehashbyid,
5182 5014 (mod_hash_key_t)(uintptr_t)zone->zone_id,
5183 5015 (mod_hash_val_t)(uintptr_t)zone);
5184 5016 str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
5185 5017 (void) strcpy(str, zone->zone_name);
5186 5018 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
5187 5019 (mod_hash_val_t)(uintptr_t)zone);
5188 5020 if (insert_label_hash) {
5189 5021 (void) mod_hash_insert(zonehashbylabel,
5190 5022 (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
5191 5023 zone->zone_flags |= ZF_HASHED_LABEL;
5192 5024 }
5193 5025
5194 5026 /*
5195 5027 * Insert into active list. At this point there are no 'hold's
5196 5028 * on the zone, but everyone else knows not to use it, so we can
5197 5029 * continue to use it. zsched() will do a zone_hold() if the
5198 5030 * newproc() is successful.
5199 5031 */
5200 5032 list_insert_tail(&zone_active, zone);
5201 5033 mutex_exit(&zonehash_lock);
5202 5034
5203 5035 zarg.zone = zone;
5204 5036 zarg.nvlist = rctls;
5205 5037 /*
5206 5038 * The process, task, and project rctls are probably wrong;
5207 5039 * we need an interface to get the default values of all rctls,
5208 5040 * and initialize zsched appropriately. However, we allow zoneadmd
5209 5041 * to pass down both zone and project rctls for the zone's init.
5210 5042 */
5211 5043 error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
5212 5044 if (error != 0) {
5213 5045 /*
5214 5046 * We need to undo all globally visible state.
5215 5047 */
5216 5048 mutex_enter(&zonehash_lock);
5217 5049 list_remove(&zone_active, zone);
5218 5050 if (zone->zone_flags & ZF_HASHED_LABEL) {
5219 5051 ASSERT(zone->zone_slabel != NULL);
5220 5052 (void) mod_hash_destroy(zonehashbylabel,
5221 5053 (mod_hash_key_t)zone->zone_slabel);
5222 5054 }
5223 5055 (void) mod_hash_destroy(zonehashbyname,
5224 5056 (mod_hash_key_t)(uintptr_t)zone->zone_name);
5225 5057 (void) mod_hash_destroy(zonehashbyid,
5226 5058 (mod_hash_key_t)(uintptr_t)zone->zone_id);
5227 5059 ASSERT(zonecount > 1);
5228 5060 zonecount--;
5229 5061 goto errout;
5230 5062 }
5231 5063
5232 5064 /*
5233 5065 * Zone creation can't fail from now on.
5234 5066 */
5235 5067
5236 5068 /*
5237 5069 * Create zone kstats
5238 5070 */
5239 5071 zone_kstat_create(zone);
5240 5072
5241 5073 /*
5242 5074 * Let the other lwps continue.
5243 5075 */
5244 5076 mutex_enter(&pp->p_lock);
5245 5077 if (curthread != pp->p_agenttp)
5246 5078 continuelwps(pp);
5247 5079 mutex_exit(&pp->p_lock);
5248 5080
5249 5081 /*
5250 5082 * Wait for zsched to finish initializing the zone.
5251 5083 */
5252 5084 zone_status_wait(zone, ZONE_IS_READY);
5253 5085 /*
5254 5086 * The zone is fully visible, so we can let mounts progress.
5255 5087 */
5256 5088 resume_mounts(zone);
5257 5089 nvlist_free(rctls);
5258 5090
5259 5091 return (zoneid);
5260 5092
5261 5093 errout:
5262 5094 mutex_exit(&zonehash_lock);
5263 5095 /*
5264 5096 * Let the other lwps continue.
5265 5097 */
5266 5098 mutex_enter(&pp->p_lock);
5267 5099 if (curthread != pp->p_agenttp)
5268 5100 continuelwps(pp);
5269 5101 mutex_exit(&pp->p_lock);
5270 5102
5271 5103 resume_mounts(zone);
5272 5104 nvlist_free(rctls);
5273 5105 /*
5274 5106 * There is currently one reference to the zone, a cred_ref from
5275 5107 * zone_kcred. To free the zone, we call crfree, which will call
5276 5108 * zone_cred_rele, which will call zone_free.
5277 5109 */
5278 5110 ASSERT(zone->zone_cred_ref == 1);
5279 5111 ASSERT(zone->zone_kcred->cr_ref == 1);
5280 5112 ASSERT(zone->zone_ref == 0);
5281 5113 zkcr = zone->zone_kcred;
5282 5114 zone->zone_kcred = NULL;
5283 5115 crfree(zkcr); /* triggers call to zone_free */
5284 5116 return (zone_create_error(error, error2, extended_error));
5285 5117 }
5286 5118
5287 5119 /*
5288 5120 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do
5289 5121 * the heavy lifting. The program launched at the "top" of the zone
5290 5122 * is named by zone_initname (settable via ZONE_ATTR_INITNAME), which
5291 5123 * defaults to zone_default_initname.
5292 5124 */
5293 5125 static int
5294 5126 zone_boot(zoneid_t zoneid)
5295 5127 {
5296 5128 int err;
5297 5129 zone_t *zone;
5298 5130
5299 5131 if (secpolicy_zone_config(CRED()) != 0)
5300 5132 return (set_errno(EPERM));
5301 5133 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5302 5134 return (set_errno(EINVAL));
5303 5135
5304 5136 mutex_enter(&zonehash_lock);
5305 5137 /*
5306 5138 * Look for zone under hash lock to prevent races with calls to
5307 5139 * zone_shutdown, zone_destroy, etc.
5308 5140 */
5309 5141 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5310 5142 mutex_exit(&zonehash_lock);
5311 5143 return (set_errno(EINVAL));
5312 5144 }
5313 5145
5314 5146 mutex_enter(&zone_status_lock);
5315 5147 if (zone_status_get(zone) != ZONE_IS_READY) {
5316 5148 mutex_exit(&zone_status_lock);
5317 5149 mutex_exit(&zonehash_lock);
5318 5150 return (set_errno(EINVAL));
5319 5151 }
5320 5152 zone_status_set(zone, ZONE_IS_BOOTING);
5321 5153 mutex_exit(&zone_status_lock);
5322 5154
5323 5155 zone_hold(zone); /* so we can use the zone_t later */
5324 5156 mutex_exit(&zonehash_lock);
5325 5157
5326 5158 if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
5327 5159 zone_rele(zone);
5328 5160 return (set_errno(EINTR));
5329 5161 }
5330 5162
5331 5163 /*
5332 5164 * Boot (starting init) might have failed, in which case the zone
5333 5165 * will go to the SHUTTING_DOWN state; an appropriate errno will
5334 5166 * be placed in zone->zone_boot_err, and so we return that.
5335 5167 */
5336 5168 err = zone->zone_boot_err;
5337 5169 zone_rele(zone);
5338 5170 return (err ? set_errno(err) : 0);
5339 5171 }
5340 5172
5341 5173 /*
5342 5174 * Kills all user processes in the zone, waiting for them all to exit
5343 5175 * before returning.
5344 5176 */
5345 5177 static int
5346 5178 zone_empty(zone_t *zone)
5347 5179 {
5348 - int cnt = 0;
5349 5180 int waitstatus;
5350 5181
5351 5182 /*
5352 5183 * We need to drop zonehash_lock before killing all
5353 5184 * processes, otherwise we'll deadlock with zone_find_*
5354 5185 * which can be called from the exit path.
5355 5186 */
5356 5187 ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
5357 5188 while ((waitstatus = zone_status_timedwait_sig(zone,
5358 5189 ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
5359 - boolean_t force = B_FALSE;
5360 -
5361 - /* Every 30 seconds, try harder */
5362 - if (cnt++ >= 30) {
5363 - cmn_err(CE_WARN, "attempt to force kill zone %d\n",
5364 - zone->zone_id);
5365 - force = B_TRUE;
5366 - cnt = 0;
5367 - }
5368 - killall(zone->zone_id, force);
5190 + killall(zone->zone_id);
5369 5191 }
5370 5192 /*
5371 5193 * return EINTR if we were signaled
5372 5194 */
5373 5195 if (waitstatus == 0)
5374 5196 return (EINTR);
5375 5197 return (0);
5376 5198 }
5377 5199
5378 5200 /*
5379 5201 * This function implements the policy for zone visibility.
5380 5202 *
5381 5203 * In standard Solaris, a non-global zone can only see itself.
5382 5204 *
5383 5205 * In Trusted Extensions, a labeled zone can lookup any zone whose label
5384 5206 * it dominates. For this test, the label of the global zone is treated as
5385 5207 * admin_high so it is special-cased instead of being checked for dominance.
5386 5208 *
5387 5209 * Returns true if zone attributes are viewable, false otherwise.
5388 5210 */
5389 5211 static boolean_t
5390 5212 zone_list_access(zone_t *zone)
5391 5213 {
5392 5214
5393 5215 if (curproc->p_zone == global_zone ||
5394 5216 curproc->p_zone == zone) {
5395 5217 return (B_TRUE);
5396 5218 } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
5397 5219 bslabel_t *curproc_label;
5398 5220 bslabel_t *zone_label;
5399 5221
5400 5222 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
5401 5223 zone_label = label2bslabel(zone->zone_slabel);
5402 5224
5403 5225 if (zone->zone_id != GLOBAL_ZONEID &&
5404 5226 bldominates(curproc_label, zone_label)) {
5405 5227 return (B_TRUE);
5406 5228 } else {
5407 5229 return (B_FALSE);
5408 5230 }
5409 5231 } else {
5410 5232 return (B_FALSE);
5411 5233 }
5412 5234 }
5413 5235
5414 5236 /*
5415 5237 * Systemcall to start the zone's halt sequence. By the time this
5416 5238 * function successfully returns, all user processes and kernel threads
5417 5239 * executing in it will have exited, ZSD shutdown callbacks executed,
5418 5240 * and the zone status set to ZONE_IS_DOWN.
5419 5241 *
5420 5242 * It is possible that the call will interrupt itself if the caller is the
5421 5243 * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
5422 5244 */
5423 5245 static int
5424 5246 zone_shutdown(zoneid_t zoneid)
5425 5247 {
5426 5248 int error;
5427 5249 zone_t *zone;
5428 5250 zone_status_t status;
5429 5251
5430 5252 if (secpolicy_zone_config(CRED()) != 0)
5431 5253 return (set_errno(EPERM));
5432 5254 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5433 5255 return (set_errno(EINVAL));
5434 5256
5435 5257 mutex_enter(&zonehash_lock);
5436 5258 /*
5437 5259 * Look for zone under hash lock to prevent races with other
5438 5260 * calls to zone_shutdown and zone_destroy.
5439 5261 */
5440 5262 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5441 5263 mutex_exit(&zonehash_lock);
5442 5264 return (set_errno(EINVAL));
5443 5265 }
5444 5266
5445 5267 /*
5446 5268 * We have to drop zonehash_lock before calling block_mounts.
5447 5269 * Hold the zone so we can continue to use the zone_t.
5448 5270 */
5449 5271 zone_hold(zone);
5450 5272 mutex_exit(&zonehash_lock);
5451 5273
5452 5274 /*
5453 5275 * Block mounts so that VFS_MOUNT() can get an accurate view of
5454 5276 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
5455 5277 *
5456 5278 * e.g. NFS can fail the mount if it determines that the zone
5457 5279 * has already begun the shutdown sequence.
5458 5280 *
5459 5281 */
5460 5282 if (block_mounts(zone) == 0) {
5461 5283 zone_rele(zone);
5462 5284 return (set_errno(EINTR));
5463 5285 }
5464 5286
5465 5287 mutex_enter(&zonehash_lock);
5466 5288 mutex_enter(&zone_status_lock);
5467 5289 status = zone_status_get(zone);
5468 5290 /*
5469 5291 * Fail if the zone isn't fully initialized yet.
5470 5292 */
5471 5293 if (status < ZONE_IS_READY) {
5472 5294 mutex_exit(&zone_status_lock);
5473 5295 mutex_exit(&zonehash_lock);
5474 5296 resume_mounts(zone);
5475 5297 zone_rele(zone);
5476 5298 return (set_errno(EINVAL));
5477 5299 }
5478 5300 /*
5479 5301 * If conditions required for zone_shutdown() to return have been met,
5480 5302 * return success.
5481 5303 */
5482 5304 if (status >= ZONE_IS_DOWN) {
5483 5305 mutex_exit(&zone_status_lock);
5484 5306 mutex_exit(&zonehash_lock);
5485 5307 resume_mounts(zone);
5486 5308 zone_rele(zone);
5487 5309 return (0);
5488 5310 }
5489 5311 /*
5490 5312 * If zone_shutdown() hasn't been called before, go through the motions.
5491 5313 * If it has, there's nothing to do but wait for the kernel threads to
5492 5314 * drain.
5493 5315 */
5494 5316 if (status < ZONE_IS_EMPTY) {
5495 5317 uint_t ntasks;
5496 5318
5497 5319 mutex_enter(&zone->zone_lock);
5498 5320 if ((ntasks = zone->zone_ntasks) != 1) {
5499 5321 /*
5500 5322 * There's still stuff running.
5501 5323 */
5502 5324 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5503 5325 }
5504 5326 mutex_exit(&zone->zone_lock);
5505 5327 if (ntasks == 1) {
5506 5328 /*
5507 5329 * The only way to create another task is through
5508 5330 * zone_enter(), which will block until we drop
5509 5331 * zonehash_lock. The zone is empty.
5510 5332 */
5511 5333 if (zone->zone_kthreads == NULL) {
5512 5334 /*
5513 5335 * Skip ahead to ZONE_IS_DOWN
5514 5336 */
5515 5337 zone_status_set(zone, ZONE_IS_DOWN);
5516 5338 } else {
5517 5339 zone_status_set(zone, ZONE_IS_EMPTY);
5518 5340 }
5519 5341 }
5520 5342 }
5521 5343 mutex_exit(&zone_status_lock);
5522 5344 mutex_exit(&zonehash_lock);
5523 5345 resume_mounts(zone);
5524 5346
5525 5347 if (error = zone_empty(zone)) {
5526 5348 zone_rele(zone);
5527 5349 return (set_errno(error));
5528 5350 }
5529 5351 /*
5530 5352 * After the zone status goes to ZONE_IS_DOWN this zone will no
5531 5353 * longer be notified of changes to the pools configuration, so
5532 5354 * in order to not end up with a stale pool pointer, we point
5533 5355 * ourselves at the default pool and remove all resource
5534 5356 * visibility. This is especially important as the zone_t may
5535 5357 * languish on the deathrow for a very long time waiting for
5536 5358 * cred's to drain out.
5537 5359 *
5538 5360 * This rebinding of the zone can happen multiple times
5539 5361 * (presumably due to interrupted or parallel systemcalls)
5540 5362 * without any adverse effects.
5541 5363 */
5542 5364 if (pool_lock_intr() != 0) {
5543 5365 zone_rele(zone);
5544 5366 return (set_errno(EINTR));
5545 5367 }
5546 5368 if (pool_state == POOL_ENABLED) {
5547 5369 mutex_enter(&cpu_lock);
5548 5370 zone_pool_set(zone, pool_default);
5549 5371 /*
5550 5372 * The zone no longer needs to be able to see any cpus.
5551 5373 */
5552 5374 zone_pset_set(zone, ZONE_PS_INVAL);
5553 5375 mutex_exit(&cpu_lock);
5554 5376 }
5555 5377 pool_unlock();
5556 5378
5557 5379 /*
5558 5380 * ZSD shutdown callbacks can be executed multiple times, hence
5559 5381 * it is safe to not be holding any locks across this call.
5560 5382 */
5561 5383 zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5562 5384
5563 5385 mutex_enter(&zone_status_lock);
5564 5386 if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5565 5387 zone_status_set(zone, ZONE_IS_DOWN);
5566 5388 mutex_exit(&zone_status_lock);
5567 5389
5568 5390 /*
5569 5391 * Wait for kernel threads to drain.
5570 5392 */
5571 5393 if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5572 5394 zone_rele(zone);
5573 5395 return (set_errno(EINTR));
5574 5396 }
5575 5397
5576 5398 /*
5577 5399 * Zone can become down/destroyable even if the above wait
5578 5400 * returns EINTR, so any code added here may never execute.
5579 5401 * (i.e. don't add code here)
5580 5402 */
5581 5403
5582 5404 zone_rele(zone);
5583 5405 return (0);
5584 5406 }
5585 5407
5586 5408 /*
5587 5409 * Log the specified zone's reference counts. The caller should not be
5588 5410 * holding the zone's zone_lock.
5589 5411 */
5590 5412 static void
5591 5413 zone_log_refcounts(zone_t *zone)
5592 5414 {
5593 5415 char *buffer;
5594 5416 char *buffer_position;
5595 5417 uint32_t buffer_size;
5596 5418 uint32_t index;
5597 5419 uint_t ref;
5598 5420 uint_t cred_ref;
5599 5421
5600 5422 /*
5601 5423 * Construct a string representing the subsystem-specific reference
5602 5424 * counts. The counts are printed in ascending order by index into the
5603 5425 * zone_t::zone_subsys_ref array. The list will be surrounded by
5604 5426 * square brackets [] and will only contain nonzero reference counts.
5605 5427 *
5606 5428 * The buffer will hold two square bracket characters plus ten digits,
5607 5429 * one colon, one space, one comma, and some characters for a
5608 5430 * subsystem name per subsystem-specific reference count. (Unsigned 32-
5609 5431 * bit integers have at most ten decimal digits.) The last
5610 5432 * reference count's comma is replaced by the closing square
5611 5433 * bracket and a NULL character to terminate the string.
5612 5434 *
5613 5435 * NOTE: We have to grab the zone's zone_lock to create a consistent
5614 5436 * snapshot of the zone's reference counters.
5615 5437 *
5616 5438 * First, figure out how much space the string buffer will need.
5617 5439 * The buffer's size is stored in buffer_size.
5618 5440 */
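/*
 * Worked example (hypothetical): a single subsystem named "net" with a
 * nonzero count reserves strlen("net") + 13 = 16 bytes (up to ten
 * digits plus ':', ' ', and ','), so buffer_size = 2 + 16 = 18.  That
 * exactly fits the worst case "[net: 4294967295]" plus the NUL that
 * snprintf() leaves after the trailing ',' before that ',' is
 * overwritten with ']'.
 */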
5619 5441 buffer_size = 2; /* for the square brackets */
5620 5442 mutex_enter(&zone->zone_lock);
5621 5443 zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5622 5444 ref = zone->zone_ref;
5623 5445 cred_ref = zone->zone_cred_ref;
5624 5446 for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5625 5447 if (zone->zone_subsys_ref[index] != 0)
5626 5448 buffer_size += strlen(zone_ref_subsys_names[index]) +
5627 5449 13;
5628 5450 if (buffer_size == 2) {
5629 5451 /*
5630 5452 * No subsystems had nonzero reference counts. Don't bother
5631 5453 * with allocating a buffer; just log the general-purpose and
5632 5454 * credential reference counts.
5633 5455 */
5634 5456 mutex_exit(&zone->zone_lock);
5635 5457 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5636 5458 "Zone '%s' (ID: %d) is shutting down, but %u zone "
5637 5459 "references and %u credential references are still extant",
5638 5460 zone->zone_name, zone->zone_id, ref, cred_ref);
5639 5461 return;
5640 5462 }
5641 5463
5642 5464 /*
5643 5465 * buffer_size contains the exact number of characters that the
5644 5466 * buffer will need. Allocate the buffer and fill it with nonzero
5645 5467 * subsystem-specific reference counts. Surround the results with
5646 5468 * square brackets afterwards.
5647 5469 */
5648 5470 buffer = kmem_alloc(buffer_size, KM_SLEEP);
5649 5471 buffer_position = &buffer[1];
5650 5472 for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5651 5473 /*
5652 5474 * NOTE: The DDI's version of sprintf() returns a pointer to
5653 5475 * the modified buffer rather than the number of bytes written
5654 5476 * (as in snprintf(3C)). This is unfortunate and annoying.
5655 5477 * Therefore, we'll use snprintf() with INT_MAX to get the
5656 5478 * number of bytes written. Using INT_MAX is safe because
5657 5479 * the buffer is perfectly sized for the data: we'll never
5658 5480 * overrun the buffer.
5659 5481 */
5660 5482 if (zone->zone_subsys_ref[index] != 0)
5661 5483 buffer_position += snprintf(buffer_position, INT_MAX,
5662 5484 "%s: %u,", zone_ref_subsys_names[index],
5663 5485 zone->zone_subsys_ref[index]);
5664 5486 }
5665 5487 mutex_exit(&zone->zone_lock);
5666 5488 buffer[0] = '[';
5667 5489 ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5668 5490 ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5669 5491 buffer_position[-1] = ']';
5670 5492
5671 5493 /*
5672 5494 * Log the reference counts and free the message buffer.
5673 5495 */
5674 5496 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5675 5497 "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5676 5498 "%u credential references are still extant %s", zone->zone_name,
5677 5499 zone->zone_id, ref, cred_ref, buffer);
5678 5500 kmem_free(buffer, buffer_size);
5679 5501 }
5680 5502
5681 5503 /*
5682 5504 * Systemcall entry point to finalize the zone halt process. The caller
5683 5505 * must have already successfully called zone_shutdown().
5684 5506 *
5685 5507 * Upon successful completion, the zone will have been fully destroyed:
5686 5508 * zsched will have exited, destructor callbacks executed, and the zone
5687 5509 * removed from the list of active zones.
5688 5510 */
5689 5511 static int
5690 5512 zone_destroy(zoneid_t zoneid)
5691 5513 {
5692 5514 uint64_t uniqid;
5693 5515 zone_t *zone;
5694 5516 zone_status_t status;
5695 5517 clock_t wait_time;
5696 5518 boolean_t log_refcounts;
5697 5519
5698 5520 if (secpolicy_zone_config(CRED()) != 0)
5699 5521 return (set_errno(EPERM));
5700 5522 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5701 5523 return (set_errno(EINVAL));
5702 5524
5703 5525 mutex_enter(&zonehash_lock);
5704 5526 /*
5705 5527 * Look for zone under hash lock to prevent races with other
5706 5528 * calls to zone_destroy.
5707 5529 */
5708 5530 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5709 5531 mutex_exit(&zonehash_lock);
5710 5532 return (set_errno(EINVAL));
5711 5533 }
5712 5534
5713 5535 if (zone_mount_count(zone->zone_rootpath) != 0) {
5714 5536 mutex_exit(&zonehash_lock);
5715 5537 return (set_errno(EBUSY));
5716 5538 }
5717 5539 mutex_enter(&zone_status_lock);
5718 5540 status = zone_status_get(zone);
5719 5541 if (status < ZONE_IS_DOWN) {
5720 5542 mutex_exit(&zone_status_lock);
5721 5543 mutex_exit(&zonehash_lock);
5722 5544 return (set_errno(EBUSY));
5723 5545 } else if (status == ZONE_IS_DOWN) {
5724 5546 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5725 5547 }
5726 5548 mutex_exit(&zone_status_lock);
5727 5549 zone_hold(zone);
5728 5550 mutex_exit(&zonehash_lock);
5729 5551
5730 5552 /*
5731 5553 * wait for zsched to exit
5732 5554 */
5733 5555 zone_status_wait(zone, ZONE_IS_DEAD);
5734 5556 zone_zsd_callbacks(zone, ZSD_DESTROY);
5735 5557 zone->zone_netstack = NULL;
5736 5558 uniqid = zone->zone_uniqid;
5737 5559 zone_rele(zone);
5738 5560 zone = NULL; /* potentially free'd */
5739 5561
5740 5562 log_refcounts = B_FALSE;
5741 5563 wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5742 5564 mutex_enter(&zonehash_lock);
5743 5565 for (; /* ever */; ) {
5744 5566 boolean_t unref;
5745 5567 boolean_t refs_have_been_logged;
5746 5568
5747 5569 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5748 5570 zone->zone_uniqid != uniqid) {
5749 5571 /*
5750 5572 * The zone has gone away. Necessary conditions
5751 5573 * are met, so we return success.
5752 5574 */
5753 5575 mutex_exit(&zonehash_lock);
5754 5576 return (0);
5755 5577 }
5756 5578 mutex_enter(&zone->zone_lock);
5757 5579 unref = ZONE_IS_UNREF(zone);
5758 5580 refs_have_been_logged = (zone->zone_flags &
5759 5581 ZF_REFCOUNTS_LOGGED);
5760 5582 mutex_exit(&zone->zone_lock);
5761 5583 if (unref) {
5762 5584 /*
5763 5585 * There is only one reference to the zone -- that
5764 5586 * added when the zone was added to the hashtables --
5765 5587 * and things will remain this way until we drop
5766 5588 * zonehash_lock... we can go ahead and cleanup the
5767 5589 * zone.
5768 5590 */
5769 5591 break;
5770 5592 }
5771 5593
5772 5594 /*
5773 5595 * Wait for zone_rele_common() or zone_cred_rele() to signal
5774 5596 * zone_destroy_cv. zone_destroy_cv is signaled only when
5775 5597 * some zone's general-purpose reference count reaches one.
5776 5598 * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5777 5599 * on zone_destroy_cv, then log the zone's reference counts and
5778 5600 * continue to wait for zone_rele() and zone_cred_rele().
5779 5601 */
5780 5602 if (!refs_have_been_logged) {
5781 5603 if (!log_refcounts) {
5782 5604 /*
5783 5605 * This thread hasn't timed out waiting on
5784 5606 * zone_destroy_cv yet. Wait wait_time clock
5785 5607 * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5786 5608 * seconds) for the zone's references to clear.
5787 5609 */
5788 5610 ASSERT(wait_time > 0);
5789 5611 wait_time = cv_reltimedwait_sig(
5790 5612 &zone_destroy_cv, &zonehash_lock, wait_time,
5791 5613 TR_SEC);
5792 5614 if (wait_time > 0) {
5793 5615 /*
5794 5616 * A thread in zone_rele() or
5795 5617 * zone_cred_rele() signaled
5796 5618 * zone_destroy_cv before this thread's
5797 5619 * wait timed out. The zone might have
5798 5620 * only one reference left; find out!
5799 5621 */
5800 5622 continue;
5801 5623 } else if (wait_time == 0) {
5802 5624 /* The thread's process was signaled. */
5803 5625 mutex_exit(&zonehash_lock);
5804 5626 return (set_errno(EINTR));
5805 5627 }
5806 5628
5807 5629 /*
5808 5630 * The thread timed out while waiting on
5809 5631 * zone_destroy_cv. Even though the thread
5810 5632 * timed out, it has to check whether another
5811 5633 * thread woke up from zone_destroy_cv and
5812 5634 * destroyed the zone.
5813 5635 *
5814 5636 * If the zone still exists and has more than
5815 5637 * one unreleased general-purpose reference,
5816 5638 * then log the zone's reference counts.
5817 5639 */
5818 5640 log_refcounts = B_TRUE;
5819 5641 continue;
5820 5642 }
5821 5643
5822 5644 /*
5823 5645 * The thread already timed out on zone_destroy_cv while
5824 5646 * waiting for subsystems to release the zone's last
5825 5647 * general-purpose references. Log the zone's reference
5826 5648 * counts and wait indefinitely on zone_destroy_cv.
5827 5649 */
5828 5650 zone_log_refcounts(zone);
5829 5651 }
5830 5652 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5831 5653 /* The thread's process was signaled. */
5832 5654 mutex_exit(&zonehash_lock);
5833 5655 return (set_errno(EINTR));
5834 5656 }
5835 5657 }
5836 5658
5837 5659 /*
5838 5660 * Remove CPU cap for this zone now since we're not going to
5839 5661 * fail below this point.
5840 5662 */
5841 5663 cpucaps_zone_remove(zone);
5842 5664
5843 5665 /* Get rid of the zone's kstats */
5844 5666 zone_kstat_delete(zone);
5845 5667
5846 5668 /* remove the pfexecd doors */
5847 5669 if (zone->zone_pfexecd != NULL) {
5848 5670 klpd_freelist(&zone->zone_pfexecd);
5849 5671 zone->zone_pfexecd = NULL;
5850 5672 }
5851 5673
5852 5674 /* free brand specific data */
5853 5675 if (ZONE_IS_BRANDED(zone))
5854 5676 ZBROP(zone)->b_free_brand_data(zone);
5855 5677
5856 5678 /* Say goodbye to brand framework. */
5857 5679 brand_unregister_zone(zone->zone_brand);
5858 5680
5859 5681 /*
5860 5682 * It is now safe to let the zone be recreated; remove it from the
5861 5683 * lists. The memory will not be freed until the last cred
5862 5684 * reference goes away.
5863 5685 */
5864 5686 ASSERT(zonecount > 1); /* must be > 1; can't destroy global zone */
5865 5687 zonecount--;
5866 5688 /* remove from active list and hash tables */
5867 5689 list_remove(&zone_active, zone);
5868 5690 (void) mod_hash_destroy(zonehashbyname,
5869 5691 (mod_hash_key_t)zone->zone_name);
5870 5692 (void) mod_hash_destroy(zonehashbyid,
5871 5693 (mod_hash_key_t)(uintptr_t)zone->zone_id);
5872 5694 if (zone->zone_flags & ZF_HASHED_LABEL)
5873 5695 (void) mod_hash_destroy(zonehashbylabel,
5874 5696 (mod_hash_key_t)zone->zone_slabel);
5875 5697 mutex_exit(&zonehash_lock);
5876 5698
5877 5699 /*
5878 5700 * Release the root vnode; we're not using it anymore. Nor should any
5879 5701 * other thread that might access it exist.
5880 5702 */
5881 5703 if (zone->zone_rootvp != NULL) {
5882 5704 VN_RELE(zone->zone_rootvp);
5883 5705 zone->zone_rootvp = NULL;
5884 5706 }
5885 5707
5886 5708 /* add to deathrow list */
5887 5709 mutex_enter(&zone_deathrow_lock);
5888 5710 list_insert_tail(&zone_deathrow, zone);
5889 5711 mutex_exit(&zone_deathrow_lock);
5890 5712
5891 5713 /*
5892 5714 * Drop last reference (which was added by zsched()), this will
5893 5715 * free the zone unless there are outstanding cred references.
5894 5716 */
5895 5717 zone_rele(zone);
5896 5718 return (0);
5897 5719 }
5898 5720
5899 5721 /*
5900 5722 * Systemcall entry point for zone_getattr(2).
5901 5723 */
5902 5724 static ssize_t
5903 5725 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5904 5726 {
5905 5727 size_t size;
5906 5728 int error = 0, err;
5907 5729 zone_t *zone;
5908 5730 char *zonepath;
5909 5731 char *outstr;
5910 5732 zone_status_t zone_status;
5911 5733 pid_t initpid;
5912 5734 boolean_t global = (curzone == global_zone);
5913 5735 boolean_t inzone = (curzone->zone_id == zoneid);
5914 5736 ushort_t flags;
5915 5737 zone_net_data_t *zbuf;
5916 5738
5917 5739 mutex_enter(&zonehash_lock);
5918 5740 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5919 5741 mutex_exit(&zonehash_lock);
5920 5742 return (set_errno(EINVAL));
5921 5743 }
5922 5744 zone_status = zone_status_get(zone);
5923 5745 if (zone_status < ZONE_IS_INITIALIZED) {
5924 5746 mutex_exit(&zonehash_lock);
5925 5747 return (set_errno(EINVAL));
5926 5748 }
5927 5749 zone_hold(zone);
5928 5750 mutex_exit(&zonehash_lock);
5929 5751
5930 5752 /*
5931 5753 * If not in the global zone, don't show information about other zones,
5932 5754 * unless the system is labeled and the local zone's label dominates
5933 5755 * the other zone.
5934 5756 */
5935 5757 if (!zone_list_access(zone)) {
5936 5758 zone_rele(zone);
5937 5759 return (set_errno(EINVAL));
5938 5760 }
5939 5761
5940 5762 switch (attr) {
5941 5763 case ZONE_ATTR_ROOT:
5942 5764 if (global) {
5943 5765 /*
5944 5766 * Copy the path to trim the trailing "/" (except for
5945 5767 * the global zone).
5946 5768 */
5947 5769 if (zone != global_zone)
5948 5770 size = zone->zone_rootpathlen - 1;
5949 5771 else
5950 5772 size = zone->zone_rootpathlen;
5951 5773 zonepath = kmem_alloc(size, KM_SLEEP);
5952 5774 bcopy(zone->zone_rootpath, zonepath, size);
5953 5775 zonepath[size - 1] = '\0';
5954 5776 } else {
5955 5777 if (inzone || !is_system_labeled()) {
5956 5778 /*
5957 5779 * Caller is not in the global zone.
5958 5780 * If the query is on the current zone
5959 5781 * or the system is not labeled,
5960 5782 * just return a faked-up path for the current zone.
5961 5783 */
5962 5784 zonepath = "/";
5963 5785 size = 2;
5964 5786 } else {
5965 5787 /*
5966 5788 * Return the related path (zone_prefix + zone name) for the zone.
5967 5789 */
5968 5790 int prefix_len = strlen(zone_prefix);
5969 5791 int zname_len = strlen(zone->zone_name);
5970 5792
5971 5793 size = prefix_len + zname_len + 1;
5972 5794 zonepath = kmem_alloc(size, KM_SLEEP);
5973 5795 bcopy(zone_prefix, zonepath, prefix_len);
5974 5796 bcopy(zone->zone_name, zonepath +
5975 5797 prefix_len, zname_len);
5976 5798 zonepath[size - 1] = '\0';
5977 5799 }
5978 5800 }
5979 5801 if (bufsize > size)
5980 5802 bufsize = size;
5981 5803 if (buf != NULL) {
5982 5804 err = copyoutstr(zonepath, buf, bufsize, NULL);
5983 5805 if (err != 0 && err != ENAMETOOLONG)
5984 5806 error = EFAULT;
5985 5807 }
5986 5808 if (global || (is_system_labeled() && !inzone))
5987 5809 kmem_free(zonepath, size);
5988 5810 break;
5989 5811
5990 5812 case ZONE_ATTR_NAME:
5991 5813 size = strlen(zone->zone_name) + 1;
5992 5814 if (bufsize > size)
5993 5815 bufsize = size;
5994 5816 if (buf != NULL) {
5995 5817 err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5996 5818 if (err != 0 && err != ENAMETOOLONG)
5997 5819 error = EFAULT;
5998 5820 }
5999 5821 break;
6000 5822
6001 5823 case ZONE_ATTR_STATUS:
6002 5824 /*
6003 5825 * Since we're not holding zonehash_lock, the zone status
6004 5826 * may be anything; leave it up to userland to sort it out.
6005 5827 */
6006 5828 size = sizeof (zone_status);
6007 5829 if (bufsize > size)
6008 5830 bufsize = size;
6009 5831 zone_status = zone_status_get(zone);
6010 5832 if (buf != NULL &&
6011 5833 copyout(&zone_status, buf, bufsize) != 0)
6012 5834 error = EFAULT;
6013 5835 break;
6014 5836 case ZONE_ATTR_FLAGS:
6015 5837 size = sizeof (zone->zone_flags);
6016 5838 if (bufsize > size)
6017 5839 bufsize = size;
6018 5840 flags = zone->zone_flags;
6019 5841 if (buf != NULL &&
6020 5842 copyout(&flags, buf, bufsize) != 0)
6021 5843 error = EFAULT;
6022 5844 break;
6023 5845 case ZONE_ATTR_PRIVSET:
6024 5846 size = sizeof (priv_set_t);
6025 5847 if (bufsize > size)
6026 5848 bufsize = size;
6027 5849 if (buf != NULL &&
6028 5850 copyout(zone->zone_privset, buf, bufsize) != 0)
6029 5851 error = EFAULT;
6030 5852 break;
6031 5853 case ZONE_ATTR_UNIQID:
6032 5854 size = sizeof (zone->zone_uniqid);
6033 5855 if (bufsize > size)
6034 5856 bufsize = size;
6035 5857 if (buf != NULL &&
6036 5858 copyout(&zone->zone_uniqid, buf, bufsize) != 0)
6037 5859 error = EFAULT;
6038 5860 break;
6039 5861 case ZONE_ATTR_POOLID:
6040 5862 {
6041 5863 pool_t *pool;
6042 5864 poolid_t poolid;
6043 5865
6044 5866 if (pool_lock_intr() != 0) {
6045 5867 error = EINTR;
6046 5868 break;
6047 5869 }
6048 5870 pool = zone_pool_get(zone);
6049 5871 poolid = pool->pool_id;
6050 5872 pool_unlock();
6051 5873 size = sizeof (poolid);
6052 5874 if (bufsize > size)
6053 5875 bufsize = size;
6054 5876 if (buf != NULL && copyout(&poolid, buf, size) != 0)
6055 5877 error = EFAULT;
6056 5878 }
6057 5879 break;
6058 5880 case ZONE_ATTR_SLBL:
6059 5881 size = sizeof (bslabel_t);
6060 5882 if (bufsize > size)
6061 5883 bufsize = size;
6062 5884 if (zone->zone_slabel == NULL)
6063 5885 error = EINVAL;
6064 5886 else if (buf != NULL &&
6065 5887 copyout(label2bslabel(zone->zone_slabel), buf,
6066 5888 bufsize) != 0)
6067 5889 error = EFAULT;
6068 5890 break;
6069 5891 case ZONE_ATTR_INITPID:
6070 5892 size = sizeof (initpid);
6071 5893 if (bufsize > size)
6072 5894 bufsize = size;
6073 5895 initpid = zone->zone_proc_initpid;
6074 5896 if (initpid == -1) {
6075 5897 error = ESRCH;
6076 5898 break;
6077 5899 }
6078 5900 if (buf != NULL &&
6079 5901 copyout(&initpid, buf, bufsize) != 0)
6080 5902 error = EFAULT;
6081 5903 break;
6082 5904 case ZONE_ATTR_BRAND:
6083 5905 size = strlen(zone->zone_brand->b_name) + 1;
6084 5906
6085 5907 if (bufsize > size)
6086 5908 bufsize = size;
6087 5909 if (buf != NULL) {
6088 5910 err = copyoutstr(zone->zone_brand->b_name, buf,
6089 5911 bufsize, NULL);
6090 5912 if (err != 0 && err != ENAMETOOLONG)
6091 5913 error = EFAULT;
6092 5914 }
6093 5915 break;
6094 5916 case ZONE_ATTR_INITNAME:
6095 5917 size = strlen(zone->zone_initname) + 1;
6096 5918 if (bufsize > size)
6097 5919 bufsize = size;
6098 5920 if (buf != NULL) {
6099 5921 err = copyoutstr(zone->zone_initname, buf, bufsize,
6100 5922 NULL);
6101 5923 if (err != 0 && err != ENAMETOOLONG)
6102 5924 error = EFAULT;
6103 5925 }
6104 5926 break;
6105 5927 case ZONE_ATTR_BOOTARGS:
6106 5928 if (zone->zone_bootargs == NULL)
6107 5929 outstr = "";
6108 5930 else
6109 5931 outstr = zone->zone_bootargs;
6110 5932 size = strlen(outstr) + 1;
6111 5933 if (bufsize > size)
6112 5934 bufsize = size;
6113 5935 if (buf != NULL) {
6114 5936 err = copyoutstr(outstr, buf, bufsize, NULL);
6115 5937 if (err != 0 && err != ENAMETOOLONG)
6116 5938 error = EFAULT;
6117 5939 }
6118 5940 break;
6119 5941 case ZONE_ATTR_SCHED_CLASS:
6120 5942 mutex_enter(&class_lock);
6121 5943
6122 5944 if (zone->zone_defaultcid >= loaded_classes)
6123 5945 outstr = "";
6124 5946 else
6125 5947 outstr = sclass[zone->zone_defaultcid].cl_name;
6126 5948 size = strlen(outstr) + 1;
6127 5949 if (bufsize > size)
6128 5950 bufsize = size;
6129 5951 if (buf != NULL) {
6130 5952 err = copyoutstr(outstr, buf, bufsize, NULL);
6131 5953 if (err != 0 && err != ENAMETOOLONG)
6132 5954 error = EFAULT;
6133 5955 }
6134 5956
6135 5957 mutex_exit(&class_lock);
6136 5958 break;
6137 5959 case ZONE_ATTR_HOSTID:
6138 5960 if (zone->zone_hostid != HW_INVALID_HOSTID &&
6139 5961 bufsize == sizeof (zone->zone_hostid)) {
6140 5962 size = sizeof (zone->zone_hostid);
6141 5963 if (buf != NULL && copyout(&zone->zone_hostid, buf,
6142 5964 bufsize) != 0)
6143 5965 error = EFAULT;
6144 5966 } else {
6145 5967 error = EINVAL;
6146 5968 }
6147 5969 break;
6148 5970 case ZONE_ATTR_FS_ALLOWED:
6149 5971 if (zone->zone_fs_allowed == NULL)
6150 5972 outstr = "";
6151 5973 else
6152 5974 outstr = zone->zone_fs_allowed;
6153 5975 size = strlen(outstr) + 1;
6154 5976 if (bufsize > size)
6155 5977 bufsize = size;
6156 5978 if (buf != NULL) {
6157 5979 err = copyoutstr(outstr, buf, bufsize, NULL);
6158 5980 if (err != 0 && err != ENAMETOOLONG)
6159 5981 error = EFAULT;
6160 5982 }
6161 5983 break;
6162 5984 case ZONE_ATTR_NETWORK:
6163 5985 zbuf = kmem_alloc(bufsize, KM_SLEEP);
6164 5986 if (copyin(buf, zbuf, bufsize) != 0) {
6165 5987 error = EFAULT;
6166 5988 } else {
6167 5989 error = zone_get_network(zoneid, zbuf);
6168 5990 if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
6169 5991 error = EFAULT;
6170 5992 }
6171 5993 kmem_free(zbuf, bufsize);
6172 5994 break;
6173 - case ZONE_ATTR_DID:
6174 - size = sizeof (zoneid_t);
6175 - if (bufsize > size)
6176 - bufsize = size;
6177 -
6178 - if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
6179 - error = EFAULT;
6180 - break;
6181 5995 case ZONE_ATTR_SCHED_FIXEDHI:
6182 5996 size = sizeof (boolean_t);
6183 5997 if (bufsize > size)
6184 5998 bufsize = size;
6185 5999
6186 6000 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6187 6001 bufsize) != 0)
6188 6002 error = EFAULT;
6189 6003 break;
6190 6004 default:
6191 6005 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6192 6006 size = bufsize;
6193 6007 error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6194 6008 } else {
6195 6009 error = EINVAL;
6196 6010 }
6197 6011 }
6198 6012 zone_rele(zone);
6199 6013
6200 6014 if (error)
6201 6015 return (set_errno(error));
6202 6016 return ((ssize_t)size);
6203 6017 }
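/*
 * A minimal userland sketch of the matching libc wrapper declared in
 * <zone.h> (assumed usage; error handling trimmed):
 *
 *	char name[ZONENAME_MAX];
 *	ssize_t size;
 *
 *	size = zone_getattr(getzoneid(), ZONE_ATTR_NAME, name,
 *	    sizeof (name));
 *	if (size < 0)
 *		perror("zone_getattr");
 *
 * On success the return value is the attribute's full size, which may
 * exceed the buffer size passed in when a string attribute was
 * truncated.
 */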
6204 6018
6205 6019 /*
6206 6020 * Systemcall entry point for zone_setattr(2).
6207 6021 */
6208 6022 /*ARGSUSED*/
6209 6023 static int
6210 6024 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
6211 6025 {
6212 6026 zone_t *zone;
6213 6027 zone_status_t zone_status;
6214 6028 int err = -1;
6215 6029 zone_net_data_t *zbuf;
6216 6030
6217 6031 if (secpolicy_zone_config(CRED()) != 0)
6218 6032 return (set_errno(EPERM));
6219 6033
6220 6034 /*
6221 6035 * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
6222 6036 * attributes can be set on the global zone.
6223 6037 */
6224 6038 if (zoneid == GLOBAL_ZONEID &&
6225 6039 attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
6226 6040 return (set_errno(EINVAL));
6227 6041 }
6228 6042
6229 6043 mutex_enter(&zonehash_lock);
6230 6044 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
6231 6045 mutex_exit(&zonehash_lock);
6232 6046 return (set_errno(EINVAL));
6233 6047 }
6234 6048 zone_hold(zone);
6235 6049 mutex_exit(&zonehash_lock);
6236 6050
6237 6051 /*
6238 6052 * At present most attributes can only be set on non-running,
6239 6053 * non-global zones.
6240 6054 */
6241 6055 zone_status = zone_status_get(zone);
6242 6056 if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
6243 6057 attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
6244 6058 zone_status > ZONE_IS_READY) {
6245 6059 err = EINVAL;
6246 6060 goto done;
6247 6061 }
6248 6062
6249 6063 switch (attr) {
6250 6064 case ZONE_ATTR_INITNAME:
6251 6065 err = zone_set_initname(zone, (const char *)buf);
6252 6066 break;
6253 6067 case ZONE_ATTR_INITNORESTART:
6254 6068 zone->zone_restart_init = B_FALSE;
6255 6069 err = 0;
6256 6070 break;
6257 6071 case ZONE_ATTR_BOOTARGS:
6258 6072 err = zone_set_bootargs(zone, (const char *)buf);
6259 6073 break;
6260 6074 case ZONE_ATTR_BRAND:
6261 6075 err = zone_set_brand(zone, (const char *)buf);
6262 6076 break;
6263 6077 case ZONE_ATTR_FS_ALLOWED:
6264 6078 err = zone_set_fs_allowed(zone, (const char *)buf);
6265 6079 break;
6266 6080 case ZONE_ATTR_PMCAP_NOVER:
6267 6081 err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
6268 6082 break;
6269 6083 case ZONE_ATTR_PMCAP_PAGEOUT:
6270 6084 err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
6271 6085 break;
6272 6086 case ZONE_ATTR_PG_FLT_DELAY:
6273 6087 err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
6274 6088 break;
6275 6089 case ZONE_ATTR_RSS:
6276 6090 err = zone_set_rss(zone, (const uint64_t *)buf);
6277 6091 break;
6278 6092 case ZONE_ATTR_SCHED_CLASS:
6279 6093 err = zone_set_sched_class(zone, (const char *)buf);
6280 6094 break;
6281 6095 case ZONE_ATTR_HOSTID:
6282 6096 if (bufsize == sizeof (zone->zone_hostid)) {
6283 6097 if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
6284 6098 err = 0;
6285 6099 else
6286 6100 err = EFAULT;
6287 6101 } else {
6288 6102 err = EINVAL;
6289 6103 }
6290 6104 break;
6291 6105 case ZONE_ATTR_NETWORK:
6292 6106 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
6293 6107 err = EINVAL;
6294 6108 break;
6295 6109 }
6296 6110 zbuf = kmem_alloc(bufsize, KM_SLEEP);
6297 6111 if (copyin(buf, zbuf, bufsize) != 0) {
6298 6112 kmem_free(zbuf, bufsize);
6299 6113 err = EFAULT;
6300 6114 break;
6301 6115 }
6302 6116 err = zone_set_network(zoneid, zbuf);
6303 6117 kmem_free(zbuf, bufsize);
6304 6118 break;
6305 6119 case ZONE_ATTR_APP_SVC_CT:
6306 6120 if (bufsize != sizeof (boolean_t)) {
6307 6121 err = EINVAL;
6308 6122 } else {
6309 6123 zone->zone_setup_app_contract = (boolean_t)buf;
6310 6124 err = 0;
6311 6125 }
6312 6126 break;
6313 6127 case ZONE_ATTR_SCHED_FIXEDHI:
6314 6128 if (bufsize != sizeof (boolean_t)) {
6315 6129 err = EINVAL;
6316 6130 } else {
6317 6131 zone->zone_fixed_hipri = (boolean_t)buf;
6318 6132 err = 0;
6319 6133 }
6320 6134 break;
6321 6135 default:
6322 6136 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
6323 6137 err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
6324 6138 else
6325 6139 err = EINVAL;
6326 6140 }
6327 6141
6328 6142 done:
6329 6143 zone_rele(zone);
6330 6144 ASSERT(err != -1);
6331 6145 return (err != 0 ? set_errno(err) : 0);
6332 6146 }
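
A concrete illustration of the attribute contract above: a minimal userland sketch, assuming the consolidation-private libc wrappers zone_setattr() and zone_getattr() from <zone.h> and a hypothetical zone id zid. Per the status check above, ZONE_ATTR_HOSTID can only be set while the zone is no further along than ZONE_IS_READY, and bufsize must be exactly sizeof (uint32_t).

    #include <zone.h>
    #include <sys/zone.h>
    #include <stdio.h>

    /* Hedged sketch: set and read back a zone's hostid. */
    int
    set_hostid(zoneid_t zid)
    {
            uint32_t hostid = 0x00869f1a;   /* hypothetical value */

            if (zone_setattr(zid, ZONE_ATTR_HOSTID, &hostid,
                sizeof (hostid)) != 0) {
                    perror("zone_setattr");  /* EINVAL on size mismatch */
                    return (-1);
            }
            if (zone_getattr(zid, ZONE_ATTR_HOSTID, &hostid,
                sizeof (hostid)) < 0) {
                    perror("zone_getattr");
                    return (-1);
            }
            (void) printf("hostid is now %08x\n", hostid);
            return (0);
    }
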
6333 6147
6334 6148 /*
6335 6149  * Return zero if the process has at least one vnode mapped into its
6336 6150 * address space which shouldn't be allowed to change zones.
6337 6151 *
6338 6152 * Also return zero if the process has any shared mappings which reserve
6339 6153 * swap. This is because the counting for zone.max-swap does not allow swap
6340 6154  * reservation to be shared between zones. Zone swap reservation is counted
6341 6155  * against zone->zone_max_swap.
6342 6156 */
6343 6157 static int
6344 6158 as_can_change_zones(void)
6345 6159 {
6346 6160 proc_t *pp = curproc;
6347 6161 struct seg *seg;
6348 6162 struct as *as = pp->p_as;
6349 6163 vnode_t *vp;
6350 6164 int allow = 1;
6351 6165
6352 6166 ASSERT(pp->p_as != &kas);
6353 6167 AS_LOCK_ENTER(as, RW_READER);
6354 6168 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
6355 6169
6356 6170 /*
6357 6171 * Cannot enter zone with shared anon memory which
6358 6172 * reserves swap. See comment above.
6359 6173 */
6360 6174 if (seg_can_change_zones(seg) == B_FALSE) {
6361 6175 allow = 0;
6362 6176 break;
6363 6177 }
6364 6178 /*
6365 6179  * If we can't get a backing vnode for this segment, then skip
6366 6180  * it.
6367 6181 */
6368 6182 vp = NULL;
6369 6183 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
6370 6184 continue;
6371 6185 if (!vn_can_change_zones(vp)) { /* bail on first match */
6372 6186 allow = 0;
6373 6187 break;
6374 6188 }
6375 6189 }
6376 6190 AS_LOCK_EXIT(as);
6377 6191 return (allow);
6378 6192 }
6379 6193
6380 6194 /*
6381 6195 * Count swap reserved by curproc's address space
6382 6196 */
6383 6197 static size_t
6384 6198 as_swresv(void)
6385 6199 {
6386 6200 proc_t *pp = curproc;
6387 6201 struct seg *seg;
6388 6202 struct as *as = pp->p_as;
6389 6203 size_t swap = 0;
6390 6204
6391 6205 ASSERT(pp->p_as != &kas);
6392 6206 ASSERT(AS_WRITE_HELD(as));
6393 6207 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
6394 6208 swap += seg_swresv(seg);
6395 6209
6396 6210 return (swap);
6397 6211 }
6398 6212
6399 6213 /*
6400 6214  * System call entry point for zone_enter().
6401 6215 *
6402 6216 * The current process is injected into said zone. In the process
6403 6217 * it will change its project membership, privileges, rootdir/cwd,
6404 6218 * zone-wide rctls, and pool association to match those of the zone.
6405 6219 *
6406 6220 * The first zone_enter() called while the zone is in the ZONE_IS_READY
6407 6221 * state will transition it to ZONE_IS_RUNNING. Processes may only
6408 6222 * enter a zone that is "ready" or "running".
6409 6223 */
6410 6224 static int
6411 6225 zone_enter(zoneid_t zoneid)
6412 6226 {
6413 6227 zone_t *zone;
6414 6228 vnode_t *vp;
6415 6229 proc_t *pp = curproc;
6416 6230 contract_t *ct;
6417 6231 cont_process_t *ctp;
6418 6232 task_t *tk, *oldtk;
6419 6233 kproject_t *zone_proj0;
6420 6234 cred_t *cr, *newcr;
6421 6235 pool_t *oldpool, *newpool;
6422 6236 sess_t *sp;
6423 6237 uid_t uid;
6424 6238 zone_status_t status;
6425 6239 int err = 0;
6426 6240 rctl_entity_p_t e;
6427 6241 size_t swap;
6428 6242 kthread_id_t t;
6429 6243
6430 6244 if (secpolicy_zone_config(CRED()) != 0)
6431 6245 return (set_errno(EPERM));
6432 6246 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
6433 6247 return (set_errno(EINVAL));
6434 6248
6435 6249 /*
6436 6250 * Stop all lwps so we don't need to hold a lock to look at
6437 6251 * curproc->p_zone. This needs to happen before we grab any
6438 6252 * locks to avoid deadlock (another lwp in the process could
6439 6253 * be waiting for the held lock).
6440 6254 */
6441 6255 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
6442 6256 return (set_errno(EINTR));
6443 6257
6444 6258 /*
6445 6259  * Make sure we're not changing zones with files open or mapped into
6446 6260  * our address space which shouldn't be changing zones.
6447 6261 */
6448 6262 if (!files_can_change_zones()) {
6449 6263 err = EBADF;
6450 6264 goto out;
6451 6265 }
6452 6266 if (!as_can_change_zones()) {
6453 6267 err = EFAULT;
6454 6268 goto out;
6455 6269 }
6456 6270
6457 6271 mutex_enter(&zonehash_lock);
6458 6272 if (pp->p_zone != global_zone) {
6459 6273 mutex_exit(&zonehash_lock);
6460 6274 err = EINVAL;
6461 6275 goto out;
6462 6276 }
6463 6277
6464 6278 zone = zone_find_all_by_id(zoneid);
6465 6279 if (zone == NULL) {
6466 6280 mutex_exit(&zonehash_lock);
6467 6281 err = EINVAL;
6468 6282 goto out;
6469 6283 }
6470 6284
6471 6285 /*
6472 6286 * To prevent processes in a zone from holding contracts on
6473 6287 * extrazonal resources, and to avoid process contract
6474 6288 * memberships which span zones, contract holders and processes
6475 6289 * which aren't the sole members of their encapsulating process
6476 6290 * contracts are not allowed to zone_enter.
6477 6291 */
6478 6292 ctp = pp->p_ct_process;
6479 6293 ct = &ctp->conp_contract;
6480 6294 mutex_enter(&ct->ct_lock);
6481 6295 mutex_enter(&pp->p_lock);
6482 6296 if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
6483 6297 mutex_exit(&pp->p_lock);
6484 6298 mutex_exit(&ct->ct_lock);
6485 6299 mutex_exit(&zonehash_lock);
6486 6300 err = EINVAL;
6487 6301 goto out;
6488 6302 }
6489 6303
6490 6304 /*
6491 6305 * Moreover, we don't allow processes whose encapsulating
6492 6306 * process contracts have inherited extrazonal contracts.
6493 6307 * While it would be easier to eliminate all process contracts
6494 6308 * with inherited contracts, we need to be able to give a
6495 6309 * restarted init (or other zone-penetrating process) its
6496 6310 * predecessor's contracts.
6497 6311 */
6498 6312 if (ctp->conp_ninherited != 0) {
6499 6313 contract_t *next;
6500 6314 for (next = list_head(&ctp->conp_inherited); next;
6501 6315 next = list_next(&ctp->conp_inherited, next)) {
6502 6316 if (contract_getzuniqid(next) != zone->zone_uniqid) {
6503 6317 mutex_exit(&pp->p_lock);
6504 6318 mutex_exit(&ct->ct_lock);
6505 6319 mutex_exit(&zonehash_lock);
6506 6320 err = EINVAL;
6507 6321 goto out;
6508 6322 }
6509 6323 }
6510 6324 }
6511 6325
6512 6326 mutex_exit(&pp->p_lock);
6513 6327 mutex_exit(&ct->ct_lock);
6514 6328
6515 6329 status = zone_status_get(zone);
6516 6330 if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
6517 6331 /*
6518 6332 * Can't join
6519 6333 */
6520 6334 mutex_exit(&zonehash_lock);
6521 6335 err = EINVAL;
6522 6336 goto out;
6523 6337 }
6524 6338
6525 6339 /*
6526 6340  * Make sure the new priv set is within the permitted set for the caller
6527 6341 */
6528 6342 if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
6529 6343 mutex_exit(&zonehash_lock);
6530 6344 err = EPERM;
6531 6345 goto out;
6532 6346 }
6533 6347 /*
6534 6348 * We want to momentarily drop zonehash_lock while we optimistically
6535 6349 * bind curproc to the pool it should be running in. This is safe
6536 6350 * since the zone can't disappear (we have a hold on it).
6537 6351 */
6538 6352 zone_hold(zone);
6539 6353 mutex_exit(&zonehash_lock);
6540 6354
6541 6355 /*
6542 6356 * Grab pool_lock to keep the pools configuration from changing
6543 6357 * and to stop ourselves from getting rebound to another pool
6544 6358 * until we join the zone.
6545 6359 */
6546 6360 if (pool_lock_intr() != 0) {
6547 6361 zone_rele(zone);
6548 6362 err = EINTR;
6549 6363 goto out;
6550 6364 }
6551 6365 ASSERT(secpolicy_pool(CRED()) == 0);
6552 6366 /*
6553 6367 * Bind ourselves to the pool currently associated with the zone.
6554 6368 */
6555 6369 oldpool = curproc->p_pool;
6556 6370 newpool = zone_pool_get(zone);
6557 6371 if (pool_state == POOL_ENABLED && newpool != oldpool &&
6558 6372 (err = pool_do_bind(newpool, P_PID, P_MYID,
6559 6373 POOL_BIND_ALL)) != 0) {
6560 6374 pool_unlock();
6561 6375 zone_rele(zone);
6562 6376 goto out;
6563 6377 }
6564 6378
6565 6379 /*
6566 6380 * Grab cpu_lock now; we'll need it later when we call
6567 6381 * task_join().
6568 6382 */
6569 6383 mutex_enter(&cpu_lock);
6570 6384 mutex_enter(&zonehash_lock);
6571 6385 /*
6572 6386 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6573 6387 */
6574 6388 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6575 6389 /*
6576 6390 * Can't join anymore.
6577 6391 */
6578 6392 mutex_exit(&zonehash_lock);
6579 6393 mutex_exit(&cpu_lock);
6580 6394 if (pool_state == POOL_ENABLED &&
6581 6395 newpool != oldpool)
6582 6396 (void) pool_do_bind(oldpool, P_PID, P_MYID,
6583 6397 POOL_BIND_ALL);
6584 6398 pool_unlock();
6585 6399 zone_rele(zone);
6586 6400 err = EINVAL;
6587 6401 goto out;
6588 6402 }
6589 6403
6590 6404 /*
6591 6405  * a_lock must be held while transferring locked memory and swap
6592 6406 * reservation from the global zone to the non global zone because
6593 6407 * asynchronous faults on the processes' address space can lock
6594 6408 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6595 6409 * segments respectively.
6596 6410 */
6597 6411 AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6598 6412 swap = as_swresv();
6599 6413 mutex_enter(&pp->p_lock);
6600 6414 zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6601 6415 	/* verify that we do not exceed any task or lwp limits */
6602 6416 mutex_enter(&zone->zone_nlwps_lock);
6603 6417 /* add new lwps to zone and zone's proj0 */
6604 6418 zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6605 6419 zone->zone_nlwps += pp->p_lwpcnt;
6606 6420 /* add 1 task to zone's proj0 */
6607 6421 zone_proj0->kpj_ntasks += 1;
6608 6422
6609 6423 zone_proj0->kpj_nprocs++;
6610 6424 zone->zone_nprocs++;
6611 6425 mutex_exit(&zone->zone_nlwps_lock);
6612 6426
6613 6427 mutex_enter(&zone->zone_mem_lock);
6614 6428 zone->zone_locked_mem += pp->p_locked_mem;
6615 6429 zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6616 6430 zone->zone_max_swap += swap;
6617 6431 mutex_exit(&zone->zone_mem_lock);
6618 6432
6619 6433 mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6620 6434 zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6621 6435 mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6622 6436
6623 6437 /* remove lwps and process from proc's old zone and old project */
6624 6438 mutex_enter(&pp->p_zone->zone_nlwps_lock);
6625 6439 pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6626 6440 pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6627 6441 pp->p_task->tk_proj->kpj_nprocs--;
6628 6442 pp->p_zone->zone_nprocs--;
6629 6443 mutex_exit(&pp->p_zone->zone_nlwps_lock);
6630 6444
6631 6445 mutex_enter(&pp->p_zone->zone_mem_lock);
6632 6446 pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6633 6447 pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6634 6448 pp->p_zone->zone_max_swap -= swap;
6635 6449 mutex_exit(&pp->p_zone->zone_mem_lock);
6636 6450
6637 6451 mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6638 6452 pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6639 6453 mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6640 6454
6641 6455 pp->p_flag |= SZONETOP;
6642 6456 pp->p_zone = zone;
6643 6457 mutex_exit(&pp->p_lock);
6644 6458 AS_LOCK_EXIT(pp->p_as);
6645 6459
6646 6460 /*
6647 6461 * Joining the zone cannot fail from now on.
6648 6462 *
6649 6463 * This means that a lot of the following code can be commonized and
6650 6464 * shared with zsched().
6651 6465 */
6652 6466
6653 6467 /*
6654 6468 * If the process contract fmri was inherited, we need to
6655 6469 * flag this so that any contract status will not leak
6656 6470  * extra zone information (svc_fmri in this case).
6657 6471 */
6658 6472 if (ctp->conp_svc_ctid != ct->ct_id) {
6659 6473 mutex_enter(&ct->ct_lock);
6660 6474 ctp->conp_svc_zone_enter = ct->ct_id;
6661 6475 mutex_exit(&ct->ct_lock);
6662 6476 }
6663 6477
6664 6478 /*
6665 6479 * Reset the encapsulating process contract's zone.
6666 6480 */
6667 6481 ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6668 6482 contract_setzuniqid(ct, zone->zone_uniqid);
6669 6483
6670 6484 /*
6671 6485 * Create a new task and associate the process with the project keyed
6672 6486 * by (projid,zoneid).
6673 6487 *
6674 6488 * We might as well be in project 0; the global zone's projid doesn't
6675 6489 * make much sense in a zone anyhow.
6676 6490 *
6677 6491 * This also increments zone_ntasks, and returns with p_lock held.
6678 6492 */
6679 6493 tk = task_create(0, zone);
6680 6494 oldtk = task_join(tk, 0);
6681 6495 mutex_exit(&cpu_lock);
6682 6496
6683 6497 /*
6684 6498 * call RCTLOP_SET functions on this proc
6685 6499 */
6686 6500 e.rcep_p.zone = zone;
6687 6501 e.rcep_t = RCENTITY_ZONE;
6688 6502 (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6689 6503 RCD_CALLBACK);
6690 6504 mutex_exit(&pp->p_lock);
6691 6505
6692 6506 /*
6693 6507 * We don't need to hold any of zsched's locks here; not only do we know
6694 6508 * the process and zone aren't going away, we know its session isn't
6695 6509 * changing either.
6696 6510 *
6697 6511 * By joining zsched's session here, we mimic the behavior in the
6698 6512 * global zone of init's sid being the pid of sched. We extend this
6699 6513 * to all zlogin-like zone_enter()'ing processes as well.
6700 6514 */
6701 6515 mutex_enter(&pidlock);
6702 6516 sp = zone->zone_zsched->p_sessp;
6703 6517 sess_hold(zone->zone_zsched);
6704 6518 mutex_enter(&pp->p_lock);
6705 6519 pgexit(pp);
6706 6520 sess_rele(pp->p_sessp, B_TRUE);
6707 6521 pp->p_sessp = sp;
6708 6522 pgjoin(pp, zone->zone_zsched->p_pidp);
6709 6523
6710 6524 /*
6711 6525  * If any threads are scheduled to be placed on the zone's wait queue,
6712 6526  * they should abandon the idea since the wait queue is changing.
6713 6527 * We need to be holding pidlock & p_lock to do this.
6714 6528 */
6715 6529 if ((t = pp->p_tlist) != NULL) {
6716 6530 do {
6717 6531 thread_lock(t);
6718 6532 /*
6719 6533  * Kick this thread so that it doesn't sit
6720 6534  * on the wrong wait queue.
6721 6535 */
6722 6536 if (ISWAITING(t))
6723 6537 setrun_locked(t);
6724 6538
6725 6539 if (t->t_schedflag & TS_ANYWAITQ)
6726 6540 t->t_schedflag &= ~ TS_ANYWAITQ;
6727 6541
6728 6542 thread_unlock(t);
6729 6543 } while ((t = t->t_forw) != pp->p_tlist);
6730 6544 }
6731 6545
6732 6546 /*
6733 6547 * If there is a default scheduling class for the zone and it is not
6734 6548 * the class we are currently in, change all of the threads in the
6735 6549 * process to the new class. We need to be holding pidlock & p_lock
6736 6550 * when we call parmsset so this is a good place to do it.
6737 6551 */
6738 6552 if (zone->zone_defaultcid > 0 &&
6739 6553 zone->zone_defaultcid != curthread->t_cid) {
6740 6554 pcparms_t pcparms;
6741 6555
6742 6556 pcparms.pc_cid = zone->zone_defaultcid;
6743 6557 pcparms.pc_clparms[0] = 0;
6744 6558
6745 6559 /*
6746 6560 * If setting the class fails, we still want to enter the zone.
6747 6561 */
6748 6562 if ((t = pp->p_tlist) != NULL) {
6749 6563 do {
6750 6564 (void) parmsset(&pcparms, t);
6751 6565 } while ((t = t->t_forw) != pp->p_tlist);
6752 6566 }
6753 6567 }
6754 6568
6755 6569 mutex_exit(&pp->p_lock);
6756 6570 mutex_exit(&pidlock);
6757 6571
6758 6572 mutex_exit(&zonehash_lock);
6759 6573 /*
6760 6574 * We're firmly in the zone; let pools progress.
6761 6575 */
6762 6576 pool_unlock();
6763 6577 task_rele(oldtk);
6764 6578 /*
6765 6579 * We don't need to retain a hold on the zone since we already
6766 6580 * incremented zone_ntasks, so the zone isn't going anywhere.
6767 6581 */
6768 6582 zone_rele(zone);
6769 6583
6770 6584 /*
6771 6585 * Chroot
6772 6586 */
6773 6587 vp = zone->zone_rootvp;
6774 6588 zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6775 6589 zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6776 6590
6777 6591 /*
6778 6592 * Change process credentials
6779 6593 */
6780 6594 newcr = cralloc();
6781 6595 mutex_enter(&pp->p_crlock);
6782 6596 cr = pp->p_cred;
6783 6597 crcopy_to(cr, newcr);
6784 6598 crsetzone(newcr, zone);
6785 6599 pp->p_cred = newcr;
6786 6600
6787 6601 /*
6788 6602 * Restrict all process privilege sets to zone limit
6789 6603 */
6790 6604 priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6791 6605 priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6792 6606 priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6793 6607 priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6794 6608 mutex_exit(&pp->p_crlock);
6795 6609 crset(pp, newcr);
6796 6610
6797 6611 /*
6798 6612 * Adjust upcount to reflect zone entry.
6799 6613 */
6800 6614 uid = crgetruid(newcr);
6801 6615 mutex_enter(&pidlock);
6802 6616 upcount_dec(uid, GLOBAL_ZONEID);
6803 6617 upcount_inc(uid, zoneid);
6804 6618 mutex_exit(&pidlock);
6805 6619
6806 6620 /*
6807 6621 * Set up core file path and content.
6808 6622 */
6809 6623 set_core_defaults();
6810 6624
6811 6625 out:
6812 6626 /*
6813 6627 * Let the other lwps continue.
6814 6628 */
6815 6629 mutex_enter(&pp->p_lock);
6816 6630 if (curthread != pp->p_agenttp)
6817 6631 continuelwps(pp);
6818 6632 mutex_exit(&pp->p_lock);
6819 6633
6820 6634 return (err != 0 ? set_errno(err) : 0);
6821 6635 }
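
For orientation, a hedged sketch of how a zlogin-like tool drives this entry point from the global zone. It deliberately glosses over the contract requirement enforced above: the child must be the sole member of its process contract, so a real caller first activates a fresh contract template via libcontract; that step is only noted in a comment here.

    #include <zone.h>
    #include <unistd.h>
    #include <stdio.h>

    /*
     * Hedged sketch of a zlogin-like entry. A real implementation must
     * activate a new process-contract template before fork() so the
     * child is the sole member of its contract; without that,
     * zone_enter() fails with EINVAL per the checks above.
     */
    void
    enter_and_run(zoneid_t zid)
    {
            pid_t pid = fork();

            if (pid == 0) {
                    if (zone_enter(zid) != 0) {
                            perror("zone_enter");
                            _exit(1);
                    }
                    /* Now chroot'ed into the zone with restricted privileges. */
                    (void) execl("/sbin/sh", "sh", (char *)NULL);
                    _exit(1);
            }
    }
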
6822 6636
6823 6637 /*
6824 6638  * System call entry point for zone_list(2).
6825 6639 *
6826 6640 * Processes running in a (non-global) zone only see themselves.
6827 6641 * On labeled systems, they see all zones whose label they dominate.
6828 6642 */
6829 6643 static int
6830 6644 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6831 6645 {
6832 6646 zoneid_t *zoneids;
6833 6647 zone_t *zone, *myzone;
6834 6648 uint_t user_nzones, real_nzones;
6835 6649 uint_t domi_nzones;
6836 6650 int error;
6837 6651
6838 6652 if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6839 6653 return (set_errno(EFAULT));
6840 6654
6841 6655 myzone = curproc->p_zone;
6842 6656 if (myzone != global_zone) {
6843 6657 bslabel_t *mybslab;
6844 6658
6845 6659 if (!is_system_labeled()) {
6846 6660 /* just return current zone */
6847 6661 real_nzones = domi_nzones = 1;
6848 6662 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6849 6663 zoneids[0] = myzone->zone_id;
6850 6664 } else {
6851 6665 /* return all zones that are dominated */
6852 6666 mutex_enter(&zonehash_lock);
6853 6667 real_nzones = zonecount;
6854 6668 domi_nzones = 0;
6855 6669 if (real_nzones > 0) {
6856 6670 zoneids = kmem_alloc(real_nzones *
6857 6671 sizeof (zoneid_t), KM_SLEEP);
6858 6672 mybslab = label2bslabel(myzone->zone_slabel);
6859 6673 for (zone = list_head(&zone_active);
6860 6674 zone != NULL;
6861 6675 zone = list_next(&zone_active, zone)) {
6862 6676 if (zone->zone_id == GLOBAL_ZONEID)
6863 6677 continue;
6864 6678 if (zone != myzone &&
6865 6679 (zone->zone_flags & ZF_IS_SCRATCH))
6866 6680 continue;
6867 6681 /*
6868 6682 * Note that a label always dominates
6869 6683 * itself, so myzone is always included
6870 6684 * in the list.
6871 6685 */
6872 6686 if (bldominates(mybslab,
6873 6687 label2bslabel(zone->zone_slabel))) {
6874 6688 zoneids[domi_nzones++] =
6875 6689 zone->zone_id;
6876 6690 }
6877 6691 }
6878 6692 }
6879 6693 mutex_exit(&zonehash_lock);
6880 6694 }
6881 6695 } else {
6882 6696 mutex_enter(&zonehash_lock);
6883 6697 real_nzones = zonecount;
6884 6698 domi_nzones = 0;
6885 6699 if (real_nzones > 0) {
6886 6700 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6887 6701 KM_SLEEP);
6888 6702 for (zone = list_head(&zone_active); zone != NULL;
6889 6703 zone = list_next(&zone_active, zone))
6890 6704 zoneids[domi_nzones++] = zone->zone_id;
6891 6705 ASSERT(domi_nzones == real_nzones);
6892 6706 }
6893 6707 mutex_exit(&zonehash_lock);
6894 6708 }
6895 6709
6896 6710 /*
6897 6711  * If the user has allocated space for fewer entries than we found,
6898 6712  * then return only up to that limit. Either way, tell the caller
6899 6713  * exactly how many we found.
6900 6714 */
6901 6715 if (domi_nzones < user_nzones)
6902 6716 user_nzones = domi_nzones;
6903 6717 error = 0;
6904 6718 if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6905 6719 error = EFAULT;
6906 6720 } else if (zoneidlist != NULL && user_nzones != 0) {
6907 6721 if (copyout(zoneids, zoneidlist,
6908 6722 user_nzones * sizeof (zoneid_t)) != 0)
6909 6723 error = EFAULT;
6910 6724 }
6911 6725
6912 6726 if (real_nzones > 0)
6913 6727 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6914 6728
6915 6729 if (error != 0)
6916 6730 return (set_errno(error));
6917 6731 else
6918 6732 return (0);
6919 6733 }
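
The truncation contract above (never overflow the caller's buffer, always report the true count) gives the usual two-call pattern in userland. A hedged sketch, assuming the private libc wrapper zone_list() from <zone.h>:

    #include <zone.h>
    #include <stdlib.h>

    /*
     * Hedged sketch: size the buffer with one call, fetch with a second.
     * Zones can be created between the calls, so the valid entries are
     * MIN(capacity, reported count); a robust caller loops until stable.
     */
    zoneid_t *
    list_zones(uint_t *countp)
    {
            uint_t capacity = 0, n;
            zoneid_t *ids;

            if (zone_list(NULL, &capacity) != 0 || capacity == 0)
                    return (NULL);
            if ((ids = malloc(capacity * sizeof (zoneid_t))) == NULL)
                    return (NULL);
            n = capacity;
            if (zone_list(ids, &n) != 0) {
                    free(ids);
                    return (NULL);
            }
            *countp = (n < capacity) ? n : capacity;
            return (ids);
    }
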
6920 6734
6921 6735 /*
6922 6736  * System call entry point for zone_lookup(2).
6923 6737 *
6924 6738 * Non-global zones are only able to see themselves and (on labeled systems)
6925 6739 * the zones they dominate.
6926 6740 */
6927 6741 static zoneid_t
6928 6742 zone_lookup(const char *zone_name)
6929 6743 {
6930 6744 char *kname;
6931 6745 zone_t *zone;
6932 6746 zoneid_t zoneid;
6933 6747 int err;
6934 6748
6935 6749 if (zone_name == NULL) {
6936 6750 /* return caller's zone id */
6937 6751 return (getzoneid());
6938 6752 }
6939 6753
6940 6754 kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6941 6755 if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6942 6756 kmem_free(kname, ZONENAME_MAX);
6943 6757 return (set_errno(err));
6944 6758 }
6945 6759
6946 6760 mutex_enter(&zonehash_lock);
6947 6761 zone = zone_find_all_by_name(kname);
6948 6762 kmem_free(kname, ZONENAME_MAX);
6949 6763 /*
6950 6764  * In a non-global zone, we can only look up the global zone's name and our own.
6951 6765  * In Trusted Extensions, zone label dominance rules apply.
6952 6766 */
6953 6767 if (zone == NULL ||
6954 6768 zone_status_get(zone) < ZONE_IS_READY ||
6955 6769 !zone_list_access(zone)) {
6956 6770 mutex_exit(&zonehash_lock);
6957 6771 return (set_errno(EINVAL));
6958 6772 } else {
6959 6773 zoneid = zone->zone_id;
6960 6774 mutex_exit(&zonehash_lock);
6961 6775 return (zoneid);
6962 6776 }
6963 6777 }
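
Userland normally reaches this through getzoneidbyname(3C), whose NULL-name case matches the short-circuit at the top of the function. A minimal sketch, with a hypothetical zone name:

    #include <zone.h>
    #include <stdio.h>

    /* Hedged sketch: "web01" is a hypothetical zone name. */
    void
    show_zoneid(void)
    {
            zoneid_t zid = getzoneidbyname("web01");

            if (zid == -1)
                    perror("getzoneidbyname"); /* fails if not visible or ready */
            else
                    (void) printf("web01 is zoneid %d\n", (int)zid);
    }
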
6964 6778
6965 6779 static int
6966 6780 zone_version(int *version_arg)
6967 6781 {
6968 6782 int version = ZONE_SYSCALL_API_VERSION;
6969 6783
6970 6784 if (copyout(&version, version_arg, sizeof (int)) != 0)
6971 6785 return (set_errno(EFAULT));
6972 6786 return (0);
6973 6787 }
6974 6788
6975 6789 /* ARGSUSED */
6976 6790 long
6977 6791 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6978 6792 {
6979 6793 zone_def zs;
6980 6794 int err;
6981 6795
6982 6796 switch (cmd) {
6983 6797 case ZONE_CREATE:
6984 6798 if (get_udatamodel() == DATAMODEL_NATIVE) {
6985 6799 if (copyin(arg1, &zs, sizeof (zone_def))) {
6986 6800 return (set_errno(EFAULT));
6987 6801 }
6988 6802 } else {
6989 6803 #ifdef _SYSCALL32_IMPL
6990 6804 zone_def32 zs32;
6991 6805
6992 6806 if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6993 6807 return (set_errno(EFAULT));
6994 6808 }
6995 6809 zs.zone_name =
6996 6810 (const char *)(unsigned long)zs32.zone_name;
6997 6811 zs.zone_root =
6998 6812 (const char *)(unsigned long)zs32.zone_root;
6999 6813 zs.zone_privs =
7000 6814 (const struct priv_set *)
7001 6815 (unsigned long)zs32.zone_privs;
7002 6816 zs.zone_privssz = zs32.zone_privssz;
7003 6817 zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
7004 6818 zs.rctlbufsz = zs32.rctlbufsz;
7005 6819 zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
7006 6820 zs.zfsbufsz = zs32.zfsbufsz;
7007 6821 zs.extended_error =
7008 6822 (int *)(unsigned long)zs32.extended_error;
7009 6823 zs.match = zs32.match;
7010 6824 zs.doi = zs32.doi;
7011 6825 zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
7012 6826 zs.flags = zs32.flags;
7013 - zs.zoneid = zs32.zoneid;
7014 6827 #else
7015 6828 panic("get_udatamodel() returned bogus result\n");
7016 6829 #endif
7017 6830 }
7018 6831
7019 6832 return (zone_create(zs.zone_name, zs.zone_root,
7020 6833 zs.zone_privs, zs.zone_privssz,
7021 6834 (caddr_t)zs.rctlbuf, zs.rctlbufsz,
7022 6835 (caddr_t)zs.zfsbuf, zs.zfsbufsz,
7023 6836 zs.extended_error, zs.match, zs.doi,
7024 - zs.label, zs.flags, zs.zoneid));
6837 + zs.label, zs.flags));
7025 6838 case ZONE_BOOT:
7026 6839 return (zone_boot((zoneid_t)(uintptr_t)arg1));
7027 6840 case ZONE_DESTROY:
7028 6841 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
7029 6842 case ZONE_GETATTR:
7030 6843 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
7031 6844 (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7032 6845 case ZONE_SETATTR:
7033 6846 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
7034 6847 (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7035 6848 case ZONE_ENTER:
7036 6849 return (zone_enter((zoneid_t)(uintptr_t)arg1));
7037 6850 case ZONE_LIST:
7038 6851 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
7039 6852 case ZONE_SHUTDOWN:
7040 6853 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
7041 6854 case ZONE_LOOKUP:
7042 6855 return (zone_lookup((const char *)arg1));
7043 6856 case ZONE_VERSION:
7044 6857 return (zone_version((int *)arg1));
7045 6858 case ZONE_ADD_DATALINK:
7046 6859 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
7047 6860 (datalink_id_t)(uintptr_t)arg2));
7048 6861 case ZONE_DEL_DATALINK:
7049 6862 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
7050 6863 (datalink_id_t)(uintptr_t)arg2));
7051 6864 case ZONE_CHECK_DATALINK: {
7052 6865 zoneid_t zoneid;
7053 6866 boolean_t need_copyout;
7054 6867
7055 6868 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
7056 6869 return (EFAULT);
7057 6870 need_copyout = (zoneid == ALL_ZONES);
7058 6871 err = zone_check_datalink(&zoneid,
7059 6872 (datalink_id_t)(uintptr_t)arg2);
7060 6873 if (err == 0 && need_copyout) {
7061 6874 if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
7062 6875 err = EFAULT;
7063 6876 }
7064 6877 return (err == 0 ? 0 : set_errno(err));
7065 6878 }
7066 6879 case ZONE_LIST_DATALINK:
7067 6880 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
7068 6881 (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
7069 6882 default:
7070 6883 return (set_errno(EINVAL));
7071 6884 }
7072 6885 }
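
All of the userland wrappers funnel through this one multiplexed system call. A hedged sketch of roughly what a wrapper such as zone_boot() reduces to; the real libc code differs in detail:

    #include <sys/types.h>
    #include <sys/syscall.h>
    #include <sys/zone.h>
    #include <unistd.h>

    /* Hedged sketch: cmd selects the corresponding case in zone() above. */
    static int
    my_zone_boot(zoneid_t zid)
    {
            return (syscall(SYS_zone, ZONE_BOOT, (uintptr_t)zid, 0, 0, 0));
    }
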
7073 6886
7074 6887 struct zarg {
7075 6888 zone_t *zone;
7076 6889 zone_cmd_arg_t arg;
7077 6890 };
7078 6891
7079 6892 static int
7080 6893 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
7081 6894 {
7082 6895 char *buf;
7083 6896 size_t buflen;
7084 6897 int error;
7085 6898
7086 6899 buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
7087 6900 buf = kmem_alloc(buflen, KM_SLEEP);
7088 6901 (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
7089 6902 error = door_ki_open(buf, doorp);
7090 6903 kmem_free(buf, buflen);
7091 6904 return (error);
7092 6905 }
7093 6906
7094 6907 static void
7095 6908 zone_release_door(door_handle_t *doorp)
7096 6909 {
7097 6910 door_ki_rele(*doorp);
7098 6911 *doorp = NULL;
7099 6912 }
7100 6913
7101 6914 static void
7102 6915 zone_ki_call_zoneadmd(struct zarg *zargp)
7103 6916 {
7104 6917 door_handle_t door = NULL;
7105 6918 door_arg_t darg, save_arg;
7106 6919 char *zone_name;
7107 6920 size_t zone_namelen;
7108 6921 zoneid_t zoneid;
7109 6922 zone_t *zone;
7110 6923 zone_cmd_arg_t arg;
7111 6924 uint64_t uniqid;
7112 6925 size_t size;
7113 6926 int error;
7114 6927 int retry;
7115 6928
7116 6929 zone = zargp->zone;
7117 6930 arg = zargp->arg;
7118 6931 kmem_free(zargp, sizeof (*zargp));
7119 6932
7120 6933 zone_namelen = strlen(zone->zone_name) + 1;
7121 6934 zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
7122 6935 bcopy(zone->zone_name, zone_name, zone_namelen);
7123 6936 zoneid = zone->zone_id;
7124 6937 uniqid = zone->zone_uniqid;
7125 6938 arg.status = zone->zone_init_status;
7126 6939 /*
7127 6940 * zoneadmd may be down, but at least we can empty out the zone.
7128 6941 * We can ignore the return value of zone_empty() since we're called
7129 6942 * from a kernel thread and know we won't be delivered any signals.
7130 6943 */
7131 6944 ASSERT(curproc == &p0);
7132 6945 (void) zone_empty(zone);
7133 6946 ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
7134 6947 zone_rele(zone);
7135 6948
7136 6949 size = sizeof (arg);
7137 6950 darg.rbuf = (char *)&arg;
7138 6951 darg.data_ptr = (char *)&arg;
7139 6952 darg.rsize = size;
7140 6953 darg.data_size = size;
7141 6954 darg.desc_ptr = NULL;
7142 6955 darg.desc_num = 0;
7143 6956
7144 6957 save_arg = darg;
7145 6958 /*
7146 6959 * Since we're not holding a reference to the zone, any number of
7147 6960 * things can go wrong, including the zone disappearing before we get a
7148 6961 * chance to talk to zoneadmd.
7149 6962 */
7150 6963 for (retry = 0; /* forever */; retry++) {
7151 6964 if (door == NULL &&
7152 6965 (error = zone_lookup_door(zone_name, &door)) != 0) {
7153 6966 goto next;
7154 6967 }
7155 6968 ASSERT(door != NULL);
7156 6969
7157 6970 if ((error = door_ki_upcall_limited(door, &darg, NULL,
7158 6971 SIZE_MAX, 0)) == 0) {
7159 6972 break;
7160 6973 }
7161 6974 switch (error) {
7162 6975 case EINTR:
7163 6976 /* FALLTHROUGH */
7164 6977 case EAGAIN: /* process may be forking */
7165 6978 /*
7166 6979 * Back off for a bit
7167 6980 */
7168 6981 break;
7169 6982 case EBADF:
7170 6983 zone_release_door(&door);
7171 6984 if (zone_lookup_door(zone_name, &door) != 0) {
7172 6985 /*
7173 6986 * zoneadmd may be dead, but it may come back to
7174 6987 * life later.
7175 6988 */
7176 6989 break;
7177 6990 }
7178 6991 break;
7179 6992 default:
7180 6993 cmn_err(CE_WARN,
7181 6994 "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
7182 6995 error);
7183 6996 goto out;
7184 6997 }
7185 6998 next:
7186 6999 /*
7187 7000 * If this isn't the same zone_t that we originally had in mind,
7188 7001 * then this is the same as if two kadmin requests come in at
7189 7002 * the same time: the first one wins. This means we lose, so we
7190 7003 * bail.
7191 7004 */
7192 7005 if ((zone = zone_find_by_id(zoneid)) == NULL) {
7193 7006 /*
7194 7007 * Problem is solved.
7195 7008 */
7196 7009 break;
7197 7010 }
7198 7011 if (zone->zone_uniqid != uniqid) {
7199 7012 /*
7200 7013 * zoneid recycled
7201 7014 */
7202 7015 zone_rele(zone);
7203 7016 break;
7204 7017 }
7205 7018 /*
7206 7019 * We could zone_status_timedwait(), but there doesn't seem to
7207 7020 * be much point in doing that (plus, it would mean that
7208 7021 * zone_free() isn't called until this thread exits).
7209 7022 */
7210 7023 zone_rele(zone);
7211 7024 delay(hz);
7212 7025 darg = save_arg;
7213 7026 }
7214 7027 out:
7215 7028 if (door != NULL) {
7216 7029 zone_release_door(&door);
7217 7030 }
7218 7031 kmem_free(zone_name, zone_namelen);
7219 7032 thread_exit();
7220 7033 }
7221 7034
7222 7035 /*
7223 7036 * Entry point for uadmin() to tell the zone to go away or reboot. Analog to
7224 7037 * kadmin(). The caller is a process in the zone.
7225 7038 *
7226 7039  * In order to shut down the zone, we will hand off control to zoneadmd
7227 7040  * (running in the global zone) via a door. We do a half-hearted job of
7228 7041 * killing all processes in the zone, create a kernel thread to contact
7229 7042 * zoneadmd, and make note of the "uniqid" of the zone. The uniqid is
7230 7043 * a form of generation number used to let zoneadmd (as well as
7231 7044  * zone_destroy()) know exactly which zone they're talking about.
7232 7045 */
7233 7046 int
7234 7047 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
7235 7048 {
7236 7049 struct zarg *zargp;
7237 7050 zone_cmd_t zcmd;
7238 7051 zone_t *zone;
7239 7052
7240 7053 zone = curproc->p_zone;
7241 7054 ASSERT(getzoneid() != GLOBAL_ZONEID);
7242 7055
7243 7056 switch (cmd) {
7244 7057 case A_SHUTDOWN:
7245 7058 switch (fcn) {
7246 7059 case AD_HALT:
7247 7060 case AD_POWEROFF:
7248 7061 zcmd = Z_HALT;
7249 7062 break;
7250 7063 case AD_BOOT:
7251 7064 zcmd = Z_REBOOT;
7252 7065 break;
7253 7066 case AD_IBOOT:
7254 7067 case AD_SBOOT:
7255 7068 case AD_SIBOOT:
7256 7069 case AD_NOSYNC:
7257 7070 return (ENOTSUP);
7258 7071 default:
7259 7072 return (EINVAL);
7260 7073 }
7261 7074 break;
7262 7075 case A_REBOOT:
7263 7076 zcmd = Z_REBOOT;
7264 7077 break;
7265 7078 case A_FTRACE:
7266 7079 case A_REMOUNT:
7267 7080 case A_FREEZE:
7268 7081 case A_DUMP:
7269 7082 case A_CONFIG:
7270 7083 return (ENOTSUP);
7271 7084 default:
7272 7085 ASSERT(cmd != A_SWAPCTL); /* handled by uadmin() */
7273 7086 return (EINVAL);
7274 7087 }
7275 7088
7276 7089 if (secpolicy_zone_admin(credp, B_FALSE))
7277 7090 return (EPERM);
7278 7091 mutex_enter(&zone_status_lock);
7279 7092
7280 7093 /*
7281 7094 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
7282 7095 * is in the zone.
7283 7096 */
7284 7097 ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
7285 7098 if (zone_status_get(zone) > ZONE_IS_RUNNING) {
7286 7099 /*
7287 7100 * This zone is already on its way down.
7288 7101 */
7289 7102 mutex_exit(&zone_status_lock);
7290 7103 return (0);
7291 7104 }
7292 7105 /*
7293 7106 * Prevent future zone_enter()s
7294 7107 */
7295 7108 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
7296 7109 mutex_exit(&zone_status_lock);
7297 7110
7298 7111 /*
7299 7112 * Kill everyone now and call zoneadmd later.
7300 7113 * zone_ki_call_zoneadmd() will do a more thorough job of this
7301 7114 * later.
7302 7115 */
7303 - killall(zone->zone_id, B_FALSE);
7116 + killall(zone->zone_id);
7304 7117 /*
7305 7118 * Now, create the thread to contact zoneadmd and do the rest of the
7306 7119 * work. This thread can't be created in our zone otherwise
7307 7120 * zone_destroy() would deadlock.
7308 7121 */
7309 7122 zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
7310 7123 zargp->arg.cmd = zcmd;
7311 7124 zargp->arg.uniqid = zone->zone_uniqid;
7312 7125 zargp->zone = zone;
7313 7126 (void) strcpy(zargp->arg.locale, "C");
7314 7127 /* mdep was already copied in for us by uadmin */
7315 7128 if (mdep != NULL)
7316 7129 (void) strlcpy(zargp->arg.bootbuf, mdep,
7317 7130 sizeof (zargp->arg.bootbuf));
7318 7131 zone_hold(zone);
7319 7132
7320 7133 (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
7321 7134 TS_RUN, minclsyspri);
7322 7135 exit(CLD_EXITED, 0);
7323 7136
7324 7137 return (EINVAL);
7325 7138 }
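
From inside a non-global zone this path is reached via uadmin(2). On success the call never returns, because exit(CLD_EXITED, 0) above terminates the calling process. A hedged sketch:

    #include <sys/uadmin.h>
    #include <stdio.h>

    /* Hedged sketch: halt the calling (non-global) zone. */
    void
    halt_my_zone(void)
    {
            if (uadmin(A_SHUTDOWN, AD_HALT, 0) < 0)
                    perror("uadmin");  /* EPERM without zone admin rights */
            /* Not reached on success; the process exits in zone_kadmin(). */
    }
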
7326 7139
7327 7140 /*
7328 7141 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
7329 7142 * status to ZONE_IS_SHUTTING_DOWN.
7330 7143 *
7331 7144 * This function also shuts down all running zones to ensure that they won't
7332 7145 * fork new processes.
7333 7146 */
7334 7147 void
7335 7148 zone_shutdown_global(void)
7336 7149 {
7337 7150 zone_t *current_zonep;
7338 7151
7339 7152 ASSERT(INGLOBALZONE(curproc));
7340 7153 mutex_enter(&zonehash_lock);
7341 7154 mutex_enter(&zone_status_lock);
7342 7155
7343 7156 /* Modify the global zone's status first. */
7344 7157 ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
7345 7158 zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
7346 7159
7347 7160 /*
7348 7161 * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
7349 7162 * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
7350 7163 * could cause assertions to fail (e.g., assertions about a zone's
7351 7164 * state during initialization, readying, or booting) or produce races.
7352 7165 * We'll let threads continue to initialize and ready new zones: they'll
7353 7166 * fail to boot the new zones when they see that the global zone is
7354 7167 * shutting down.
7355 7168 */
7356 7169 for (current_zonep = list_head(&zone_active); current_zonep != NULL;
7357 7170 current_zonep = list_next(&zone_active, current_zonep)) {
7358 7171 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
7359 7172 zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
7360 7173 }
7361 7174 mutex_exit(&zone_status_lock);
7362 7175 mutex_exit(&zonehash_lock);
7363 7176 }
7364 7177
7365 7178 /*
7366 7179 * Returns true if the named dataset is visible in the specified zone.
7367 7180 * The 'write' parameter is set to 1 if the dataset is also writable.
7368 7181 */
7369 7182 int
7370 7183 zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
7371 7184 {
7372 7185 static int zfstype = -1;
7373 7186 zone_dataset_t *zd;
7374 7187 size_t len;
7375 7188 const char *name = NULL;
7376 7189 vfs_t *vfsp = NULL;
7377 7190
7378 7191 if (dataset[0] == '\0')
7379 7192 return (0);
7380 7193
7381 7194 /*
7382 7195 * Walk the list once, looking for datasets which match exactly, or
7383 7196 * specify a dataset underneath an exported dataset. If found, return
7384 7197 * true and note that it is writable.
7385 7198 */
7386 7199 for (zd = list_head(&zone->zone_datasets); zd != NULL;
7387 7200 zd = list_next(&zone->zone_datasets, zd)) {
7388 7201
7389 7202 len = strlen(zd->zd_dataset);
7390 7203 if (strlen(dataset) >= len &&
7391 7204 bcmp(dataset, zd->zd_dataset, len) == 0 &&
7392 7205 (dataset[len] == '\0' || dataset[len] == '/' ||
7393 7206 dataset[len] == '@')) {
7394 7207 if (write)
7395 7208 *write = 1;
7396 7209 return (1);
7397 7210 }
7398 7211 }
7399 7212
7400 7213 /*
7401 7214 * Walk the list a second time, searching for datasets which are parents
7402 7215 * of exported datasets. These should be visible, but read-only.
7403 7216 *
7404 7217 * Note that we also have to support forms such as 'pool/dataset/', with
7405 7218 * a trailing slash.
7406 7219 */
7407 7220 for (zd = list_head(&zone->zone_datasets); zd != NULL;
7408 7221 zd = list_next(&zone->zone_datasets, zd)) {
7409 7222
7410 7223 len = strlen(dataset);
7411 7224 if (dataset[len - 1] == '/')
7412 7225 len--; /* Ignore trailing slash */
7413 7226 if (len < strlen(zd->zd_dataset) &&
7414 7227 bcmp(dataset, zd->zd_dataset, len) == 0 &&
7415 7228 zd->zd_dataset[len] == '/') {
7416 7229 if (write)
7417 7230 *write = 0;
7418 7231 return (1);
7419 7232 }
7420 7233 }
7421 7234
7422 7235 /*
7423 7236 * We reach here if the given dataset is not found in the zone_dataset
7424 7237  * list. Check if this dataset was added as a filesystem (i.e., "add fs")
7425 7238  * instead of being delegated. For this we search for the dataset in the
7426 7239 * zone_vfslist of this zone. If found, return true and note that it is
7427 7240 * not writable.
7428 7241 */
7429 7242
7430 7243 /*
7431 7244 * Initialize zfstype if it is not initialized yet.
7432 7245 */
7433 7246 if (zfstype == -1) {
7434 7247 struct vfssw *vswp = vfs_getvfssw("zfs");
7435 7248 zfstype = vswp - vfssw;
7436 7249 vfs_unrefvfssw(vswp);
7437 7250 }
7438 7251
7439 7252 vfs_list_read_lock();
7440 7253 vfsp = zone->zone_vfslist;
7441 7254 do {
7442 7255 if (vfsp == NULL)
7443 7256 break;
7444 7257 if (vfsp->vfs_fstype == zfstype) {
7445 7258 name = refstr_value(vfsp->vfs_resource);
7446 7259
7447 7260 /*
7448 7261 * Check if we have an exact match.
7449 7262 */
7450 7263 if (strcmp(dataset, name) == 0) {
7451 7264 vfs_list_unlock();
7452 7265 if (write)
7453 7266 *write = 0;
7454 7267 return (1);
7455 7268 }
7456 7269 /*
7457 7270 * We need to check if we are looking for parents of
7458 7271 * a dataset. These should be visible, but read-only.
7459 7272 */
7460 7273 len = strlen(dataset);
7461 7274 if (dataset[len - 1] == '/')
7462 7275 len--;
7463 7276
7464 7277 if (len < strlen(name) &&
7465 7278 bcmp(dataset, name, len) == 0 && name[len] == '/') {
7466 7279 vfs_list_unlock();
7467 7280 if (write)
7468 7281 *write = 0;
7469 7282 return (1);
7470 7283 }
7471 7284 }
7472 7285 vfsp = vfsp->vfs_zone_next;
7473 7286 } while (vfsp != zone->zone_vfslist);
7474 7287
7475 7288 vfs_list_unlock();
7476 7289 return (0);
7477 7290 }
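
The first-pass matching rule (an exact name, or a child or snapshot beneath a delegated dataset, is writable; the second pass makes bare parents visible read-only) can be restated standalone. A hedged sketch, not the kernel code itself:

    #include <string.h>

    /*
     * Hedged sketch of the first-pass rule. With "tank/zone1" delegated:
     * "tank/zone1" and "tank/zone1/a@snap" match (visible, writable);
     * "tank" does not (the second pass makes it visible read-only).
     */
    static int
    is_exact_or_child(const char *dataset, const char *delegated)
    {
            size_t len = strlen(delegated);

            return (strncmp(dataset, delegated, len) == 0 &&
                (dataset[len] == '\0' || dataset[len] == '/' ||
                dataset[len] == '@'));
    }
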
7478 7291
7479 7292 /*
7480 7293 * Returns true if the named dataset is visible in the current zone.
7481 7294 * The 'write' parameter is set to 1 if the dataset is also writable.
7482 7295 */
7483 7296 int
7484 7297 zone_dataset_visible(const char *dataset, int *write)
7485 7298 {
7486 7299 zone_t *zone = curproc->p_zone;
7487 7300
7488 7301 return (zone_dataset_visible_inzone(zone, dataset, write));
7489 7302 }
7490 7303
7491 7304 /*
7492 7305 * zone_find_by_any_path() -
7493 7306 *
7494 7307 * kernel-private routine similar to zone_find_by_path(), but which
7495 7308 * effectively compares against zone paths rather than zonerootpath
7496 7309 * (i.e., the last component of zonerootpaths, which should be "root/",
7497 7310 * are not compared.) This is done in order to accurately identify all
7498 7311 * paths, whether zone-visible or not, including those which are parallel
7499 7312 * to /root/, such as /dev/, /home/, etc...
7500 7313 *
7501 7314 * If the specified path does not fall under any zone path then global
7502 7315 * zone is returned.
7503 7316 *
7504 7317 * The treat_abs parameter indicates whether the path should be treated as
7505 7318 * an absolute path although it does not begin with "/". (This supports
7506 7319 * nfs mount syntax such as host:any/path.)
7507 7320 *
7508 7321 * The caller is responsible for zone_rele of the returned zone.
7509 7322 */
7510 7323 zone_t *
7511 7324 zone_find_by_any_path(const char *path, boolean_t treat_abs)
7512 7325 {
7513 7326 zone_t *zone;
7514 7327 int path_offset = 0;
7515 7328
7516 7329 if (path == NULL) {
7517 7330 zone_hold(global_zone);
7518 7331 return (global_zone);
7519 7332 }
7520 7333
7521 7334 if (*path != '/') {
7522 7335 ASSERT(treat_abs);
7523 7336 path_offset = 1;
7524 7337 }
7525 7338
7526 7339 mutex_enter(&zonehash_lock);
7527 7340 for (zone = list_head(&zone_active); zone != NULL;
7528 7341 zone = list_next(&zone_active, zone)) {
7529 7342 char *c;
7530 7343 size_t pathlen;
7531 7344 char *rootpath_start;
7532 7345
7533 7346 if (zone == global_zone) /* skip global zone */
7534 7347 continue;
7535 7348
7536 7349 /* scan backwards to find start of last component */
7537 7350 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
7538 7351 do {
7539 7352 c--;
7540 7353 } while (*c != '/');
7541 7354
7542 7355 pathlen = c - zone->zone_rootpath + 1 - path_offset;
7543 7356 rootpath_start = (zone->zone_rootpath + path_offset);
7544 7357 if (strncmp(path, rootpath_start, pathlen) == 0)
7545 7358 break;
7546 7359 }
7547 7360 if (zone == NULL)
7548 7361 zone = global_zone;
7549 7362 zone_hold(zone);
7550 7363 mutex_exit(&zonehash_lock);
7551 7364 return (zone);
7552 7365 }
7553 7366
7554 7367 /*
7555 7368 * Finds a zone_dl_t with the given linkid in the given zone. Returns the
7556 7369 * zone_dl_t pointer if found, and NULL otherwise.
7557 7370 */
7558 7371 static zone_dl_t *
7559 7372 zone_find_dl(zone_t *zone, datalink_id_t linkid)
7560 7373 {
7561 7374 zone_dl_t *zdl;
7562 7375
7563 7376 ASSERT(mutex_owned(&zone->zone_lock));
7564 7377 for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7565 7378 zdl = list_next(&zone->zone_dl_list, zdl)) {
7566 7379 if (zdl->zdl_id == linkid)
7567 7380 break;
7568 7381 }
7569 7382 return (zdl);
7570 7383 }
7571 7384
7572 7385 static boolean_t
7573 7386 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7574 7387 {
7575 7388 boolean_t exists;
7576 7389
7577 7390 mutex_enter(&zone->zone_lock);
7578 7391 exists = (zone_find_dl(zone, linkid) != NULL);
7579 7392 mutex_exit(&zone->zone_lock);
7580 7393 return (exists);
7581 7394 }
7582 7395
7583 7396 /*
7584 7397  * Add a datalink name for the zone.
7585 7398 */
7586 7399 static int
7587 7400 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7588 7401 {
7589 7402 zone_dl_t *zdl;
7590 7403 zone_t *zone;
7591 7404 zone_t *thiszone;
7592 7405
7593 7406 if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7594 7407 return (set_errno(ENXIO));
7595 7408
7596 7409 /* Verify that the datalink ID doesn't already belong to a zone. */
7597 7410 mutex_enter(&zonehash_lock);
7598 7411 for (zone = list_head(&zone_active); zone != NULL;
7599 7412 zone = list_next(&zone_active, zone)) {
7600 7413 if (zone_dl_exists(zone, linkid)) {
7601 7414 mutex_exit(&zonehash_lock);
7602 7415 zone_rele(thiszone);
7603 7416 return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7604 7417 }
7605 7418 }
7606 7419
7607 7420 zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7608 7421 zdl->zdl_id = linkid;
7609 7422 zdl->zdl_net = NULL;
7610 7423 mutex_enter(&thiszone->zone_lock);
7611 7424 list_insert_head(&thiszone->zone_dl_list, zdl);
7612 7425 mutex_exit(&thiszone->zone_lock);
7613 7426 mutex_exit(&zonehash_lock);
7614 7427 zone_rele(thiszone);
7615 7428 return (0);
7616 7429 }
7617 7430
7618 7431 static int
7619 7432 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7620 7433 {
7621 7434 zone_dl_t *zdl;
7622 7435 zone_t *zone;
7623 7436 int err = 0;
7624 7437
7625 7438 if ((zone = zone_find_by_id(zoneid)) == NULL)
7626 7439 return (set_errno(EINVAL));
7627 7440
7628 7441 mutex_enter(&zone->zone_lock);
7629 7442 if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7630 7443 err = ENXIO;
7631 7444 } else {
7632 7445 list_remove(&zone->zone_dl_list, zdl);
7633 7446 nvlist_free(zdl->zdl_net);
7634 7447 kmem_free(zdl, sizeof (zone_dl_t));
7635 7448 }
7636 7449 mutex_exit(&zone->zone_lock);
7637 7450 zone_rele(zone);
7638 7451 return (err == 0 ? 0 : set_errno(err));
7639 7452 }
7640 7453
7641 7454 /*
7642 7455  * Using the zoneidp as ALL_ZONES, we can look up which zone has been assigned
7643 7456 * the linkid. Otherwise we just check if the specified zoneidp has been
7644 7457 * assigned the supplied linkid.
7645 7458 */
7646 7459 int
7647 7460 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7648 7461 {
7649 7462 zone_t *zone;
7650 7463 int err = ENXIO;
7651 7464
7652 7465 if (*zoneidp != ALL_ZONES) {
7653 7466 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7654 7467 if (zone_dl_exists(zone, linkid))
7655 7468 err = 0;
7656 7469 zone_rele(zone);
7657 7470 }
7658 7471 return (err);
7659 7472 }
7660 7473
7661 7474 mutex_enter(&zonehash_lock);
7662 7475 for (zone = list_head(&zone_active); zone != NULL;
7663 7476 zone = list_next(&zone_active, zone)) {
7664 7477 if (zone_dl_exists(zone, linkid)) {
7665 7478 *zoneidp = zone->zone_id;
7666 7479 err = 0;
7667 7480 break;
7668 7481 }
7669 7482 }
7670 7483 mutex_exit(&zonehash_lock);
7671 7484 return (err);
7672 7485 }
7673 7486
7674 7487 /*
7675 7488 * Get the list of datalink IDs assigned to a zone.
7676 7489 *
7677 7490 * On input, *nump is the number of datalink IDs that can fit in the supplied
7678 7491 * idarray. Upon return, *nump is either set to the number of datalink IDs
7679 7492 * that were placed in the array if the array was large enough, or to the
7680 7493 * number of datalink IDs that the function needs to place in the array if the
7681 7494 * array is too small.
7682 7495 */
7683 7496 static int
7684 7497 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7685 7498 {
7686 7499 uint_t num, dlcount;
7687 7500 zone_t *zone;
7688 7501 zone_dl_t *zdl;
7689 7502 datalink_id_t *idptr = idarray;
7690 7503
7691 7504 if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7692 7505 return (set_errno(EFAULT));
7693 7506 if ((zone = zone_find_by_id(zoneid)) == NULL)
7694 7507 return (set_errno(ENXIO));
7695 7508
7696 7509 num = 0;
7697 7510 mutex_enter(&zone->zone_lock);
7698 7511 for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7699 7512 zdl = list_next(&zone->zone_dl_list, zdl)) {
7700 7513 /*
7701 7514 * If the list is bigger than what the caller supplied, just
7702 7515 * count, don't do copyout.
7703 7516 */
7704 7517 if (++num > dlcount)
7705 7518 continue;
7706 7519 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7707 7520 mutex_exit(&zone->zone_lock);
7708 7521 zone_rele(zone);
7709 7522 return (set_errno(EFAULT));
7710 7523 }
7711 7524 idptr++;
7712 7525 }
7713 7526 mutex_exit(&zone->zone_lock);
7714 7527 zone_rele(zone);
7715 7528
7716 7529 	/* Whether it increased or decreased, the caller should be notified. */
7717 7530 if (num != dlcount) {
7718 7531 if (copyout(&num, nump, sizeof (num)) != 0)
7719 7532 return (set_errno(EFAULT));
7720 7533 }
7721 7534 return (0);
7722 7535 }
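
The in/out *nump contract mirrors zone_list(): a short buffer is never overflowed, and the count actually needed is reported back. A hedged sketch of the caller side, going through the raw multiplexed syscall since the libc wrapper is private; dl_list_ids() itself is a hypothetical helper:

    #include <sys/types.h>
    #include <sys/syscall.h>
    #include <sys/zone.h>
    #include <unistd.h>
    #include <stdlib.h>

    /*
     * Hedged sketch: count, then fetch. Links can be assigned between
     * the two calls, so a robust caller would loop while *nump grows.
     * The ids are 32-bit datalink_id_t values; uint32_t is used here
     * to keep the sketch self-contained.
     */
    static int
    dl_list_ids(zoneid_t zid, uint32_t **idsp, int *nump)
    {
            int n = 0;
            uint32_t *ids;

            /* A zero count with a NULL array just reports the count. */
            if (syscall(SYS_zone, ZONE_LIST_DATALINK, (uintptr_t)zid,
                &n, NULL) != 0)
                    return (-1);
            if (n == 0) {
                    *idsp = NULL;
                    *nump = 0;
                    return (0);
            }
            if ((ids = calloc(n, sizeof (uint32_t))) == NULL)
                    return (-1);
            if (syscall(SYS_zone, ZONE_LIST_DATALINK, (uintptr_t)zid,
                &n, ids) != 0) {
                    free(ids);
                    return (-1);
            }
            *idsp = ids;
            *nump = n;
            return (0);
    }
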
7723 7536
7724 7537 /*
7725 7538 * Public interface for looking up a zone by zoneid. It's a customized version
7726 7539 * for netstack_zone_create(). It can only be called from the zsd create
7727 7540  * callbacks, since it doesn't take a reference on the zone structure; if
7728 7541  * it were called elsewhere, the zone could disappear after the zonehash_lock
7729 7542 * is dropped.
7730 7543 *
7731 7544 * Furthermore it
7732 7545 * 1. Doesn't check the status of the zone.
7733 7546  * 2. It may be called even before zone_init is called; in that case the
7734 7547  *    address of zone0 is returned directly, and netstack_zone_create()
7735 7548  *    will only assign a value to zone0.zone_netstack, which won't break anything.
7736 7549 * 3. Returns without the zone being held.
7737 7550 */
7738 7551 zone_t *
7739 7552 zone_find_by_id_nolock(zoneid_t zoneid)
7740 7553 {
7741 7554 zone_t *zone;
7742 7555
7743 7556 mutex_enter(&zonehash_lock);
7744 7557 if (zonehashbyid == NULL)
7745 7558 zone = &zone0;
7746 7559 else
7747 7560 zone = zone_find_all_by_id(zoneid);
7748 7561 mutex_exit(&zonehash_lock);
7749 7562 return (zone);
7750 7563 }
7751 7564
7752 7565 /*
7753 7566 * Walk the datalinks for a given zone
7754 7567 */
7755 7568 int
7756 7569 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7757 7570 void *data)
7758 7571 {
7759 7572 zone_t *zone;
7760 7573 zone_dl_t *zdl;
7761 7574 datalink_id_t *idarray;
7762 7575 uint_t idcount = 0;
7763 7576 int i, ret = 0;
7764 7577
7765 7578 if ((zone = zone_find_by_id(zoneid)) == NULL)
7766 7579 return (ENOENT);
7767 7580
7768 7581 /*
7769 7582  * We first build an array of linkids so that we can walk these and
7770 7583 * execute the callback with the zone_lock dropped.
7771 7584 */
7772 7585 mutex_enter(&zone->zone_lock);
7773 7586 for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7774 7587 zdl = list_next(&zone->zone_dl_list, zdl)) {
7775 7588 idcount++;
7776 7589 }
7777 7590
7778 7591 if (idcount == 0) {
7779 7592 mutex_exit(&zone->zone_lock);
7780 7593 zone_rele(zone);
7781 7594 return (0);
7782 7595 }
7783 7596
7784 7597 idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7785 7598 if (idarray == NULL) {
7786 7599 mutex_exit(&zone->zone_lock);
7787 7600 zone_rele(zone);
7788 7601 return (ENOMEM);
7789 7602 }
7790 7603
7791 7604 for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7792 7605 i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7793 7606 idarray[i] = zdl->zdl_id;
7794 7607 }
7795 7608
7796 7609 mutex_exit(&zone->zone_lock);
7797 7610
7798 7611 for (i = 0; i < idcount && ret == 0; i++) {
7799 7612 if ((ret = (*cb)(idarray[i], data)) != 0)
7800 7613 break;
7801 7614 }
7802 7615
7803 7616 zone_rele(zone);
7804 7617 kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7805 7618 return (ret);
7806 7619 }
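
zone_datalink_walk() is kernel-internal; consumers supply a callback whose nonzero return aborts the walk and becomes the walk's return value. A hedged sketch of a conforming callback:

    /*
     * Hedged sketch of a zone_datalink_walk() callback that counts the
     * links assigned to a zone; kernel context is assumed.
     */
    static int
    count_link_cb(datalink_id_t linkid, void *arg)
    {
            uint_t *np = arg;

            (*np)++;
            return (0);     /* nonzero would stop the walk */
    }

    static int
    count_links(zoneid_t zoneid, uint_t *np)
    {
            *np = 0;
            return (zone_datalink_walk(zoneid, count_link_cb, np));
    }
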
7807 7620
7808 7621 static char *
7809 7622 zone_net_type2name(int type)
7810 7623 {
7811 7624 switch (type) {
7812 7625 case ZONE_NETWORK_ADDRESS:
7813 7626 return (ZONE_NET_ADDRNAME);
7814 7627 case ZONE_NETWORK_DEFROUTER:
7815 7628 return (ZONE_NET_RTRNAME);
7816 7629 default:
7817 7630 return (NULL);
7818 7631 }
7819 7632 }
7820 7633
7821 7634 static int
7822 7635 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7823 7636 {
7824 7637 zone_t *zone;
7825 7638 zone_dl_t *zdl;
7826 7639 nvlist_t *nvl;
7827 7640 int err = 0;
7828 7641 uint8_t *new = NULL;
7829 7642 char *nvname;
7830 7643 int bufsize;
7831 7644 datalink_id_t linkid = znbuf->zn_linkid;
7832 7645
7833 7646 if (secpolicy_zone_config(CRED()) != 0)
7834 7647 return (set_errno(EPERM));
7835 7648
7836 7649 if (zoneid == GLOBAL_ZONEID)
7837 7650 return (set_errno(EINVAL));
7838 7651
7839 7652 nvname = zone_net_type2name(znbuf->zn_type);
7840 7653 bufsize = znbuf->zn_len;
7841 7654 new = znbuf->zn_val;
7842 7655 if (nvname == NULL)
7843 7656 return (set_errno(EINVAL));
7844 7657
7845 7658 if ((zone = zone_find_by_id(zoneid)) == NULL) {
7846 7659 return (set_errno(EINVAL));
7847 7660 }
7848 7661
7849 7662 mutex_enter(&zone->zone_lock);
7850 7663 if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7851 7664 err = ENXIO;
7852 7665 goto done;
7853 7666 }
7854 7667 if ((nvl = zdl->zdl_net) == NULL) {
7855 7668 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7856 7669 err = ENOMEM;
7857 7670 goto done;
7858 7671 } else {
7859 7672 zdl->zdl_net = nvl;
7860 7673 }
7861 7674 }
7862 7675 if (nvlist_exists(nvl, nvname)) {
7863 7676 err = EINVAL;
7864 7677 goto done;
7865 7678 }
7866 7679 err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7867 7680 ASSERT(err == 0);
7868 7681 done:
7869 7682 mutex_exit(&zone->zone_lock);
7870 7683 zone_rele(zone);
7871 7684 if (err != 0)
7872 7685 return (set_errno(err));
7873 7686 else
7874 7687 return (0);
7875 7688 }
7876 7689
7877 7690 static int
7878 7691 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7879 7692 {
7880 7693 zone_t *zone;
7881 7694 zone_dl_t *zdl;
7882 7695 nvlist_t *nvl;
7883 7696 uint8_t *ptr;
7884 7697 uint_t psize;
7885 7698 int err = 0;
7886 7699 char *nvname;
7887 7700 int bufsize;
7888 7701 void *buf;
7889 7702 datalink_id_t linkid = znbuf->zn_linkid;
7890 7703
7891 7704 if (zoneid == GLOBAL_ZONEID)
7892 7705 return (set_errno(EINVAL));
7893 7706
7894 7707 nvname = zone_net_type2name(znbuf->zn_type);
7895 7708 bufsize = znbuf->zn_len;
7896 7709 buf = znbuf->zn_val;
7897 7710
7898 7711 if (nvname == NULL)
7899 7712 return (set_errno(EINVAL));
7900 7713 if ((zone = zone_find_by_id(zoneid)) == NULL)
7901 7714 return (set_errno(EINVAL));
7902 7715
7903 7716 mutex_enter(&zone->zone_lock);
7904 7717 if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7905 7718 err = ENXIO;
7906 7719 goto done;
7907 7720 }
7908 7721 if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7909 7722 err = ENOENT;
7910 7723 goto done;
7911 7724 }
7912 7725 err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7913 7726 ASSERT(err == 0);
7914 7727
7915 7728 if (psize > bufsize) {
7916 7729 err = ENOBUFS;
7917 7730 goto done;
7918 7731 }
7919 7732 znbuf->zn_len = psize;
7920 7733 bcopy(ptr, buf, psize);
7921 7734 done:
7922 7735 mutex_exit(&zone->zone_lock);
7923 7736 zone_rele(zone);
7924 7737 if (err != 0)
7925 7738 return (set_errno(err));
7926 7739 else
7927 7740 return (0);
7928 7741 }
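
Both functions are driven through zone_setattr()/zone_getattr() with the ZONE_ATTR_NETWORK attribute; see the copyin of a zone_net_data_t in zone_setattr() above. A hedged sketch of the setter side, assuming the zone_net_data_t layout from sys/zone.h with the payload carried inline after the fixed header; the link id and address are hypothetical:

    #include <sys/types.h>
    #include <sys/zone.h>
    #include <zone.h>
    #include <string.h>
    #include <stdlib.h>

    /*
     * Hedged sketch: record an allowed address for a datalink assigned
     * to an exclusive-IP zone. bufsize is the fixed header plus zn_len;
     * per zone_set_network() above, a name that is already set is
     * rejected with EINVAL.
     */
    static int
    set_zone_address(zoneid_t zid, datalink_id_t linkid, const char *addr)
    {
            size_t len = strlen(addr) + 1;
            size_t bufsize = sizeof (zone_net_data_t) + len;
            zone_net_data_t *znd = calloc(1, bufsize);
            int ret;

            if (znd == NULL)
                    return (-1);
            znd->zn_type = ZONE_NETWORK_ADDRESS;
            znd->zn_linkid = linkid;
            znd->zn_len = len;
            (void) memcpy(znd->zn_val, addr, len);
            ret = zone_setattr(zid, ZONE_ATTR_NETWORK, znd, bufsize);
            free(znd);
            return (ret);
    }
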