Revert "Revert "6602 lofi should support labeled devices""
This reverts commit 21386c8bd8477810b291eee22e08f1382e70cdf3.
Revert "6602 lofi should support labeled devices"
This reverts commit 406fc5100dac8d225a315a6def6be8d628f34e24.
Adding AoE support to nza-kernel
--- old/usr/src/uts/common/fs/vfs.c
+++ new/usr/src/uts/common/fs/vfs.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2016 Joyent, Inc.
25 25 * Copyright 2016 Toomas Soome <tsoome@me.com>
26 26 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
27 27 * Copyright 2016 Nexenta Systems, Inc.
28 28 * Copyright 2017 RackTop Systems.
29 29 */
30 30
31 31 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
32 32 /* All Rights Reserved */
33 33
34 34 /*
35 35 * University Copyright- Copyright (c) 1982, 1986, 1988
36 36 * The Regents of the University of California
37 37 * All Rights Reserved
38 38 *
39 39 * University Acknowledgment- Portions of this document are derived from
40 40 * software developed by the University of California, Berkeley, and its
41 41 * contributors.
42 42 */
43 43
44 44 #include <sys/types.h>
45 45 #include <sys/t_lock.h>
46 46 #include <sys/param.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/user.h>
49 49 #include <sys/fstyp.h>
50 50 #include <sys/kmem.h>
51 51 #include <sys/systm.h>
52 52 #include <sys/proc.h>
53 53 #include <sys/mount.h>
54 54 #include <sys/vfs.h>
55 55 #include <sys/vfs_opreg.h>
56 56 #include <sys/fem.h>
57 57 #include <sys/mntent.h>
58 58 #include <sys/stat.h>
59 59 #include <sys/statvfs.h>
60 60 #include <sys/statfs.h>
61 61 #include <sys/cred.h>
62 62 #include <sys/vnode.h>
63 63 #include <sys/rwstlock.h>
64 64 #include <sys/dnlc.h>
65 65 #include <sys/file.h>
66 66 #include <sys/time.h>
67 67 #include <sys/atomic.h>
68 68 #include <sys/cmn_err.h>
69 69 #include <sys/buf.h>
70 70 #include <sys/swap.h>
71 71 #include <sys/debug.h>
72 72 #include <sys/vnode.h>
73 73 #include <sys/modctl.h>
74 74 #include <sys/ddi.h>
75 75 #include <sys/pathname.h>
76 76 #include <sys/bootconf.h>
77 77 #include <sys/dumphdr.h>
78 78 #include <sys/dc_ki.h>
79 79 #include <sys/poll.h>
80 80 #include <sys/sunddi.h>
81 81 #include <sys/sysmacros.h>
82 82 #include <sys/zone.h>
83 83 #include <sys/policy.h>
84 84 #include <sys/ctfs.h>
85 85 #include <sys/objfs.h>
86 86 #include <sys/console.h>
87 87 #include <sys/reboot.h>
88 88 #include <sys/attr.h>
89 89 #include <sys/zio.h>
90 90 #include <sys/spa.h>
91 91 #include <sys/lofi.h>
92 92 #include <sys/bootprops.h>
93 93
94 94 #include <vm/page.h>
95 95
96 96 #include <fs/fs_subr.h>
97 97 /* Private interfaces to create vopstats-related data structures */
98 98 extern void initialize_vopstats(vopstats_t *);
99 99 extern vopstats_t *get_fstype_vopstats(struct vfs *, struct vfssw *);
100 100 extern vsk_anchor_t *get_vskstat_anchor(struct vfs *);
101 101
102 102 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
103 103 static void vfs_setmntopt_nolock(mntopts_t *, const char *,
104 104 const char *, int, int);
105 105 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
106 106 static void vfs_freemnttab(struct vfs *);
107 107 static void vfs_freeopt(mntopt_t *);
108 108 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
109 109 static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
110 110 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
111 111 static void vfs_createopttbl_extend(mntopts_t *, const char *,
112 112 const mntopts_t *);
113 113 static char **vfs_copycancelopt_extend(char **const, int);
114 114 static void vfs_freecancelopt(char **);
115 115 static void getrootfs(char **, char **);
116 116 static int getmacpath(dev_info_t *, void *);
117 117 static void vfs_mnttabvp_setup(void);
118 118
119 119 struct ipmnt {
120 120 struct ipmnt *mip_next;
121 121 dev_t mip_dev;
122 122 struct vfs *mip_vfsp;
123 123 };
124 124
125 125 static kmutex_t vfs_miplist_mutex;
126 126 static struct ipmnt *vfs_miplist = NULL;
127 127 static struct ipmnt *vfs_miplist_end = NULL;
128 128
129 129 static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */
130 130
131 131 /*
132 132 * VFS global data.
133 133 */
134 134 vnode_t *rootdir; /* pointer to root inode vnode. */
135 135 vnode_t *devicesdir; /* pointer to inode of devices root */
136 136 vnode_t *devdir; /* pointer to inode of dev root */
137 137
138 138 char *server_rootpath; /* root path for diskless clients */
139 139 char *server_hostname; /* hostname of diskless server */
140 140
141 141 static struct vfs root;
142 142 static struct vfs devices;
143 143 static struct vfs dev;
144 144 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */
145 145 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */
146 146 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */
147 147 /* must be power of 2! */
148 148 timespec_t vfs_mnttab_ctime; /* mnttab created time */
149 149 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */
150 150 char *vfs_dummyfstype = "\0";
151 151 struct pollhead vfs_pollhd; /* for mnttab pollers */
152 152 struct vnode *vfs_mntdummyvp; /* to fake mnttab read/write for file events */
153 153 int mntfstype; /* will be set once mnt fs is mounted */
154 154
155 155 /*
156 156 * Table for generic options recognized in the VFS layer and acted
157 157 * on at this level before parsing file system specific options.
158 158 * The nosuid option is stronger than any of the devices and setuid
159 159 * options, so those are canceled when nosuid is seen.
160 160 *
161 161 * All options which are added here need to be added to the
162 162 * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
163 163 */
164 164 /*
165 165 * VFS Mount options table
166 166 */
167 167 static char *ro_cancel[] = { MNTOPT_RW, NULL };
168 168 static char *rw_cancel[] = { MNTOPT_RO, NULL };
169 169 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
170 170 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
171 171 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
172 172 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
173 173 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
174 174 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
175 175 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
176 176 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
177 177 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
178 178 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
179 179 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
180 180
181 181 static const mntopt_t mntopts[] = {
182 182 /*
183 183 * option name cancel options default arg flags
184 184 */
185 185 { MNTOPT_REMOUNT, NULL, NULL,
186 186 MO_NODISPLAY, (void *)0 },
187 187 { MNTOPT_RO, ro_cancel, NULL, 0,
188 188 (void *)0 },
189 189 { MNTOPT_RW, rw_cancel, NULL, 0,
190 190 (void *)0 },
191 191 { MNTOPT_SUID, suid_cancel, NULL, 0,
192 192 (void *)0 },
193 193 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0,
194 194 (void *)0 },
195 195 { MNTOPT_DEVICES, devices_cancel, NULL, 0,
196 196 (void *)0 },
197 197 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0,
198 198 (void *)0 },
199 199 { MNTOPT_SETUID, setuid_cancel, NULL, 0,
200 200 (void *)0 },
201 201 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0,
202 202 (void *)0 },
203 203 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0,
204 204 (void *)0 },
205 205 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0,
206 206 (void *)0 },
207 207 { MNTOPT_EXEC, exec_cancel, NULL, 0,
208 208 (void *)0 },
209 209 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0,
210 210 (void *)0 },
211 211 };
212 212
213 213 const mntopts_t vfs_mntopts = {
214 214 sizeof (mntopts) / sizeof (mntopt_t),
215 215 (mntopt_t *)&mntopts[0]
216 216 };
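/*
 * Illustrative sketch, not part of vfs.c: a filesystem that wants
 * filesystem-specific mount options declares a prototype table of the
 * same shape and publishes it as its option prototype (the vsw_optproto
 * consulted under VSW_HASPROTO in domount() below).  The "log"/"nolog"
 * names here are hypothetical; mntopt_t/mntopts_t come from <sys/vfs.h>,
 * already included above.
 */
static char *example_log_cancel[] = { "nolog", NULL };
static char *example_nolog_cancel[] = { "log", NULL };

static mntopt_t example_fs_options[] = {
/*	  option name	cancel options		default arg	flags	data */
	{ "log",	example_log_cancel,	NULL,		0,	(void *)0 },
	{ "nolog",	example_nolog_cancel,	NULL,		0,	(void *)0 },
};

static const mntopts_t example_fs_opttbl = {
	sizeof (example_fs_options) / sizeof (mntopt_t),
	&example_fs_options[0]
};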
217 217
218 218 /*
219 219 * File system operation dispatch functions.
220 220 */
221 221
222 222 int
223 223 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
224 224 {
225 225 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
226 226 }
227 227
228 228 int
229 229 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
230 230 {
231 231 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
232 232 }
233 233
234 234 int
235 235 fsop_root(vfs_t *vfsp, vnode_t **vpp)
236 236 {
237 237 refstr_t *mntpt;
238 238 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
239 239 /*
240 240 * Make sure this root has a path. With lofs, it is possible to have
241 241 * a NULL mountpoint.
242 242 */
243 243 if (ret == 0 && vfsp->vfs_mntpt != NULL &&
244 244 (*vpp)->v_path == vn_vpath_empty) {
245 245 const char *path;
246 246
247 247 mntpt = vfs_getmntpoint(vfsp);
248 248 path = refstr_value(mntpt);
249 249 vn_setpath_str(*vpp, path, strlen(path));
250 250 refstr_rele(mntpt);
251 251 }
252 252
253 253 return (ret);
254 254 }
255 255
256 256 int
257 257 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
258 258 {
259 259 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
260 260 }
261 261
262 262 int
263 263 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
264 264 {
265 265 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
266 266 }
267 267
268 268 int
269 269 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
270 270 {
271 271 /*
272 272 * In order to handle system attribute fids in a manner
273 273 * transparent to the underlying fs, we embed the fid for
274 274 * the sysattr parent object in the sysattr fid and tack on
275 275 * some extra bytes that only the sysattr layer knows about.
276 276 *
277 277 * This guarantees that sysattr fids are larger than other fids
278 278 * for this vfs. If the vfs supports the sysattr view interface
279 279 * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
280 280 * collision with XATTR_FIDSZ.
281 281 */
282 282 if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
283 283 fidp->fid_len == XATTR_FIDSZ)
284 284 return (xattr_dir_vget(vfsp, vpp, fidp));
285 285
286 286 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
287 287 }
288 288
289 289 int
290 290 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
291 291 {
292 292 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
293 293 }
294 294
295 295 void
296 296 fsop_freefs(vfs_t *vfsp)
297 297 {
298 298 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
299 299 }
300 300
301 301 int
302 302 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
303 303 {
304 304 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
305 305 }
306 306
307 307 int
308 308 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
309 309 {
310 310 ASSERT((fstype >= 0) && (fstype < nfstype));
311 311
312 312 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
313 313 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
314 314 else
315 315 return (ENOTSUP);
316 316 }
317 317
318 318 /*
319 319 * File system initialization. vfs_setfsops() must be called from a file
320 320 * system's init routine.
321 321 */
322 322
323 323 static int
324 324 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
325 325 int *unused_ops)
326 326 {
327 327 static const fs_operation_trans_def_t vfs_ops_table[] = {
328 328 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
329 329 fs_nosys, fs_nosys,
330 330
331 331 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
332 332 fs_nosys, fs_nosys,
333 333
334 334 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
335 335 fs_nosys, fs_nosys,
336 336
337 337 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
338 338 fs_nosys, fs_nosys,
339 339
340 340 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
341 341 (fs_generic_func_p) fs_sync,
342 342 (fs_generic_func_p) fs_sync, /* No errors allowed */
343 343
344 344 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
345 345 fs_nosys, fs_nosys,
346 346
347 347 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
348 348 fs_nosys, fs_nosys,
349 349
350 350 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
351 351 (fs_generic_func_p)fs_freevfs,
352 352 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */
353 353
354 354 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
355 355 (fs_generic_func_p)fs_nosys,
356 356 (fs_generic_func_p)fs_nosys,
357 357
358 358 NULL, 0, NULL, NULL
359 359 };
360 360
361 361 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
362 362 }
363 363
364 364 void
365 365 zfs_boot_init(void)
366 366 {
367 367 if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
368 368 spa_boot_init();
369 369 }
370 370
371 371 int
372 372 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
373 373 {
374 374 int error;
375 375 int unused_ops;
376 376
377 377 /*
378 378 * Verify that fstype refers to a valid fs. Note that
379 379 * 0 is valid since it's used to set "stray" ops.
380 380 */
381 381 if ((fstype < 0) || (fstype >= nfstype))
382 382 return (EINVAL);
383 383
384 384 if (!ALLOCATED_VFSSW(&vfssw[fstype]))
385 385 return (EINVAL);
386 386
387 387 /* Set up the operations vector. */
388 388
389 389 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
390 390
391 391 if (error != 0)
392 392 return (error);
393 393
394 394 vfssw[fstype].vsw_flag |= VSW_INSTALLED;
395 395
396 396 if (actual != NULL)
397 397 *actual = &vfssw[fstype].vsw_vfsops;
398 398
399 399 #if DEBUG
400 400 if (unused_ops != 0)
401 401 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
402 402 "but not used", vfssw[fstype].vsw_name, unused_ops);
403 403 #endif
404 404
405 405 return (0);
406 406 }
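/*
 * Illustrative sketch, not part of vfs.c: how a filesystem module's init
 * routine typically uses vfs_setfsops().  The example_* names and
 * prototypes are hypothetical; the template follows the designated-
 * initializer style used by in-tree filesystems.  Operations not named
 * in the template fall back to the defaults in vfs_ops_table above
 * (fs_nosys for most of them).
 */
extern int example_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
extern int example_unmount(vfs_t *, int, cred_t *);
extern int example_root(vfs_t *, vnode_t **);
extern int example_statvfs(vfs_t *, statvfs64_t *);

static int
example_fsinit(int fstype, char *name)
{
	static const fs_operation_def_t example_vfsops_template[] = {
		VFSNAME_MOUNT,		{ .vfs_mount = example_mount },
		VFSNAME_UNMOUNT,	{ .vfs_unmount = example_unmount },
		VFSNAME_ROOT,		{ .vfs_root = example_root },
		VFSNAME_STATVFS,	{ .vfs_statvfs = example_statvfs },
		NULL,			NULL
	};
	int error;

	/* Install the ops vector for this fstype slot in vfssw[]. */
	error = vfs_setfsops(fstype, example_vfsops_template, NULL);
	if (error != 0)
		cmn_err(CE_WARN, "example_fsinit: bad vfs ops template for %s",
		    name);
	return (error);
}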
407 407
408 408 int
409 409 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
410 410 {
411 411 int error;
412 412 int unused_ops;
413 413
414 414 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
415 415
416 416 error = fs_copyfsops(template, *actual, &unused_ops);
417 417 if (error != 0) {
418 418 kmem_free(*actual, sizeof (vfsops_t));
419 419 *actual = NULL;
420 420 return (error);
421 421 }
422 422
423 423 return (0);
424 424 }
425 425
426 426 /*
427 427 * Free a vfsops structure created as a result of vfs_makefsops().
428 428 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
429 429 * vfs_freevfsops_by_type().
430 430 */
431 431 void
432 432 vfs_freevfsops(vfsops_t *vfsops)
433 433 {
434 434 kmem_free(vfsops, sizeof (vfsops_t));
435 435 }
436 436
437 437 /*
438 438 * Since the vfsops structure is part of the vfssw table and wasn't
439 439 * really allocated, we're not really freeing anything. We keep
440 440 * the name for consistency with vfs_freevfsops(). We do, however,
441 441 * need to take care of a little bookkeeping.
442 442 * NOTE: For a vfsops structure created by vfs_setfsops(), use
443 443 * vfs_freevfsops_by_type().
444 444 */
445 445 int
446 446 vfs_freevfsops_by_type(int fstype)
447 447 {
448 448
449 449 /* Verify that fstype refers to a loaded fs (and not fsid 0). */
450 450 if ((fstype <= 0) || (fstype >= nfstype))
451 451 return (EINVAL);
452 452
453 453 WLOCK_VFSSW();
454 454 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
455 455 WUNLOCK_VFSSW();
456 456 return (EINVAL);
457 457 }
458 458
459 459 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
460 460 WUNLOCK_VFSSW();
461 461
462 462 return (0);
463 463 }
464 464
465 465 /* Support routines used to reference vfs_op */
466 466
467 467 /* Set the operations vector for a vfs */
468 468 void
469 469 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
470 470 {
471 471 vfsops_t *op;
472 472
473 473 ASSERT(vfsp != NULL);
474 474 ASSERT(vfsops != NULL);
475 475
476 476 op = vfsp->vfs_op;
477 477 membar_consumer();
478 478 if (vfsp->vfs_femhead == NULL &&
479 479 atomic_cas_ptr(&vfsp->vfs_op, op, vfsops) == op) {
480 480 return;
481 481 }
482 482 fsem_setvfsops(vfsp, vfsops);
483 483 }
484 484
485 485 /* Retrieve the operations vector for a vfs */
486 486 vfsops_t *
487 487 vfs_getops(vfs_t *vfsp)
488 488 {
489 489 vfsops_t *op;
490 490
491 491 ASSERT(vfsp != NULL);
492 492
493 493 op = vfsp->vfs_op;
494 494 membar_consumer();
495 495 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
496 496 return (op);
497 497 } else {
498 498 return (fsem_getvfsops(vfsp));
499 499 }
500 500 }
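/*
 * Illustrative userspace analogue, not part of vfs.c, of the optimistic
 * protocol used by vfs_setops()/vfs_getops() above: read the ops pointer
 * without a lock, then confirm that no FEM monitor (vfs_femhead) is
 * installed and that the pointer has not changed underneath us; on any
 * doubt, fall back to the slow, monitored path.  Names are hypothetical,
 * and C11 <stdatomic.h> stands in for the kernel's membar_consumer()/
 * atomic_cas_ptr() primitives.
 */
#include <stdatomic.h>
#include <stddef.h>

struct ops;					/* opaque operations vector */

struct object {
	_Atomic(struct ops *)	obj_ops;	/* current ops vector */
	_Atomic(void *)		obj_monitor;	/* non-NULL when interposed */
};

static struct ops *
monitored_getops(struct object *op)		/* stand-in for fsem_getvfsops() */
{
	return (atomic_load(&op->obj_ops));
}

static struct ops *
sketch_getops(struct object *op)
{
	struct ops *ops = atomic_load_explicit(&op->obj_ops,
	    memory_order_acquire);

	/* Fast path: no monitor, and the pointer we read is still current. */
	if (atomic_load(&op->obj_monitor) == NULL &&
	    ops == atomic_load_explicit(&op->obj_ops, memory_order_acquire))
		return (ops);

	return (monitored_getops(op));
}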
501 501
502 502 /*
503 503 * Returns non-zero (1) if the vfsops matches that of the vfs.
504 504 * Returns zero (0) if not.
505 505 */
506 506 int
507 507 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
508 508 {
509 509 return (vfs_getops(vfsp) == vfsops);
510 510 }
511 511
512 512 /*
513 513 * Returns non-zero (1) if the file system has installed a non-default,
514 514 * non-error vfs_sync routine. Returns zero (0) otherwise.
515 515 */
516 516 int
517 517 vfs_can_sync(vfs_t *vfsp)
518 518 {
519 519 /* vfs_sync() routine is not the default/error function */
520 520 return (vfs_getops(vfsp)->vfs_sync != fs_sync);
521 521 }
522 522
523 523 /*
524 524 * Initialize a vfs structure.
525 525 */
526 526 void
527 527 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
528 528 {
529 529 /* Other initialization has been moved to vfs_alloc() */
530 530 vfsp->vfs_count = 0;
531 531 vfsp->vfs_next = vfsp;
532 532 vfsp->vfs_prev = vfsp;
533 533 vfsp->vfs_zone_next = vfsp;
534 534 vfsp->vfs_zone_prev = vfsp;
535 535 vfsp->vfs_lofi_id = 0;
536 536 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
537 537 vfsimpl_setup(vfsp);
538 538 vfsp->vfs_data = (data);
539 539 vfs_setops((vfsp), (op));
540 540 }
541 541
542 542 /*
543 543 * Allocate and initialize the vfs implementation private data
544 544 * structure, vfs_impl_t.
545 545 */
546 546 void
547 547 vfsimpl_setup(vfs_t *vfsp)
548 548 {
549 549 int i;
550 550
551 551 if (vfsp->vfs_implp != NULL) {
552 552 return;
553 553 }
554 554
555 555 vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
556 556 /* Note that these are #define'd in vfs.h */
557 557 vfsp->vfs_vskap = NULL;
558 558 vfsp->vfs_fstypevsp = NULL;
559 559
560 560 /* Set size of counted array, then zero the array */
561 561 vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
562 562 for (i = 1; i < VFS_FEATURE_MAXSZ; i++) {
563 563 vfsp->vfs_featureset[i] = 0;
564 564 }
565 565 }
566 566
567 567 /*
568 568 * Release the vfs_impl_t structure, if it exists. Some unbundled
569 569 * filesystems may not use the newer version of vfs and thus
570 570 * would not contain this implementation private data structure.
571 571 */
572 572 void
573 573 vfsimpl_teardown(vfs_t *vfsp)
574 574 {
575 575 vfs_impl_t *vip = vfsp->vfs_implp;
576 576
577 577 if (vip == NULL)
578 578 return;
579 579
580 580 kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
581 581 vfsp->vfs_implp = NULL;
582 582 }
583 583
584 584 /*
585 585 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
586 586 * fstatvfs, and sysfs moved to common/syscall.
587 587 */
588 588
589 589 /*
590 590 * Update every mounted file system. We call the vfs_sync operation of
591 591 * each file system type, passing it a NULL vfsp to indicate that all
592 592 * mounted file systems of that type should be updated.
593 593 */
594 594 void
595 595 vfs_sync(int flag)
596 596 {
597 597 struct vfssw *vswp;
598 598 RLOCK_VFSSW();
599 599 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
600 600 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
601 601 vfs_refvfssw(vswp);
602 602 RUNLOCK_VFSSW();
603 603 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
604 604 CRED());
605 605 vfs_unrefvfssw(vswp);
606 606 RLOCK_VFSSW();
607 607 }
608 608 }
609 609 RUNLOCK_VFSSW();
610 610 }
611 611
612 612 void
613 613 sync(void)
614 614 {
615 615 vfs_sync(0);
616 616 }
617 617
618 618 /*
619 619 * External routines.
620 620 */
621 621
622 622 krwlock_t vfssw_lock; /* lock accesses to vfssw */
623 623
624 624 /*
625 625 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(),
626 626 * but otherwise should be accessed only via vfs_list_lock() and
627 627 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list.
628 628 */
629 629 static krwlock_t vfslist;
630 630
631 631 /*
632 632 * Mount devfs on /devices. This is done right after root is mounted
633 633 * to provide device access support for the system
634 634 */
635 635 static void
636 636 vfs_mountdevices(void)
637 637 {
638 638 struct vfssw *vsw;
639 639 struct vnode *mvp;
640 640 struct mounta mounta = { /* fake mounta for devfs_mount() */
641 641 NULL,
642 642 NULL,
643 643 MS_SYSSPACE,
644 644 NULL,
645 645 NULL,
646 646 0,
647 647 NULL,
648 648 0
649 649 };
650 650
651 651 /*
652 652 * _init devfs module to fill in the vfssw
653 653 */
654 654 if (modload("fs", "devfs") == -1)
655 655 panic("Cannot _init devfs module");
656 656
657 657 /*
658 658 * Hold vfs
659 659 */
660 660 RLOCK_VFSSW();
661 661 vsw = vfs_getvfsswbyname("devfs");
662 662 VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
663 663 VFS_HOLD(&devices);
664 664
665 665 /*
666 666 * Locate mount point
667 667 */
668 668 if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
669 669 panic("Cannot find /devices");
670 670
671 671 /*
672 672 * Perform the mount of /devices
673 673 */
674 674 if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
675 675 panic("Cannot mount /devices");
676 676
677 677 RUNLOCK_VFSSW();
678 678
679 679 /*
680 680 * Set appropriate members and add to vfs list for mnttab display
681 681 */
682 682 vfs_setresource(&devices, "/devices", 0);
683 683 vfs_setmntpoint(&devices, "/devices", 0);
684 684
685 685 /*
686 686 * Hold the root of /devices so it won't go away
687 687 */
688 688 if (VFS_ROOT(&devices, &devicesdir))
689 689 panic("vfs_mountdevices: not devices root");
690 690
691 691 if (vfs_lock(&devices) != 0) {
692 692 VN_RELE(devicesdir);
693 693 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
694 694 return;
695 695 }
696 696
697 697 if (vn_vfswlock(mvp) != 0) {
698 698 vfs_unlock(&devices);
699 699 VN_RELE(devicesdir);
700 700 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
701 701 return;
702 702 }
703 703
704 704 vfs_add(mvp, &devices, 0);
705 705 vn_vfsunlock(mvp);
706 706 vfs_unlock(&devices);
707 707 VN_RELE(devicesdir);
708 708 }
709 709
710 710 /*
711 711 * mount the first instance of /dev to root and remain mounted
712 712 */
713 713 static void
714 714 vfs_mountdev1(void)
715 715 {
716 716 struct vfssw *vsw;
717 717 struct vnode *mvp;
718 718 struct mounta mounta = { /* fake mounta for sdev_mount() */
719 719 NULL,
720 720 NULL,
721 721 MS_SYSSPACE | MS_OVERLAY,
722 722 NULL,
723 723 NULL,
724 724 0,
725 725 NULL,
726 726 0
727 727 };
728 728
729 729 /*
730 730 * _init dev module to fill in the vfssw
731 731 */
732 732 if (modload("fs", "dev") == -1)
733 733 cmn_err(CE_PANIC, "Cannot _init dev module\n");
734 734
735 735 /*
736 736 * Hold vfs
737 737 */
738 738 RLOCK_VFSSW();
739 739 vsw = vfs_getvfsswbyname("dev");
740 740 VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
741 741 VFS_HOLD(&dev);
742 742
743 743 /*
744 744 * Locate mount point
745 745 */
746 746 if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
747 747 cmn_err(CE_PANIC, "Cannot find /dev\n");
748 748
749 749 /*
750 750 * Perform the mount of /dev
751 751 */
752 752 if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
753 753 cmn_err(CE_PANIC, "Cannot mount /dev 1\n");
754 754
755 755 RUNLOCK_VFSSW();
756 756
757 757 /*
758 758 * Set appropriate members and add to vfs list for mnttab display
759 759 */
760 760 vfs_setresource(&dev, "/dev", 0);
761 761 vfs_setmntpoint(&dev, "/dev", 0);
762 762
763 763 /*
764 764 * Hold the root of /dev so it won't go away
765 765 */
766 766 if (VFS_ROOT(&dev, &devdir))
767 767 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");
768 768
769 769 if (vfs_lock(&dev) != 0) {
770 770 VN_RELE(devdir);
771 771 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
772 772 return;
773 773 }
774 774
775 775 if (vn_vfswlock(mvp) != 0) {
776 776 vfs_unlock(&dev);
777 777 VN_RELE(devdir);
778 778 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
779 779 return;
780 780 }
781 781
782 782 vfs_add(mvp, &dev, 0);
783 783 vn_vfsunlock(mvp);
784 784 vfs_unlock(&dev);
785 785 VN_RELE(devdir);
786 786 }
787 787
788 788 /*
789 789 * Mount a required filesystem. This is done right after root is mounted.
790 790 */
791 791 static void
792 792 vfs_mountfs(char *module, char *spec, char *path)
793 793 {
794 794 struct vnode *mvp;
795 795 struct mounta mounta;
796 796 vfs_t *vfsp;
797 797
798 798 bzero(&mounta, sizeof (mounta));
799 799 mounta.flags = MS_SYSSPACE | MS_DATA;
800 800 mounta.fstype = module;
801 801 mounta.spec = spec;
802 802 mounta.dir = path;
803 803 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
804 804 cmn_err(CE_WARN, "Cannot find %s", path);
805 805 return;
806 806 }
807 807 if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
808 808 cmn_err(CE_WARN, "Cannot mount %s", path);
809 809 else
810 810 VFS_RELE(vfsp);
811 811 VN_RELE(mvp);
812 812 }
813 813
814 814 /*
815 815 * vfs_mountroot is called by main() to mount the root filesystem.
816 816 */
817 817 void
818 818 vfs_mountroot(void)
819 819 {
820 820 struct vnode *rvp = NULL;
821 821 char *path;
822 822 size_t plen;
823 823 struct vfssw *vswp;
824 824 proc_t *p;
825 825
826 826 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
827 827 rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
828 828
829 829 /*
830 830 * Alloc the vfs hash bucket array and locks
831 831 */
832 832 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
833 833
834 834 /*
835 835 * Call machine-dependent routine "rootconf" to choose a root
836 836 * file system type.
837 837 */
838 838 if (rootconf())
839 839 panic("vfs_mountroot: cannot mount root");
840 840 /*
841 841 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir
842 842 * to point to it. These are used by lookuppn() so that it
843 843 * knows where to start from ('/' or '.').
844 844 */
845 845 vfs_setmntpoint(rootvfs, "/", 0);
846 846 if (VFS_ROOT(rootvfs, &rootdir))
847 847 panic("vfs_mountroot: no root vnode");
848 848
849 849 /*
850 850 * At this point, the process tree consists of p0 and possibly some
851 851 * direct children of p0. (i.e. there are no grandchildren)
852 852 *
853 853 * Walk through them all, setting their current directory.
854 854 */
855 855 mutex_enter(&pidlock);
856 856 for (p = practive; p != NULL; p = p->p_next) {
857 857 ASSERT(p == &p0 || p->p_parent == &p0);
858 858
859 859 PTOU(p)->u_cdir = rootdir;
860 860 VN_HOLD(PTOU(p)->u_cdir);
861 861 PTOU(p)->u_rdir = NULL;
862 862 }
863 863 mutex_exit(&pidlock);
864 864
865 865 /*
866 866 * Setup the global zone's rootvp, now that it exists.
867 867 */
868 868 global_zone->zone_rootvp = rootdir;
869 869 VN_HOLD(global_zone->zone_rootvp);
870 870
871 871 /*
872 872 * Notify the module code that it can begin using the
873 873 * root filesystem instead of the boot program's services.
874 874 */
875 875 modrootloaded = 1;
876 876
877 877 /*
878 878 * Special handling for a ZFS root file system.
879 879 */
880 880 zfs_boot_init();
881 881
882 882 /*
883 883 * Set up mnttab information for root
884 884 */
885 885 vfs_setresource(rootvfs, rootfs.bo_name, 0);
886 886
887 887 /*
888 888 * Notify cluster software that the root filesystem is available.
889 889 */
890 890 clboot_mountroot();
891 891
892 892 /* Now that we're all done with the root FS, set up its vopstats */
893 893 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
894 894 /* Set flag for statistics collection */
895 895 if (vswp->vsw_flag & VSW_STATS) {
896 896 initialize_vopstats(&rootvfs->vfs_vopstats);
897 897 rootvfs->vfs_flag |= VFS_STATS;
898 898 rootvfs->vfs_fstypevsp =
899 899 get_fstype_vopstats(rootvfs, vswp);
900 900 rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
901 901 }
902 902 vfs_unrefvfssw(vswp);
903 903 }
904 904
905 905 /*
906 906 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
907 907 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
908 908 */
909 909 vfs_mountdevices();
910 910 vfs_mountdev1();
911 911
912 912 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
913 913 vfs_mountfs("proc", "/proc", "/proc");
914 914 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
915 915 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
916 916 vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
917 917 vfs_mountfs("bootfs", "bootfs", "/system/boot");
918 918
919 919 if (getzoneid() == GLOBAL_ZONEID) {
920 920 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
921 921 }
922 922
923 923 if (strcmp(rootfs.bo_fstype, "zfs") != 0) {
924 924 /*
925 925 * Look up the root device via devfs so that a dv_node is
926 926 * created for it. The vnode is never VN_RELE()ed.
927 927 * We allocate more than MAXPATHLEN so that the
928 928 * buffer passed to i_ddi_prompath_to_devfspath() is
929 929 * exactly MAXPATHLEN (the function expects a buffer
930 930 * of that length).
931 931 */
932 932 plen = strlen("/devices");
933 933 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
934 934 (void) strcpy(path, "/devices");
935 935
936 936 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
937 937 != DDI_SUCCESS ||
938 938 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
939 939
940 940 /* NUL terminate in case "path" has garbage */
941 941 path[plen + MAXPATHLEN - 1] = '\0';
942 942 #ifdef DEBUG
943 943 cmn_err(CE_WARN, "!Cannot lookup root device: %s",
944 944 path);
945 945 #endif
946 946 }
947 947 kmem_free(path, plen + MAXPATHLEN);
948 948 }
949 949
950 950 vfs_mnttabvp_setup();
951 951 }
952 952
953 953 /*
954 954 * Check to see if our "block device" is actually a file. If so,
955 955 * automatically add a lofi device, and keep track of this fact.
956 956 */
957 957 static int
958 958 lofi_add(const char *fsname, struct vfs *vfsp,
959 959 mntopts_t *mntopts, struct mounta *uap)
960 960 {
961 961 int fromspace = (uap->flags & MS_SYSSPACE) ?
962 962 UIO_SYSSPACE : UIO_USERSPACE;
963 963 struct lofi_ioctl *li = NULL;
964 964 struct vnode *vp = NULL;
965 965 struct pathname pn = { NULL };
966 966 ldi_ident_t ldi_id;
967 967 ldi_handle_t ldi_hdl;
968 968 vfssw_t *vfssw;
969 969 int id;
970 970 int err = 0;
971 971
972 972 if ((vfssw = vfs_getvfssw(fsname)) == NULL)
973 973 return (0);
974 974
975 975 if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
976 976 vfs_unrefvfssw(vfssw);
977 977 return (0);
978 978 }
979 979
980 980 vfs_unrefvfssw(vfssw);
981 981 vfssw = NULL;
982 982
983 983 if (pn_get(uap->spec, fromspace, &pn) != 0)
984 984 return (0);
985 985
986 986 if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
987 987 goto out;
988 988
989 989 if (vp->v_type != VREG)
990 990 goto out;
991 991
992 992 /* OK, this is a lofi mount. */
993 993
994 994 if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
995 995 vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
996 996 vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
997 997 vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
998 998 err = EINVAL;
999 999 goto out;
1000 1000 }
1001 1001
1002 1002 ldi_id = ldi_ident_from_anon();
1003 1003 li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1004 1004 (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);
1005 1005
1006 1006 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1007 1007 &ldi_hdl, ldi_id);
1008 1008
1009 1009 if (err)
1010 1010 goto out2;
1011 1011
1012 1012 err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
1013 1013 FREAD | FWRITE | FKIOCTL, kcred, &id);
1014 1014
1015 1015 (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1016 1016
1017 1017 if (!err)
1018 1018 vfsp->vfs_lofi_id = id;
1019 1019
1020 1020 out2:
1021 1021 ldi_ident_release(ldi_id);
1022 1022 out:
1023 1023 if (li != NULL)
1024 1024 kmem_free(li, sizeof (*li));
1025 1025 if (vp != NULL)
1026 1026 VN_RELE(vp);
1027 1027 pn_free(&pn);
1028 1028 return (err);
1029 1029 }
1030 1030
1031 1031 static void
1032 1032 lofi_remove(struct vfs *vfsp)
1033 1033 {
1034 1034 struct lofi_ioctl *li = NULL;
1035 1035 ldi_ident_t ldi_id;
1036 1036 ldi_handle_t ldi_hdl;
1037 1037 int err;
1038 1038
1039 1039 if (vfsp->vfs_lofi_id == 0)
1040 1040 return;
1041 1041
1042 1042 ldi_id = ldi_ident_from_anon();
1043 1043
1044 1044 li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1045 1045 li->li_id = vfsp->vfs_lofi_id;
1046 1046 li->li_cleanup = B_TRUE;
1047 1047
1048 1048 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1049 1049 &ldi_hdl, ldi_id);
1050 1050
1051 1051 if (err)
1052 1052 goto out;
1053 1053
1054 1054 err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
1055 1055 FREAD | FWRITE | FKIOCTL, kcred, NULL);
1056 1056
1057 1057 (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1058 1058
1059 1059 if (!err)
1060 1060 vfsp->vfs_lofi_id = 0;
1061 1061
1062 1062 out:
1063 1063 ldi_ident_release(ldi_id);
1064 1064 if (li != NULL)
1065 1065 kmem_free(li, sizeof (*li));
1066 1066 }
1067 1067
1068 1068 /*
1069 1069 * Common mount code. Called from the system call entry point, from autofs,
1070 1070 * nfsv4 trigger mounts, and from pxfs.
1071 1071 *
1072 1072 * Takes the effective file system type, mount arguments, the mount point
1073 1073 * vnode, flags specifying whether the mount is a remount and whether it
1074 1074 * should be entered into the vfs list, and credentials. Fills in its vfspp
1075 1075 * parameter with the mounted file system instance's vfs.
1076 1076 *
1077 1077 * Note that the effective file system type is specified as a string. It may
1078 1078 * be null, in which case it's determined from the mount arguments, and may
1079 1079 * differ from the type specified in the mount arguments; this is a hook to
1080 1080 * allow interposition when instantiating file system instances.
1081 1081 *
1082 1082 * The caller is responsible for releasing its own hold on the mount point
1083 1083 * vp (this routine does its own hold when necessary).
1084 1084 * Also note that for remounts, the mount point vp should be the vnode for
1085 1085 * the root of the file system rather than the vnode that the file system
1086 1086 * is mounted on top of.
1087 1087 */
1088 1088 int
1089 1089 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1090 1090 struct vfs **vfspp)
1091 1091 {
1092 1092 struct vfssw *vswp;
1093 1093 vfsops_t *vfsops;
1094 1094 struct vfs *vfsp;
1095 1095 struct vnode *bvp;
1096 1096 dev_t bdev = 0;
1097 1097 mntopts_t mnt_mntopts;
1098 1098 int error = 0;
1099 1099 int copyout_error = 0;
1100 1100 int ovflags;
1101 1101 char *opts = uap->optptr;
1102 1102 char *inargs = opts;
1103 1103 int optlen = uap->optlen;
1104 1104 int remount;
1105 1105 int rdonly;
1106 1106 int nbmand = 0;
1107 1107 int delmip = 0;
1108 1108 int addmip = 0;
1109 1109 int splice = ((uap->flags & MS_NOSPLICE) == 0);
1110 1110 int fromspace = (uap->flags & MS_SYSSPACE) ?
1111 1111 UIO_SYSSPACE : UIO_USERSPACE;
1112 1112 char *resource = NULL, *mountpt = NULL;
1113 1113 refstr_t *oldresource, *oldmntpt;
1114 1114 struct pathname pn, rpn;
1115 1115 vsk_anchor_t *vskap;
1116 1116 char fstname[FSTYPSZ];
1117 1117 zone_t *zone;
1118 1118
1119 1119 /*
1120 1120 * The v_flag value for the mount point vp is permanently set
1121 1121 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1122 1122 * for mount point locking.
1123 1123 */
1124 1124 mutex_enter(&vp->v_lock);
1125 1125 vp->v_flag |= VVFSLOCK;
1126 1126 mutex_exit(&vp->v_lock);
1127 1127
1128 1128 mnt_mntopts.mo_count = 0;
1129 1129 /*
1130 1130 * Find the ops vector to use to invoke the file system-specific mount
1131 1131 * method. If the fsname argument is non-NULL, use it directly.
1132 1132 * Otherwise, dig the file system type information out of the mount
1133 1133 * arguments.
1134 1134 *
1135 1135 * A side effect is to hold the vfssw entry.
1136 1136 *
1137 1137 * Mount arguments can be specified in several ways, which are
1138 1138 * distinguished by flag bit settings. The preferred way is to set
1139 1139 * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1140 1140 * type supplied as a character string and the last two arguments
1141 1141 * being a pointer to a character buffer and the size of the buffer.
1142 1142 * On entry, the buffer holds a null terminated list of options; on
1143 1143 * return, the string is the list of options the file system
1144 1144 * recognized. If MS_DATA is set arguments five and six point to a
1145 1145 * block of binary data which the file system interprets.
1146 1146 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1147 1147 * consistently with these conventions. To handle them, we check to
1148 1148 * see whether the pointer to the file system name has a numeric value
1149 1149 * less than 256. If so, we treat it as an index.
1150 1150 */
1151 1151 if (fsname != NULL) {
1152 1152 if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1153 1153 return (EINVAL);
1154 1154 }
1155 1155 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1156 1156 size_t n;
1157 1157 uint_t fstype;
1158 1158
1159 1159 fsname = fstname;
1160 1160
1161 1161 if ((fstype = (uintptr_t)uap->fstype) < 256) {
1162 1162 RLOCK_VFSSW();
1163 1163 if (fstype == 0 || fstype >= nfstype ||
1164 1164 !ALLOCATED_VFSSW(&vfssw[fstype])) {
1165 1165 RUNLOCK_VFSSW();
1166 1166 return (EINVAL);
1167 1167 }
1168 1168 (void) strcpy(fsname, vfssw[fstype].vsw_name);
1169 1169 RUNLOCK_VFSSW();
1170 1170 if ((vswp = vfs_getvfssw(fsname)) == NULL)
1171 1171 return (EINVAL);
1172 1172 } else {
1173 1173 /*
1174 1174 * Handle either kernel or user address space.
1175 1175 */
1176 1176 if (uap->flags & MS_SYSSPACE) {
1177 1177 error = copystr(uap->fstype, fsname,
1178 1178 FSTYPSZ, &n);
1179 1179 } else {
1180 1180 error = copyinstr(uap->fstype, fsname,
1181 1181 FSTYPSZ, &n);
1182 1182 }
1183 1183 if (error) {
1184 1184 if (error == ENAMETOOLONG)
1185 1185 return (EINVAL);
1186 1186 return (error);
1187 1187 }
1188 1188 if ((vswp = vfs_getvfssw(fsname)) == NULL)
1189 1189 return (EINVAL);
1190 1190 }
1191 1191 } else {
1192 1192 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1193 1193 return (EINVAL);
1194 1194 fsname = vswp->vsw_name;
1195 1195 }
1196 1196 if (!VFS_INSTALLED(vswp))
1197 1197 return (EINVAL);
1198 1198
1199 1199 if ((error = secpolicy_fs_allowed_mount(fsname)) != 0) {
1200 1200 vfs_unrefvfssw(vswp);
1201 1201 return (error);
1202 1202 }
1203 1203
1204 1204 vfsops = &vswp->vsw_vfsops;
1205 1205
1206 1206 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1207 1207 /*
1208 1208 * Fetch mount options and parse them for generic vfs options
1209 1209 */
1210 1210 if (uap->flags & MS_OPTIONSTR) {
1211 1211 /*
1212 1212 * Limit the buffer size
1213 1213 */
1214 1214 if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1215 1215 error = EINVAL;
1216 1216 goto errout;
1217 1217 }
1218 1218 if ((uap->flags & MS_SYSSPACE) == 0) {
1219 1219 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1220 1220 inargs[0] = '\0';
1221 1221 if (optlen) {
1222 1222 error = copyinstr(opts, inargs, (size_t)optlen,
1223 1223 NULL);
1224 1224 if (error) {
1225 1225 goto errout;
1226 1226 }
1227 1227 }
1228 1228 }
1229 1229 vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1230 1230 }
1231 1231 /*
1232 1232 * Flag bits override the options string.
1233 1233 */
1234 1234 if (uap->flags & MS_REMOUNT)
1235 1235 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1236 1236 if (uap->flags & MS_RDONLY)
1237 1237 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1238 1238 if (uap->flags & MS_NOSUID)
1239 1239 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1240 1240
1241 1241 /*
1242 1242 * Check if this is a remount; must be set in the option string and
1243 1243 * the file system must support a remount option.
1244 1244 */
1245 1245 if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1246 1246 MNTOPT_REMOUNT, NULL)) {
1247 1247 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1248 1248 error = ENOTSUP;
1249 1249 goto errout;
1250 1250 }
1251 1251 uap->flags |= MS_REMOUNT;
1252 1252 }
1253 1253
1254 1254 /*
1255 1255 * uap->flags and vfs_optionisset() should agree.
1256 1256 */
1257 1257 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1258 1258 uap->flags |= MS_RDONLY;
1259 1259 }
1260 1260 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1261 1261 uap->flags |= MS_NOSUID;
1262 1262 }
1263 1263 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1264 1264 ASSERT(splice || !remount);
1265 1265 /*
1266 1266 * If we are splicing the fs into the namespace,
1267 1267 * perform mount point checks.
1268 1268 *
1269 1269 * We want to resolve the path for the mount point to eliminate
1270 1270 * '.' and ".." and symlinks in mount points; we can't do the
1271 1271 * same for the resource string, since it would turn
1272 1272 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do
1273 1273 * this before grabbing vn_vfswlock(), because otherwise we
1274 1274 * would deadlock with lookuppn().
1275 1275 */
1276 1276 if (splice) {
1277 1277 ASSERT(vp->v_count > 0);
1278 1278
1279 1279 /*
1280 1280 * Pick up mount point and device from appropriate space.
1281 1281 */
1282 1282 if (pn_get(uap->spec, fromspace, &pn) == 0) {
1283 1283 resource = kmem_alloc(pn.pn_pathlen + 1,
1284 1284 KM_SLEEP);
1285 1285 (void) strcpy(resource, pn.pn_path);
1286 1286 pn_free(&pn);
1287 1287 }
1288 1288 /*
1289 1289 * Do a lookupname prior to taking the
1290 1290 * writelock. Mark this as completed if
1291 1291 * successful for later cleanup and addition to
1292 1292 * the mount in progress table.
1293 1293 */
1294 1294 if ((vswp->vsw_flag & VSW_MOUNTDEV) &&
1295 1295 (uap->flags & MS_GLOBAL) == 0 &&
1296 1296 lookupname(uap->spec, fromspace,
1297 1297 FOLLOW, NULL, &bvp) == 0) {
1298 1298 addmip = 1;
1299 1299 }
1300 1300
1301 1301 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1302 1302 pathname_t *pnp;
1303 1303
1304 1304 if (*pn.pn_path != '/') {
1305 1305 error = EINVAL;
1306 1306 pn_free(&pn);
1307 1307 goto errout;
1308 1308 }
1309 1309 pn_alloc(&rpn);
1310 1310 /*
1311 1311 * Kludge to prevent autofs from deadlocking with
1312 1312 * itself when it calls domount().
1313 1313 *
1314 1314 * If autofs is calling, it is because it is doing
1315 1315 * (autofs) mounts in the process of an NFS mount. A
1316 1316 * lookuppn() here would cause us to block waiting for
1317 1317 * said NFS mount to complete, which can't since this
1318 1318 * is the thread that was supposed to doing it.
1319 1319 * is the thread that was supposed to be doing it.
1320 1320 if (fromspace == UIO_USERSPACE) {
1321 1321 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1322 1322 NULL)) == 0) {
1323 1323 pnp = &rpn;
1324 1324 } else {
1325 1325 /*
1326 1326 * The file disappeared or otherwise
1327 1327 * became inaccessible since we opened
1328 1328 * it; might as well fail the mount
1329 1329 * since the mount point is no longer
1330 1330 * accessible.
1331 1331 */
1332 1332 pn_free(&rpn);
1333 1333 pn_free(&pn);
1334 1334 goto errout;
1335 1335 }
1336 1336 } else {
1337 1337 pnp = &pn;
1338 1338 }
1339 1339 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1340 1340 (void) strcpy(mountpt, pnp->pn_path);
1341 1341
1342 1342 /*
1343 1343 * If the addition of the zone's rootpath
1344 1344 * would push us over a total path length
1345 1345 * of MAXPATHLEN, we fail the mount with
1346 1346 * ENAMETOOLONG, which is what we would have
1347 1347 * gotten if we were trying to perform the same
1348 1348 * mount in the global zone.
1349 1349 *
1350 1350 * strlen() doesn't count the trailing
1351 1351 * '\0', but zone_rootpathlen counts both a
1352 1352 * trailing '/' and the terminating '\0'.
1353 1353 */
1354 1354 if ((curproc->p_zone->zone_rootpathlen - 1 +
1355 1355 strlen(mountpt)) > MAXPATHLEN ||
1356 1356 (resource != NULL &&
1357 1357 (curproc->p_zone->zone_rootpathlen - 1 +
1358 1358 strlen(resource)) > MAXPATHLEN)) {
1359 1359 error = ENAMETOOLONG;
1360 1360 }
1361 1361
1362 1362 pn_free(&rpn);
1363 1363 pn_free(&pn);
1364 1364 }
1365 1365
1366 1366 if (error)
1367 1367 goto errout;
1368 1368
1369 1369 /*
1370 1370 * Prevent path name resolution from proceeding past
1371 1371 * the mount point.
1372 1372 */
1373 1373 if (vn_vfswlock(vp) != 0) {
1374 1374 error = EBUSY;
1375 1375 goto errout;
1376 1376 }
1377 1377
1378 1378 /*
1379 1379 * Verify that it's legitimate to establish a mount on
1380 1380 * the prospective mount point.
1381 1381 */
1382 1382 if (vn_mountedvfs(vp) != NULL) {
1383 1383 /*
1384 1384 * The mount point lock was obtained after some
1385 1385 * other thread raced through and established a mount.
1386 1386 */
1387 1387 vn_vfsunlock(vp);
1388 1388 error = EBUSY;
1389 1389 goto errout;
1390 1390 }
1391 1391 if (vp->v_flag & VNOMOUNT) {
1392 1392 vn_vfsunlock(vp);
1393 1393 error = EINVAL;
1394 1394 goto errout;
1395 1395 }
1396 1396 }
1397 1397 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1398 1398 uap->dataptr = NULL;
1399 1399 uap->datalen = 0;
1400 1400 }
1401 1401
1402 1402 /*
1403 1403 * If this is a remount, we don't want to create a new VFS.
1404 1404 * Instead, we pass the existing one with a remount flag.
1405 1405 */
1406 1406 if (remount) {
1407 1407 /*
1408 1408 * Confirm that the mount point is the root vnode of the
1409 1409 * file system that is being remounted.
1410 1410 * This can happen if the user specifies a different
1411 1411 * mount point directory pathname in the (re)mount command.
1412 1412 *
1413 1413 * Code below can only be reached if splice is true, so it's
1414 1414 * safe to do vn_vfsunlock() here.
1415 1415 */
1416 1416 if ((vp->v_flag & VROOT) == 0) {
1417 1417 vn_vfsunlock(vp);
1418 1418 error = ENOENT;
1419 1419 goto errout;
1420 1420 }
1421 1421 /*
1422 1422 * Disallow making file systems read-only unless file system
1423 1423 * explicitly allows it in its vfssw. Ignore other flags.
1424 1424 */
1425 1425 if (rdonly && vn_is_readonly(vp) == 0 &&
1426 1426 (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1427 1427 vn_vfsunlock(vp);
1428 1428 error = EINVAL;
1429 1429 goto errout;
1430 1430 }
1431 1431 /*
1432 1432 * Disallow changing the NBMAND disposition of the file
1433 1433 * system on remounts.
1434 1434 */
1435 1435 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1436 1436 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1437 1437 vn_vfsunlock(vp);
1438 1438 error = EINVAL;
1439 1439 goto errout;
1440 1440 }
1441 1441 vfsp = vp->v_vfsp;
1442 1442 ovflags = vfsp->vfs_flag;
1443 1443 vfsp->vfs_flag |= VFS_REMOUNT;
1444 1444 vfsp->vfs_flag &= ~VFS_RDONLY;
1445 1445 } else {
1446 1446 vfsp = vfs_alloc(KM_SLEEP);
1447 1447 VFS_INIT(vfsp, vfsops, NULL);
1448 1448 }
1449 1449
1450 1450 VFS_HOLD(vfsp);
1451 1451
1452 1452 if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1453 1453 if (!remount) {
1454 1454 if (splice)
1455 1455 vn_vfsunlock(vp);
1456 1456 vfs_free(vfsp);
1457 1457 } else {
1458 1458 vn_vfsunlock(vp);
1459 1459 VFS_RELE(vfsp);
1460 1460 }
1461 1461 goto errout;
1462 1462 }
1463 1463
1464 1464 /*
1465 1465 * PRIV_SYS_MOUNT doesn't mean you can become root.
1466 1466 */
1467 1467 if (vfsp->vfs_lofi_id != 0) {
1468 1468 uap->flags |= MS_NOSUID;
1469 1469 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1470 1470 }
1471 1471
1472 1472 /*
1473 1473 * The vfs_reflock is not used anymore; the code below explicitly
1474 1474 * holds it, preventing others from accessing it directly.
1475 1475 */
1476 1476 if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1477 1477 !(vfsp->vfs_flag & VFS_REMOUNT))
1478 1478 cmn_err(CE_WARN,
1479 1479 "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1480 1480
1481 1481 /*
1482 1482 * Lock the vfs. If this is a remount we want to avoid spurious umount
1483 1483 * failures that happen as a side-effect of fsflush() and other mount
1484 1484 * and unmount operations that might be going on simultaneously and
1485 1485 * may have locked the vfs currently. To not return EBUSY immediately
1486 1486 * here we use vfs_lock_wait() instead of vfs_lock() for the remount case.
1487 1487 */
1488 1488 if (!remount) {
1489 1489 if (error = vfs_lock(vfsp)) {
1490 1490 vfsp->vfs_flag = ovflags;
1491 1491
1492 1492 lofi_remove(vfsp);
1493 1493
1494 1494 if (splice)
1495 1495 vn_vfsunlock(vp);
1496 1496 vfs_free(vfsp);
1497 1497 goto errout;
1498 1498 }
1499 1499 } else {
1500 1500 vfs_lock_wait(vfsp);
1501 1501 }
1502 1502
1503 1503 /*
1504 1504 * Add device to mount in progress table; global mounts require special
1505 1505 * handling. It is possible that we have already done the lookupname
1506 1506 * on a spliced, non-global fs. If so, we don't want to do it again
1507 1507 * since we cannot do a lookupname after taking the
1508 1508 * wlock above. This case is for a non-spliced, non-global filesystem.
1509 1509 */
1510 1510 if (!addmip) {
1511 1511 if ((vswp->vsw_flag & VSW_MOUNTDEV) &&
1512 1512 (uap->flags & MS_GLOBAL) == 0 &&
1513 1513 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1514 1514 addmip = 1;
1515 1515 }
1516 1516 }
1517 1517
1518 1518 if (addmip) {
1519 1519 vnode_t *lvp = NULL;
1520 1520
1521 1521 error = vfs_get_lofi(vfsp, &lvp);
1522 1522 if (error > 0) {
1523 1523 lofi_remove(vfsp);
1524 1524
1525 1525 if (splice)
1526 1526 vn_vfsunlock(vp);
1527 1527 vfs_unlock(vfsp);
1528 1528
1529 1529 if (remount) {
1530 1530 VFS_RELE(vfsp);
1531 1531 } else {
1532 1532 vfs_free(vfsp);
1533 1533 }
1534 1534
1535 1535 goto errout;
1536 1536 } else if (error == -1) {
1537 1537 bdev = bvp->v_rdev;
1538 1538 VN_RELE(bvp);
1539 1539 } else {
1540 1540 bdev = lvp->v_rdev;
1541 1541 VN_RELE(lvp);
1542 1542 VN_RELE(bvp);
1543 1543 }
1544 1544
1545 1545 vfs_addmip(bdev, vfsp);
1546 1546 addmip = 0;
1547 1547 delmip = 1;
1548 1548 }
1549 1549 /*
1550 1550 * Invalidate cached entry for the mount point.
1551 1551 */
1552 1552 if (splice)
1553 1553 dnlc_purge_vp(vp);
1554 1554
1555 1555 /*
1556 1556 * If we have an option string but the filesystem doesn't supply a
1557 1557 * prototype options table, create a table with the global
1558 1558 * options and sufficient room to accept all the options in the
1559 1559 * string. Then parse the passed in option string
1560 1560 * accepting all the options in the string. This gives us an
1561 1561 * option table with all the proper cancel properties for the
1562 1562 * global options.
1563 1563 *
1564 1564 * Filesystems that supply a prototype options table are handled
1565 1565 * earlier in this function.
1566 1566 */
1567 1567 if (uap->flags & MS_OPTIONSTR) {
1568 1568 if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1569 1569 mntopts_t tmp_mntopts;
1570 1570
1571 1571 tmp_mntopts.mo_count = 0;
1572 1572 vfs_createopttbl_extend(&tmp_mntopts, inargs,
1573 1573 &mnt_mntopts);
1574 1574 vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1575 1575 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1576 1576 vfs_freeopttbl(&tmp_mntopts);
1577 1577 }
1578 1578 }
1579 1579
1580 1580 /*
1581 1581 * Serialize with zone state transitions.
1582 1582 * See vfs_list_add; zone mounted into is:
1583 1583 * zone_find_by_path(refstr_value(vfsp->vfs_mntpt))
1584 1584 * not the zone doing the mount (curproc->p_zone), but if we're already
1585 1585 * inside an NGZ, then we know which zone we are in.
1586 1586 */
1587 1587 if (INGLOBALZONE(curproc)) {
1588 1588 zone = zone_find_by_path(mountpt);
1589 1589 ASSERT(zone != NULL);
1590 1590 } else {
1591 1591 zone = curproc->p_zone;
1592 1592 /*
1593 1593 * zone_find_by_path does a hold, so do one here too so that
1594 1594 * we can do a zone_rele after mount_completed.
1595 1595 */
1596 1596 zone_hold(zone);
1597 1597 }
1598 1598 mount_in_progress(zone);
1599 1599 /*
1600 1600 * Instantiate (or reinstantiate) the file system. If appropriate,
1601 1601 * splice it into the file system name space.
1602 1602 *
1603 1603 * We want VFS_MOUNT() to be able to override the vfs_resource
1604 1604 * string if necessary (ie, mntfs), and also for a remount to
1605 1605 * change the same (necessary when remounting '/' during boot).
1606 1606 * So we set up vfs_mntpt and vfs_resource to what we think they
1607 1607 * should be, then hand off control to VFS_MOUNT() which can
1608 1608 * override this.
1609 1609 *
1610 1610 * For safety's sake, when changing vfs_resource or vfs_mntpt of
1611 1611 * a vfs which is on the vfs list (i.e. during a remount), we must
1612 1612 * never set those fields to NULL. Several bits of code make
1613 1613 * assumptions that the fields are always valid.
1614 1614 */
1615 1615 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1616 1616 if (remount) {
1617 1617 if ((oldresource = vfsp->vfs_resource) != NULL)
1618 1618 refstr_hold(oldresource);
1619 1619 if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1620 1620 refstr_hold(oldmntpt);
1621 1621 }
1622 1622 vfs_setresource(vfsp, resource, 0);
1623 1623 vfs_setmntpoint(vfsp, mountpt, 0);
1624 1624
1625 1625 /*
1626 1626 * going to mount on this vnode, so notify.
1627 1627 */
1628 1628 vnevent_mountedover(vp, NULL);
1629 1629 error = VFS_MOUNT(vfsp, vp, uap, credp);
1630 1630
1631 1631 if (uap->flags & MS_RDONLY)
1632 1632 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1633 1633 if (uap->flags & MS_NOSUID)
1634 1634 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1635 1635 if (uap->flags & MS_GLOBAL)
1636 1636 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1637 1637
1638 1638 if (error) {
1639 1639 lofi_remove(vfsp);
1640 1640
1641 1641 if (remount) {
1642 1642 /* put back pre-remount options */
1643 1643 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1644 1644 vfs_setmntpoint(vfsp, refstr_value(oldmntpt),
1645 1645 VFSSP_VERBATIM);
1646 1646 if (oldmntpt)
1647 1647 refstr_rele(oldmntpt);
1648 1648 vfs_setresource(vfsp, refstr_value(oldresource),
1649 1649 VFSSP_VERBATIM);
1650 1650 if (oldresource)
1651 1651 refstr_rele(oldresource);
1652 1652 vfsp->vfs_flag = ovflags;
1653 1653 vfs_unlock(vfsp);
1654 1654 VFS_RELE(vfsp);
1655 1655 } else {
1656 1656 vfs_unlock(vfsp);
1657 1657 vfs_freemnttab(vfsp);
1658 1658 vfs_free(vfsp);
1659 1659 }
1660 1660 } else {
1661 1661 /*
1662 1662 * Set the mount time to now
1663 1663 */
1664 1664 vfsp->vfs_mtime = ddi_get_time();
1665 1665 if (remount) {
1666 1666 vfsp->vfs_flag &= ~VFS_REMOUNT;
1667 1667 if (oldresource)
1668 1668 refstr_rele(oldresource);
1669 1669 if (oldmntpt)
1670 1670 refstr_rele(oldmntpt);
1671 1671 } else if (splice) {
1672 1672 /*
1673 1673 * Link vfsp into the name space at the mount
1674 1674 * point. Vfs_add() is responsible for
1675 1675 * holding the mount point which will be
1676 1676 * released when vfs_remove() is called.
1677 1677 */
1678 1678 vfs_add(vp, vfsp, uap->flags);
1679 1679 } else {
1680 1680 /*
1681 1681 * Hold the reference to file system which is
1682 1682 * not linked into the name space.
1683 1683 */
1684 1684 vfsp->vfs_zone = NULL;
1685 1685 VFS_HOLD(vfsp);
1686 1686 vfsp->vfs_vnodecovered = NULL;
1687 1687 }
1688 1688 /*
1689 1689 * Set flags for global options encountered
1690 1690 */
1691 1691 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1692 1692 vfsp->vfs_flag |= VFS_RDONLY;
1693 1693 else
1694 1694 vfsp->vfs_flag &= ~VFS_RDONLY;
1695 1695 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1696 1696 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1697 1697 } else {
1698 1698 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1699 1699 vfsp->vfs_flag |= VFS_NODEVICES;
1700 1700 else
1701 1701 vfsp->vfs_flag &= ~VFS_NODEVICES;
1702 1702 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1703 1703 vfsp->vfs_flag |= VFS_NOSETUID;
1704 1704 else
1705 1705 vfsp->vfs_flag &= ~VFS_NOSETUID;
1706 1706 }
1707 1707 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1708 1708 vfsp->vfs_flag |= VFS_NBMAND;
1709 1709 else
1710 1710 vfsp->vfs_flag &= ~VFS_NBMAND;
1711 1711
1712 1712 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1713 1713 vfsp->vfs_flag |= VFS_XATTR;
1714 1714 else
1715 1715 vfsp->vfs_flag &= ~VFS_XATTR;
1716 1716
1717 1717 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1718 1718 vfsp->vfs_flag |= VFS_NOEXEC;
1719 1719 else
1720 1720 vfsp->vfs_flag &= ~VFS_NOEXEC;
1721 1721
1722 1722 /*
1723 1723 * Now construct the output option string of options
1724 1724 * we recognized.
1725 1725 */
1726 1726 if (uap->flags & MS_OPTIONSTR) {
1727 1727 vfs_list_read_lock();
1728 1728 copyout_error = vfs_buildoptionstr(
1729 1729 &vfsp->vfs_mntopts, inargs, optlen);
1730 1730 vfs_list_unlock();
1731 1731 if (copyout_error == 0 &&
1732 1732 (uap->flags & MS_SYSSPACE) == 0) {
1733 1733 copyout_error = copyoutstr(inargs, opts,
1734 1734 optlen, NULL);
1735 1735 }
1736 1736 }
1737 1737
1738 1738 /*
1739 1739 * If this isn't a remount, set up the vopstats before
1740 1740 * anyone can touch this. We only allow spliced file
1741 1741 * systems (file systems which are in the namespace) to
1742 1742 * have the VFS_STATS flag set.
1743 1743 * NOTE: PxFS mounts the underlying file system with
1744 1744 * MS_NOSPLICE set and copies those vfs_flags to its private
1745 1745 * vfs structure. As a result, PxFS should never have
1746 1746 * the VFS_STATS flag or else we might access the vfs
1747 1747 * statistics-related fields prior to them being
1748 1748 * properly initialized.
1749 1749 */
1750 1750 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1751 1751 initialize_vopstats(&vfsp->vfs_vopstats);
1752 1752 /*
1753 1753 * We need to set vfs_vskap to NULL because there's
1754 1754 * a chance it won't be set below. This is checked
1755 1755 * in teardown_vopstats() so we can't have garbage.
1756 1756 */
1757 1757 vfsp->vfs_vskap = NULL;
1758 1758 vfsp->vfs_flag |= VFS_STATS;
1759 1759 vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1760 1760 }
1761 1761
1762 1762 if (vswp->vsw_flag & VSW_XID)
1763 1763 vfsp->vfs_flag |= VFS_XID;
1764 1764
1765 1765 vfs_unlock(vfsp);
1766 1766 }
1767 1767 mount_completed(zone);
1768 1768 zone_rele(zone);
1769 1769 if (splice)
1770 1770 vn_vfsunlock(vp);
1771 1771
1772 1772 if ((error == 0) && (copyout_error == 0)) {
1773 1773 if (!remount) {
1774 1774 /*
1775 1775 * Don't call get_vskstat_anchor() while holding
1776 1776 * locks since it allocates memory and calls
1777 1777 * VFS_STATVFS(). For NFS, the latter can generate
1778 1778 * an over-the-wire call.
1779 1779 */
1780 1780 vskap = get_vskstat_anchor(vfsp);
1781 1781 /* Only take the lock if we have something to do */
1782 1782 if (vskap != NULL) {
1783 1783 vfs_lock_wait(vfsp);
1784 1784 if (vfsp->vfs_flag & VFS_STATS) {
1785 1785 vfsp->vfs_vskap = vskap;
1786 1786 }
1787 1787 vfs_unlock(vfsp);
1788 1788 }
1789 1789 }
1790 1790 /* Return vfsp to caller. */
1791 1791 *vfspp = vfsp;
1792 1792 }
1793 1793 errout:
1794 1794 vfs_freeopttbl(&mnt_mntopts);
1795 1795 if (resource != NULL)
1796 1796 kmem_free(resource, strlen(resource) + 1);
1797 1797 if (mountpt != NULL)
1798 1798 kmem_free(mountpt, strlen(mountpt) + 1);
1799 1799 /*
1800 1800 * It is possible we errored prior to adding to mount in progress
1801 1801 * table. Must free vnode we acquired with successful lookupname.
1802 1802 */
1803 1803 if (addmip)
1804 1804 VN_RELE(bvp);
1805 1805 if (delmip)
1806 1806 vfs_delmip(vfsp);
1807 1807 ASSERT(vswp != NULL);
1808 1808 vfs_unrefvfssw(vswp);
1809 1809 if (inargs != opts)
1810 1810 kmem_free(inargs, MAX_MNTOPT_STR);
1811 1811 if (copyout_error) {
1812 1812 lofi_remove(vfsp);
1813 1813 VFS_RELE(vfsp);
1814 1814 error = copyout_error;
1815 1815 }
1816 1816 return (error);
1817 1817 }
1818 1818
1819 1819 static void
1820 1820 vfs_setpath(
1821 1821 struct vfs *vfsp, /* vfs being updated */
1822 1822 refstr_t **refp, /* Ref-count string to contain the new path */
1823 1823 const char *newpath, /* Path to add to refp (above) */
1824 1824 uint32_t flag) /* flag */
1825 1825 {
1826 1826 size_t len;
1827 1827 refstr_t *ref;
1828 1828 zone_t *zone = curproc->p_zone;
1829 1829 char *sp;
1830 1830 int have_list_lock = 0;
1831 1831
1832 1832 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1833 1833
1834 1834 /*
1835 1835 * New path must be less than MAXPATHLEN because mntfs
1836 1836 * will only display up to MAXPATHLEN bytes. This is currently
1837 1837 * safe, because domount() uses pn_get(), and other callers
1838 1838 * similarly cap the size to fewer than MAXPATHLEN bytes.
1839 1839 */
1840 1840
1841 1841 ASSERT(strlen(newpath) < MAXPATHLEN);
1842 1842
1843 1843 /* mntfs requires consistency while vfs list lock is held */
1844 1844
1845 1845 if (VFS_ON_LIST(vfsp)) {
1846 1846 have_list_lock = 1;
1847 1847 vfs_list_lock();
1848 1848 }
1849 1849
1850 1850 if (*refp != NULL)
1851 1851 refstr_rele(*refp);
1852 1852
1853 1853 /*
1854 1854 * If we are in a non-global zone then we prefix the supplied path,
1855 1855 * newpath, with the zone's root path, with two exceptions. The first
1856 1856 * is where we have been explicitly directed to avoid doing so; this
1857 1857 * will be the case following a failed remount, where the path supplied
1858 1858 * will be a saved version which must now be restored. The second
1859 1859 * exception is where newpath is not a pathname but a descriptive name,
1860 1860 * e.g. "procfs".
1861 1861 */
1862 1862 if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') {
1863 1863 ref = refstr_alloc(newpath);
1864 1864 goto out;
1865 1865 }
1866 1866
1867 1867 /*
1868 1868 * Truncate the trailing '/' in the zoneroot, and merge
1869 1869 * in the zone's rootpath with the "newpath" (resource
1870 1870 * or mountpoint) passed in.
1871 1871 *
1872 1872 * The size of the required buffer is thus the size of
1873 1873 * the buffer required for the passed-in newpath
1874 1874 * (strlen(newpath) + 1), plus the size of the buffer
1875 1875 * required to hold zone_rootpath (zone_rootpathlen)
1876 1876 * minus one for one of the now-superfluous NUL
1877 1877 * terminations, minus one for the trailing '/'.
1878 1878 *
1879 1879 * That gives us:
1880 1880 *
1881 1881 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1882 1882 *
1883 1883 * Which is what we have below.
1884 1884 */
1885 1885
1886 1886 len = strlen(newpath) + zone->zone_rootpathlen - 1;
1887 1887 sp = kmem_alloc(len, KM_SLEEP);
1888 1888
1889 1889 /*
1890 1890 * Copy everything including the trailing slash, which
1891 1891 * we then overwrite with the NUL character.
1892 1892 */
1893 1893
1894 1894 (void) strcpy(sp, zone->zone_rootpath);
1895 1895 sp[zone->zone_rootpathlen - 2] = '\0';
1896 1896 (void) strcat(sp, newpath);
1897 1897
1898 1898 ref = refstr_alloc(sp);
1899 1899 kmem_free(sp, len);
1900 1900 out:
1901 1901 *refp = ref;
1902 1902
1903 1903 if (have_list_lock) {
1904 1904 vfs_mnttab_modtimeupd();
1905 1905 vfs_list_unlock();
1906 1906 }
1907 1907 }
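
To make the buffer arithmetic above concrete, here is a minimal stand-alone sketch of the zone-root prefixing (not part of vfs.c; prefix_zone_root and its arguments are hypothetical). It assumes, as the kernel does, that zone_rootpathlen counts the trailing '/' and the NUL:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *
prefix_zone_root(const char *zone_rootpath, size_t zone_rootpathlen,
    const char *newpath)
{
        /* (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1, as above */
        size_t len = strlen(newpath) + zone_rootpathlen - 1;
        char *sp = malloc(len);

        if (sp == NULL)
                return (NULL);
        /* Copy the zone root, then overwrite its trailing '/' with NUL. */
        (void) strcpy(sp, zone_rootpath);
        sp[zone_rootpathlen - 2] = '\0';
        (void) strcat(sp, newpath);
        return (sp);
}

int
main(void)
{
        /* "/zones/web/root/" is 16 characters, so rootpathlen is 17. */
        char *p = prefix_zone_root("/zones/web/root/", 17, "/export/home");

        if (p != NULL) {
                (void) printf("%s\n", p);  /* /zones/web/root/export/home */
                free(p);
        }
        return (0);
}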
1908 1908
1909 1909 /*
1910 1910 * Record a mounted resource name in a vfs structure.
1911 1911 * If vfsp is already mounted, caller must hold the vfs lock.
1912 1912 */
1913 1913 void
1914 1914 vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag)
1915 1915 {
1916 1916 if (resource == NULL || resource[0] == '\0')
1917 1917 resource = VFS_NORESOURCE;
1918 1918 vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag);
1919 1919 }
1920 1920
1921 1921 /*
1922 1922 * Record a mount point name in a vfs structure.
1923 1923 * If vfsp is already mounted, caller must hold the vfs lock.
1924 1924 */
1925 1925 void
1926 1926 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag)
1927 1927 {
1928 1928 if (mntpt == NULL || mntpt[0] == '\0')
1929 1929 mntpt = VFS_NOMNTPT;
1930 1930 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag);
1931 1931 }
1932 1932
1933 1933 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1934 1934
1935 1935 refstr_t *
1936 1936 vfs_getresource(const struct vfs *vfsp)
1937 1937 {
1938 1938 refstr_t *resource;
1939 1939
1940 1940 vfs_list_read_lock();
1941 1941 resource = vfsp->vfs_resource;
1942 1942 refstr_hold(resource);
1943 1943 vfs_list_unlock();
1944 1944
1945 1945 return (resource);
1946 1946 }
1947 1947
1948 1948 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1949 1949
1950 1950 refstr_t *
1951 1951 vfs_getmntpoint(const struct vfs *vfsp)
1952 1952 {
1953 1953 refstr_t *mntpt;
1954 1954
1955 1955 vfs_list_read_lock();
1956 1956 mntpt = vfsp->vfs_mntpt;
1957 1957 refstr_hold(mntpt);
1958 1958 vfs_list_unlock();
1959 1959
1960 1960 return (mntpt);
1961 1961 }
1962 1962
1963 1963 /*
1964 1964 * Create an empty options table with enough empty slots to hold all
1965 1965 * the options in the options string passed as an argument.
1966 1966 * Potentially prepend another options table.
1967 1967 *
1968 1968 * Note: caller is responsible for locking the vfs list, if needed,
1969 1969 * to protect mops.
1970 1970 */
1971 1971 static void
1972 1972 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1973 1973 const mntopts_t *mtmpl)
1974 1974 {
1975 1975 const char *s = opts;
1976 1976 uint_t count;
1977 1977
1978 1978 if (opts == NULL || *opts == '\0') {
1979 1979 count = 0;
1980 1980 } else {
1981 1981 count = 1;
1982 1982
1983 1983 /*
1984 1984 * Count number of options in the string
1985 1985 */
1986 1986 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1987 1987 count++;
1988 1988 s++;
1989 1989 }
1990 1990 }
1991 1991 vfs_copyopttbl_extend(mtmpl, mops, count);
1992 1992 }
1993 1993
1994 1994 /*
1995 1995 * Create an empty options table with enough empty slots to hold all
1996 1996 * the options in the options string passed as an argument.
1997 1997 *
1998 1998 * This function is *not* for general use by filesystems.
1999 1999 *
2000 2000 * Note: caller is responsible for locking the vfs list, if needed,
2001 2001 * to protect mops.
2002 2002 */
2003 2003 void
2004 2004 vfs_createopttbl(mntopts_t *mops, const char *opts)
2005 2005 {
2006 2006 vfs_createopttbl_extend(mops, opts, NULL);
2007 2007 }
2008 2008
2009 2009
2010 2010 /*
2011 2011 * Swap two mount options tables
2012 2012 */
2013 2013 static void
2014 2014 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
2015 2015 {
2016 2016 uint_t tmpcnt;
2017 2017 mntopt_t *tmplist;
2018 2018
2019 2019 tmpcnt = optbl2->mo_count;
2020 2020 tmplist = optbl2->mo_list;
2021 2021 optbl2->mo_count = optbl1->mo_count;
2022 2022 optbl2->mo_list = optbl1->mo_list;
2023 2023 optbl1->mo_count = tmpcnt;
2024 2024 optbl1->mo_list = tmplist;
2025 2025 }
2026 2026
2027 2027 static void
2028 2028 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
2029 2029 {
2030 2030 vfs_list_lock();
2031 2031 vfs_swapopttbl_nolock(optbl1, optbl2);
2032 2032 vfs_mnttab_modtimeupd();
2033 2033 vfs_list_unlock();
2034 2034 }
2035 2035
2036 2036 static char **
2037 2037 vfs_copycancelopt_extend(char **const moc, int extend)
2038 2038 {
2039 2039 int i = 0;
2040 2040 int j;
2041 2041 char **result;
2042 2042
2043 2043 if (moc != NULL) {
2044 2044 for (; moc[i] != NULL; i++)
2045 2045 /* count number of options to cancel */;
2046 2046 }
2047 2047
2048 2048 if (i + extend == 0)
2049 2049 return (NULL);
2050 2050
2051 2051 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
2052 2052
2053 2053 for (j = 0; j < i; j++) {
2054 2054 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
2055 2055 (void) strcpy(result[j], moc[j]);
2056 2056 }
2057 2057 for (; j <= i + extend; j++)
2058 2058 result[j] = NULL;
2059 2059
2060 2060 return (result);
2061 2061 }
2062 2062
2063 2063 static void
2064 2064 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
2065 2065 {
2066 2066 char *sp, *dp;
2067 2067
2068 2068 d->mo_flags = s->mo_flags;
2069 2069 d->mo_data = s->mo_data;
2070 2070 sp = s->mo_name;
2071 2071 if (sp != NULL) {
2072 2072 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2073 2073 (void) strcpy(dp, sp);
2074 2074 d->mo_name = dp;
2075 2075 } else {
2076 2076 d->mo_name = NULL; /* should never happen */
2077 2077 }
2078 2078
2079 2079 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
2080 2080
2081 2081 sp = s->mo_arg;
2082 2082 if (sp != NULL) {
2083 2083 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2084 2084 (void) strcpy(dp, sp);
2085 2085 d->mo_arg = dp;
2086 2086 } else {
2087 2087 d->mo_arg = NULL;
2088 2088 }
2089 2089 }
2090 2090
2091 2091 /*
2092 2092 * Copy a mount options table, possibly allocating some spare
2093 2093 * slots at the end. It is permissible to copy_extend the NULL table.
2094 2094 */
2095 2095 static void
2096 2096 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
2097 2097 {
2098 2098 uint_t i, count;
2099 2099 mntopt_t *motbl;
2100 2100
2101 2101 /*
2102 2102 * Clear out any existing stuff in the options table being initialized
2103 2103 */
2104 2104 vfs_freeopttbl(dmo);
2105 2105 count = (smo == NULL) ? 0 : smo->mo_count;
2106 2106 if ((count + extra) == 0) /* nothing to do */
2107 2107 return;
2108 2108 dmo->mo_count = count + extra;
2109 2109 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
2110 2110 dmo->mo_list = motbl;
2111 2111 for (i = 0; i < count; i++) {
2112 2112 vfs_copyopt(&smo->mo_list[i], &motbl[i]);
2113 2113 }
2114 2114 for (i = count; i < count + extra; i++) {
2115 2115 motbl[i].mo_flags = MO_EMPTY;
2116 2116 }
2117 2117 }
2118 2118
2119 2119 /*
2120 2120 * Copy a mount options table.
2121 2121 *
2122 2122 * This function is *not* for general use by filesystems.
2123 2123 *
2124 2124 * Note: caller is responsible for locking the vfs list, if needed,
2125 2125 * to protect smo and dmo.
2126 2126 */
2127 2127 void
2128 2128 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
2129 2129 {
2130 2130 vfs_copyopttbl_extend(smo, dmo, 0);
2131 2131 }
2132 2132
2133 2133 static char **
2134 2134 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2135 2135 {
2136 2136 int c1 = 0;
2137 2137 int c2 = 0;
2138 2138 char **result;
2139 2139 char **sp1, **sp2, **dp;
2140 2140
2141 2141 /*
2142 2142 * First we count both lists of cancel options.
2143 2143 * If either is NULL or has no elements, we return a copy of
2144 2144 * the other.
2145 2145 */
2146 2146 if (mop1->mo_cancel != NULL) {
2147 2147 for (; mop1->mo_cancel[c1] != NULL; c1++)
2148 2148 /* count cancel options in mop1 */;
2149 2149 }
2150 2150
2151 2151 if (c1 == 0)
2152 2152 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2153 2153
2154 2154 if (mop2->mo_cancel != NULL) {
2155 2155 for (; mop2->mo_cancel[c2] != NULL; c2++)
2156 2156 /* count cancel options in mop2 */;
2157 2157 }
2158 2158
2159 2159 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2160 2160
2161 2161 if (c2 == 0)
2162 2162 return (result);
2163 2163
2164 2164 /*
2165 2165 * When we get here, we've got two sets of cancel options;
2166 2166 * we need to merge the two sets. We know that the result
2167 2167 * array has "c1+c2+1" entries and in the end we might shrink
2168 2168 * it.
2169 2169 * Result now has a copy of the c1 entries from mop1; we'll
2170 2170 * now lookup all the entries of mop2 in mop1 and copy it if
2171 2171 * it is unique.
2172 2172 * This operation is O(n^2) but it's only called once per
2173 2173 * filesystem per duplicate option. This is a situation
2174 2174 * which doesn't arise with the filesystems in ON and
2175 2175 * n is generally 1.
2176 2176 */
2177 2177
2178 2178 dp = &result[c1];
2179 2179 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2180 2180 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2181 2181 if (strcmp(*sp1, *sp2) == 0)
2182 2182 break;
2183 2183 }
2184 2184 if (*sp1 == NULL) {
2185 2185 /*
2186 2186 * Option *sp2 not found in mop1, so copy it.
2187 2187 * The calls to vfs_copycancelopt_extend()
2188 2188 * guarantee that there's enough room.
2189 2189 */
2190 2190 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2191 2191 (void) strcpy(*dp++, *sp2);
2192 2192 }
2193 2193 }
2194 2194 if (dp != &result[c1+c2]) {
2195 2195 size_t bytes = (dp - result + 1) * sizeof (char *);
2196 2196 char **nres = kmem_alloc(bytes, KM_SLEEP);
2197 2197
2198 2198 bcopy(result, nres, bytes);
2199 2199 kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2200 2200 result = nres;
2201 2201 }
2202 2202 return (result);
2203 2203 }
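
The merge above can be pictured with an ordinary user-space routine over NULL-terminated string vectors. The following sketch (not part of vfs.c; merge_unique is a hypothetical helper) copies the first vector and appends only those entries of the second that are not already present, allocating for the worst case rather than shrinking afterwards:

#include <stdlib.h>
#include <string.h>

static char **
merge_unique(char **a, char **b)
{
        size_t c1 = 0, c2 = 0, n, i;
        char **res, **sp;

        while (a != NULL && a[c1] != NULL)
                c1++;
        while (b != NULL && b[c2] != NULL)
                c2++;

        /* Allocate for the worst case: every entry of b is unique. */
        res = calloc(c1 + c2 + 1, sizeof (char *));
        if (res == NULL)
                return (NULL);
        for (n = 0; n < c1; n++)
                res[n] = strdup(a[n]);
        for (i = 0; i < c2; i++) {
                for (sp = a; sp != NULL && *sp != NULL; sp++)
                        if (strcmp(*sp, b[i]) == 0)
                                break;
                if (sp == NULL || *sp == NULL)  /* b[i] not found in a */
                        res[n++] = strdup(b[i]);
        }
        return (res);  /* NULL-terminated; caller frees entries and vector */
}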
2204 2204
2205 2205 /*
2206 2206 * Merge two mount option tables (outer and inner) into one. This is very
2207 2207 * similar to "merging" global variables and automatic variables in C.
2208 2208 *
2209 2209 * This isn't (and doesn't have to be) fast.
2210 2210 *
2211 2211 * This function is *not* for general use by filesystems.
2212 2212 *
2213 2213 * Note: caller is responsible for locking the vfs list, if needed,
2214 2214 * to protect omo, imo & dmo.
2215 2215 */
2216 2216 void
2217 2217 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2218 2218 {
2219 2219 uint_t i, count;
2220 2220 mntopt_t *mop, *motbl;
2221 2221 uint_t freeidx;
2222 2222
2223 2223 /*
2224 2224 * First determine how much space we need to allocate.
2225 2225 */
2226 2226 count = omo->mo_count;
2227 2227 for (i = 0; i < imo->mo_count; i++) {
2228 2228 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2229 2229 continue;
2230 2230 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2231 2231 count++;
2232 2232 }
2233 2233 ASSERT(count >= omo->mo_count &&
2234 2234 count <= omo->mo_count + imo->mo_count);
2235 2235 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2236 2236 for (i = 0; i < omo->mo_count; i++)
2237 2237 vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2238 2238 freeidx = omo->mo_count;
2239 2239 for (i = 0; i < imo->mo_count; i++) {
2240 2240 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2241 2241 continue;
2242 2242 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2243 2243 char **newcanp;
2244 2244 uint_t index = mop - omo->mo_list;
2245 2245
2246 2246 newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2247 2247
2248 2248 vfs_freeopt(&motbl[index]);
2249 2249 vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2250 2250
2251 2251 vfs_freecancelopt(motbl[index].mo_cancel);
2252 2252 motbl[index].mo_cancel = newcanp;
2253 2253 } else {
2254 2254 /*
2255 2255 * If it's a new option, just copy it over to the first
2256 2256 * free location.
2257 2257 */
2258 2258 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2259 2259 }
2260 2260 }
2261 2261 dmo->mo_count = count;
2262 2262 dmo->mo_list = motbl;
2263 2263 }
2264 2264
2265 2265 /*
2266 2266 * Functions to set and clear mount options in a mount options table.
2267 2267 */
2268 2268
2269 2269 /*
2270 2270 * Clear a mount option, if it exists.
2271 2271 *
2272 2272 * The update_mnttab arg indicates whether mops is part of a vfs that is on
2273 2273 * the vfs list.
2274 2274 */
2275 2275 static void
2276 2276 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2277 2277 {
2278 2278 struct mntopt *mop;
2279 2279 uint_t i, count;
2280 2280
2281 2281 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2282 2282
2283 2283 count = mops->mo_count;
2284 2284 for (i = 0; i < count; i++) {
2285 2285 mop = &mops->mo_list[i];
2286 2286
2287 2287 if (mop->mo_flags & MO_EMPTY)
2288 2288 continue;
2289 2289 if (strcmp(opt, mop->mo_name))
2290 2290 continue;
2291 2291 mop->mo_flags &= ~MO_SET;
2292 2292 if (mop->mo_arg != NULL) {
2293 2293 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2294 2294 }
2295 2295 mop->mo_arg = NULL;
2296 2296 if (update_mnttab)
2297 2297 vfs_mnttab_modtimeupd();
2298 2298 break;
2299 2299 }
2300 2300 }
2301 2301
2302 2302 void
2303 2303 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2304 2304 {
2305 2305 int gotlock = 0;
2306 2306
2307 2307 if (VFS_ON_LIST(vfsp)) {
2308 2308 gotlock = 1;
2309 2309 vfs_list_lock();
2310 2310 }
2311 2311 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2312 2312 if (gotlock)
2313 2313 vfs_list_unlock();
2314 2314 }
2315 2315
2316 2316
2317 2317 /*
2318 2318 * Set a mount option on. If it's not found in the table, it's silently
2319 2319 * ignored. If the option has MO_IGNORE set, it is still set unless the
2320 2320 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag
2321 2321 * bits can be used to toggle the MO_NODISPLAY bit for the option.
2322 2322 * If the VFS_CREATEOPT flag bit is set then the first option slot with
2323 2323 * MO_EMPTY set is created as the option passed in.
2324 2324 *
2325 2325 * The update_mnttab arg indicates whether mops is part of a vfs that is on
2326 2326 * the vfs list.
2327 2327 */
2328 2328 static void
2329 2329 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2330 2330 const char *arg, int flags, int update_mnttab)
2331 2331 {
2332 2332 mntopt_t *mop;
2333 2333 uint_t i, count;
2334 2334 char *sp;
2335 2335
2336 2336 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2337 2337
2338 2338 if (flags & VFS_CREATEOPT) {
2339 2339 if (vfs_hasopt(mops, opt) != NULL) {
2340 2340 flags &= ~VFS_CREATEOPT;
2341 2341 }
2342 2342 }
2343 2343 count = mops->mo_count;
2344 2344 for (i = 0; i < count; i++) {
2345 2345 mop = &mops->mo_list[i];
2346 2346
2347 2347 if (mop->mo_flags & MO_EMPTY) {
2348 2348 if ((flags & VFS_CREATEOPT) == 0)
2349 2349 continue;
2350 2350 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2351 2351 (void) strcpy(sp, opt);
2352 2352 mop->mo_name = sp;
2353 2353 if (arg != NULL)
2354 2354 mop->mo_flags = MO_HASVALUE;
2355 2355 else
2356 2356 mop->mo_flags = 0;
2357 2357 } else if (strcmp(opt, mop->mo_name)) {
2358 2358 continue;
2359 2359 }
2360 2360 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2361 2361 break;
2362 2362 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2363 2363 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2364 2364 (void) strcpy(sp, arg);
2365 2365 } else {
2366 2366 sp = NULL;
2367 2367 }
2368 2368 if (mop->mo_arg != NULL)
2369 2369 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2370 2370 mop->mo_arg = sp;
2371 2371 if (flags & VFS_DISPLAY)
2372 2372 mop->mo_flags &= ~MO_NODISPLAY;
2373 2373 if (flags & VFS_NODISPLAY)
2374 2374 mop->mo_flags |= MO_NODISPLAY;
2375 2375 mop->mo_flags |= MO_SET;
2376 2376 if (mop->mo_cancel != NULL) {
2377 2377 char **cp;
2378 2378
2379 2379 for (cp = mop->mo_cancel; *cp != NULL; cp++)
2380 2380 vfs_clearmntopt_nolock(mops, *cp, 0);
2381 2381 }
2382 2382 if (update_mnttab)
2383 2383 vfs_mnttab_modtimeupd();
2384 2384 break;
2385 2385 }
2386 2386 }
2387 2387
2388 2388 void
2389 2389 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2390 2390 {
2391 2391 int gotlock = 0;
2392 2392
2393 2393 if (VFS_ON_LIST(vfsp)) {
2394 2394 gotlock = 1;
2395 2395 vfs_list_lock();
2396 2396 }
2397 2397 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2398 2398 if (gotlock)
2399 2399 vfs_list_unlock();
2400 2400 }
2401 2401
2402 2402
2403 2403 /*
2404 2404 * Add a "tag" option to a mounted file system's options list.
2405 2405 *
2406 2406 * Note: caller is responsible for locking the vfs list, if needed,
2407 2407 * to protect mops.
2408 2408 */
2409 2409 static mntopt_t *
2410 2410 vfs_addtag(mntopts_t *mops, const char *tag)
2411 2411 {
2412 2412 uint_t count;
2413 2413 mntopt_t *mop, *motbl;
2414 2414
2415 2415 count = mops->mo_count + 1;
2416 2416 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2417 2417 if (mops->mo_count) {
2418 2418 size_t len = (count - 1) * sizeof (mntopt_t);
2419 2419
2420 2420 bcopy(mops->mo_list, motbl, len);
2421 2421 kmem_free(mops->mo_list, len);
2422 2422 }
2423 2423 mops->mo_count = count;
2424 2424 mops->mo_list = motbl;
2425 2425 mop = &motbl[count - 1];
2426 2426 mop->mo_flags = MO_TAG;
2427 2427 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2428 2428 (void) strcpy(mop->mo_name, tag);
2429 2429 return (mop);
2430 2430 }
2431 2431
2432 2432 /*
2433 2433 * Allow users to set arbitrary "tags" in a vfs's mount options.
2434 2434 * Broader use within the kernel is discouraged.
2435 2435 */
2436 2436 int
2437 2437 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2438 2438 cred_t *cr)
2439 2439 {
2440 2440 vfs_t *vfsp;
2441 2441 mntopts_t *mops;
2442 2442 mntopt_t *mop;
2443 2443 int found = 0;
2444 2444 dev_t dev = makedevice(major, minor);
2445 2445 int err = 0;
2446 2446 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2447 2447
2448 2448 /*
2449 2449 * Find the desired mounted file system
2450 2450 */
2451 2451 vfs_list_lock();
2452 2452 vfsp = rootvfs;
2453 2453 do {
2454 2454 if (vfsp->vfs_dev == dev &&
2455 2455 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2456 2456 found = 1;
2457 2457 break;
2458 2458 }
2459 2459 vfsp = vfsp->vfs_next;
2460 2460 } while (vfsp != rootvfs);
2461 2461
2462 2462 if (!found) {
2463 2463 err = EINVAL;
2464 2464 goto out;
2465 2465 }
2466 2466 err = secpolicy_fs_config(cr, vfsp);
2467 2467 if (err != 0)
2468 2468 goto out;
2469 2469
2470 2470 mops = &vfsp->vfs_mntopts;
2471 2471 /*
2472 2472 * Add tag if it doesn't already exist
2473 2473 */
2474 2474 if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2475 2475 int len;
2476 2476
2477 2477 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2478 2478 len = strlen(buf);
2479 2479 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2480 2480 err = ENAMETOOLONG;
2481 2481 goto out;
2482 2482 }
2483 2483 mop = vfs_addtag(mops, tag);
2484 2484 }
2485 2485 if ((mop->mo_flags & MO_TAG) == 0) {
2486 2486 err = EINVAL;
2487 2487 goto out;
2488 2488 }
2489 2489 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2490 2490 out:
2491 2491 vfs_list_unlock();
2492 2492 kmem_free(buf, MAX_MNTOPT_STR);
2493 2493 return (err);
2494 2494 }
2495 2495
2496 2496 /*
2497 2497 * Allow users to remove arbitrary "tags" in a vfs's mount options.
2498 2498 * Broader use within the kernel is discouraged.
2499 2499 */
2500 2500 int
2501 2501 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2502 2502 cred_t *cr)
2503 2503 {
2504 2504 vfs_t *vfsp;
2505 2505 mntopt_t *mop;
2506 2506 int found = 0;
2507 2507 dev_t dev = makedevice(major, minor);
2508 2508 int err = 0;
2509 2509
2510 2510 /*
2511 2511 * Find the desired mounted file system
2512 2512 */
2513 2513 vfs_list_lock();
2514 2514 vfsp = rootvfs;
2515 2515 do {
2516 2516 if (vfsp->vfs_dev == dev &&
2517 2517 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2518 2518 found = 1;
2519 2519 break;
2520 2520 }
2521 2521 vfsp = vfsp->vfs_next;
2522 2522 } while (vfsp != rootvfs);
2523 2523
2524 2524 if (!found) {
2525 2525 err = EINVAL;
2526 2526 goto out;
2527 2527 }
2528 2528 err = secpolicy_fs_config(cr, vfsp);
2529 2529 if (err != 0)
2530 2530 goto out;
2531 2531
2532 2532 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2533 2533 err = EINVAL;
2534 2534 goto out;
2535 2535 }
2536 2536 if ((mop->mo_flags & MO_TAG) == 0) {
2537 2537 err = EINVAL;
2538 2538 goto out;
2539 2539 }
2540 2540 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2541 2541 out:
2542 2542 vfs_list_unlock();
2543 2543 return (err);
2544 2544 }
2545 2545
2546 2546 /*
2547 2547 * Function to parse an option string and fill in a mount options table.
2548 2548 * Unknown options are silently ignored. The input option string is modified
2549 2549 * by replacing separators with nulls. If the create flag is set, options
2550 2550 * not found in the table are just added on the fly. The table must have
2551 2551 * an option slot marked MO_EMPTY to add an option on the fly.
2552 2552 *
2553 2553 * This function is *not* for general use by filesystems.
2554 2554 *
2555 2555 * Note: caller is responsible for locking the vfs list, if needed,
2556 2556 * to protect mops.
2557 2557 */
2558 2558 void
2559 2559 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2560 2560 {
2561 2561 char *s = osp, *p, *nextop, *valp, *cp, *ep;
2562 2562 int setflg = VFS_NOFORCEOPT;
2563 2563
2564 2564 if (osp == NULL)
2565 2565 return;
2566 2566 while (*s != '\0') {
2567 2567 p = strchr(s, ','); /* find next option */
2568 2568 if (p == NULL) {
2569 2569 cp = NULL;
2570 2570 p = s + strlen(s);
2571 2571 } else {
2572 2572 cp = p; /* save location of comma */
2573 2573 *p++ = '\0'; /* mark end and point to next option */
2574 2574 }
2575 2575 nextop = p;
2576 2576 p = strchr(s, '='); /* look for value */
2577 2577 if (p == NULL) {
2578 2578 valp = NULL; /* no value supplied */
2579 2579 } else {
2580 2580 ep = p; /* save location of equals */
2581 2581 *p++ = '\0'; /* end option and point to value */
2582 2582 valp = p;
2583 2583 }
2584 2584 /*
2585 2585 * set option into options table
2586 2586 */
2587 2587 if (create)
2588 2588 setflg |= VFS_CREATEOPT;
2589 2589 vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2590 2590 if (cp != NULL)
2591 2591 *cp = ','; /* restore the comma */
2592 2592 if (valp != NULL)
2593 2593 *ep = '='; /* restore the equals */
2594 2594 s = nextop;
2595 2595 }
2596 2596 }
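
A stand-alone sketch of the same in-place technique (parse_opts is a hypothetical helper, not part of vfs.c) shows how the separators are overwritten for the duration of each per-option hand-off and then restored, leaving the caller's buffer unchanged:

#include <stdio.h>
#include <string.h>

static void
parse_opts(char *osp)
{
        char *s = osp, *next, *valp, *cp, *ep;

        while (*s != '\0') {
                cp = strchr(s, ',');            /* end of this option */
                if (cp != NULL)
                        *cp = '\0';
                ep = strchr(s, '=');            /* optional value */
                if (ep != NULL) {
                        *ep = '\0';
                        valp = ep + 1;
                } else {
                        valp = NULL;
                }

                (void) printf("opt=%s val=%s\n", s,
                    valp == NULL ? "(none)" : valp);

                /* Restore the separators so the caller's buffer survives. */
                if (ep != NULL)
                        *ep = '=';
                next = (cp == NULL) ? s + strlen(s) : cp + 1;
                if (cp != NULL)
                        *cp = ',';
                s = next;
        }
}

int
main(void)
{
        char buf[] = "ro,logging,quota=on";

        parse_opts(buf);
        (void) printf("restored: %s\n", buf);   /* prints the original string */
        return (0);
}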
2597 2597
2598 2598 /*
2599 2599 * Function to inquire if an option exists in a mount options table.
2600 2600 * Returns a pointer to the option if it exists, else NULL.
2601 2601 *
2602 2602 * This function is *not* for general use by filesystems.
2603 2603 *
2604 2604 * Note: caller is responsible for locking the vfs list, if needed,
2605 2605 * to protect mops.
2606 2606 */
2607 2607 struct mntopt *
2608 2608 vfs_hasopt(const mntopts_t *mops, const char *opt)
2609 2609 {
2610 2610 struct mntopt *mop;
2611 2611 uint_t i, count;
2612 2612
2613 2613 count = mops->mo_count;
2614 2614 for (i = 0; i < count; i++) {
2615 2615 mop = &mops->mo_list[i];
2616 2616
2617 2617 if (mop->mo_flags & MO_EMPTY)
2618 2618 continue;
2619 2619 if (strcmp(opt, mop->mo_name) == 0)
2620 2620 return (mop);
2621 2621 }
2622 2622 return (NULL);
2623 2623 }
2624 2624
2625 2625 /*
2626 2626 * Function to inquire if an option is set in a mount options table.
2627 2627 * Returns non-zero if set and fills in the arg pointer with a pointer to
2628 2628 * the argument string or NULL if there is no argument string.
2629 2629 */
2630 2630 static int
2631 2631 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2632 2632 {
2633 2633 struct mntopt *mop;
2634 2634 uint_t i, count;
2635 2635
2636 2636 count = mops->mo_count;
2637 2637 for (i = 0; i < count; i++) {
2638 2638 mop = &mops->mo_list[i];
2639 2639
2640 2640 if (mop->mo_flags & MO_EMPTY)
2641 2641 continue;
2642 2642 if (strcmp(opt, mop->mo_name))
2643 2643 continue;
2644 2644 if ((mop->mo_flags & MO_SET) == 0)
2645 2645 return (0);
2646 2646 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2647 2647 *argp = mop->mo_arg;
2648 2648 return (1);
2649 2649 }
2650 2650 return (0);
2651 2651 }
2652 2652
2653 2653
2654 2654 int
2655 2655 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2656 2656 {
2657 2657 int ret;
2658 2658
2659 2659 vfs_list_read_lock();
2660 2660 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2661 2661 vfs_list_unlock();
2662 2662 return (ret);
2663 2663 }
2664 2664
2665 2665
2666 2666 /*
2667 2667 * Construct a comma separated string of the options set in the given
2668 2668 * mount table, return the string in the given buffer. Return non-zero if
2669 2669 * the buffer would overflow.
2670 2670 *
2671 2671 * This function is *not* for general use by filesystems.
2672 2672 *
2673 2673 * Note: caller is responsible for locking the vfs list, if needed,
2674 2674 * to protect mp.
2675 2675 */
2676 2676 int
2677 2677 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2678 2678 {
2679 2679 char *cp;
2680 2680 uint_t i;
2681 2681
2682 2682 buf[0] = '\0';
2683 2683 cp = buf;
2684 2684 for (i = 0; i < mp->mo_count; i++) {
2685 2685 struct mntopt *mop;
2686 2686
2687 2687 mop = &mp->mo_list[i];
2688 2688 if (mop->mo_flags & MO_SET) {
2689 2689 int optlen, comma = 0;
2690 2690
2691 2691 if (buf[0] != '\0')
2692 2692 comma = 1;
2693 2693 optlen = strlen(mop->mo_name);
2694 2694 if (strlen(buf) + comma + optlen + 1 > len)
2695 2695 goto err;
2696 2696 if (comma)
2697 2697 *cp++ = ',';
2698 2698 (void) strcpy(cp, mop->mo_name);
2699 2699 cp += optlen;
2700 2700 /*
2701 2701 * Append option value if there is one
2702 2702 */
2703 2703 if (mop->mo_arg != NULL) {
2704 2704 int arglen;
2705 2705
2706 2706 arglen = strlen(mop->mo_arg);
2707 2707 if (strlen(buf) + arglen + 2 > len)
2708 2708 goto err;
2709 2709 *cp++ = '=';
2710 2710 (void) strcpy(cp, mop->mo_arg);
2711 2711 cp += arglen;
2712 2712 }
2713 2713 }
2714 2714 }
2715 2715 return (0);
2716 2716 err:
2717 2717 return (EOVERFLOW);
2718 2718 }
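
The overflow discipline used above (check that the name, the optional separator and the terminating NUL all fit before copying) can be illustrated with a small user-space sketch; join_names is a hypothetical helper, not part of vfs.c:

#include <errno.h>
#include <string.h>

static int
join_names(char *const *names, char *buf, size_t len)
{
        size_t used = 0;

        if (len == 0)
                return (EOVERFLOW);
        buf[0] = '\0';
        for (; *names != NULL; names++) {
                size_t need = strlen(*names) + (used ? 1 : 0);

                /* name, optional comma and terminating NUL must all fit */
                if (used + need + 1 > len)
                        return (EOVERFLOW);
                if (used)
                        buf[used++] = ',';
                (void) strcpy(buf + used, *names);
                used += strlen(*names);
        }
        return (0);
}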
2719 2719
2720 2720 static void
2721 2721 vfs_freecancelopt(char **moc)
2722 2722 {
2723 2723 if (moc != NULL) {
2724 2724 int ccnt = 0;
2725 2725 char **cp;
2726 2726
2727 2727 for (cp = moc; *cp != NULL; cp++) {
2728 2728 kmem_free(*cp, strlen(*cp) + 1);
2729 2729 ccnt++;
2730 2730 }
2731 2731 kmem_free(moc, (ccnt + 1) * sizeof (char *));
2732 2732 }
2733 2733 }
2734 2734
2735 2735 static void
2736 2736 vfs_freeopt(mntopt_t *mop)
2737 2737 {
2738 2738 if (mop->mo_name != NULL)
2739 2739 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2740 2740
2741 2741 vfs_freecancelopt(mop->mo_cancel);
2742 2742
2743 2743 if (mop->mo_arg != NULL)
2744 2744 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2745 2745 }
2746 2746
2747 2747 /*
2748 2748 * Free a mount options table
2749 2749 *
2750 2750 * This function is *not* for general use by filesystems.
2751 2751 *
2752 2752 * Note: caller is responsible for locking the vfs list, if needed,
2753 2753 * to protect mp.
2754 2754 */
2755 2755 void
2756 2756 vfs_freeopttbl(mntopts_t *mp)
2757 2757 {
2758 2758 uint_t i, count;
2759 2759
2760 2760 count = mp->mo_count;
2761 2761 for (i = 0; i < count; i++) {
2762 2762 vfs_freeopt(&mp->mo_list[i]);
2763 2763 }
2764 2764 if (count) {
2765 2765 kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2766 2766 mp->mo_count = 0;
2767 2767 mp->mo_list = NULL;
2768 2768 }
2769 2769 }
2770 2770
2771 2771
2772 2772 /* ARGSUSED */
2773 2773 static int
2774 2774 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2775 2775 caller_context_t *ct)
2776 2776 {
2777 2777 return (0);
2778 2778 }
2779 2779
2780 2780 /* ARGSUSED */
2781 2781 static int
2782 2782 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2783 2783 caller_context_t *ct)
2784 2784 {
2785 2785 return (0);
2786 2786 }
2787 2787
2788 2788 /*
2789 2789 * The dummy vnode is currently used only by file events notification
2790 2790 * module which is just interested in the timestamps.
2791 2791 */
2792 2792 /* ARGSUSED */
2793 2793 static int
2794 2794 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2795 2795 caller_context_t *ct)
2796 2796 {
2797 2797 bzero(vap, sizeof (vattr_t));
2798 2798 vap->va_type = VREG;
2799 2799 vap->va_nlink = 1;
2800 2800 vap->va_ctime = vfs_mnttab_ctime;
2801 2801 /*
2802 2802 * it is ok to just copy mtime as the time will be monotonically
2803 2803 * increasing.
2804 2804 */
2805 2805 vap->va_mtime = vfs_mnttab_mtime;
2806 2806 vap->va_atime = vap->va_mtime;
2807 2807 return (0);
2808 2808 }
2809 2809
2810 2810 static void
2811 2811 vfs_mnttabvp_setup(void)
2812 2812 {
2813 2813 vnode_t *tvp;
2814 2814 vnodeops_t *vfs_mntdummyvnops;
2815 2815 const fs_operation_def_t mnt_dummyvnodeops_template[] = {
2816 2816 VOPNAME_READ, { .vop_read = vfs_mntdummyread },
2817 2817 VOPNAME_WRITE, { .vop_write = vfs_mntdummywrite },
2818 2818 VOPNAME_GETATTR, { .vop_getattr = vfs_mntdummygetattr },
2819 2819 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
2820 2820 NULL, NULL
2821 2821 };
2822 2822
2823 2823 if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
2824 2824 &vfs_mntdummyvnops) != 0) {
2825 2825 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
2826 2826 /* Shouldn't happen, but not bad enough to panic */
2827 2827 return;
2828 2828 }
2829 2829
2830 2830 /*
2831 2831 * A global dummy vnode is allocated to represent mntfs files.
2832 2832 * The mntfs file (/etc/mnttab) can be monitored for file events
2833 2833 * and receive an event when mnttab changes. Dummy VOP calls
2834 2834 * will be made on this vnode. The file events notification module
2835 2835 * intercepts this vnode and delivers relevant events.
2836 2836 */
2837 2837 tvp = vn_alloc(KM_SLEEP);
2838 2838 tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2839 2839 vn_setops(tvp, vfs_mntdummyvnops);
2840 2840 tvp->v_type = VREG;
2841 2841 /*
2842 2842 * The mnt dummy ops do not reference v_data.
2843 2843 * No other module intercepting this vnode should either.
2844 2844 * Just set it to point to itself.
2845 2845 */
2846 2846 tvp->v_data = (caddr_t)tvp;
2847 2847 tvp->v_vfsp = rootvfs;
2848 2848 vfs_mntdummyvp = tvp;
2849 2849 }
2850 2850
2851 2851 /*
2852 2852 * performs fake read/write ops
2853 2853 */
2854 2854 static void
2855 2855 vfs_mnttab_rwop(int rw)
2856 2856 {
2857 2857 struct uio uio;
2858 2858 struct iovec iov;
2859 2859 char buf[1];
2860 2860
2861 2861 if (vfs_mntdummyvp == NULL)
2862 2862 return;
2863 2863
2864 2864 bzero(&uio, sizeof (uio));
2865 2865 bzero(&iov, sizeof (iov));
2866 2866 iov.iov_base = buf;
2867 2867 iov.iov_len = 0;
2868 2868 uio.uio_iov = &iov;
2869 2869 uio.uio_iovcnt = 1;
2870 2870 uio.uio_loffset = 0;
2871 2871 uio.uio_segflg = UIO_SYSSPACE;
2872 2872 uio.uio_resid = 0;
2873 2873 if (rw) {
2874 2874 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2875 2875 } else {
2876 2876 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2877 2877 }
2878 2878 }
2879 2879
2880 2880 /*
2881 2881 * Generate a write operation.
2882 2882 */
2883 2883 void
2884 2884 vfs_mnttab_writeop(void)
2885 2885 {
2886 2886 vfs_mnttab_rwop(1);
2887 2887 }
2888 2888
2889 2889 /*
2890 2890 * Generate a read operation.
2891 2891 */
2892 2892 void
2893 2893 vfs_mnttab_readop(void)
2894 2894 {
2895 2895 vfs_mnttab_rwop(0);
2896 2896 }
2897 2897
2898 2898 /*
2899 2899 * Free any mnttab information recorded in the vfs struct.
2900 2900 * The vfs must not be on the vfs list.
2901 2901 */
2902 2902 static void
2903 2903 vfs_freemnttab(struct vfs *vfsp)
2904 2904 {
2905 2905 ASSERT(!VFS_ON_LIST(vfsp));
2906 2906
2907 2907 /*
2908 2908 * Free device and mount point information
2909 2909 */
2910 2910 if (vfsp->vfs_mntpt != NULL) {
2911 2911 refstr_rele(vfsp->vfs_mntpt);
2912 2912 vfsp->vfs_mntpt = NULL;
2913 2913 }
2914 2914 if (vfsp->vfs_resource != NULL) {
2915 2915 refstr_rele(vfsp->vfs_resource);
2916 2916 vfsp->vfs_resource = NULL;
2917 2917 }
2918 2918 /*
2919 2919 * Now free mount options information
2920 2920 */
2921 2921 vfs_freeopttbl(&vfsp->vfs_mntopts);
2922 2922 }
2923 2923
2924 2924 /*
2925 2925 * Return the last mnttab modification time
2926 2926 */
2927 2927 void
2928 2928 vfs_mnttab_modtime(timespec_t *ts)
2929 2929 {
2930 2930 ASSERT(RW_LOCK_HELD(&vfslist));
2931 2931 *ts = vfs_mnttab_mtime;
2932 2932 }
2933 2933
2934 2934 /*
2935 2935 * See if mnttab is changed
2936 2936 */
2937 2937 void
2938 2938 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2939 2939 {
2940 2940 int changed;
2941 2941
2942 2942 *phpp = (struct pollhead *)NULL;
2943 2943
2944 2944 /*
2945 2945 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2946 2946 * Doing so can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2947 2947 * to not grab the vfs list lock because tv_sec is monotonically
2948 2948 * increasing.
2949 2949 */
2950 2950
2951 2951 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2952 2952 (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2953 2953 if (!changed) {
2954 2954 *phpp = &vfs_pollhd;
2955 2955 }
2956 2956 }
2957 2957
2958 2958 /* Provide a unique and monotonically-increasing timestamp. */
2959 2959 void
2960 2960 vfs_mono_time(timespec_t *ts)
2961 2961 {
2962 2962 static volatile hrtime_t hrt; /* The saved time. */
2963 2963 hrtime_t newhrt, oldhrt; /* For effecting the CAS. */
2964 2964 timespec_t newts;
2965 2965
2966 2966 /*
2967 2967 * Try gethrestime() first, but be prepared to fabricate a sensible
2968 2968 * answer at the first sign of any trouble.
2969 2969 */
2970 2970 gethrestime(&newts);
2971 2971 newhrt = ts2hrt(&newts);
2972 2972 for (;;) {
2973 2973 oldhrt = hrt;
2974 2974 if (newhrt <= hrt)
2975 2975 newhrt = hrt + 1;
2976 2976 if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
2977 2977 break;
2978 2978 }
2979 2979 hrt2ts(newhrt, ts);
2980 2980 }
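
The same pattern can be expressed in user space with C11 atomics. The sketch below (mono_unique_ns is a hypothetical helper, not part of vfs.c) retries the compare-and-swap until it publishes a value strictly greater than the last one handed out:

#include <stdatomic.h>
#include <stdint.h>
#include <time.h>

static _Atomic uint64_t last_ns;

static uint64_t
mono_unique_ns(void)
{
        struct timespec ts;
        uint64_t now, prev;

        (void) clock_gettime(CLOCK_REALTIME, &ts);
        now = (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;

        prev = atomic_load(&last_ns);
        for (;;) {
                uint64_t want = (now > prev) ? now : prev + 1;

                /* On failure, prev is reloaded with the current value. */
                if (atomic_compare_exchange_weak(&last_ns, &prev, want))
                        return (want);
        }
}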
2981 2981
2982 2982 /*
2983 2983 * Update the mnttab modification time and wake up any waiters for
2984 2984 * mnttab changes
2985 2985 */
2986 2986 void
2987 2987 vfs_mnttab_modtimeupd()
2988 2988 {
2989 2989 hrtime_t oldhrt, newhrt;
2990 2990
2991 2991 ASSERT(RW_WRITE_HELD(&vfslist));
2992 2992 oldhrt = ts2hrt(&vfs_mnttab_mtime);
2993 2993 gethrestime(&vfs_mnttab_mtime);
2994 2994 newhrt = ts2hrt(&vfs_mnttab_mtime);
2995 2995 if (oldhrt == (hrtime_t)0)
2996 2996 vfs_mnttab_ctime = vfs_mnttab_mtime;
2997 2997 /*
2998 2998 * Attempt to provide unique mtime (like uniqtime but not).
2999 2999 */
3000 3000 if (newhrt == oldhrt) {
3001 3001 newhrt++;
3002 3002 hrt2ts(newhrt, &vfs_mnttab_mtime);
3003 3003 }
3004 3004 pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
3005 3005 vfs_mnttab_writeop();
3006 3006 }
3007 3007
3008 3008 int
3009 3009 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
3010 3010 {
3011 3011 vnode_t *coveredvp;
3012 3012 int error;
3013 3013 extern void teardown_vopstats(vfs_t *);
3014 3014
3015 3015 /*
3016 3016 * Get covered vnode. This will be NULL if the vfs is not linked
3017 3017 * into the file system name space (i.e., domount() with MNT_NOSPICE).
3018 3018 * into the file system name space (i.e., domount() with MS_NOSPLICE).
3019 3019 coveredvp = vfsp->vfs_vnodecovered;
3020 3020 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
3021 3021
3022 3022 /*
3023 3023 * Purge all dnlc entries for this vfs.
3024 3024 */
3025 3025 (void) dnlc_purge_vfsp(vfsp, 0);
3026 3026
3027 3027 /* For forcible umount, skip VFS_SYNC() since it may hang */
3028 3028 if ((flag & MS_FORCE) == 0)
3029 3029 (void) VFS_SYNC(vfsp, 0, cr);
3030 3030
3031 3031 /*
3032 3032 * Lock the vfs to maintain fs status quo during unmount. This
3033 3033 * has to be done after the sync because ufs_update tries to acquire
3034 3034 * the vfs_reflock.
3035 3035 */
3036 3036 vfs_lock_wait(vfsp);
3037 3037
3038 3038 if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
3039 3039 vfs_unlock(vfsp);
3040 3040 if (coveredvp != NULL)
3041 3041 vn_vfsunlock(coveredvp);
3042 3042 } else if (coveredvp != NULL) {
3043 3043 teardown_vopstats(vfsp);
3044 3044 /*
3045 3045 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
3046 3046 * when it frees vfsp so we do a VN_HOLD() so we can
3047 3047 * continue to use coveredvp afterwards.
3048 3048 */
3049 3049 VN_HOLD(coveredvp);
3050 3050 vfs_remove(vfsp);
3051 3051 vn_vfsunlock(coveredvp);
3052 3052 VN_RELE(coveredvp);
3053 3053 } else {
3054 3054 teardown_vopstats(vfsp);
3055 3055 /*
3056 3056 * Release the reference to vfs that is not linked
3057 3057 * into the name space.
3058 3058 */
3059 3059 vfs_unlock(vfsp);
3060 3060 VFS_RELE(vfsp);
3061 3061 }
3062 3062 return (error);
3063 3063 }
3064 3064
3065 3065
3066 3066 /*
3067 3067 * Vfs_unmountall() is called by uadmin() to unmount all
3068 3068 * mounted file systems (except the root file system) during shutdown.
3069 3069 * It follows the existing locking protocol when traversing the vfs list
3070 3070 * to sync and unmount vfses. Even though there should be no
3071 3071 * other thread running while the system is shutting down, it is prudent
3072 3072 * to still follow the locking protocol.
3073 3073 */
3074 3074 void
3075 3075 vfs_unmountall(void)
3076 3076 {
3077 3077 struct vfs *vfsp;
3078 3078 struct vfs *prev_vfsp = NULL;
3079 3079 int error;
3080 3080
3081 3081 /*
3082 3082 * Toss all dnlc entries now so that the per-vfs sync
3083 3083 * and unmount operations don't have to slog through
3084 3084 * a bunch of uninteresting vnodes over and over again.
3085 3085 */
3086 3086 dnlc_purge();
3087 3087
3088 3088 vfs_list_lock();
3089 3089 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
3090 3090 prev_vfsp = vfsp->vfs_prev;
3091 3091
3092 3092 if (vfs_lock(vfsp) != 0)
3093 3093 continue;
3094 3094 error = vn_vfswlock(vfsp->vfs_vnodecovered);
3095 3095 vfs_unlock(vfsp);
3096 3096 if (error)
3097 3097 continue;
3098 3098
3099 3099 vfs_list_unlock();
3100 3100
3101 3101 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
3102 3102 (void) dounmount(vfsp, 0, CRED());
3103 3103
3104 3104 /*
3105 3105 * Since we dropped the vfslist lock above we must
3106 3106 * verify that prev_vfsp still exists, else start over.
3107 3107 */
3108 3108 vfs_list_lock();
3109 3109 for (vfsp = rootvfs->vfs_prev;
3110 3110 vfsp != rootvfs; vfsp = vfsp->vfs_prev)
3111 3111 if (vfsp == prev_vfsp)
3112 3112 break;
3113 3113 if (vfsp == rootvfs && prev_vfsp != rootvfs)
3114 3114 prev_vfsp = rootvfs->vfs_prev;
3115 3115 }
3116 3116 vfs_list_unlock();
3117 3117 }
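
The traversal discipline above, dropping the list lock for the slow per-entry work and then re-validating the saved predecessor before continuing, is sketched below for an ordinary circular doubly linked list guarded by a pthread mutex (hypothetical names, not part of vfs.c):

#include <pthread.h>

struct node {
        struct node *next;
        struct node *prev;
};

static struct node head = { &head, &head };     /* sentinel */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void
for_each_unlocked(void (*fn)(struct node *))
{
        struct node *np, *prev_np;

        (void) pthread_mutex_lock(&list_lock);
        for (np = head.prev; np != &head; np = prev_np) {
                prev_np = np->prev;
                (void) pthread_mutex_unlock(&list_lock);

                fn(np);         /* slow work with the list lock dropped */

                /* Re-validate prev_np: it may have been unlinked meanwhile. */
                (void) pthread_mutex_lock(&list_lock);
                for (np = head.prev; np != &head; np = np->prev)
                        if (np == prev_np)
                                break;
                if (np == &head && prev_np != &head)
                        prev_np = head.prev;    /* start over from the tail */
        }
        (void) pthread_mutex_unlock(&list_lock);
}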
3118 3118
3119 3119 /*
3120 3120 * Called to add an entry to the end of the vfs mount in progress list
3121 3121 */
3122 3122 void
3123 3123 vfs_addmip(dev_t dev, struct vfs *vfsp)
3124 3124 {
3125 3125 struct ipmnt *mipp;
3126 3126
3127 3127 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3128 3128 mipp->mip_next = NULL;
3129 3129 mipp->mip_dev = dev;
3130 3130 mipp->mip_vfsp = vfsp;
3131 3131 mutex_enter(&vfs_miplist_mutex);
3132 3132 if (vfs_miplist_end != NULL)
3133 3133 vfs_miplist_end->mip_next = mipp;
3134 3134 else
3135 3135 vfs_miplist = mipp;
3136 3136 vfs_miplist_end = mipp;
3137 3137 mutex_exit(&vfs_miplist_mutex);
3138 3138 }
3139 3139
3140 3140 /*
3141 3141 * Called to remove an entry from the mount in progress list
3142 3142 * Either because the mount completed or it failed.
3143 3143 */
3144 3144 void
3145 3145 vfs_delmip(struct vfs *vfsp)
3146 3146 {
3147 3147 struct ipmnt *mipp, *mipprev;
3148 3148
3149 3149 mutex_enter(&vfs_miplist_mutex);
3150 3150 mipprev = NULL;
3151 3151 for (mipp = vfs_miplist;
3152 3152 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3153 3153 mipprev = mipp;
3154 3154 }
3155 3155 if (mipp == NULL)
3156 3156 return; /* shouldn't happen */
3157 3157 if (mipp == vfs_miplist_end)
3158 3158 vfs_miplist_end = mipprev;
3159 3159 if (mipprev == NULL)
3160 3160 vfs_miplist = mipp->mip_next;
3161 3161 else
3162 3162 mipprev->mip_next = mipp->mip_next;
3163 3163 mutex_exit(&vfs_miplist_mutex);
3164 3164 kmem_free(mipp, sizeof (struct ipmnt));
3165 3165 }
3166 3166
3167 3167 /*
3168 3168 * vfs_add is called by a specific filesystem's mount routine to add
3169 3169 * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3170 3170 * The vfs should already have been locked by the caller.
3171 3171 *
3172 3172 * coveredvp is NULL if this is the root.
3173 3173 */
3174 3174 void
3175 3175 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3176 3176 {
3177 3177 int newflag;
3178 3178
3179 3179 ASSERT(vfs_lock_held(vfsp));
3180 3180 VFS_HOLD(vfsp);
3181 3181 newflag = vfsp->vfs_flag;
3182 3182 if (mflag & MS_RDONLY)
3183 3183 newflag |= VFS_RDONLY;
3184 3184 else
3185 3185 newflag &= ~VFS_RDONLY;
3186 3186 if (mflag & MS_NOSUID)
3187 3187 newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3188 3188 else
3189 3189 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3190 3190 if (mflag & MS_NOMNTTAB)
3191 3191 newflag |= VFS_NOMNTTAB;
3192 3192 else
3193 3193 newflag &= ~VFS_NOMNTTAB;
3194 3194
3195 3195 if (coveredvp != NULL) {
3196 3196 ASSERT(vn_vfswlock_held(coveredvp));
3197 3197 coveredvp->v_vfsmountedhere = vfsp;
3198 3198 VN_HOLD(coveredvp);
3199 3199 }
3200 3200 vfsp->vfs_vnodecovered = coveredvp;
3201 3201 vfsp->vfs_flag = newflag;
3202 3202
3203 3203 vfs_list_add(vfsp);
3204 3204 }
3205 3205
3206 3206 /*
3207 3207 * Remove a vfs from the vfs list, null out the pointer from the
3208 3208 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3209 3209 * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3210 3210 * reference to the vfs and to the covered vnode.
3211 3211 *
3212 3212 * Called from dounmount after it's confirmed with the file system
3213 3213 * that the unmount is legal.
3214 3214 */
3215 3215 void
3216 3216 vfs_remove(struct vfs *vfsp)
3217 3217 {
3218 3218 vnode_t *vp;
3219 3219
3220 3220 ASSERT(vfs_lock_held(vfsp));
3221 3221
3222 3222 /*
3223 3223 * Can't unmount root. Should never happen because fs will
3224 3224 * be busy.
3225 3225 */
3226 3226 if (vfsp == rootvfs)
3227 3227 panic("vfs_remove: unmounting root");
3228 3228
3229 3229 vfs_list_remove(vfsp);
3230 3230
3231 3231 /*
3232 3232 * Unhook from the file system name space.
3233 3233 */
3234 3234 vp = vfsp->vfs_vnodecovered;
3235 3235 ASSERT(vn_vfswlock_held(vp));
3236 3236 vp->v_vfsmountedhere = NULL;
3237 3237 vfsp->vfs_vnodecovered = NULL;
3238 3238 VN_RELE(vp);
3239 3239
3240 3240 /*
3241 3241 * Release lock and wakeup anybody waiting.
3242 3242 */
3243 3243 vfs_unlock(vfsp);
3244 3244 VFS_RELE(vfsp);
3245 3245 }
3246 3246
3247 3247 /*
3248 3248 * Lock a filesystem to prevent access to it while mounting,
3249 3249 * unmounting and syncing. Return EBUSY immediately if lock
3250 3250 * can't be acquired.
3251 3251 */
3252 3252 int
3253 3253 vfs_lock(vfs_t *vfsp)
3254 3254 {
3255 3255 vn_vfslocks_entry_t *vpvfsentry;
3256 3256
3257 3257 vpvfsentry = vn_vfslocks_getlock(vfsp);
3258 3258 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3259 3259 return (0);
3260 3260
3261 3261 vn_vfslocks_rele(vpvfsentry);
3262 3262 return (EBUSY);
3263 3263 }
3264 3264
3265 3265 int
3266 3266 vfs_rlock(vfs_t *vfsp)
3267 3267 {
3268 3268 vn_vfslocks_entry_t *vpvfsentry;
3269 3269
3270 3270 vpvfsentry = vn_vfslocks_getlock(vfsp);
3271 3271
3272 3272 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3273 3273 return (0);
3274 3274
3275 3275 vn_vfslocks_rele(vpvfsentry);
3276 3276 return (EBUSY);
3277 3277 }
3278 3278
3279 3279 void
3280 3280 vfs_lock_wait(vfs_t *vfsp)
3281 3281 {
3282 3282 vn_vfslocks_entry_t *vpvfsentry;
3283 3283
3284 3284 vpvfsentry = vn_vfslocks_getlock(vfsp);
3285 3285 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3286 3286 }
3287 3287
3288 3288 void
3289 3289 vfs_rlock_wait(vfs_t *vfsp)
3290 3290 {
3291 3291 vn_vfslocks_entry_t *vpvfsentry;
3292 3292
3293 3293 vpvfsentry = vn_vfslocks_getlock(vfsp);
3294 3294 rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3295 3295 }
3296 3296
3297 3297 /*
3298 3298 * Unlock a locked filesystem.
3299 3299 */
3300 3300 void
3301 3301 vfs_unlock(vfs_t *vfsp)
3302 3302 {
3303 3303 vn_vfslocks_entry_t *vpvfsentry;
3304 3304
3305 3305 /*
3306 3306 * vfs_unlock will mimic sema_v behaviour to fix 4748018.
3307 3307 * This behaviour should remain unchanged in patch releases.
3308 3308 */
3309 3309 if (panicstr)
3310 3310 return;
3311 3311
3312 3312 /*
3313 3313 * ve_refcount needs to be dropped twice here.
3314 3314 * 1. To release the reference taken by the vn_vfslocks_getlock() call below.
3315 3315 * 2. To release the reference taken by the locking routines such as
3316 3316 * vfs_rlock_wait/vfs_lock_wait/vfs_lock, etc.
3317 3317 */
3318 3318
3319 3319 vpvfsentry = vn_vfslocks_getlock(vfsp);
3320 3320 vn_vfslocks_rele(vpvfsentry);
3321 3321
3322 3322 rwst_exit(&vpvfsentry->ve_lock);
3323 3323 vn_vfslocks_rele(vpvfsentry);
3324 3324 }
3325 3325
3326 3326 /*
3327 3327 * Utility routine that allows a filesystem to construct its
3328 3328 * fsid in "the usual way" - by munging some underlying dev_t and
3329 3329 * the filesystem type number into the 64-bit fsid. Note that
3330 3330 * this implicitly relies on dev_t persistence to make filesystem
3331 3331 * id's persistent.
3332 3332 *
3333 3333 * There's nothing to prevent an individual fs from constructing its
3334 3334 * fsid in a different way, and indeed they should.
3335 3335 *
3336 3336 * Since we want fsids to be 32-bit quantities (so that they can be
3337 3337 * exported identically by either 32-bit or 64-bit APIs, as well as
3338 3338 * the fact that fsid's are "known" to NFS), we compress the device
3339 3339 * number given down to 32-bits, and panic if that isn't possible.
3340 3340 */
3341 3341 void
3342 3342 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3343 3343 {
3344 3344 if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3345 3345 panic("device number too big for fsid!");
3346 3346 fsi->val[1] = val;
3347 3347 }
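
For illustration only, a user-space analogue of packing a device number into 32 bits might look like the sketch below; the field widths here are assumptions for the example, not the actual cmpldev() layout:

#include <stdint.h>

#define MINOR_BITS      18      /* hypothetical field widths */
#define MAJOR_BITS      (32 - MINOR_BITS)

static int
make_fsid32(uint32_t *out, uint32_t major, uint32_t minor)
{
        if (major >= (1u << MAJOR_BITS) || minor >= (1u << MINOR_BITS))
                return (-1);    /* device number too big for fsid */
        *out = (major << MINOR_BITS) | minor;
        return (0);
}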
3348 3348
3349 3349 int
3350 3350 vfs_lock_held(vfs_t *vfsp)
3351 3351 {
3352 3352 int held;
3353 3353 vn_vfslocks_entry_t *vpvfsentry;
3354 3354
3355 3355 /*
3356 3356 * vfs_lock_held will mimic sema_held behaviour
3357 3357 * if panicstr is set. This behaviour should remain
3358 3358 * unchanged in patch releases.
3359 3359 */
3360 3360 if (panicstr)
3361 3361 return (1);
3362 3362
3363 3363 vpvfsentry = vn_vfslocks_getlock(vfsp);
3364 3364 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3365 3365
3366 3366 vn_vfslocks_rele(vpvfsentry);
3367 3367 return (held);
3368 3368 }
3369 3369
3370 3370 struct _kthread *
3371 3371 vfs_lock_owner(vfs_t *vfsp)
3372 3372 {
3373 3373 struct _kthread *owner;
3374 3374 vn_vfslocks_entry_t *vpvfsentry;
3375 3375
3376 3376 /*
3377 3377 * vfs_lock_owner will mimic sema_held behaviour
3378 3378 * if panicstr is set. This behaviour should remain
3379 3379 * unchanged in patch releases.
3380 3380 */
3381 3381 if (panicstr)
3382 3382 return (NULL);
3383 3383
3384 3384 vpvfsentry = vn_vfslocks_getlock(vfsp);
3385 3385 owner = rwst_owner(&vpvfsentry->ve_lock);
3386 3386
3387 3387 vn_vfslocks_rele(vpvfsentry);
3388 3388 return (owner);
3389 3389 }
3390 3390
3391 3391 /*
3392 3392 * vfs list locking.
3393 3393 *
3394 3394 * Rather than manipulate the vfslist lock directly, we abstract into lock
3395 3395 * and unlock routines to allow the locking implementation to be changed for
3396 3396 * clustering.
3397 3397 *
3398 3398 * Whenever the vfs list is modified through its hash links, the overall list
3399 3399 * lock must be obtained before locking the relevant hash bucket. But to see
3400 3400 * whether a given vfs is on the list, it suffices to obtain the lock for the
3401 3401 * hash bucket without getting the overall list lock. (See getvfs() below.)
3402 3402 */
3403 3403
3404 3404 void
3405 3405 vfs_list_lock()
3406 3406 {
3407 3407 rw_enter(&vfslist, RW_WRITER);
3408 3408 }
3409 3409
3410 3410 void
3411 3411 vfs_list_read_lock()
3412 3412 {
3413 3413 rw_enter(&vfslist, RW_READER);
3414 3414 }
3415 3415
3416 3416 void
3417 3417 vfs_list_unlock()
3418 3418 {
3419 3419 rw_exit(&vfslist);
3420 3420 }
3421 3421
3422 3422 /*
3423 3423 * Low level worker routines for adding entries to and removing entries from
3424 3424 * the vfs list.
3425 3425 */
3426 3426
3427 3427 static void
3428 3428 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3429 3429 {
3430 3430 int vhno;
3431 3431 struct vfs **hp;
3432 3432 dev_t dev;
3433 3433
3434 3434 ASSERT(RW_WRITE_HELD(&vfslist));
3435 3435
3436 3436 dev = expldev(vfsp->vfs_fsid.val[0]);
3437 3437 vhno = VFSHASH(getmajor(dev), getminor(dev));
3438 3438
3439 3439 mutex_enter(&rvfs_list[vhno].rvfs_lock);
3440 3440
3441 3441 /*
3442 3442 * Link into the hash table, inserting it at the end, so that LOFS
3443 3443 * with the same fsid as UFS (or other) file systems will not hide the
3444 3444 * UFS.
3445 3445 */
3446 3446 if (insert_at_head) {
3447 3447 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3448 3448 rvfs_list[vhno].rvfs_head = vfsp;
3449 3449 } else {
3450 3450 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3451 3451 hp = &(*hp)->vfs_hash)
3452 3452 continue;
3453 3453 /*
3454 3454 * hp now contains the address of the pointer to update
3455 3455 * to effect the insertion.
3456 3456 */
3457 3457 vfsp->vfs_hash = NULL;
3458 3458 *hp = vfsp;
3459 3459 }
3460 3460
3461 3461 rvfs_list[vhno].rvfs_len++;
3462 3462 mutex_exit(&rvfs_list[vhno].rvfs_lock);
3463 3463 }
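
The tail insertion above relies on the pointer-to-pointer idiom: hp walks through the head pointer and each vfs_hash field, so the empty-bucket case needs no special handling. A generic sketch (hypothetical names, not part of vfs.c):

#include <stddef.h>

struct node {
        struct node *next;
        int val;
};

static void
append(struct node **headp, struct node *n)
{
        struct node **hp;

        for (hp = headp; *hp != NULL; hp = &(*hp)->next)
                continue;
        n->next = NULL;
        *hp = n;        /* *hp is either the head or the last node's next */
}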
3464 3464
3465 3465
3466 3466 static void
3467 3467 vfs_hash_remove(struct vfs *vfsp)
3468 3468 {
3469 3469 int vhno;
3470 3470 struct vfs *tvfsp;
3471 3471 dev_t dev;
3472 3472
3473 3473 ASSERT(RW_WRITE_HELD(&vfslist));
3474 3474
3475 3475 dev = expldev(vfsp->vfs_fsid.val[0]);
3476 3476 vhno = VFSHASH(getmajor(dev), getminor(dev));
3477 3477
3478 3478 mutex_enter(&rvfs_list[vhno].rvfs_lock);
3479 3479
3480 3480 /*
3481 3481 * Remove from hash.
3482 3482 */
3483 3483 if (rvfs_list[vhno].rvfs_head == vfsp) {
3484 3484 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3485 3485 rvfs_list[vhno].rvfs_len--;
3486 3486 goto foundit;
3487 3487 }
3488 3488 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3489 3489 tvfsp = tvfsp->vfs_hash) {
3490 3490 if (tvfsp->vfs_hash == vfsp) {
3491 3491 tvfsp->vfs_hash = vfsp->vfs_hash;
3492 3492 rvfs_list[vhno].rvfs_len--;
3493 3493 goto foundit;
3494 3494 }
3495 3495 }
3496 3496 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3497 3497
3498 3498 foundit:
3499 3499
3500 3500 mutex_exit(&rvfs_list[vhno].rvfs_lock);
3501 3501 }
3502 3502
3503 3503
3504 3504 void
3505 3505 vfs_list_add(struct vfs *vfsp)
3506 3506 {
3507 3507 zone_t *zone;
3508 3508
3509 3509 /*
3510 3510 * Typically, the vfs_t will have been created on behalf of the file
3511 3511 * system in vfs_init, where it will have been provided with a
3512 3512 * vfs_impl_t. This, however, might be lacking if the vfs_t was created
3513 3513 * by an unbundled file system. We therefore check for such an example
3514 3514 * before stamping the vfs_t with its creation time for the benefit of
3515 3515 * mntfs.
3516 3516 */
3517 3517 if (vfsp->vfs_implp == NULL)
3518 3518 vfsimpl_setup(vfsp);
3519 3519 vfs_mono_time(&vfsp->vfs_hrctime);
3520 3520
3521 3521 /*
3522 3522 * The zone that owns the mount is the one that performed the mount.
3523 3523 * Note that this isn't necessarily the same as the zone mounted into.
3524 3524 * The corresponding zone_rele_ref() will be done when the vfs_t
3525 3525 * is being free'd.
3526 3526 */
3527 3527 vfsp->vfs_zone = curproc->p_zone;
3528 3528 zone_init_ref(&vfsp->vfs_implp->vi_zone_ref);
3529 3529 zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref,
3530 3530 ZONE_REF_VFS);
3531 3531
3532 3532 /*
3533 3533 * Find the zone mounted into, and put this mount on its vfs list.
3534 3534 */
3535 3535 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3536 3536 ASSERT(zone != NULL);
3537 3537 /*
3538 3538 * Special casing for the root vfs. This structure is allocated
3539 3539 * statically and hooked onto rootvfs at link time. During the
3540 3540 * vfs_mountroot call at system startup time, the root file system's
3541 3541 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3542 3542 * as argument. The code below must detect and handle this special
3543 3543 * case. The only apparent justification for this special casing is
3544 3544 * to ensure that the root file system appears at the head of the
3545 3545 * list.
3546 3546 *
3547 3547 * XXX: I'm assuming that it's ok to do normal list locking when
3548 3548 * adding the entry for the root file system (this used to be
3549 3549 * done with no locks held).
3550 3550 */
3551 3551 vfs_list_lock();
3552 3552 /*
3553 3553 * Link into the vfs list proper.
3554 3554 */
3555 3555 if (vfsp == &root) {
3556 3556 /*
3557 3557 * Assert: This vfs is already on the list as its first entry.
3558 3558 * Thus, there's nothing to do.
3559 3559 */
3560 3560 ASSERT(rootvfs == vfsp);
3561 3561 /*
3562 3562 * Add it to the head of the global zone's vfslist.
3563 3563 */
3564 3564 ASSERT(zone == global_zone);
3565 3565 ASSERT(zone->zone_vfslist == NULL);
3566 3566 zone->zone_vfslist = vfsp;
3567 3567 } else {
3568 3568 /*
3569 3569 * Link to end of list using vfs_prev (as rootvfs is now a
3570 3570 * doubly linked circular list) so list is in mount order for
3571 3571 * mnttab use.
3572 3572 */
3573 3573 rootvfs->vfs_prev->vfs_next = vfsp;
3574 3574 vfsp->vfs_prev = rootvfs->vfs_prev;
3575 3575 rootvfs->vfs_prev = vfsp;
3576 3576 vfsp->vfs_next = rootvfs;
3577 3577
3578 3578 /*
3579 3579 * Do it again for the zone-private list (which may be NULL).
3580 3580 */
3581 3581 if (zone->zone_vfslist == NULL) {
3582 3582 ASSERT(zone != global_zone);
3583 3583 zone->zone_vfslist = vfsp;
3584 3584 } else {
3585 3585 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3586 3586 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3587 3587 zone->zone_vfslist->vfs_zone_prev = vfsp;
3588 3588 vfsp->vfs_zone_next = zone->zone_vfslist;
3589 3589 }
3590 3590 }
3591 3591
3592 3592 /*
3593 3593 * Link into the hash table, inserting it at the end, so that LOFS
3594 3594 * with the same fsid as UFS (or other) file systems will not hide
3595 3595 * the UFS.
3596 3596 */
3597 3597 vfs_hash_add(vfsp, 0);
3598 3598
3599 3599 /*
3600 3600 * update the mnttab modification time
3601 3601 */
3602 3602 vfs_mnttab_modtimeupd();
3603 3603 vfs_list_unlock();
3604 3604 zone_rele(zone);
3605 3605 }
3606 3606
3607 3607 void
3608 3608 vfs_list_remove(struct vfs *vfsp)
3609 3609 {
3610 3610 zone_t *zone;
3611 3611
3612 3612 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3613 3613 ASSERT(zone != NULL);
3614 3614 /*
3615 3615 * Callers are responsible for preventing attempts to unmount the
3616 3616 * root.
3617 3617 */
3618 3618 ASSERT(vfsp != rootvfs);
3619 3619
3620 3620 vfs_list_lock();
3621 3621
3622 3622 /*
3623 3623 * Remove from hash.
3624 3624 */
3625 3625 vfs_hash_remove(vfsp);
3626 3626
3627 3627 /*
3628 3628 * Remove from vfs list.
3629 3629 */
3630 3630 vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3631 3631 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3632 3632 vfsp->vfs_next = vfsp->vfs_prev = NULL;
3633 3633
3634 3634 /*
3635 3635 * Remove from zone-specific vfs list.
3636 3636 */
3637 3637 if (zone->zone_vfslist == vfsp)
3638 3638 zone->zone_vfslist = vfsp->vfs_zone_next;
3639 3639
3640 3640 if (vfsp->vfs_zone_next == vfsp) {
3641 3641 ASSERT(vfsp->vfs_zone_prev == vfsp);
3642 3642 ASSERT(zone->zone_vfslist == vfsp);
3643 3643 zone->zone_vfslist = NULL;
3644 3644 }
3645 3645
3646 3646 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3647 3647 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3648 3648 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3649 3649
3650 3650 /*
3651 3651 * update the mnttab modification time
3652 3652 */
3653 3653 vfs_mnttab_modtimeupd();
3654 3654 vfs_list_unlock();
3655 3655 zone_rele(zone);
3656 3656 }
3657 3657
3658 3658 struct vfs *
3659 3659 getvfs(fsid_t *fsid)
3660 3660 {
3661 3661 struct vfs *vfsp;
3662 3662 int val0 = fsid->val[0];
3663 3663 int val1 = fsid->val[1];
3664 3664 dev_t dev = expldev(val0);
3665 3665 int vhno = VFSHASH(getmajor(dev), getminor(dev));
3666 3666 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3667 3667
3668 3668 mutex_enter(hmp);
3669 3669 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3670 3670 if (vfsp->vfs_fsid.val[0] == val0 &&
3671 3671 vfsp->vfs_fsid.val[1] == val1) {
3672 3672 VFS_HOLD(vfsp);
3673 3673 mutex_exit(hmp);
3674 3674 return (vfsp);
3675 3675 }
3676 3676 }
3677 3677 mutex_exit(hmp);
3678 3678 return (NULL);
3679 3679 }
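getvfs() returns the matching vfs_t with an extra hold taken via VFS_HOLD(), so callers are responsible for dropping that hold with VFS_RELE() once they are done with the structure. A minimal, hedged usage sketch (not part of vfs.c; the helper name is hypothetical):

	/*
	 * Hedged usage sketch: look up a mounted file system by fsid
	 * and drop the hold when finished.
	 */
	static void
	example_fsid_lookup(fsid_t *fsid)	/* hypothetical helper */
	{
		struct vfs *vfsp;

		if ((vfsp = getvfs(fsid)) != NULL) {
			/* ... inspect vfsp->vfs_dev, vfsp->vfs_mntpt, ... */
			VFS_RELE(vfsp);	/* release the hold getvfs() took */
		}
	}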
3680 3680
3681 3681 /*
3682 3682 * Search the vfs mount in progress list for a specified device/vfs entry.
3683 3683 * Returns 0 if the first entry in the list that the device matches has the
3684 3684 * given vfs pointer as well. If the device matches but a different vfs
3685 3685 * pointer is encountered in the list before the given vfs pointer then
3686 3686 * a 1 is returned.
3687 3687 */
3688 3688
3689 3689 int
3690 3690 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3691 3691 {
3692 3692 int retval = 0;
3693 3693 struct ipmnt *mipp;
3694 3694
3695 3695 mutex_enter(&vfs_miplist_mutex);
3696 3696 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3697 3697 if (mipp->mip_dev == dev) {
3698 3698 if (mipp->mip_vfsp != vfsp)
3699 3699 retval = 1;
3700 3700 break;
3701 3701 }
3702 3702 }
3703 3703 mutex_exit(&vfs_miplist_mutex);
3704 3704 return (retval);
3705 3705 }
3706 3706
3707 3707 /*
3708 3708 * Search the vfs list for a specified device. Returns 1, if entry is found
3709 3709 * or 0 if no suitable entry is found.
3710 3710 */
3711 3711
3712 3712 int
3713 3713 vfs_devismounted(dev_t dev)
3714 3714 {
3715 3715 struct vfs *vfsp;
3716 3716 int found;
3717 3717
3718 3718 vfs_list_read_lock();
3719 3719 vfsp = rootvfs;
3720 3720 found = 0;
3721 3721 do {
3722 3722 if (vfsp->vfs_dev == dev) {
3723 3723 found = 1;
3724 3724 break;
3725 3725 }
3726 3726 vfsp = vfsp->vfs_next;
3727 3727 } while (vfsp != rootvfs);
3728 3728
3729 3729 vfs_list_unlock();
3730 3730 return (found);
3731 3731 }
3732 3732
3733 3733 /*
3734 3734 * Search the vfs list for a specified device. Returns a pointer to it
3735 3735 * or NULL if no suitable entry is found. The caller of this routine
3736 3736 * is responsible for releasing the returned vfs pointer.
3737 3737 */
3738 3738 struct vfs *
3739 3739 vfs_dev2vfsp(dev_t dev)
3740 3740 {
3741 3741 struct vfs *vfsp;
3742 3742 int found;
3743 3743
3744 3744 vfs_list_read_lock();
3745 3745 vfsp = rootvfs;
3746 3746 found = 0;
3747 3747 do {
3748 3748 /*
3749 3749 * The following could be made more efficient by making
3750 3750 * the entire loop use vfs_zone_next if the call is from
3751 3751 * a zone. The only callers, however, ustat(2) and
3752 3752 * umount2(2), don't seem to justify the added
3753 3753 * complexity at present.
3754 3754 */
3755 3755 if (vfsp->vfs_dev == dev &&
3756 3756 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3757 3757 curproc->p_zone)) {
3758 3758 VFS_HOLD(vfsp);
3759 3759 found = 1;
3760 3760 break;
3761 3761 }
3762 3762 vfsp = vfsp->vfs_next;
3763 3763 } while (vfsp != rootvfs);
3764 3764 vfs_list_unlock();
3765 3765 return (found ? vfsp: NULL);
3766 3766 }
3767 3767
3768 3768 /*
3769 3769 * Search the vfs list for a specified mntpoint. Returns a pointer to it
3770 3770 * or NULL if no suitable entry is found. The caller of this routine
3771 3771 * is responsible for releasing the returned vfs pointer.
3772 3772 *
3773 3773 * Note that if multiple mntpoints match, the last one matching is
3774 3774 * returned in an attempt to return the "top" mount when overlay
3775 3775 * mounts are covering the same mount point. This is accomplished by starting
3776 3776 * at the end of the list and working our way backwards, stopping at the first
3777 3777 * matching mount.
3778 3778 */
3779 3779 struct vfs *
3780 3780 vfs_mntpoint2vfsp(const char *mp)
3781 3781 {
3782 3782 struct vfs *vfsp;
3783 3783 struct vfs *retvfsp = NULL;
3784 3784 zone_t *zone = curproc->p_zone;
3785 3785 struct vfs *list;
3786 3786
3787 3787 vfs_list_read_lock();
3788 3788 if (getzoneid() == GLOBAL_ZONEID) {
3789 3789 /*
3790 3790 * The global zone may see filesystems in any zone.
3791 3791 */
3792 3792 vfsp = rootvfs->vfs_prev;
3793 3793 do {
3794 3794 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3795 3795 retvfsp = vfsp;
3796 3796 break;
3797 3797 }
3798 3798 vfsp = vfsp->vfs_prev;
3799 3799 } while (vfsp != rootvfs->vfs_prev);
3800 3800 } else if ((list = zone->zone_vfslist) != NULL) {
3801 3801 const char *mntpt;
3802 3802
3803 3803 vfsp = list->vfs_zone_prev;
3804 3804 do {
3805 3805 mntpt = refstr_value(vfsp->vfs_mntpt);
3806 3806 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3807 3807 if (strcmp(mntpt, mp) == 0) {
3808 3808 retvfsp = vfsp;
3809 3809 break;
3810 3810 }
3811 3811 vfsp = vfsp->vfs_zone_prev;
3812 3812 } while (vfsp != list->vfs_zone_prev);
3813 3813 }
3814 3814 if (retvfsp)
3815 3815 VFS_HOLD(retvfsp);
3816 3816 vfs_list_unlock();
3817 3817 return (retvfsp);
3818 3818 }
3819 3819
3820 3820 /*
3821 3821 * Search the vfs list for a specified vfsops.
3822 3822 * if vfs entry is found then return 1, else 0.
3823 3823 */
3824 3824 int
3825 3825 vfs_opsinuse(vfsops_t *ops)
3826 3826 {
3827 3827 struct vfs *vfsp;
3828 3828 int found;
3829 3829
3830 3830 vfs_list_read_lock();
3831 3831 vfsp = rootvfs;
3832 3832 found = 0;
3833 3833 do {
3834 3834 if (vfs_getops(vfsp) == ops) {
3835 3835 found = 1;
3836 3836 break;
3837 3837 }
3838 3838 vfsp = vfsp->vfs_next;
3839 3839 } while (vfsp != rootvfs);
3840 3840 vfs_list_unlock();
3841 3841 return (found);
3842 3842 }
3843 3843
3844 3844 /*
3845 3845 * Allocate an entry in vfssw for a file system type
3846 3846 */
3847 3847 struct vfssw *
3848 3848 allocate_vfssw(const char *type)
3849 3849 {
3850 3850 struct vfssw *vswp;
3851 3851
3852 3852 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3853 3853 /*
3854 3854 * The vfssw table uses the empty string to identify an
3855 3855 * available entry; we cannot add any type which has
3856 3856 * a leading NUL. The string length is limited to
3857 3857 * the size of the st_fstype array in struct stat.
3858 3858 */
3859 3859 return (NULL);
3860 3860 }
3861 3861
3862 3862 ASSERT(VFSSW_WRITE_LOCKED());
3863 3863 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3864 3864 if (!ALLOCATED_VFSSW(vswp)) {
3865 3865 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3866 3866 (void) strcpy(vswp->vsw_name, type);
3867 3867 ASSERT(vswp->vsw_count == 0);
3868 3868 vswp->vsw_count = 1;
3869 3869 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3870 3870 return (vswp);
3871 3871 }
3872 3872 return (NULL);
3873 3873 }
3874 3874
3875 3875 /*
3876 3876 * Impose additional layer of translation between vfstype names
3877 3877 * and module names in the filesystem.
3878 3878 */
3879 3879 static const char *
3880 3880 vfs_to_modname(const char *vfstype)
3881 3881 {
3882 3882 if (strcmp(vfstype, "proc") == 0) {
3883 3883 vfstype = "procfs";
3884 3884 } else if (strcmp(vfstype, "fd") == 0) {
3885 3885 vfstype = "fdfs";
3886 3886 } else if (strncmp(vfstype, "nfs", 3) == 0) {
3887 3887 vfstype = "nfs";
3888 3888 }
3889 3889
3890 3890 return (vfstype);
3891 3891 }
3892 3892
3893 3893 /*
3894 3894 * Find a vfssw entry given a file system type name.
3895 3895 * Try to autoload the filesystem if it's not found.
3896 3896 * If it's installed, return the vfssw locked to prevent unloading.
3897 3897 */
3898 3898 struct vfssw *
3899 3899 vfs_getvfssw(const char *type)
3900 3900 {
3901 3901 struct vfssw *vswp;
3902 3902 const char *modname;
3903 3903
3904 3904 RLOCK_VFSSW();
3905 3905 vswp = vfs_getvfsswbyname(type);
3906 3906 modname = vfs_to_modname(type);
3907 3907
3908 3908 if (rootdir == NULL) {
3909 3909 /*
3910 3910 * If we haven't yet loaded the root file system, then our
3911 3911 * _init won't be called until later. Allocate vfssw entry,
3912 3912 * because mod_installfs won't be called.
3913 3913 */
3914 3914 if (vswp == NULL) {
3915 3915 RUNLOCK_VFSSW();
3916 3916 WLOCK_VFSSW();
3917 3917 if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3918 3918 if ((vswp = allocate_vfssw(type)) == NULL) {
3919 3919 WUNLOCK_VFSSW();
3920 3920 return (NULL);
3921 3921 }
3922 3922 }
3923 3923 WUNLOCK_VFSSW();
3924 3924 RLOCK_VFSSW();
3925 3925 }
3926 3926 if (!VFS_INSTALLED(vswp)) {
3927 3927 RUNLOCK_VFSSW();
3928 3928 (void) modloadonly("fs", modname);
3929 3929 } else
3930 3930 RUNLOCK_VFSSW();
3931 3931 return (vswp);
3932 3932 }
3933 3933
3934 3934 /*
3935 3935 * Try to load the filesystem. Before calling modload(), we drop
3936 3936 * our lock on the VFS switch table, and pick it up after the
3937 3937 * module is loaded. However, there is a potential race: the
3938 3938 * module could be unloaded after the call to modload() completes
3939 3939 * but before we pick up the lock and drive on. Therefore,
3940 3940 * we keep reloading the module until we've loaded the module
3941 3941 * _and_ we have the lock on the VFS switch table.
3942 3942 */
3943 3943 while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3944 3944 RUNLOCK_VFSSW();
3945 3945 if (modload("fs", modname) == -1)
3946 3946 return (NULL);
3947 3947 RLOCK_VFSSW();
3948 3948 if (vswp == NULL)
3949 3949 if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3950 3950 break;
3951 3951 }
3952 3952 RUNLOCK_VFSSW();
3953 3953
3954 3954 return (vswp);
3955 3955 }
3956 3956
3957 3957 /*
3958 3958 * Find a vfssw entry given a file system type name.
3959 3959 */
3960 3960 struct vfssw *
3961 3961 vfs_getvfsswbyname(const char *type)
3962 3962 {
3963 3963 struct vfssw *vswp;
3964 3964
3965 3965 ASSERT(VFSSW_LOCKED());
3966 3966 if (type == NULL || *type == '\0')
3967 3967 return (NULL);
3968 3968
3969 3969 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3970 3970 if (strcmp(type, vswp->vsw_name) == 0) {
3971 3971 vfs_refvfssw(vswp);
3972 3972 return (vswp);
3973 3973 }
3974 3974 }
3975 3975
3976 3976 return (NULL);
3977 3977 }
3978 3978
3979 3979 /*
3980 3980 * Find a vfssw entry given a set of vfsops.
3981 3981 */
3982 3982 struct vfssw *
3983 3983 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3984 3984 {
3985 3985 struct vfssw *vswp;
3986 3986
3987 3987 RLOCK_VFSSW();
3988 3988 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3989 3989 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3990 3990 vfs_refvfssw(vswp);
3991 3991 RUNLOCK_VFSSW();
3992 3992 return (vswp);
3993 3993 }
3994 3994 }
3995 3995 RUNLOCK_VFSSW();
3996 3996
3997 3997 return (NULL);
3998 3998 }
3999 3999
4000 4000 /*
4001 4001 * Reference a vfssw entry.
4002 4002 */
4003 4003 void
4004 4004 vfs_refvfssw(struct vfssw *vswp)
4005 4005 {
4006 4006
4007 4007 mutex_enter(&vswp->vsw_lock);
4008 4008 vswp->vsw_count++;
4009 4009 mutex_exit(&vswp->vsw_lock);
4010 4010 }
4011 4011
4012 4012 /*
4013 4013 * Unreference a vfssw entry.
4014 4014 */
4015 4015 void
4016 4016 vfs_unrefvfssw(struct vfssw *vswp)
4017 4017 {
4018 4018
4019 4019 mutex_enter(&vswp->vsw_lock);
4020 4020 vswp->vsw_count--;
4021 4021 mutex_exit(&vswp->vsw_lock);
4022 4022 }
4023 4023
4024 4024 static int sync_retries = 20; /* number of retries when not making progress */
4025 4025 static int sync_triesleft; /* portion of sync_retries remaining */
4026 4026
4027 4027 static pgcnt_t old_pgcnt, new_pgcnt;
4028 4028 static int new_bufcnt, old_bufcnt;
4029 4029
4030 4030 /*
4031 4031 * Sync all of the mounted filesystems, and then wait for the actual i/o to
4032 4032 * complete. We wait by counting the number of dirty pages and buffers,
4033 4033 * pushing them out using bio_busy() and page_busy(), and then counting again.
4034 4034 * This routine is used during the uadmin A_SHUTDOWN code. It should only
4035 4035 * be used after some higher-level mechanism has quiesced the system so that
4036 4036 * new writes are not being initiated while we are waiting for completion.
4037 4037 *
4038 4038 * To ensure finite running time, our algorithm uses sync_triesleft (a progress
4039 4039 * counter used by the vfs_syncall() loop below). It is declared above so
4040 4040 * it can be found easily in the debugger.
4041 4041 *
4042 4042 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make
4043 4043 * sync_retries consecutive calls to bio_busy() and page_busy() without
4044 4044 * decreasing either the number of dirty buffers or dirty pages below the
4045 4045 * lowest count we have seen so far, we give up and return from vfs_syncall().
4046 4046 *
4047 4047 * Each loop iteration ends with a call to delay() one second to allow time for
4048 4048 * i/o completion and to permit the user time to read our progress messages.
4049 4049 */
4050 4050 void
4051 4051 vfs_syncall(void)
4052 4052 {
4053 4053 if (rootdir == NULL && !modrootloaded)
4054 4054 return; /* no filesystems have been loaded yet */
4055 4055
4056 4056 printf("syncing file systems...");
4057 4057 sync();
4058 4058
4059 4059 sync_triesleft = sync_retries;
4060 4060
4061 4061 old_bufcnt = new_bufcnt = INT_MAX;
4062 4062 old_pgcnt = new_pgcnt = ULONG_MAX;
4063 4063
4064 4064 while (sync_triesleft > 0) {
4065 4065 old_bufcnt = MIN(old_bufcnt, new_bufcnt);
4066 4066 old_pgcnt = MIN(old_pgcnt, new_pgcnt);
4067 4067
4068 4068 new_bufcnt = bio_busy(B_TRUE);
4069 4069 new_pgcnt = page_busy(B_TRUE);
4070 4070
4071 4071 if (new_bufcnt == 0 && new_pgcnt == 0)
4072 4072 break;
4073 4073
4074 4074 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
4075 4075 sync_triesleft = sync_retries;
4076 4076 else
4077 4077 sync_triesleft--;
4078 4078
4079 4079 if (new_bufcnt)
4080 4080 printf(" [%d]", new_bufcnt);
4081 4081 if (new_pgcnt)
4082 4082 printf(" %lu", new_pgcnt);
4083 4083
4084 4084 delay(hz);
4085 4085 }
4086 4086
4087 4087 if (new_bufcnt != 0 || new_pgcnt != 0)
4088 4088 printf(" done (not all i/o completed)\n");
4089 4089 else
4090 4090 printf(" done\n");
4091 4091
4092 4092 delay(hz);
4093 4093 }
4094 4094
4095 4095 /*
4096 4096 * Map VFS flags to statvfs flags. These shouldn't really be separate
4097 4097 * flags at all.
4098 4098 */
4099 4099 uint_t
4100 4100 vf_to_stf(uint_t vf)
4101 4101 {
4102 4102 uint_t stf = 0;
4103 4103
4104 4104 if (vf & VFS_RDONLY)
4105 4105 stf |= ST_RDONLY;
4106 4106 if (vf & VFS_NOSETUID)
4107 4107 stf |= ST_NOSUID;
4108 4108 if (vf & VFS_NOTRUNC)
4109 4109 stf |= ST_NOTRUNC;
4110 4110
4111 4111 return (stf);
4112 4112 }
4113 4113
4114 4114 /*
4115 4115 * Entries for (illegal) fstype 0.
4116 4116 */
4117 4117 /* ARGSUSED */
4118 4118 int
4119 4119 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
4120 4120 {
4121 4121 cmn_err(CE_PANIC, "stray vfs operation");
4122 4122 return (0);
4123 4123 }
4124 4124
4125 4125 /*
4126 4126 * Entries for (illegal) fstype 0.
4127 4127 */
4128 4128 int
4129 4129 vfsstray(void)
4130 4130 {
4131 4131 cmn_err(CE_PANIC, "stray vfs operation");
4132 4132 return (0);
4133 4133 }
4134 4134
4135 4135 /*
4136 4136 * Support for dealing with forced UFS unmount and its interaction with
4137 4137 * LOFS. Could be used by any filesystem.
4138 4138 * See bug 1203132.
4139 4139 */
4140 4140 int
4141 4141 vfs_EIO(void)
4142 4142 {
4143 4143 return (EIO);
4144 4144 }
4145 4145
4146 4146 /*
4147 4147 * We've gotta define the op for sync separately, since the compiler gets
4148 4148 * confused if we mix and match ANSI and normal style prototypes when
4149 4149 * a "short" argument is present and spits out a warning.
4150 4150 */
4151 4151 /*ARGSUSED*/
4152 4152 int
4153 4153 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
4154 4154 {
4155 4155 return (EIO);
4156 4156 }
4157 4157
4158 4158 vfs_t EIO_vfs;
4159 4159 vfsops_t *EIO_vfsops;
4160 4160
4161 4161 /*
4162 4162 * Called from startup() to initialize all loaded vfs's
4163 4163 */
4164 4164 void
4165 4165 vfsinit(void)
4166 4166 {
4167 4167 struct vfssw *vswp;
4168 4168 int error;
4169 4169 extern int vopstats_enabled;
4170 4170 extern void vopstats_startup();
4171 4171
4172 4172 static const fs_operation_def_t EIO_vfsops_template[] = {
4173 4173 VFSNAME_MOUNT, { .error = vfs_EIO },
4174 4174 VFSNAME_UNMOUNT, { .error = vfs_EIO },
4175 4175 VFSNAME_ROOT, { .error = vfs_EIO },
4176 4176 VFSNAME_STATVFS, { .error = vfs_EIO },
4177 4177 VFSNAME_SYNC, { .vfs_sync = vfs_EIO_sync },
4178 4178 VFSNAME_VGET, { .error = vfs_EIO },
4179 4179 VFSNAME_MOUNTROOT, { .error = vfs_EIO },
4180 4180 VFSNAME_FREEVFS, { .error = vfs_EIO },
4181 4181 VFSNAME_VNSTATE, { .error = vfs_EIO },
4182 4182 NULL, NULL
4183 4183 };
4184 4184
4185 4185 static const fs_operation_def_t stray_vfsops_template[] = {
4186 4186 VFSNAME_MOUNT, { .error = vfsstray },
4187 4187 VFSNAME_UNMOUNT, { .error = vfsstray },
4188 4188 VFSNAME_ROOT, { .error = vfsstray },
4189 4189 VFSNAME_STATVFS, { .error = vfsstray },
4190 4190 VFSNAME_SYNC, { .vfs_sync = vfsstray_sync },
4191 4191 VFSNAME_VGET, { .error = vfsstray },
4192 4192 VFSNAME_MOUNTROOT, { .error = vfsstray },
4193 4193 VFSNAME_FREEVFS, { .error = vfsstray },
4194 4194 VFSNAME_VNSTATE, { .error = vfsstray },
4195 4195 NULL, NULL
4196 4196 };
4197 4197
4198 4198 /* Create vfs cache */
4199 4199 vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4200 4200 sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4201 4201
4202 4202 /* Initialize the vnode cache (file systems may use it during init). */
4203 4203 vn_create_cache();
4204 4204
4205 4205 /* Setup event monitor framework */
4206 4206 fem_init();
4207 4207
4208 4208 /* Initialize the dummy stray file system type. */
4209 4209 error = vfs_setfsops(0, stray_vfsops_template, NULL);
4210 4210
4211 4211 /* Initialize the dummy EIO file system. */
4212 4212 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4213 4213 if (error != 0) {
4214 4214 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4215 4215 /* Shouldn't happen, but not bad enough to panic */
4216 4216 }
4217 4217
4218 4218 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4219 4219
4220 4220 /*
4221 4221 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4222 4222 * on this vfs can immediately notice it's invalid.
4223 4223 */
4224 4224 EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4225 4225
4226 4226 /*
4227 4227 * Call the init routines of non-loadable filesystems only.
4228 4228 * Filesystems which are loaded as separate modules will be
4229 4229 * initialized by the module loading code instead.
4230 4230 */
4231 4231
4232 4232 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4233 4233 RLOCK_VFSSW();
4234 4234 if (vswp->vsw_init != NULL)
4235 4235 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4236 4236 RUNLOCK_VFSSW();
4237 4237 }
4238 4238
4239 4239 vopstats_startup();
4240 4240
4241 4241 if (vopstats_enabled) {
4242 4242 /* EIO_vfs can collect stats, but we don't retrieve them */
4243 4243 initialize_vopstats(&EIO_vfs.vfs_vopstats);
4244 4244 EIO_vfs.vfs_fstypevsp = NULL;
4245 4245 EIO_vfs.vfs_vskap = NULL;
4246 4246 EIO_vfs.vfs_flag |= VFS_STATS;
4247 4247 }
4248 4248
4249 4249 xattr_init();
4250 4250
4251 4251 reparse_point_init();
4252 4252 }
4253 4253
4254 4254 vfs_t *
4255 4255 vfs_alloc(int kmflag)
4256 4256 {
4257 4257 vfs_t *vfsp;
4258 4258
4259 4259 vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4260 4260
4261 4261 /*
4262 4262 * Do the simplest initialization here.
4263 4263 * Everything else gets done in vfs_init()
4264 4264 */
4265 4265 bzero(vfsp, sizeof (vfs_t));
4266 4266 return (vfsp);
4267 4267 }
4268 4268
4269 4269 void
4270 4270 vfs_free(vfs_t *vfsp)
4271 4271 {
4272 4272 /*
4273 4273 * One would be tempted to assert that "vfsp->vfs_count == 0".
4274 4274 * The problem is that this gets called out of domount() with
4275 4275 * a partially initialized vfs and a vfs_count of 1. This is
4276 4276 * also called from vfs_rele() with a vfs_count of 0. We can't
4277 4277 * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4278 4278 * returned. This is because VFS_MOUNT() fully initializes the
4279 4279 * vfs structure and its associated data. VFS_RELE() will call
4280 4280 * VFS_FREEVFS() which may panic the system if the data structures
4281 4281 * aren't fully initialized from a successful VFS_MOUNT()).
4282 4282 */
4283 4283
4284 4284 /* If FEM was in use, make sure everything gets cleaned up */
4285 4285 if (vfsp->vfs_femhead) {
4286 4286 ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4287 4287 mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4288 4288 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4289 4289 vfsp->vfs_femhead = NULL;
4290 4290 }
4291 4291
4292 4292 if (vfsp->vfs_implp)
4293 4293 vfsimpl_teardown(vfsp);
4294 4294 sema_destroy(&vfsp->vfs_reflock);
4295 4295 kmem_cache_free(vfs_cache, vfsp);
4296 4296 }
4297 4297
4298 4298 /*
4299 4299 * Increments the vfs reference count by one atomically.
4300 4300 */
4301 4301 void
4302 4302 vfs_hold(vfs_t *vfsp)
4303 4303 {
4304 4304 atomic_inc_32(&vfsp->vfs_count);
4305 4305 ASSERT(vfsp->vfs_count != 0);
4306 4306 }
4307 4307
4308 4308 /*
4309 4309 * Decrements the vfs reference count by one atomically. When
4310 4310 * vfs reference count becomes zero, it calls the file system
4311 4311 * specific vfs_freevfs() to free up the resources.
4312 4312 */
4313 4313 void
4314 4314 vfs_rele(vfs_t *vfsp)
4315 4315 {
4316 4316 ASSERT(vfsp->vfs_count != 0);
4317 4317 if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) {
4318 4318 VFS_FREEVFS(vfsp);
4319 4319 lofi_remove(vfsp);
4320 4320 if (vfsp->vfs_zone)
4321 4321 zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
4322 4322 ZONE_REF_VFS);
4323 4323 vfs_freemnttab(vfsp);
4324 4324 vfs_free(vfsp);
4325 4325 }
4326 4326 }
4327 4327
4328 4328 /*
4329 4329 * Generic operations vector support.
4330 4330 *
4331 4331 * This is used to build operations vectors for both the vfs and vnode.
4332 4332 * It's normally called only when a file system is loaded.
4333 4333 *
4334 4334 * There are many possible algorithms for this, including the following:
4335 4335 *
4336 4336 * (1) scan the list of known operations; for each, see if the file system
4337 4337 * includes an entry for it, and fill it in as appropriate.
4338 4338 *
4339 4339 * (2) set up defaults for all known operations. scan the list of ops
4340 4340 * supplied by the file system; for each which is both supplied and
4341 4341 * known, fill it in.
4342 4342 *
4343 4343 * (3) sort the lists of known ops & supplied ops; scan the list, filling
4344 4344 * in entries as we go.
4345 4345 *
4346 4346 * we choose (1) for simplicity, and because performance isn't critical here.
4347 4347 * note that (2) could be sped up using a precomputed hash table on known ops.
4348 4348 * (3) could be faster than either, but only if the lists were very large or
4349 4349 * supplied in sorted order.
4350 4350 *
4351 4351 */
4352 4352
4353 4353 int
4354 4354 fs_build_vector(void *vector, int *unused_ops,
4355 4355 const fs_operation_trans_def_t *translation,
4356 4356 const fs_operation_def_t *operations)
4357 4357 {
4358 4358 int i, num_trans, num_ops, used;
4359 4359
4360 4360 /*
4361 4361 * Count the number of translations and the number of supplied
4362 4362 * operations.
4363 4363 */
4364 4364
4365 4365 {
4366 4366 const fs_operation_trans_def_t *p;
4367 4367
4368 4368 for (num_trans = 0, p = translation;
4369 4369 p->name != NULL;
4370 4370 num_trans++, p++)
4371 4371 ;
4372 4372 }
4373 4373
4374 4374 {
4375 4375 const fs_operation_def_t *p;
4376 4376
4377 4377 for (num_ops = 0, p = operations;
4378 4378 p->name != NULL;
4379 4379 num_ops++, p++)
4380 4380 ;
4381 4381 }
4382 4382
4383 4383 /* Walk through each operation known to our caller. There will be */
4384 4384 /* one entry in the supplied "translation table" for each. */
4385 4385
4386 4386 used = 0;
4387 4387
4388 4388 for (i = 0; i < num_trans; i++) {
4389 4389 int j, found;
4390 4390 char *curname;
4391 4391 fs_generic_func_p result;
4392 4392 fs_generic_func_p *location;
4393 4393
4394 4394 curname = translation[i].name;
4395 4395
4396 4396 /* Look for a matching operation in the list supplied by the */
4397 4397 /* file system. */
4398 4398
4399 4399 found = 0;
4400 4400
4401 4401 for (j = 0; j < num_ops; j++) {
4402 4402 if (strcmp(operations[j].name, curname) == 0) {
4403 4403 used++;
4404 4404 found = 1;
4405 4405 break;
4406 4406 }
4407 4407 }
4408 4408
4409 4409 /*
4410 4410 * If the file system is using a "placeholder" for default
4411 4411 * or error functions, grab the appropriate function out of
4412 4412 * the translation table. If the file system didn't supply
4413 4413 * this operation at all, use the default function.
4414 4414 */
4415 4415
4416 4416 if (found) {
4417 4417 result = operations[j].func.fs_generic;
4418 4418 if (result == fs_default) {
4419 4419 result = translation[i].defaultFunc;
4420 4420 } else if (result == fs_error) {
4421 4421 result = translation[i].errorFunc;
4422 4422 } else if (result == NULL) {
4423 4423 /* Null values are PROHIBITED */
4424 4424 return (EINVAL);
4425 4425 }
4426 4426 } else {
4427 4427 result = translation[i].defaultFunc;
4428 4428 }
4429 4429
4430 4430 /* Now store the function into the operations vector. */
4431 4431
4432 4432 location = (fs_generic_func_p *)
4433 4433 (((char *)vector) + translation[i].offset);
4434 4434
4435 4435 *location = result;
4436 4436 }
4437 4437
4438 4438 *unused_ops = num_ops - used;
4439 4439
4440 4440 return (0);
4441 4441 }
4442 4442
4443 4443 /* Placeholder functions, should never be called. */
4444 4444
4445 4445 int
4446 4446 fs_error(void)
4447 4447 {
4448 4448 cmn_err(CE_PANIC, "fs_error called");
4449 4449 return (0);
4450 4450 }
4451 4451
4452 4452 int
4453 4453 fs_default(void)
4454 4454 {
4455 4455 cmn_err(CE_PANIC, "fs_default called");
4456 4456 return (0);
4457 4457 }
4458 4458
4459 4459 #ifdef __sparc
4460 4460
4461 4461 /*
4462 4462 * Part of the implementation of booting off a mirrored root
4463 4463 * involves a change of dev_t for the root device. To
4464 4464 * accomplish this, first remove the existing hash table
4465 4465 * entry for the root device, convert to the new dev_t,
4466 4466 * then re-insert in the hash table at the head of the list.
4467 4467 */
4468 4468 void
4469 4469 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
4470 4470 {
4471 4471 vfs_list_lock();
4472 4472
4473 4473 vfs_hash_remove(vfsp);
4474 4474
4475 4475 vfsp->vfs_dev = ndev;
4476 4476 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
4477 4477
4478 4478 vfs_hash_add(vfsp, 1);
4479 4479
4480 4480 vfs_list_unlock();
4481 4481 }
4482 4482
4483 4483 #else /* x86 NEWBOOT */
4484 4484
4485 4485 #if defined(__x86)
4486 4486 extern int hvmboot_rootconf();
4487 4487 #endif /* __x86 */
4488 4488
4489 +extern char *aoepath_prop;
4489 4490 extern ib_boot_prop_t *iscsiboot_prop;
4490 4491
4491 4492 int
4492 4493 rootconf()
4493 4494 {
4494 4495 int error;
4495 4496 struct vfssw *vsw;
4496 4497 extern void pm_init();
4497 4498 char *fstyp, *fsmod;
4498 4499 int ret = -1;
4499 4500
4500 4501 getrootfs(&fstyp, &fsmod);
4501 4502
4502 4503 #if defined(__x86)
4503 4504 /*
4504 4505 * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4505 4506 * which lives in /platform/i86hvm, and hence is only available when
4506 4507 * booted in an x86 hvm environment. If the hvm_bootstrap misc module
4507 4508 * is not available then the modstub for this function will return 0.
4508 4509 * If the hvm_bootstrap misc module is available it will be loaded
4509 4510 * and hvmboot_rootconf() will be invoked.
4510 4511 */
4511 4512 if (error = hvmboot_rootconf())
4512 4513 return (error);
4513 4514 #endif /* __x86 */
4514 4515
4515 4516 if (error = clboot_rootconf())
4516 4517 return (error);
4517 4518
4518 4519 if (modload("fs", fsmod) == -1)
4519 4520 panic("Cannot _init %s module", fsmod);
4520 4521
4521 4522 RLOCK_VFSSW();
4522 4523 vsw = vfs_getvfsswbyname(fstyp);
4523 4524 RUNLOCK_VFSSW();
4524 4525 if (vsw == NULL) {
4525 4526 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4526 4527 return (ENXIO);
4527 4528 }
4528 4529 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4529 4530 VFS_HOLD(rootvfs);
4530 4531
4531 - /* always mount readonly first */
4532 + /* Always mount readonly first */
4532 4533 rootvfs->vfs_flag |= VFS_RDONLY;
4533 4534
4534 4535 pm_init();
4535 4536
4536 - if (netboot && iscsiboot_prop) {
4537 - cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4538 - " shouldn't happen in the same time");
4537 + if ((aoepath_prop != NULL && (iscsiboot_prop != NULL || netboot)) ||
4538 + (iscsiboot_prop != NULL && (aoepath_prop != NULL || netboot)) ||
4539 + (netboot && (aoepath_prop != NULL || iscsiboot_prop != NULL))) {
4540 + cmn_err(CE_WARN, "Only one of AoE, iSCSI or NFS boot "
4541 + "can be specified at time");
4539 4542 return (EINVAL);
4540 4543 }
4541 4544
4542 - if (netboot || iscsiboot_prop) {
4545 + if (aoepath_prop != NULL || iscsiboot_prop != NULL || netboot) {
4543 4546 ret = strplumb();
4544 4547 if (ret != 0) {
4545 4548 cmn_err(CE_WARN, "Cannot plumb network device %d", ret);
4546 4549 return (EFAULT);
4547 4550 }
4548 4551 }
4549 4552
4550 - if ((ret == 0) && iscsiboot_prop) {
4551 - ret = modload("drv", "iscsi");
4552 - /* -1 indicates fail */
4553 - if (ret == -1) {
4553 + if (aoepath_prop != NULL) {
4554 + if (modload("drv", "aoe") == -1 ||
4555 + modload("drv", "aoeblk") == -1) {
4556 + cmn_err(CE_WARN, "Failed to load aoe modules");
4557 + return (EINVAL);
4558 + }
4559 + if (i_ddi_attach_pseudo_node("aoe") == 0) {
4560 + cmn_err(CE_WARN, "Failed to attach aoe driver");
4561 + return (ENODEV);
4562 + }
4563 + }
4564 +
4565 + if (iscsiboot_prop != NULL) {
4566 + if (modload("drv", "iscsi") == -1) {
4554 4567 cmn_err(CE_WARN, "Failed to load iscsi module");
4555 4568 iscsi_boot_prop_free();
4556 4569 return (EINVAL);
4557 - } else {
4558 - if (!i_ddi_attach_pseudo_node("iscsi")) {
4559 - cmn_err(CE_WARN,
4560 - "Failed to attach iscsi driver");
4561 - iscsi_boot_prop_free();
4562 - return (ENODEV);
4563 - }
4564 4570 }
4571 + if (i_ddi_attach_pseudo_node("iscsi") == 0) {
4572 + cmn_err(CE_WARN, "Failed to attach iscsi driver");
4573 + iscsi_boot_prop_free();
4574 + return (ENODEV);
4575 + }
4565 4576 }
4566 4577
4567 4578 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4568 4579 vfs_unrefvfssw(vsw);
4569 4580 rootdev = rootvfs->vfs_dev;
4570 4581
4571 - if (error)
4582 + if (error != 0)
4572 4583 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4573 4584 rootfs.bo_name, fstyp);
4574 4585 else
4575 4586 cmn_err(CE_CONT, "?root on %s fstype %s\n",
4576 4587 rootfs.bo_name, fstyp);
4577 4588 return (error);
4578 4589 }
4579 4590
4580 4591 /*
4581 4592 * XXX this is called by nfs only and should probably be removed
4582 4593 * If booted with ASKNAME, prompt on the console for a filesystem
4583 4594 * name and return it.
4584 4595 */
4585 4596 void
4586 4597 getfsname(char *askfor, char *name, size_t namelen)
4587 4598 {
4588 4599 if (boothowto & RB_ASKNAME) {
4589 4600 printf("%s name: ", askfor);
4590 4601 console_gets(name, namelen);
4591 4602 }
4592 4603 }
4593 4604
4594 4605 /*
4595 4606 * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4596 4607 * property.
4597 4608 *
4598 4609 * Filesystem types starting with the prefix "nfs" are diskless clients;
4599 4610 * init the root filename name (rootfs.bo_name), too.
4600 4611 *
4601 4612 * If we are booting via NFS we currently have these options:
4602 4613 * nfs - dynamically choose NFS V2, V3, or V4 (default)
4603 4614 * nfs2 - force NFS V2
4604 4615 * nfs3 - force NFS V3
4605 4616 * nfs4 - force NFS V4
4606 4617 * Because we need to maintain backward compatibility with the naming
4607 4618 * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4608 4619 * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs". The dynamic
4609 4620 * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4610 4621 * This is only for root filesystems, all other uses will expect
4611 4622 * that "nfs" == NFS V2.
4612 4623 */
4613 4624 static void
4614 4625 getrootfs(char **fstypp, char **fsmodp)
4615 4626 {
4616 4627 char *propstr = NULL;
4617 4628
4618 4629 /*
4619 4630 * Check fstype property; for diskless it should be one of "nfs",
4620 4631 * "nfs2", "nfs3" or "nfs4".
4621 4632 */
4622 4633 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4623 4634 DDI_PROP_DONTPASS, "fstype", &propstr)
4624 4635 == DDI_SUCCESS) {
4625 4636 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4626 4637 ddi_prop_free(propstr);
4627 4638
4628 4639 /*
4629 4640 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4630 4641 * assume the type of this root filesystem is 'zfs'.
4631 4642 */
4632 4643 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4633 4644 DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4634 4645 == DDI_SUCCESS) {
4635 4646 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4636 4647 ddi_prop_free(propstr);
4637 4648 }
4638 4649
4639 4650 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4640 4651 *fstypp = *fsmodp = rootfs.bo_fstype;
4641 4652 return;
4642 4653 }
4643 4654
4644 4655 ++netboot;
4645 4656
4646 4657 if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4647 4658 (void) strcpy(rootfs.bo_fstype, "nfs");
4648 4659 else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4649 4660 (void) strcpy(rootfs.bo_fstype, "nfsdyn");
4650 4661
4651 4662 /*
4652 4663 * check if path to network interface is specified in bootpath
4653 4664 * or by a hypervisor domain configuration file.
4654 4665 * XXPV - enable strlumb_get_netdev_path()
4655 4666 */
4656 4667 if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4657 4668 "xpv-nfsroot")) {
4658 4669 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4659 4670 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4660 4671 DDI_PROP_DONTPASS, "bootpath", &propstr)
4661 4672 == DDI_SUCCESS) {
4662 4673 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4663 4674 ddi_prop_free(propstr);
4664 4675 } else {
4665 4676 rootfs.bo_name[0] = '\0';
4666 4677 }
4667 4678 *fstypp = rootfs.bo_fstype;
4668 4679 *fsmodp = "nfs";
4669 4680 }
4670 4681 #endif
4671 4682
4672 4683 /*
4673 4684 * VFS feature routines
4674 4685 */
4675 4686
4676 4687 #define VFTINDEX(feature) (((feature) >> 32) & 0xFFFFFFFF)
4677 4688 #define VFTBITS(feature) ((feature) & 0xFFFFFFFFLL)
4678 4689
4679 4690 /* Register a feature in the vfs */
4680 4691 void
4681 4692 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4682 4693 {
4683 4694 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4684 4695 if (vfsp->vfs_implp == NULL)
4685 4696 return;
4686 4697
4687 4698 vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4688 4699 }
4689 4700
4690 4701 void
4691 4702 vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature)
4692 4703 {
4693 4704 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4694 4705 if (vfsp->vfs_implp == NULL)
4695 4706 return;
4696 4707 vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature);
4697 4708 }
4698 4709
4699 4710 /*
4700 4711 * Query a vfs for a feature.
4701 4712 * Returns 1 if feature is present, 0 if not
4702 4713 */
4703 4714 int
4704 4715 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4705 4716 {
4706 4717 int ret = 0;
4707 4718
4708 4719 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4709 4720 if (vfsp->vfs_implp == NULL)
4710 4721 return (ret);
4711 4722
4712 4723 if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4713 4724 ret = 1;
4714 4725
4715 4726 return (ret);
4716 4727 }
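The VFTINDEX and VFTBITS macros above split a 64-bit vfs_feature_t into an index into vfs_featureset[] (the upper 32 bits) and a bit mask within that word (the lower 32 bits). A small illustration using a made-up feature value rather than one of the real VFSFT_* constants:

	vfs_feature_t example = 0x100000004ULL;	/* hypothetical: word 1, bit 0x4 */

	ASSERT(VFTINDEX(example) == 1);
	ASSERT(VFTBITS(example) == 0x4);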
4717 4728
4718 4729 /*
4719 4730 * Propagate feature set from one vfs to another
4720 4731 */
4721 4732 void
4722 4733 vfs_propagate_features(vfs_t *from, vfs_t *to)
4723 4734 {
4724 4735 int i;
4725 4736
4726 4737 if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4727 4738 return;
4728 4739
4729 4740 for (i = 1; i <= to->vfs_featureset[0]; i++) {
4730 4741 to->vfs_featureset[i] = from->vfs_featureset[i];
4731 4742 }
4732 4743 }
4733 4744
4734 4745 #define LOFINODE_PATH "/dev/lofi/%d"
4735 4746
4736 4747 /*
4737 4748 * Return the vnode for the lofi node if there's a lofi mount in place.
4738 4749 * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4739 4750 * failure.
4740 4751 */
4741 4752 int
4742 4753 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4743 4754 {
4744 4755 char *path = NULL;
4745 4756 int strsize;
4746 4757 int err;
4747 4758
4748 4759 if (vfsp->vfs_lofi_id == 0) {
4749 4760 *vpp = NULL;
4750 4761 return (-1);
4751 4762 }
4752 4763
4753 4764 strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_id);
4754 4765 path = kmem_alloc(strsize + 1, KM_SLEEP);
4755 4766 (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_id);
4756 4767
4757 4768 /*
4758 4769 * We may be inside a zone, so we need to use the /dev path, but
4759 4770 * it's created asynchronously, so we wait here.
4760 4771 */
4761 4772 for (;;) {
4762 4773 err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4763 4774
4764 4775 if (err != ENOENT)
4765 4776 break;
4766 4777
4767 4778 if ((err = delay_sig(hz / 8)) == EINTR)
4768 4779 break;
4769 4780 }
4770 4781
4771 4782 if (err)
4772 4783 *vpp = NULL;
4773 4784
4774 4785 kmem_free(path, strsize + 1);
4775 4786 return (err);
4776 4787 }