1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2014 Integros [integros.com]
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 /* Portions Copyright 2010 Robert Milkowski */
29
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/kmem.h>
35 #include <sys/pathname.h>
36 #include <sys/vnode.h>
37 #include <sys/vfs.h>
38 #include <sys/vfs_opreg.h>
39 #include <sys/mntent.h>
40 #include <sys/mount.h>
41 #include <sys/cmn_err.h>
42 #include "fs/fs_subr.h"
43 #include <sys/zfs_znode.h>
44 #include <sys/zfs_dir.h>
45 #include <sys/zil.h>
46 #include <sys/fs/zfs.h>
47 #include <sys/dmu.h>
48 #include <sys/dsl_dir.h>
49 #include <sys/dsl_prop.h>
50 #include <sys/dsl_dataset.h>
51 #include <sys/dsl_deleg.h>
52 #include <sys/spa.h>
53 #include <sys/zap.h>
54 #include <sys/sa.h>
55 #include <sys/sa_impl.h>
56 #include <sys/varargs.h>
57 #include <sys/policy.h>
58 #include <sys/atomic.h>
59 #include <sys/mkdev.h>
60 #include <sys/modctl.h>
61 #include <sys/refstr.h>
62 #include <sys/zfs_ioctl.h>
63 #include <sys/zfs_ctldir.h>
64 #include <sys/zfs_fuid.h>
65 #include <sys/bootconf.h>
66 #include <sys/sunddi.h>
67 #include <sys/dnlc.h>
68 #include <sys/dmu_objset.h>
69 #include <sys/spa_boot.h>
70 #include "zfs_comutil.h"
71
/* Filesystem type id assigned when the module registers with the VFS. */
int zfsfstype;
/* Operations vector built from zfs_vfsops_template below. */
vfsops_t *zfs_vfsops = NULL;
/* Device-number allocation state; protected by zfs_dev_mtx. */
static major_t zfs_major;
static minor_t zfs_minor;
static kmutex_t zfs_dev_mtx;

extern int sys_shutdown;

/* VFS entry points implemented in this file. */
static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
static void zfs_freevfs(vfs_t *vfsp);

/* Table mapping VFS operation names to the entry points above. */
static const fs_operation_def_t zfs_vfsops_template[] = {
	VFSNAME_MOUNT,		{ .vfs_mount = zfs_mount },
	VFSNAME_MOUNTROOT,	{ .vfs_mountroot = zfs_mountroot },
	VFSNAME_UNMOUNT,	{ .vfs_unmount = zfs_umount },
	VFSNAME_ROOT,		{ .vfs_root = zfs_root },
	VFSNAME_STATVFS,	{ .vfs_statvfs = zfs_statvfs },
	VFSNAME_SYNC,		{ .vfs_sync = zfs_sync },
	VFSNAME_VGET,		{ .vfs_vget = zfs_vget },
	VFSNAME_FREEVFS,	{ .vfs_freevfs = zfs_freevfs },
	NULL,			NULL
};
99
/*
 * We need to keep a count of active fs's.
 * This is necessary to prevent our module
 * from being unloaded after a umount -f
 */
static uint32_t zfs_active_fs_count = 0;

/* Cancellation lists: setting an option clears its opposite. */
static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };

/*
 * MO_DEFAULT is not used since the default value is determined
 * by the equivalent property.
 */
static mntopt_t mntopts[] = {
	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
};

/* Mount-option table for this filesystem type. */
static mntopts_t zfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	mntopts
};
127
128 /*ARGSUSED*/
129 int
130 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
131 {
132 /*
133 * Data integrity is job one. We don't want a compromised kernel
134 * writing to the storage pool, so we never sync during panic.
135 */
136 if (panicstr)
137 return (0);
138
139 /*
140 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
141 * to sync metadata, which they would otherwise cache indefinitely.
142 * Semantically, the only requirement is that the sync be initiated.
143 * The DMU syncs out txgs frequently, so there's nothing to do.
144 */
145 if (flag & SYNC_ATTR)
146 return (0);
147
148 if (vfsp != NULL) {
149 /*
150 * Sync a specific filesystem.
151 */
152 zfsvfs_t *zfsvfs = vfsp->vfs_data;
153 dsl_pool_t *dp;
154
155 ZFS_ENTER(zfsvfs);
156 dp = dmu_objset_pool(zfsvfs->z_os);
157
158 /*
159 * If the system is shutting down, then skip any
160 * filesystems which may exist on a suspended pool.
161 */
162 if (sys_shutdown && spa_suspended(dp->dp_spa)) {
163 ZFS_EXIT(zfsvfs);
164 return (0);
165 }
166
167 if (zfsvfs->z_log != NULL)
168 zil_commit(zfsvfs->z_log, 0);
169
170 ZFS_EXIT(zfsvfs);
171 } else {
172 /*
173 * Sync all ZFS filesystems. This is what happens when you
174 * run sync(1M). Unlike other filesystems, ZFS honors the
175 * request by waiting for all pools to commit all dirty data.
176 */
177 spa_sync_allpools();
178 }
179
180 return (0);
181 }
182
/*
 * Allocate a unique (major, minor) device number for a new mount.
 *
 * Scans the minor-number space under zfs_dev_mtx, skipping minors that
 * already back a mounted filesystem.  If the current major's entire
 * minor space is exhausted, obtains a fresh major from getudev() and
 * restarts the scan.
 *
 * Returns 0 with *dev set on success, or -1 if getudev() could not
 * provide a new major number.
 */
static int
zfs_create_unique_device(dev_t *dev)
{
	major_t new_major;

	do {
		ASSERT3U(zfs_minor, <=, MAXMIN32);
		minor_t start = zfs_minor;	/* detects a full wrap-around */
		do {
			mutex_enter(&zfs_dev_mtx);
			if (zfs_minor >= MAXMIN32) {
				/*
				 * If we're still using the real major
				 * keep out of /dev/zfs and /dev/zvol minor
				 * number space.  If we're using a getudev()'ed
				 * major number, we can use all of its minors.
				 */
				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
					zfs_minor = ZFS_MIN_MINOR;
				else
					zfs_minor = 0;
			} else {
				zfs_minor++;
			}
			*dev = makedevice(zfs_major, zfs_minor);
			mutex_exit(&zfs_dev_mtx);
		} while (vfs_devismounted(*dev) && zfs_minor != start);
		if (zfs_minor == start) {
			/*
			 * We are using all ~262,000 minor numbers for the
			 * current major number.  Create a new major number.
			 */
			if ((new_major = getudev()) == (major_t)-1) {
				cmn_err(CE_WARN,
				    "zfs_mount: Can't get unique major "
				    "device number.");
				return (-1);
			}
			mutex_enter(&zfs_dev_mtx);
			zfs_major = new_major;
			zfs_minor = 0;

			mutex_exit(&zfs_dev_mtx);
		} else {
			break;
		}
		/* CONSTANTCONDITION */
	} while (1);

	return (0);
}
234
235 static void
236 atime_changed_cb(void *arg, uint64_t newval)
237 {
238 zfsvfs_t *zfsvfs = arg;
239
240 if (newval == TRUE) {
241 zfsvfs->z_atime = TRUE;
242 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
243 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
244 } else {
245 zfsvfs->z_atime = FALSE;
246 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
247 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
248 }
249 }
250
251 static void
252 xattr_changed_cb(void *arg, uint64_t newval)
253 {
254 zfsvfs_t *zfsvfs = arg;
255
256 if (newval == TRUE) {
257 /* XXX locking on vfs_flag? */
258 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
259 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
260 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
261 } else {
262 /* XXX locking on vfs_flag? */
263 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
264 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
265 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
266 }
267 }
268
269 static void
270 blksz_changed_cb(void *arg, uint64_t newval)
271 {
272 zfsvfs_t *zfsvfs = arg;
273 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
274 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
275 ASSERT(ISP2(newval));
276
277 zfsvfs->z_max_blksz = newval;
278 zfsvfs->z_vfs->vfs_bsize = newval;
279 }
280
281 static void
282 readonly_changed_cb(void *arg, uint64_t newval)
283 {
284 zfsvfs_t *zfsvfs = arg;
285
286 if (newval) {
287 /* XXX locking on vfs_flag? */
288 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
289 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
290 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
291 } else {
292 /* XXX locking on vfs_flag? */
293 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
294 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
295 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
296 }
297 }
298
299 static void
300 devices_changed_cb(void *arg, uint64_t newval)
301 {
302 zfsvfs_t *zfsvfs = arg;
303
304 if (newval == FALSE) {
305 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
306 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
307 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
308 } else {
309 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
310 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
311 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
312 }
313 }
314
315 static void
316 setuid_changed_cb(void *arg, uint64_t newval)
317 {
318 zfsvfs_t *zfsvfs = arg;
319
320 if (newval == FALSE) {
321 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
322 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
323 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
324 } else {
325 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
326 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
327 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
328 }
329 }
330
331 static void
332 exec_changed_cb(void *arg, uint64_t newval)
333 {
334 zfsvfs_t *zfsvfs = arg;
335
336 if (newval == FALSE) {
337 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
338 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
339 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
340 } else {
341 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
342 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
343 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
344 }
345 }
346
347 /*
348 * The nbmand mount option can be changed at mount time.
349 * We can't allow it to be toggled on live file systems or incorrect
350 * behavior may be seen from cifs clients
351 *
352 * This property isn't registered via dsl_prop_register(), but this callback
353 * will be called when a file system is first mounted
354 */
355 static void
356 nbmand_changed_cb(void *arg, uint64_t newval)
357 {
358 zfsvfs_t *zfsvfs = arg;
359 if (newval == FALSE) {
360 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
361 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
362 } else {
363 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
364 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
365 }
366 }
367
368 static void
369 snapdir_changed_cb(void *arg, uint64_t newval)
370 {
371 zfsvfs_t *zfsvfs = arg;
372
373 zfsvfs->z_show_ctldir = newval;
374 }
375
376 static void
377 vscan_changed_cb(void *arg, uint64_t newval)
378 {
379 zfsvfs_t *zfsvfs = arg;
380
381 zfsvfs->z_vscan = newval;
382 }
383
384 static void
385 acl_mode_changed_cb(void *arg, uint64_t newval)
386 {
387 zfsvfs_t *zfsvfs = arg;
388
389 zfsvfs->z_acl_mode = newval;
390 }
391
392 static void
393 acl_inherit_changed_cb(void *arg, uint64_t newval)
394 {
395 zfsvfs_t *zfsvfs = arg;
396
397 zfsvfs->z_acl_inherit = newval;
398 }
399
400 static void
401 rate_changed_cb(void *arg, uint64_t newval)
402 {
403 zfsvfs_t *zfsvfs = arg;
404
405 if (newval == UINT64_MAX)
406 newval = 0;
407 zfsvfs->z_rate.rate_cap = newval;
408 }
409
410 static int
411 zfs_register_callbacks(vfs_t *vfsp)
412 {
413 struct dsl_dataset *ds = NULL;
414 objset_t *os = NULL;
415 zfsvfs_t *zfsvfs = NULL;
416 uint64_t nbmand;
417 boolean_t readonly = B_FALSE;
418 boolean_t do_readonly = B_FALSE;
419 boolean_t setuid = B_FALSE;
420 boolean_t do_setuid = B_FALSE;
421 boolean_t exec = B_FALSE;
422 boolean_t do_exec = B_FALSE;
423 boolean_t devices = B_FALSE;
424 boolean_t do_devices = B_FALSE;
425 boolean_t xattr = B_FALSE;
426 boolean_t do_xattr = B_FALSE;
427 boolean_t atime = B_FALSE;
428 boolean_t do_atime = B_FALSE;
429 int error = 0;
430
431 ASSERT(vfsp);
432 zfsvfs = vfsp->vfs_data;
433 ASSERT(zfsvfs);
434 os = zfsvfs->z_os;
435
436 /*
437 * The act of registering our callbacks will destroy any mount
438 * options we may have. In order to enable temporary overrides
439 * of mount options, we stash away the current values and
440 * restore them after we register the callbacks.
441 */
442 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
443 !spa_writeable(dmu_objset_spa(os))) {
444 readonly = B_TRUE;
445 do_readonly = B_TRUE;
446 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
447 readonly = B_FALSE;
448 do_readonly = B_TRUE;
449 }
450 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
451 devices = B_FALSE;
452 setuid = B_FALSE;
453 do_devices = B_TRUE;
454 do_setuid = B_TRUE;
455 } else {
456 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
457 devices = B_FALSE;
458 do_devices = B_TRUE;
459 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
460 devices = B_TRUE;
461 do_devices = B_TRUE;
462 }
463
464 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
465 setuid = B_FALSE;
466 do_setuid = B_TRUE;
467 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
468 setuid = B_TRUE;
469 do_setuid = B_TRUE;
470 }
471 }
472 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
473 exec = B_FALSE;
474 do_exec = B_TRUE;
475 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
476 exec = B_TRUE;
477 do_exec = B_TRUE;
478 }
479 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
480 xattr = B_FALSE;
481 do_xattr = B_TRUE;
482 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
483 xattr = B_TRUE;
484 do_xattr = B_TRUE;
485 }
486 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
487 atime = B_FALSE;
488 do_atime = B_TRUE;
489 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
490 atime = B_TRUE;
491 do_atime = B_TRUE;
492 }
493
494 /*
495 * nbmand is a special property. It can only be changed at
496 * mount time.
497 *
498 * This is weird, but it is documented to only be changeable
499 * at mount time.
500 */
501 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
502 nbmand = B_FALSE;
503 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
504 nbmand = B_TRUE;
505 } else {
506 char osname[ZFS_MAX_DATASET_NAME_LEN];
507
508 dmu_objset_name(os, osname);
509 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
510 NULL)) {
511 return (error);
512 }
513 }
514
515 /*
516 * Register property callbacks.
517 *
518 * It would probably be fine to just check for i/o error from
519 * the first prop_register(), but I guess I like to go
520 * overboard...
521 */
522 ds = dmu_objset_ds(os);
523 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
524 error = dsl_prop_register(ds,
525 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
526 error = error ? error : dsl_prop_register(ds,
527 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
528 error = error ? error : dsl_prop_register(ds,
529 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
530 error = error ? error : dsl_prop_register(ds,
531 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
532 error = error ? error : dsl_prop_register(ds,
533 zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
534 error = error ? error : dsl_prop_register(ds,
535 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
536 error = error ? error : dsl_prop_register(ds,
537 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
538 error = error ? error : dsl_prop_register(ds,
539 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
540 error = error ? error : dsl_prop_register(ds,
541 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
542 error = error ? error : dsl_prop_register(ds,
543 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
544 zfsvfs);
545 error = error ? error : dsl_prop_register(ds,
546 zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
547 error = error ? error : dsl_prop_register(ds,
548 zfs_prop_to_name(ZFS_PROP_RATE_LIMIT), rate_changed_cb, zfsvfs);
549
550 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
551 if (error)
552 goto unregister;
553
554 /*
555 * Invoke our callbacks to restore temporary mount options.
556 */
557 if (do_readonly)
558 readonly_changed_cb(zfsvfs, readonly);
559 if (do_setuid)
560 setuid_changed_cb(zfsvfs, setuid);
561 if (do_exec)
562 exec_changed_cb(zfsvfs, exec);
563 if (do_devices)
564 devices_changed_cb(zfsvfs, devices);
565 if (do_xattr)
566 xattr_changed_cb(zfsvfs, xattr);
567 if (do_atime)
568 atime_changed_cb(zfsvfs, atime);
569
570 nbmand_changed_cb(zfsvfs, nbmand);
571
572 return (0);
573
574 unregister:
575 dsl_prop_unregister_all(ds, zfsvfs);
576 return (error);
577 }
578
/*
 * DMU callback: given an object's bonus buffer, extract the user and
 * group ids that space accounting should be charged to.
 *
 * Returns ENOENT for untracked bonus types, EEXIST when data is NULL
 * (telling the dmu to keep the previously known ids), and 0 with
 * *userp / *groupp filled in otherwise.
 */
static int
zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
    uint64_t *userp, uint64_t *groupp)
{
	/*
	 * Is it a valid type of object to track?
	 */
	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
		return (SET_ERROR(ENOENT));

	/*
	 * If we have a NULL data pointer
	 * then assume the id's aren't changing and
	 * return EEXIST to the dmu to let it know to
	 * use the same ids
	 */
	if (data == NULL)
		return (SET_ERROR(EEXIST));

	if (bonustype == DMU_OT_ZNODE) {
		/* DMU_OT_ZNODE bonus: ids live directly in znode_phys_t. */
		znode_phys_t *znp = data;
		*userp = znp->zp_uid;
		*groupp = znp->zp_gid;
	} else {
		int hdrsize;
		sa_hdr_phys_t *sap = data;
		sa_hdr_phys_t sa = *sap;	/* local copy for byteswap */
		boolean_t swap = B_FALSE;

		ASSERT(bonustype == DMU_OT_SA);

		if (sa.sa_magic == 0) {
			/*
			 * This should only happen for newly created
			 * files that haven't had the znode data filled
			 * in yet.
			 */
			*userp = 0;
			*groupp = 0;
			return (0);
		}
		if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
			/* Bonus buffer written in the opposite byte order. */
			sa.sa_magic = SA_MAGIC;
			sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
			swap = B_TRUE;
		} else {
			VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
		}

		hdrsize = sa_hdrsize(&sa);
		VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
		/* uid/gid sit at fixed offsets past the SA header. */
		*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
		    SA_UID_OFFSET));
		*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
		    SA_GID_OFFSET));
		if (swap) {
			*userp = BSWAP_64(*userp);
			*groupp = BSWAP_64(*groupp);
		}
	}
	return (0);
}
641
642 static void
643 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
644 char *domainbuf, int buflen, uid_t *ridp)
645 {
646 uint64_t fuid;
647 const char *domain;
648
649 fuid = zfs_strtonum(fuidstr, NULL);
650
651 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
652 if (domain)
653 (void) strlcpy(domainbuf, domain, buflen);
654 else
655 domainbuf[0] = '\0';
656 *ridp = FUID_RID(fuid);
657 }
658
659 static uint64_t
660 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
661 {
662 switch (type) {
663 case ZFS_PROP_USERUSED:
664 return (DMU_USERUSED_OBJECT);
665 case ZFS_PROP_GROUPUSED:
666 return (DMU_GROUPUSED_OBJECT);
667 case ZFS_PROP_USERQUOTA:
668 return (zfsvfs->z_userquota_obj);
669 case ZFS_PROP_GROUPQUOTA:
670 return (zfsvfs->z_groupquota_obj);
671 }
672 return (0);
673 }
674
/*
 * Iterate the ZAP backing a userquota property, copying one
 * zfs_useracct_t per entry into vbuf until the buffer fills or the ZAP
 * is exhausted.
 *
 * *cookiep is a resumable zap cursor position, updated on return.
 * *bufsizep holds the buffer capacity on entry and the bytes written
 * on exit.  Returns ENOTSUP when userspace accounting is not present.
 */
int
zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
{
	int error;
	zap_cursor_t zc;
	zap_attribute_t za;
	zfs_useracct_t *buf = vbuf;
	uint64_t obj;

	if (!dmu_objset_userspace_present(zfsvfs->z_os))
		return (SET_ERROR(ENOTSUP));

	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
	if (obj == 0) {
		/* No backing object: report an empty result set. */
		*bufsizep = 0;
		return (0);
	}

	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		/* Stop before another record would overflow the buffer. */
		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
		    *bufsizep)
			break;

		/* za_name is a fuid string; decode into domain + rid. */
		fuidstr_to_sid(zfsvfs, za.za_name,
		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);

		buf->zu_space = za.za_first_integer;
		buf++;
	}
	if (error == ENOENT)
		error = 0;	/* end of ZAP, not a failure */

	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
	*cookiep = zap_cursor_serialize(&zc);
	zap_cursor_fini(&zc);
	return (error);
}
716
717 /*
718 * buf must be big enough (eg, 32 bytes)
719 */
720 static int
721 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
722 char *buf, boolean_t addok)
723 {
724 uint64_t fuid;
725 int domainid = 0;
726
727 if (domain && domain[0]) {
728 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
729 if (domainid == -1)
730 return (SET_ERROR(ENOENT));
731 }
732 fuid = FUID_ENCODE(domainid, rid);
733 (void) sprintf(buf, "%llx", (longlong_t)fuid);
734 return (0);
735 }
736
737 int
738 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
739 const char *domain, uint64_t rid, uint64_t *valp)
740 {
741 char buf[32];
742 int err;
743 uint64_t obj;
744
745 *valp = 0;
746
747 if (!dmu_objset_userspace_present(zfsvfs->z_os))
748 return (SET_ERROR(ENOTSUP));
749
750 obj = zfs_userquota_prop_to_obj(zfsvfs, type);
751 if (obj == 0)
752 return (0);
753
754 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
755 if (err)
756 return (err);
757
758 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
759 if (err == ENOENT)
760 err = 0;
761 return (err);
762 }
763
/*
 * Set (or, when quota == 0, remove) a user/group quota entry.
 *
 * The quota ZAP object is created on first use and linked under the
 * master node.  The update runs in its own transaction; dirty FUID
 * table state is synced in the same tx.
 */
int
zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    const char *domain, uint64_t rid, uint64_t quota)
{
	char buf[32];
	int err;
	dmu_tx_t *tx;
	uint64_t *objp;
	boolean_t fuid_dirtied;

	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
		return (SET_ERROR(EINVAL));

	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
		return (SET_ERROR(ENOTSUP));

	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
	    &zfsvfs->z_groupquota_obj;

	/* addok: the domain may need to be added to the FUID table. */
	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
	if (err)
		return (err);
	fuid_dirtied = zfsvfs->z_fuid_dirty;

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
	if (*objp == 0) {
		/* Also need to link the new quota object by name. */
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
		    zfs_userquota_prop_prefixes[type]);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	/* Create the quota object on first use, serialized by z_lock. */
	mutex_enter(&zfsvfs->z_lock);
	if (*objp == 0) {
		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
		    DMU_OT_NONE, 0, tx);
		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
	}
	mutex_exit(&zfsvfs->z_lock);

	if (quota == 0) {
		/* quota == 0 means "no quota": remove any existing entry. */
		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
		if (err == ENOENT)
			err = 0;
	} else {
		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
	}
	ASSERT(err == 0);
	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);
	dmu_tx_commit(tx);
	return (err);
}
824
825 boolean_t
826 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
827 {
828 char buf[32];
829 uint64_t used, quota, usedobj, quotaobj;
830 int err;
831
832 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
833 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
834
835 if (quotaobj == 0 || zfsvfs->z_replay)
836 return (B_FALSE);
837
838 (void) sprintf(buf, "%llx", (longlong_t)fuid);
839 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a);
840 if (err != 0)
841 return (B_FALSE);
842
843 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
844 if (err != 0)
845 return (B_FALSE);
846 return (used >= quota);
847 }
848
849 boolean_t
850 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
851 {
852 uint64_t fuid;
853 uint64_t quotaobj;
854
855 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
856
857 fuid = isgroup ? zp->z_gid : zp->z_uid;
858
859 if (quotaobj == 0 || zfsvfs->z_replay)
860 return (B_FALSE);
861
862 return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
863 }
864
/*
 * Associate this zfsvfs with the given objset, which must be owned.
 * This will cache a bunch of on-disk state from the objset in the
 * zfsvfs: the ZPL version and normalization properties, the SA
 * attribute table, and the object numbers of the root directory,
 * unlinked set, quota objects, FUID table, and shares directory.
 * Optional objects that are absent (ENOENT) are recorded as 0 rather
 * than failing; any other error is returned.
 */
static int
zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
{
	int error;
	uint64_t val;

	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
	zfsvfs->z_os = os;

	/* The dataset's ZPL version must be supported by this pool. */
	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
	if (error != 0)
		return (error);
	if (zfsvfs->z_version >
	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
		(void) printf("Can't mount a version %lld file system "
		    "on a version %lld pool\n. Pool must be upgraded to mount "
		    "this file system.", (u_longlong_t)zfsvfs->z_version,
		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
		return (SET_ERROR(ENOTSUP));
	}
	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_norm = (int)val;

	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_utf8 = (val != 0);

	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_case = (uint_t)val;

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
	    zfsvfs->z_case == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);

	uint64_t sa_obj = 0;
	if (zfsvfs->z_use_sa) {
		/* should either have both of these objects or none */
		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
		    &sa_obj);
		if (error != 0)
			return (error);
	}

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);
	if (error != 0)
		return (error);

	if (zfsvfs->z_version >= ZPL_VERSION_SA)
		sa_register_update_callback(os, zfs_sa_upgrade);

	/* Root directory and unlinked set are mandatory. */
	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
	    &zfsvfs->z_root);
	if (error != 0)
		return (error);
	ASSERT(zfsvfs->z_root != 0);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
	    &zfsvfs->z_unlinkedobj);
	if (error != 0)
		return (error);

	/* The remaining objects are optional; ENOENT reads as 0. */
	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
	    8, 1, &zfsvfs->z_userquota_obj);
	if (error == ENOENT)
		zfsvfs->z_userquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
	    8, 1, &zfsvfs->z_groupquota_obj);
	if (error == ENOENT)
		zfsvfs->z_groupquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
	    &zfsvfs->z_fuid_obj);
	if (error == ENOENT)
		zfsvfs->z_fuid_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
	    &zfsvfs->z_shares_dir);
	if (error == ENOENT)
		zfsvfs->z_shares_dir = 0;
	else if (error != 0)
		return (error);

	return (0);
}
977
978 int
979 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
980 {
981 objset_t *os;
982 zfsvfs_t *zfsvfs;
983 int error;
984
985 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
986
987 /*
988 * We claim to always be readonly so we can open snapshots;
989 * other ZPL code will prevent us from writing to snapshots.
990 */
991
992 error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
993 if (error != 0) {
994 kmem_free(zfsvfs, sizeof (zfsvfs_t));
995 return (error);
996 }
997
998 error = zfsvfs_create_impl(zfvp, zfsvfs, os);
999 if (error != 0) {
1000 dmu_objset_disown(os, zfsvfs);
1001 }
1002 return (error);
1003 }
1004
1005
1006 int
1007 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
1008 {
1009 int error;
1010 int size = spa_get_obj_mtx_sz(dmu_objset_spa(os));
1011
1012 zfsvfs->z_vfs = NULL;
1013 zfsvfs->z_parent = zfsvfs;
1014
1015 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1016 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1017 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1018 offsetof(znode_t, z_link_node));
1019 rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
1020 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
1021 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1022 zfsvfs->z_hold_mtx_sz = size;
1023 zfsvfs->z_hold_mtx = kmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
1024 for (int i = 0; i != size; i++)
1025 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1026 mutex_init(&zfsvfs->z_drain_lock, NULL, MUTEX_DEFAULT, NULL);
1027 cv_init(&zfsvfs->z_drain_cv, NULL, CV_DEFAULT, NULL);
1028
1029 error = zfsvfs_init(zfsvfs, os);
1030 if (error != 0) {
1031 *zfvp = NULL;
1032 kmem_free(zfsvfs->z_hold_mtx, sizeof (kmutex_t) * size);
1033 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1034 return (error);
1035 }
1036
1037 *zfvp = zfsvfs;
1038 return (0);
1039 }
1040
/*
 * Finish setting up a zfsvfs once its vfs is attached: register the
 * property callbacks, open the ZIL and, when mounting, drain the
 * unlinked set and replay (or destroy) the intent log before exposing
 * the objset's user pointer.
 *
 * mounting is B_FALSE for an online recv re-setup, in which case log
 * replay is skipped (see comment below).
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly)
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		else {
			zfs_unlinked_drain(zfsvfs);
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain(). (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.) This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg. This would write a "create
		 * object N" record to the intent log. Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk. So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				/* Replay disabled: discard the log. */
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/* z_replay gates replay-only code paths. */
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
			}
		}

		/* restore readonly bit */
		if (readonly)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}
1123
/*
 * Free a zfsvfs_t and all synchronization primitives embedded in it.
 * The caller must already have torn down the filesystem (no znodes may
 * remain) and disowned or never owned the objset.
 */
void
zfsvfs_free(zfsvfs_t *zfsvfs)
{
	int i;
	extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */

	/*
	 * This is a barrier to prevent the filesystem from going away in
	 * zfs_znode_move() until we can safely ensure that the filesystem is
	 * not unmounted. We consider the filesystem valid before the barrier
	 * and invalid after the barrier.
	 */
	rw_enter(&zfsvfs_lock, RW_READER);
	rw_exit(&zfsvfs_lock);

	/* No znodes may still be in the process of being freed. */
	VERIFY0(zfsvfs->z_znodes_freeing_cnt);

	zfs_fuid_destroy(zfsvfs);

	/* Tear down locks, condvars, and lists, then the hold-mutex array. */
	cv_destroy(&zfsvfs->z_drain_cv);
	mutex_destroy(&zfsvfs->z_drain_lock);
	mutex_destroy(&zfsvfs->z_znodes_lock);
	mutex_destroy(&zfsvfs->z_lock);
	list_destroy(&zfsvfs->z_all_znodes);
	rrm_destroy(&zfsvfs->z_teardown_lock);
	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
	rw_destroy(&zfsvfs->z_fuid_lock);
	for (i = 0; i != zfsvfs->z_hold_mtx_sz; i++)
		mutex_destroy(&zfsvfs->z_hold_mtx[i]);

	kmem_free(zfsvfs->z_hold_mtx,
	    sizeof (kmutex_t) * zfsvfs->z_hold_mtx_sz);
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
1158
1159 static void
1160 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1161 {
1162 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1163 if (zfsvfs->z_vfs) {
1164 if (zfsvfs->z_use_fuids) {
1165 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1166 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1167 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1168 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1169 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1170 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1171 } else {
1172 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1173 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1174 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1175 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1176 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1177 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1178 }
1179 }
1180 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1181 }
1182
/*
 * Do the real work of mounting dataset 'osname' on 'vfsp': create the
 * zfsvfs_t, fill in the generic vfs fields (device number, fsid, block
 * size, features), and finish setup -- the abbreviated read-only path
 * for snapshots, or zfsvfs_setup() for regular filesystems.  On error
 * the objset is disowned and the zfsvfs freed before returning.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	dev_t mount_dev;
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;
	char worminfo[13] = {0};	/* buffer for the "nms:worm" property */

	ASSERT(vfsp);
	ASSERT(osname);

	error = zfsvfs_create(osname, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	/* Initialize the generic filesystem structure. */
	vfsp->vfs_bcount = 0;
	vfsp->vfs_data = NULL;

	/* Allocate a device number no other mount is using. */
	if (zfs_create_unique_device(&mount_dev) == -1) {
		error = SET_ERROR(ENODEV);
		goto out;
	}
	ASSERT(vfs_devismounted(mount_dev) == 0);

	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
	    NULL))
		goto out;

	/*
	 * NOTE(review): "nms:worm" appears to be a vendor (Nexenta) WORM
	 * property; any value other than unset, "0", "off", or "-" marks
	 * the filesystem as WORM -- confirm against the z_isworm consumers.
	 */
	if (dsl_prop_get(osname, "nms:worm", 1, 12, &worminfo, NULL) == 0 &&
	    worminfo[0] && strcmp(worminfo, "0") != 0 &&
	    strcmp(worminfo, "off") != 0 && strcmp(worminfo, "-") != 0) {
		zfsvfs->z_isworm = B_TRUE;
	} else {
		zfsvfs->z_isworm = B_FALSE;
	}

	vfsp->vfs_dev = mount_dev;
	vfsp->vfs_fstype = zfsfstype;
	vfsp->vfs_bsize = recordsize;
	vfsp->vfs_flag |= VFS_NOTRUNC;
	vfsp->vfs_data = zfsvfs;

	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID. The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
	    zfsfstype & 0xFF;

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);
	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
	}
	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);

	/*
	 * Snapshots mount read-only with atime off and sync disabled,
	 * skipping the full zfsvfs_setup() path.
	 */
	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		error = zfsvfs_setup(zfsvfs, B_TRUE);
	}

	/* Regular filesystems get a '.zfs' control directory. */
	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	if (error) {
		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}
1286
1287 void
1288 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1289 {
1290 objset_t *os = zfsvfs->z_os;
1291
1292 if (!dmu_objset_is_snapshot(os))
1293 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1294 }
1295
1296 /*
1297 * Convert a decimal digit string to a uint64_t integer.
1298 */
1299 static int
1300 str_to_uint64(char *str, uint64_t *objnum)
1301 {
1302 uint64_t num = 0;
1303
1304 while (*str) {
1305 if (*str < '0' || *str > '9')
1306 return (SET_ERROR(EINVAL));
1307
1308 num = num*10 + *str++ - '0';
1309 }
1310
1311 *objnum = num;
1312 return (0);
1313 }
1314
1315 /*
1316 * The boot path passed from the boot loader is in the form of
1317 * "rootpool-name/root-filesystem-object-number'. Convert this
1318 * string to a dataset name: "rootpool-name/root-filesystem-name".
1319 */
1320 static int
1321 zfs_parse_bootfs(char *bpath, char *outpath)
1322 {
1323 char *slashp;
1324 uint64_t objnum;
1325 int error;
1326
1327 if (*bpath == 0 || *bpath == '/')
1328 return (SET_ERROR(EINVAL));
1329
1330 (void) strcpy(outpath, bpath);
1331
1332 slashp = strchr(bpath, '/');
1333
1334 /* if no '/', just return the pool name */
1335 if (slashp == NULL) {
1336 return (0);
1337 }
1338
1339 /* if not a number, just return the root dataset name */
1340 if (str_to_uint64(slashp+1, &objnum)) {
1341 return (0);
1342 }
1343
1344 *slashp = '\0';
1345 error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
1346 *slashp = '/';
1347
1348 return (error);
1349 }
1350
1351 /*
1352 * Check that the hex label string is appropriate for the dataset being
1353 * mounted into the global_zone proper.
1354 *
1355 * Return an error if the hex label string is not default or
1356 * admin_low/admin_high. For admin_low labels, the corresponding
1357 * dataset must be readonly.
1358 */
1359 int
1360 zfs_check_global_label(const char *dsname, const char *hexsl)
1361 {
1362 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1363 return (0);
1364 if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
1365 return (0);
1366 if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
1367 /* must be readonly */
1368 uint64_t rdonly;
1369
1370 if (dsl_prop_get_integer(dsname,
1371 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
1372 return (SET_ERROR(EACCES));
1373 return (rdonly ? 0 : EACCES);
1374 }
1375 return (SET_ERROR(EACCES));
1376 }
1377
1378 /*
1379 * Determine whether the mount is allowed according to MAC check.
1380 * by comparing (where appropriate) label of the dataset against
1381 * the label of the zone being mounted into. If the dataset has
1382 * no label, create one.
1383 *
1384 * Returns 0 if access allowed, error otherwise (e.g. EACCES)
1385 */
static int
zfs_mount_label_policy(vfs_t *vfsp, char *osname)
{
	int error, retv;
	zone_t *mntzone = NULL;
	ts_label_t *mnt_tsl;
	bslabel_t *mnt_sl;
	bslabel_t ds_sl;
	char ds_hexsl[MAXNAMELEN];

	retv = EACCES;				/* assume the worst */

	/*
	 * Start by getting the dataset label if it exists.
	 */
	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
	if (error)
		return (SET_ERROR(EACCES));

	/*
	 * If labeling is NOT enabled, then disallow the mount of datasets
	 * which have a non-default label already. No other label checks
	 * are needed.
	 */
	if (!is_system_labeled()) {
		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
			return (0);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Get the label of the mountpoint. If mounting into the global
	 * zone (i.e. mountpoint is not within an active zone and the
	 * zoned property is off), the label must be default or
	 * admin_low/admin_high only; no other checks are needed.
	 */
	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
	if (mntzone->zone_id == GLOBAL_ZONEID) {
		uint64_t zoned;

		zone_rele(mntzone);

		if (dsl_prop_get_integer(osname,
		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
			return (SET_ERROR(EACCES));
		if (!zoned)
			return (zfs_check_global_label(osname, ds_hexsl));
		else
			/*
			 * This is the case of a zone dataset being mounted
			 * initially, before the zone has been fully created;
			 * allow this mount into global zone.
			 */
			return (0);
	}

	/* Mounting into a non-global zone: compare against its label. */
	mnt_tsl = mntzone->zone_slabel;
	ASSERT(mnt_tsl != NULL);
	label_hold(mnt_tsl);
	mnt_sl = label2bslabel(mnt_tsl);

	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
		/*
		 * The dataset doesn't have a real label, so fabricate one.
		 */
		char *str = NULL;

		if (l_to_str_internal(mnt_sl, &str) == 0 &&
		    dsl_prop_set_string(osname,
		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
		    ZPROP_SRC_LOCAL, str) == 0)
			retv = 0;
		if (str != NULL)
			kmem_free(str, strlen(str) + 1);
	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
		/*
		 * Now compare labels to complete the MAC check. If the
		 * labels are equal then allow access. If the mountpoint
		 * label dominates the dataset label, allow readonly access.
		 * Otherwise, access is denied.
		 */
		if (blequal(mnt_sl, &ds_sl))
			retv = 0;
		else if (bldominates(mnt_sl, &ds_sl)) {
			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
			retv = 0;
		}
	}

	label_rele(mnt_tsl);
	zone_rele(mntzone);
	return (retv);
}
1480
/*
 * Handle the root filesystem: first mount at boot (why == ROOT_INIT),
 * read-write remount (ROOT_REMOUNT), and shutdown sync (ROOT_UNMOUNT).
 * The dataset to mount comes from the "zfs-bootfs" boot property.
 */
static int
zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
{
	int error = 0;
	static int zfsrootdone = 0;
	zfsvfs_t *zfsvfs = NULL;
	znode_t *zp = NULL;
	vnode_t *vp = NULL;
	char *zfs_bootfs;
	char *zfs_devid;

	ASSERT(vfsp);

	/*
	 * The filesystem that we mount as root is defined in the
	 * boot property "zfs-bootfs" with a format of
	 * "poolname/root-dataset-objnum".
	 */
	if (why == ROOT_INIT) {
		/* The root filesystem is only ever mounted once. */
		if (zfsrootdone++)
			return (SET_ERROR(EBUSY));
		/*
		 * the process of doing a spa_load will require the
		 * clock to be set before we could (for example) do
		 * something better by looking at the timestamp on
		 * an uberblock, so just set it to -1.
		 */
		clkset(-1);

		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
			    "bootfs name");
			return (SET_ERROR(EINVAL));
		}
		zfs_devid = spa_get_bootprop("diskdevid");
		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
		if (zfs_devid)
			spa_free_bootprop(zfs_devid);
		if (error) {
			spa_free_bootprop(zfs_bootfs);
			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
			    error);
			return (error);
		}
		/* Translate "pool/objnum" into "pool/dataset-name". */
		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
			spa_free_bootprop(zfs_bootfs);
			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
			    error);
			return (error);
		}

		spa_free_bootprop(zfs_bootfs);

		if (error = vfs_lock(vfsp))
			return (error);

		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
			goto out;
		}

		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
		ASSERT(zfsvfs);
		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
			goto out;
		}

		/* Mark the root vnode and publish it as the system rootvp. */
		vp = ZTOV(zp);
		mutex_enter(&vp->v_lock);
		vp->v_flag |= VROOT;
		mutex_exit(&vp->v_lock);
		rootvp = vp;

		/*
		 * Leave rootvp held. The root file system is never unmounted.
		 */

		vfs_add((struct vnode *)0, vfsp,
		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
out:
		vfs_unlock(vfsp);
		return (error);
	} else if (why == ROOT_REMOUNT) {
		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
		vfsp->vfs_flag |= VFS_REMOUNT;

		/* refresh mount options */
		zfs_unregister_callbacks(vfsp->vfs_data);
		return (zfs_register_callbacks(vfsp));

	} else if (why == ROOT_UNMOUNT) {
		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
		(void) zfs_sync(vfsp, 0, 0);
		return (0);
	}

	/*
	 * if "why" is equal to anything else other than ROOT_INIT,
	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
	 */
	return (SET_ERROR(ENOTSUP));
}
1584
/*
 * VFS mount entry point.  Validates the mount point, checks caller
 * privilege (falling back to the dataset "mount" delegation plus
 * mount-point ownership when the caller lacks privilege), enforces
 * zone visibility and MAC label policy, then either refreshes options
 * for MS_REMOUNT or performs the real mount via zfs_domount().
 */
/*ARGSUSED*/
static int
zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
	char *osname;
	pathname_t spn;
	int error = 0;
	uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
	    UIO_SYSSPACE : UIO_USERSPACE;
	int canwrite;

	/* The mount point must be a directory. */
	if (mvp->v_type != VDIR)
		return (SET_ERROR(ENOTDIR));

	/*
	 * Unless this is a remount or an overlay mount, refuse a mount
	 * point that is in use (held) or is the root of another mount.
	 */
	mutex_enter(&mvp->v_lock);
	if ((uap->flags & MS_REMOUNT) == 0 &&
	    (uap->flags & MS_OVERLAY) == 0 &&
	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
		mutex_exit(&mvp->v_lock);
		return (SET_ERROR(EBUSY));
	}
	mutex_exit(&mvp->v_lock);

	/*
	 * ZFS does not support passing unparsed data in via MS_DATA.
	 * Users should use the MS_OPTIONSTR interface; this means
	 * that all option parsing is already done and the options struct
	 * can be interrogated.
	 */
	if ((uap->flags & MS_DATA) && uap->datalen > 0)
		return (SET_ERROR(EINVAL));

	/*
	 * Get the objset name (the "special" mount argument).
	 */
	if (error = pn_get(uap->spec, fromspace, &spn))
		return (error);

	osname = spn.pn_path;

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error) {
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) {
			vattr_t vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			/* On failure, 'error' keeps the secpolicy error. */
			if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
				goto out;
			}

			if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
				goto out;
			}
			secpolicy_fs_mount_clearopts(cr, vfsp);
		} else {
			goto out;
		}
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	error = zfs_mount_label_policy(vfsp, osname);
	if (error)
		goto out;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (uap->flags & MS_REMOUNT) {
		/* refresh mount options */
		zfs_unregister_callbacks(vfsp->vfs_data);
		error = zfs_register_callbacks(vfsp);
		goto out;
	}

	error = zfs_domount(vfsp, osname);

	/*
	 * Add an extra VFS_HOLD on our parent vfs so that it can't
	 * disappear due to a forced unmount.
	 */
	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
		VFS_HOLD(mvp->v_vfsp);

out:
	pn_free(&spn);
	return (error);
}
1695
/*
 * Fill in statvfs64 for this filesystem from the objset's space
 * accounting.  Sizes are reported in units of f_frsize (the minimum
 * ZFS block size).
 */
static int
zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	dev32_t d32;
	uint64_t refdbytes, availbytes, usedobjs, availobjs;

	ZFS_ENTER(zfsvfs);

	dmu_objset_space(zfsvfs->z_os,
	    &refdbytes, &availbytes, &usedobjs, &availobjs);

	/*
	 * The underlying storage pool actually uses multiple block sizes.
	 * We report the fragsize as the smallest block size we support,
	 * and we report our blocksize as the filesystem's maximum blocksize.
	 */
	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
	statp->f_bsize = zfsvfs->z_max_blksz;

	/*
	 * The following report "total" blocks of various kinds in the
	 * file system, but reported in terms of f_frsize - the
	 * "fragment" size.
	 */

	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
	statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
	statp->f_bavail = statp->f_bfree; /* no root reservation */

	/*
	 * statvfs() should really be called statufs(), because it assumes
	 * static metadata. ZFS doesn't preallocate files, so the best
	 * we can do is report the max that could possibly fit in f_files,
	 * and that minus the number actually used in f_ffree.
	 * For f_ffree, report the smaller of the number of object available
	 * and the number of blocks (each object will take at least a block).
	 */
	statp->f_ffree = MIN(availobjs, statp->f_bfree);
	statp->f_favail = statp->f_ffree; /* no "root reservation" */
	statp->f_files = statp->f_ffree + usedobjs;

	(void) cmpldev(&d32, vfsp->vfs_dev);
	statp->f_fsid = d32;

	/*
	 * We're a zfs filesystem.
	 */
	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);

	statp->f_flag = vf_to_stf(vfsp->vfs_flag);

	statp->f_namemax = MAXNAMELEN - 1;

	/*
	 * We have all of 32 characters to stuff a string here.
	 * Is there anything useful we could/should provide?
	 */
	bzero(statp->f_fstr, sizeof (statp->f_fstr));

	ZFS_EXIT(zfsvfs);
	return (0);
}
1759
1760 static int
1761 zfs_root(vfs_t *vfsp, vnode_t **vpp)
1762 {
1763 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1764 znode_t *rootzp;
1765 int error;
1766
1767 ZFS_ENTER(zfsvfs);
1768
1769 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1770 if (error == 0)
1771 *vpp = ZTOV(rootzp);
1772
1773 ZFS_EXIT(zfsvfs);
1774 return (error);
1775 }
1776
1777 /*
1778 * Teardown the zfsvfs::z_os.
1779 *
1780 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1781 * and 'z_teardown_inactive_lock' held.
1782 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t *zp;

	/*
	 * Stop the delete-queue drain thread, then block out all new VOPs
	 * by taking the teardown lock as writer.
	 */
	zfs_unlinked_drain_stop_wait(zfsvfs);
	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp. Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relavent for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp))
		if (zp->z_sa_hdl) {
			ASSERT(ZTOV(zp)->v_count > 0);
			zfs_znode_dmu_fini(zp);
		}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock. zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data
	 */
	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	(void) dmu_objset_evict_dbufs(zfsvfs->z_os);

	return (0);
}
1873
/*
 * VFS unmount entry point.  Requires unmount privilege or the dataset
 * "mount" delegation, unmounts any snapshots under '.zfs', refuses a
 * non-forced unmount while user vnodes are still active, then tears
 * down the zfsvfs and releases the objset.
 */
/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	int ret;

	/* Fall back to the "mount" delegation if we lack the privilege. */
	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * We purge the parent filesystem's vfsp as the parent filesystem
	 * and all of its snapshots have their vnode's v_vfsp set to the
	 * parent's filesystem's vfsp. Note, 'z_parent' is self
	 * referential for non-snapshots.
	 */
	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL &&
	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
		return (ret);
	}

	if (!(fflag & MS_FORCE)) {
		uint_t active_vnodes;

		/*
		 * Check the number of active vnodes in the file system.
		 * Our count is maintained in the vfs structure, but the
		 * number is off by 1 to indicate a hold on the vfs
		 * structure itself.
		 *
		 * The '.zfs' directory maintains a reference of its
		 * own, and any active references underneath are
		 * reflected in the vnode count.
		 *
		 * Active vnodes: vnodes that were held by an user
		 */

		active_vnodes =
		    vfsp->vfs_count - zfsvfs->z_znodes_freeing_cnt;

		if (zfsvfs->z_ctldir == NULL) {
			if (active_vnodes > 1)
				return (SET_ERROR(EBUSY));
		} else {
			if (active_vnodes > 2 ||
			    zfsvfs->z_ctldir->v_count > 1)
				return (SET_ERROR(EBUSY));
		}
	}

	vfsp->vfs_flag |= VFS_UNMOUNTED;

	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);

	return (0);
}
1966
1967 static int
1968 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
1969 {
1970 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1971 znode_t *zp;
1972 uint64_t object = 0;
1973 uint64_t fid_gen = 0;
1974 uint64_t gen_mask;
1975 uint64_t zp_gen;
1976 int i, err;
1977
1978 *vpp = NULL;
1979
1980 ZFS_ENTER(zfsvfs);
1981
1982 if (fidp->fid_len == LONG_FID_LEN) {
1983 zfid_long_t *zlfid = (zfid_long_t *)fidp;
1984 uint64_t objsetid = 0;
1985 uint64_t setgen = 0;
1986
1987 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1988 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1989
1990 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1991 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1992
1993 ZFS_EXIT(zfsvfs);
1994
1995 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1996 if (err)
1997 return (SET_ERROR(EINVAL));
1998 ZFS_ENTER(zfsvfs);
1999 }
2000
2001 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
2002 zfid_short_t *zfid = (zfid_short_t *)fidp;
2003
2004 for (i = 0; i < sizeof (zfid->zf_object); i++)
2005 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
2006
2007 for (i = 0; i < sizeof (zfid->zf_gen); i++)
2008 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
2009 } else {
2010 ZFS_EXIT(zfsvfs);
2011 return (SET_ERROR(EINVAL));
2012 }
2013
2014 /* A zero fid_gen means we are in the .zfs control directories */
2015 if (fid_gen == 0 &&
2016 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
2017 *vpp = zfsvfs->z_ctldir;
2018 ASSERT(*vpp != NULL);
2019 if (object == ZFSCTL_INO_SNAPDIR) {
2020 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
2021 0, NULL, NULL, NULL, NULL, NULL) == 0);
2022 } else {
2023 VN_HOLD(*vpp);
2024 }
2025 ZFS_EXIT(zfsvfs);
2026 return (0);
2027 }
2028
2029 gen_mask = -1ULL >> (64 - 8 * i);
2030
2031 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
2032 if (err = zfs_zget(zfsvfs, object, &zp)) {
2033 ZFS_EXIT(zfsvfs);
2034 return (err);
2035 }
2036 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
2037 sizeof (uint64_t));
2038 zp_gen = zp_gen & gen_mask;
2039 if (zp_gen == 0)
2040 zp_gen = 1;
2041 if (zp->z_unlinked || zp_gen != fid_gen) {
2042 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
2043 VN_RELE(ZTOV(zp));
2044 ZFS_EXIT(zfsvfs);
2045 return (SET_ERROR(EINVAL));
2046 }
2047
2048 *vpp = ZTOV(zp);
2049 ZFS_EXIT(zfsvfs);
2050 return (0);
2051 }
2052
2053 /*
2054 * Block out VOPs and close zfsvfs_t::z_os
2055 *
2056 * Note, if successful, then we return with the 'z_teardown_lock' and
2057 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
2058 * dataset and objset intact so that they can be atomically handed off during
2059 * a subsequent rollback or recv operation and the resume thereafter.
2060 */
2061 int
2062 zfs_suspend_fs(zfsvfs_t *zfsvfs)
2063 {
2064 int error;
2065
2066 mutex_enter(&zfsvfs->z_lock);
2067 if (zfsvfs->z_busy) {
2068 mutex_exit(&zfsvfs->z_lock);
2069 return (SET_ERROR(EBUSY));
2070 }
2071 zfsvfs->z_busy = B_TRUE;
2072 mutex_exit(&zfsvfs->z_lock);
2073
2074 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) {
2075 mutex_enter(&zfsvfs->z_lock);
2076 zfsvfs->z_busy = B_FALSE;
2077 mutex_exit(&zfsvfs->z_lock);
2078 return (error);
2079 }
2080
2081 return (0);
2082 }
2083
2084 /*
2085 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
2086 * is an invariant across any of the operations that can be performed while the
2087 * filesystem was suspended. Whether it succeeded or failed, the preconditions
2088 * are the same: the relevant objset and associated dataset are owned by
2089 * zfsvfs, held, and long held on entry.
2090 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	/* Both teardown locks must still be write-held from the suspend. */
	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	VERIFY0(dmu_objset_from_ds(ds, &os));

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs. If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) &&
	    !zfsvfs->z_unmounted) {
		/*
		 * zfs_suspend_fs() could have interrupted freeing
		 * of dnodes. We need to restart this freeing so
		 * that we don't "leak" the space.
		 */
		zfs_unlinked_drain(zfsvfs);
	}

bail:
	/* release the VOPs */
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
	}
	/* Clear the busy flag set by zfs_suspend_fs(). */
	mutex_enter(&zfsvfs->z_lock);
	zfsvfs->z_busy = B_FALSE;
	mutex_exit(&zfsvfs->z_lock);

	return (err);
}
2159
2160 static void
2161 zfs_freevfs(vfs_t *vfsp)
2162 {
2163 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2164
2165 /*
2166 * If this is a snapshot, we have an extra VFS_HOLD on our parent
2167 * from zfs_mount(). Release it here. If we came through
2168 * zfs_mountroot() instead, we didn't grab an extra hold, so
2169 * skip the VFS_RELE for rootvfs.
2170 */
2171 if (zfsvfs->z_issnap && (vfsp != rootvfs))
2172 VFS_RELE(zfsvfs->z_parent->z_vfs);
2173
2174 zfsvfs_free(zfsvfs);
2175
2176 atomic_dec_32(&zfs_active_fs_count);
2177 }
2178
2179 /*
2180 * VFS_INIT() initialization. Note that there is no VFS_FINI(),
2181 * so we can't safely do any non-idempotent initialization here.
2182 * Leave that to zfs_init() and zfs_fini(), which are called
2183 * from the module's _init() and _fini() entry points.
2184 */
/*ARGSUSED*/
static int
zfs_vfsinit(int fstype, char *name)
{
	int error;

	/* Remember the fstype assigned to us by the VFS framework. */
	zfsfstype = fstype;

	/*
	 * Setup vfsops and vnodeops tables.
	 */
	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
	if (error != 0) {
		/*
		 * NOTE(review): we warn but continue initialization here,
		 * while the vnode-ops failure below unwinds with an error.
		 * Confirm the fall-through on a bad vfsops template is
		 * intentional.
		 */
		cmn_err(CE_WARN, "zfs: bad vfs ops template");
	}

	error = zfs_create_op_tables();
	if (error) {
		/* Undo any partial registration before failing. */
		zfs_remove_op_tables();
		cmn_err(CE_WARN, "zfs: bad vnode ops template");
		(void) vfs_freevfsops_by_type(zfsfstype);
		return (error);
	}

	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Unique major number for all zfs mounts.
	 * If we run out of 32-bit minors, we'll getudev() another major.
	 */
	zfs_major = ddi_name_to_major(ZFS_DRIVER);
	zfs_minor = ZFS_MIN_MINOR;

	return (0);
}
2220
/*
 * Module-load initialization: set up the '.zfs' control directory
 * machinery and the znode/vnode-ops subsystem, then register the ZPL
 * objset type with the DMU using zfs_space_delta_cb().
 */
void
zfs_init(void)
{
	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
}
2236
void
zfs_fini(void)
{
	/* Tear down the .zfs control directory structures ... */
	zfsctl_fini();
	/* ... and the znode cache set up by zfs_init(). */
	zfs_znode_fini();
}
2243
2244 int
2245 zfs_busy(void)
2246 {
2247 return (zfs_active_fs_count != 0);
2248 }
2249
2250 int
2251 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2252 {
2253 int error;
2254 objset_t *os = zfsvfs->z_os;
2255 dmu_tx_t *tx;
2256
2257 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2258 return (SET_ERROR(EINVAL));
2259
2260 if (newvers < zfsvfs->z_version)
2261 return (SET_ERROR(EINVAL));
2262
2263 if (zfs_spa_version_map(newvers) >
2264 spa_version(dmu_objset_spa(zfsvfs->z_os)))
2265 return (SET_ERROR(ENOTSUP));
2266
2267 tx = dmu_tx_create(os);
2268 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2269 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2270 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2271 ZFS_SA_ATTRS);
2272 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2273 }
2274 error = dmu_tx_assign(tx, TXG_WAIT);
2275 if (error) {
2276 dmu_tx_abort(tx);
2277 return (error);
2278 }
2279
2280 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2281 8, 1, &newvers, tx);
2282
2283 if (error) {
2284 dmu_tx_commit(tx);
2285 return (error);
2286 }
2287
2288 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2289 uint64_t sa_obj;
2290
2291 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2292 SPA_VERSION_SA);
2293 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2294 DMU_OT_NONE, 0, tx);
2295
2296 error = zap_add(os, MASTER_NODE_OBJ,
2297 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2298 ASSERT0(error);
2299
2300 VERIFY(0 == sa_set_sa_object(os, sa_obj));
2301 sa_register_update_callback(os, zfs_sa_upgrade);
2302 }
2303
2304 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2305 "from %llu to %llu", zfsvfs->z_version, newvers);
2306
2307 dmu_tx_commit(tx);
2308
2309 zfsvfs->z_version = newvers;
2310 os->os_version = newvers;
2311
2312 zfs_set_fuid_feature(zfsvfs);
2313
2314 return (0);
2315 }
2316
2317 /*
2318 * Read a property stored within the master node.
2319 */
2320 int
2321 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2322 {
2323 uint64_t *cached_copy = NULL;
2324
2325 /*
2326 * Figure out where in the objset_t the cached copy would live, if it
2327 * is available for the requested property.
2328 */
2329 if (os != NULL) {
2330 switch (prop) {
2331 case ZFS_PROP_VERSION:
2332 cached_copy = &os->os_version;
2333 break;
2334 case ZFS_PROP_NORMALIZE:
2335 cached_copy = &os->os_normalization;
2336 break;
2337 case ZFS_PROP_UTF8ONLY:
2338 cached_copy = &os->os_utf8only;
2339 break;
2340 case ZFS_PROP_CASE:
2341 cached_copy = &os->os_casesensitivity;
2342 break;
2343 default:
2344 break;
2345 }
2346 }
2347 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2348 *value = *cached_copy;
2349 return (0);
2350 }
2351
2352 /*
2353 * If the property wasn't cached, look up the file system's value for
2354 * the property. For the version property, we look up a slightly
2355 * different string.
2356 */
2357 const char *pname;
2358 int error = ENOENT;
2359 if (prop == ZFS_PROP_VERSION) {
2360 pname = ZPL_VERSION_STR;
2361 } else {
2362 pname = zfs_prop_to_name(prop);
2363 }
2364
2365 if (os != NULL) {
2366 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2367 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2368 }
2369
2370 if (error == ENOENT) {
2371 /* No value set, use the default value */
2372 switch (prop) {
2373 case ZFS_PROP_VERSION:
2374 *value = ZPL_VERSION;
2375 break;
2376 case ZFS_PROP_NORMALIZE:
2377 case ZFS_PROP_UTF8ONLY:
2378 *value = 0;
2379 break;
2380 case ZFS_PROP_CASE:
2381 *value = ZFS_CASE_SENSITIVE;
2382 break;
2383 default:
2384 return (error);
2385 }
2386 error = 0;
2387 }
2388
2389 /*
2390 * If one of the methods for getting the property value above worked,
2391 * copy it into the objset_t's cache.
2392 */
2393 if (error == 0 && cached_copy != NULL) {
2394 *cached_copy = *value;
2395 }
2396
2397 return (error);
2398 }
2399
2400 /*
2401 * Return true if the coresponding vfs's unmounted flag is set.
2402 * Otherwise return false.
2403 * If this function returns true we know VFS unmount has been initiated.
2404 */
2405 boolean_t
2406 zfs_get_vfs_flag_unmounted(objset_t *os)
2407 {
2408 zfsvfs_t *zfvp;
2409 boolean_t unmounted = B_FALSE;
2410
2411 ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
2412
2413 mutex_enter(&os->os_user_ptr_lock);
2414 zfvp = dmu_objset_get_user(os);
2415 if (zfvp != NULL && zfvp->z_vfs != NULL &&
2416 (zfvp->z_vfs->vfs_flag & VFS_UNMOUNTED))
2417 unmounted = B_TRUE;
2418 mutex_exit(&os->os_user_ptr_lock);
2419
2420 return (unmounted);
2421 }
2422
/*
 * VFS switch table entry for ZFS; registered with the system through
 * the modlfs linkage below.  zfs_vfsinit() runs when the fstype is
 * installed.
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	MNTTYPE_ZFS,		/* filesystem type name */
	zfs_vfsinit,		/* fstype init routine (defined above) */
	VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
	VSW_XID|VSW_ZMOUNT,	/* vfssw flags */
	&zfs_mntopts		/* mount option table */
};
2431
/*
 * Module linkage: ties the vfw vfsdef above into the loadable-module
 * framework via mod_fsops.
 */
struct modlfs zfs_modlfs = {
	&mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
};