1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2013, 2016 Joyent, Inc. All rights reserved.
25 * Copyright (c) 2014 by Delphix. All rights reserved.
26 */
27
28 /* vnode ops for the /dev/zvol directory */
29
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/ddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/sunldi.h>
36 #include <fs/fs_subr.h>
37 #include <sys/fs/dv_node.h>
38 #include <sys/fs/sdev_impl.h>
39 #include <sys/zfs_ioctl.h>
40 #include <sys/policy.h>
41 #include <sys/stat.h>
42 #include <sys/vfs_opreg.h>
43
44 struct vnodeops *devzvol_vnodeops;
45 static major_t devzvol_major;
46 static taskq_ent_t devzvol_zclist_task;
47
48 static kmutex_t devzvol_mtx;
49 /* Below are protected by devzvol_mtx */
50 static boolean_t devzvol_isopen;
51 static boolean_t devzvol_zclist_task_running = B_FALSE;
52 static uint64_t devzvol_gen = 0;
53 static uint64_t devzvol_zclist;
54 static size_t devzvol_zclist_size;
55 static ldi_ident_t devzvol_li;
56 static ldi_handle_t devzvol_lh;
57
58 /*
59 * we need to use ddi_mod* since fs/dev gets loaded early on in
60 * startup(), and linking fs/dev to fs/zfs would drag in a lot of
61 * other stuff (like drv/random) before the rest of the system is
62 * ready to go
63 */
64 ddi_modhandle_t zfs_mod;
65 int (*szcm)(char *);
66 int (*szn2m)(char *, minor_t *);
67
68
69 /*
70 * Enable/disable snapshots from being created in /dev/zvol. By default,
71 * they are enabled, preserving the historic behavior.
72 */
73 boolean_t devzvol_snaps_allowed = B_TRUE;
74
75 int
76 sdev_zvol_create_minor(char *dsname)
77 {
78 if (szcm == NULL)
79 return (-1);
80 return ((*szcm)(dsname));
81 }
82
83 int
84 sdev_zvol_name2minor(char *dsname, minor_t *minor)
85 {
86 if (szn2m == NULL)
87 return (-1);
88 return ((*szn2m)(dsname, minor));
89 }
90
91 int
92 devzvol_open_zfs()
93 {
94 int rc;
95 dev_t dv;
96
97 devzvol_li = ldi_ident_from_anon();
98 if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
99 &devzvol_lh, devzvol_li))
100 return (-1);
101 if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
102 KRTLD_MODE_FIRST, &rc)) == NULL)) {
103 return (rc);
104 }
105 ASSERT(szcm == NULL && szn2m == NULL);
106 if ((szcm = (int (*)(char *))
107 ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
108 cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
109 return (rc);
110 }
111 if ((szn2m = (int(*)(char *, minor_t *))
112 ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
113 cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
114 return (rc);
115 }
116 if (ldi_get_dev(devzvol_lh, &dv))
117 return (-1);
118 devzvol_major = getmajor(dv);
119 return (0);
120 }
121
122 void
123 devzvol_close_zfs()
124 {
125 szcm = NULL;
126 szn2m = NULL;
127 (void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
128 ldi_ident_release(devzvol_li);
129 if (zfs_mod != NULL) {
130 (void) ddi_modclose(zfs_mod);
131 zfs_mod = NULL;
132 }
133 }
134
135 int
136 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
137 {
138 uint64_t cookie;
139 int size = 8000;
140 int unused;
141 int rc;
142
143 if (cmd != ZFS_IOC_POOL_CONFIGS)
144 mutex_enter(&devzvol_mtx);
145 if (!devzvol_isopen) {
146 if ((rc = devzvol_open_zfs()) == 0) {
147 devzvol_isopen = B_TRUE;
148 } else {
149 if (cmd != ZFS_IOC_POOL_CONFIGS)
150 mutex_exit(&devzvol_mtx);
151 return (ENXIO);
152 }
153 }
154 cookie = zc->zc_cookie;
155 again:
156 zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
157 KM_SLEEP);
158 zc->zc_nvlist_dst_size = size;
159 rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
160 &unused);
161 if (rc == ENOMEM) {
162 int newsize;
163 newsize = zc->zc_nvlist_dst_size;
164 ASSERT(newsize > size);
165 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
166 size = newsize;
167 zc->zc_cookie = cookie;
168 goto again;
169 }
170 if (alloc_size == NULL)
171 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
172 else
173 *alloc_size = size;
174 if (cmd != ZFS_IOC_POOL_CONFIGS)
175 mutex_exit(&devzvol_mtx);
176 return (rc);
177 }
178
179 /* figures out if the objset exists and returns its type */
180 int
181 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
182 {
183 boolean_t ispool, is_snapshot;
184 zfs_cmd_t *zc;
185 int rc;
186 nvlist_t *nvl;
187 size_t nvsz;
188
189 ispool = (strchr(dsname, '/') == NULL);
190 is_snapshot = (strchr(dsname, '@') != NULL);
191
192 if (is_snapshot && !devzvol_snaps_allowed)
193 return (ENOTSUP);
194
195 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
196 (void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
197
198 nvl = fnvlist_alloc();
199 fnvlist_add_boolean_value(nvl, "cachedpropsonly", B_TRUE);
200 zc->zc_nvlist_src = (uintptr_t)fnvlist_pack(nvl, &nvsz);
201 zc->zc_nvlist_src_size = nvsz;
202 fnvlist_free(nvl);
203
204 rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
205 ZFS_IOC_OBJSET_STATS, zc, NULL);
206 if (type && rc == 0)
207 *type = (ispool) ? DMU_OST_ZFS :
208 zc->zc_objset_stats.dds_type;
209 fnvlist_pack_free((char *)(uintptr_t)zc->zc_nvlist_src, nvsz);
210 kmem_free(zc, sizeof (zfs_cmd_t));
211 return (rc);
212 }
213
214 /*
215 * Returns what the zfs dataset name should be, given the /dev/zvol
216 * path and an optional name (can be NULL).
217 *
218 * Note that if the name param is NULL, then path must be an
219 * actual dataset's directory and not one of the top-level
220 * /dev/zvol/{dsk,rdsk} dirs, as these do not correspond to a
221 * specific dataset.
222 */
223 char *
224 devzvol_make_dsname(const char *path, const char *name)
225 {
226 char *dsname;
227 const char *ptr;
228 int dslen;
229
230 if (strcmp(path, ZVOL_DIR) == 0)
231 return (NULL);
232 if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
233 return (NULL);
234 ptr = path + strlen(ZVOL_DIR);
235 if (strncmp(ptr, "/dsk", 4) == 0)
236 ptr += strlen("/dsk");
237 else if (strncmp(ptr, "/rdsk", 5) == 0)
238 ptr += strlen("/rdsk");
239 else
240 return (NULL);
241
242 if (*ptr == '/')
243 ptr++;
244 else if (name == NULL)
245 return (NULL);
246
247 dslen = strlen(ptr);
248 if (dslen)
249 dslen++; /* plus null */
250 if (name)
251 dslen += strlen(name) + 1; /* plus slash */
252 dsname = kmem_zalloc(dslen, KM_SLEEP);
253 if (*ptr) {
254 (void) strlcpy(dsname, ptr, dslen);
255 if (name)
256 (void) strlcat(dsname, "/", dslen);
257 }
258 if (name)
259 (void) strlcat(dsname, name, dslen);
260 return (dsname);
261 }
262
263 /*
264 * check if the zvol's sdev_node is still valid, which means make
265 * sure the zvol is still valid. zvol minors aren't proactively
266 * destroyed when the zvol is destroyed, so we use a validator to clean
267 * these up (in other words, when such nodes are encountered during
268 * subsequent lookup() and readdir() operations) so that only valid
269 * nodes are returned. The ordering between devname_lookup_func and
270 * devzvol_validate is a little inefficient in the case of invalid
271 * or stale nodes because devname_lookup_func calls
272 * devzvol_create_{dir, link}, then the validator says it's invalid,
273 * and then the node gets cleaned up.
274 */
275 int
276 devzvol_validate(struct sdev_node *dv)
277 {
278 vnode_t *vn = SDEVTOV(dv);
279 dmu_objset_type_t do_type;
280 char *dsname;
281 char *nm = dv->sdev_name;
282 int rc;
283
284 sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
285 /*
286 * validate only READY nodes; if someone is sitting on the
287 * directory of a dataset that just got destroyed we could
288 * get a zombie node which we just skip.
289 */
290 if (dv->sdev_state != SDEV_READY) {
291 sdcmn_err13(("skipping '%s'", nm));
292 return (SDEV_VTOR_SKIP);
293 }
294
295 if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
296 (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
297 return (SDEV_VTOR_VALID);
298 dsname = devzvol_make_dsname(dv->sdev_path, NULL);
299 if (dsname == NULL)
300 return (SDEV_VTOR_INVALID);
301
302 /*
303 * Leave any nodes alone that have been explicitly created by
304 * sdev profiles.
305 */
306 if (!(dv->sdev_flags & SDEV_GLOBAL) && dv->sdev_origin != NULL) {
307 kmem_free(dsname, strlen(dsname) + 1);
308 return (SDEV_VTOR_VALID);
309 }
310
311 rc = devzvol_objset_check(dsname, &do_type);
312 sdcmn_err13((" '%s' rc %d", dsname, rc));
313 if (rc != 0) {
314 sdev_node_t *parent = dv->sdev_dotdot;
315 /*
316 * Explicitly passed-through zvols in our sdev profile can't
317 * be created as prof_* shadow nodes, because in the GZ they
318 * are symlinks, but in the NGZ they are actual device files.
319 *
320 * The objset_check will fail on these as they are outside
321 * any delegated dataset (zfs will not allow ioctl access to
322 * them from this zone). We still want them to work, though.
323 */
324 if (!(parent->sdev_flags & SDEV_GLOBAL) &&
325 parent->sdev_origin != NULL &&
326 !(dv->sdev_flags & SDEV_GLOBAL) &&
327 (vn->v_type == VBLK || vn->v_type == VCHR) &&
328 prof_name_matched(nm, parent)) {
329 do_type = DMU_OST_ZVOL;
330 } else {
331 kmem_free(dsname, strlen(dsname) + 1);
332 return (SDEV_VTOR_INVALID);
333 }
334 }
335
336 sdcmn_err13((" v_type %d do_type %d",
337 vn->v_type, do_type));
338 if ((vn->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
339 ((vn->v_type == VBLK || vn->v_type == VCHR) &&
340 do_type != DMU_OST_ZVOL) ||
341 (vn->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
342 kmem_free(dsname, strlen(dsname) + 1);
343 return (SDEV_VTOR_STALE);
344 }
345 if (vn->v_type == VLNK) {
346 char *ptr, *link;
347 long val = 0;
348 minor_t lminor, ominor;
349
350 rc = sdev_getlink(vn, &link);
351 ASSERT(rc == 0);
352
353 ptr = strrchr(link, ':') + 1;
354 rc = ddi_strtol(ptr, NULL, 10, &val);
355 kmem_free(link, strlen(link) + 1);
356 ASSERT(rc == 0 && val != 0);
357 lminor = (minor_t)val;
358 if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
359 ominor != lminor) {
360 kmem_free(dsname, strlen(dsname) + 1);
361 return (SDEV_VTOR_STALE);
362 }
363 }
364 kmem_free(dsname, strlen(dsname) + 1);
365 return (SDEV_VTOR_VALID);
366 }
367
368 /*
369 * Taskq callback to update the devzvol_zclist.
370 *
371 * We need to defer this to the taskq to avoid it running with a user
372 * context that might be associated with some non-global zone, and thus
373 * not being able to list all of the pools on the entire system.
374 */
375 /*ARGSUSED*/
376 static void
377 devzvol_update_zclist_cb(void *arg)
378 {
379 zfs_cmd_t *zc;
380 int rc;
381 size_t size;
382
383 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
384 mutex_enter(&devzvol_mtx);
385 zc->zc_cookie = devzvol_gen;
386
387 rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
388 switch (rc) {
389 case 0:
390 /* new generation */
391 ASSERT(devzvol_gen != zc->zc_cookie);
392 devzvol_gen = zc->zc_cookie;
393 if (devzvol_zclist)
394 kmem_free((void *)(uintptr_t)devzvol_zclist,
395 devzvol_zclist_size);
396 devzvol_zclist = zc->zc_nvlist_dst;
397 /* Keep the alloc'd size, not the nvlist size. */
398 devzvol_zclist_size = size;
399 break;
400 default:
401 /*
402 * Either there was no change in pool configuration
403 * since we last asked (rc == EEXIST) or we got a
404 * catastrophic error.
405 *
406 * Give up memory and exit.
407 */
408 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
409 size);
410 break;
411 }
412
413 VERIFY(devzvol_zclist_task_running == B_TRUE);
414 devzvol_zclist_task_running = B_FALSE;
415 mutex_exit(&devzvol_mtx);
416
417 kmem_free(zc, sizeof (zfs_cmd_t));
418 }
419
420 static void
421 devzvol_update_zclist(void)
422 {
423 mutex_enter(&devzvol_mtx);
424 if (devzvol_zclist_task_running == B_TRUE) {
425 mutex_exit(&devzvol_mtx);
426 goto wait;
427 }
428
429 devzvol_zclist_task_running = B_TRUE;
430
431 taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0,
432 &devzvol_zclist_task);
433
434 mutex_exit(&devzvol_mtx);
435
436 wait:
437 taskq_wait(sdev_taskq);
438 }
439
440 /*
441 * Creates sub-directories for each zpool as needed in response to a
442 * readdir on one of the /dev/zvol/{dsk,rdsk} directories.
443 */
444 void
445 devzvol_create_pool_dirs(struct vnode *dvp)
446 {
447 nvlist_t *nv = NULL;
448 nvpair_t *elem = NULL;
449 int pools = 0;
450 int rc;
451
452 sdcmn_err13(("devzvol_create_pool_dirs"));
453
454 devzvol_update_zclist();
455
456 mutex_enter(&devzvol_mtx);
457
458 rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
459 devzvol_zclist_size, &nv, 0);
460 if (rc) {
461 ASSERT(rc == 0);
462 kmem_free((void *)(uintptr_t)devzvol_zclist,
463 devzvol_zclist_size);
464 devzvol_gen = 0;
465 devzvol_zclist = NULL;
466 devzvol_zclist_size = 0;
467 goto out;
468 }
469 mutex_exit(&devzvol_mtx);
470 while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
471 struct vnode *vp;
472 ASSERT(dvp->v_count > 0);
473 rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
474 NULL, kcred, NULL, 0, NULL);
475 /*
476 * should either work or we should get an error if this should
477 * not be visible from the zone, or disallowed in the zone
478 */
479 if (rc == 0)
480 VN_RELE(vp);
481 pools++;
482 }
483 nvlist_free(nv);
484 mutex_enter(&devzvol_mtx);
485 if (devzvol_isopen && pools == 0) {
486 /* clean up so zfs can be unloaded */
487 devzvol_close_zfs();
488 devzvol_isopen = B_FALSE;
489 }
490 out:
491 mutex_exit(&devzvol_mtx);
492 }
493
494 /*ARGSUSED3*/
495 static int
496 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
497 cred_t *cred, void *whatever, char *whichever)
498 {
499 timestruc_t now;
500 struct vattr *vap = (struct vattr *)arg;
501
502 sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
503 ddv->sdev_path, nm));
504 ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
505 strlen(ZVOL_DIR)) == 0);
506 *vap = *sdev_getdefault_attr(VDIR);
507 gethrestime(&now);
508 vap->va_atime = now;
509 vap->va_mtime = now;
510 vap->va_ctime = now;
511 return (0);
512 }
513
514 /*ARGSUSED3*/
515 static int
516 devzvol_create_link(struct sdev_node *ddv, char *nm,
517 void **arg, cred_t *cred, void *whatever, char *whichever)
518 {
519 minor_t minor;
520 char *pathname = (char *)*arg;
521 int rc;
522 char *dsname;
523 char *x;
524 char str[MAXNAMELEN];
525 sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
526 ddv->sdev_path, nm));
527 dsname = devzvol_make_dsname(ddv->sdev_path, nm);
528 rc = sdev_zvol_create_minor(dsname);
529 if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
530 sdev_zvol_name2minor(dsname, &minor)) {
531 sdcmn_err13(("devzvol_create_link %d", rc));
532 kmem_free(dsname, strlen(dsname) + 1);
533 return (-1);
534 }
535 kmem_free(dsname, strlen(dsname) + 1);
536
537 /*
538 * This is a valid zvol; create a symlink that points to the
539 * minor which was created under /devices/pseudo/zfs@0
540 */
541 *pathname = '\0';
542 for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
543 (void) strcat(pathname, "../");
544 (void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
545 (void) strncat(pathname, str, MAXPATHLEN);
546 if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
547 strlen(ZVOL_FULL_RDEV_DIR)) == 0)
548 (void) strcat(pathname, ",raw");
549 return (0);
550 }
551
552 /* Clean zvol sdev_nodes that are no longer valid. */
553 static void
554 devzvol_prunedir(struct sdev_node *ddv)
555 {
556 struct sdev_node *dv;
557
558 ASSERT(RW_READ_HELD(&ddv->sdev_contents));
559
560 sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
561 ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
562 if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
563 rw_exit(&ddv->sdev_contents);
564 rw_enter(&ddv->sdev_contents, RW_WRITER);
565 }
566
567 dv = SDEV_FIRST_ENTRY(ddv);
568 while (dv) {
569 sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
570
571 switch (devzvol_validate(dv)) {
572 case SDEV_VTOR_VALID:
573 case SDEV_VTOR_SKIP:
574 dv = SDEV_NEXT_ENTRY(ddv, dv);
575 continue;
576 case SDEV_VTOR_INVALID:
577 sdcmn_err7(("prunedir: destroy invalid "
578 "node: %s\n", dv->sdev_name));
579 break;
580 }
581
582 if ((SDEVTOV(dv)->v_type == VDIR) &&
583 (sdev_cleandir(dv, NULL, 0) != 0)) {
584 dv = SDEV_NEXT_ENTRY(ddv, dv);
585 continue;
586 }
587 SDEV_HOLD(dv);
588 /* remove the cache node */
589 sdev_cache_update(ddv, &dv, dv->sdev_name,
590 SDEV_CACHE_DELETE);
591 SDEV_RELE(dv);
592 dv = SDEV_FIRST_ENTRY(ddv);
593 }
594 rw_downgrade(&ddv->sdev_contents);
595 }
596
597 /*
598 * This function is used to create a dir or dev inside a zone's /dev when the
599 * zone has a zvol that is dynamically created within the zone (i.e. inside
600 * of a delegated dataset. Since there is no /devices tree within a zone,
601 * we create the chr/blk devices directly inside the zone's /dev instead of
602 * making symlinks.
603 */
604 static int
605 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
606 {
607 struct vattr vattr;
608 timestruc_t now;
609 enum vtype expected_type = VDIR;
610 dmu_objset_type_t do_type;
611 struct sdev_node *dv = NULL;
612 int res;
613 char *dsname;
614
615 bzero(&vattr, sizeof (vattr));
616 gethrestime(&now);
617 vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
618 vattr.va_uid = SDEV_UID_DEFAULT;
619 vattr.va_gid = SDEV_GID_DEFAULT;
620 vattr.va_type = VNON;
621 vattr.va_atime = now;
622 vattr.va_mtime = now;
623 vattr.va_ctime = now;
624
625 if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
626 return (ENOENT);
627
628 if (devzvol_objset_check(dsname, &do_type) != 0) {
629 /*
630 * objset_check will succeed on any valid objset in the global
631 * zone, and any valid delegated dataset. It will fail, however,
632 * in non-global zones on explicitly whitelisted zvol devices
633 * that are outside any delegated dataset.
634 *
635 * The directories leading up to the zvol device itself will be
636 * created by prof for us in advance (and will always validate
637 * because of the matching check in devzvol_validate). The zvol
638 * device itself can't be created by prof though because in the
639 * GZ it's a symlink, and in the NGZ it is not. So, we create
640 * such zvol device files here.
641 */
642 if (!(parent->sdev_flags & SDEV_GLOBAL) &&
643 parent->sdev_origin != NULL &&
644 prof_name_matched(nm, parent)) {
645 do_type = DMU_OST_ZVOL;
646 } else {
647 kmem_free(dsname, strlen(dsname) + 1);
648 return (ENOENT);
649 }
650 }
651
652 if (do_type == DMU_OST_ZVOL)
653 expected_type = VBLK;
654
655 if (expected_type == VDIR) {
656 vattr.va_type = VDIR;
657 vattr.va_mode = SDEV_DIRMODE_DEFAULT;
658 } else {
659 minor_t minor;
660 dev_t devnum;
661 int rc;
662
663 rc = sdev_zvol_create_minor(dsname);
664 if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
665 sdev_zvol_name2minor(dsname, &minor)) {
666 kmem_free(dsname, strlen(dsname) + 1);
667 return (ENOENT);
668 }
669
670 devnum = makedevice(devzvol_major, minor);
671 vattr.va_rdev = devnum;
672
673 if (strstr(parent->sdev_path, "/rdsk/") != NULL)
674 vattr.va_type = VCHR;
675 else
676 vattr.va_type = VBLK;
677 vattr.va_mode = SDEV_DEVMODE_DEFAULT;
678 }
679 kmem_free(dsname, strlen(dsname) + 1);
680
681 rw_enter(&parent->sdev_contents, RW_WRITER);
682
683 res = sdev_mknode(parent, nm, &dv, &vattr,
684 NULL, NULL, kcred, SDEV_READY);
685 rw_exit(&parent->sdev_contents);
686 if (res != 0)
687 return (ENOENT);
688
689 SDEV_RELE(dv);
690 return (0);
691 }
692
693 /*ARGSUSED*/
694 static int
695 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
696 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
697 caller_context_t *ct, int *direntflags, pathname_t *realpnp)
698 {
699 enum vtype expected_type = VDIR;
700 struct sdev_node *parent = VTOSDEV(dvp);
701 char *dsname;
702 dmu_objset_type_t do_type;
703 int error;
704
705 sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
706 *vpp = NULL;
707 /* execute access is required to search the directory */
708 if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
709 return (error);
710
711 rw_enter(&parent->sdev_contents, RW_READER);
712 if (!SDEV_IS_GLOBAL(parent)) {
713 int res;
714
715 rw_exit(&parent->sdev_contents);
716
717 /*
718 * If we're in the global zone and reach down into a non-global
719 * zone's /dev/zvol then this action could trigger the creation
720 * of all of the zvol devices for every zone into the non-global
721 * zone's /dev tree. This could be a big security hole. To
722 * prevent this, disallow the global zone from looking inside
723 * a non-global zones /dev/zvol. This behavior is similar to
724 * delegated datasets, which cannot be used by the global zone.
725 */
726 if (getzoneid() == GLOBAL_ZONEID)
727 return (EPERM);
728
729 res = prof_lookup(dvp, nm, vpp, cred);
730
731 /*
732 * We won't find a zvol that was dynamically created inside
733 * a NGZ, within a delegated dataset, in the zone's dev profile
734 * but prof_lookup will also find it via sdev_cache_lookup.
735 */
736 if (res == ENOENT) {
737 /*
738 * We have to create the sdev node for the dymamically
739 * created zvol.
740 */
741 if (devzvol_mk_ngz_node(parent, nm) != 0)
742 return (ENOENT);
743 res = prof_lookup(dvp, nm, vpp, cred);
744 }
745
746 return (res);
747 }
748
749 /*
750 * Don't let the global-zone style lookup succeed here when we're not
751 * running in the global zone. This can happen because prof calls into
752 * us (in prof_filldir) trying to create an explicitly passed-through
753 * zvol device outside any delegated dataset.
754 *
755 * We have to stop this here or else we will create prof shadows of
756 * the global zone symlink, which will make no sense at all in the
757 * non-global zone (it has no /devices for the symlink to point at).
758 *
759 * These zvols will be created later (at access time) by mk_ngz_node
760 * instead. The dirs leading up to them will be created by prof
761 * internally.
762 *
763 * We have to return EPERM here, because ENOENT is given special
764 * meaning by prof in this context.
765 */
766 if (getzoneid() != GLOBAL_ZONEID) {
767 rw_exit(&parent->sdev_contents);
768 return (EPERM);
769 }
770
771 dsname = devzvol_make_dsname(parent->sdev_path, nm);
772 rw_exit(&parent->sdev_contents);
773 sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
774 if (dsname) {
775 error = devzvol_objset_check(dsname, &do_type);
776 if (error != 0) {
777 error = ENOENT;
778 goto out;
779 }
780 if (do_type == DMU_OST_ZVOL)
781 expected_type = VLNK;
782 }
783 /*
784 * the callbacks expect:
785 *
786 * parent->sdev_path nm
787 * /dev/zvol {r}dsk
788 * /dev/zvol/{r}dsk <pool name>
789 * /dev/zvol/{r}dsk/<dataset name> <last ds component>
790 *
791 * sdev_name is always last path component of sdev_path
792 */
793 if (expected_type == VDIR) {
794 error = devname_lookup_func(parent, nm, vpp, cred,
795 devzvol_create_dir, SDEV_VATTR);
796 } else {
797 error = devname_lookup_func(parent, nm, vpp, cred,
798 devzvol_create_link, SDEV_VLINK);
799 }
800 sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
801 ASSERT(error || ((*vpp)->v_type == expected_type));
802 out:
803 if (dsname)
804 kmem_free(dsname, strlen(dsname) + 1);
805 sdcmn_err13(("devzvol_lookup %d", error));
806 return (error);
807 }
808
809 /*
810 * We allow create to find existing nodes
811 * - if the node doesn't exist - EROFS
812 * - creating an existing dir read-only succeeds, otherwise EISDIR
813 * - exclusive creates fail - EEXIST
814 */
815 /*ARGSUSED2*/
816 static int
817 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
818 int mode, struct vnode **vpp, struct cred *cred, int flag,
819 caller_context_t *ct, vsecattr_t *vsecp)
820 {
821 int error;
822 struct vnode *vp;
823
824 *vpp = NULL;
825
826 error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
827 NULL);
828 if (error == 0) {
829 if (excl == EXCL)
830 error = EEXIST;
831 else if (vp->v_type == VDIR && (mode & VWRITE))
832 error = EISDIR;
833 else
834 error = VOP_ACCESS(vp, mode, 0, cred, ct);
835
836 if (error) {
837 VN_RELE(vp);
838 } else
839 *vpp = vp;
840 } else if (error == ENOENT) {
841 error = EROFS;
842 }
843
844 return (error);
845 }
846
847 void sdev_iter_snapshots(struct vnode *dvp, char *name);
848
849 void
850 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
851 {
852 zfs_cmd_t *zc;
853 int rc;
854
855 sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
856 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
857 (void) strcpy(zc->zc_name, name);
858
859 while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
860 struct vnode *vpp;
861 char *ptr;
862
863 sdcmn_err13((" name %s", zc->zc_name));
864 if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
865 goto skip;
866 ptr = strrchr(zc->zc_name, '/') + 1;
867 rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
868 kcred, NULL, NULL, NULL);
869 if (rc == 0) {
870 VN_RELE(vpp);
871 } else if (rc == ENOENT) {
872 goto skip;
873 } else {
874 /*
875 * EBUSY == problem with zvols's dmu holds?
876 * EPERM when in a NGZ and traversing up and out.
877 */
878 goto skip;
879 }
880 if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
881 zc->zc_objset_stats.dds_type == DMU_OST_ZVOL &&
882 devzvol_snaps_allowed)
883 sdev_iter_snapshots(dvp, zc->zc_name);
884 skip:
885 (void) strcpy(zc->zc_name, name);
886 }
887 kmem_free(zc, sizeof (zfs_cmd_t));
888 }
889
890 void
891 sdev_iter_snapshots(struct vnode *dvp, char *name)
892 {
893 sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
894 }
895
896 /*ARGSUSED4*/
897 static int
898 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
899 int *eofp, caller_context_t *ct_unused, int flags_unused)
900 {
901 struct sdev_node *sdvp = VTOSDEV(dvp);
902 char *ptr;
903
904 sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
905 sdvp->sdev_name));
906
907 if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
908 struct vnode *vp;
909
910 rw_exit(&sdvp->sdev_contents);
911 (void) devname_lookup_func(sdvp, "dsk", &vp, cred,
912 devzvol_create_dir, SDEV_VATTR);
913 VN_RELE(vp);
914 (void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
915 devzvol_create_dir, SDEV_VATTR);
916 VN_RELE(vp);
917 rw_enter(&sdvp->sdev_contents, RW_READER);
918 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
919 }
920 if (uiop->uio_offset == 0)
921 devzvol_prunedir(sdvp);
922 ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
923 if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
924 rw_exit(&sdvp->sdev_contents);
925 devzvol_create_pool_dirs(dvp);
926 rw_enter(&sdvp->sdev_contents, RW_READER);
927 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
928 }
929
930 ptr = strchr(ptr + 1, '/');
931 if (ptr == NULL)
932 return (ENOENT);
933 ptr++;
934 rw_exit(&sdvp->sdev_contents);
935 sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
936 rw_enter(&sdvp->sdev_contents, RW_READER);
937 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
938 }
939
940 const fs_operation_def_t devzvol_vnodeops_tbl[] = {
941 VOPNAME_READDIR, { .vop_readdir = devzvol_readdir },
942 VOPNAME_LOOKUP, { .vop_lookup = devzvol_lookup },
943 VOPNAME_CREATE, { .vop_create = devzvol_create },
944 VOPNAME_RENAME, { .error = fs_nosys },
945 VOPNAME_MKDIR, { .error = fs_nosys },
946 VOPNAME_RMDIR, { .error = fs_nosys },
947 VOPNAME_REMOVE, { .error = fs_nosys },
948 VOPNAME_SYMLINK, { .error = fs_nosys },
949 NULL, NULL
950 };