1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2018 Nexenta Systems, Inc.
25 */
26
27 /*
28 * miscellaneous routines for the devfs
29 */
30
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/t_lock.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/user.h>
37 #include <sys/time.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/file.h>
41 #include <sys/fcntl.h>
42 #include <sys/flock.h>
43 #include <sys/kmem.h>
44 #include <sys/uio.h>
45 #include <sys/errno.h>
46 #include <sys/stat.h>
47 #include <sys/cred.h>
48 #include <sys/dirent.h>
49 #include <sys/pathname.h>
50 #include <sys/cmn_err.h>
51 #include <sys/debug.h>
52 #include <sys/modctl.h>
53 #include <fs/fs_subr.h>
54 #include <sys/fs/dv_node.h>
55 #include <sys/fs/snode.h>
56 #include <sys/sunndi.h>
57 #include <sys/sunmdi.h>
58 #include <sys/conf.h>
59
#ifdef DEBUG
/* debug tunable (DEBUG kernels only) — presumably gates dcmn_err* output */
int devfs_debug = 0x0;
#endif

const char dvnm[] = "devfs";		/* fs name, used in diagnostics */
kmem_cache_t *dv_node_cache;		/* dv_node cache */

/*
 * The devfs_clean_key is taken during a devfs_clean operation: it is used to
 * prevent unnecessary code execution and for detection of potential deadlocks.
 */
uint_t devfs_clean_key;

/* root of the dv_node tree, established by dv_mkroot() */
struct dv_node *dvroot;
74
/*
 * Prototype memory vattrs: template attribute sets copied wholesale by
 * devfs_get_defattr() when a node has no persistent or per-minor
 * attributes.  The va_mask names the fields that carry meaning; the
 * remaining fields are zero placeholders.
 */

/* defaults for directory nodes */
vattr_t dv_vattr_dir = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VDIR,					/* va_type */
	DV_DIRMODE_DEFAULT,			/* va_mode */
	DV_UID_DEFAULT,				/* va_uid */
	DV_GID_DEFAULT,				/* va_gid */
	0,					/* va_fsid; */
	0,					/* va_nodeid; */
	0,					/* va_nlink; */
	0,					/* va_size; */
	0,					/* va_atime; */
	0,					/* va_mtime; */
	0,					/* va_ctime; */
	0,					/* va_rdev; */
	0,					/* va_blksize; */
	0,					/* va_nblocks; */
	0,					/* va_seq; */
};

/* defaults for ordinary character/block device minors */
vattr_t dv_vattr_file = {
	AT_TYPE|AT_MODE|AT_SIZE|AT_UID|AT_GID|AT_RDEV,	/* va_mask */
	0,					/* va_type */
	DV_DEVMODE_DEFAULT,			/* va_mode */
	DV_UID_DEFAULT,				/* va_uid */
	DV_GID_DEFAULT,				/* va_gid */
	0,					/* va_fsid; */
	0,					/* va_nodeid; */
	0,					/* va_nlink; */
	0,					/* va_size; */
	0,					/* va_atime; */
	0,					/* va_mtime; */
	0,					/* va_ctime; */
	0,					/* va_rdev; */
	0,					/* va_blksize; */
	0,					/* va_nblocks; */
	0,					/* va_seq; */
};

/* defaults for minors flagged DV_NO_FSPERM (privileged minors) */
vattr_t dv_vattr_priv = {
	AT_TYPE|AT_MODE|AT_SIZE|AT_UID|AT_GID|AT_RDEV,	/* va_mask */
	0,					/* va_type */
	DV_DEVMODE_PRIV,			/* va_mode */
	DV_UID_DEFAULT,				/* va_uid */
	DV_GID_DEFAULT,				/* va_gid */
	0,					/* va_fsid; */
	0,					/* va_nodeid; */
	0,					/* va_nlink; */
	0,					/* va_size; */
	0,					/* va_atime; */
	0,					/* va_mtime; */
	0,					/* va_ctime; */
	0,					/* va_rdev; */
	0,					/* va_blksize; */
	0,					/* va_nblocks; */
	0,					/* va_seq; */
};

/* clone driver identity and driver-hold interface, defined elsewhere */
extern dev_info_t *clone_dip;
extern major_t clone_major;
extern struct dev_ops *ddi_hold_driver(major_t);
136
137 /* dv_node node constructor for kmem cache */
138 static int
139 i_dv_node_ctor(void *buf, void *cfarg, int flag)
140 {
141 _NOTE(ARGUNUSED(cfarg, flag))
142 struct dv_node *dv = (struct dv_node *)buf;
143 struct vnode *vp;
144
145 bzero(buf, sizeof (struct dv_node));
146 vp = dv->dv_vnode = vn_alloc(flag);
147 if (vp == NULL) {
148 return (-1);
149 }
150 vp->v_data = dv;
151 rw_init(&dv->dv_contents, NULL, RW_DEFAULT, NULL);
152 return (0);
153 }
154
155 /* dv_node node destructor for kmem cache */
156 static void
157 i_dv_node_dtor(void *buf, void *arg)
158 {
159 _NOTE(ARGUNUSED(arg))
160 struct dv_node *dv = (struct dv_node *)buf;
161 struct vnode *vp = DVTOV(dv);
162
163 rw_destroy(&dv->dv_contents);
164 vn_invalid(vp);
165 vn_free(vp);
166 }
167
168
169 /* initialize dv_node node cache */
170 void
171 dv_node_cache_init()
172 {
173 ASSERT(dv_node_cache == NULL);
174 dv_node_cache = kmem_cache_create("dv_node_cache",
175 sizeof (struct dv_node), 0, i_dv_node_ctor, i_dv_node_dtor,
176 NULL, NULL, NULL, 0);
177
178 tsd_create(&devfs_clean_key, NULL);
179 }
180
181 /* destroy dv_node node cache */
182 void
183 dv_node_cache_fini()
184 {
185 ASSERT(dv_node_cache != NULL);
186 kmem_cache_destroy(dv_node_cache);
187 dv_node_cache = NULL;
188
189 tsd_destroy(&devfs_clean_key);
190 }
191
192 /*
193 * dv_mkino - Generate a unique inode number for devfs nodes.
194 *
195 * Although ino_t is 64 bits, the inode number is truncated to 32 bits for 32
196 * bit non-LARGEFILE applications. This means that there is a requirement to
197 * maintain the inode number as a 32 bit value or applications will have
 * stat(2) calls fail with EOVERFLOW. We form a 32 bit inode number from the
 * dev_t, but if the minor number is larger than L_MAXMIN32 the extra minor
 * bits are currently discarded (see the note below on folding the overflow
 * minor bits into the high inode bits for 64-bit dev_t support).
200 *
201 * To generate inode numbers for directories, we assume that we will never use
202 * more than half the major space - this allows for ~8190 drivers. We use this
203 * upper major number space to allocate inode numbers for directories by
204 * encoding the major and instance into this space.
205 *
206 * We also skew the result so that inode 2 is reserved for the root of the file
207 * system.
208 *
209 * As part of the future support for 64-bit dev_t APIs, the upper minor bits
210 * should be folded into the high inode bits by adding the following code
211 * after "ino |= 1":
212 *
213 * #if (L_BITSMINOR32 != L_BITSMINOR)
214 * |* fold overflow minor bits into high bits of inode number *|
215 * ino |= ((ino_t)(minor >> L_BITSMINOR32)) << L_BITSMINOR;
216 * #endif |* (L_BITSMINOR32 != L_BITSMINOR) *|
217 *
218 * This way only applications that use devices that overflow their minor
219 * space will have an application level impact.
220 */
static ino_t
dv_mkino(dev_info_t *devi, vtype_t typ, dev_t dev)
{
	major_t major;
	minor_t minor;
	ino_t ino;
	static int warn;	/* warn at most once about non-unique inodes */

	if (typ == VDIR) {
		/*
		 * Directory inode: synthesize a device number in the upper
		 * half of the 32-bit major space, with the devinfo instance
		 * standing in for the minor number.
		 */
		major = ((L_MAXMAJ32 + 1) >> 1) + DEVI(devi)->devi_major;
		minor = ddi_get_instance(devi);

		/* makedevice32 in high half of major number space */
		ino = (ino_t)((major << L_BITSMINOR32) | (minor & L_MAXMIN32));

		/* restore the true major for the uniqueness check below */
		major = DEVI(devi)->devi_major;
	} else {
		major = getmajor(dev);
		minor = getminor(dev);

		/* makedevice32 */
		ino = (ino_t)((major << L_BITSMINOR32) | (minor & L_MAXMIN32));

		/* make ino for VCHR different than VBLK */
		ino <<= 1;
		if (typ == VCHR)
			ino |= 1;
	}

	ino += DV_ROOTINO + 1;		/* skew */

	/*
	 * diagnose things a little early because adding the skew to a large
	 * minor number could roll over the major.
	 */
	if ((major >= (L_MAXMAJ32 >> 1)) && (warn == 0)) {
		warn = 1;
		cmn_err(CE_WARN, "%s: inode numbers are not unique", dvnm);
	}

	return (ino);
}
263
264 /*
265 * Compare two nodes lexographically to balance avl tree
266 */
267 static int
268 dv_compare_nodes(const struct dv_node *dv1, const struct dv_node *dv2)
269 {
270 int rv;
271
272 if ((rv = strcmp(dv1->dv_name, dv2->dv_name)) == 0)
273 return (0);
274 return ((rv < 0) ? -1 : 1);
275 }
276
277 /*
278 * dv_mkroot
279 *
280 * Build the first VDIR dv_node.
281 */
282 struct dv_node *
283 dv_mkroot(struct vfs *vfsp, dev_t devfsdev)
284 {
285 struct dv_node *dv;
286 struct vnode *vp;
287
288 ASSERT(ddi_root_node() != NULL);
289 ASSERT(dv_node_cache != NULL);
290
291 dcmn_err3(("dv_mkroot\n"));
292 dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
293 vp = DVTOV(dv);
294 vn_reinit(vp);
295 vp->v_flag = VROOT;
296 vp->v_vfsp = vfsp;
297 vp->v_type = VDIR;
298 vp->v_rdev = devfsdev;
299 vn_setops(vp, dv_vnodeops);
300 vn_exists(vp);
301
302 dvroot = dv;
303
304 dv->dv_name = NULL; /* not needed */
305 dv->dv_namelen = 0;
306
307 dv->dv_devi = ddi_root_node();
308
309 dv->dv_ino = DV_ROOTINO;
310 dv->dv_nlink = 2; /* name + . (no dv_insert) */
311 dv->dv_dotdot = dv; /* .. == self */
312 dv->dv_attrvp = NULLVP;
313 dv->dv_attr = NULL;
314 dv->dv_flags = DV_BUILD;
315 dv->dv_priv = NULL;
316 dv->dv_busy = 0;
317 dv->dv_dflt_mode = 0;
318
319 avl_create(&dv->dv_entries,
320 (int (*)(const void *, const void *))dv_compare_nodes,
321 sizeof (struct dv_node), offsetof(struct dv_node, dv_avllink));
322
323 return (dv);
324 }
325
326 /*
327 * dv_mkdir
328 *
329 * Given an probed or attached nexus node, create a VDIR dv_node.
330 * No dv_attrvp is created at this point.
331 */
332 struct dv_node *
333 dv_mkdir(struct dv_node *ddv, dev_info_t *devi, char *nm)
334 {
335 struct dv_node *dv;
336 struct vnode *vp;
337 size_t nmlen;
338
339 ASSERT((devi));
340 dcmn_err4(("dv_mkdir: %s\n", nm));
341
342 dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
343 nmlen = strlen(nm) + 1;
344 dv->dv_name = kmem_alloc(nmlen, KM_SLEEP);
345 bcopy(nm, dv->dv_name, nmlen);
346 dv->dv_namelen = nmlen - 1; /* '\0' not included */
347
348 vp = DVTOV(dv);
349 vn_reinit(vp);
350 vp->v_flag = 0;
351 vp->v_vfsp = DVTOV(ddv)->v_vfsp;
352 vp->v_type = VDIR;
353 vp->v_rdev = DVTOV(ddv)->v_rdev;
354 vn_setops(vp, vn_getops(DVTOV(ddv)));
355 vn_exists(vp);
356
357 dv->dv_devi = devi;
358 ndi_hold_devi(devi);
359
360 dv->dv_ino = dv_mkino(devi, VDIR, NODEV);
361 dv->dv_nlink = 0; /* updated on insert */
362 dv->dv_dotdot = ddv;
363 dv->dv_attrvp = NULLVP;
364 dv->dv_attr = NULL;
365 dv->dv_flags = DV_BUILD;
366 dv->dv_priv = NULL;
367 dv->dv_busy = 0;
368 dv->dv_dflt_mode = 0;
369
370 avl_create(&dv->dv_entries,
371 (int (*)(const void *, const void *))dv_compare_nodes,
372 sizeof (struct dv_node), offsetof(struct dv_node, dv_avllink));
373
374 return (dv);
375 }
376
377 /*
378 * dv_mknod
379 *
380 * Given a minor node, create a VCHR or VBLK dv_node.
381 * No dv_attrvp is created at this point.
382 */
383 static struct dv_node *
384 dv_mknod(struct dv_node *ddv, dev_info_t *devi, char *nm,
385 struct ddi_minor_data *dmd)
386 {
387 struct dv_node *dv;
388 struct vnode *vp;
389 size_t nmlen;
390
391 dcmn_err4(("dv_mknod: %s\n", nm));
392
393 dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
394 nmlen = strlen(nm) + 1;
395 dv->dv_name = kmem_alloc(nmlen, KM_SLEEP);
396 bcopy(nm, dv->dv_name, nmlen);
397 dv->dv_namelen = nmlen - 1; /* no '\0' */
398
399 vp = DVTOV(dv);
400 vn_reinit(vp);
401 vp->v_flag = 0;
402 vp->v_vfsp = DVTOV(ddv)->v_vfsp;
403 vp->v_type = dmd->ddm_spec_type == S_IFCHR ? VCHR : VBLK;
404 vp->v_rdev = dmd->ddm_dev;
405 vn_setops(vp, vn_getops(DVTOV(ddv)));
406 vn_exists(vp);
407
408 ASSERT(DEVI_BUSY_OWNED(devi));
409 ndi_hold_devi(devi);
410
411 dv->dv_devi = devi;
412 dv->dv_ino = dv_mkino(devi, vp->v_type, vp->v_rdev);
413 dv->dv_nlink = 0; /* updated on insert */
414 dv->dv_dotdot = ddv;
415 dv->dv_attrvp = NULLVP;
416 dv->dv_attr = NULL;
417 dv->dv_flags = 0;
418
419 if (dmd->type == DDM_INTERNAL_PATH)
420 dv->dv_flags |= DV_INTERNAL;
421 if (dmd->ddm_flags & DM_NO_FSPERM)
422 dv->dv_flags |= DV_NO_FSPERM;
423
424 dv->dv_priv = dmd->ddm_node_priv;
425 if (dv->dv_priv)
426 dphold(dv->dv_priv);
427
428 /*
429 * Minors created with ddi_create_priv_minor_node can specify
430 * a default mode permission other than the devfs default.
431 */
432 if (dv->dv_priv || dv->dv_flags & DV_NO_FSPERM) {
433 dcmn_err5(("%s: dv_mknod default priv mode 0%o\n",
434 dv->dv_name, dmd->ddm_priv_mode));
435 dv->dv_flags |= DV_DFLT_MODE;
436 dv->dv_dflt_mode = dmd->ddm_priv_mode & S_IAMB;
437 }
438
439 return (dv);
440 }
441
442 /*
443 * dv_destroy
444 *
445 * Destroy what we created in dv_mkdir or dv_mknod.
446 * In the case of a *referenced* directory, do nothing.
447 */
448 void
449 dv_destroy(struct dv_node *dv, uint_t flags)
450 {
451 vnode_t *vp = DVTOV(dv);
452 ASSERT(dv->dv_nlink == 0); /* no references */
453
454 dcmn_err4(("dv_destroy: %s\n", dv->dv_name));
455
456 /*
457 * We may be asked to unlink referenced directories.
458 * In this case, there is nothing to be done.
459 * The eventual memory free will be done in
460 * devfs_inactive.
461 */
462 if (vp->v_count != 0) {
463 ASSERT(vp->v_type == VDIR);
464 ASSERT(flags & DV_CLEAN_FORCE);
465 ASSERT(DV_STALE(dv));
466 return;
467 }
468
469 if (vp->v_type == VDIR) {
470 ASSERT(DV_FIRST_ENTRY(dv) == NULL);
471 avl_destroy(&dv->dv_entries);
472 }
473
474 if (dv->dv_attrvp != NULLVP)
475 VN_RELE(dv->dv_attrvp);
476 if (dv->dv_attr != NULL)
477 kmem_free(dv->dv_attr, sizeof (struct vattr));
478 if (dv->dv_name != NULL)
479 kmem_free(dv->dv_name, dv->dv_namelen + 1);
480 if (dv->dv_devi != NULL) {
481 ndi_rele_devi(dv->dv_devi);
482 }
483 if (dv->dv_priv != NULL) {
484 dpfree(dv->dv_priv);
485 }
486
487 kmem_cache_free(dv_node_cache, dv);
488 }
489
490 /*
491 * Find and hold dv_node by name
492 */
493 static struct dv_node *
494 dv_findbyname(struct dv_node *ddv, char *nm)
495 {
496 struct dv_node *dv;
497 avl_index_t where;
498 struct dv_node dvtmp;
499
500 ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
501 dcmn_err3(("dv_findbyname: %s\n", nm));
502
503 dvtmp.dv_name = nm;
504 dv = avl_find(&ddv->dv_entries, &dvtmp, &where);
505 if (dv) {
506 ASSERT(dv->dv_dotdot == ddv);
507 ASSERT(strcmp(dv->dv_name, nm) == 0);
508 VN_HOLD(DVTOV(dv));
509 return (dv);
510 }
511 return (NULL);
512 }
513
514 /*
515 * Inserts a new dv_node in a parent directory
516 */
517 void
518 dv_insert(struct dv_node *ddv, struct dv_node *dv)
519 {
520 avl_index_t where;
521
522 ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
523 ASSERT(DVTOV(ddv)->v_type == VDIR);
524 ASSERT(ddv->dv_nlink >= 2);
525 ASSERT(dv->dv_nlink == 0);
526
527 dcmn_err3(("dv_insert: %s\n", dv->dv_name));
528
529 dv->dv_dotdot = ddv;
530 if (DVTOV(dv)->v_type == VDIR) {
531 ddv->dv_nlink++; /* .. to containing directory */
532 dv->dv_nlink = 2; /* name + . */
533 } else {
534 dv->dv_nlink = 1; /* name */
535 }
536
537 /* enter node in the avl tree */
538 VERIFY(avl_find(&ddv->dv_entries, dv, &where) == NULL);
539 avl_insert(&ddv->dv_entries, dv, where);
540 }
541
542 /*
543 * Unlink a dv_node from a perent directory
544 */
545 void
546 dv_unlink(struct dv_node *ddv, struct dv_node *dv)
547 {
548 /* verify linkage of arguments */
549 ASSERT(ddv && dv);
550 ASSERT(dv->dv_dotdot == ddv);
551 ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
552 ASSERT(DVTOV(ddv)->v_type == VDIR);
553
554 dcmn_err3(("dv_unlink: %s\n", dv->dv_name));
555
556 if (DVTOV(dv)->v_type == VDIR) {
557 ddv->dv_nlink--; /* .. to containing directory */
558 dv->dv_nlink -= 2; /* name + . */
559 } else {
560 dv->dv_nlink -= 1; /* name */
561 }
562 ASSERT(ddv->dv_nlink >= 2);
563 ASSERT(dv->dv_nlink == 0);
564
565 dv->dv_dotdot = NULL;
566
567 /* remove from avl tree */
568 avl_remove(&ddv->dv_entries, dv);
569 }
570
571 /*
572 * Merge devfs node specific information into an attribute structure.
573 *
574 * NOTE: specfs provides ATIME,MTIME,CTIME,SIZE,BLKSIZE,NBLOCKS on leaf node.
575 */
576 void
577 dv_vattr_merge(struct dv_node *dv, struct vattr *vap)
578 {
579 struct vnode *vp = DVTOV(dv);
580
581 vap->va_nodeid = dv->dv_ino;
582 vap->va_nlink = dv->dv_nlink;
583
584 if (vp->v_type == VDIR) {
585 vap->va_rdev = 0;
586 vap->va_fsid = vp->v_rdev;
587 } else {
588 vap->va_rdev = vp->v_rdev;
589 vap->va_fsid = DVTOV(dv->dv_dotdot)->v_rdev;
590 vap->va_type = vp->v_type;
591 /* don't trust the shadow file type */
592 vap->va_mode &= ~S_IFMT;
593 if (vap->va_type == VCHR)
594 vap->va_mode |= S_IFCHR;
595 else
596 vap->va_mode |= S_IFBLK;
597 }
598 }
599
600 /*
601 * Get default device permission by consulting rules in
602 * privilege specification in minor node and /etc/minor_perm.
603 *
604 * This function is called from the devname filesystem to get default
605 * permissions for a device exported to a non-global zone.
606 */
607 void
608 devfs_get_defattr(struct vnode *vp, struct vattr *vap, int *no_fs_perm)
609 {
610 mperm_t mp;
611 struct dv_node *dv;
612
613 /* If vp isn't a dv_node, return something sensible */
614 if (!vn_matchops(vp, dv_vnodeops)) {
615 if (no_fs_perm)
616 *no_fs_perm = 0;
617 *vap = dv_vattr_file;
618 return;
619 }
620
621 /*
622 * For minors not created by ddi_create_priv_minor_node(),
623 * use devfs defaults.
624 */
625 dv = VTODV(vp);
626 if (vp->v_type == VDIR) {
627 *vap = dv_vattr_dir;
628 } else if (dv->dv_flags & DV_NO_FSPERM) {
629 if (no_fs_perm)
630 *no_fs_perm = 1;
631 *vap = dv_vattr_priv;
632 } else {
633 /*
634 * look up perm bits from minor_perm
635 */
636 *vap = dv_vattr_file;
637 if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) == 0) {
638 VATTR_MP_MERGE((*vap), mp);
639 dcmn_err5(("%s: minor perm mode 0%o\n",
640 dv->dv_name, vap->va_mode));
641 } else if (dv->dv_flags & DV_DFLT_MODE) {
642 ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
643 vap->va_mode &= ~S_IAMB;
644 vap->va_mode |= dv->dv_dflt_mode;
645 dcmn_err5(("%s: priv mode 0%o\n",
646 dv->dv_name, vap->va_mode));
647 }
648 }
649 }
650
651 /*
652 * dv_shadow_node
653 *
654 * Given a VDIR dv_node, find/create the associated VDIR
655 * node in the shadow attribute filesystem.
656 *
657 * Given a VCHR/VBLK dv_node, find the associated VREG
658 * node in the shadow attribute filesystem. These nodes
659 * are only created to persist non-default attributes.
660 * Lack of such a node implies the default permissions
661 * are sufficient.
662 *
663 * Managing the attribute file entries is slightly tricky (mostly
664 * because we can't intercept VN_HOLD and VN_RELE except on the last
665 * release).
666 *
667 * We assert that if the dv_attrvp pointer is non-NULL, it points
668 * to a singly-held (by us) vnode that represents the shadow entry
669 * in the underlying filesystem. To avoid store-ordering issues,
670 * we assert that the pointer can only be tested under the dv_contents
671 * READERS lock.
672 */
673
674 void
675 dv_shadow_node(
676 struct vnode *dvp, /* devfs parent directory vnode */
677 char *nm, /* name component */
678 struct vnode *vp, /* devfs vnode */
679 struct pathname *pnp, /* the path .. */
680 struct vnode *rdir, /* the root .. */
681 struct cred *cred, /* who's asking? */
682 int flags) /* optionally create shadow node */
683 {
684 struct dv_node *dv; /* dv_node of named directory */
685 struct vnode *rdvp; /* shadow parent directory vnode */
686 struct vnode *rvp; /* shadow vnode */
687 struct vnode *rrvp; /* realvp of shadow vnode */
688 struct vattr vattr;
689 int create_tried;
690 int error;
691
692 ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
693 dv = VTODV(vp);
694 dcmn_err3(("dv_shadow_node: name %s attr %p\n",
695 nm, (void *)dv->dv_attrvp));
696
697 if ((flags & DV_SHADOW_WRITE_HELD) == 0) {
698 ASSERT(RW_READ_HELD(&dv->dv_contents));
699 if (dv->dv_attrvp != NULLVP)
700 return;
701 if (!rw_tryupgrade(&dv->dv_contents)) {
702 rw_exit(&dv->dv_contents);
703 rw_enter(&dv->dv_contents, RW_WRITER);
704 if (dv->dv_attrvp != NULLVP) {
705 rw_downgrade(&dv->dv_contents);
706 return;
707 }
708 }
709 } else {
710 ASSERT(RW_WRITE_HELD(&dv->dv_contents));
711 if (dv->dv_attrvp != NULLVP)
712 return;
713 }
714
715 ASSERT(RW_WRITE_HELD(&dv->dv_contents) && dv->dv_attrvp == NULL);
716
717 rdvp = VTODV(dvp)->dv_attrvp;
718 create_tried = 0;
719 lookup:
720 if (rdvp && (dv->dv_flags & DV_NO_FSPERM) == 0) {
721 error = VOP_LOOKUP(rdvp, nm, &rvp, pnp, LOOKUP_DIR, rdir, cred,
722 NULL, NULL, NULL);
723
724 /* factor out the snode since we only want the attribute node */
725 if ((error == 0) && (VOP_REALVP(rvp, &rrvp, NULL) == 0)) {
726 VN_HOLD(rrvp);
727 VN_RELE(rvp);
728 rvp = rrvp;
729 }
730 } else
731 error = EROFS; /* no parent, no entry */
732
733 /*
734 * All we want is the permissions (and maybe ACLs and
735 * extended attributes), and we want to perform lookups
736 * by name. Drivers occasionally change their minor
737 * number space. If something changes, there's no
738 * much we can do about it here.
739 */
740
741 /* The shadow node checks out. We are done */
742 if (error == 0) {
743 dv->dv_attrvp = rvp; /* with one hold */
744
745 /*
746 * Determine if we have non-trivial ACLs on this node.
747 * It is not necessary to VOP_RWLOCK since fs_acl_nontrivial
748 * only does VOP_GETSECATTR.
749 */
750 dv->dv_flags &= ~DV_ACL;
751
752 if (fs_acl_nontrivial(rvp, cred))
753 dv->dv_flags |= DV_ACL;
754
755 /*
756 * If we have synced out the memory attributes, free
757 * them and switch back to using the persistent store.
758 */
759 if (rvp && dv->dv_attr) {
760 kmem_free(dv->dv_attr, sizeof (struct vattr));
761 dv->dv_attr = NULL;
762 }
763 if ((flags & DV_SHADOW_WRITE_HELD) == 0)
764 rw_downgrade(&dv->dv_contents);
765 ASSERT(RW_LOCK_HELD(&dv->dv_contents));
766 return;
767 }
768
769 /*
770 * Failed to find attribute in persistent backing store,
771 * get default permission bits.
772 */
773 devfs_get_defattr(vp, &vattr, NULL);
774
775 dv_vattr_merge(dv, &vattr);
776 gethrestime(&vattr.va_atime);
777 vattr.va_mtime = vattr.va_atime;
778 vattr.va_ctime = vattr.va_atime;
779
780 /*
781 * Try to create shadow dir. This is necessary in case
782 * we need to create a shadow leaf node later, when user
783 * executes chmod.
784 */
785 if ((error == ENOENT) && !create_tried) {
786 switch (vp->v_type) {
787 case VDIR:
788 error = VOP_MKDIR(rdvp, nm, &vattr, &rvp, kcred,
789 NULL, 0, NULL);
790 dsysdebug(error, ("vop_mkdir %s %s %d\n",
791 VTODV(dvp)->dv_name, nm, error));
792 create_tried = 1;
793 break;
794
795 case VCHR:
796 case VBLK:
797 /*
798 * Shadow nodes are only created on demand
799 */
800 if (flags & DV_SHADOW_CREATE) {
801 error = VOP_CREATE(rdvp, nm, &vattr, NONEXCL,
802 VREAD|VWRITE, &rvp, kcred, 0, NULL, NULL);
803 dsysdebug(error, ("vop_create %s %s %d\n",
804 VTODV(dvp)->dv_name, nm, error));
805 create_tried = 1;
806 }
807 break;
808
809 default:
810 cmn_err(CE_PANIC, "devfs: %s: create", dvnm);
811 /*NOTREACHED*/
812 }
813
814 if (create_tried &&
815 (error == 0) || (error == EEXIST)) {
816 VN_RELE(rvp);
817 goto lookup;
818 }
819 }
820
821 /* Store attribute in memory */
822 if (dv->dv_attr == NULL) {
823 dv->dv_attr = kmem_alloc(sizeof (struct vattr), KM_SLEEP);
824 *(dv->dv_attr) = vattr;
825 }
826
827 if ((flags & DV_SHADOW_WRITE_HELD) == 0)
828 rw_downgrade(&dv->dv_contents);
829 ASSERT(RW_LOCK_HELD(&dv->dv_contents));
830 }
831
832 /*
833 * Given a devinfo node, and a name, returns the appropriate
834 * minor information for that named node, if it exists.
835 */
836 static int
837 dv_find_leafnode(dev_info_t *devi, char *minor_nm, struct ddi_minor_data *r_mi)
838 {
839 struct ddi_minor_data *dmd;
840
841 ASSERT(i_ddi_devi_attached(devi));
842
843 dcmn_err3(("dv_find_leafnode: %s\n", minor_nm));
844 ASSERT(DEVI_BUSY_OWNED(devi));
845 for (dmd = DEVI(devi)->devi_minor; dmd; dmd = dmd->next) {
846
847 /*
848 * Skip alias nodes and nodes without a name.
849 */
850 if ((dmd->type == DDM_ALIAS) || (dmd->ddm_name == NULL))
851 continue;
852
853 dcmn_err4(("dv_find_leafnode: (%s,%s)\n",
854 minor_nm, dmd->ddm_name));
855 if (strcmp(minor_nm, dmd->ddm_name) == 0) {
856 r_mi->ddm_dev = dmd->ddm_dev;
857 r_mi->ddm_spec_type = dmd->ddm_spec_type;
858 r_mi->type = dmd->type;
859 r_mi->ddm_flags = dmd->ddm_flags;
860 r_mi->ddm_node_priv = dmd->ddm_node_priv;
861 r_mi->ddm_priv_mode = dmd->ddm_priv_mode;
862 if (r_mi->ddm_node_priv)
863 dphold(r_mi->ddm_node_priv);
864 return (0);
865 }
866 }
867
868 dcmn_err3(("dv_find_leafnode: %s: ENOENT\n", minor_nm));
869 return (ENOENT);
870 }
871
872 /*
873 * Special handling for clone node:
874 * Clone minor name is a driver name, the minor number will
875 * be the major number of the driver. There is no minor
876 * node under the clone driver, so we'll manufacture the
877 * dev_t.
878 */
879 static struct dv_node *
880 dv_clone_mknod(struct dv_node *ddv, char *drvname)
881 {
882 major_t major;
883 struct dv_node *dvp;
884 char *devnm;
885 struct ddi_minor_data *dmd;
886
887 /*
888 * Make sure drvname is a STREAMS driver. We load the driver,
889 * but don't attach to any instances. This makes stat(2)
890 * relatively cheap.
891 */
892 major = ddi_name_to_major(drvname);
893 if (major == DDI_MAJOR_T_NONE)
894 return (NULL);
895
896 if (ddi_hold_driver(major) == NULL)
897 return (NULL);
898
899 if (STREAMSTAB(major) == NULL) {
900 ddi_rele_driver(major);
901 return (NULL);
902 }
903
904 ddi_rele_driver(major);
905 devnm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
906 (void) snprintf(devnm, MAXNAMELEN, "clone@0:%s", drvname);
907 dmd = kmem_zalloc(sizeof (*dmd), KM_SLEEP);
908 dmd->ddm_dev = makedevice(clone_major, (minor_t)major);
909 dmd->ddm_spec_type = S_IFCHR;
910 dvp = dv_mknod(ddv, clone_dip, devnm, dmd);
911 kmem_free(dmd, sizeof (*dmd));
912 kmem_free(devnm, MAXNAMELEN);
913 return (dvp);
914 }
915
916 /*
917 * Given the parent directory node, and a name in it, returns the
918 * named dv_node to the caller (as a vnode).
919 *
920 * (We need pnp and rdir for doing shadow lookups; they can be NULL)
921 */
922 int
923 dv_find(struct dv_node *ddv, char *nm, struct vnode **vpp, struct pathname *pnp,
924 struct vnode *rdir, struct cred *cred, uint_t ndi_flags)
925 {
926 extern int isminiroot; /* see modctl.c */
927
928 int circ;
929 int rv = 0, was_busy = 0, nmlen, write_held = 0;
930 struct vnode *vp;
931 struct dv_node *dv, *dup;
932 dev_info_t *pdevi, *devi = NULL;
933 char *mnm;
934 struct ddi_minor_data *dmd;
935
936 dcmn_err3(("dv_find %s\n", nm));
937
938 if (!rw_tryenter(&ddv->dv_contents, RW_READER)) {
939 if (tsd_get(devfs_clean_key))
940 return (EBUSY);
941 rw_enter(&ddv->dv_contents, RW_READER);
942 }
943 start:
944 if (DV_STALE(ddv)) {
945 rw_exit(&ddv->dv_contents);
946 return (ESTALE);
947 }
948
949 /*
950 * Empty name or ., return node itself.
951 */
952 nmlen = strlen(nm);
953 if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
954 *vpp = DVTOV(ddv);
955 rw_exit(&ddv->dv_contents);
956 VN_HOLD(*vpp);
957 return (0);
958 }
959
960 /*
961 * .., return the parent directory
962 */
963 if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
964 *vpp = DVTOV(ddv->dv_dotdot);
965 rw_exit(&ddv->dv_contents);
966 VN_HOLD(*vpp);
967 return (0);
968 }
969
970 /*
971 * Fail anything without a valid device name component
972 */
973 if (nm[0] == '@' || nm[0] == ':') {
974 dcmn_err3(("devfs: no driver '%s'\n", nm));
975 rw_exit(&ddv->dv_contents);
976 return (ENOENT);
977 }
978
979 /*
980 * So, now we have to deal with the trickier stuff.
981 *
982 * (a) search the existing list of dv_nodes on this directory
983 */
984 if ((dv = dv_findbyname(ddv, nm)) != NULL) {
985 founddv:
986 ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
987
988 if (!rw_tryenter(&dv->dv_contents, RW_READER)) {
989 if (tsd_get(devfs_clean_key)) {
990 VN_RELE(DVTOV(dv));
991 rw_exit(&ddv->dv_contents);
992 return (EBUSY);
993 }
994 rw_enter(&dv->dv_contents, RW_READER);
995 }
996
997 vp = DVTOV(dv);
998 if ((dv->dv_attrvp != NULLVP) ||
999 (vp->v_type != VDIR && dv->dv_attr != NULL)) {
1000 /*
1001 * Common case - we already have attributes
1002 */
1003 rw_exit(&dv->dv_contents);
1004 rw_exit(&ddv->dv_contents);
1005 goto found;
1006 }
1007
1008 /*
1009 * No attribute vp, try and build one.
1010 *
1011 * dv_shadow_node() can briefly drop &dv->dv_contents lock
1012 * if it is unable to upgrade it to a write lock. If the
1013 * current thread has come in through the bottom-up device
1014 * configuration devfs_clean() path, we may deadlock against
1015 * a thread performing top-down device configuration if it
1016 * grabs the contents lock. To avoid this, when we are on the
1017 * devfs_clean() path we attempt to upgrade the dv_contents
1018 * lock before we call dv_shadow_node().
1019 */
1020 if (tsd_get(devfs_clean_key)) {
1021 if (!rw_tryupgrade(&dv->dv_contents)) {
1022 VN_RELE(DVTOV(dv));
1023 rw_exit(&dv->dv_contents);
1024 rw_exit(&ddv->dv_contents);
1025 return (EBUSY);
1026 }
1027
1028 write_held = DV_SHADOW_WRITE_HELD;
1029 }
1030
1031 dv_shadow_node(DVTOV(ddv), nm, vp, pnp, rdir, cred,
1032 write_held);
1033
1034 rw_exit(&dv->dv_contents);
1035 rw_exit(&ddv->dv_contents);
1036 goto found;
1037 }
1038
1039 /*
1040 * (b) Search the child devinfo nodes of our parent directory,
1041 * looking for the named node. If we find it, build a new
1042 * node, then grab the writers lock, search the directory
1043 * if it's still not there, then insert it.
1044 *
1045 * We drop the devfs locks before accessing the device tree.
1046 * Take care to mark the node BUSY so that a forced devfs_clean
1047 * doesn't mark the directory node stale.
1048 *
1049 * Also, check if we are called as part of devfs_clean or
1050 * reset_perm. If so, simply return not found because there
1051 * is nothing to clean.
1052 */
1053 if (tsd_get(devfs_clean_key)) {
1054 rw_exit(&ddv->dv_contents);
1055 return (ENOENT);
1056 }
1057
1058 /*
1059 * We could be either READ or WRITE locked at
1060 * this point. Upgrade if we are read locked.
1061 */
1062 ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
1063 if (rw_read_locked(&ddv->dv_contents) &&
1064 !rw_tryupgrade(&ddv->dv_contents)) {
1065 rw_exit(&ddv->dv_contents);
1066 rw_enter(&ddv->dv_contents, RW_WRITER);
1067 /*
1068 * Things may have changed when we dropped
1069 * the contents lock, so start from top again
1070 */
1071 goto start;
1072 }
1073 ddv->dv_busy++; /* mark busy before dropping lock */
1074 was_busy++;
1075 rw_exit(&ddv->dv_contents);
1076
1077 pdevi = ddv->dv_devi;
1078 ASSERT(pdevi != NULL);
1079
1080 mnm = strchr(nm, ':');
1081 if (mnm)
1082 *mnm = (char)0;
1083
1084 /*
1085 * Configure one nexus child, will call nexus's bus_ops
1086 * If successful, devi is held upon returning.
1087 * Note: devfs lookup should not be configuring grandchildren.
1088 */
1089 ASSERT((ndi_flags & NDI_CONFIG) == 0);
1090
1091 rv = ndi_devi_config_one(pdevi, nm, &devi, ndi_flags | NDI_NO_EVENT);
1092 if (mnm)
1093 *mnm = ':';
1094 if (rv != NDI_SUCCESS) {
1095 rv = ENOENT;
1096 goto notfound;
1097 }
1098
1099 ASSERT(devi);
1100
1101 /* Check if this is a path alias */
1102 if (ddi_aliases_present == B_TRUE && ddi_get_parent(devi) != pdevi) {
1103 char *curr = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1104
1105 (void) ddi_pathname(devi, curr);
1106
1107 vp = NULL;
1108 if (devfs_lookupname(curr, NULL, &vp) == 0 && vp) {
1109 dv = VTODV(vp);
1110 kmem_free(curr, MAXPATHLEN);
1111 goto found;
1112 }
1113 kmem_free(curr, MAXPATHLEN);
1114 }
1115
1116 /*
1117 * If we configured a hidden node, consider it notfound.
1118 */
1119 if (ndi_dev_is_hidden_node(devi)) {
1120 ndi_rele_devi(devi);
1121 rv = ENOENT;
1122 goto notfound;
1123 }
1124
1125 /*
1126 * Don't make vhci clients visible under phci, unless we
1127 * are in miniroot.
1128 */
1129 if (isminiroot == 0 && ddi_get_parent(devi) != pdevi) {
1130 ndi_rele_devi(devi);
1131 rv = ENOENT;
1132 goto notfound;
1133 }
1134
1135 ASSERT(devi && i_ddi_devi_attached(devi));
1136
1137 /*
1138 * Invalidate cache to notice newly created minor nodes.
1139 */
1140 rw_enter(&ddv->dv_contents, RW_WRITER);
1141 ddv->dv_flags |= DV_BUILD;
1142 rw_exit(&ddv->dv_contents);
1143
1144 /*
1145 * mkdir for nexus drivers and leaf nodes as well. If we are racing
1146 * and create a duplicate, the duplicate will be destroyed below.
1147 */
1148 if (mnm == NULL) {
1149 dv = dv_mkdir(ddv, devi, nm);
1150 } else {
1151 /*
1152 * Allocate dmd first to avoid KM_SLEEP with active
1153 * ndi_devi_enter.
1154 */
1155 dmd = kmem_zalloc(sizeof (*dmd), KM_SLEEP);
1156 ndi_devi_enter(devi, &circ);
1157 if (devi == clone_dip) {
1158 /*
1159 * For clone minors, load the driver indicated by
1160 * minor name.
1161 */
1162 dv = dv_clone_mknod(ddv, mnm + 1);
1163 } else {
1164 /*
1165 * Find minor node and make a dv_node
1166 */
1167 if (dv_find_leafnode(devi, mnm + 1, dmd) == 0) {
1168 dv = dv_mknod(ddv, devi, nm, dmd);
1169 if (dmd->ddm_node_priv)
1170 dpfree(dmd->ddm_node_priv);
1171 }
1172 }
1173 ndi_devi_exit(devi, circ);
1174 kmem_free(dmd, sizeof (*dmd));
1175 }
1176 /*
1177 * Release hold from ndi_devi_config_one()
1178 */
1179 ndi_rele_devi(devi);
1180
1181 if (dv == NULL) {
1182 rv = ENOENT;
1183 goto notfound;
1184 }
1185
1186 /*
1187 * We have released the dv_contents lock, need to check
1188 * if another thread already created a duplicate node
1189 */
1190 rw_enter(&ddv->dv_contents, RW_WRITER);
1191 if ((dup = dv_findbyname(ddv, nm)) == NULL) {
1192 dv_insert(ddv, dv);
1193 } else {
1194 /*
1195 * Duplicate found, use the existing node
1196 */
1197 VN_RELE(DVTOV(dv));
1198 dv_destroy(dv, 0);
1199 dv = dup;
1200 }
1201 goto founddv;
1202 /*NOTREACHED*/
1203
1204 found:
1205 /*
1206 * Fail lookup of device that has now become hidden (typically via
1207 * hot removal of open device).
1208 */
1209 if (dv->dv_devi && ndi_dev_is_hidden_node(dv->dv_devi)) {
1210 dcmn_err2(("dv_find: nm %s failed: hidden/removed\n", nm));
1211 VN_RELE(vp);
1212 rv = ENOENT;
1213 goto notfound;
1214 }
1215
1216 /*
1217 * Skip non-kernel lookups of internal nodes.
1218 * This use of kcred to distinguish between user and
1219 * internal kernel lookups is unfortunate. The information
1220 * provided by the seg argument to lookupnameat should
1221 * evolve into a lookup flag for filesystems that need
1222 * this distinction.
1223 */
1224 if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred)) {
1225 dcmn_err2(("dv_find: nm %s failed: internal\n", nm));
1226 VN_RELE(vp);
1227 rv = ENOENT;
1228 goto notfound;
1229 }
1230
1231 dcmn_err2(("dv_find: returning vp for nm %s\n", nm));
1232 if (vp->v_type == VCHR || vp->v_type == VBLK) {
1233 /*
1234 * If vnode is a device, return special vnode instead
1235 * (though it knows all about -us- via sp->s_realvp,
1236 * sp->s_devvp, and sp->s_dip)
1237 */
1238 *vpp = specvp_devfs(vp, vp->v_rdev, vp->v_type, cred,
1239 dv->dv_devi);
1240 VN_RELE(vp);
1241 if (*vpp == NULLVP)
1242 rv = ENOSYS;
1243 } else
1244 *vpp = vp;
1245
1246 notfound:
1247 if (was_busy) {
1248 /*
1249 * Non-zero was_busy tells us that we are not in the
1250 * devfs_clean() path which in turn means that we can afford
1251 * to take the contents lock unconditionally.
1252 */
1253 rw_enter(&ddv->dv_contents, RW_WRITER);
1254 ddv->dv_busy--;
1255 rw_exit(&ddv->dv_contents);
1256 }
1257 return (rv);
1258 }
1259
1260 /*
1261 * The given directory node is out-of-date; that is, it has been
1262 * marked as needing to be rebuilt, possibly because some new devinfo
1263 * node has come into existence, or possibly because this is the first
1264 * time we've been here.
1265 */
void
dv_filldir(struct dv_node *ddv)
{
	struct dv_node *dv;
	dev_info_t *devi, *pdevi;
	struct ddi_minor_data *dmd;
	char devnm[MAXNAMELEN];
	int circ, ccirc;

	/* caller must hold the directory's contents lock as writer */
	ASSERT(DVTOV(ddv)->v_type == VDIR);
	ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
	ASSERT(ddv->dv_flags & DV_BUILD);

	dcmn_err3(("dv_filldir: %s\n", ddv->dv_name));
	if (DV_STALE(ddv))
		return;		/* backing devinfo node is gone; nothing to do */
	pdevi = ddv->dv_devi;

	/*
	 * Attach children of the backing devinfo node. On failure we
	 * still walk whatever children did manage to attach.
	 */
	if (ndi_devi_config(pdevi, NDI_NO_EVENT) != NDI_SUCCESS) {
		dcmn_err3(("dv_filldir: config error %s\n", ddv->dv_name));
	}

	ndi_devi_enter(pdevi, &circ);
	for (devi = ddi_get_child(pdevi); devi;
	    devi = ddi_get_next_sibling(devi)) {
		/*
		 * While we know enough to create a directory at DS_INITIALIZED,
		 * the directory will be empty until DS_ATTACHED. The existence
		 * of an empty directory dv_node will cause a devi_ref, which
		 * has caused problems for existing code paths doing offline/DR
		 * type operations - making devfs_clean coordination even more
		 * sensitive and error prone. Given this, the 'continue' below
		 * is checking for DS_ATTACHED instead of DS_INITIALIZED.
		 */
		if (i_ddi_node_state(devi) < DS_ATTACHED)
			continue;

		/* skip hidden nodes */
		if (ndi_dev_is_hidden_node(devi))
			continue;

		dcmn_err3(("dv_filldir: node %s\n", ddi_node_name(devi)));

		/* walk this child's minor list under the child's own lock */
		ndi_devi_enter(devi, &ccirc);
		for (dmd = DEVI(devi)->devi_minor; dmd; dmd = dmd->next) {
			char *addr;

			/*
			 * Skip alias nodes, internal nodes, and nodes
			 * without a name. We allow DDM_DEFAULT nodes
			 * to appear in readdir.
			 */
			if ((dmd->type == DDM_ALIAS) ||
			    (dmd->type == DDM_INTERNAL_PATH) ||
			    (dmd->ddm_name == NULL))
				continue;

			/* entry name is "node@addr:minor" or "node:minor" */
			addr = ddi_get_name_addr(devi);
			if (addr && *addr)
				(void) sprintf(devnm, "%s@%s:%s",
				    ddi_node_name(devi), addr, dmd->ddm_name);
			else
				(void) sprintf(devnm, "%s:%s",
				    ddi_node_name(devi), dmd->ddm_name);

			if ((dv = dv_findbyname(ddv, devnm)) != NULL) {
				/* dv_node already exists; drop lookup hold */
				VN_RELE(DVTOV(dv));
				continue;
			}

			dv = dv_mknod(ddv, devi, devnm, dmd);
			dv_insert(ddv, dv);
			VN_RELE(DVTOV(dv));
		}
		ndi_devi_exit(devi, ccirc);

		/*
		 * Create the directory entry for the child itself;
		 * ddi_deviname() yields "/name@addr", so skip the '/'.
		 */
		(void) ddi_deviname(devi, devnm);
		if ((dv = dv_findbyname(ddv, devnm + 1)) == NULL) {
			/* directory doesn't exist */
			dv = dv_mkdir(ddv, devi, devnm + 1);
			dv_insert(ddv, dv);
		}
		VN_RELE(DVTOV(dv));
	}
	ndi_devi_exit(pdevi, circ);

	/* the directory now reflects the devinfo tree */
	ddv->dv_flags &= ~DV_BUILD;
}
1355
1356 /*
1357 * Given a directory node, clean out all the nodes beneath.
1358 *
1359 * VDIR: Reinvoke to clean them, then delete the directory.
1360 * VCHR, VBLK: Just blow them away.
1361 *
1362 * Mark the directories touched as in need of a rebuild, in case
1363 * we fall over part way through. When DV_CLEAN_FORCE is specified,
1364 * we mark referenced empty directories as stale to facilitate DR.
1365 */
int
dv_cleandir(struct dv_node *ddv, char *devnm, uint_t flags)
{
	struct dv_node *dv;
	struct dv_node *next;
	struct vnode *vp;
	int busy = 0;

	/*
	 * We should always be holding the tsd_clean_key here: dv_cleandir()
	 * will be called as a result of a devfs_clean request and the
	 * tsd_clean_key will be set in either in devfs_clean() itself or in
	 * devfs_clean_vhci().
	 *
	 * Since we are on the devfs_clean path, we return EBUSY if we cannot
	 * get the contents lock: if we blocked here we might deadlock against
	 * a thread performing top-down device configuration.
	 */
	ASSERT(tsd_get(devfs_clean_key));

	dcmn_err3(("dv_cleandir: %s\n", ddv->dv_name));

	if (!(flags & DV_CLEANDIR_LCK) &&
	    !rw_tryenter(&ddv->dv_contents, RW_WRITER))
		return (EBUSY);

	/* capture "next" before this entry may be unlinked below */
	for (dv = DV_FIRST_ENTRY(ddv); dv; dv = next) {
		next = DV_NEXT_ENTRY(ddv, dv);

		/*
		 * If devnm is specified, the non-minor portion of the
		 * name must match devnm.
		 */
		if (devnm &&
		    (strncmp(devnm, dv->dv_name, strlen(devnm)) ||
		    (dv->dv_name[strlen(devnm)] != ':' &&
		    dv->dv_name[strlen(devnm)] != '\0')))
			continue;

		/* check type of what we are cleaning */
		vp = DVTOV(dv);
		if (vp->v_type == VDIR) {
			/* recurse on directories */
			rw_enter(&dv->dv_contents, RW_WRITER);
			if (dv_cleandir(dv, NULL,
			    flags | DV_CLEANDIR_LCK) == EBUSY) {
				rw_exit(&dv->dv_contents);
				goto set_busy;
			}

			/* A clean directory is an empty directory... */
			ASSERT(dv->dv_nlink == 2);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 0) {
				/*
				 * ... but an empty directory can still have
				 * references to it. If we have dv_busy or
				 * DV_CLEAN_FORCE is *not* specified then a
				 * referenced directory is considered busy.
				 */
				if (dv->dv_busy || !(flags & DV_CLEAN_FORCE)) {
					mutex_exit(&vp->v_lock);
					rw_exit(&dv->dv_contents);
					goto set_busy;
				}

				/*
				 * Mark referenced directory stale so that DR
				 * will succeed even if a shell has
				 * /devices/xxx as current directory (causing
				 * VN_HOLD reference to an empty directory).
				 */
				ASSERT(!DV_STALE(dv));
				ndi_rele_devi(dv->dv_devi);
				dv->dv_devi = NULL;	/* mark DV_STALE */
			}
		} else {
			ASSERT((vp->v_type == VCHR) || (vp->v_type == VBLK));
			ASSERT(dv->dv_nlink == 1);	/* no hard links */
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 0) {
				/*
				 * The file still has references to it. If
				 * DEVI_GONE is *not* set on the devi referenced
				 * file is considered busy.
				 */
				if (!DEVI_IS_GONE(dv->dv_devi)) {
					mutex_exit(&vp->v_lock);
					goto set_busy;
				}

				/*
				 * Mark referenced file stale so that DR will
				 * succeed even if there are userland opens.
				 */
				ASSERT(!DV_STALE(dv));
				ndi_rele_devi(dv->dv_devi);
				dv->dv_devi = NULL;
			}
		}

		/* unlink from directory (v_lock still held here) */
		dv_unlink(ddv, dv);

		/* drop locks */
		mutex_exit(&vp->v_lock);
		if (vp->v_type == VDIR)
			rw_exit(&dv->dv_contents);

		/*
		 * destroy vnode if ref count is zero
		 * NOTE(review): v_count is re-read after dropping v_lock;
		 * presumably safe because the node was just unlinked and so
		 * can gain no new holds via lookup -- confirm.
		 */
		if (vp->v_count == 0)
			dv_destroy(dv, flags);

		continue;

		/*
		 * If devnm is not NULL we return immediately on busy,
		 * otherwise we continue destroying unused dv_node's.
		 */
set_busy:	busy++;
		if (devnm)
			break;
	}

	/*
	 * This code may be invoked to inform devfs that a new node has
	 * been created in the kernel device tree. So we always set
	 * the DV_BUILD flag to allow the next dv_filldir() to pick
	 * the new devinfo nodes.
	 */
	ddv->dv_flags |= DV_BUILD;

	if (!(flags & DV_CLEANDIR_LCK))
		rw_exit(&ddv->dv_contents);

	return (busy ? EBUSY : 0);
}
1503
1504 /*
1505 * Walk through the devfs hierarchy, correcting the permissions of
1506 * devices with default permissions that do not match those specified
1507 * by minor perm. This can only be done for all drivers for now.
1508 */
static int
dv_reset_perm_dir(struct dv_node *ddv, uint_t flags)
{
	struct dv_node *dv;
	struct vnode *vp;
	int retval = 0;
	struct vattr *attrp;
	mperm_t mp;
	char *nm;
	uid_t old_uid;
	gid_t old_gid;
	mode_t old_mode;

	rw_enter(&ddv->dv_contents, RW_WRITER);
	for (dv = DV_FIRST_ENTRY(ddv); dv; dv = DV_NEXT_ENTRY(ddv, dv)) {
		int error = 0;	/* per-entry status; folded into retval */
		nm = dv->dv_name;

		rw_enter(&dv->dv_contents, RW_READER);
		vp = DVTOV(dv);
		if (vp->v_type == VDIR) {
			/* recurse into subdirectories without our lock held */
			rw_exit(&dv->dv_contents);
			if (dv_reset_perm_dir(dv, flags) != 0) {
				error = EBUSY;
			}
		} else {
			ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);

			/*
			 * Check for permissions from minor_perm
			 * If there are none, we're done
			 */
			rw_exit(&dv->dv_contents);
			if (dev_minorperm(dv->dv_devi, nm, &mp) != 0)
				continue;

			rw_enter(&dv->dv_contents, RW_READER);

			/*
			 * Allow a node's permissions to be altered
			 * permanently from the defaults by chmod,
			 * using the shadow node as backing store.
			 * Otherwise, update node to minor_perm permissions.
			 */
			if (dv->dv_attrvp == NULLVP) {
				/*
				 * No attribute vp, try to find one.
				 */
				dv_shadow_node(DVTOV(ddv), nm, vp,
				    NULL, NULLVP, kcred, 0);
			}
			/*
			 * A shadow node now backs the attributes (chmod'ed
			 * permissions persist there), or there are no cached
			 * attributes at all -- either way, nothing to reset.
			 */
			if (dv->dv_attrvp != NULLVP || dv->dv_attr == NULL) {
				rw_exit(&dv->dv_contents);
				continue;
			}

			attrp = dv->dv_attr;

			if (VATTRP_MP_CMP(attrp, mp) == 0) {
				/* cached attributes already match minor_perm */
				dcmn_err5(("%s: no perm change: "
				    "%d %d 0%o\n", nm, attrp->va_uid,
				    attrp->va_gid, attrp->va_mode));
				rw_exit(&dv->dv_contents);
				continue;
			}

			/* remember old values for the debug message below */
			old_uid = attrp->va_uid;
			old_gid = attrp->va_gid;
			old_mode = attrp->va_mode;

			/* apply minor_perm uid/gid/mode to cached attributes */
			VATTRP_MP_MERGE(attrp, mp);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 0) {
				/*
				 * Node is open. NOTE(review): the merge above
				 * already updated the cached attributes even
				 * in this busy case; EBUSY only informs the
				 * caller -- confirm this is intended.
				 */
				error = EBUSY;
			}
			mutex_exit(&vp->v_lock);

			dcmn_err5(("%s: perm %d/%d/0%o -> %d/%d/0%o (%d)\n",
			    nm, old_uid, old_gid, old_mode, attrp->va_uid,
			    attrp->va_gid, attrp->va_mode, error));

			rw_exit(&dv->dv_contents);
		}

		if (error != 0) {
			retval = error;
		}
	}

	/* force the next dv_filldir() to refresh this directory */
	ddv->dv_flags |= DV_BUILD;

	rw_exit(&ddv->dv_contents);

	return (retval);
}
1604
1605 int
1606 devfs_reset_perm(uint_t flags)
1607 {
1608 struct dv_node *dvp;
1609 int rval;
1610
1611 if ((dvp = devfs_dip_to_dvnode(ddi_root_node())) == NULL)
1612 return (0);
1613
1614 VN_HOLD(DVTOV(dvp));
1615 rval = dv_reset_perm_dir(dvp, flags);
1616 VN_RELE(DVTOV(dvp));
1617 return (rval);
1618 }
1619
1620 /*
1621 * Clean up dangling devfs shadow nodes for removed
1622 * drivers so that, in the event the driver is re-added
1623 * to the system, newly created nodes won't incorrectly
1624 * pick up these stale shadow node permissions.
1625 *
1626 * This is accomplished by walking down the pathname
1627 * to the directory, starting at the root's attribute
1628 * node, then removing all minors matching the specified
1629 * node name. Care must be taken to remove all entries
1630 * in a directory before the directory itself, so that
1631 * the clean-up associated with rem_drv'ing a nexus driver
1632 * does not inadvertently result in an inconsistent
1633 * filesystem underlying devfs.
1634 */
1635
/*
 * Recursively remove every entry beneath shadow (attribute fs)
 * directory dirvp, named "dir" for diagnostics only; rvp is the root
 * vnode of the underlying attribute filesystem. The caller removes
 * dirvp itself after this returns 0.
 */
static int
devfs_remdrv_rmdir(vnode_t *dirvp, const char *dir, vnode_t *rvp)
{
	int error;
	vnode_t *vp;
	int eof;
	struct iovec iov;
	struct uio uio;
	struct dirent64 *dp;
	dirent64_t *dbuf;
	size_t dlen;
	size_t dbuflen;
	int ndirents = 64;	/* entries fetched per VOP_READDIR batch */
	char *nm;

	VN_HOLD(dirvp);

	dlen = ndirents * (sizeof (*dbuf));
	dbuf = kmem_alloc(dlen, KM_SLEEP);

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_fmode = 0;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = 0;
	uio.uio_llimit = MAXOFFSET_T;

	eof = 0;
	error = 0;
	/* read the directory in batches until EOF or error */
	while (!error && !eof) {
		uio.uio_resid = dlen;
		iov.iov_base = (char *)dbuf;
		iov.iov_len = dlen;

		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
		error = VOP_READDIR(dirvp, &uio, kcred, &eof, NULL, 0);
		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);

		dbuflen = dlen - uio.uio_resid;

		if (error || dbuflen == 0)
			break;

		/* walk the variable-length dirent records in this batch */
		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {

			nm = dp->d_name;

			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
				continue;

			error = VOP_LOOKUP(dirvp, nm,
			    &vp, NULL, 0, NULL, kcred, NULL, NULL, NULL);

			dsysdebug(error,
			    ("rem_drv %s/%s lookup (%d)\n",
			    dir, nm, error));

			if (error)
				continue;

			ASSERT(vp->v_type == VDIR ||
			    vp->v_type == VCHR || vp->v_type == VBLK);

			if (vp->v_type == VDIR) {
				/* empty the subdirectory, then remove it */
				error = devfs_remdrv_rmdir(vp, nm, rvp);
				if (error == 0) {
					error = VOP_RMDIR(dirvp,
					    (char *)nm, rvp, kcred, NULL, 0);
					dsysdebug(error,
					    ("rem_drv %s/%s rmdir (%d)\n",
					    dir, nm, error));
				}
			} else {
				error = VOP_REMOVE(dirvp, (char *)nm, kcred,
				    NULL, 0);
				dsysdebug(error,
				    ("rem_drv %s/%s remove (%d)\n",
				    dir, nm, error));
			}

			VN_RELE(vp);
			if (error) {
				goto exit;
			}
		}
	}

exit:
	VN_RELE(dirvp);
	kmem_free(dbuf, dlen);

	return (error);
}
1731
/*
 * Remove stale shadow (attribute fs) nodes for driver "nodename"
 * beneath directory "dir". Best-effort: individual failures are
 * reported via dsysdebug() and the function always returns 0.
 */
int
devfs_remdrv_cleanup(const char *dir, const char *nodename)
{
	int error;
	vnode_t *vp;
	vnode_t *dirvp;
	int eof;
	struct iovec iov;
	struct uio uio;
	struct dirent64 *dp;
	dirent64_t *dbuf;
	size_t dlen;
	size_t dbuflen;
	int ndirents = 64;	/* entries fetched per VOP_READDIR batch */
	int nodenamelen = strlen(nodename);
	char *nm;
	struct pathname pn;
	vnode_t *rvp;	/* root node of the underlying attribute fs */

	dcmn_err5(("devfs_remdrv_cleanup: %s %s\n", dir, nodename));

	/* a bad path means there is nothing to clean up */
	if (error = pn_get((char *)dir, UIO_SYSSPACE, &pn))
		return (0);

	rvp = dvroot->dv_attrvp;
	ASSERT(rvp != NULL);
	VN_HOLD(rvp);

	pn_skipslash(&pn);
	dirvp = rvp;
	VN_HOLD(dirvp);

	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);

	/* descend the attribute fs to "dir", one component at a time */
	while (pn_pathleft(&pn)) {
		ASSERT(dirvp->v_type == VDIR);
		(void) pn_getcomponent(&pn, nm);
		ASSERT((strcmp(nm, ".") != 0) && (strcmp(nm, "..") != 0));
		error = VOP_LOOKUP(dirvp, nm, &vp, NULL, 0, rvp, kcred,
		    NULL, NULL, NULL);
		if (error) {
			/* component missing: nothing to clean up */
			dcmn_err5(("remdrv_cleanup %s lookup error %d\n",
			    nm, error));
			VN_RELE(dirvp);
			if (dirvp != rvp)
				VN_RELE(rvp);
			pn_free(&pn);
			kmem_free(nm, MAXNAMELEN);
			return (0);
		}
		VN_RELE(dirvp);
		dirvp = vp;
		pn_skipslash(&pn);
	}

	ASSERT(dirvp->v_type == VDIR);
	if (dirvp != rvp)
		VN_RELE(rvp);
	pn_free(&pn);
	kmem_free(nm, MAXNAMELEN);

	dlen = ndirents * (sizeof (*dbuf));
	dbuf = kmem_alloc(dlen, KM_SLEEP);

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_fmode = 0;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = 0;
	uio.uio_llimit = MAXOFFSET_T;

	eof = 0;
	error = 0;
	/* read the target directory in batches until EOF or error */
	while (!error && !eof) {
		uio.uio_resid = dlen;
		iov.iov_base = (char *)dbuf;
		iov.iov_len = dlen;

		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
		error = VOP_READDIR(dirvp, &uio, kcred, &eof, NULL, 0);
		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);

		dbuflen = dlen - uio.uio_resid;

		if (error || dbuflen == 0)
			break;

		/* walk the variable-length dirent records in this batch */
		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {

			nm = dp->d_name;

			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
				continue;

			/* only entries belonging to the removed driver */
			if (strncmp(nm, nodename, nodenamelen) != 0)
				continue;

			error = VOP_LOOKUP(dirvp, nm, &vp,
			    NULL, 0, NULL, kcred, NULL, NULL, NULL);

			dsysdebug(error,
			    ("rem_drv %s/%s lookup (%d)\n",
			    dir, nm, error));

			if (error)
				continue;

			ASSERT(vp->v_type == VDIR ||
			    vp->v_type == VCHR || vp->v_type == VBLK);

			if (vp->v_type == VDIR) {
				/* empty the subdirectory, then remove it */
				error = devfs_remdrv_rmdir(vp, nm, rvp);
				if (error == 0) {
					error = VOP_RMDIR(dirvp, (char *)nm,
					    rvp, kcred, NULL, 0);
					dsysdebug(error,
					    ("rem_drv %s/%s rmdir (%d)\n",
					    dir, nm, error));
				}
			} else {
				error = VOP_REMOVE(dirvp, (char *)nm, kcred,
				    NULL, 0);
				dsysdebug(error,
				    ("rem_drv %s/%s remove (%d)\n",
				    dir, nm, error));
			}

			VN_RELE(vp);
			if (error)
				goto exit;
		}
	}

exit:
	VN_RELE(dirvp);

	kmem_free(dbuf, dlen);

	/* best-effort cleanup: errors were logged, always report success */
	return (0);
}
1874
/*
 * List element used by dv_walk() to record subdirectories seen while
 * scanning a directory, so they can be walked recursively afterwards.
 */
struct dv_list {
	struct dv_node *dv;	/* subdirectory dv_node to walk */
	struct dv_list *next;	/* next deferred subdirectory */
};
1879
/*
 * Invoke "callback" on each child of directory ddv whose non-minor
 * name matches devnm (or on every child when devnm is NULL or empty),
 * then recursively walk each matching subdirectory.
 */
void
dv_walk(
	struct dv_node *ddv,
	char *devnm,
	void (*callback)(struct dv_node *, void *),
	void *arg)
{
	struct vnode *dvp;
	struct dv_node *dv;
	struct dv_list *head, *tail, *next;
	int len;

	dcmn_err3(("dv_walk: ddv = %s, devnm = %s\n",
	    ddv->dv_name, devnm ? devnm : "<null>"));

	dvp = DVTOV(ddv);

	ASSERT(dvp->v_type == VDIR);

	head = tail = next = NULL;

	rw_enter(&ddv->dv_contents, RW_READER);
	mutex_enter(&dvp->v_lock);
	for (dv = DV_FIRST_ENTRY(ddv); dv; dv = DV_NEXT_ENTRY(ddv, dv)) {
		/*
		 * If devnm is not NULL and is not the empty string,
		 * select only dv_nodes with matching non-minor name
		 */
		if (devnm && (len = strlen(devnm)) &&
		    (strncmp(devnm, dv->dv_name, len) ||
		    (dv->dv_name[len] != ':' && dv->dv_name[len] != '\0')))
			continue;

		callback(dv, arg);

		if (DVTOV(dv)->v_type != VDIR)
			continue;

		/* queue this subdirectory for the recursive pass below */
		next = kmem_zalloc(sizeof (*next), KM_SLEEP);
		next->dv = dv;

		if (tail)
			tail->next = next;
		else
			head = next;

		tail = next;
	}

	/*
	 * Walk the queued subdirectories.
	 * NOTE(review): ddv->dv_contents and dvp->v_lock remain held
	 * across these recursive walks and the callback invocations
	 * above -- confirm callbacks never block on either lock.
	 */
	while (head) {
		dv_walk(head->dv, NULL, callback, arg);
		next = head->next;
		kmem_free(head, sizeof (*head));
		head = next;
	}
	/*
	 * NOTE(review): locks are dropped in acquisition order (rwlock
	 * then mutex) rather than the usual LIFO order; release order
	 * does not affect correctness here.
	 */
	rw_exit(&ddv->dv_contents);
	mutex_exit(&dvp->v_lock);
}