1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2013, 2016 Joyent, Inc.  All rights reserved.
  25  * Copyright (c) 2014 by Delphix. All rights reserved.
  26  */
  27 
  28 /* vnode ops for the /dev/zvol directory */
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/ddi.h>
  34 #include <sys/sunndi.h>
  35 #include <sys/sunldi.h>
  36 #include <fs/fs_subr.h>
  37 #include <sys/fs/dv_node.h>
  38 #include <sys/fs/sdev_impl.h>
  39 #include <sys/zfs_ioctl.h>
  40 #include <sys/policy.h>
  41 #include <sys/stat.h>
  42 #include <sys/vfs_opreg.h>
  43 
  44 struct vnodeops *devzvol_vnodeops;
  45 static major_t devzvol_major;
  46 static taskq_ent_t devzvol_zclist_task;
  47 
  48 static kmutex_t devzvol_mtx;
  49 /* Below are protected by devzvol_mtx */
  50 static boolean_t devzvol_isopen;
  51 static boolean_t devzvol_zclist_task_running = B_FALSE;
  52 static uint64_t devzvol_gen = 0;
  53 static uint64_t devzvol_zclist;
  54 static size_t devzvol_zclist_size;
  55 static ldi_ident_t devzvol_li;
  56 static ldi_handle_t devzvol_lh;
  57 
/*
 * We must reach zfs through the ddi_mod*() interfaces at run time: fs/dev
 * is loaded early in startup(), and linking fs/dev against fs/zfs would
 * drag in a lot of other modules (such as drv/random) before the rest of
 * the system is ready for them.
 */
  64 ddi_modhandle_t zfs_mod;
  65 int (*szcm)(char *);
  66 int (*szn2m)(char *, minor_t *);
  67 
  68 
/*
 * Controls whether snapshot nodes may be created under /dev/zvol.  They
 * are allowed by default, which preserves the historic behavior.
 */
  73 boolean_t devzvol_snaps_allowed = B_TRUE;
  74 
  75 int
  76 sdev_zvol_create_minor(char *dsname)
  77 {
  78         if (szcm == NULL)
  79                 return (-1);
  80         return ((*szcm)(dsname));
  81 }
  82 
  83 int
  84 sdev_zvol_name2minor(char *dsname, minor_t *minor)
  85 {
  86         if (szn2m == NULL)
  87                 return (-1);
  88         return ((*szn2m)(dsname, minor));
  89 }
  90 
  91 int
  92 devzvol_open_zfs()
  93 {
  94         int rc;
  95         dev_t dv;
  96 
  97         devzvol_li = ldi_ident_from_anon();
  98         if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
  99             &devzvol_lh, devzvol_li))
 100                 return (-1);
 101         if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
 102             KRTLD_MODE_FIRST, &rc)) == NULL)) {
 103                 return (rc);
 104         }
 105         ASSERT(szcm == NULL && szn2m == NULL);
 106         if ((szcm = (int (*)(char *))
 107             ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
 108                 cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
 109                 return (rc);
 110         }
 111         if ((szn2m = (int(*)(char *, minor_t *))
 112             ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
 113                 cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
 114                 return (rc);
 115         }
 116         if (ldi_get_dev(devzvol_lh, &dv))
 117                 return (-1);
 118         devzvol_major = getmajor(dv);
 119         return (0);
 120 }
 121 
 122 void
 123 devzvol_close_zfs()
 124 {
 125         szcm = NULL;
 126         szn2m = NULL;
 127         (void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
 128         ldi_ident_release(devzvol_li);
 129         if (zfs_mod != NULL) {
 130                 (void) ddi_modclose(zfs_mod);
 131                 zfs_mod = NULL;
 132         }
 133 }
 134 
 135 int
 136 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
 137 {
 138         uint64_t cookie;
 139         int size = 8000;
 140         int unused;
 141         int rc;
 142 
 143         if (cmd != ZFS_IOC_POOL_CONFIGS)
 144                 mutex_enter(&devzvol_mtx);
 145         if (!devzvol_isopen) {
 146                 if ((rc = devzvol_open_zfs()) == 0) {
 147                         devzvol_isopen = B_TRUE;
 148                 } else {
 149                         if (cmd != ZFS_IOC_POOL_CONFIGS)
 150                                 mutex_exit(&devzvol_mtx);
 151                         return (ENXIO);
 152                 }
 153         }
 154         cookie = zc->zc_cookie;
 155 again:
 156         zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
 157             KM_SLEEP);
 158         zc->zc_nvlist_dst_size = size;
 159         rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
 160             &unused);
 161         if (rc == ENOMEM) {
 162                 int newsize;
 163                 newsize = zc->zc_nvlist_dst_size;
 164                 ASSERT(newsize > size);
 165                 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
 166                 size = newsize;
 167                 zc->zc_cookie = cookie;
 168                 goto again;
 169         }
 170         if (alloc_size == NULL)
 171                 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
 172         else
 173                 *alloc_size = size;
 174         if (cmd != ZFS_IOC_POOL_CONFIGS)
 175                 mutex_exit(&devzvol_mtx);
 176         return (rc);
 177 }
 178 
 179 /* figures out if the objset exists and returns its type */
 180 int
 181 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
 182 {
 183         boolean_t       ispool, is_snapshot;
 184         zfs_cmd_t       *zc;
 185         int rc;
 186         nvlist_t        *nvl;
 187         size_t nvsz;
 188 
 189         ispool = (strchr(dsname, '/') == NULL);
 190         is_snapshot = (strchr(dsname, '@') != NULL);
 191 
 192         if (is_snapshot && !devzvol_snaps_allowed)
 193                 return (ENOTSUP);
 194 
 195         zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 196         (void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
 197 
 198         nvl = fnvlist_alloc();
 199         fnvlist_add_boolean_value(nvl, "cachedpropsonly", B_TRUE);
 200         zc->zc_nvlist_src = (uintptr_t)fnvlist_pack(nvl, &nvsz);
 201         zc->zc_nvlist_src_size = nvsz;
 202         fnvlist_free(nvl);
 203 
 204         rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
 205             ZFS_IOC_OBJSET_STATS, zc, NULL);
 206         if (type && rc == 0)
 207                 *type = (ispool) ? DMU_OST_ZFS :
 208                     zc->zc_objset_stats.dds_type;
 209         fnvlist_pack_free((char *)(uintptr_t)zc->zc_nvlist_src, nvsz);
 210         kmem_free(zc, sizeof (zfs_cmd_t));
 211         return (rc);
 212 }
 213 
 214 /*
 215  * Returns what the zfs dataset name should be, given the /dev/zvol
 216  * path and an optional name (can be NULL).
 217  *
 218  * Note that if the name param is NULL, then path must be an
 219  * actual dataset's directory and not one of the top-level
 220  * /dev/zvol/{dsk,rdsk} dirs, as these do not correspond to a
 221  * specific dataset.
 222  */
 223 char *
 224 devzvol_make_dsname(const char *path, const char *name)
 225 {
 226         char *dsname;
 227         const char *ptr;
 228         int dslen;
 229 
 230         if (strcmp(path, ZVOL_DIR) == 0)
 231                 return (NULL);
 232         if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
 233                 return (NULL);
 234         ptr = path + strlen(ZVOL_DIR);
 235         if (strncmp(ptr, "/dsk", 4) == 0)
 236                 ptr += strlen("/dsk");
 237         else if (strncmp(ptr, "/rdsk", 5) == 0)
 238                 ptr += strlen("/rdsk");
 239         else
 240                 return (NULL);
 241 
 242         if (*ptr == '/')
 243                 ptr++;
 244         else if (name == NULL)
 245                 return (NULL);
 246 
 247         dslen = strlen(ptr);
 248         if (dslen)
 249                 dslen++;                        /* plus null */
 250         if (name)
 251                 dslen += strlen(name) + 1;      /* plus slash */
 252         dsname = kmem_zalloc(dslen, KM_SLEEP);
 253         if (*ptr) {
 254                 (void) strlcpy(dsname, ptr, dslen);
 255                 if (name)
 256                         (void) strlcat(dsname, "/", dslen);
 257         }
 258         if (name)
 259                 (void) strlcat(dsname, name, dslen);
 260         return (dsname);
 261 }
 262 
 263 /*
 264  * check if the zvol's sdev_node is still valid, which means make
 265  * sure the zvol is still valid.  zvol minors aren't proactively
 266  * destroyed when the zvol is destroyed, so we use a validator to clean
 267  * these up (in other words, when such nodes are encountered during
 268  * subsequent lookup() and readdir() operations) so that only valid
 269  * nodes are returned.  The ordering between devname_lookup_func and
 270  * devzvol_validate is a little inefficient in the case of invalid
 271  * or stale nodes because devname_lookup_func calls
 272  * devzvol_create_{dir, link}, then the validator says it's invalid,
 273  * and then the node gets cleaned up.
 274  */
 275 int
 276 devzvol_validate(struct sdev_node *dv)
 277 {
 278         vnode_t *vn = SDEVTOV(dv);
 279         dmu_objset_type_t do_type;
 280         char *dsname;
 281         char *nm = dv->sdev_name;
 282         int rc;
 283 
 284         sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
 285         /*
 286          * validate only READY nodes; if someone is sitting on the
 287          * directory of a dataset that just got destroyed we could
 288          * get a zombie node which we just skip.
 289          */
 290         if (dv->sdev_state != SDEV_READY) {
 291                 sdcmn_err13(("skipping '%s'", nm));
 292                 return (SDEV_VTOR_SKIP);
 293         }
 294 
 295         if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
 296             (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
 297                 return (SDEV_VTOR_VALID);
 298         dsname = devzvol_make_dsname(dv->sdev_path, NULL);
 299         if (dsname == NULL)
 300                 return (SDEV_VTOR_INVALID);
 301 
 302         /*
 303          * Leave any nodes alone that have been explicitly created by
 304          * sdev profiles.
 305          */
 306         if (!(dv->sdev_flags & SDEV_GLOBAL) && dv->sdev_origin != NULL) {
 307                 kmem_free(dsname, strlen(dsname) + 1);
 308                 return (SDEV_VTOR_VALID);
 309         }
 310 
 311         rc = devzvol_objset_check(dsname, &do_type);
 312         sdcmn_err13(("  '%s' rc %d", dsname, rc));
 313         if (rc != 0) {
 314                 sdev_node_t *parent = dv->sdev_dotdot;
 315                 /*
 316                  * Explicitly passed-through zvols in our sdev profile can't
 317                  * be created as prof_* shadow nodes, because in the GZ they
 318                  * are symlinks, but in the NGZ they are actual device files.
 319                  *
 320                  * The objset_check will fail on these as they are outside
 321                  * any delegated dataset (zfs will not allow ioctl access to
 322                  * them from this zone). We still want them to work, though.
 323                  */
 324                 if (!(parent->sdev_flags & SDEV_GLOBAL) &&
 325                     parent->sdev_origin != NULL &&
 326                     !(dv->sdev_flags & SDEV_GLOBAL) &&
 327                     (vn->v_type == VBLK || vn->v_type == VCHR) &&
 328                     prof_name_matched(nm, parent)) {
 329                         do_type = DMU_OST_ZVOL;
 330                 } else {
 331                         kmem_free(dsname, strlen(dsname) + 1);
 332                         return (SDEV_VTOR_INVALID);
 333                 }
 334         }
 335 
 336         sdcmn_err13(("  v_type %d do_type %d",
 337             vn->v_type, do_type));
 338         if ((vn->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
 339             ((vn->v_type == VBLK || vn->v_type == VCHR) &&
 340             do_type != DMU_OST_ZVOL) ||
 341             (vn->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
 342                 kmem_free(dsname, strlen(dsname) + 1);
 343                 return (SDEV_VTOR_STALE);
 344         }
 345         if (vn->v_type == VLNK) {
 346                 char *ptr, *link;
 347                 long val = 0;
 348                 minor_t lminor, ominor;
 349 
 350                 rc = sdev_getlink(vn, &link);
 351                 ASSERT(rc == 0);
 352 
 353                 ptr = strrchr(link, ':') + 1;
 354                 rc = ddi_strtol(ptr, NULL, 10, &val);
 355                 kmem_free(link, strlen(link) + 1);
 356                 ASSERT(rc == 0 && val != 0);
 357                 lminor = (minor_t)val;
 358                 if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
 359                     ominor != lminor) {
 360                         kmem_free(dsname, strlen(dsname) + 1);
 361                         return (SDEV_VTOR_STALE);
 362                 }
 363         }
 364         kmem_free(dsname, strlen(dsname) + 1);
 365         return (SDEV_VTOR_VALID);
 366 }
 367 
 368 /*
 369  * Taskq callback to update the devzvol_zclist.
 370  *
 371  * We need to defer this to the taskq to avoid it running with a user
 372  * context that might be associated with some non-global zone, and thus
 373  * not being able to list all of the pools on the entire system.
 374  */
 375 /*ARGSUSED*/
 376 static void
 377 devzvol_update_zclist_cb(void *arg)
 378 {
 379         zfs_cmd_t       *zc;
 380         int             rc;
 381         size_t          size;
 382 
 383         zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 384         mutex_enter(&devzvol_mtx);
 385         zc->zc_cookie = devzvol_gen;
 386 
 387         rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
 388         switch (rc) {
 389                 case 0:
 390                         /* new generation */
 391                         ASSERT(devzvol_gen != zc->zc_cookie);
 392                         devzvol_gen = zc->zc_cookie;
 393                         if (devzvol_zclist)
 394                                 kmem_free((void *)(uintptr_t)devzvol_zclist,
 395                                     devzvol_zclist_size);
 396                         devzvol_zclist = zc->zc_nvlist_dst;
 397                         /* Keep the alloc'd size, not the nvlist size. */
 398                         devzvol_zclist_size = size;
 399                         break;
 400                 default:
 401                         /*
 402                          * Either there was no change in pool configuration
 403                          * since we last asked (rc == EEXIST) or we got a
 404                          * catastrophic error.
 405                          *
 406                          * Give up memory and exit.
 407                          */
 408                         kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
 409                             size);
 410                         break;
 411         }
 412 
 413         VERIFY(devzvol_zclist_task_running == B_TRUE);
 414         devzvol_zclist_task_running = B_FALSE;
 415         mutex_exit(&devzvol_mtx);
 416 
 417         kmem_free(zc, sizeof (zfs_cmd_t));
 418 }
 419 
 420 static void
 421 devzvol_update_zclist(void)
 422 {
 423         mutex_enter(&devzvol_mtx);
 424         if (devzvol_zclist_task_running == B_TRUE) {
 425                 mutex_exit(&devzvol_mtx);
 426                 goto wait;
 427         }
 428 
 429         devzvol_zclist_task_running = B_TRUE;
 430 
 431         taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0,
 432             &devzvol_zclist_task);
 433 
 434         mutex_exit(&devzvol_mtx);
 435 
 436 wait:
 437         taskq_wait(sdev_taskq);
 438 }
 439 
 440 /*
 441  * Creates sub-directories for each zpool as needed in response to a
 442  * readdir on one of the /dev/zvol/{dsk,rdsk} directories.
 443  */
 444 void
 445 devzvol_create_pool_dirs(struct vnode *dvp)
 446 {
 447         nvlist_t *nv = NULL;
 448         nvpair_t *elem = NULL;
 449         int pools = 0;
 450         int rc;
 451 
 452         sdcmn_err13(("devzvol_create_pool_dirs"));
 453 
 454         devzvol_update_zclist();
 455 
 456         mutex_enter(&devzvol_mtx);
 457 
 458         rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
 459             devzvol_zclist_size, &nv, 0);
 460         if (rc) {
 461                 ASSERT(rc == 0);
 462                 kmem_free((void *)(uintptr_t)devzvol_zclist,
 463                     devzvol_zclist_size);
 464                 devzvol_gen = 0;
 465                 devzvol_zclist = NULL;
 466                 devzvol_zclist_size = 0;
 467                 goto out;
 468         }
 469         mutex_exit(&devzvol_mtx);
 470         while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
 471                 struct vnode *vp;
 472                 ASSERT(dvp->v_count > 0);
 473                 rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
 474                     NULL, kcred, NULL, 0, NULL);
 475                 /* should either work, or not be visible from a zone */
 476                 ASSERT(rc == 0 || rc == ENOENT);
 477                 if (rc == 0)
 478                         VN_RELE(vp);
 479                 pools++;
 480         }
 481         nvlist_free(nv);
 482         mutex_enter(&devzvol_mtx);
 483         if (devzvol_isopen && pools == 0) {
 484                 /* clean up so zfs can be unloaded */
 485                 devzvol_close_zfs();
 486                 devzvol_isopen = B_FALSE;
 487         }
 488 out:
 489         mutex_exit(&devzvol_mtx);
 490 }
 491 
 492 /*ARGSUSED3*/
 493 static int
 494 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
 495     cred_t *cred, void *whatever, char *whichever)
 496 {
 497         timestruc_t now;
 498         struct vattr *vap = (struct vattr *)arg;
 499 
 500         sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
 501             ddv->sdev_path, nm));
 502         ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
 503             strlen(ZVOL_DIR)) == 0);
 504         *vap = *sdev_getdefault_attr(VDIR);
 505         gethrestime(&now);
 506         vap->va_atime = now;
 507         vap->va_mtime = now;
 508         vap->va_ctime = now;
 509         return (0);
 510 }
 511 
 512 /*ARGSUSED3*/
 513 static int
 514 devzvol_create_link(struct sdev_node *ddv, char *nm,
 515     void **arg, cred_t *cred, void *whatever, char *whichever)
 516 {
 517         minor_t minor;
 518         char *pathname = (char *)*arg;
 519         int rc;
 520         char *dsname;
 521         char *x;
 522         char str[MAXNAMELEN];
 523         sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
 524             ddv->sdev_path, nm));
 525         dsname = devzvol_make_dsname(ddv->sdev_path, nm);
 526         rc = sdev_zvol_create_minor(dsname);
 527         if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
 528             sdev_zvol_name2minor(dsname, &minor)) {
 529                 sdcmn_err13(("devzvol_create_link %d", rc));
 530                 kmem_free(dsname, strlen(dsname) + 1);
 531                 return (-1);
 532         }
 533         kmem_free(dsname, strlen(dsname) + 1);
 534 
 535         /*
 536          * This is a valid zvol; create a symlink that points to the
 537          * minor which was created under /devices/pseudo/zfs@0
 538          */
 539         *pathname = '\0';
 540         for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
 541                 (void) strcat(pathname, "../");
 542         (void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
 543         (void) strncat(pathname, str, MAXPATHLEN);
 544         if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
 545             strlen(ZVOL_FULL_RDEV_DIR)) == 0)
 546                 (void) strcat(pathname, ",raw");
 547         return (0);
 548 }
 549 
 550 /* Clean zvol sdev_nodes that are no longer valid.  */
 551 static void
 552 devzvol_prunedir(struct sdev_node *ddv)
 553 {
 554         struct sdev_node *dv;
 555 
 556         ASSERT(RW_READ_HELD(&ddv->sdev_contents));
 557 
 558         sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
 559         ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
 560         if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
 561                 rw_exit(&ddv->sdev_contents);
 562                 rw_enter(&ddv->sdev_contents, RW_WRITER);
 563         }
 564 
 565         dv = SDEV_FIRST_ENTRY(ddv);
 566         while (dv) {
 567                 sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
 568 
 569                 switch (devzvol_validate(dv)) {
 570                 case SDEV_VTOR_VALID:
 571                 case SDEV_VTOR_SKIP:
 572                         dv = SDEV_NEXT_ENTRY(ddv, dv);
 573                         continue;
 574                 case SDEV_VTOR_INVALID:
 575                         sdcmn_err7(("prunedir: destroy invalid "
 576                             "node: %s\n", dv->sdev_name));
 577                         break;
 578                 }
 579 
 580                 if ((SDEVTOV(dv)->v_type == VDIR) &&
 581                     (sdev_cleandir(dv, NULL, 0) != 0)) {
 582                         dv = SDEV_NEXT_ENTRY(ddv, dv);
 583                         continue;
 584                 }
 585                 SDEV_HOLD(dv);
 586                 /* remove the cache node */
 587                 sdev_cache_update(ddv, &dv, dv->sdev_name,
 588                     SDEV_CACHE_DELETE);
 589                 SDEV_RELE(dv);
 590                 dv = SDEV_FIRST_ENTRY(ddv);
 591         }
 592         rw_downgrade(&ddv->sdev_contents);
 593 }
 594 
 595 /*
 596  * This function is used to create a dir or dev inside a zone's /dev when the
 597  * zone has a zvol that is dynamically created within the zone (i.e. inside
 598  * of a delegated dataset.  Since there is no /devices tree within a zone,
 599  * we create the chr/blk devices directly inside the zone's /dev instead of
 600  * making symlinks.
 601  */
 602 static int
 603 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
 604 {
 605         struct vattr vattr;
 606         timestruc_t now;
 607         enum vtype expected_type = VDIR;
 608         dmu_objset_type_t do_type;
 609         struct sdev_node *dv = NULL;
 610         int res;
 611         char *dsname;
 612 
 613         bzero(&vattr, sizeof (vattr));
 614         gethrestime(&now);
 615         vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
 616         vattr.va_uid = SDEV_UID_DEFAULT;
 617         vattr.va_gid = SDEV_GID_DEFAULT;
 618         vattr.va_type = VNON;
 619         vattr.va_atime = now;
 620         vattr.va_mtime = now;
 621         vattr.va_ctime = now;
 622 
 623         if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
 624                 return (ENOENT);
 625 
 626         if (devzvol_objset_check(dsname, &do_type) != 0) {
 627                 /*
 628                  * objset_check will succeed on any valid objset in the global
 629                  * zone, and any valid delegated dataset. It will fail, however,
 630                  * in non-global zones on explicitly whitelisted zvol devices
 631                  * that are outside any delegated dataset.
 632                  *
 633                  * The directories leading up to the zvol device itself will be
 634                  * created by prof for us in advance (and will always validate
 635                  * because of the matching check in devzvol_validate). The zvol
 636                  * device itself can't be created by prof though because in the
 637                  * GZ it's a symlink, and in the NGZ it is not. So, we create
 638                  * such zvol device files here.
 639                  */
 640                 if (!(parent->sdev_flags & SDEV_GLOBAL) &&
 641                     parent->sdev_origin != NULL &&
 642                     prof_name_matched(nm, parent)) {
 643                         do_type = DMU_OST_ZVOL;
 644                 } else {
 645                         kmem_free(dsname, strlen(dsname) + 1);
 646                         return (ENOENT);
 647                 }
 648         }
 649 
 650         if (do_type == DMU_OST_ZVOL)
 651                 expected_type = VBLK;
 652 
 653         if (expected_type == VDIR) {
 654                 vattr.va_type = VDIR;
 655                 vattr.va_mode = SDEV_DIRMODE_DEFAULT;
 656         } else {
 657                 minor_t minor;
 658                 dev_t devnum;
 659                 int rc;
 660 
 661                 rc = sdev_zvol_create_minor(dsname);
 662                 if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
 663                     sdev_zvol_name2minor(dsname, &minor)) {
 664                         kmem_free(dsname, strlen(dsname) + 1);
 665                         return (ENOENT);
 666                 }
 667 
 668                 devnum = makedevice(devzvol_major, minor);
 669                 vattr.va_rdev = devnum;
 670 
 671                 if (strstr(parent->sdev_path, "/rdsk/") != NULL)
 672                         vattr.va_type = VCHR;
 673                 else
 674                         vattr.va_type = VBLK;
 675                 vattr.va_mode = SDEV_DEVMODE_DEFAULT;
 676         }
 677         kmem_free(dsname, strlen(dsname) + 1);
 678 
 679         rw_enter(&parent->sdev_contents, RW_WRITER);
 680 
 681         res = sdev_mknode(parent, nm, &dv, &vattr,
 682             NULL, NULL, kcred, SDEV_READY);
 683         rw_exit(&parent->sdev_contents);
 684         if (res != 0)
 685                 return (ENOENT);
 686 
 687         SDEV_RELE(dv);
 688         return (0);
 689 }
 690 
 691 /*ARGSUSED*/
 692 static int
 693 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
 694     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
 695     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
 696 {
 697         enum vtype expected_type = VDIR;
 698         struct sdev_node *parent = VTOSDEV(dvp);
 699         char *dsname;
 700         dmu_objset_type_t do_type;
 701         int error;
 702 
 703         sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
 704         *vpp = NULL;
 705         /* execute access is required to search the directory */
 706         if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
 707                 return (error);
 708 
 709         rw_enter(&parent->sdev_contents, RW_READER);
 710         if (!SDEV_IS_GLOBAL(parent)) {
 711                 int res;
 712 
 713                 rw_exit(&parent->sdev_contents);
 714 
 715                 /*
 716                  * If we're in the global zone and reach down into a non-global
 717                  * zone's /dev/zvol then this action could trigger the creation
 718                  * of all of the zvol devices for every zone into the non-global
 719                  * zone's /dev tree. This could be a big security hole. To
 720                  * prevent this, disallow the global zone from looking inside
 721                  * a non-global zones /dev/zvol. This behavior is similar to
 722                  * delegated datasets, which cannot be used by the global zone.
 723                  */
 724                 if (getzoneid() == GLOBAL_ZONEID)
 725                         return (EPERM);
 726 
 727                 res = prof_lookup(dvp, nm, vpp, cred);
 728 
 729                 /*
 730                  * We won't find a zvol that was dynamically created inside
 731                  * a NGZ, within a delegated dataset, in the zone's dev profile
 732                  * but prof_lookup will also find it via sdev_cache_lookup.
 733                  */
 734                 if (res == ENOENT) {
 735                         /*
 736                          * We have to create the sdev node for the dymamically
 737                          * created zvol.
 738                          */
 739                         if (devzvol_mk_ngz_node(parent, nm) != 0)
 740                                 return (ENOENT);
 741                         res = prof_lookup(dvp, nm, vpp, cred);
 742                 }
 743 
 744                 return (res);
 745         }
 746 
 747         /*
 748          * Don't let the global-zone style lookup succeed here when we're not
 749          * running in the global zone. This can happen because prof calls into
 750          * us (in prof_filldir) trying to create an explicitly passed-through
 751          * zvol device outside any delegated dataset.
 752          *
 753          * We have to stop this here or else we will create prof shadows of
 754          * the global zone symlink, which will make no sense at all in the
 755          * non-global zone (it has no /devices for the symlink to point at).
 756          *
 757          * These zvols will be created later (at access time) by mk_ngz_node
 758          * instead. The dirs leading up to them will be created by prof
 759          * internally.
 760          *
 761          * We have to return EPERM here, because ENOENT is given special
 762          * meaning by prof in this context.
 763          */
 764         if (getzoneid() != GLOBAL_ZONEID) {
 765                 rw_exit(&parent->sdev_contents);
 766                 return (EPERM);
 767         }
 768 
 769         dsname = devzvol_make_dsname(parent->sdev_path, nm);
 770         rw_exit(&parent->sdev_contents);
 771         sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
 772         if (dsname) {
 773                 error = devzvol_objset_check(dsname, &do_type);
 774                 if (error != 0) {
 775                         error = ENOENT;
 776                         goto out;
 777                 }
 778                 if (do_type == DMU_OST_ZVOL)
 779                         expected_type = VLNK;
 780         }
 781         /*
 782          * the callbacks expect:
 783          *
 784          * parent->sdev_path            nm
 785          * /dev/zvol                       {r}dsk
 786          * /dev/zvol/{r}dsk                <pool name>
 787          * /dev/zvol/{r}dsk/<dataset name> <last ds component>
 788          *
 789          * sdev_name is always last path component of sdev_path
 790          */
 791         if (expected_type == VDIR) {
 792                 error = devname_lookup_func(parent, nm, vpp, cred,
 793                     devzvol_create_dir, SDEV_VATTR);
 794         } else {
 795                 error = devname_lookup_func(parent, nm, vpp, cred,
 796                     devzvol_create_link, SDEV_VLINK);
 797         }
 798         sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
 799         ASSERT(error || ((*vpp)->v_type == expected_type));
 800 out:
 801         if (dsname)
 802                 kmem_free(dsname, strlen(dsname) + 1);
 803         sdcmn_err13(("devzvol_lookup %d", error));
 804         return (error);
 805 }
 806 
 807 /*
 808  * We allow create to find existing nodes
 809  *      - if the node doesn't exist - EROFS
 810  *      - creating an existing dir read-only succeeds, otherwise EISDIR
 811  *      - exclusive creates fail - EEXIST
 812  */
 813 /*ARGSUSED2*/
 814 static int
 815 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
 816     int mode, struct vnode **vpp, struct cred *cred, int flag,
 817     caller_context_t *ct, vsecattr_t *vsecp)
 818 {
 819         int error;
 820         struct vnode *vp;
 821 
 822         *vpp = NULL;
 823 
 824         error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
 825             NULL);
 826         if (error == 0) {
 827                 if (excl == EXCL)
 828                         error = EEXIST;
 829                 else if (vp->v_type == VDIR && (mode & VWRITE))
 830                         error = EISDIR;
 831                 else
 832                         error = VOP_ACCESS(vp, mode, 0, cred, ct);
 833 
 834                 if (error) {
 835                         VN_RELE(vp);
 836                 } else
 837                         *vpp = vp;
 838         } else if (error == ENOENT) {
 839                 error = EROFS;
 840         }
 841 
 842         return (error);
 843 }
 844 
 845 void sdev_iter_snapshots(struct vnode *dvp, char *name);
 846 
 847 void
 848 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
 849 {
 850         zfs_cmd_t       *zc;
 851         int rc;
 852 
 853         sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
 854         zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 855         (void) strcpy(zc->zc_name, name);
 856 
 857         while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
 858                 struct vnode *vpp;
 859                 char *ptr;
 860 
 861                 sdcmn_err13(("  name %s", zc->zc_name));
 862                 if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
 863                         goto skip;
 864                 ptr = strrchr(zc->zc_name, '/') + 1;
 865                 rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
 866                     kcred, NULL, NULL, NULL);
 867                 if (rc == 0) {
 868                         VN_RELE(vpp);
 869                 } else if (rc == ENOENT) {
 870                         goto skip;
 871                 } else {
 872                         /*
 873                          * EBUSY == problem with zvols's dmu holds?
 874                          * EPERM when in a NGZ and traversing up and out.
 875                          */
 876                         goto skip;
 877                 }
 878                 if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
 879                     zc->zc_objset_stats.dds_type == DMU_OST_ZVOL &&
 880                     devzvol_snaps_allowed)
 881                         sdev_iter_snapshots(dvp, zc->zc_name);
 882 skip:
 883                 (void) strcpy(zc->zc_name, name);
 884         }
 885         kmem_free(zc, sizeof (zfs_cmd_t));
 886 }
 887 
 888 void
 889 sdev_iter_snapshots(struct vnode *dvp, char *name)
 890 {
 891         sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
 892 }
 893 
 894 /*ARGSUSED4*/
 895 static int
 896 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
 897     int *eofp, caller_context_t *ct_unused, int flags_unused)
 898 {
 899         struct sdev_node *sdvp = VTOSDEV(dvp);
 900         char *ptr;
 901 
 902         sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
 903             sdvp->sdev_name));
 904 
 905         if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
 906                 struct vnode *vp;
 907 
 908                 rw_exit(&sdvp->sdev_contents);
 909                 (void) devname_lookup_func(sdvp, "dsk", &vp, cred,
 910                     devzvol_create_dir, SDEV_VATTR);
 911                 VN_RELE(vp);
 912                 (void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
 913                     devzvol_create_dir, SDEV_VATTR);
 914                 VN_RELE(vp);
 915                 rw_enter(&sdvp->sdev_contents, RW_READER);
 916                 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
 917         }
 918         if (uiop->uio_offset == 0)
 919                 devzvol_prunedir(sdvp);
 920         ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
 921         if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
 922                 rw_exit(&sdvp->sdev_contents);
 923                 devzvol_create_pool_dirs(dvp);
 924                 rw_enter(&sdvp->sdev_contents, RW_READER);
 925                 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
 926         }
 927 
 928         ptr = strchr(ptr + 1, '/');
 929         if (ptr == NULL)
 930                 return (ENOENT);
 931         ptr++;
 932         rw_exit(&sdvp->sdev_contents);
 933         sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
 934         rw_enter(&sdvp->sdev_contents, RW_READER);
 935         return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
 936 }
 937 
 938 const fs_operation_def_t devzvol_vnodeops_tbl[] = {
 939         VOPNAME_READDIR,        { .vop_readdir = devzvol_readdir },
 940         VOPNAME_LOOKUP,         { .vop_lookup = devzvol_lookup },
 941         VOPNAME_CREATE,         { .vop_create = devzvol_create },
 942         VOPNAME_RENAME,         { .error = fs_nosys },
 943         VOPNAME_MKDIR,          { .error = fs_nosys },
 944         VOPNAME_RMDIR,          { .error = fs_nosys },
 945         VOPNAME_REMOVE,         { .error = fs_nosys },
 946         VOPNAME_SYMLINK,        { .error = fs_nosys },
 947         NULL,                   NULL
 948 };