1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2013, 2016 Joyent, Inc.  All rights reserved.
  25  * Copyright (c) 2014 by Delphix. All rights reserved.
  26  */
  27 
  28 /* vnode ops for the /dev/zvol directory */
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/ddi.h>
  34 #include <sys/sunndi.h>
  35 #include <sys/sunldi.h>
  36 #include <fs/fs_subr.h>
  37 #include <sys/fs/dv_node.h>
  38 #include <sys/fs/sdev_impl.h>
  39 #include <sys/zfs_ioctl.h>
  40 #include <sys/policy.h>
  41 #include <sys/stat.h>
  42 #include <sys/vfs_opreg.h>
  43 
/* vnode operations vector for this file's /dev/zvol nodes */
struct vnodeops *devzvol_vnodeops;
/* major number of the /dev/zfs device; recorded by devzvol_open_zfs() */
static major_t devzvol_major;
/* preallocated taskq entry used to dispatch devzvol_update_zclist_cb() */
static taskq_ent_t devzvol_zclist_task;

static kmutex_t devzvol_mtx;
/* Below are protected by devzvol_mtx */
/* B_TRUE once devzvol_open_zfs() has succeeded and until it is closed */
static boolean_t devzvol_isopen;
/* B_TRUE while a pool-list refresh has been dispatched and not finished */
static boolean_t devzvol_zclist_task_running = B_FALSE;
/* cookie returned by the last successful ZFS_IOC_POOL_CONFIGS ioctl */
static uint64_t devzvol_gen = 0;
/* address of the cached packed pool-config nvlist (0 when none) */
static uint64_t devzvol_zclist;
/* allocated size of the devzvol_zclist buffer, used when freeing it */
static size_t devzvol_zclist_size;
/* LDI identity and handle for the open /dev/zfs device */
static ldi_ident_t devzvol_li;
static ldi_handle_t devzvol_lh;
  57 
/*
 * we need to use ddi_mod* since fs/dev gets loaded early on in
 * startup(), and linking fs/dev to fs/zfs would drag in a lot of
 * other stuff (like drv/random) before the rest of the system is
 * ready to go
 *
 * zfs_mod is the handle returned by ddi_modopen("fs/zfs"); szcm and
 * szn2m are the zvol_create_minor and zvol_name2minor entry points
 * resolved from it by devzvol_open_zfs().  Both pointers are NULL
 * whenever the module is not open.
 */
ddi_modhandle_t zfs_mod;
int (*szcm)(char *);
int (*szn2m)(char *, minor_t *);


/*
 * Controls whether snapshot names are accepted under /dev/zvol.  When
 * B_FALSE, devzvol_objset_check() rejects any name containing '@' with
 * ENOTSUP.  Snapshots are allowed by default, preserving the historic
 * behavior.
 */
boolean_t devzvol_snaps_allowed = B_TRUE;
  74 
  75 int
  76 sdev_zvol_create_minor(char *dsname)
  77 {
  78         if (szcm == NULL)
  79                 return (-1);
  80         return ((*szcm)(dsname));
  81 }
  82 
  83 int
  84 sdev_zvol_name2minor(char *dsname, minor_t *minor)
  85 {
  86         if (szn2m == NULL)
  87                 return (-1);
  88         return ((*szn2m)(dsname, minor));
  89 }
  90 
  91 int
  92 devzvol_open_zfs()
  93 {
  94         int rc;
  95         dev_t dv;
  96 
  97         devzvol_li = ldi_ident_from_anon();
  98         if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
  99             &devzvol_lh, devzvol_li))
 100                 return (-1);
 101         if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
 102             KRTLD_MODE_FIRST, &rc)) == NULL)) {
 103                 return (rc);
 104         }
 105         ASSERT(szcm == NULL && szn2m == NULL);
 106         if ((szcm = (int (*)(char *))
 107             ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
 108                 cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
 109                 return (rc);
 110         }
 111         if ((szn2m = (int(*)(char *, minor_t *))
 112             ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
 113                 cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
 114                 return (rc);
 115         }
 116         if (ldi_get_dev(devzvol_lh, &dv))
 117                 return (-1);
 118         devzvol_major = getmajor(dv);
 119         return (0);
 120 }
 121 
 122 void
 123 devzvol_close_zfs()
 124 {
 125         szcm = NULL;
 126         szn2m = NULL;
 127         (void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
 128         ldi_ident_release(devzvol_li);
 129         if (zfs_mod != NULL) {
 130                 (void) ddi_modclose(zfs_mod);
 131                 zfs_mod = NULL;
 132         }
 133 }
 134 
 135 int
 136 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
 137 {
 138         uint64_t cookie;
 139         int size = 8000;
 140         int unused;
 141         int rc;
 142 
 143         if (cmd != ZFS_IOC_POOL_CONFIGS)
 144                 mutex_enter(&devzvol_mtx);
 145         if (!devzvol_isopen) {
 146                 if ((rc = devzvol_open_zfs()) == 0) {
 147                         devzvol_isopen = B_TRUE;
 148                 } else {
 149                         if (cmd != ZFS_IOC_POOL_CONFIGS)
 150                                 mutex_exit(&devzvol_mtx);
 151                         return (ENXIO);
 152                 }
 153         }
 154         cookie = zc->zc_cookie;
 155 again:
 156         zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
 157             KM_SLEEP);
 158         zc->zc_nvlist_dst_size = size;
 159         rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
 160             &unused);
 161         if (rc == ENOMEM) {
 162                 int newsize;
 163                 newsize = zc->zc_nvlist_dst_size;
 164                 ASSERT(newsize > size);
 165                 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
 166                 size = newsize;
 167                 zc->zc_cookie = cookie;
 168                 goto again;
 169         }
 170         if (alloc_size == NULL)
 171                 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
 172         else
 173                 *alloc_size = size;
 174         if (cmd != ZFS_IOC_POOL_CONFIGS)
 175                 mutex_exit(&devzvol_mtx);
 176         return (rc);
 177 }
 178 
 179 /* figures out if the objset exists and returns its type */
 180 int
 181 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
 182 {
 183         boolean_t       ispool, is_snapshot;
 184         zfs_cmd_t       *zc;
 185         int rc;
 186         nvlist_t        *nvl;
 187         size_t nvsz;
 188 
 189         ispool = (strchr(dsname, '/') == NULL);
 190         is_snapshot = (strchr(dsname, '@') != NULL);
 191 
 192         if (is_snapshot && !devzvol_snaps_allowed)
 193                 return (ENOTSUP);
 194 
 195         zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 196         (void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
 197 
 198         nvl = fnvlist_alloc();
 199         fnvlist_add_boolean_value(nvl, "cachedpropsonly", B_TRUE);
 200         zc->zc_nvlist_src = (uintptr_t)fnvlist_pack(nvl, &nvsz);
 201         zc->zc_nvlist_src_size = nvsz;
 202         fnvlist_free(nvl);
 203 
 204         rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
 205             ZFS_IOC_OBJSET_STATS, zc, NULL);
 206         if (type && rc == 0)
 207                 *type = (ispool) ? DMU_OST_ZFS :
 208                     zc->zc_objset_stats.dds_type;
 209         fnvlist_pack_free((char *)(uintptr_t)zc->zc_nvlist_src, nvsz);
 210         kmem_free(zc, sizeof (zfs_cmd_t));
 211         return (rc);
 212 }
 213 
 214 /*
 215  * Returns what the zfs dataset name should be, given the /dev/zvol
 216  * path and an optional name (can be NULL).
 217  *
 218  * Note that if the name param is NULL, then path must be an
 219  * actual dataset's directory and not one of the top-level
 220  * /dev/zvol/{dsk,rdsk} dirs, as these do not correspond to a
 221  * specific dataset.
 222  */
 223 char *
 224 devzvol_make_dsname(const char *path, const char *name)
 225 {
 226         char *dsname;
 227         const char *ptr;
 228         int dslen;
 229 
 230         if (strcmp(path, ZVOL_DIR) == 0)
 231                 return (NULL);
 232         if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
 233                 return (NULL);
 234         ptr = path + strlen(ZVOL_DIR);
 235         if (strncmp(ptr, "/dsk", 4) == 0)
 236                 ptr += strlen("/dsk");
 237         else if (strncmp(ptr, "/rdsk", 5) == 0)
 238                 ptr += strlen("/rdsk");
 239         else
 240                 return (NULL);
 241 
 242         if (*ptr == '/')
 243                 ptr++;
 244         else if (name == NULL)
 245                 return (NULL);
 246 
 247         dslen = strlen(ptr);
 248         if (dslen)
 249                 dslen++;                        /* plus null */
 250         if (name)
 251                 dslen += strlen(name) + 1;      /* plus slash */
 252         dsname = kmem_zalloc(dslen, KM_SLEEP);
 253         if (*ptr) {
 254                 (void) strlcpy(dsname, ptr, dslen);
 255                 if (name)
 256                         (void) strlcat(dsname, "/", dslen);
 257         }
 258         if (name)
 259                 (void) strlcat(dsname, name, dslen);
 260         return (dsname);
 261 }
 262 
 263 /*
 264  * check if the zvol's sdev_node is still valid, which means make
 265  * sure the zvol is still valid.  zvol minors aren't proactively
 266  * destroyed when the zvol is destroyed, so we use a validator to clean
 267  * these up (in other words, when such nodes are encountered during
 268  * subsequent lookup() and readdir() operations) so that only valid
 269  * nodes are returned.  The ordering between devname_lookup_func and
 270  * devzvol_validate is a little inefficient in the case of invalid
 271  * or stale nodes because devname_lookup_func calls
 272  * devzvol_create_{dir, link}, then the validator says it's invalid,
 273  * and then the node gets cleaned up.
 274  */
 275 int
 276 devzvol_validate(struct sdev_node *dv)
 277 {
 278         vnode_t *vn = SDEVTOV(dv);
 279         dmu_objset_type_t do_type;
 280         char *dsname;
 281         char *nm = dv->sdev_name;
 282         int rc;
 283 
 284         sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
 285         /*
 286          * validate only READY nodes; if someone is sitting on the
 287          * directory of a dataset that just got destroyed we could
 288          * get a zombie node which we just skip.
 289          */
 290         if (dv->sdev_state != SDEV_READY) {
 291                 sdcmn_err13(("skipping '%s'", nm));
 292                 return (SDEV_VTOR_SKIP);
 293         }
 294 
 295         if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
 296             (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
 297                 return (SDEV_VTOR_VALID);
 298         dsname = devzvol_make_dsname(dv->sdev_path, NULL);
 299         if (dsname == NULL)
 300                 return (SDEV_VTOR_INVALID);
 301 
 302         /*
 303          * Leave any nodes alone that have been explicitly created by
 304          * sdev profiles.
 305          */
 306         if (!(dv->sdev_flags & SDEV_GLOBAL) && dv->sdev_origin != NULL) {
 307                 kmem_free(dsname, strlen(dsname) + 1);
 308                 return (SDEV_VTOR_VALID);
 309         }
 310 
 311         rc = devzvol_objset_check(dsname, &do_type);
 312         sdcmn_err13(("  '%s' rc %d", dsname, rc));
 313         if (rc != 0) {
 314                 sdev_node_t *parent = dv->sdev_dotdot;
 315                 /*
 316                  * Explicitly passed-through zvols in our sdev profile can't
 317                  * be created as prof_* shadow nodes, because in the GZ they
 318                  * are symlinks, but in the NGZ they are actual device files.
 319                  *
 320                  * The objset_check will fail on these as they are outside
 321                  * any delegated dataset (zfs will not allow ioctl access to
 322                  * them from this zone). We still want them to work, though.
 323                  */
 324                 if (!(parent->sdev_flags & SDEV_GLOBAL) &&
 325                     parent->sdev_origin != NULL &&
 326                     !(dv->sdev_flags & SDEV_GLOBAL) &&
 327                     (vn->v_type == VBLK || vn->v_type == VCHR) &&
 328                     prof_name_matched(nm, parent)) {
 329                         do_type = DMU_OST_ZVOL;
 330                 } else {
 331                         kmem_free(dsname, strlen(dsname) + 1);
 332                         return (SDEV_VTOR_INVALID);
 333                 }
 334         }
 335 
 336         sdcmn_err13(("  v_type %d do_type %d",
 337             vn->v_type, do_type));
 338         if ((vn->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
 339             ((vn->v_type == VBLK || vn->v_type == VCHR) &&
 340             do_type != DMU_OST_ZVOL) ||
 341             (vn->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
 342                 kmem_free(dsname, strlen(dsname) + 1);
 343                 return (SDEV_VTOR_STALE);
 344         }
 345         if (vn->v_type == VLNK) {
 346                 char *ptr, *link;
 347                 long val = 0;
 348                 minor_t lminor, ominor;
 349 
 350                 rc = sdev_getlink(vn, &link);
 351                 ASSERT(rc == 0);
 352 
 353                 ptr = strrchr(link, ':') + 1;
 354                 rc = ddi_strtol(ptr, NULL, 10, &val);
 355                 kmem_free(link, strlen(link) + 1);
 356                 ASSERT(rc == 0 && val != 0);
 357                 lminor = (minor_t)val;
 358                 if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
 359                     ominor != lminor) {
 360                         kmem_free(dsname, strlen(dsname) + 1);
 361                         return (SDEV_VTOR_STALE);
 362                 }
 363         }
 364         kmem_free(dsname, strlen(dsname) + 1);
 365         return (SDEV_VTOR_VALID);
 366 }
 367 
 368 /*
 369  * Taskq callback to update the devzvol_zclist.
 370  *
 371  * We need to defer this to the taskq to avoid it running with a user
 372  * context that might be associated with some non-global zone, and thus
 373  * not being able to list all of the pools on the entire system.
 374  */
 375 /*ARGSUSED*/
 376 static void
 377 devzvol_update_zclist_cb(void *arg)
 378 {
 379         zfs_cmd_t       *zc;
 380         int             rc;
 381         size_t          size;
 382 
 383         zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 384         mutex_enter(&devzvol_mtx);
 385         zc->zc_cookie = devzvol_gen;
 386 
 387         rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
 388         switch (rc) {
 389                 case 0:
 390                         /* new generation */
 391                         ASSERT(devzvol_gen != zc->zc_cookie);
 392                         devzvol_gen = zc->zc_cookie;
 393                         if (devzvol_zclist)
 394                                 kmem_free((void *)(uintptr_t)devzvol_zclist,
 395                                     devzvol_zclist_size);
 396                         devzvol_zclist = zc->zc_nvlist_dst;
 397                         /* Keep the alloc'd size, not the nvlist size. */
 398                         devzvol_zclist_size = size;
 399                         break;
 400                 default:
 401                         /*
 402                          * Either there was no change in pool configuration
 403                          * since we last asked (rc == EEXIST) or we got a
 404                          * catastrophic error.
 405                          *
 406                          * Give up memory and exit.
 407                          */
 408                         kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
 409                             size);
 410                         break;
 411         }
 412 
 413         VERIFY(devzvol_zclist_task_running == B_TRUE);
 414         devzvol_zclist_task_running = B_FALSE;
 415         mutex_exit(&devzvol_mtx);
 416 
 417         kmem_free(zc, sizeof (zfs_cmd_t));
 418 }
 419 
 420 static void
 421 devzvol_update_zclist(void)
 422 {
 423         mutex_enter(&devzvol_mtx);
 424         if (devzvol_zclist_task_running == B_TRUE) {
 425                 mutex_exit(&devzvol_mtx);
 426                 goto wait;
 427         }
 428 
 429         devzvol_zclist_task_running = B_TRUE;
 430 
 431         taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0,
 432             &devzvol_zclist_task);
 433 
 434         mutex_exit(&devzvol_mtx);
 435 
 436 wait:
 437         taskq_wait(sdev_taskq);
 438 }
 439 
 440 /*
 441  * Creates sub-directories for each zpool as needed in response to a
 442  * readdir on one of the /dev/zvol/{dsk,rdsk} directories.
 443  */
 444 void
 445 devzvol_create_pool_dirs(struct vnode *dvp)
 446 {
 447         nvlist_t *nv = NULL;
 448         nvpair_t *elem = NULL;
 449         int pools = 0;
 450         int rc;
 451 
 452         sdcmn_err13(("devzvol_create_pool_dirs"));
 453 
 454         devzvol_update_zclist();
 455 
 456         mutex_enter(&devzvol_mtx);
 457 
 458         rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
 459             devzvol_zclist_size, &nv, 0);
 460         if (rc) {
 461                 ASSERT(rc == 0);
 462                 kmem_free((void *)(uintptr_t)devzvol_zclist,
 463                     devzvol_zclist_size);
 464                 devzvol_gen = 0;
 465                 devzvol_zclist = NULL;
 466                 devzvol_zclist_size = 0;
 467                 goto out;
 468         }
 469         mutex_exit(&devzvol_mtx);
 470         while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
 471                 struct vnode *vp;
 472                 ASSERT(dvp->v_count > 0);
 473                 rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
 474                     NULL, kcred, NULL, 0, NULL);
 475                 /*
 476                  * should either work or we should get an error if this should
 477                  * not be visible from the zone, or disallowed in the zone
 478                  */
 479                 if (rc == 0)
 480                         VN_RELE(vp);
 481                 pools++;
 482         }
 483         nvlist_free(nv);
 484         mutex_enter(&devzvol_mtx);
 485         if (devzvol_isopen && pools == 0) {
 486                 /* clean up so zfs can be unloaded */
 487                 devzvol_close_zfs();
 488                 devzvol_isopen = B_FALSE;
 489         }
 490 out:
 491         mutex_exit(&devzvol_mtx);
 492 }
 493 
 494 /*ARGSUSED3*/
 495 static int
 496 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
 497     cred_t *cred, void *whatever, char *whichever)
 498 {
 499         timestruc_t now;
 500         struct vattr *vap = (struct vattr *)arg;
 501 
 502         sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
 503             ddv->sdev_path, nm));
 504         ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
 505             strlen(ZVOL_DIR)) == 0);
 506         *vap = *sdev_getdefault_attr(VDIR);
 507         gethrestime(&now);
 508         vap->va_atime = now;
 509         vap->va_mtime = now;
 510         vap->va_ctime = now;
 511         return (0);
 512 }
 513 
 514 /*ARGSUSED3*/
 515 static int
 516 devzvol_create_link(struct sdev_node *ddv, char *nm,
 517     void **arg, cred_t *cred, void *whatever, char *whichever)
 518 {
 519         minor_t minor;
 520         char *pathname = (char *)*arg;
 521         int rc;
 522         char *dsname;
 523         char *x;
 524         char str[MAXNAMELEN];
 525         sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
 526             ddv->sdev_path, nm));
 527         dsname = devzvol_make_dsname(ddv->sdev_path, nm);
 528         rc = sdev_zvol_create_minor(dsname);
 529         if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
 530             sdev_zvol_name2minor(dsname, &minor)) {
 531                 sdcmn_err13(("devzvol_create_link %d", rc));
 532                 kmem_free(dsname, strlen(dsname) + 1);
 533                 return (-1);
 534         }
 535         kmem_free(dsname, strlen(dsname) + 1);
 536 
 537         /*
 538          * This is a valid zvol; create a symlink that points to the
 539          * minor which was created under /devices/pseudo/zfs@0
 540          */
 541         *pathname = '\0';
 542         for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
 543                 (void) strcat(pathname, "../");
 544         (void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
 545         (void) strncat(pathname, str, MAXPATHLEN);
 546         if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
 547             strlen(ZVOL_FULL_RDEV_DIR)) == 0)
 548                 (void) strcat(pathname, ",raw");
 549         return (0);
 550 }
 551 
 552 /* Clean zvol sdev_nodes that are no longer valid.  */
 553 static void
 554 devzvol_prunedir(struct sdev_node *ddv)
 555 {
 556         struct sdev_node *dv;
 557 
 558         ASSERT(RW_READ_HELD(&ddv->sdev_contents));
 559 
 560         sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
 561         ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
 562         if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
 563                 rw_exit(&ddv->sdev_contents);
 564                 rw_enter(&ddv->sdev_contents, RW_WRITER);
 565         }
 566 
 567         dv = SDEV_FIRST_ENTRY(ddv);
 568         while (dv) {
 569                 sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
 570 
 571                 switch (devzvol_validate(dv)) {
 572                 case SDEV_VTOR_VALID:
 573                 case SDEV_VTOR_SKIP:
 574                         dv = SDEV_NEXT_ENTRY(ddv, dv);
 575                         continue;
 576                 case SDEV_VTOR_INVALID:
 577                         sdcmn_err7(("prunedir: destroy invalid "
 578                             "node: %s\n", dv->sdev_name));
 579                         break;
 580                 }
 581 
 582                 if ((SDEVTOV(dv)->v_type == VDIR) &&
 583                     (sdev_cleandir(dv, NULL, 0) != 0)) {
 584                         dv = SDEV_NEXT_ENTRY(ddv, dv);
 585                         continue;
 586                 }
 587                 SDEV_HOLD(dv);
 588                 /* remove the cache node */
 589                 sdev_cache_update(ddv, &dv, dv->sdev_name,
 590                     SDEV_CACHE_DELETE);
 591                 SDEV_RELE(dv);
 592                 dv = SDEV_FIRST_ENTRY(ddv);
 593         }
 594         rw_downgrade(&ddv->sdev_contents);
 595 }
 596 
 597 /*
 598  * This function is used to create a dir or dev inside a zone's /dev when the
 599  * zone has a zvol that is dynamically created within the zone (i.e. inside
 600  * of a delegated dataset.  Since there is no /devices tree within a zone,
 601  * we create the chr/blk devices directly inside the zone's /dev instead of
 602  * making symlinks.
 603  */
 604 static int
 605 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
 606 {
 607         struct vattr vattr;
 608         timestruc_t now;
 609         enum vtype expected_type = VDIR;
 610         dmu_objset_type_t do_type;
 611         struct sdev_node *dv = NULL;
 612         int res;
 613         char *dsname;
 614 
 615         bzero(&vattr, sizeof (vattr));
 616         gethrestime(&now);
 617         vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
 618         vattr.va_uid = SDEV_UID_DEFAULT;
 619         vattr.va_gid = SDEV_GID_DEFAULT;
 620         vattr.va_type = VNON;
 621         vattr.va_atime = now;
 622         vattr.va_mtime = now;
 623         vattr.va_ctime = now;
 624 
 625         if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
 626                 return (ENOENT);
 627 
 628         if (devzvol_objset_check(dsname, &do_type) != 0) {
 629                 /*
 630                  * objset_check will succeed on any valid objset in the global
 631                  * zone, and any valid delegated dataset. It will fail, however,
 632                  * in non-global zones on explicitly whitelisted zvol devices
 633                  * that are outside any delegated dataset.
 634                  *
 635                  * The directories leading up to the zvol device itself will be
 636                  * created by prof for us in advance (and will always validate
 637                  * because of the matching check in devzvol_validate). The zvol
 638                  * device itself can't be created by prof though because in the
 639                  * GZ it's a symlink, and in the NGZ it is not. So, we create
 640                  * such zvol device files here.
 641                  */
 642                 if (!(parent->sdev_flags & SDEV_GLOBAL) &&
 643                     parent->sdev_origin != NULL &&
 644                     prof_name_matched(nm, parent)) {
 645                         do_type = DMU_OST_ZVOL;
 646                 } else {
 647                         kmem_free(dsname, strlen(dsname) + 1);
 648                         return (ENOENT);
 649                 }
 650         }
 651 
 652         if (do_type == DMU_OST_ZVOL)
 653                 expected_type = VBLK;
 654 
 655         if (expected_type == VDIR) {
 656                 vattr.va_type = VDIR;
 657                 vattr.va_mode = SDEV_DIRMODE_DEFAULT;
 658         } else {
 659                 minor_t minor;
 660                 dev_t devnum;
 661                 int rc;
 662 
 663                 rc = sdev_zvol_create_minor(dsname);
 664                 if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
 665                     sdev_zvol_name2minor(dsname, &minor)) {
 666                         kmem_free(dsname, strlen(dsname) + 1);
 667                         return (ENOENT);
 668                 }
 669 
 670                 devnum = makedevice(devzvol_major, minor);
 671                 vattr.va_rdev = devnum;
 672 
 673                 if (strstr(parent->sdev_path, "/rdsk/") != NULL)
 674                         vattr.va_type = VCHR;
 675                 else
 676                         vattr.va_type = VBLK;
 677                 vattr.va_mode = SDEV_DEVMODE_DEFAULT;
 678         }
 679         kmem_free(dsname, strlen(dsname) + 1);
 680 
 681         rw_enter(&parent->sdev_contents, RW_WRITER);
 682 
 683         res = sdev_mknode(parent, nm, &dv, &vattr,
 684             NULL, NULL, kcred, SDEV_READY);
 685         rw_exit(&parent->sdev_contents);
 686         if (res != 0)
 687                 return (ENOENT);
 688 
 689         SDEV_RELE(dv);
 690         return (0);
 691 }
 692 
 693 /*ARGSUSED*/
 694 static int
 695 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
 696     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
 697     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
 698 {
 699         enum vtype expected_type = VDIR;
 700         struct sdev_node *parent = VTOSDEV(dvp);
 701         char *dsname;
 702         dmu_objset_type_t do_type;
 703         int error;
 704 
 705         sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
 706         *vpp = NULL;
 707         /* execute access is required to search the directory */
 708         if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
 709                 return (error);
 710 
 711         rw_enter(&parent->sdev_contents, RW_READER);
 712         if (!SDEV_IS_GLOBAL(parent)) {
 713                 int res;
 714 
 715                 rw_exit(&parent->sdev_contents);
 716 
 717                 /*
 718                  * If we're in the global zone and reach down into a non-global
 719                  * zone's /dev/zvol then this action could trigger the creation
 720                  * of all of the zvol devices for every zone into the non-global
 721                  * zone's /dev tree. This could be a big security hole. To
 722                  * prevent this, disallow the global zone from looking inside
 723                  * a non-global zones /dev/zvol. This behavior is similar to
 724                  * delegated datasets, which cannot be used by the global zone.
 725                  */
 726                 if (getzoneid() == GLOBAL_ZONEID)
 727                         return (EPERM);
 728 
 729                 res = prof_lookup(dvp, nm, vpp, cred);
 730 
 731                 /*
 732                  * We won't find a zvol that was dynamically created inside
 733                  * a NGZ, within a delegated dataset, in the zone's dev profile
 734                  * but prof_lookup will also find it via sdev_cache_lookup.
 735                  */
 736                 if (res == ENOENT) {
 737                         /*
 738                          * We have to create the sdev node for the dymamically
 739                          * created zvol.
 740                          */
 741                         if (devzvol_mk_ngz_node(parent, nm) != 0)
 742                                 return (ENOENT);
 743                         res = prof_lookup(dvp, nm, vpp, cred);
 744                 }
 745 
 746                 return (res);
 747         }
 748 
 749         /*
 750          * Don't let the global-zone style lookup succeed here when we're not
 751          * running in the global zone. This can happen because prof calls into
 752          * us (in prof_filldir) trying to create an explicitly passed-through
 753          * zvol device outside any delegated dataset.
 754          *
 755          * We have to stop this here or else we will create prof shadows of
 756          * the global zone symlink, which will make no sense at all in the
 757          * non-global zone (it has no /devices for the symlink to point at).
 758          *
 759          * These zvols will be created later (at access time) by mk_ngz_node
 760          * instead. The dirs leading up to them will be created by prof
 761          * internally.
 762          *
 763          * We have to return EPERM here, because ENOENT is given special
 764          * meaning by prof in this context.
 765          */
 766         if (getzoneid() != GLOBAL_ZONEID) {
 767                 rw_exit(&parent->sdev_contents);
 768                 return (EPERM);
 769         }
 770 
 771         dsname = devzvol_make_dsname(parent->sdev_path, nm);
 772         rw_exit(&parent->sdev_contents);
 773         sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
 774         if (dsname) {
 775                 error = devzvol_objset_check(dsname, &do_type);
 776                 if (error != 0) {
 777                         error = ENOENT;
 778                         goto out;
 779                 }
 780                 if (do_type == DMU_OST_ZVOL)
 781                         expected_type = VLNK;
 782         }
 783         /*
 784          * the callbacks expect:
 785          *
 786          * parent->sdev_path            nm
 787          * /dev/zvol                       {r}dsk
 788          * /dev/zvol/{r}dsk                <pool name>
 789          * /dev/zvol/{r}dsk/<dataset name> <last ds component>
 790          *
 791          * sdev_name is always last path component of sdev_path
 792          */
 793         if (expected_type == VDIR) {
 794                 error = devname_lookup_func(parent, nm, vpp, cred,
 795                     devzvol_create_dir, SDEV_VATTR);
 796         } else {
 797                 error = devname_lookup_func(parent, nm, vpp, cred,
 798                     devzvol_create_link, SDEV_VLINK);
 799         }
 800         sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
 801         ASSERT(error || ((*vpp)->v_type == expected_type));
 802 out:
 803         if (dsname)
 804                 kmem_free(dsname, strlen(dsname) + 1);
 805         sdcmn_err13(("devzvol_lookup %d", error));
 806         return (error);
 807 }
 808 
 809 /*
 810  * We allow create to find existing nodes
 811  *      - if the node doesn't exist - EROFS
 812  *      - creating an existing dir read-only succeeds, otherwise EISDIR
 813  *      - exclusive creates fail - EEXIST
 814  */
 815 /*ARGSUSED2*/
 816 static int
 817 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
 818     int mode, struct vnode **vpp, struct cred *cred, int flag,
 819     caller_context_t *ct, vsecattr_t *vsecp)
 820 {
 821         int error;
 822         struct vnode *vp;
 823 
 824         *vpp = NULL;
 825 
 826         error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
 827             NULL);
 828         if (error == 0) {
 829                 if (excl == EXCL)
 830                         error = EEXIST;
 831                 else if (vp->v_type == VDIR && (mode & VWRITE))
 832                         error = EISDIR;
 833                 else
 834                         error = VOP_ACCESS(vp, mode, 0, cred, ct);
 835 
 836                 if (error) {
 837                         VN_RELE(vp);
 838                 } else
 839                         *vpp = vp;
 840         } else if (error == ENOENT) {
 841                 error = EROFS;
 842         }
 843 
 844         return (error);
 845 }
 846 
 847 void sdev_iter_snapshots(struct vnode *dvp, char *name);
 848 
 849 void
 850 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
 851 {
 852         zfs_cmd_t       *zc;
 853         int rc;
 854 
 855         sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
 856         zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 857         (void) strcpy(zc->zc_name, name);
 858 
 859         while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
 860                 struct vnode *vpp;
 861                 char *ptr;
 862 
 863                 sdcmn_err13(("  name %s", zc->zc_name));
 864                 if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
 865                         goto skip;
 866                 ptr = strrchr(zc->zc_name, '/') + 1;
 867                 rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
 868                     kcred, NULL, NULL, NULL);
 869                 if (rc == 0) {
 870                         VN_RELE(vpp);
 871                 } else if (rc == ENOENT) {
 872                         goto skip;
 873                 } else {
 874                         /*
 875                          * EBUSY == problem with zvols's dmu holds?
 876                          * EPERM when in a NGZ and traversing up and out.
 877                          */
 878                         goto skip;
 879                 }
 880                 if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
 881                     zc->zc_objset_stats.dds_type == DMU_OST_ZVOL &&
 882                     devzvol_snaps_allowed)
 883                         sdev_iter_snapshots(dvp, zc->zc_name);
 884 skip:
 885                 (void) strcpy(zc->zc_name, name);
 886         }
 887         kmem_free(zc, sizeof (zfs_cmd_t));
 888 }
 889 
 890 void
 891 sdev_iter_snapshots(struct vnode *dvp, char *name)
 892 {
 893         sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
 894 }
 895 
 896 /*ARGSUSED4*/
 897 static int
 898 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
 899     int *eofp, caller_context_t *ct_unused, int flags_unused)
 900 {
 901         struct sdev_node *sdvp = VTOSDEV(dvp);
 902         char *ptr;
 903 
 904         sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
 905             sdvp->sdev_name));
 906 
 907         if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
 908                 struct vnode *vp;
 909 
 910                 rw_exit(&sdvp->sdev_contents);
 911                 (void) devname_lookup_func(sdvp, "dsk", &vp, cred,
 912                     devzvol_create_dir, SDEV_VATTR);
 913                 VN_RELE(vp);
 914                 (void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
 915                     devzvol_create_dir, SDEV_VATTR);
 916                 VN_RELE(vp);
 917                 rw_enter(&sdvp->sdev_contents, RW_READER);
 918                 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
 919         }
 920         if (uiop->uio_offset == 0)
 921                 devzvol_prunedir(sdvp);
 922         ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
 923         if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
 924                 rw_exit(&sdvp->sdev_contents);
 925                 devzvol_create_pool_dirs(dvp);
 926                 rw_enter(&sdvp->sdev_contents, RW_READER);
 927                 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
 928         }
 929 
 930         ptr = strchr(ptr + 1, '/');
 931         if (ptr == NULL)
 932                 return (ENOENT);
 933         ptr++;
 934         rw_exit(&sdvp->sdev_contents);
 935         sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
 936         rw_enter(&sdvp->sdev_contents, RW_READER);
 937         return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
 938 }
 939 
 940 const fs_operation_def_t devzvol_vnodeops_tbl[] = {
 941         VOPNAME_READDIR,        { .vop_readdir = devzvol_readdir },
 942         VOPNAME_LOOKUP,         { .vop_lookup = devzvol_lookup },
 943         VOPNAME_CREATE,         { .vop_create = devzvol_create },
 944         VOPNAME_RENAME,         { .error = fs_nosys },
 945         VOPNAME_MKDIR,          { .error = fs_nosys },
 946         VOPNAME_RMDIR,          { .error = fs_nosys },
 947         VOPNAME_REMOVE,         { .error = fs_nosys },
 948         VOPNAME_SYMLINK,        { .error = fs_nosys },
 949         NULL,                   NULL
 950 };