Revert "Revert "6602 lofi should support labeled devices""
This reverts commit 21386c8bd8477810b291eee22e08f1382e70cdf3.
Revert "6602 lofi should support labeled devices"
This reverts commit 406fc5100dac8d225a315a6def6be8d628f34e24.
Adding AoE support to nza-kernel
--- old/usr/src/uts/common/fs/vfs.c
+++ new/usr/src/uts/common/fs/vfs.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2016 Joyent, Inc.
25 25 * Copyright 2016 Toomas Soome <tsoome@me.com>
26 26 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
27 27 * Copyright 2016 Nexenta Systems, Inc.
28 28 * Copyright 2017 RackTop Systems.
29 29 */
30 30
31 31 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
32 32 /* All Rights Reserved */
33 33
34 34 /*
35 35 * University Copyright- Copyright (c) 1982, 1986, 1988
36 36 * The Regents of the University of California
37 37 * All Rights Reserved
38 38 *
39 39 * University Acknowledgment- Portions of this document are derived from
40 40 * software developed by the University of California, Berkeley, and its
41 41 * contributors.
42 42 */
43 43
44 44 #include <sys/types.h>
45 45 #include <sys/t_lock.h>
46 46 #include <sys/param.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/user.h>
49 49 #include <sys/fstyp.h>
50 50 #include <sys/kmem.h>
51 51 #include <sys/systm.h>
52 52 #include <sys/proc.h>
53 53 #include <sys/mount.h>
54 54 #include <sys/vfs.h>
55 55 #include <sys/vfs_opreg.h>
56 56 #include <sys/fem.h>
57 57 #include <sys/mntent.h>
58 58 #include <sys/stat.h>
59 59 #include <sys/statvfs.h>
60 60 #include <sys/statfs.h>
61 61 #include <sys/cred.h>
62 62 #include <sys/vnode.h>
63 63 #include <sys/rwstlock.h>
64 64 #include <sys/dnlc.h>
65 65 #include <sys/file.h>
66 66 #include <sys/time.h>
67 67 #include <sys/atomic.h>
68 68 #include <sys/cmn_err.h>
69 69 #include <sys/buf.h>
70 70 #include <sys/swap.h>
71 71 #include <sys/debug.h>
72 72 #include <sys/vnode.h>
73 73 #include <sys/modctl.h>
74 74 #include <sys/ddi.h>
75 75 #include <sys/pathname.h>
76 76 #include <sys/bootconf.h>
77 77 #include <sys/dumphdr.h>
78 78 #include <sys/dc_ki.h>
79 79 #include <sys/poll.h>
80 80 #include <sys/sunddi.h>
81 81 #include <sys/sysmacros.h>
82 82 #include <sys/zone.h>
83 83 #include <sys/policy.h>
84 84 #include <sys/ctfs.h>
85 85 #include <sys/objfs.h>
86 86 #include <sys/console.h>
87 87 #include <sys/reboot.h>
88 88 #include <sys/attr.h>
89 89 #include <sys/zio.h>
90 90 #include <sys/spa.h>
91 91 #include <sys/lofi.h>
92 92 #include <sys/bootprops.h>
93 93
94 94 #include <vm/page.h>
95 95
96 96 #include <fs/fs_subr.h>
97 97 /* Private interfaces to create vopstats-related data structures */
98 98 extern void initialize_vopstats(vopstats_t *);
99 99 extern vopstats_t *get_fstype_vopstats(struct vfs *, struct vfssw *);
100 100 extern vsk_anchor_t *get_vskstat_anchor(struct vfs *);
101 101
102 102 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
103 103 static void vfs_setmntopt_nolock(mntopts_t *, const char *,
104 104 const char *, int, int);
105 105 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
106 106 static void vfs_freemnttab(struct vfs *);
107 107 static void vfs_freeopt(mntopt_t *);
108 108 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
109 109 static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
110 110 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
111 111 static void vfs_createopttbl_extend(mntopts_t *, const char *,
112 112 const mntopts_t *);
113 113 static char **vfs_copycancelopt_extend(char **const, int);
114 114 static void vfs_freecancelopt(char **);
115 115 static void getrootfs(char **, char **);
116 116 static int getmacpath(dev_info_t *, void *);
117 117 static void vfs_mnttabvp_setup(void);
118 118
119 119 struct ipmnt {
120 120 struct ipmnt *mip_next;
121 121 dev_t mip_dev;
122 122 struct vfs *mip_vfsp;
123 123 };
124 124
125 125 static kmutex_t vfs_miplist_mutex;
126 126 static struct ipmnt *vfs_miplist = NULL;
127 127 static struct ipmnt *vfs_miplist_end = NULL;
128 128
129 129 static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */
130 130
131 131 /*
132 132 * VFS global data.
133 133 */
134 134 vnode_t *rootdir; /* pointer to root inode vnode. */
135 135 vnode_t *devicesdir; /* pointer to inode of devices root */
136 136 vnode_t *devdir; /* pointer to inode of dev root */
137 137
138 138 char *server_rootpath; /* root path for diskless clients */
139 139 char *server_hostname; /* hostname of diskless server */
140 140
141 141 static struct vfs root;
142 142 static struct vfs devices;
143 143 static struct vfs dev;
144 144 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */
145 145 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */
146 146 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */
147 147 /* must be power of 2! */
148 148 timespec_t vfs_mnttab_ctime; /* mnttab created time */
149 149 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */
150 150 char *vfs_dummyfstype = "\0";
151 151 struct pollhead vfs_pollhd; /* for mnttab pollers */
152 152 struct vnode *vfs_mntdummyvp; /* to fake mnttab read/write for file events */
153 153 int mntfstype; /* will be set once mnt fs is mounted */
154 154
155 155 /*
156 156 * Table for generic options recognized in the VFS layer and acted
157 157 * on at this level before parsing file system specific options.
158 158 * The nosuid option is stronger than any of the devices and setuid
159 159 * options, so those are canceled when nosuid is seen.
160 160 *
161 161 * All options which are added here need to be added to the
162 162 * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
163 163 */
164 164 /*
165 165 * VFS Mount options table
166 166 */
167 167 static char *ro_cancel[] = { MNTOPT_RW, NULL };
168 168 static char *rw_cancel[] = { MNTOPT_RO, NULL };
169 169 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
170 170 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
171 171 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
172 172 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
173 173 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
174 174 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
175 175 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
176 176 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
177 177 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
178 178 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
179 179 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
180 180
181 181 static const mntopt_t mntopts[] = {
182 182 /*
183 183 * option name cancel options default arg flags
184 184 */
185 185 { MNTOPT_REMOUNT, NULL, NULL,
186 186 MO_NODISPLAY, (void *)0 },
187 187 { MNTOPT_RO, ro_cancel, NULL, 0,
188 188 (void *)0 },
189 189 { MNTOPT_RW, rw_cancel, NULL, 0,
190 190 (void *)0 },
191 191 { MNTOPT_SUID, suid_cancel, NULL, 0,
192 192 (void *)0 },
193 193 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0,
194 194 (void *)0 },
195 195 { MNTOPT_DEVICES, devices_cancel, NULL, 0,
196 196 (void *)0 },
197 197 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0,
198 198 (void *)0 },
199 199 { MNTOPT_SETUID, setuid_cancel, NULL, 0,
200 200 (void *)0 },
201 201 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0,
202 202 (void *)0 },
203 203 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0,
204 204 (void *)0 },
205 205 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0,
206 206 (void *)0 },
207 207 { MNTOPT_EXEC, exec_cancel, NULL, 0,
208 208 (void *)0 },
209 209 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0,
210 210 (void *)0 },
211 211 };
212 212
213 213 const mntopts_t vfs_mntopts = {
214 214 sizeof (mntopts) / sizeof (mntopt_t),
215 215 (mntopt_t *)&mntopts[0]
216 216 };
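/*
 * Illustrative sketch, not part of vfs.c: a filesystem that wants
 * filesystem-specific mount options declares a prototype table of the
 * same shape and publishes it as its option prototype (the vsw_optproto
 * consulted under VSW_HASPROTO in domount() below).  The "log"/"nolog"
 * names here are hypothetical; mntopt_t/mntopts_t come from <sys/vfs.h>,
 * already included above.
 */
static char *example_log_cancel[] = { "nolog", NULL };
static char *example_nolog_cancel[] = { "log", NULL };

static mntopt_t example_fs_options[] = {
/*	  option name	cancel options		default arg	flags	data */
	{ "log",	example_log_cancel,	NULL,		0,	(void *)0 },
	{ "nolog",	example_nolog_cancel,	NULL,		0,	(void *)0 },
};

static const mntopts_t example_fs_opttbl = {
	sizeof (example_fs_options) / sizeof (mntopt_t),
	&example_fs_options[0]
};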
217 217
218 218 /*
219 219 * File system operation dispatch functions.
220 220 */
221 221
222 222 int
223 223 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
224 224 {
225 225 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
226 226 }
227 227
228 228 int
229 229 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
230 230 {
231 231 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
232 232 }
233 233
234 234 int
235 235 fsop_root(vfs_t *vfsp, vnode_t **vpp)
236 236 {
237 237 refstr_t *mntpt;
238 238 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
239 239 /*
240 240 * Make sure this root has a path. With lofs, it is possible to have
241 241 * a NULL mountpoint.
242 242 */
243 243 if (ret == 0 && vfsp->vfs_mntpt != NULL &&
244 244 (*vpp)->v_path == vn_vpath_empty) {
245 245 const char *path;
246 246
247 247 mntpt = vfs_getmntpoint(vfsp);
248 248 path = refstr_value(mntpt);
249 249 vn_setpath_str(*vpp, path, strlen(path));
250 250 refstr_rele(mntpt);
251 251 }
252 252
253 253 return (ret);
254 254 }
255 255
256 256 int
257 257 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
258 258 {
259 259 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
260 260 }
261 261
262 262 int
263 263 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
264 264 {
265 265 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
266 266 }
267 267
268 268 int
269 269 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
270 270 {
271 271 /*
272 272 * In order to handle system attribute fids in a manner
273 273 * transparent to the underlying fs, we embed the fid for
274 274 * the sysattr parent object in the sysattr fid and tack on
275 275 * some extra bytes that only the sysattr layer knows about.
276 276 *
277 277 * This guarantees that sysattr fids are larger than other fids
278 278 * for this vfs. If the vfs supports the sysattr view interface
279 279 * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
280 280 * collision with XATTR_FIDSZ.
281 281 */
282 282 if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
283 283 fidp->fid_len == XATTR_FIDSZ)
284 284 return (xattr_dir_vget(vfsp, vpp, fidp));
285 285
286 286 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
287 287 }
288 288
289 289 int
290 290 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
291 291 {
292 292 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
293 293 }
294 294
295 295 void
296 296 fsop_freefs(vfs_t *vfsp)
297 297 {
298 298 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
299 299 }
300 300
301 301 int
302 302 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
303 303 {
304 304 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
305 305 }
306 306
307 307 int
308 308 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
309 309 {
310 310 ASSERT((fstype >= 0) && (fstype < nfstype));
311 311
312 312 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
313 313 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
314 314 else
315 315 return (ENOTSUP);
316 316 }
317 317
318 318 /*
319 319 * File system initialization. vfs_setfsops() must be called from a file
320 320 * system's init routine.
321 321 */
322 322
323 323 static int
324 324 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
325 325 int *unused_ops)
326 326 {
327 327 static const fs_operation_trans_def_t vfs_ops_table[] = {
328 328 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
329 329 fs_nosys, fs_nosys,
330 330
331 331 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
332 332 fs_nosys, fs_nosys,
333 333
334 334 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
335 335 fs_nosys, fs_nosys,
336 336
337 337 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
338 338 fs_nosys, fs_nosys,
339 339
340 340 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
341 341 (fs_generic_func_p) fs_sync,
342 342 (fs_generic_func_p) fs_sync, /* No errors allowed */
343 343
344 344 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
345 345 fs_nosys, fs_nosys,
346 346
347 347 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
348 348 fs_nosys, fs_nosys,
349 349
350 350 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
351 351 (fs_generic_func_p)fs_freevfs,
352 352 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */
353 353
354 354 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
355 355 (fs_generic_func_p)fs_nosys,
356 356 (fs_generic_func_p)fs_nosys,
357 357
358 358 NULL, 0, NULL, NULL
359 359 };
360 360
361 361 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
362 362 }
363 363
364 364 void
365 365 zfs_boot_init(void)
366 366 {
367 367 if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
368 368 spa_boot_init();
369 369 }
370 370
371 371 int
372 372 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
373 373 {
374 374 int error;
375 375 int unused_ops;
376 376
377 377 /*
378 378 * Verify that fstype refers to a valid fs. Note that
379 379 * 0 is valid since it's used to set "stray" ops.
380 380 */
381 381 if ((fstype < 0) || (fstype >= nfstype))
382 382 return (EINVAL);
383 383
384 384 if (!ALLOCATED_VFSSW(&vfssw[fstype]))
385 385 return (EINVAL);
386 386
387 387 /* Set up the operations vector. */
388 388
389 389 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
390 390
391 391 if (error != 0)
392 392 return (error);
393 393
394 394 vfssw[fstype].vsw_flag |= VSW_INSTALLED;
395 395
396 396 if (actual != NULL)
397 397 *actual = &vfssw[fstype].vsw_vfsops;
398 398
399 399 #if DEBUG
400 400 if (unused_ops != 0)
401 401 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
402 402 "but not used", vfssw[fstype].vsw_name, unused_ops);
403 403 #endif
404 404
405 405 return (0);
406 406 }
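/*
 * Illustrative sketch, not part of vfs.c: how a filesystem module's init
 * routine typically uses vfs_setfsops().  The example_* names and
 * prototypes are hypothetical; the template follows the designated-
 * initializer style used by in-tree filesystems.  Operations not named
 * in the template fall back to the defaults in vfs_ops_table above
 * (fs_nosys for most of them).
 */
extern int example_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
extern int example_unmount(vfs_t *, int, cred_t *);
extern int example_root(vfs_t *, vnode_t **);
extern int example_statvfs(vfs_t *, statvfs64_t *);

static int
example_fsinit(int fstype, char *name)
{
	static const fs_operation_def_t example_vfsops_template[] = {
		VFSNAME_MOUNT,		{ .vfs_mount = example_mount },
		VFSNAME_UNMOUNT,	{ .vfs_unmount = example_unmount },
		VFSNAME_ROOT,		{ .vfs_root = example_root },
		VFSNAME_STATVFS,	{ .vfs_statvfs = example_statvfs },
		NULL,			NULL
	};
	int error;

	/* Install the ops vector for this fstype slot in vfssw[]. */
	error = vfs_setfsops(fstype, example_vfsops_template, NULL);
	if (error != 0)
		cmn_err(CE_WARN, "example_fsinit: bad vfs ops template for %s",
		    name);
	return (error);
}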
407 407
408 408 int
409 409 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
410 410 {
411 411 int error;
412 412 int unused_ops;
413 413
414 414 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
415 415
416 416 error = fs_copyfsops(template, *actual, &unused_ops);
417 417 if (error != 0) {
418 418 kmem_free(*actual, sizeof (vfsops_t));
419 419 *actual = NULL;
420 420 return (error);
421 421 }
422 422
423 423 return (0);
424 424 }
425 425
426 426 /*
427 427 * Free a vfsops structure created as a result of vfs_makefsops().
428 428 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
429 429 * vfs_freevfsops_by_type().
430 430 */
431 431 void
432 432 vfs_freevfsops(vfsops_t *vfsops)
433 433 {
434 434 kmem_free(vfsops, sizeof (vfsops_t));
435 435 }
436 436
437 437 /*
438 438 * Since the vfsops structure is part of the vfssw table and wasn't
439 439 * really allocated, we're not really freeing anything. We keep
440 440 * the name for consistency with vfs_freevfsops(). We do, however,
441 441 * need to take care of a little bookkeeping.
442 442 * NOTE: For a vfsops structure created by vfs_setfsops(), use
443 443 * vfs_freevfsops_by_type().
444 444 */
445 445 int
446 446 vfs_freevfsops_by_type(int fstype)
447 447 {
448 448
449 449 /* Verify that fstype refers to a loaded fs (and not fsid 0). */
450 450 if ((fstype <= 0) || (fstype >= nfstype))
451 451 return (EINVAL);
452 452
453 453 WLOCK_VFSSW();
454 454 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
455 455 WUNLOCK_VFSSW();
456 456 return (EINVAL);
457 457 }
458 458
459 459 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
460 460 WUNLOCK_VFSSW();
461 461
462 462 return (0);
463 463 }
464 464
465 465 /* Support routines used to reference vfs_op */
466 466
467 467 /* Set the operations vector for a vfs */
468 468 void
469 469 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
470 470 {
471 471 vfsops_t *op;
472 472
473 473 ASSERT(vfsp != NULL);
474 474 ASSERT(vfsops != NULL);
475 475
476 476 op = vfsp->vfs_op;
477 477 membar_consumer();
478 478 if (vfsp->vfs_femhead == NULL &&
479 479 atomic_cas_ptr(&vfsp->vfs_op, op, vfsops) == op) {
480 480 return;
481 481 }
482 482 fsem_setvfsops(vfsp, vfsops);
483 483 }
484 484
485 485 /* Retrieve the operations vector for a vfs */
486 486 vfsops_t *
487 487 vfs_getops(vfs_t *vfsp)
488 488 {
489 489 vfsops_t *op;
490 490
491 491 ASSERT(vfsp != NULL);
492 492
493 493 op = vfsp->vfs_op;
494 494 membar_consumer();
495 495 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
496 496 return (op);
497 497 } else {
498 498 return (fsem_getvfsops(vfsp));
499 499 }
500 500 }
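/*
 * Illustrative userspace analogue, not part of vfs.c, of the optimistic
 * protocol used by vfs_setops()/vfs_getops() above: read the ops pointer
 * without a lock, then confirm that no FEM monitor (vfs_femhead) is
 * installed and that the pointer has not changed underneath us; on any
 * doubt, fall back to the slow, monitored path.  Names are hypothetical,
 * and C11 <stdatomic.h> stands in for the kernel's membar_consumer()/
 * atomic_cas_ptr() primitives.
 */
#include <stdatomic.h>
#include <stddef.h>

struct ops;					/* opaque operations vector */

struct object {
	_Atomic(struct ops *)	obj_ops;	/* current ops vector */
	_Atomic(void *)		obj_monitor;	/* non-NULL when interposed */
};

static struct ops *
monitored_getops(struct object *op)		/* stand-in for fsem_getvfsops() */
{
	return (atomic_load(&op->obj_ops));
}

static struct ops *
sketch_getops(struct object *op)
{
	struct ops *ops = atomic_load_explicit(&op->obj_ops,
	    memory_order_acquire);

	/* Fast path: no monitor, and the pointer we read is still current. */
	if (atomic_load(&op->obj_monitor) == NULL &&
	    ops == atomic_load_explicit(&op->obj_ops, memory_order_acquire))
		return (ops);

	return (monitored_getops(op));
}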
501 501
502 502 /*
503 503 * Returns non-zero (1) if the vfsops matches that of the vfs.
504 504 * Returns zero (0) if not.
505 505 */
506 506 int
507 507 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
508 508 {
509 509 return (vfs_getops(vfsp) == vfsops);
510 510 }
511 511
512 512 /*
513 513 * Returns non-zero (1) if the file system has installed a non-default,
514 514 * non-error vfs_sync routine. Returns zero (0) otherwise.
515 515 */
516 516 int
517 517 vfs_can_sync(vfs_t *vfsp)
518 518 {
519 519 /* vfs_sync() routine is not the default/error function */
520 520 return (vfs_getops(vfsp)->vfs_sync != fs_sync);
521 521 }
522 522
523 523 /*
524 524 * Initialize a vfs structure.
525 525 */
526 526 void
527 527 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
528 528 {
529 529 /* Other initialization has been moved to vfs_alloc() */
530 530 vfsp->vfs_count = 0;
531 531 vfsp->vfs_next = vfsp;
532 532 vfsp->vfs_prev = vfsp;
533 533 vfsp->vfs_zone_next = vfsp;
534 534 vfsp->vfs_zone_prev = vfsp;
535 535 vfsp->vfs_lofi_id = 0;
536 536 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
537 537 vfsimpl_setup(vfsp);
538 538 vfsp->vfs_data = (data);
539 539 vfs_setops((vfsp), (op));
540 540 }
541 541
542 542 /*
543 543 * Allocate and initialize the vfs implementation private data
544 544 * structure, vfs_impl_t.
545 545 */
546 546 void
547 547 vfsimpl_setup(vfs_t *vfsp)
548 548 {
549 549 int i;
550 550
551 551 if (vfsp->vfs_implp != NULL) {
552 552 return;
553 553 }
554 554
555 555 vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
556 556 /* Note that these are #define'd in vfs.h */
557 557 vfsp->vfs_vskap = NULL;
558 558 vfsp->vfs_fstypevsp = NULL;
559 559
560 560 /* Set size of counted array, then zero the array */
561 561 vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
562 562 for (i = 1; i < VFS_FEATURE_MAXSZ; i++) {
563 563 vfsp->vfs_featureset[i] = 0;
564 564 }
565 565 }
566 566
567 567 /*
568 568 * Release the vfs_impl_t structure, if it exists. Some unbundled
569 569 * filesystems may not use the newer version of vfs and thus
570 570 * would not contain this implementation private data structure.
571 571 */
572 572 void
573 573 vfsimpl_teardown(vfs_t *vfsp)
574 574 {
575 575 vfs_impl_t *vip = vfsp->vfs_implp;
576 576
577 577 if (vip == NULL)
578 578 return;
579 579
580 580 kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
581 581 vfsp->vfs_implp = NULL;
582 582 }
583 583
584 584 /*
585 585 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
586 586 * fstatvfs, and sysfs moved to common/syscall.
587 587 */
588 588
589 589 /*
590 590 * Update every mounted file system. We call the vfs_sync operation of
591 591 * each file system type, passing it a NULL vfsp to indicate that all
592 592 * mounted file systems of that type should be updated.
593 593 */
594 594 void
595 595 vfs_sync(int flag)
596 596 {
597 597 struct vfssw *vswp;
598 598 RLOCK_VFSSW();
599 599 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
600 600 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
601 601 vfs_refvfssw(vswp);
602 602 RUNLOCK_VFSSW();
603 603 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
604 604 CRED());
605 605 vfs_unrefvfssw(vswp);
606 606 RLOCK_VFSSW();
607 607 }
608 608 }
609 609 RUNLOCK_VFSSW();
610 610 }
611 611
612 612 void
613 613 sync(void)
614 614 {
615 615 vfs_sync(0);
616 616 }
617 617
618 618 /*
619 619 * External routines.
620 620 */
621 621
622 622 krwlock_t vfssw_lock; /* lock accesses to vfssw */
623 623
624 624 /*
625 625 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(),
626 626 * but otherwise should be accessed only via vfs_list_lock() and
627 627 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list.
628 628 */
629 629 static krwlock_t vfslist;
630 630
631 631 /*
632 632 * Mount devfs on /devices. This is done right after root is mounted
633 633 * to provide device access support for the system
634 634 */
635 635 static void
636 636 vfs_mountdevices(void)
637 637 {
638 638 struct vfssw *vsw;
639 639 struct vnode *mvp;
640 640 struct mounta mounta = { /* fake mounta for devfs_mount() */
641 641 NULL,
642 642 NULL,
643 643 MS_SYSSPACE,
644 644 NULL,
645 645 NULL,
646 646 0,
647 647 NULL,
648 648 0
649 649 };
650 650
651 651 /*
652 652 * _init devfs module to fill in the vfssw
653 653 */
654 654 if (modload("fs", "devfs") == -1)
655 655 panic("Cannot _init devfs module");
656 656
657 657 /*
658 658 * Hold vfs
659 659 */
660 660 RLOCK_VFSSW();
661 661 vsw = vfs_getvfsswbyname("devfs");
662 662 VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
663 663 VFS_HOLD(&devices);
664 664
665 665 /*
666 666 * Locate mount point
667 667 */
668 668 if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
669 669 panic("Cannot find /devices");
670 670
671 671 /*
672 672 * Perform the mount of /devices
673 673 */
674 674 if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
675 675 panic("Cannot mount /devices");
676 676
677 677 RUNLOCK_VFSSW();
678 678
679 679 /*
680 680 * Set appropriate members and add to vfs list for mnttab display
681 681 */
682 682 vfs_setresource(&devices, "/devices", 0);
683 683 vfs_setmntpoint(&devices, "/devices", 0);
684 684
685 685 /*
686 686 * Hold the root of /devices so it won't go away
687 687 */
688 688 if (VFS_ROOT(&devices, &devicesdir))
689 689 panic("vfs_mountdevices: not devices root");
690 690
691 691 if (vfs_lock(&devices) != 0) {
692 692 VN_RELE(devicesdir);
693 693 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
694 694 return;
695 695 }
696 696
697 697 if (vn_vfswlock(mvp) != 0) {
698 698 vfs_unlock(&devices);
699 699 VN_RELE(devicesdir);
700 700 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
701 701 return;
702 702 }
703 703
704 704 vfs_add(mvp, &devices, 0);
705 705 vn_vfsunlock(mvp);
706 706 vfs_unlock(&devices);
707 707 VN_RELE(devicesdir);
708 708 }
709 709
710 710 /*
711 711 * mount the first instance of /dev to root and remain mounted
712 712 */
713 713 static void
714 714 vfs_mountdev1(void)
715 715 {
716 716 struct vfssw *vsw;
717 717 struct vnode *mvp;
718 718 struct mounta mounta = { /* fake mounta for sdev_mount() */
719 719 NULL,
720 720 NULL,
721 721 MS_SYSSPACE | MS_OVERLAY,
722 722 NULL,
723 723 NULL,
724 724 0,
725 725 NULL,
726 726 0
727 727 };
728 728
729 729 /*
730 730 * _init dev module to fill in the vfssw
731 731 */
732 732 if (modload("fs", "dev") == -1)
733 733 cmn_err(CE_PANIC, "Cannot _init dev module\n");
734 734
735 735 /*
736 736 * Hold vfs
737 737 */
738 738 RLOCK_VFSSW();
739 739 vsw = vfs_getvfsswbyname("dev");
740 740 VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
741 741 VFS_HOLD(&dev);
742 742
743 743 /*
744 744 * Locate mount point
745 745 */
746 746 if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
747 747 cmn_err(CE_PANIC, "Cannot find /dev\n");
748 748
749 749 /*
750 750 * Perform the mount of /dev
751 751 */
752 752 if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
753 753 cmn_err(CE_PANIC, "Cannot mount /dev 1\n");
754 754
755 755 RUNLOCK_VFSSW();
756 756
757 757 /*
758 758 * Set appropriate members and add to vfs list for mnttab display
759 759 */
760 760 vfs_setresource(&dev, "/dev", 0);
761 761 vfs_setmntpoint(&dev, "/dev", 0);
762 762
763 763 /*
764 764 * Hold the root of /dev so it won't go away
765 765 */
766 766 if (VFS_ROOT(&dev, &devdir))
767 767 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");
768 768
769 769 if (vfs_lock(&dev) != 0) {
770 770 VN_RELE(devdir);
771 771 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
772 772 return;
773 773 }
774 774
775 775 if (vn_vfswlock(mvp) != 0) {
776 776 vfs_unlock(&dev);
777 777 VN_RELE(devdir);
778 778 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
779 779 return;
780 780 }
781 781
782 782 vfs_add(mvp, &dev, 0);
783 783 vn_vfsunlock(mvp);
784 784 vfs_unlock(&dev);
785 785 VN_RELE(devdir);
786 786 }
787 787
788 788 /*
789 789 * Mount a required filesystem. This is done right after root is mounted.
790 790 */
791 791 static void
792 792 vfs_mountfs(char *module, char *spec, char *path)
793 793 {
794 794 struct vnode *mvp;
795 795 struct mounta mounta;
796 796 vfs_t *vfsp;
797 797
798 798 bzero(&mounta, sizeof (mounta));
799 799 mounta.flags = MS_SYSSPACE | MS_DATA;
800 800 mounta.fstype = module;
801 801 mounta.spec = spec;
802 802 mounta.dir = path;
803 803 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
804 804 cmn_err(CE_WARN, "Cannot find %s", path);
805 805 return;
806 806 }
807 807 if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
808 808 cmn_err(CE_WARN, "Cannot mount %s", path);
809 809 else
810 810 VFS_RELE(vfsp);
811 811 VN_RELE(mvp);
812 812 }
813 813
814 814 /*
815 815 * vfs_mountroot is called by main() to mount the root filesystem.
816 816 */
817 817 void
818 818 vfs_mountroot(void)
819 819 {
820 820 struct vnode *rvp = NULL;
821 821 char *path;
822 822 size_t plen;
823 823 struct vfssw *vswp;
824 824 proc_t *p;
825 825
826 826 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
827 827 rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
828 828
829 829 /*
830 830 * Alloc the vfs hash bucket array and locks
831 831 */
832 832 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
833 833
834 834 /*
835 835 * Call machine-dependent routine "rootconf" to choose a root
836 836 * file system type.
837 837 */
838 838 if (rootconf())
839 839 panic("vfs_mountroot: cannot mount root");
840 840 /*
841 841 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir
842 842 * to point to it. These are used by lookuppn() so that it
843 843 * knows where to start from ('/' or '.').
844 844 */
845 845 vfs_setmntpoint(rootvfs, "/", 0);
846 846 if (VFS_ROOT(rootvfs, &rootdir))
847 847 panic("vfs_mountroot: no root vnode");
848 848
849 849 /*
850 850 * At this point, the process tree consists of p0 and possibly some
851 851 * direct children of p0. (i.e. there are no grandchildren)
852 852 *
853 853 * Walk through them all, setting their current directory.
854 854 */
855 855 mutex_enter(&pidlock);
856 856 for (p = practive; p != NULL; p = p->p_next) {
857 857 ASSERT(p == &p0 || p->p_parent == &p0);
858 858
859 859 PTOU(p)->u_cdir = rootdir;
860 860 VN_HOLD(PTOU(p)->u_cdir);
861 861 PTOU(p)->u_rdir = NULL;
862 862 }
863 863 mutex_exit(&pidlock);
864 864
865 865 /*
866 866 * Setup the global zone's rootvp, now that it exists.
867 867 */
868 868 global_zone->zone_rootvp = rootdir;
869 869 VN_HOLD(global_zone->zone_rootvp);
870 870
871 871 /*
872 872 * Notify the module code that it can begin using the
873 873 * root filesystem instead of the boot program's services.
874 874 */
875 875 modrootloaded = 1;
876 876
877 877 /*
878 878 * Special handling for a ZFS root file system.
879 879 */
880 880 zfs_boot_init();
881 881
882 882 /*
883 883 * Set up mnttab information for root
884 884 */
885 885 vfs_setresource(rootvfs, rootfs.bo_name, 0);
886 886
887 887 /*
888 888 * Notify cluster software that the root filesystem is available.
889 889 */
890 890 clboot_mountroot();
891 891
892 892 /* Now that we're all done with the root FS, set up its vopstats */
893 893 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
894 894 /* Set flag for statistics collection */
895 895 if (vswp->vsw_flag & VSW_STATS) {
896 896 initialize_vopstats(&rootvfs->vfs_vopstats);
897 897 rootvfs->vfs_flag |= VFS_STATS;
898 898 rootvfs->vfs_fstypevsp =
899 899 get_fstype_vopstats(rootvfs, vswp);
900 900 rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
901 901 }
902 902 vfs_unrefvfssw(vswp);
903 903 }
904 904
905 905 /*
906 906 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
907 907 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
908 908 */
909 909 vfs_mountdevices();
910 910 vfs_mountdev1();
911 911
912 912 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
913 913 vfs_mountfs("proc", "/proc", "/proc");
914 914 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
915 915 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
916 916 vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
917 917 vfs_mountfs("bootfs", "bootfs", "/system/boot");
918 918
919 919 if (getzoneid() == GLOBAL_ZONEID) {
920 920 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
921 921 }
922 922
923 923 if (strcmp(rootfs.bo_fstype, "zfs") != 0) {
924 924 /*
925 925 * Look up the root device via devfs so that a dv_node is
926 926 * created for it. The vnode is never VN_RELE()ed.
927 927 * We allocate more than MAXPATHLEN so that the
928 928 * buffer passed to i_ddi_prompath_to_devfspath() is
929 929 * exactly MAXPATHLEN (the function expects a buffer
930 930 * of that length).
931 931 */
932 932 plen = strlen("/devices");
933 933 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
934 934 (void) strcpy(path, "/devices");
935 935
936 936 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
937 937 != DDI_SUCCESS ||
938 938 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
939 939
940 940 /* NUL terminate in case "path" has garbage */
941 941 path[plen + MAXPATHLEN - 1] = '\0';
942 942 #ifdef DEBUG
943 943 cmn_err(CE_WARN, "!Cannot lookup root device: %s",
944 944 path);
945 945 #endif
946 946 }
947 947 kmem_free(path, plen + MAXPATHLEN);
948 948 }
949 949
950 950 vfs_mnttabvp_setup();
951 951 }
952 952
953 953 /*
954 954 * Check to see if our "block device" is actually a file. If so,
955 955 * automatically add a lofi device, and keep track of this fact.
956 956 */
957 957 static int
958 958 lofi_add(const char *fsname, struct vfs *vfsp,
959 959 mntopts_t *mntopts, struct mounta *uap)
960 960 {
961 961 int fromspace = (uap->flags & MS_SYSSPACE) ?
962 962 UIO_SYSSPACE : UIO_USERSPACE;
963 963 struct lofi_ioctl *li = NULL;
964 964 struct vnode *vp = NULL;
965 965 struct pathname pn = { NULL };
966 966 ldi_ident_t ldi_id;
967 967 ldi_handle_t ldi_hdl;
968 968 vfssw_t *vfssw;
969 969 int id;
970 970 int err = 0;
971 971
972 972 if ((vfssw = vfs_getvfssw(fsname)) == NULL)
973 973 return (0);
974 974
975 975 if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
976 976 vfs_unrefvfssw(vfssw);
977 977 return (0);
978 978 }
979 979
980 980 vfs_unrefvfssw(vfssw);
981 981 vfssw = NULL;
982 982
983 983 if (pn_get(uap->spec, fromspace, &pn) != 0)
984 984 return (0);
985 985
986 986 if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
987 987 goto out;
988 988
989 989 if (vp->v_type != VREG)
990 990 goto out;
991 991
992 992 /* OK, this is a lofi mount. */
993 993
994 994 if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
995 995 vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
996 996 vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
997 997 vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
998 998 err = EINVAL;
999 999 goto out;
1000 1000 }
1001 1001
1002 1002 ldi_id = ldi_ident_from_anon();
1003 1003 li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1004 1004 (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);
1005 1005
1006 1006 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1007 1007 &ldi_hdl, ldi_id);
1008 1008
1009 1009 if (err)
1010 1010 goto out2;
1011 1011
1012 1012 err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
1013 1013 FREAD | FWRITE | FKIOCTL, kcred, &id);
1014 1014
1015 1015 (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1016 1016
1017 1017 if (!err)
1018 1018 vfsp->vfs_lofi_id = id;
1019 1019
1020 1020 out2:
1021 1021 ldi_ident_release(ldi_id);
1022 1022 out:
1023 1023 if (li != NULL)
1024 1024 kmem_free(li, sizeof (*li));
1025 1025 if (vp != NULL)
1026 1026 VN_RELE(vp);
1027 1027 pn_free(&pn);
1028 1028 return (err);
1029 1029 }
1030 1030
1031 1031 static void
1032 1032 lofi_remove(struct vfs *vfsp)
1033 1033 {
1034 1034 struct lofi_ioctl *li = NULL;
1035 1035 ldi_ident_t ldi_id;
1036 1036 ldi_handle_t ldi_hdl;
1037 1037 int err;
1038 1038
1039 1039 if (vfsp->vfs_lofi_id == 0)
1040 1040 return;
1041 1041
1042 1042 ldi_id = ldi_ident_from_anon();
1043 1043
1044 1044 li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1045 1045 li->li_id = vfsp->vfs_lofi_id;
1046 1046 li->li_cleanup = B_TRUE;
1047 1047
1048 1048 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1049 1049 &ldi_hdl, ldi_id);
1050 1050
1051 1051 if (err)
1052 1052 goto out;
1053 1053
1054 1054 err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
1055 1055 FREAD | FWRITE | FKIOCTL, kcred, NULL);
1056 1056
1057 1057 (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1058 1058
1059 1059 if (!err)
1060 1060 vfsp->vfs_lofi_id = 0;
1061 1061
1062 1062 out:
1063 1063 ldi_ident_release(ldi_id);
1064 1064 if (li != NULL)
1065 1065 kmem_free(li, sizeof (*li));
1066 1066 }
1067 1067
1068 1068 /*
1069 1069 * Common mount code. Called from the system call entry point, from autofs,
1070 1070 * nfsv4 trigger mounts, and from pxfs.
1071 1071 *
1072 1072 * Takes the effective file system type, mount arguments, the mount point
1073 1073 * vnode, flags specifying whether the mount is a remount and whether it
1074 1074 * should be entered into the vfs list, and credentials. Fills in its vfspp
1075 1075 * parameter with the mounted file system instance's vfs.
1076 1076 *
1077 1077 * Note that the effective file system type is specified as a string. It may
1078 1078 * be null, in which case it's determined from the mount arguments, and may
1079 1079 * differ from the type specified in the mount arguments; this is a hook to
1080 1080 * allow interposition when instantiating file system instances.
1081 1081 *
1082 1082 * The caller is responsible for releasing its own hold on the mount point
1083 1083 * vp (this routine does its own hold when necessary).
1084 1084 * Also note that for remounts, the mount point vp should be the vnode for
1085 1085 * the root of the file system rather than the vnode that the file system
1086 1086 * is mounted on top of.
1087 1087 */
1088 1088 int
1089 1089 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1090 1090 struct vfs **vfspp)
1091 1091 {
1092 1092 struct vfssw *vswp;
1093 1093 vfsops_t *vfsops;
1094 1094 struct vfs *vfsp;
1095 1095 struct vnode *bvp;
1096 1096 dev_t bdev = 0;
1097 1097 mntopts_t mnt_mntopts;
1098 1098 int error = 0;
1099 1099 int copyout_error = 0;
1100 1100 int ovflags;
1101 1101 char *opts = uap->optptr;
1102 1102 char *inargs = opts;
1103 1103 int optlen = uap->optlen;
1104 1104 int remount;
1105 1105 int rdonly;
1106 1106 int nbmand = 0;
1107 1107 int delmip = 0;
1108 1108 int addmip = 0;
1109 1109 int splice = ((uap->flags & MS_NOSPLICE) == 0);
1110 1110 int fromspace = (uap->flags & MS_SYSSPACE) ?
1111 1111 UIO_SYSSPACE : UIO_USERSPACE;
1112 1112 char *resource = NULL, *mountpt = NULL;
1113 1113 refstr_t *oldresource, *oldmntpt;
1114 1114 struct pathname pn, rpn;
1115 1115 vsk_anchor_t *vskap;
1116 1116 char fstname[FSTYPSZ];
1117 1117 zone_t *zone;
1118 1118
1119 1119 /*
1120 1120 * The v_flag value for the mount point vp is permanently set
1121 1121 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1122 1122 * for mount point locking.
1123 1123 */
1124 1124 mutex_enter(&vp->v_lock);
1125 1125 vp->v_flag |= VVFSLOCK;
1126 1126 mutex_exit(&vp->v_lock);
1127 1127
1128 1128 mnt_mntopts.mo_count = 0;
1129 1129 /*
1130 1130 * Find the ops vector to use to invoke the file system-specific mount
1131 1131 * method. If the fsname argument is non-NULL, use it directly.
1132 1132 * Otherwise, dig the file system type information out of the mount
1133 1133 * arguments.
1134 1134 *
1135 1135 * A side effect is to hold the vfssw entry.
1136 1136 *
1137 1137 * Mount arguments can be specified in several ways, which are
1138 1138 * distinguished by flag bit settings. The preferred way is to set
1139 1139 * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1140 1140 * type supplied as a character string and the last two arguments
1141 1141 * being a pointer to a character buffer and the size of the buffer.
1142 1142 * On entry, the buffer holds a null terminated list of options; on
1143 1143 * return, the string is the list of options the file system
1144 1144 * recognized. If MS_DATA is set arguments five and six point to a
1145 1145 * block of binary data which the file system interprets.
1146 1146 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1147 1147 * consistently with these conventions. To handle them, we check to
1148 1148 * see whether the pointer to the file system name has a numeric value
1149 1149 * less than 256. If so, we treat it as an index.
1150 1150 */
1151 1151 if (fsname != NULL) {
1152 1152 if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1153 1153 return (EINVAL);
1154 1154 }
1155 1155 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1156 1156 size_t n;
1157 1157 uint_t fstype;
1158 1158
1159 1159 fsname = fstname;
1160 1160
1161 1161 if ((fstype = (uintptr_t)uap->fstype) < 256) {
1162 1162 RLOCK_VFSSW();
1163 1163 if (fstype == 0 || fstype >= nfstype ||
1164 1164 !ALLOCATED_VFSSW(&vfssw[fstype])) {
1165 1165 RUNLOCK_VFSSW();
1166 1166 return (EINVAL);
1167 1167 }
1168 1168 (void) strcpy(fsname, vfssw[fstype].vsw_name);
1169 1169 RUNLOCK_VFSSW();
1170 1170 if ((vswp = vfs_getvfssw(fsname)) == NULL)
1171 1171 return (EINVAL);
1172 1172 } else {
1173 1173 /*
1174 1174 * Handle either kernel or user address space.
1175 1175 */
1176 1176 if (uap->flags & MS_SYSSPACE) {
1177 1177 error = copystr(uap->fstype, fsname,
1178 1178 FSTYPSZ, &n);
1179 1179 } else {
1180 1180 error = copyinstr(uap->fstype, fsname,
1181 1181 FSTYPSZ, &n);
1182 1182 }
1183 1183 if (error) {
1184 1184 if (error == ENAMETOOLONG)
1185 1185 return (EINVAL);
1186 1186 return (error);
1187 1187 }
1188 1188 if ((vswp = vfs_getvfssw(fsname)) == NULL)
1189 1189 return (EINVAL);
1190 1190 }
1191 1191 } else {
1192 1192 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1193 1193 return (EINVAL);
1194 1194 fsname = vswp->vsw_name;
1195 1195 }
1196 1196 if (!VFS_INSTALLED(vswp))
1197 1197 return (EINVAL);
1198 1198
1199 1199 if ((error = secpolicy_fs_allowed_mount(fsname)) != 0) {
1200 1200 vfs_unrefvfssw(vswp);
1201 1201 return (error);
1202 1202 }
1203 1203
1204 1204 vfsops = &vswp->vsw_vfsops;
1205 1205
1206 1206 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1207 1207 /*
1208 1208 * Fetch mount options and parse them for generic vfs options
1209 1209 */
1210 1210 if (uap->flags & MS_OPTIONSTR) {
1211 1211 /*
1212 1212 * Limit the buffer size
1213 1213 */
1214 1214 if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1215 1215 error = EINVAL;
1216 1216 goto errout;
1217 1217 }
1218 1218 if ((uap->flags & MS_SYSSPACE) == 0) {
1219 1219 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1220 1220 inargs[0] = '\0';
1221 1221 if (optlen) {
1222 1222 error = copyinstr(opts, inargs, (size_t)optlen,
1223 1223 NULL);
1224 1224 if (error) {
1225 1225 goto errout;
1226 1226 }
1227 1227 }
1228 1228 }
1229 1229 vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1230 1230 }
1231 1231 /*
1232 1232 * Flag bits override the options string.
1233 1233 */
1234 1234 if (uap->flags & MS_REMOUNT)
1235 1235 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1236 1236 if (uap->flags & MS_RDONLY)
1237 1237 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1238 1238 if (uap->flags & MS_NOSUID)
1239 1239 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1240 1240
1241 1241 /*
1242 1242 * Check if this is a remount; must be set in the option string and
1243 1243 * the file system must support a remount option.
1244 1244 */
1245 1245 if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1246 1246 MNTOPT_REMOUNT, NULL)) {
1247 1247 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1248 1248 error = ENOTSUP;
1249 1249 goto errout;
1250 1250 }
1251 1251 uap->flags |= MS_REMOUNT;
1252 1252 }
1253 1253
1254 1254 /*
1255 1255 * uap->flags and vfs_optionisset() should agree.
1256 1256 */
1257 1257 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1258 1258 uap->flags |= MS_RDONLY;
1259 1259 }
1260 1260 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1261 1261 uap->flags |= MS_NOSUID;
1262 1262 }
1263 1263 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1264 1264 ASSERT(splice || !remount);
1265 1265 /*
1266 1266 * If we are splicing the fs into the namespace,
1267 1267 * perform mount point checks.
1268 1268 *
1269 1269 * We want to resolve the path for the mount point to eliminate
1270 1270 * '.' and ".." and symlinks in mount points; we can't do the
1271 1271 * same for the resource string, since it would turn
1272 1272 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do
1273 1273 * this before grabbing vn_vfswlock(), because otherwise we
1274 1274 * would deadlock with lookuppn().
1275 1275 */
1276 1276 if (splice) {
1277 1277 ASSERT(vp->v_count > 0);
1278 1278
1279 1279 /*
1280 1280 * Pick up mount point and device from appropriate space.
1281 1281 */
1282 1282 if (pn_get(uap->spec, fromspace, &pn) == 0) {
1283 1283 resource = kmem_alloc(pn.pn_pathlen + 1,
1284 1284 KM_SLEEP);
1285 1285 (void) strcpy(resource, pn.pn_path);
1286 1286 pn_free(&pn);
1287 1287 }
1288 1288 /*
1289 1289 * Do a lookupname prior to taking the
1290 1290 * writelock. Mark this as completed if
1291 1291 * successful for later cleanup and addition to
1292 1292 * the mount in progress table.
1293 1293 */
1294 1294 if ((vswp->vsw_flag & VSW_MOUNTDEV) &&
1295 1295 (uap->flags & MS_GLOBAL) == 0 &&
1296 1296 lookupname(uap->spec, fromspace,
1297 1297 FOLLOW, NULL, &bvp) == 0) {
1298 1298 addmip = 1;
1299 1299 }
1300 1300
1301 1301 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1302 1302 pathname_t *pnp;
1303 1303
1304 1304 if (*pn.pn_path != '/') {
1305 1305 error = EINVAL;
1306 1306 pn_free(&pn);
1307 1307 goto errout;
1308 1308 }
1309 1309 pn_alloc(&rpn);
1310 1310 /*
1311 1311 * Kludge to prevent autofs from deadlocking with
1312 1312 * itself when it calls domount().
1313 1313 *
1314 1314 * If autofs is calling, it is because it is doing
1315 1315 * (autofs) mounts in the process of an NFS mount. A
1316 1316 * lookuppn() here would cause us to block waiting for
1317 1317 * said NFS mount to complete, which can't since this
1318 1318 * is the thread that was supposed to doing it.
1319 1319 * is the thread that was supposed to be doing it.
1320 1320 if (fromspace == UIO_USERSPACE) {
1321 1321 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1322 1322 NULL)) == 0) {
1323 1323 pnp = &rpn;
1324 1324 } else {
1325 1325 /*
1326 1326 * The file disappeared or otherwise
1327 1327 * became inaccessible since we opened
1328 1328 * it; might as well fail the mount
1329 1329 * since the mount point is no longer
1330 1330 * accessible.
1331 1331 */
1332 1332 pn_free(&rpn);
1333 1333 pn_free(&pn);
1334 1334 goto errout;
1335 1335 }
1336 1336 } else {
1337 1337 pnp = &pn;
1338 1338 }
1339 1339 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1340 1340 (void) strcpy(mountpt, pnp->pn_path);
1341 1341
1342 1342 /*
1343 1343 * If the addition of the zone's rootpath
1344 1344 * would push us over a total path length
1345 1345 * of MAXPATHLEN, we fail the mount with
1346 1346 * ENAMETOOLONG, which is what we would have
1347 1347 * gotten if we were trying to perform the same
1348 1348 * mount in the global zone.
1349 1349 *
1350 1350 * strlen() doesn't count the trailing
1351 1351 * '\0', but zone_rootpathlen counts both a
1352 1352 * trailing '/' and the terminating '\0'.
1353 1353 */
1354 1354 if ((curproc->p_zone->zone_rootpathlen - 1 +
1355 1355 strlen(mountpt)) > MAXPATHLEN ||
1356 1356 (resource != NULL &&
1357 1357 (curproc->p_zone->zone_rootpathlen - 1 +
1358 1358 strlen(resource)) > MAXPATHLEN)) {
1359 1359 error = ENAMETOOLONG;
1360 1360 }
1361 1361
1362 1362 pn_free(&rpn);
1363 1363 pn_free(&pn);
1364 1364 }
1365 1365
1366 1366 if (error)
1367 1367 goto errout;
1368 1368
1369 1369 /*
1370 1370 * Prevent path name resolution from proceeding past
1371 1371 * the mount point.
1372 1372 */
1373 1373 if (vn_vfswlock(vp) != 0) {
1374 1374 error = EBUSY;
1375 1375 goto errout;
1376 1376 }
1377 1377
1378 1378 /*
1379 1379 * Verify that it's legitimate to establish a mount on
1380 1380 * the prospective mount point.
1381 1381 */
1382 1382 if (vn_mountedvfs(vp) != NULL) {
1383 1383 /*
1384 1384 * The mount point lock was obtained after some
1385 1385 * other thread raced through and established a mount.
1386 1386 */
1387 1387 vn_vfsunlock(vp);
1388 1388 error = EBUSY;
1389 1389 goto errout;
1390 1390 }
1391 1391 if (vp->v_flag & VNOMOUNT) {
1392 1392 vn_vfsunlock(vp);
1393 1393 error = EINVAL;
1394 1394 goto errout;
1395 1395 }
1396 1396 }
1397 1397 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1398 1398 uap->dataptr = NULL;
1399 1399 uap->datalen = 0;
1400 1400 }
1401 1401
1402 1402 /*
1403 1403 * If this is a remount, we don't want to create a new VFS.
1404 1404 * Instead, we pass the existing one with a remount flag.
1405 1405 */
1406 1406 if (remount) {
1407 1407 /*
1408 1408 * Confirm that the mount point is the root vnode of the
1409 1409 * file system that is being remounted.
1410 1410 * This can happen if the user specifies a different
1411 1411 * mount point directory pathname in the (re)mount command.
1412 1412 *
1413 1413 * Code below can only be reached if splice is true, so it's
1414 1414 * safe to do vn_vfsunlock() here.
1415 1415 */
1416 1416 if ((vp->v_flag & VROOT) == 0) {
1417 1417 vn_vfsunlock(vp);
1418 1418 error = ENOENT;
1419 1419 goto errout;
1420 1420 }
1421 1421 /*
1422 1422 * Disallow making file systems read-only unless file system
1423 1423 * explicitly allows it in its vfssw. Ignore other flags.
1424 1424 */
1425 1425 if (rdonly && vn_is_readonly(vp) == 0 &&
1426 1426 (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1427 1427 vn_vfsunlock(vp);
1428 1428 error = EINVAL;
1429 1429 goto errout;
1430 1430 }
1431 1431 /*
1432 1432 * Disallow changing the NBMAND disposition of the file
1433 1433 * system on remounts.
1434 1434 */
1435 1435 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1436 1436 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1437 1437 vn_vfsunlock(vp);
1438 1438 error = EINVAL;
1439 1439 goto errout;
1440 1440 }
1441 1441 vfsp = vp->v_vfsp;
1442 1442 ovflags = vfsp->vfs_flag;
1443 1443 vfsp->vfs_flag |= VFS_REMOUNT;
1444 1444 vfsp->vfs_flag &= ~VFS_RDONLY;
1445 1445 } else {
1446 1446 vfsp = vfs_alloc(KM_SLEEP);
1447 1447 VFS_INIT(vfsp, vfsops, NULL);
1448 1448 }
1449 1449
1450 1450 VFS_HOLD(vfsp);
1451 1451
1452 1452 if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1453 1453 if (!remount) {
1454 1454 if (splice)
1455 1455 vn_vfsunlock(vp);
1456 1456 vfs_free(vfsp);
1457 1457 } else {
1458 1458 vn_vfsunlock(vp);
1459 1459 VFS_RELE(vfsp);
1460 1460 }
1461 1461 goto errout;
1462 1462 }
1463 1463
1464 1464 /*
1465 1465 * PRIV_SYS_MOUNT doesn't mean you can become root.
1466 1466 */
1467 1467 if (vfsp->vfs_lofi_id != 0) {
1468 1468 uap->flags |= MS_NOSUID;
1469 1469 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1470 1470 }
1471 1471
1472 1472 /*
1473 1473 * The vfs_reflock is not used anymore; the code below explicitly
1474 1474 * holds it, preventing others from accessing it directly.
1475 1475 */
1476 1476 if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1477 1477 !(vfsp->vfs_flag & VFS_REMOUNT))
1478 1478 cmn_err(CE_WARN,
1479 1479 "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1480 1480
1481 1481 /*
1482 1482 * Lock the vfs. If this is a remount we want to avoid spurious umount
1483 1483 * failures that happen as a side-effect of fsflush() and other mount
1484 1484 * and unmount operations that might be going on simultaneously and
1485 1485 * may have locked the vfs currently. To not return EBUSY immediately
1486 1486 * here we use vfs_lock_wait() instead of vfs_lock() for the remount case.
1487 1487 */
1488 1488 if (!remount) {
1489 1489 if (error = vfs_lock(vfsp)) {
1490 1490 vfsp->vfs_flag = ovflags;
1491 1491
1492 1492 lofi_remove(vfsp);
1493 1493
1494 1494 if (splice)
1495 1495 vn_vfsunlock(vp);
1496 1496 vfs_free(vfsp);
1497 1497 goto errout;
1498 1498 }
1499 1499 } else {
1500 1500 vfs_lock_wait(vfsp);
1501 1501 }
1502 1502
1503 1503 /*
1504 1504 * Add device to mount in progress table; global mounts require special
1505 1505 * handling. It is possible that we have already done the lookupname
1506 1506 * on a spliced, non-global fs. If so, we don't want to do it again
1507 1507 * since we cannot do a lookupname after taking the
1508 1508 * wlock above. This case is for a non-spliced, non-global filesystem.
1509 1509 */
1510 1510 if (!addmip) {
1511 1511 if ((vswp->vsw_flag & VSW_MOUNTDEV) &&
1512 1512 (uap->flags & MS_GLOBAL) == 0 &&
1513 1513 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1514 1514 addmip = 1;
1515 1515 }
1516 1516 }
1517 1517
1518 1518 if (addmip) {
1519 1519 vnode_t *lvp = NULL;
1520 1520
1521 1521 error = vfs_get_lofi(vfsp, &lvp);
1522 1522 if (error > 0) {
1523 1523 lofi_remove(vfsp);
1524 1524
1525 1525 if (splice)
1526 1526 vn_vfsunlock(vp);
1527 1527 vfs_unlock(vfsp);
1528 1528
1529 1529 if (remount) {
1530 1530 VFS_RELE(vfsp);
1531 1531 } else {
1532 1532 vfs_free(vfsp);
1533 1533 }
1534 1534
1535 1535 goto errout;
1536 1536 } else if (error == -1) {
1537 1537 bdev = bvp->v_rdev;
1538 1538 VN_RELE(bvp);
1539 1539 } else {
1540 1540 bdev = lvp->v_rdev;
1541 1541 VN_RELE(lvp);
1542 1542 VN_RELE(bvp);
1543 1543 }
1544 1544
1545 1545 vfs_addmip(bdev, vfsp);
1546 1546 addmip = 0;
1547 1547 delmip = 1;
1548 1548 }
1549 1549 /*
1550 1550 * Invalidate cached entry for the mount point.
1551 1551 */
1552 1552 if (splice)
1553 1553 dnlc_purge_vp(vp);
1554 1554
1555 1555 /*
1556 1556 * If we have an option string but the filesystem doesn't supply a
1557 1557 * prototype options table, create a table with the global
1558 1558 * options and sufficient room to accept all the options in the
1559 1559 * string. Then parse the passed in option string
1560 1560 * accepting all the options in the string. This gives us an
1561 1561 * option table with all the proper cancel properties for the
1562 1562 * global options.
1563 1563 *
1564 1564 * Filesystems that supply a prototype options table are handled
1565 1565 * earlier in this function.
1566 1566 */
1567 1567 if (uap->flags & MS_OPTIONSTR) {
1568 1568 if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1569 1569 mntopts_t tmp_mntopts;
1570 1570
1571 1571 tmp_mntopts.mo_count = 0;
1572 1572 vfs_createopttbl_extend(&tmp_mntopts, inargs,
1573 1573 &mnt_mntopts);
1574 1574 vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1575 1575 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1576 1576 vfs_freeopttbl(&tmp_mntopts);
1577 1577 }
1578 1578 }
1579 1579
1580 1580 /*
1581 1581 * Serialize with zone state transitions.
1582 1582 * See vfs_list_add; zone mounted into is:
1583 1583 * zone_find_by_path(refstr_value(vfsp->vfs_mntpt))
1584 1584 * not the zone doing the mount (curproc->p_zone), but if we're already
1585 1585 * inside an NGZ, then we know which zone we are in.
1586 1586 */
1587 1587 if (INGLOBALZONE(curproc)) {
1588 1588 zone = zone_find_by_path(mountpt);
1589 1589 ASSERT(zone != NULL);
1590 1590 } else {
1591 1591 zone = curproc->p_zone;
1592 1592 /*
1593 1593 * zone_find_by_path does a hold, so do one here too so that
1594 1594 * we can do a zone_rele after mount_completed.
1595 1595 */
1596 1596 zone_hold(zone);
1597 1597 }
1598 1598 mount_in_progress(zone);
1599 1599 /*
1600 1600 * Instantiate (or reinstantiate) the file system. If appropriate,
1601 1601 * splice it into the file system name space.
1602 1602 *
1603 1603 * We want VFS_MOUNT() to be able to override the vfs_resource
1604 1604 * string if necessary (ie, mntfs), and also for a remount to
1605 1605 * change the same (necessary when remounting '/' during boot).
1606 1606 * So we set up vfs_mntpt and vfs_resource to what we think they
1607 1607 * should be, then hand off control to VFS_MOUNT() which can
1608 1608 * override this.
1609 1609 *
1610 1610 * For safety's sake, when changing vfs_resource or vfs_mntpt of
1611 1611 * a vfs which is on the vfs list (i.e. during a remount), we must
1612 1612 * never set those fields to NULL. Several bits of code make
1613 1613 * assumptions that the fields are always valid.
1614 1614 */
1615 1615 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1616 1616 if (remount) {
1617 1617 if ((oldresource = vfsp->vfs_resource) != NULL)
1618 1618 refstr_hold(oldresource);
1619 1619 if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1620 1620 refstr_hold(oldmntpt);
1621 1621 }
1622 1622 vfs_setresource(vfsp, resource, 0);
1623 1623 vfs_setmntpoint(vfsp, mountpt, 0);
1624 1624
1625 1625 /*
1626 1626 * going to mount on this vnode, so notify.
1627 1627 */
1628 1628 vnevent_mountedover(vp, NULL);
1629 1629 error = VFS_MOUNT(vfsp, vp, uap, credp);
1630 1630
1631 1631 if (uap->flags & MS_RDONLY)
1632 1632 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1633 1633 if (uap->flags & MS_NOSUID)
1634 1634 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1635 1635 if (uap->flags & MS_GLOBAL)
1636 1636 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1637 1637
1638 1638 if (error) {
1639 1639 lofi_remove(vfsp);
1640 1640
1641 1641 if (remount) {
1642 1642 /* put back pre-remount options */
1643 1643 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1644 1644 vfs_setmntpoint(vfsp, refstr_value(oldmntpt),
1645 1645 VFSSP_VERBATIM);
1646 1646 if (oldmntpt)
1647 1647 refstr_rele(oldmntpt);
1648 1648 vfs_setresource(vfsp, refstr_value(oldresource),
1649 1649 VFSSP_VERBATIM);
1650 1650 if (oldresource)
1651 1651 refstr_rele(oldresource);
1652 1652 vfsp->vfs_flag = ovflags;
1653 1653 vfs_unlock(vfsp);
1654 1654 VFS_RELE(vfsp);
1655 1655 } else {
1656 1656 vfs_unlock(vfsp);
1657 1657 vfs_freemnttab(vfsp);
1658 1658 vfs_free(vfsp);
1659 1659 }
1660 1660 } else {
1661 1661 /*
1662 1662 * Set the mount time to now
1663 1663 */
1664 1664 vfsp->vfs_mtime = ddi_get_time();
1665 1665 if (remount) {
1666 1666 vfsp->vfs_flag &= ~VFS_REMOUNT;
1667 1667 if (oldresource)
1668 1668 refstr_rele(oldresource);
1669 1669 if (oldmntpt)
1670 1670 refstr_rele(oldmntpt);
1671 1671 } else if (splice) {
1672 1672 /*
1673 1673 * Link vfsp into the name space at the mount
1674 1674 * point. Vfs_add() is responsible for
1675 1675 * holding the mount point which will be
1676 1676 * released when vfs_remove() is called.
1677 1677 */
1678 1678 vfs_add(vp, vfsp, uap->flags);
1679 1679 } else {
1680 1680 /*
1681 1681 * Hold the reference to file system which is
1682 1682 * not linked into the name space.
1683 1683 */
1684 1684 vfsp->vfs_zone = NULL;
1685 1685 VFS_HOLD(vfsp);
1686 1686 vfsp->vfs_vnodecovered = NULL;
1687 1687 }
1688 1688 /*
1689 1689 * Set flags for global options encountered
1690 1690 */
1691 1691 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1692 1692 vfsp->vfs_flag |= VFS_RDONLY;
1693 1693 else
1694 1694 vfsp->vfs_flag &= ~VFS_RDONLY;
1695 1695 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1696 1696 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1697 1697 } else {
1698 1698 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1699 1699 vfsp->vfs_flag |= VFS_NODEVICES;
1700 1700 else
1701 1701 vfsp->vfs_flag &= ~VFS_NODEVICES;
1702 1702 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1703 1703 vfsp->vfs_flag |= VFS_NOSETUID;
1704 1704 else
1705 1705 vfsp->vfs_flag &= ~VFS_NOSETUID;
1706 1706 }
1707 1707 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1708 1708 vfsp->vfs_flag |= VFS_NBMAND;
1709 1709 else
1710 1710 vfsp->vfs_flag &= ~VFS_NBMAND;
1711 1711
1712 1712 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1713 1713 vfsp->vfs_flag |= VFS_XATTR;
1714 1714 else
1715 1715 vfsp->vfs_flag &= ~VFS_XATTR;
1716 1716
1717 1717 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1718 1718 vfsp->vfs_flag |= VFS_NOEXEC;
1719 1719 else
1720 1720 vfsp->vfs_flag &= ~VFS_NOEXEC;
1721 1721
1722 1722 /*
1723 1723 * Now construct the output option string of options
1724 1724 * we recognized.
1725 1725 */
1726 1726 if (uap->flags & MS_OPTIONSTR) {
1727 1727 vfs_list_read_lock();
1728 1728 copyout_error = vfs_buildoptionstr(
1729 1729 &vfsp->vfs_mntopts, inargs, optlen);
1730 1730 vfs_list_unlock();
1731 1731 if (copyout_error == 0 &&
1732 1732 (uap->flags & MS_SYSSPACE) == 0) {
1733 1733 copyout_error = copyoutstr(inargs, opts,
1734 1734 optlen, NULL);
1735 1735 }
1736 1736 }
1737 1737
1738 1738 /*
1739 1739 * If this isn't a remount, set up the vopstats before
1740 1740 * anyone can touch this. We only allow spliced file
1741 1741 * systems (file systems which are in the namespace) to
1742 1742 * have the VFS_STATS flag set.
1743 1743 * NOTE: PxFS mounts the underlying file system with
1744 1744 * MS_NOSPLICE set and copies those vfs_flags to its private
1745 1745 * vfs structure. As a result, PxFS should never have
1746 1746 * the VFS_STATS flag or else we might access the vfs
1747 1747 * statistics-related fields prior to them being
1748 1748 * properly initialized.
1749 1749 */
1750 1750 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1751 1751 initialize_vopstats(&vfsp->vfs_vopstats);
1752 1752 /*
1753 1753 * We need to set vfs_vskap to NULL because there's
1754 1754 * a chance it won't be set below. This is checked
1755 1755 * in teardown_vopstats() so we can't have garbage.
1756 1756 */
1757 1757 vfsp->vfs_vskap = NULL;
1758 1758 vfsp->vfs_flag |= VFS_STATS;
1759 1759 vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1760 1760 }
1761 1761
1762 1762 if (vswp->vsw_flag & VSW_XID)
1763 1763 vfsp->vfs_flag |= VFS_XID;
1764 1764
1765 1765 vfs_unlock(vfsp);
1766 1766 }
1767 1767 mount_completed(zone);
1768 1768 zone_rele(zone);
1769 1769 if (splice)
1770 1770 vn_vfsunlock(vp);
1771 1771
1772 1772 if ((error == 0) && (copyout_error == 0)) {
1773 1773 if (!remount) {
1774 1774 /*
1775 1775 * Don't call get_vskstat_anchor() while holding
1776 1776 * locks since it allocates memory and calls
1777 1777 * VFS_STATVFS(). For NFS, the latter can generate
1778 1778 * an over-the-wire call.
1779 1779 */
1780 1780 vskap = get_vskstat_anchor(vfsp);
1781 1781 /* Only take the lock if we have something to do */
1782 1782 if (vskap != NULL) {
1783 1783 vfs_lock_wait(vfsp);
1784 1784 if (vfsp->vfs_flag & VFS_STATS) {
1785 1785 vfsp->vfs_vskap = vskap;
1786 1786 }
1787 1787 vfs_unlock(vfsp);
1788 1788 }
1789 1789 }
1790 1790 /* Return vfsp to caller. */
1791 1791 *vfspp = vfsp;
1792 1792 }
1793 1793 errout:
1794 1794 vfs_freeopttbl(&mnt_mntopts);
1795 1795 if (resource != NULL)
1796 1796 kmem_free(resource, strlen(resource) + 1);
1797 1797 if (mountpt != NULL)
1798 1798 kmem_free(mountpt, strlen(mountpt) + 1);
1799 1799 /*
1800 1800 * It is possible we errored prior to adding to mount in progress
1801 1801 * table. Must free vnode we acquired with successful lookupname.
1802 1802 */
1803 1803 if (addmip)
1804 1804 VN_RELE(bvp);
1805 1805 if (delmip)
1806 1806 vfs_delmip(vfsp);
1807 1807 ASSERT(vswp != NULL);
1808 1808 vfs_unrefvfssw(vswp);
1809 1809 if (inargs != opts)
1810 1810 kmem_free(inargs, MAX_MNTOPT_STR);
1811 1811 if (copyout_error) {
1812 1812 lofi_remove(vfsp);
1813 1813 VFS_RELE(vfsp);
1814 1814 error = copyout_error;
1815 1815 }
1816 1816 return (error);
1817 1817 }
1818 1818
1819 1819 static void
1820 1820 vfs_setpath(
1821 1821 struct vfs *vfsp, /* vfs being updated */
1822 1822 refstr_t **refp, /* Ref-count string to contain the new path */
1823 1823 const char *newpath, /* Path to add to refp (above) */
1824 1824 uint32_t flag) /* flag */
1825 1825 {
1826 1826 size_t len;
1827 1827 refstr_t *ref;
1828 1828 zone_t *zone = curproc->p_zone;
1829 1829 char *sp;
1830 1830 int have_list_lock = 0;
1831 1831
1832 1832 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1833 1833
1834 1834 /*
1835 1835 * New path must be less than MAXPATHLEN because mntfs
1836 1836 * will only display up to MAXPATHLEN bytes. This is currently
1837 1837 * safe, because domount() uses pn_get(), and other callers
1838 1838 * similarly cap the size to fewer than MAXPATHLEN bytes.
1839 1839 */
1840 1840
1841 1841 ASSERT(strlen(newpath) < MAXPATHLEN);
1842 1842
1843 1843 /* mntfs requires consistency while vfs list lock is held */
1844 1844
1845 1845 if (VFS_ON_LIST(vfsp)) {
1846 1846 have_list_lock = 1;
1847 1847 vfs_list_lock();
1848 1848 }
1849 1849
1850 1850 if (*refp != NULL)
1851 1851 refstr_rele(*refp);
1852 1852
1853 1853 /*
1854 1854 * If we are in a non-global zone then we prefix the supplied path,
1855 1855 * newpath, with the zone's root path, with two exceptions. The first
1856 1856 * is where we have been explicitly directed to avoid doing so; this
1857 1857 * will be the case following a failed remount, where the path supplied
1858 1858 * will be a saved version which must now be restored. The second
1859 1859 * exception is where newpath is not a pathname but a descriptive name,
1860 1860 * e.g. "procfs".
1861 1861 */
1862 1862 if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') {
1863 1863 ref = refstr_alloc(newpath);
1864 1864 goto out;
1865 1865 }
1866 1866
1867 1867 /*
1868 1868 * Truncate the trailing '/' in the zoneroot, and merge
1869 1869 * in the zone's rootpath with the "newpath" (resource
1870 1870 * or mountpoint) passed in.
1871 1871 *
1872 1872 * The size of the required buffer is thus the size of
1873 1873 * the buffer required for the passed-in newpath
1874 1874 * (strlen(newpath) + 1), plus the size of the buffer
1875 1875 * required to hold zone_rootpath (zone_rootpathlen)
1876 1876 * minus one for one of the now-superfluous NUL
1877 1877 * terminations, minus one for the trailing '/'.
1878 1878 *
1879 1879 * That gives us:
1880 1880 *
1881 1881 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1882 1882 *
1883 1883 * Which is what we have below.
1884 1884 */
1885 1885
1886 1886 len = strlen(newpath) + zone->zone_rootpathlen - 1;
1887 1887 sp = kmem_alloc(len, KM_SLEEP);
1888 1888
1889 1889 /*
1890 1890 * Copy everything including the trailing slash, which
1891 1891 * we then overwrite with the NUL character.
1892 1892 */
1893 1893
1894 1894 (void) strcpy(sp, zone->zone_rootpath);
1895 1895 sp[zone->zone_rootpathlen - 2] = '\0';
1896 1896 (void) strcat(sp, newpath);
1897 1897
1898 1898 ref = refstr_alloc(sp);
1899 1899 kmem_free(sp, len);
1900 1900 out:
1901 1901 *refp = ref;
1902 1902
1903 1903 if (have_list_lock) {
1904 1904 vfs_mnttab_modtimeupd();
1905 1905 vfs_list_unlock();
1906 1906 }
1907 1907 }
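
To make the buffer arithmetic above concrete, here is a minimal stand-alone sketch of the zone-root prefixing (not part of vfs.c; prefix_zone_root and its arguments are hypothetical). It assumes, as the kernel does, that zone_rootpathlen counts the trailing '/' and the NUL:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *
prefix_zone_root(const char *zone_rootpath, size_t zone_rootpathlen,
    const char *newpath)
{
        /* (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1, as above */
        size_t len = strlen(newpath) + zone_rootpathlen - 1;
        char *sp = malloc(len);

        if (sp == NULL)
                return (NULL);
        /* Copy the zone root, then overwrite its trailing '/' with NUL. */
        (void) strcpy(sp, zone_rootpath);
        sp[zone_rootpathlen - 2] = '\0';
        (void) strcat(sp, newpath);
        return (sp);
}

int
main(void)
{
        /* "/zones/web/root/" is 16 characters, so rootpathlen is 17. */
        char *p = prefix_zone_root("/zones/web/root/", 17, "/export/home");

        if (p != NULL) {
                (void) printf("%s\n", p);  /* /zones/web/root/export/home */
                free(p);
        }
        return (0);
}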
1908 1908
1909 1909 /*
1910 1910 * Record a mounted resource name in a vfs structure.
1911 1911 * If vfsp is already mounted, caller must hold the vfs lock.
1912 1912 */
1913 1913 void
1914 1914 vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag)
1915 1915 {
1916 1916 if (resource == NULL || resource[0] == '\0')
1917 1917 resource = VFS_NORESOURCE;
1918 1918 vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag);
1919 1919 }
1920 1920
1921 1921 /*
1922 1922 * Record a mount point name in a vfs structure.
1923 1923 * If vfsp is already mounted, caller must hold the vfs lock.
1924 1924 */
1925 1925 void
1926 1926 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag)
1927 1927 {
1928 1928 if (mntpt == NULL || mntpt[0] == '\0')
1929 1929 mntpt = VFS_NOMNTPT;
1930 1930 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag);
1931 1931 }
1932 1932
1933 1933 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1934 1934
1935 1935 refstr_t *
1936 1936 vfs_getresource(const struct vfs *vfsp)
1937 1937 {
1938 1938 refstr_t *resource;
1939 1939
1940 1940 vfs_list_read_lock();
1941 1941 resource = vfsp->vfs_resource;
1942 1942 refstr_hold(resource);
1943 1943 vfs_list_unlock();
1944 1944
1945 1945 return (resource);
1946 1946 }
1947 1947
1948 1948 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1949 1949
1950 1950 refstr_t *
1951 1951 vfs_getmntpoint(const struct vfs *vfsp)
1952 1952 {
1953 1953 refstr_t *mntpt;
1954 1954
1955 1955 vfs_list_read_lock();
1956 1956 mntpt = vfsp->vfs_mntpt;
1957 1957 refstr_hold(mntpt);
1958 1958 vfs_list_unlock();
1959 1959
1960 1960 return (mntpt);
1961 1961 }
1962 1962
1963 1963 /*
1964 1964 * Create an empty options table with enough empty slots to hold all
1965 1965 * the options in the options string passed as an argument.
1966 1966 * Potentially prepend another options table.
1967 1967 *
1968 1968 * Note: caller is responsible for locking the vfs list, if needed,
1969 1969 * to protect mops.
1970 1970 */
1971 1971 static void
1972 1972 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1973 1973 const mntopts_t *mtmpl)
1974 1974 {
1975 1975 const char *s = opts;
1976 1976 uint_t count;
1977 1977
1978 1978 if (opts == NULL || *opts == '\0') {
1979 1979 count = 0;
1980 1980 } else {
1981 1981 count = 1;
1982 1982
1983 1983 /*
1984 1984 * Count number of options in the string
1985 1985 */
1986 1986 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1987 1987 count++;
1988 1988 s++;
1989 1989 }
1990 1990 }
1991 1991 vfs_copyopttbl_extend(mtmpl, mops, count);
1992 1992 }
1993 1993
1994 1994 /*
1995 1995 * Create an empty options table with enough empty slots to hold all
1996 1996 * the options in the options string passed as an argument.
1997 1997 *
1998 1998 * This function is *not* for general use by filesystems.
1999 1999 *
2000 2000 * Note: caller is responsible for locking the vfs list, if needed,
2001 2001 * to protect mops.
2002 2002 */
2003 2003 void
2004 2004 vfs_createopttbl(mntopts_t *mops, const char *opts)
2005 2005 {
2006 2006 vfs_createopttbl_extend(mops, opts, NULL);
2007 2007 }
2008 2008
2009 2009
2010 2010 /*
2011 2011 * Swap two mount options tables
2012 2012 */
2013 2013 static void
2014 2014 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
2015 2015 {
2016 2016 uint_t tmpcnt;
2017 2017 mntopt_t *tmplist;
2018 2018
2019 2019 tmpcnt = optbl2->mo_count;
2020 2020 tmplist = optbl2->mo_list;
2021 2021 optbl2->mo_count = optbl1->mo_count;
2022 2022 optbl2->mo_list = optbl1->mo_list;
2023 2023 optbl1->mo_count = tmpcnt;
2024 2024 optbl1->mo_list = tmplist;
2025 2025 }
2026 2026
2027 2027 static void
2028 2028 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
2029 2029 {
2030 2030 vfs_list_lock();
2031 2031 vfs_swapopttbl_nolock(optbl1, optbl2);
2032 2032 vfs_mnttab_modtimeupd();
2033 2033 vfs_list_unlock();
2034 2034 }
2035 2035
2036 2036 static char **
2037 2037 vfs_copycancelopt_extend(char **const moc, int extend)
2038 2038 {
2039 2039 int i = 0;
2040 2040 int j;
2041 2041 char **result;
2042 2042
2043 2043 if (moc != NULL) {
2044 2044 for (; moc[i] != NULL; i++)
2045 2045 /* count number of options to cancel */;
2046 2046 }
2047 2047
2048 2048 if (i + extend == 0)
2049 2049 return (NULL);
2050 2050
2051 2051 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
2052 2052
2053 2053 for (j = 0; j < i; j++) {
2054 2054 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
2055 2055 (void) strcpy(result[j], moc[j]);
2056 2056 }
2057 2057 for (; j <= i + extend; j++)
2058 2058 result[j] = NULL;
2059 2059
2060 2060 return (result);
2061 2061 }
2062 2062
2063 2063 static void
2064 2064 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
2065 2065 {
2066 2066 char *sp, *dp;
2067 2067
2068 2068 d->mo_flags = s->mo_flags;
2069 2069 d->mo_data = s->mo_data;
2070 2070 sp = s->mo_name;
2071 2071 if (sp != NULL) {
2072 2072 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2073 2073 (void) strcpy(dp, sp);
2074 2074 d->mo_name = dp;
2075 2075 } else {
2076 2076 d->mo_name = NULL; /* should never happen */
2077 2077 }
2078 2078
2079 2079 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
2080 2080
2081 2081 sp = s->mo_arg;
2082 2082 if (sp != NULL) {
2083 2083 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2084 2084 (void) strcpy(dp, sp);
2085 2085 d->mo_arg = dp;
2086 2086 } else {
2087 2087 d->mo_arg = NULL;
2088 2088 }
2089 2089 }
2090 2090
2091 2091 /*
2092 2092 * Copy a mount options table, possibly allocating some spare
2093 2093 * slots at the end. It is permissible to copy_extend the NULL table.
2094 2094 */
2095 2095 static void
2096 2096 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
2097 2097 {
2098 2098 uint_t i, count;
2099 2099 mntopt_t *motbl;
2100 2100
2101 2101 /*
2102 2102 * Clear out any existing stuff in the options table being initialized
2103 2103 */
2104 2104 vfs_freeopttbl(dmo);
2105 2105 count = (smo == NULL) ? 0 : smo->mo_count;
2106 2106 if ((count + extra) == 0) /* nothing to do */
2107 2107 return;
2108 2108 dmo->mo_count = count + extra;
2109 2109 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
2110 2110 dmo->mo_list = motbl;
2111 2111 for (i = 0; i < count; i++) {
2112 2112 vfs_copyopt(&smo->mo_list[i], &motbl[i]);
2113 2113 }
2114 2114 for (i = count; i < count + extra; i++) {
2115 2115 motbl[i].mo_flags = MO_EMPTY;
2116 2116 }
2117 2117 }
2118 2118
2119 2119 /*
2120 2120 * Copy a mount options table.
2121 2121 *
2122 2122 * This function is *not* for general use by filesystems.
2123 2123 *
2124 2124 * Note: caller is responsible for locking the vfs list, if needed,
2125 2125 * to protect smo and dmo.
2126 2126 */
2127 2127 void
2128 2128 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
2129 2129 {
2130 2130 vfs_copyopttbl_extend(smo, dmo, 0);
2131 2131 }
2132 2132
2133 2133 static char **
2134 2134 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2135 2135 {
2136 2136 int c1 = 0;
2137 2137 int c2 = 0;
2138 2138 char **result;
2139 2139 char **sp1, **sp2, **dp;
2140 2140
2141 2141 /*
2142 2142 * First we count both lists of cancel options.
2143 2143 * If either is NULL or has no elements, we return a copy of
2144 2144 * the other.
2145 2145 */
2146 2146 if (mop1->mo_cancel != NULL) {
2147 2147 for (; mop1->mo_cancel[c1] != NULL; c1++)
2148 2148 /* count cancel options in mop1 */;
2149 2149 }
2150 2150
2151 2151 if (c1 == 0)
2152 2152 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2153 2153
2154 2154 if (mop2->mo_cancel != NULL) {
2155 2155 for (; mop2->mo_cancel[c2] != NULL; c2++)
2156 2156 /* count cancel options in mop2 */;
2157 2157 }
2158 2158
2159 2159 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2160 2160
2161 2161 if (c2 == 0)
2162 2162 return (result);
2163 2163
2164 2164 /*
2165 2165 * When we get here, we've got two sets of cancel options;
2166 2166 * we need to merge the two sets. We know that the result
2167 2167 * array has "c1+c2+1" entries and in the end we might shrink
2168 2168 * it.
2169 2169 * Result now has a copy of the c1 entries from mop1; we'll
2170 2170 * now lookup all the entries of mop2 in mop1 and copy it if
2171 2171 * it is unique.
2172 2172 * This operation is O(n^2) but it's only called once per
2173 2173 * filesystem per duplicate option. This is a situation
2174 2174 * which doesn't arise with the filesystems in ON and
2175 2175 * n is generally 1.
2176 2176 */
2177 2177
2178 2178 dp = &result[c1];
2179 2179 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2180 2180 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2181 2181 if (strcmp(*sp1, *sp2) == 0)
2182 2182 break;
2183 2183 }
2184 2184 if (*sp1 == NULL) {
2185 2185 /*
2186 2186 * Option *sp2 not found in mop1, so copy it.
2187 2187 * The calls to vfs_copycancelopt_extend()
2188 2188 * guarantee that there's enough room.
2189 2189 */
2190 2190 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2191 2191 (void) strcpy(*dp++, *sp2);
2192 2192 }
2193 2193 }
2194 2194 if (dp != &result[c1+c2]) {
2195 2195 size_t bytes = (dp - result + 1) * sizeof (char *);
2196 2196 char **nres = kmem_alloc(bytes, KM_SLEEP);
2197 2197
2198 2198 bcopy(result, nres, bytes);
2199 2199 kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2200 2200 result = nres;
2201 2201 }
2202 2202 return (result);
2203 2203 }
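
The merge above can be pictured with an ordinary user-space routine over NULL-terminated string vectors. The following sketch (not part of vfs.c; merge_unique is a hypothetical helper) copies the first vector and appends only those entries of the second that are not already present, allocating for the worst case rather than shrinking afterwards:

#include <stdlib.h>
#include <string.h>

static char **
merge_unique(char **a, char **b)
{
        size_t c1 = 0, c2 = 0, n, i;
        char **res, **sp;

        while (a != NULL && a[c1] != NULL)
                c1++;
        while (b != NULL && b[c2] != NULL)
                c2++;

        /* Allocate for the worst case: every entry of b is unique. */
        res = calloc(c1 + c2 + 1, sizeof (char *));
        if (res == NULL)
                return (NULL);
        for (n = 0; n < c1; n++)
                res[n] = strdup(a[n]);
        for (i = 0; i < c2; i++) {
                for (sp = a; sp != NULL && *sp != NULL; sp++)
                        if (strcmp(*sp, b[i]) == 0)
                                break;
                if (sp == NULL || *sp == NULL)  /* b[i] not found in a */
                        res[n++] = strdup(b[i]);
        }
        return (res);  /* NULL-terminated; caller frees entries and vector */
}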
2204 2204
2205 2205 /*
2206 2206 * Merge two mount option tables (outer and inner) into one. This is very
2207 2207 * similar to "merging" global variables and automatic variables in C.
2208 2208 *
2209 2209 * This isn't (and doesn't have to be) fast.
2210 2210 *
2211 2211 * This function is *not* for general use by filesystems.
2212 2212 *
2213 2213 * Note: caller is responsible for locking the vfs list, if needed,
2214 2214 * to protect omo, imo & dmo.
2215 2215 */
2216 2216 void
2217 2217 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2218 2218 {
2219 2219 uint_t i, count;
2220 2220 mntopt_t *mop, *motbl;
2221 2221 uint_t freeidx;
2222 2222
2223 2223 /*
2224 2224 * First determine how much space we need to allocate.
2225 2225 */
2226 2226 count = omo->mo_count;
2227 2227 for (i = 0; i < imo->mo_count; i++) {
2228 2228 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2229 2229 continue;
2230 2230 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2231 2231 count++;
2232 2232 }
2233 2233 ASSERT(count >= omo->mo_count &&
2234 2234 count <= omo->mo_count + imo->mo_count);
2235 2235 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2236 2236 for (i = 0; i < omo->mo_count; i++)
2237 2237 vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2238 2238 freeidx = omo->mo_count;
2239 2239 for (i = 0; i < imo->mo_count; i++) {
2240 2240 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2241 2241 continue;
2242 2242 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2243 2243 char **newcanp;
2244 2244 uint_t index = mop - omo->mo_list;
2245 2245
2246 2246 newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2247 2247
2248 2248 vfs_freeopt(&motbl[index]);
2249 2249 vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2250 2250
2251 2251 vfs_freecancelopt(motbl[index].mo_cancel);
2252 2252 motbl[index].mo_cancel = newcanp;
2253 2253 } else {
2254 2254 /*
2255 2255 * If it's a new option, just copy it over to the first
2256 2256 * free location.
2257 2257 */
2258 2258 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2259 2259 }
2260 2260 }
2261 2261 dmo->mo_count = count;
2262 2262 dmo->mo_list = motbl;
2263 2263 }
2264 2264
2265 2265 /*
2266 2266 * Functions to set and clear mount options in a mount options table.
2267 2267 */
2268 2268
2269 2269 /*
2270 2270 * Clear a mount option, if it exists.
2271 2271 *
2272 2272 * The update_mnttab arg indicates whether mops is part of a vfs that is on
2273 2273 * the vfs list.
2274 2274 */
2275 2275 static void
2276 2276 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2277 2277 {
2278 2278 struct mntopt *mop;
2279 2279 uint_t i, count;
2280 2280
2281 2281 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2282 2282
2283 2283 count = mops->mo_count;
2284 2284 for (i = 0; i < count; i++) {
2285 2285 mop = &mops->mo_list[i];
2286 2286
2287 2287 if (mop->mo_flags & MO_EMPTY)
2288 2288 continue;
2289 2289 if (strcmp(opt, mop->mo_name))
2290 2290 continue;
2291 2291 mop->mo_flags &= ~MO_SET;
2292 2292 if (mop->mo_arg != NULL) {
2293 2293 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2294 2294 }
2295 2295 mop->mo_arg = NULL;
2296 2296 if (update_mnttab)
2297 2297 vfs_mnttab_modtimeupd();
2298 2298 break;
2299 2299 }
2300 2300 }
2301 2301
2302 2302 void
2303 2303 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2304 2304 {
2305 2305 int gotlock = 0;
2306 2306
2307 2307 if (VFS_ON_LIST(vfsp)) {
2308 2308 gotlock = 1;
2309 2309 vfs_list_lock();
2310 2310 }
2311 2311 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2312 2312 if (gotlock)
2313 2313 vfs_list_unlock();
2314 2314 }
2315 2315
2316 2316
2317 2317 /*
2318 2318 * Set a mount option on. If it's not found in the table, it's silently
2319 2319 * ignored. If the option has MO_IGNORE set, it is still set unless the
2320 2320 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag
2321 2321 * bits can be used to toggle the MO_NODISPLAY bit for the option.
2322 2322 * If the VFS_CREATEOPT flag bit is set then the first option slot with
2323 2323 * MO_EMPTY set is created as the option passed in.
2324 2324 *
2325 2325 * The update_mnttab arg indicates whether mops is part of a vfs that is on
2326 2326 * the vfs list.
2327 2327 */
2328 2328 static void
2329 2329 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2330 2330 const char *arg, int flags, int update_mnttab)
2331 2331 {
2332 2332 mntopt_t *mop;
2333 2333 uint_t i, count;
2334 2334 char *sp;
2335 2335
2336 2336 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2337 2337
2338 2338 if (flags & VFS_CREATEOPT) {
2339 2339 if (vfs_hasopt(mops, opt) != NULL) {
2340 2340 flags &= ~VFS_CREATEOPT;
2341 2341 }
2342 2342 }
2343 2343 count = mops->mo_count;
2344 2344 for (i = 0; i < count; i++) {
2345 2345 mop = &mops->mo_list[i];
2346 2346
2347 2347 if (mop->mo_flags & MO_EMPTY) {
2348 2348 if ((flags & VFS_CREATEOPT) == 0)
2349 2349 continue;
2350 2350 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2351 2351 (void) strcpy(sp, opt);
2352 2352 mop->mo_name = sp;
2353 2353 if (arg != NULL)
2354 2354 mop->mo_flags = MO_HASVALUE;
2355 2355 else
2356 2356 mop->mo_flags = 0;
2357 2357 } else if (strcmp(opt, mop->mo_name)) {
2358 2358 continue;
2359 2359 }
2360 2360 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2361 2361 break;
2362 2362 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2363 2363 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2364 2364 (void) strcpy(sp, arg);
2365 2365 } else {
2366 2366 sp = NULL;
2367 2367 }
2368 2368 if (mop->mo_arg != NULL)
2369 2369 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2370 2370 mop->mo_arg = sp;
2371 2371 if (flags & VFS_DISPLAY)
2372 2372 mop->mo_flags &= ~MO_NODISPLAY;
2373 2373 if (flags & VFS_NODISPLAY)
2374 2374 mop->mo_flags |= MO_NODISPLAY;
2375 2375 mop->mo_flags |= MO_SET;
2376 2376 if (mop->mo_cancel != NULL) {
2377 2377 char **cp;
2378 2378
2379 2379 for (cp = mop->mo_cancel; *cp != NULL; cp++)
2380 2380 vfs_clearmntopt_nolock(mops, *cp, 0);
2381 2381 }
2382 2382 if (update_mnttab)
2383 2383 vfs_mnttab_modtimeupd();
2384 2384 break;
2385 2385 }
2386 2386 }
2387 2387
2388 2388 void
2389 2389 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2390 2390 {
2391 2391 int gotlock = 0;
2392 2392
2393 2393 if (VFS_ON_LIST(vfsp)) {
2394 2394 gotlock = 1;
2395 2395 vfs_list_lock();
2396 2396 }
2397 2397 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2398 2398 if (gotlock)
2399 2399 vfs_list_unlock();
2400 2400 }
2401 2401
2402 2402
2403 2403 /*
2404 2404 * Add a "tag" option to a mounted file system's options list.
2405 2405 *
2406 2406 * Note: caller is responsible for locking the vfs list, if needed,
2407 2407 * to protect mops.
2408 2408 */
2409 2409 static mntopt_t *
2410 2410 vfs_addtag(mntopts_t *mops, const char *tag)
2411 2411 {
2412 2412 uint_t count;
2413 2413 mntopt_t *mop, *motbl;
2414 2414
2415 2415 count = mops->mo_count + 1;
2416 2416 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2417 2417 if (mops->mo_count) {
2418 2418 size_t len = (count - 1) * sizeof (mntopt_t);
2419 2419
2420 2420 bcopy(mops->mo_list, motbl, len);
2421 2421 kmem_free(mops->mo_list, len);
2422 2422 }
2423 2423 mops->mo_count = count;
2424 2424 mops->mo_list = motbl;
2425 2425 mop = &motbl[count - 1];
2426 2426 mop->mo_flags = MO_TAG;
2427 2427 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2428 2428 (void) strcpy(mop->mo_name, tag);
2429 2429 return (mop);
2430 2430 }
2431 2431
2432 2432 /*
2433 2433 * Allow users to set arbitrary "tags" in a vfs's mount options.
2434 2434 * Broader use within the kernel is discouraged.
2435 2435 */
2436 2436 int
2437 2437 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2438 2438 cred_t *cr)
2439 2439 {
2440 2440 vfs_t *vfsp;
2441 2441 mntopts_t *mops;
2442 2442 mntopt_t *mop;
2443 2443 int found = 0;
2444 2444 dev_t dev = makedevice(major, minor);
2445 2445 int err = 0;
2446 2446 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2447 2447
2448 2448 /*
2449 2449 * Find the desired mounted file system
2450 2450 */
2451 2451 vfs_list_lock();
2452 2452 vfsp = rootvfs;
2453 2453 do {
2454 2454 if (vfsp->vfs_dev == dev &&
2455 2455 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2456 2456 found = 1;
2457 2457 break;
2458 2458 }
2459 2459 vfsp = vfsp->vfs_next;
2460 2460 } while (vfsp != rootvfs);
2461 2461
2462 2462 if (!found) {
2463 2463 err = EINVAL;
2464 2464 goto out;
2465 2465 }
2466 2466 err = secpolicy_fs_config(cr, vfsp);
2467 2467 if (err != 0)
2468 2468 goto out;
2469 2469
2470 2470 mops = &vfsp->vfs_mntopts;
2471 2471 /*
2472 2472 * Add tag if it doesn't already exist
2473 2473 */
2474 2474 if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2475 2475 int len;
2476 2476
2477 2477 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2478 2478 len = strlen(buf);
2479 2479 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2480 2480 err = ENAMETOOLONG;
2481 2481 goto out;
2482 2482 }
2483 2483 mop = vfs_addtag(mops, tag);
2484 2484 }
2485 2485 if ((mop->mo_flags & MO_TAG) == 0) {
2486 2486 err = EINVAL;
2487 2487 goto out;
2488 2488 }
2489 2489 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2490 2490 out:
2491 2491 vfs_list_unlock();
2492 2492 kmem_free(buf, MAX_MNTOPT_STR);
2493 2493 return (err);
2494 2494 }
2495 2495
2496 2496 /*
2497 2497 * Allow users to remove arbitrary "tags" in a vfs's mount options.
2498 2498 * Broader use within the kernel is discouraged.
2499 2499 */
2500 2500 int
2501 2501 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2502 2502 cred_t *cr)
2503 2503 {
2504 2504 vfs_t *vfsp;
2505 2505 mntopt_t *mop;
2506 2506 int found = 0;
2507 2507 dev_t dev = makedevice(major, minor);
2508 2508 int err = 0;
2509 2509
2510 2510 /*
2511 2511 * Find the desired mounted file system
2512 2512 */
2513 2513 vfs_list_lock();
2514 2514 vfsp = rootvfs;
2515 2515 do {
2516 2516 if (vfsp->vfs_dev == dev &&
2517 2517 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2518 2518 found = 1;
2519 2519 break;
2520 2520 }
2521 2521 vfsp = vfsp->vfs_next;
2522 2522 } while (vfsp != rootvfs);
2523 2523
2524 2524 if (!found) {
2525 2525 err = EINVAL;
2526 2526 goto out;
2527 2527 }
2528 2528 err = secpolicy_fs_config(cr, vfsp);
2529 2529 if (err != 0)
2530 2530 goto out;
2531 2531
2532 2532 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2533 2533 err = EINVAL;
2534 2534 goto out;
2535 2535 }
2536 2536 if ((mop->mo_flags & MO_TAG) == 0) {
2537 2537 err = EINVAL;
2538 2538 goto out;
2539 2539 }
2540 2540 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2541 2541 out:
2542 2542 vfs_list_unlock();
2543 2543 return (err);
2544 2544 }
2545 2545
2546 2546 /*
2547 2547 * Function to parse an option string and fill in a mount options table.
2548 2548 * Unknown options are silently ignored. The input option string is modified
2549 2549 * by replacing separators with nulls. If the create flag is set, options
2550 2550 * not found in the table are just added on the fly. The table must have
2551 2551 * an option slot marked MO_EMPTY to add an option on the fly.
2552 2552 *
2553 2553 * This function is *not* for general use by filesystems.
2554 2554 *
2555 2555 * Note: caller is responsible for locking the vfs list, if needed,
2556 2556 * to protect mops.
2557 2557 */
2558 2558 void
2559 2559 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2560 2560 {
2561 2561 char *s = osp, *p, *nextop, *valp, *cp, *ep;
2562 2562 int setflg = VFS_NOFORCEOPT;
2563 2563
2564 2564 if (osp == NULL)
2565 2565 return;
2566 2566 while (*s != '\0') {
2567 2567 p = strchr(s, ','); /* find next option */
2568 2568 if (p == NULL) {
2569 2569 cp = NULL;
2570 2570 p = s + strlen(s);
2571 2571 } else {
2572 2572 cp = p; /* save location of comma */
2573 2573 *p++ = '\0'; /* mark end and point to next option */
2574 2574 }
2575 2575 nextop = p;
2576 2576 p = strchr(s, '='); /* look for value */
2577 2577 if (p == NULL) {
2578 2578 valp = NULL; /* no value supplied */
2579 2579 } else {
2580 2580 ep = p; /* save location of equals */
2581 2581 *p++ = '\0'; /* end option and point to value */
2582 2582 valp = p;
2583 2583 }
2584 2584 /*
2585 2585 * set option into options table
2586 2586 */
2587 2587 if (create)
2588 2588 setflg |= VFS_CREATEOPT;
2589 2589 vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2590 2590 if (cp != NULL)
2591 2591 *cp = ','; /* restore the comma */
2592 2592 if (valp != NULL)
2593 2593 *ep = '='; /* restore the equals */
2594 2594 s = nextop;
2595 2595 }
2596 2596 }
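
A stand-alone sketch of the same in-place technique (parse_opts is a hypothetical helper, not part of vfs.c) shows how the separators are overwritten for the duration of each per-option hand-off and then restored, leaving the caller's buffer unchanged:

#include <stdio.h>
#include <string.h>

static void
parse_opts(char *osp)
{
        char *s = osp, *next, *valp, *cp, *ep;

        while (*s != '\0') {
                cp = strchr(s, ',');            /* end of this option */
                if (cp != NULL)
                        *cp = '\0';
                ep = strchr(s, '=');            /* optional value */
                if (ep != NULL) {
                        *ep = '\0';
                        valp = ep + 1;
                } else {
                        valp = NULL;
                }

                (void) printf("opt=%s val=%s\n", s,
                    valp == NULL ? "(none)" : valp);

                /* Restore the separators so the caller's buffer survives. */
                if (ep != NULL)
                        *ep = '=';
                next = (cp == NULL) ? s + strlen(s) : cp + 1;
                if (cp != NULL)
                        *cp = ',';
                s = next;
        }
}

int
main(void)
{
        char buf[] = "ro,logging,quota=on";

        parse_opts(buf);
        (void) printf("restored: %s\n", buf);   /* prints the original string */
        return (0);
}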
2597 2597
2598 2598 /*
2599 2599 * Function to inquire if an option exists in a mount options table.
2600 2600 * Returns a pointer to the option if it exists, else NULL.
2601 2601 *
2602 2602 * This function is *not* for general use by filesystems.
2603 2603 *
2604 2604 * Note: caller is responsible for locking the vfs list, if needed,
2605 2605 * to protect mops.
2606 2606 */
2607 2607 struct mntopt *
2608 2608 vfs_hasopt(const mntopts_t *mops, const char *opt)
2609 2609 {
2610 2610 struct mntopt *mop;
2611 2611 uint_t i, count;
2612 2612
2613 2613 count = mops->mo_count;
2614 2614 for (i = 0; i < count; i++) {
2615 2615 mop = &mops->mo_list[i];
2616 2616
2617 2617 if (mop->mo_flags & MO_EMPTY)
2618 2618 continue;
2619 2619 if (strcmp(opt, mop->mo_name) == 0)
2620 2620 return (mop);
2621 2621 }
2622 2622 return (NULL);
2623 2623 }
2624 2624
2625 2625 /*
2626 2626 * Function to inquire if an option is set in a mount options table.
2627 2627 * Returns non-zero if set and fills in the arg pointer with a pointer to
2628 2628 * the argument string or NULL if there is no argument string.
2629 2629 */
2630 2630 static int
2631 2631 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2632 2632 {
2633 2633 struct mntopt *mop;
2634 2634 uint_t i, count;
2635 2635
2636 2636 count = mops->mo_count;
2637 2637 for (i = 0; i < count; i++) {
2638 2638 mop = &mops->mo_list[i];
2639 2639
2640 2640 if (mop->mo_flags & MO_EMPTY)
2641 2641 continue;
2642 2642 if (strcmp(opt, mop->mo_name))
2643 2643 continue;
2644 2644 if ((mop->mo_flags & MO_SET) == 0)
2645 2645 return (0);
2646 2646 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2647 2647 *argp = mop->mo_arg;
2648 2648 return (1);
2649 2649 }
2650 2650 return (0);
2651 2651 }
2652 2652
2653 2653
2654 2654 int
2655 2655 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2656 2656 {
2657 2657 int ret;
2658 2658
2659 2659 vfs_list_read_lock();
2660 2660 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2661 2661 vfs_list_unlock();
2662 2662 return (ret);
2663 2663 }
2664 2664
2665 2665
2666 2666 /*
2667 2667 * Construct a comma separated string of the options set in the given
2668 2668 * mount table, return the string in the given buffer. Return non-zero if
2669 2669 * the buffer would overflow.
2670 2670 *
2671 2671 * This function is *not* for general use by filesystems.
2672 2672 *
2673 2673 * Note: caller is responsible for locking the vfs list, if needed,
2674 2674 * to protect mp.
2675 2675 */
2676 2676 int
2677 2677 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2678 2678 {
2679 2679 char *cp;
2680 2680 uint_t i;
2681 2681
2682 2682 buf[0] = '\0';
2683 2683 cp = buf;
2684 2684 for (i = 0; i < mp->mo_count; i++) {
2685 2685 struct mntopt *mop;
2686 2686
2687 2687 mop = &mp->mo_list[i];
2688 2688 if (mop->mo_flags & MO_SET) {
2689 2689 int optlen, comma = 0;
2690 2690
2691 2691 if (buf[0] != '\0')
2692 2692 comma = 1;
2693 2693 optlen = strlen(mop->mo_name);
2694 2694 if (strlen(buf) + comma + optlen + 1 > len)
2695 2695 goto err;
2696 2696 if (comma)
2697 2697 *cp++ = ',';
2698 2698 (void) strcpy(cp, mop->mo_name);
2699 2699 cp += optlen;
2700 2700 /*
2701 2701 * Append option value if there is one
2702 2702 */
2703 2703 if (mop->mo_arg != NULL) {
2704 2704 int arglen;
2705 2705
2706 2706 arglen = strlen(mop->mo_arg);
2707 2707 if (strlen(buf) + arglen + 2 > len)
2708 2708 goto err;
2709 2709 *cp++ = '=';
2710 2710 (void) strcpy(cp, mop->mo_arg);
2711 2711 cp += arglen;
2712 2712 }
2713 2713 }
2714 2714 }
2715 2715 return (0);
2716 2716 err:
2717 2717 return (EOVERFLOW);
2718 2718 }
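
The overflow discipline used above (check that the name, the optional separator and the terminating NUL all fit before copying) can be illustrated with a small user-space sketch; join_names is a hypothetical helper, not part of vfs.c:

#include <errno.h>
#include <string.h>

static int
join_names(char *const *names, char *buf, size_t len)
{
        size_t used = 0;

        if (len == 0)
                return (EOVERFLOW);
        buf[0] = '\0';
        for (; *names != NULL; names++) {
                size_t need = strlen(*names) + (used ? 1 : 0);

                /* name, optional comma and terminating NUL must all fit */
                if (used + need + 1 > len)
                        return (EOVERFLOW);
                if (used)
                        buf[used++] = ',';
                (void) strcpy(buf + used, *names);
                used += strlen(*names);
        }
        return (0);
}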
2719 2719
2720 2720 static void
2721 2721 vfs_freecancelopt(char **moc)
2722 2722 {
2723 2723 if (moc != NULL) {
2724 2724 int ccnt = 0;
2725 2725 char **cp;
2726 2726
2727 2727 for (cp = moc; *cp != NULL; cp++) {
2728 2728 kmem_free(*cp, strlen(*cp) + 1);
2729 2729 ccnt++;
2730 2730 }
2731 2731 kmem_free(moc, (ccnt + 1) * sizeof (char *));
2732 2732 }
2733 2733 }
2734 2734
2735 2735 static void
2736 2736 vfs_freeopt(mntopt_t *mop)
2737 2737 {
2738 2738 if (mop->mo_name != NULL)
2739 2739 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2740 2740
2741 2741 vfs_freecancelopt(mop->mo_cancel);
2742 2742
2743 2743 if (mop->mo_arg != NULL)
2744 2744 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2745 2745 }
2746 2746
2747 2747 /*
2748 2748 * Free a mount options table
2749 2749 *
2750 2750 * This function is *not* for general use by filesystems.
2751 2751 *
2752 2752 * Note: caller is responsible for locking the vfs list, if needed,
2753 2753 * to protect mp.
2754 2754 */
2755 2755 void
2756 2756 vfs_freeopttbl(mntopts_t *mp)
2757 2757 {
2758 2758 uint_t i, count;
2759 2759
2760 2760 count = mp->mo_count;
2761 2761 for (i = 0; i < count; i++) {
2762 2762 vfs_freeopt(&mp->mo_list[i]);
2763 2763 }
2764 2764 if (count) {
2765 2765 kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2766 2766 mp->mo_count = 0;
2767 2767 mp->mo_list = NULL;
2768 2768 }
2769 2769 }
2770 2770
2771 2771
2772 2772 /* ARGSUSED */
2773 2773 static int
2774 2774 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2775 2775 caller_context_t *ct)
2776 2776 {
2777 2777 return (0);
2778 2778 }
2779 2779
2780 2780 /* ARGSUSED */
2781 2781 static int
2782 2782 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2783 2783 caller_context_t *ct)
2784 2784 {
2785 2785 return (0);
2786 2786 }
2787 2787
2788 2788 /*
2789 2789 * The dummy vnode is currently used only by file events notification
2790 2790 * module which is just interested in the timestamps.
2791 2791 */
2792 2792 /* ARGSUSED */
2793 2793 static int
2794 2794 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2795 2795 caller_context_t *ct)
2796 2796 {
2797 2797 bzero(vap, sizeof (vattr_t));
2798 2798 vap->va_type = VREG;
2799 2799 vap->va_nlink = 1;
2800 2800 vap->va_ctime = vfs_mnttab_ctime;
2801 2801 /*
2802 2802 * it is ok to just copy mtime as the time will be monotonically
2803 2803 * increasing.
2804 2804 */
2805 2805 vap->va_mtime = vfs_mnttab_mtime;
2806 2806 vap->va_atime = vap->va_mtime;
2807 2807 return (0);
2808 2808 }
2809 2809
2810 2810 static void
2811 2811 vfs_mnttabvp_setup(void)
2812 2812 {
2813 2813 vnode_t *tvp;
2814 2814 vnodeops_t *vfs_mntdummyvnops;
2815 2815 const fs_operation_def_t mnt_dummyvnodeops_template[] = {
2816 2816 VOPNAME_READ, { .vop_read = vfs_mntdummyread },
2817 2817 VOPNAME_WRITE, { .vop_write = vfs_mntdummywrite },
2818 2818 VOPNAME_GETATTR, { .vop_getattr = vfs_mntdummygetattr },
2819 2819 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
2820 2820 NULL, NULL
2821 2821 };
2822 2822
2823 2823 if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
2824 2824 &vfs_mntdummyvnops) != 0) {
2825 2825 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
2826 2826 /* Shouldn't happen, but not bad enough to panic */
2827 2827 return;
2828 2828 }
2829 2829
2830 2830 /*
2831 2831 * A global dummy vnode is allocated to represent mntfs files.
2832 2832 * The mntfs file (/etc/mnttab) can be monitored for file events
2833 2833 * and receive an event when mnttab changes. Dummy VOP calls
2834 2834 * will be made on this vnode. The file events notification module
2835 2835 * intercepts this vnode and delivers relevant events.
2836 2836 */
2837 2837 tvp = vn_alloc(KM_SLEEP);
2838 2838 tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2839 2839 vn_setops(tvp, vfs_mntdummyvnops);
2840 2840 tvp->v_type = VREG;
2841 2841 /*
2842 2842 * The mnt dummy ops do not reference v_data.
2843 2843 * No other module intercepting this vnode should either.
2844 2844 * Just set it to point to itself.
2845 2845 */
2846 2846 tvp->v_data = (caddr_t)tvp;
2847 2847 tvp->v_vfsp = rootvfs;
2848 2848 vfs_mntdummyvp = tvp;
2849 2849 }
2850 2850
2851 2851 /*
2852 2852 * performs fake read/write ops
2853 2853 */
2854 2854 static void
2855 2855 vfs_mnttab_rwop(int rw)
2856 2856 {
2857 2857 struct uio uio;
2858 2858 struct iovec iov;
2859 2859 char buf[1];
2860 2860
2861 2861 if (vfs_mntdummyvp == NULL)
2862 2862 return;
2863 2863
2864 2864 bzero(&uio, sizeof (uio));
2865 2865 bzero(&iov, sizeof (iov));
2866 2866 iov.iov_base = buf;
2867 2867 iov.iov_len = 0;
2868 2868 uio.uio_iov = &iov;
2869 2869 uio.uio_iovcnt = 1;
2870 2870 uio.uio_loffset = 0;
2871 2871 uio.uio_segflg = UIO_SYSSPACE;
2872 2872 uio.uio_resid = 0;
2873 2873 if (rw) {
2874 2874 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2875 2875 } else {
2876 2876 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2877 2877 }
2878 2878 }
2879 2879
2880 2880 /*
2881 2881 * Generate a write operation.
2882 2882 */
2883 2883 void
2884 2884 vfs_mnttab_writeop(void)
2885 2885 {
2886 2886 vfs_mnttab_rwop(1);
2887 2887 }
2888 2888
2889 2889 /*
2890 2890 * Generate a read operation.
2891 2891 */
2892 2892 void
2893 2893 vfs_mnttab_readop(void)
2894 2894 {
2895 2895 vfs_mnttab_rwop(0);
2896 2896 }
2897 2897
2898 2898 /*
2899 2899 * Free any mnttab information recorded in the vfs struct.
2900 2900 * The vfs must not be on the vfs list.
2901 2901 */
2902 2902 static void
2903 2903 vfs_freemnttab(struct vfs *vfsp)
2904 2904 {
2905 2905 ASSERT(!VFS_ON_LIST(vfsp));
2906 2906
2907 2907 /*
2908 2908 * Free device and mount point information
2909 2909 */
2910 2910 if (vfsp->vfs_mntpt != NULL) {
2911 2911 refstr_rele(vfsp->vfs_mntpt);
2912 2912 vfsp->vfs_mntpt = NULL;
2913 2913 }
2914 2914 if (vfsp->vfs_resource != NULL) {
2915 2915 refstr_rele(vfsp->vfs_resource);
2916 2916 vfsp->vfs_resource = NULL;
2917 2917 }
2918 2918 /*
2919 2919 * Now free mount options information
2920 2920 */
2921 2921 vfs_freeopttbl(&vfsp->vfs_mntopts);
2922 2922 }
2923 2923
2924 2924 /*
2925 2925 * Return the last mnttab modification time
2926 2926 */
2927 2927 void
2928 2928 vfs_mnttab_modtime(timespec_t *ts)
2929 2929 {
2930 2930 ASSERT(RW_LOCK_HELD(&vfslist));
2931 2931 *ts = vfs_mnttab_mtime;
2932 2932 }
2933 2933
2934 2934 /*
2935 2935 * See if mnttab is changed
2936 2936 */
2937 2937 void
2938 2938 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2939 2939 {
2940 2940 int changed;
2941 2941
2942 2942 *phpp = (struct pollhead *)NULL;
2943 2943
2944 2944 /*
2945 2945 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2946 2946 * Doing so can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2947 2947 * to not grab the vfs list lock because tv_sec is monotonically
2948 2948 * increasing.
2949 2949 */
2950 2950
2951 2951 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2952 2952 (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2953 2953 if (!changed) {
2954 2954 *phpp = &vfs_pollhd;
2955 2955 }
2956 2956 }
2957 2957
2958 2958 /* Provide a unique and monotonically-increasing timestamp. */
2959 2959 void
2960 2960 vfs_mono_time(timespec_t *ts)
2961 2961 {
2962 2962 static volatile hrtime_t hrt; /* The saved time. */
2963 2963 hrtime_t newhrt, oldhrt; /* For effecting the CAS. */
2964 2964 timespec_t newts;
2965 2965
2966 2966 /*
2967 2967 * Try gethrestime() first, but be prepared to fabricate a sensible
2968 2968 * answer at the first sign of any trouble.
2969 2969 */
2970 2970 gethrestime(&newts);
2971 2971 newhrt = ts2hrt(&newts);
2972 2972 for (;;) {
2973 2973 oldhrt = hrt;
2974 2974 if (newhrt <= hrt)
2975 2975 newhrt = hrt + 1;
2976 2976 if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
2977 2977 break;
2978 2978 }
2979 2979 hrt2ts(newhrt, ts);
2980 2980 }
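
The same pattern can be expressed in user space with C11 atomics. The sketch below (mono_unique_ns is a hypothetical helper, not part of vfs.c) retries the compare-and-swap until it publishes a value strictly greater than the last one handed out:

#include <stdatomic.h>
#include <stdint.h>
#include <time.h>

static _Atomic uint64_t last_ns;

static uint64_t
mono_unique_ns(void)
{
        struct timespec ts;
        uint64_t now, prev;

        (void) clock_gettime(CLOCK_REALTIME, &ts);
        now = (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;

        prev = atomic_load(&last_ns);
        for (;;) {
                uint64_t want = (now > prev) ? now : prev + 1;

                /* On failure, prev is reloaded with the current value. */
                if (atomic_compare_exchange_weak(&last_ns, &prev, want))
                        return (want);
        }
}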
2981 2981
2982 2982 /*
2983 2983 * Update the mnttab modification time and wake up any waiters for
2984 2984 * mnttab changes
2985 2985 */
2986 2986 void
2987 2987 vfs_mnttab_modtimeupd()
2988 2988 {
2989 2989 hrtime_t oldhrt, newhrt;
2990 2990
2991 2991 ASSERT(RW_WRITE_HELD(&vfslist));
2992 2992 oldhrt = ts2hrt(&vfs_mnttab_mtime);
2993 2993 gethrestime(&vfs_mnttab_mtime);
2994 2994 newhrt = ts2hrt(&vfs_mnttab_mtime);
2995 2995 if (oldhrt == (hrtime_t)0)
2996 2996 vfs_mnttab_ctime = vfs_mnttab_mtime;
2997 2997 /*
2998 2998 * Attempt to provide unique mtime (like uniqtime but not).
2999 2999 */
3000 3000 if (newhrt == oldhrt) {
3001 3001 newhrt++;
3002 3002 hrt2ts(newhrt, &vfs_mnttab_mtime);
3003 3003 }
3004 3004 pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
3005 3005 vfs_mnttab_writeop();
3006 3006 }
3007 3007
3008 3008 int
3009 3009 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
3010 3010 {
3011 3011 vnode_t *coveredvp;
3012 3012 int error;
3013 3013 extern void teardown_vopstats(vfs_t *);
3014 3014
3015 3015 /*
3016 3016 * Get covered vnode. This will be NULL if the vfs is not linked
3017 3017 * into the file system name space (i.e., domount() with MNT_NOSPICE).
3018 3018 * into the file system name space (i.e., domount() with MS_NOSPLICE).
3019 3019 coveredvp = vfsp->vfs_vnodecovered;
3020 3020 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
3021 3021
3022 3022 /*
3023 3023 * Purge all dnlc entries for this vfs.
3024 3024 */
3025 3025 (void) dnlc_purge_vfsp(vfsp, 0);
3026 3026
3027 3027 /* For forcible umount, skip VFS_SYNC() since it may hang */
3028 3028 if ((flag & MS_FORCE) == 0)
3029 3029 (void) VFS_SYNC(vfsp, 0, cr);
3030 3030
3031 3031 /*
3032 3032 * Lock the vfs to maintain fs status quo during unmount. This
3033 3033 * has to be done after the sync because ufs_update tries to acquire
3034 3034 * the vfs_reflock.
3035 3035 */
3036 3036 vfs_lock_wait(vfsp);
3037 3037
3038 3038 if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
3039 3039 vfs_unlock(vfsp);
3040 3040 if (coveredvp != NULL)
3041 3041 vn_vfsunlock(coveredvp);
3042 3042 } else if (coveredvp != NULL) {
3043 3043 teardown_vopstats(vfsp);
3044 3044 /*
3045 3045 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
3046 3046 * when it frees vfsp so we do a VN_HOLD() so we can
3047 3047 * continue to use coveredvp afterwards.
3048 3048 */
3049 3049 VN_HOLD(coveredvp);
3050 3050 vfs_remove(vfsp);
3051 3051 vn_vfsunlock(coveredvp);
3052 3052 VN_RELE(coveredvp);
3053 3053 } else {
3054 3054 teardown_vopstats(vfsp);
3055 3055 /*
3056 3056 * Release the reference to vfs that is not linked
3057 3057 * into the name space.
3058 3058 */
3059 3059 vfs_unlock(vfsp);
3060 3060 VFS_RELE(vfsp);
3061 3061 }
3062 3062 return (error);
3063 3063 }
3064 3064
3065 3065
3066 3066 /*
3067 3067 * Vfs_unmountall() is called by uadmin() to unmount all
3068 3068 * mounted file systems (except the root file system) during shutdown.
3069 3069 * It follows the existing locking protocol when traversing the vfs list
3070 3070 * to sync and unmount vfses. Even though there should be no
3071 3071 * other thread running while the system is shutting down, it is prudent
3072 3072 * to still follow the locking protocol.
3073 3073 */
3074 3074 void
3075 3075 vfs_unmountall(void)
3076 3076 {
3077 3077 struct vfs *vfsp;
3078 3078 struct vfs *prev_vfsp = NULL;
3079 3079 int error;
3080 3080
3081 3081 /*
3082 3082 * Toss all dnlc entries now so that the per-vfs sync
3083 3083 * and unmount operations don't have to slog through
3084 3084 * a bunch of uninteresting vnodes over and over again.
3085 3085 */
3086 3086 dnlc_purge();
3087 3087
3088 3088 vfs_list_lock();
3089 3089 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
3090 3090 prev_vfsp = vfsp->vfs_prev;
3091 3091
3092 3092 if (vfs_lock(vfsp) != 0)
3093 3093 continue;
3094 3094 error = vn_vfswlock(vfsp->vfs_vnodecovered);
3095 3095 vfs_unlock(vfsp);
3096 3096 if (error)
3097 3097 continue;
3098 3098
3099 3099 vfs_list_unlock();
3100 3100
3101 3101 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
3102 3102 (void) dounmount(vfsp, 0, CRED());
3103 3103
3104 3104 /*
3105 3105 * Since we dropped the vfslist lock above we must
3106 3106 * verify that prev_vfsp still exists, else start over.
3107 3107 */
3108 3108 vfs_list_lock();
3109 3109 for (vfsp = rootvfs->vfs_prev;
3110 3110 vfsp != rootvfs; vfsp = vfsp->vfs_prev)
3111 3111 if (vfsp == prev_vfsp)
3112 3112 break;
3113 3113 if (vfsp == rootvfs && prev_vfsp != rootvfs)
3114 3114 prev_vfsp = rootvfs->vfs_prev;
3115 3115 }
3116 3116 vfs_list_unlock();
3117 3117 }
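
The traversal discipline above, dropping the list lock for the slow per-entry work and then re-validating the saved predecessor before continuing, is sketched below for an ordinary circular doubly linked list guarded by a pthread mutex (hypothetical names, not part of vfs.c):

#include <pthread.h>

struct node {
        struct node *next;
        struct node *prev;
};

static struct node head = { &head, &head };     /* sentinel */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void
for_each_unlocked(void (*fn)(struct node *))
{
        struct node *np, *prev_np;

        (void) pthread_mutex_lock(&list_lock);
        for (np = head.prev; np != &head; np = prev_np) {
                prev_np = np->prev;
                (void) pthread_mutex_unlock(&list_lock);

                fn(np);         /* slow work with the list lock dropped */

                /* Re-validate prev_np: it may have been unlinked meanwhile. */
                (void) pthread_mutex_lock(&list_lock);
                for (np = head.prev; np != &head; np = np->prev)
                        if (np == prev_np)
                                break;
                if (np == &head && prev_np != &head)
                        prev_np = head.prev;    /* start over from the tail */
        }
        (void) pthread_mutex_unlock(&list_lock);
}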
3118 3118
3119 3119 /*
3120 3120 * Called to add an entry to the end of the vfs mount in progress list
3121 3121 */
3122 3122 void
3123 3123 vfs_addmip(dev_t dev, struct vfs *vfsp)
3124 3124 {
3125 3125 struct ipmnt *mipp;
3126 3126
3127 3127 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3128 3128 mipp->mip_next = NULL;
3129 3129 mipp->mip_dev = dev;
3130 3130 mipp->mip_vfsp = vfsp;
3131 3131 mutex_enter(&vfs_miplist_mutex);
3132 3132 if (vfs_miplist_end != NULL)
3133 3133 vfs_miplist_end->mip_next = mipp;
3134 3134 else
3135 3135 vfs_miplist = mipp;
3136 3136 vfs_miplist_end = mipp;
3137 3137 mutex_exit(&vfs_miplist_mutex);
3138 3138 }
3139 3139
3140 3140 /*
3141 3141 * Called to remove an entry from the mount in progress list
3142 3142 * Either because the mount completed or it failed.
3143 3143 */
3144 3144 void
3145 3145 vfs_delmip(struct vfs *vfsp)
3146 3146 {
3147 3147 struct ipmnt *mipp, *mipprev;
3148 3148
3149 3149 mutex_enter(&vfs_miplist_mutex);
3150 3150 mipprev = NULL;
3151 3151 for (mipp = vfs_miplist;
3152 3152 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3153 3153 mipprev = mipp;
3154 3154 }
3155 3155 if (mipp == NULL)
3156 3156 return; /* shouldn't happen */
3157 3157 if (mipp == vfs_miplist_end)
3158 3158 vfs_miplist_end = mipprev;
3159 3159 if (mipprev == NULL)
3160 3160 vfs_miplist = mipp->mip_next;
3161 3161 else
3162 3162 mipprev->mip_next = mipp->mip_next;
3163 3163 mutex_exit(&vfs_miplist_mutex);
3164 3164 kmem_free(mipp, sizeof (struct ipmnt));
3165 3165 }
3166 3166
3167 3167 /*
3168 3168 * vfs_add is called by a specific filesystem's mount routine to add
3169 3169 * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3170 3170 * The vfs should already have been locked by the caller.
3171 3171 *
3172 3172 * coveredvp is NULL if this is the root.
3173 3173 */
3174 3174 void
3175 3175 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3176 3176 {
3177 3177 int newflag;
3178 3178
3179 3179 ASSERT(vfs_lock_held(vfsp));
3180 3180 VFS_HOLD(vfsp);
3181 3181 newflag = vfsp->vfs_flag;
3182 3182 if (mflag & MS_RDONLY)
3183 3183 newflag |= VFS_RDONLY;
3184 3184 else
3185 3185 newflag &= ~VFS_RDONLY;
3186 3186 if (mflag & MS_NOSUID)
3187 3187 newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3188 3188 else
3189 3189 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3190 3190 if (mflag & MS_NOMNTTAB)
3191 3191 newflag |= VFS_NOMNTTAB;
3192 3192 else
3193 3193 newflag &= ~VFS_NOMNTTAB;
3194 3194
3195 3195 if (coveredvp != NULL) {
3196 3196 ASSERT(vn_vfswlock_held(coveredvp));
3197 3197 coveredvp->v_vfsmountedhere = vfsp;
3198 3198 VN_HOLD(coveredvp);
3199 3199 }
3200 3200 vfsp->vfs_vnodecovered = coveredvp;
3201 3201 vfsp->vfs_flag = newflag;
3202 3202
3203 3203 vfs_list_add(vfsp);
3204 3204 }
3205 3205
3206 3206 /*
3207 3207 * Remove a vfs from the vfs list, null out the pointer from the
3208 3208 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3209 3209 * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3210 3210 * reference to the vfs and to the covered vnode.
3211 3211 *
3212 3212 * Called from dounmount after it's confirmed with the file system
3213 3213 * that the unmount is legal.
3214 3214 */
3215 3215 void
3216 3216 vfs_remove(struct vfs *vfsp)
3217 3217 {
3218 3218 vnode_t *vp;
3219 3219
3220 3220 ASSERT(vfs_lock_held(vfsp));
3221 3221
3222 3222 /*
3223 3223 * Can't unmount root. Should never happen because fs will
3224 3224 * be busy.
3225 3225 */
3226 3226 if (vfsp == rootvfs)
3227 3227 panic("vfs_remove: unmounting root");
3228 3228
3229 3229 vfs_list_remove(vfsp);
3230 3230
3231 3231 /*
3232 3232 * Unhook from the file system name space.
3233 3233 */
3234 3234 vp = vfsp->vfs_vnodecovered;
3235 3235 ASSERT(vn_vfswlock_held(vp));
3236 3236 vp->v_vfsmountedhere = NULL;
3237 3237 vfsp->vfs_vnodecovered = NULL;
3238 3238 VN_RELE(vp);
3239 3239
3240 3240 /*
3241 3241 * Release lock and wakeup anybody waiting.
3242 3242 */
3243 3243 vfs_unlock(vfsp);
3244 3244 VFS_RELE(vfsp);
3245 3245 }
3246 3246
3247 3247 /*
3248 3248 * Lock a filesystem to prevent access to it while mounting,
3249 3249 * unmounting and syncing. Return EBUSY immediately if lock
3250 3250 * can't be acquired.
3251 3251 */
3252 3252 int
3253 3253 vfs_lock(vfs_t *vfsp)
3254 3254 {
3255 3255 vn_vfslocks_entry_t *vpvfsentry;
3256 3256
3257 3257 vpvfsentry = vn_vfslocks_getlock(vfsp);
3258 3258 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3259 3259 return (0);
3260 3260
3261 3261 vn_vfslocks_rele(vpvfsentry);
3262 3262 return (EBUSY);
3263 3263 }
3264 3264
3265 3265 int
3266 3266 vfs_rlock(vfs_t *vfsp)
3267 3267 {
3268 3268 vn_vfslocks_entry_t *vpvfsentry;
3269 3269
3270 3270 vpvfsentry = vn_vfslocks_getlock(vfsp);
3271 3271
3272 3272 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3273 3273 return (0);
3274 3274
3275 3275 vn_vfslocks_rele(vpvfsentry);
3276 3276 return (EBUSY);
3277 3277 }
3278 3278
3279 3279 void
3280 3280 vfs_lock_wait(vfs_t *vfsp)
3281 3281 {
3282 3282 vn_vfslocks_entry_t *vpvfsentry;
3283 3283
3284 3284 vpvfsentry = vn_vfslocks_getlock(vfsp);
3285 3285 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3286 3286 }
3287 3287
3288 3288 void
3289 3289 vfs_rlock_wait(vfs_t *vfsp)
3290 3290 {
3291 3291 vn_vfslocks_entry_t *vpvfsentry;
3292 3292
3293 3293 vpvfsentry = vn_vfslocks_getlock(vfsp);
3294 3294 rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3295 3295 }
3296 3296
3297 3297 /*
3298 3298 * Unlock a locked filesystem.
3299 3299 */
3300 3300 void
3301 3301 vfs_unlock(vfs_t *vfsp)
3302 3302 {
3303 3303 vn_vfslocks_entry_t *vpvfsentry;
3304 3304
3305 3305 /*
3306 3306 * vfs_unlock will mimic sema_v behaviour to fix 4748018.
3307 3307 * This behaviour should remain unchanged in patch releases.
3308 3308 */
3309 3309 if (panicstr)
3310 3310 return;
3311 3311
3312 3312 /*
3313 3313 * ve_refcount needs to be dropped twice here.
3314 3314 * 1. To release the reference taken by the vn_vfslocks_getlock() call below.
3315 3315 * 2. To release the reference taken by the locking routines such as
3316 3316 * vfs_rlock_wait/vfs_lock_wait/vfs_lock, etc.
3317 3317 */
3318 3318
3319 3319 vpvfsentry = vn_vfslocks_getlock(vfsp);
3320 3320 vn_vfslocks_rele(vpvfsentry);
3321 3321
3322 3322 rwst_exit(&vpvfsentry->ve_lock);
3323 3323 vn_vfslocks_rele(vpvfsentry);
3324 3324 }
3325 3325
3326 3326 /*
3327 3327 * Utility routine that allows a filesystem to construct its
3328 3328 * fsid in "the usual way" - by munging some underlying dev_t and
3329 3329 * the filesystem type number into the 64-bit fsid. Note that
3330 3330 * this implicitly relies on dev_t persistence to make filesystem
3331 3331 * id's persistent.
3332 3332 *
3333 3333 * There's nothing to prevent an individual fs from constructing its
3334 3334 * fsid in a different way, and indeed they should.
3335 3335 *
3336 3336 * Since we want fsids to be 32-bit quantities (so that they can be
3337 3337 * exported identically by either 32-bit or 64-bit APIs, as well as
3338 3338 * the fact that fsid's are "known" to NFS), we compress the device
3339 3339 * number given down to 32-bits, and panic if that isn't possible.
3340 3340 */
3341 3341 void
3342 3342 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3343 3343 {
3344 3344 if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3345 3345 panic("device number too big for fsid!");
3346 3346 fsi->val[1] = val;
3347 3347 }
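
For illustration only, a user-space analogue of packing a device number into 32 bits might look like the sketch below; the field widths here are assumptions for the example, not the actual cmpldev() layout:

#include <stdint.h>

#define MINOR_BITS      18      /* hypothetical field widths */
#define MAJOR_BITS      (32 - MINOR_BITS)

static int
make_fsid32(uint32_t *out, uint32_t major, uint32_t minor)
{
        if (major >= (1u << MAJOR_BITS) || minor >= (1u << MINOR_BITS))
                return (-1);    /* device number too big for fsid */
        *out = (major << MINOR_BITS) | minor;
        return (0);
}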
3348 3348
3349 3349 int
3350 3350 vfs_lock_held(vfs_t *vfsp)
3351 3351 {
3352 3352 int held;
3353 3353 vn_vfslocks_entry_t *vpvfsentry;
3354 3354
3355 3355 /*
3356 3356 * vfs_lock_held will mimic sema_held behaviour
3357 3357 * if panicstr is set. This behaviour should remain
3358 3358 * unchanged in patch releases.
3359 3359 */
3360 3360 if (panicstr)
3361 3361 return (1);
3362 3362
3363 3363 vpvfsentry = vn_vfslocks_getlock(vfsp);
3364 3364 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3365 3365
3366 3366 vn_vfslocks_rele(vpvfsentry);
3367 3367 return (held);
3368 3368 }
3369 3369
3370 3370 struct _kthread *
3371 3371 vfs_lock_owner(vfs_t *vfsp)
3372 3372 {
3373 3373 struct _kthread *owner;
3374 3374 vn_vfslocks_entry_t *vpvfsentry;
3375 3375
3376 3376 /*
3377 3377 * vfs_lock_owner will mimic sema_held behaviour
3378 3378 * if panicstr is set. This behaviour should remain
3379 3379 * unchanged in patch releases.
3380 3380 */
3381 3381 if (panicstr)
3382 3382 return (NULL);
3383 3383
3384 3384 vpvfsentry = vn_vfslocks_getlock(vfsp);
3385 3385 owner = rwst_owner(&vpvfsentry->ve_lock);
3386 3386
3387 3387 vn_vfslocks_rele(vpvfsentry);
3388 3388 return (owner);
3389 3389 }
3390 3390
3391 3391 /*
3392 3392 * vfs list locking.
3393 3393 *
3394 3394 * Rather than manipulate the vfslist lock directly, we abstract into lock
3395 3395 * and unlock routines to allow the locking implementation to be changed for
3396 3396 * clustering.
3397 3397 *
3398 3398 * Whenever the vfs list is modified through its hash links, the overall list
3399 3399 * lock must be obtained before locking the relevant hash bucket. But to see
3400 3400 * whether a given vfs is on the list, it suffices to obtain the lock for the
3401 3401 * hash bucket without getting the overall list lock. (See getvfs() below.)
3402 3402 */
3403 3403
3404 3404 void
3405 3405 vfs_list_lock()
3406 3406 {
3407 3407 rw_enter(&vfslist, RW_WRITER);
3408 3408 }
3409 3409
3410 3410 void
3411 3411 vfs_list_read_lock()
3412 3412 {
3413 3413 rw_enter(&vfslist, RW_READER);
3414 3414 }
3415 3415
3416 3416 void
3417 3417 vfs_list_unlock()
3418 3418 {
3419 3419 rw_exit(&vfslist);
3420 3420 }
3421 3421
3422 3422 /*
3423 3423 * Low level worker routines for adding entries to and removing entries from
3424 3424 * the vfs list.
3425 3425 */
3426 3426
3427 3427 static void
3428 3428 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3429 3429 {
3430 3430 int vhno;
3431 3431 struct vfs **hp;
3432 3432 dev_t dev;
3433 3433
3434 3434 ASSERT(RW_WRITE_HELD(&vfslist));
3435 3435
3436 3436 dev = expldev(vfsp->vfs_fsid.val[0]);
3437 3437 vhno = VFSHASH(getmajor(dev), getminor(dev));
3438 3438
3439 3439 mutex_enter(&rvfs_list[vhno].rvfs_lock);
3440 3440
3441 3441 /*
3442 3442 * Link into the hash table, inserting it at the end, so that LOFS
3443 3443 * with the same fsid as UFS (or other) file systems will not hide the
3444 3444 * UFS.
3445 3445 */
3446 3446 if (insert_at_head) {
3447 3447 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3448 3448 rvfs_list[vhno].rvfs_head = vfsp;
3449 3449 } else {
3450 3450 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3451 3451 hp = &(*hp)->vfs_hash)
3452 3452 continue;
3453 3453 /*
3454 3454 * hp now contains the address of the pointer to update
3455 3455 * to effect the insertion.
3456 3456 */
3457 3457 vfsp->vfs_hash = NULL;
3458 3458 *hp = vfsp;
3459 3459 }
3460 3460
3461 3461 rvfs_list[vhno].rvfs_len++;
3462 3462 mutex_exit(&rvfs_list[vhno].rvfs_lock);
3463 3463 }
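
The tail insertion above relies on the pointer-to-pointer idiom: hp walks through the head pointer and each vfs_hash field, so the empty-bucket case needs no special handling. A generic sketch (hypothetical names, not part of vfs.c):

#include <stddef.h>

struct node {
        struct node *next;
        int val;
};

static void
append(struct node **headp, struct node *n)
{
        struct node **hp;

        for (hp = headp; *hp != NULL; hp = &(*hp)->next)
                continue;
        n->next = NULL;
        *hp = n;        /* *hp is either the head or the last node's next */
}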
3464 3464
3465 3465
3466 3466 static void
3467 3467 vfs_hash_remove(struct vfs *vfsp)
3468 3468 {
3469 3469 int vhno;
3470 3470 struct vfs *tvfsp;
3471 3471 dev_t dev;
3472 3472
3473 3473 ASSERT(RW_WRITE_HELD(&vfslist));
3474 3474
3475 3475 dev = expldev(vfsp->vfs_fsid.val[0]);
3476 3476 vhno = VFSHASH(getmajor(dev), getminor(dev));
3477 3477
3478 3478 mutex_enter(&rvfs_list[vhno].rvfs_lock);
3479 3479
3480 3480 /*
3481 3481 * Remove from hash.
3482 3482 */
3483 3483 if (rvfs_list[vhno].rvfs_head == vfsp) {
3484 3484 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3485 3485 rvfs_list[vhno].rvfs_len--;
3486 3486 goto foundit;
3487 3487 }
3488 3488 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3489 3489 tvfsp = tvfsp->vfs_hash) {
3490 3490 if (tvfsp->vfs_hash == vfsp) {
3491 3491 tvfsp->vfs_hash = vfsp->vfs_hash;
3492 3492 rvfs_list[vhno].rvfs_len--;
3493 3493 goto foundit;
3494 3494 }
3495 3495 }
3496 3496 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3497 3497
3498 3498 foundit:
3499 3499
3500 3500 mutex_exit(&rvfs_list[vhno].rvfs_lock);
3501 3501 }
3502 3502
3503 3503
3504 3504 void
3505 3505 vfs_list_add(struct vfs *vfsp)
3506 3506 {
3507 3507 zone_t *zone;
3508 3508
3509 3509 /*
3510 3510 * Typically, the vfs_t will have been created on behalf of the file
3511 3511 * system in vfs_init, where it will have been provided with a
3512 3512 * vfs_impl_t. This, however, might be lacking if the vfs_t was created
3513 3513 * by an unbundled file system. We therefore check for such an example
3514 3514 * before stamping the vfs_t with its creation time for the benefit of
3515 3515 * mntfs.
3516 3516 */
3517 3517 if (vfsp->vfs_implp == NULL)
3518 3518 vfsimpl_setup(vfsp);
3519 3519 vfs_mono_time(&vfsp->vfs_hrctime);
3520 3520
3521 3521 /*
3522 3522 * The zone that owns the mount is the one that performed the mount.
3523 3523 * Note that this isn't necessarily the same as the zone mounted into.
3524 3524 * The corresponding zone_rele_ref() will be done when the vfs_t
3525 3525 * is being free'd.
3526 3526 */
3527 3527 vfsp->vfs_zone = curproc->p_zone;
3528 3528 zone_init_ref(&vfsp->vfs_implp->vi_zone_ref);
3529 3529 zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref,
3530 3530 ZONE_REF_VFS);
3531 3531
3532 3532 /*
3533 3533 * Find the zone mounted into, and put this mount on its vfs list.
3534 3534 */
3535 3535 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3536 3536 ASSERT(zone != NULL);
3537 3537 /*
3538 3538 * Special casing for the root vfs. This structure is allocated
3539 3539 * statically and hooked onto rootvfs at link time. During the
3540 3540 * vfs_mountroot call at system startup time, the root file system's
3541 3541 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3542 3542 * as argument. The code below must detect and handle this special
3543 3543 * case. The only apparent justification for this special casing is
3544 3544 * to ensure that the root file system appears at the head of the
3545 3545 * list.
3546 3546 *
3547 3547 * XXX: I'm assuming that it's ok to do normal list locking when
3548 3548 * adding the entry for the root file system (this used to be
3549 3549 * done with no locks held).
3550 3550 */
3551 3551 vfs_list_lock();
3552 3552 /*
3553 3553 * Link into the vfs list proper.
3554 3554 */
3555 3555 if (vfsp == &root) {
3556 3556 /*
3557 3557 * Assert: This vfs is already on the list as its first entry.
3558 3558 * Thus, there's nothing to do.
3559 3559 */
3560 3560 ASSERT(rootvfs == vfsp);
3561 3561 /*
3562 3562 * Add it to the head of the global zone's vfslist.
3563 3563 */
3564 3564 ASSERT(zone == global_zone);
3565 3565 ASSERT(zone->zone_vfslist == NULL);
3566 3566 zone->zone_vfslist = vfsp;
3567 3567 } else {
3568 3568 /*
3569 3569 * Link to end of list using vfs_prev (as rootvfs is now a
3570 3570 * doubly linked circular list) so list is in mount order for
3571 3571 * mnttab use.
3572 3572 */
3573 3573 rootvfs->vfs_prev->vfs_next = vfsp;
3574 3574 vfsp->vfs_prev = rootvfs->vfs_prev;
3575 3575 rootvfs->vfs_prev = vfsp;
3576 3576 vfsp->vfs_next = rootvfs;
3577 3577
3578 3578 /*
3579 3579 * Do it again for the zone-private list (which may be NULL).
3580 3580 */
3581 3581 if (zone->zone_vfslist == NULL) {
3582 3582 ASSERT(zone != global_zone);
3583 3583 zone->zone_vfslist = vfsp;
3584 3584 } else {
3585 3585 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3586 3586 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3587 3587 zone->zone_vfslist->vfs_zone_prev = vfsp;
3588 3588 vfsp->vfs_zone_next = zone->zone_vfslist;
3589 3589 }
3590 3590 }
3591 3591
3592 3592 /*
3593 3593 * Link into the hash table, inserting it at the end, so that LOFS
3594 3594 * with the same fsid as UFS (or other) file systems will not hide
3595 3595 * the UFS.
3596 3596 */
3597 3597 vfs_hash_add(vfsp, 0);
3598 3598
3599 3599 /*
3600 3600 * update the mnttab modification time
3601 3601 */
3602 3602 vfs_mnttab_modtimeupd();
3603 3603 vfs_list_unlock();
3604 3604 zone_rele(zone);
3605 3605 }
3606 3606
3607 3607 void
3608 3608 vfs_list_remove(struct vfs *vfsp)
3609 3609 {
3610 3610 zone_t *zone;
3611 3611
3612 3612 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3613 3613 ASSERT(zone != NULL);
3614 3614 /*
3615 3615 * Callers are responsible for preventing attempts to unmount the
3616 3616 * root.
3617 3617 */
3618 3618 ASSERT(vfsp != rootvfs);
3619 3619
3620 3620 vfs_list_lock();
3621 3621
3622 3622 /*
3623 3623 * Remove from hash.
3624 3624 */
3625 3625 vfs_hash_remove(vfsp);
3626 3626
3627 3627 /*
3628 3628 * Remove from vfs list.
3629 3629 */
3630 3630 vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3631 3631 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3632 3632 vfsp->vfs_next = vfsp->vfs_prev = NULL;
3633 3633
3634 3634 /*
3635 3635 * Remove from zone-specific vfs list.
3636 3636 */
3637 3637 if (zone->zone_vfslist == vfsp)
3638 3638 zone->zone_vfslist = vfsp->vfs_zone_next;
3639 3639
3640 3640 if (vfsp->vfs_zone_next == vfsp) {
3641 3641 ASSERT(vfsp->vfs_zone_prev == vfsp);
3642 3642 ASSERT(zone->zone_vfslist == vfsp);
3643 3643 zone->zone_vfslist = NULL;
3644 3644 }
3645 3645
3646 3646 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3647 3647 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3648 3648 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3649 3649
3650 3650 /*
3651 3651 * update the mnttab modification time
3652 3652 */
3653 3653 vfs_mnttab_modtimeupd();
3654 3654 vfs_list_unlock();
3655 3655 zone_rele(zone);
3656 3656 }
3657 3657
3658 3658 struct vfs *
3659 3659 getvfs(fsid_t *fsid)
3660 3660 {
3661 3661 struct vfs *vfsp;
3662 3662 int val0 = fsid->val[0];
3663 3663 int val1 = fsid->val[1];
3664 3664 dev_t dev = expldev(val0);
3665 3665 int vhno = VFSHASH(getmajor(dev), getminor(dev));
3666 3666 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3667 3667
3668 3668 mutex_enter(hmp);
3669 3669 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3670 3670 if (vfsp->vfs_fsid.val[0] == val0 &&
3671 3671 vfsp->vfs_fsid.val[1] == val1) {
3672 3672 VFS_HOLD(vfsp);
3673 3673 mutex_exit(hmp);
3674 3674 return (vfsp);
3675 3675 }
3676 3676 }
3677 3677 mutex_exit(hmp);
3678 3678 return (NULL);
3679 3679 }
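getvfs() returns the matching vfs_t with an extra hold taken via VFS_HOLD(), so callers are responsible for dropping that hold with VFS_RELE() once they are done with the structure. A minimal, hedged usage sketch (not part of vfs.c; the helper name is hypothetical):

	/*
	 * Hedged usage sketch: look up a mounted file system by fsid
	 * and drop the hold when finished.
	 */
	static void
	example_fsid_lookup(fsid_t *fsid)	/* hypothetical helper */
	{
		struct vfs *vfsp;

		if ((vfsp = getvfs(fsid)) != NULL) {
			/* ... inspect vfsp->vfs_dev, vfsp->vfs_mntpt, ... */
			VFS_RELE(vfsp);	/* release the hold getvfs() took */
		}
	}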
3680 3680
3681 3681 /*
3682 3682 * Search the vfs mount in progress list for a specified device/vfs entry.
3683 3683 * Returns 0 if the first entry in the list that the device matches has the
3684 3684 * given vfs pointer as well. If the device matches but a different vfs
3685 3685 * pointer is encountered in the list before the given vfs pointer then
3686 3686 * a 1 is returned.
3687 3687 */
3688 3688
3689 3689 int
3690 3690 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3691 3691 {
3692 3692 int retval = 0;
3693 3693 struct ipmnt *mipp;
3694 3694
3695 3695 mutex_enter(&vfs_miplist_mutex);
3696 3696 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3697 3697 if (mipp->mip_dev == dev) {
3698 3698 if (mipp->mip_vfsp != vfsp)
3699 3699 retval = 1;
3700 3700 break;
3701 3701 }
3702 3702 }
3703 3703 mutex_exit(&vfs_miplist_mutex);
3704 3704 return (retval);
3705 3705 }
3706 3706
3707 3707 /*
3708 3708 * Search the vfs list for a specified device. Returns 1, if entry is found
3709 3709 * or 0 if no suitable entry is found.
3710 3710 */
3711 3711
3712 3712 int
3713 3713 vfs_devismounted(dev_t dev)
3714 3714 {
3715 3715 struct vfs *vfsp;
3716 3716 int found;
3717 3717
3718 3718 vfs_list_read_lock();
3719 3719 vfsp = rootvfs;
3720 3720 found = 0;
3721 3721 do {
3722 3722 if (vfsp->vfs_dev == dev) {
3723 3723 found = 1;
3724 3724 break;
3725 3725 }
3726 3726 vfsp = vfsp->vfs_next;
3727 3727 } while (vfsp != rootvfs);
3728 3728
3729 3729 vfs_list_unlock();
3730 3730 return (found);
3731 3731 }
3732 3732
3733 3733 /*
3734 3734 * Search the vfs list for a specified device. Returns a pointer to it
3735 3735 * or NULL if no suitable entry is found. The caller of this routine
3736 3736 * is responsible for releasing the returned vfs pointer.
3737 3737 */
3738 3738 struct vfs *
3739 3739 vfs_dev2vfsp(dev_t dev)
3740 3740 {
3741 3741 struct vfs *vfsp;
3742 3742 int found;
3743 3743
3744 3744 vfs_list_read_lock();
3745 3745 vfsp = rootvfs;
3746 3746 found = 0;
3747 3747 do {
3748 3748 /*
3749 3749 * The following could be made more efficient by making
3750 3750 * the entire loop use vfs_zone_next if the call is from
3751 3751 * a zone. The only callers, however, ustat(2) and
3752 3752 * umount2(2), don't seem to justify the added
3753 3753 * complexity at present.
3754 3754 */
3755 3755 if (vfsp->vfs_dev == dev &&
3756 3756 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3757 3757 curproc->p_zone)) {
3758 3758 VFS_HOLD(vfsp);
3759 3759 found = 1;
3760 3760 break;
3761 3761 }
3762 3762 vfsp = vfsp->vfs_next;
3763 3763 } while (vfsp != rootvfs);
3764 3764 vfs_list_unlock();
3765 3765 return (found ? vfsp: NULL);
3766 3766 }
3767 3767
3768 3768 /*
3769 3769 * Search the vfs list for a specified mntpoint. Returns a pointer to it
3770 3770 * or NULL if no suitable entry is found. The caller of this routine
3771 3771 * is responsible for releasing the returned vfs pointer.
3772 3772 *
3773 3773 * Note that if multiple mntpoints match, the last one matching is
3774 3774 * returned in an attempt to return the "top" mount when overlay
3775 3775 * mounts are covering the same mount point. This is accomplished by starting
3776 3776 * at the end of the list and working our way backwards, stopping at the first
3777 3777 * matching mount.
3778 3778 */
3779 3779 struct vfs *
3780 3780 vfs_mntpoint2vfsp(const char *mp)
3781 3781 {
3782 3782 struct vfs *vfsp;
3783 3783 struct vfs *retvfsp = NULL;
3784 3784 zone_t *zone = curproc->p_zone;
3785 3785 struct vfs *list;
3786 3786
3787 3787 vfs_list_read_lock();
3788 3788 if (getzoneid() == GLOBAL_ZONEID) {
3789 3789 /*
3790 3790 * The global zone may see filesystems in any zone.
3791 3791 */
3792 3792 vfsp = rootvfs->vfs_prev;
3793 3793 do {
3794 3794 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3795 3795 retvfsp = vfsp;
3796 3796 break;
3797 3797 }
3798 3798 vfsp = vfsp->vfs_prev;
3799 3799 } while (vfsp != rootvfs->vfs_prev);
3800 3800 } else if ((list = zone->zone_vfslist) != NULL) {
3801 3801 const char *mntpt;
3802 3802
3803 3803 vfsp = list->vfs_zone_prev;
3804 3804 do {
3805 3805 mntpt = refstr_value(vfsp->vfs_mntpt);
3806 3806 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3807 3807 if (strcmp(mntpt, mp) == 0) {
3808 3808 retvfsp = vfsp;
3809 3809 break;
3810 3810 }
3811 3811 vfsp = vfsp->vfs_zone_prev;
3812 3812 } while (vfsp != list->vfs_zone_prev);
3813 3813 }
3814 3814 if (retvfsp)
3815 3815 VFS_HOLD(retvfsp);
3816 3816 vfs_list_unlock();
3817 3817 return (retvfsp);
3818 3818 }
3819 3819
3820 3820 /*
3821 3821 * Search the vfs list for a specified vfsops.
3822 3822 * if vfs entry is found then return 1, else 0.
3823 3823 */
3824 3824 int
3825 3825 vfs_opsinuse(vfsops_t *ops)
3826 3826 {
3827 3827 struct vfs *vfsp;
3828 3828 int found;
3829 3829
3830 3830 vfs_list_read_lock();
3831 3831 vfsp = rootvfs;
3832 3832 found = 0;
3833 3833 do {
3834 3834 if (vfs_getops(vfsp) == ops) {
3835 3835 found = 1;
3836 3836 break;
3837 3837 }
3838 3838 vfsp = vfsp->vfs_next;
3839 3839 } while (vfsp != rootvfs);
3840 3840 vfs_list_unlock();
3841 3841 return (found);
3842 3842 }
3843 3843
3844 3844 /*
3845 3845 * Allocate an entry in vfssw for a file system type
3846 3846 */
3847 3847 struct vfssw *
3848 3848 allocate_vfssw(const char *type)
3849 3849 {
3850 3850 struct vfssw *vswp;
3851 3851
3852 3852 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3853 3853 /*
3854 3854 * The vfssw table uses the empty string to identify an
3855 3855 * available entry; we cannot add any type which has
3856 3856 * a leading NUL. The string length is limited to
3857 3857 * the size of the st_fstype array in struct stat.
3858 3858 */
3859 3859 return (NULL);
3860 3860 }
3861 3861
3862 3862 ASSERT(VFSSW_WRITE_LOCKED());
3863 3863 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3864 3864 if (!ALLOCATED_VFSSW(vswp)) {
3865 3865 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3866 3866 (void) strcpy(vswp->vsw_name, type);
3867 3867 ASSERT(vswp->vsw_count == 0);
3868 3868 vswp->vsw_count = 1;
3869 3869 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3870 3870 return (vswp);
3871 3871 }
3872 3872 return (NULL);
3873 3873 }
3874 3874
3875 3875 /*
3876 3876 * Impose additional layer of translation between vfstype names
3877 3877 * and module names in the filesystem.
3878 3878 */
3879 3879 static const char *
3880 3880 vfs_to_modname(const char *vfstype)
3881 3881 {
3882 3882 if (strcmp(vfstype, "proc") == 0) {
3883 3883 vfstype = "procfs";
3884 3884 } else if (strcmp(vfstype, "fd") == 0) {
3885 3885 vfstype = "fdfs";
3886 3886 } else if (strncmp(vfstype, "nfs", 3) == 0) {
3887 3887 vfstype = "nfs";
3888 3888 }
3889 3889
3890 3890 return (vfstype);
3891 3891 }
3892 3892
3893 3893 /*
3894 3894 * Find a vfssw entry given a file system type name.
3895 3895 * Try to autoload the filesystem if it's not found.
3896 3896 * If it's installed, return the vfssw locked to prevent unloading.
3897 3897 */
3898 3898 struct vfssw *
3899 3899 vfs_getvfssw(const char *type)
3900 3900 {
3901 3901 struct vfssw *vswp;
3902 3902 const char *modname;
3903 3903
3904 3904 RLOCK_VFSSW();
3905 3905 vswp = vfs_getvfsswbyname(type);
3906 3906 modname = vfs_to_modname(type);
3907 3907
3908 3908 if (rootdir == NULL) {
3909 3909 /*
3910 3910 * If we haven't yet loaded the root file system, then our
3911 3911 * _init won't be called until later. Allocate vfssw entry,
3912 3912 * because mod_installfs won't be called.
3913 3913 */
3914 3914 if (vswp == NULL) {
3915 3915 RUNLOCK_VFSSW();
3916 3916 WLOCK_VFSSW();
3917 3917 if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3918 3918 if ((vswp = allocate_vfssw(type)) == NULL) {
3919 3919 WUNLOCK_VFSSW();
3920 3920 return (NULL);
3921 3921 }
3922 3922 }
3923 3923 WUNLOCK_VFSSW();
3924 3924 RLOCK_VFSSW();
3925 3925 }
3926 3926 if (!VFS_INSTALLED(vswp)) {
3927 3927 RUNLOCK_VFSSW();
3928 3928 (void) modloadonly("fs", modname);
3929 3929 } else
3930 3930 RUNLOCK_VFSSW();
3931 3931 return (vswp);
3932 3932 }
3933 3933
3934 3934 /*
3935 3935 * Try to load the filesystem. Before calling modload(), we drop
3936 3936 * our lock on the VFS switch table, and pick it up after the
3937 3937 * module is loaded. However, there is a potential race: the
3938 3938 * module could be unloaded after the call to modload() completes
3939 3939 * but before we pick up the lock and drive on. Therefore,
3940 3940 * we keep reloading the module until we've loaded the module
3941 3941 * _and_ we have the lock on the VFS switch table.
3942 3942 */
3943 3943 while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3944 3944 RUNLOCK_VFSSW();
3945 3945 if (modload("fs", modname) == -1)
3946 3946 return (NULL);
3947 3947 RLOCK_VFSSW();
3948 3948 if (vswp == NULL)
3949 3949 if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3950 3950 break;
3951 3951 }
3952 3952 RUNLOCK_VFSSW();
3953 3953
3954 3954 return (vswp);
3955 3955 }
3956 3956
3957 3957 /*
3958 3958 * Find a vfssw entry given a file system type name.
3959 3959 */
3960 3960 struct vfssw *
3961 3961 vfs_getvfsswbyname(const char *type)
3962 3962 {
3963 3963 struct vfssw *vswp;
3964 3964
3965 3965 ASSERT(VFSSW_LOCKED());
3966 3966 if (type == NULL || *type == '\0')
3967 3967 return (NULL);
3968 3968
3969 3969 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3970 3970 if (strcmp(type, vswp->vsw_name) == 0) {
3971 3971 vfs_refvfssw(vswp);
3972 3972 return (vswp);
3973 3973 }
3974 3974 }
3975 3975
3976 3976 return (NULL);
3977 3977 }
3978 3978
3979 3979 /*
3980 3980 * Find a vfssw entry given a set of vfsops.
3981 3981 */
3982 3982 struct vfssw *
3983 3983 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3984 3984 {
3985 3985 struct vfssw *vswp;
3986 3986
3987 3987 RLOCK_VFSSW();
3988 3988 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3989 3989 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3990 3990 vfs_refvfssw(vswp);
3991 3991 RUNLOCK_VFSSW();
3992 3992 return (vswp);
3993 3993 }
3994 3994 }
3995 3995 RUNLOCK_VFSSW();
3996 3996
3997 3997 return (NULL);
3998 3998 }
3999 3999
4000 4000 /*
4001 4001 * Reference a vfssw entry.
4002 4002 */
4003 4003 void
4004 4004 vfs_refvfssw(struct vfssw *vswp)
4005 4005 {
4006 4006
4007 4007 mutex_enter(&vswp->vsw_lock);
4008 4008 vswp->vsw_count++;
4009 4009 mutex_exit(&vswp->vsw_lock);
4010 4010 }
4011 4011
4012 4012 /*
4013 4013 * Unreference a vfssw entry.
4014 4014 */
4015 4015 void
4016 4016 vfs_unrefvfssw(struct vfssw *vswp)
4017 4017 {
4018 4018
4019 4019 mutex_enter(&vswp->vsw_lock);
4020 4020 vswp->vsw_count--;
4021 4021 mutex_exit(&vswp->vsw_lock);
4022 4022 }
4023 4023
4024 4024 static int sync_retries = 20; /* number of retries when not making progress */
4025 4025 static int sync_triesleft; /* portion of sync_retries remaining */
4026 4026
4027 4027 static pgcnt_t old_pgcnt, new_pgcnt;
4028 4028 static int new_bufcnt, old_bufcnt;
4029 4029
4030 4030 /*
4031 4031 * Sync all of the mounted filesystems, and then wait for the actual i/o to
4032 4032 * complete. We wait by counting the number of dirty pages and buffers,
4033 4033 * pushing them out using bio_busy() and page_busy(), and then counting again.
4034 4034 * This routine is used during the uadmin A_SHUTDOWN code. It should only
4035 4035 * be used after some higher-level mechanism has quiesced the system so that
4036 4036 * new writes are not being initiated while we are waiting for completion.
4037 4037 *
4038 4038 * To ensure finite running time, our algorithm uses sync_triesleft (a progress
4039 4039 * counter used by the vfs_syncall() loop below). It is declared above so
4040 4040 * it can be found easily in the debugger.
4041 4041 *
4042 4042 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make
4043 4043 * sync_retries consecutive calls to bio_busy() and page_busy() without
4044 4044 * decreasing either the number of dirty buffers or dirty pages below the
4045 4045 * lowest count we have seen so far, we give up and return from vfs_syncall().
4046 4046 *
4047 4047 * Each loop iteration ends with a call to delay() one second to allow time for
4048 4048 * i/o completion and to permit the user time to read our progress messages.
4049 4049 */
4050 4050 void
4051 4051 vfs_syncall(void)
4052 4052 {
4053 4053 if (rootdir == NULL && !modrootloaded)
4054 4054 return; /* no filesystems have been loaded yet */
4055 4055
4056 4056 printf("syncing file systems...");
4057 4057 sync();
4058 4058
4059 4059 sync_triesleft = sync_retries;
4060 4060
4061 4061 old_bufcnt = new_bufcnt = INT_MAX;
4062 4062 old_pgcnt = new_pgcnt = ULONG_MAX;
4063 4063
4064 4064 while (sync_triesleft > 0) {
4065 4065 old_bufcnt = MIN(old_bufcnt, new_bufcnt);
4066 4066 old_pgcnt = MIN(old_pgcnt, new_pgcnt);
4067 4067
4068 4068 new_bufcnt = bio_busy(B_TRUE);
4069 4069 new_pgcnt = page_busy(B_TRUE);
4070 4070
4071 4071 if (new_bufcnt == 0 && new_pgcnt == 0)
4072 4072 break;
4073 4073
4074 4074 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
4075 4075 sync_triesleft = sync_retries;
4076 4076 else
4077 4077 sync_triesleft--;
4078 4078
4079 4079 if (new_bufcnt)
4080 4080 printf(" [%d]", new_bufcnt);
4081 4081 if (new_pgcnt)
4082 4082 printf(" %lu", new_pgcnt);
4083 4083
4084 4084 delay(hz);
4085 4085 }
4086 4086
4087 4087 if (new_bufcnt != 0 || new_pgcnt != 0)
4088 4088 printf(" done (not all i/o completed)\n");
4089 4089 else
4090 4090 printf(" done\n");
4091 4091
4092 4092 delay(hz);
4093 4093 }
4094 4094
4095 4095 /*
4096 4096 * Map VFS flags to statvfs flags. These shouldn't really be separate
4097 4097 * flags at all.
4098 4098 */
4099 4099 uint_t
4100 4100 vf_to_stf(uint_t vf)
4101 4101 {
4102 4102 uint_t stf = 0;
4103 4103
4104 4104 if (vf & VFS_RDONLY)
4105 4105 stf |= ST_RDONLY;
4106 4106 if (vf & VFS_NOSETUID)
4107 4107 stf |= ST_NOSUID;
4108 4108 if (vf & VFS_NOTRUNC)
4109 4109 stf |= ST_NOTRUNC;
4110 4110
4111 4111 return (stf);
4112 4112 }
4113 4113
4114 4114 /*
4115 4115 * Entries for (illegal) fstype 0.
4116 4116 */
4117 4117 /* ARGSUSED */
4118 4118 int
4119 4119 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
4120 4120 {
4121 4121 cmn_err(CE_PANIC, "stray vfs operation");
4122 4122 return (0);
4123 4123 }
4124 4124
4125 4125 /*
4126 4126 * Entries for (illegal) fstype 0.
4127 4127 */
4128 4128 int
4129 4129 vfsstray(void)
4130 4130 {
4131 4131 cmn_err(CE_PANIC, "stray vfs operation");
4132 4132 return (0);
4133 4133 }
4134 4134
4135 4135 /*
4136 4136 * Support for dealing with forced UFS unmount and its interaction with
4137 4137 * LOFS. Could be used by any filesystem.
4138 4138 * See bug 1203132.
4139 4139 */
4140 4140 int
4141 4141 vfs_EIO(void)
4142 4142 {
4143 4143 return (EIO);
4144 4144 }
4145 4145
4146 4146 /*
4147 4147 * We've gotta define the op for sync separately, since the compiler gets
4148 4148 * confused if we mix and match ANSI and normal style prototypes when
4149 4149 * a "short" argument is present and spits out a warning.
4150 4150 */
4151 4151 /*ARGSUSED*/
4152 4152 int
4153 4153 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
4154 4154 {
4155 4155 return (EIO);
4156 4156 }
4157 4157
4158 4158 vfs_t EIO_vfs;
4159 4159 vfsops_t *EIO_vfsops;
4160 4160
4161 4161 /*
4162 4162 * Called from startup() to initialize all loaded vfs's
4163 4163 */
4164 4164 void
4165 4165 vfsinit(void)
4166 4166 {
4167 4167 struct vfssw *vswp;
4168 4168 int error;
4169 4169 extern int vopstats_enabled;
4170 4170 extern void vopstats_startup();
4171 4171
4172 4172 static const fs_operation_def_t EIO_vfsops_template[] = {
4173 4173 VFSNAME_MOUNT, { .error = vfs_EIO },
4174 4174 VFSNAME_UNMOUNT, { .error = vfs_EIO },
4175 4175 VFSNAME_ROOT, { .error = vfs_EIO },
4176 4176 VFSNAME_STATVFS, { .error = vfs_EIO },
4177 4177 VFSNAME_SYNC, { .vfs_sync = vfs_EIO_sync },
4178 4178 VFSNAME_VGET, { .error = vfs_EIO },
4179 4179 VFSNAME_MOUNTROOT, { .error = vfs_EIO },
4180 4180 VFSNAME_FREEVFS, { .error = vfs_EIO },
4181 4181 VFSNAME_VNSTATE, { .error = vfs_EIO },
4182 4182 NULL, NULL
4183 4183 };
4184 4184
4185 4185 static const fs_operation_def_t stray_vfsops_template[] = {
4186 4186 VFSNAME_MOUNT, { .error = vfsstray },
4187 4187 VFSNAME_UNMOUNT, { .error = vfsstray },
4188 4188 VFSNAME_ROOT, { .error = vfsstray },
4189 4189 VFSNAME_STATVFS, { .error = vfsstray },
4190 4190 VFSNAME_SYNC, { .vfs_sync = vfsstray_sync },
4191 4191 VFSNAME_VGET, { .error = vfsstray },
4192 4192 VFSNAME_MOUNTROOT, { .error = vfsstray },
4193 4193 VFSNAME_FREEVFS, { .error = vfsstray },
4194 4194 VFSNAME_VNSTATE, { .error = vfsstray },
4195 4195 NULL, NULL
4196 4196 };
4197 4197
4198 4198 /* Create vfs cache */
4199 4199 vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4200 4200 sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4201 4201
4202 4202 /* Initialize the vnode cache (file systems may use it during init). */
4203 4203 vn_create_cache();
4204 4204
4205 4205 /* Setup event monitor framework */
4206 4206 fem_init();
4207 4207
4208 4208 /* Initialize the dummy stray file system type. */
4209 4209 error = vfs_setfsops(0, stray_vfsops_template, NULL);
4210 4210
4211 4211 /* Initialize the dummy EIO file system. */
4212 4212 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4213 4213 if (error != 0) {
4214 4214 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4215 4215 /* Shouldn't happen, but not bad enough to panic */
4216 4216 }
4217 4217
4218 4218 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4219 4219
4220 4220 /*
4221 4221 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4222 4222 * on this vfs can immediately notice it's invalid.
4223 4223 */
4224 4224 EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4225 4225
4226 4226 /*
4227 4227 * Call the init routines of non-loadable filesystems only.
4228 4228 * Filesystems which are loaded as separate modules will be
4229 4229 * initialized by the module loading code instead.
4230 4230 */
4231 4231
4232 4232 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4233 4233 RLOCK_VFSSW();
4234 4234 if (vswp->vsw_init != NULL)
4235 4235 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4236 4236 RUNLOCK_VFSSW();
4237 4237 }
4238 4238
4239 4239 vopstats_startup();
4240 4240
4241 4241 if (vopstats_enabled) {
4242 4242 /* EIO_vfs can collect stats, but we don't retrieve them */
4243 4243 initialize_vopstats(&EIO_vfs.vfs_vopstats);
4244 4244 EIO_vfs.vfs_fstypevsp = NULL;
4245 4245 EIO_vfs.vfs_vskap = NULL;
4246 4246 EIO_vfs.vfs_flag |= VFS_STATS;
4247 4247 }
4248 4248
4249 4249 xattr_init();
4250 4250
4251 4251 reparse_point_init();
4252 4252 }
4253 4253
4254 4254 vfs_t *
4255 4255 vfs_alloc(int kmflag)
4256 4256 {
4257 4257 vfs_t *vfsp;
4258 4258
4259 4259 vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4260 4260
4261 4261 /*
4262 4262 * Do the simplest initialization here.
4263 4263 * Everything else gets done in vfs_init()
4264 4264 */
4265 4265 bzero(vfsp, sizeof (vfs_t));
4266 4266 return (vfsp);
4267 4267 }
4268 4268
4269 4269 void
4270 4270 vfs_free(vfs_t *vfsp)
4271 4271 {
4272 4272 /*
4273 4273 * One would be tempted to assert that "vfsp->vfs_count == 0".
4274 4274 * The problem is that this gets called out of domount() with
4275 4275 * a partially initialized vfs and a vfs_count of 1. This is
4276 4276 * also called from vfs_rele() with a vfs_count of 0. We can't
4277 4277 * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4278 4278 * returned. This is because VFS_MOUNT() fully initializes the
4279 4279 * vfs structure and its associated data. VFS_RELE() will call
4280 4280 * VFS_FREEVFS() which may panic the system if the data structures
4281 4281 * aren't fully initialized from a successful VFS_MOUNT()).
4282 4282 */
4283 4283
4284 4284 /* If FEM was in use, make sure everything gets cleaned up */
4285 4285 if (vfsp->vfs_femhead) {
4286 4286 ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4287 4287 mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4288 4288 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4289 4289 vfsp->vfs_femhead = NULL;
4290 4290 }
4291 4291
4292 4292 if (vfsp->vfs_implp)
4293 4293 vfsimpl_teardown(vfsp);
4294 4294 sema_destroy(&vfsp->vfs_reflock);
4295 4295 kmem_cache_free(vfs_cache, vfsp);
4296 4296 }
4297 4297
4298 4298 /*
4299 4299 * Increments the vfs reference count by one atomically.
4300 4300 */
4301 4301 void
4302 4302 vfs_hold(vfs_t *vfsp)
4303 4303 {
4304 4304 atomic_inc_32(&vfsp->vfs_count);
4305 4305 ASSERT(vfsp->vfs_count != 0);
4306 4306 }
4307 4307
4308 4308 /*
4309 4309 * Decrements the vfs reference count by one atomically. When
4310 4310 * vfs reference count becomes zero, it calls the file system
4311 4311 * specific vfs_freevfs() to free up the resources.
4312 4312 */
4313 4313 void
4314 4314 vfs_rele(vfs_t *vfsp)
4315 4315 {
4316 4316 ASSERT(vfsp->vfs_count != 0);
4317 4317 if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) {
4318 4318 VFS_FREEVFS(vfsp);
4319 4319 lofi_remove(vfsp);
4320 4320 if (vfsp->vfs_zone)
4321 4321 zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
4322 4322 ZONE_REF_VFS);
4323 4323 vfs_freemnttab(vfsp);
4324 4324 vfs_free(vfsp);
4325 4325 }
4326 4326 }
4327 4327
4328 4328 /*
4329 4329 * Generic operations vector support.
4330 4330 *
4331 4331 * This is used to build operations vectors for both the vfs and vnode.
4332 4332 * It's normally called only when a file system is loaded.
4333 4333 *
4334 4334 * There are many possible algorithms for this, including the following:
4335 4335 *
4336 4336 * (1) scan the list of known operations; for each, see if the file system
4337 4337 * includes an entry for it, and fill it in as appropriate.
4338 4338 *
4339 4339 * (2) set up defaults for all known operations. scan the list of ops
4340 4340 * supplied by the file system; for each which is both supplied and
4341 4341 * known, fill it in.
4342 4342 *
4343 4343 * (3) sort the lists of known ops & supplied ops; scan the list, filling
4344 4344 * in entries as we go.
4345 4345 *
4346 4346 * we choose (1) for simplicity, and because performance isn't critical here.
4347 4347 * note that (2) could be sped up using a precomputed hash table on known ops.
4348 4348 * (3) could be faster than either, but only if the lists were very large or
4349 4349 * supplied in sorted order.
4350 4350 *
4351 4351 */
4352 4352
4353 4353 int
4354 4354 fs_build_vector(void *vector, int *unused_ops,
4355 4355 const fs_operation_trans_def_t *translation,
4356 4356 const fs_operation_def_t *operations)
4357 4357 {
4358 4358 int i, num_trans, num_ops, used;
4359 4359
4360 4360 /*
4361 4361 * Count the number of translations and the number of supplied
4362 4362 * operations.
4363 4363 */
4364 4364
4365 4365 {
4366 4366 const fs_operation_trans_def_t *p;
4367 4367
4368 4368 for (num_trans = 0, p = translation;
4369 4369 p->name != NULL;
4370 4370 num_trans++, p++)
4371 4371 ;
4372 4372 }
4373 4373
4374 4374 {
4375 4375 const fs_operation_def_t *p;
4376 4376
4377 4377 for (num_ops = 0, p = operations;
4378 4378 p->name != NULL;
4379 4379 num_ops++, p++)
4380 4380 ;
4381 4381 }
4382 4382
4383 4383 /* Walk through each operation known to our caller. There will be */
4384 4384 /* one entry in the supplied "translation table" for each. */
4385 4385
4386 4386 used = 0;
4387 4387
4388 4388 for (i = 0; i < num_trans; i++) {
4389 4389 int j, found;
4390 4390 char *curname;
4391 4391 fs_generic_func_p result;
4392 4392 fs_generic_func_p *location;
4393 4393
4394 4394 curname = translation[i].name;
4395 4395
4396 4396 /* Look for a matching operation in the list supplied by the */
4397 4397 /* file system. */
4398 4398
4399 4399 found = 0;
4400 4400
4401 4401 for (j = 0; j < num_ops; j++) {
4402 4402 if (strcmp(operations[j].name, curname) == 0) {
4403 4403 used++;
4404 4404 found = 1;
4405 4405 break;
4406 4406 }
4407 4407 }
4408 4408
4409 4409 /*
4410 4410 * If the file system is using a "placeholder" for default
4411 4411 * or error functions, grab the appropriate function out of
4412 4412 * the translation table. If the file system didn't supply
4413 4413 * this operation at all, use the default function.
4414 4414 */
4415 4415
4416 4416 if (found) {
4417 4417 result = operations[j].func.fs_generic;
4418 4418 if (result == fs_default) {
4419 4419 result = translation[i].defaultFunc;
4420 4420 } else if (result == fs_error) {
4421 4421 result = translation[i].errorFunc;
4422 4422 } else if (result == NULL) {
4423 4423 /* Null values are PROHIBITED */
4424 4424 return (EINVAL);
4425 4425 }
4426 4426 } else {
4427 4427 result = translation[i].defaultFunc;
4428 4428 }
4429 4429
4430 4430 /* Now store the function into the operations vector. */
4431 4431
4432 4432 location = (fs_generic_func_p *)
4433 4433 (((char *)vector) + translation[i].offset);
4434 4434
4435 4435 *location = result;
4436 4436 }
4437 4437
4438 4438 *unused_ops = num_ops - used;
4439 4439
4440 4440 return (0);
4441 4441 }
4442 4442
4443 4443 /* Placeholder functions, should never be called. */
4444 4444
4445 4445 int
4446 4446 fs_error(void)
4447 4447 {
4448 4448 cmn_err(CE_PANIC, "fs_error called");
4449 4449 return (0);
4450 4450 }
4451 4451
4452 4452 int
4453 4453 fs_default(void)
4454 4454 {
4455 4455 cmn_err(CE_PANIC, "fs_default called");
4456 4456 return (0);
4457 4457 }
4458 4458
4459 4459 #ifdef __sparc
4460 4460
4461 4461 /*
4462 4462 * Part of the implementation of booting off a mirrored root
4463 4463 * involves a change of dev_t for the root device. To
4464 4464 * accomplish this, first remove the existing hash table
4465 4465 * entry for the root device, convert to the new dev_t,
4466 4466 * then re-insert in the hash table at the head of the list.
4467 4467 */
4468 4468 void
4469 4469 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
4470 4470 {
4471 4471 vfs_list_lock();
4472 4472
4473 4473 vfs_hash_remove(vfsp);
4474 4474
4475 4475 vfsp->vfs_dev = ndev;
4476 4476 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
4477 4477
4478 4478 vfs_hash_add(vfsp, 1);
4479 4479
4480 4480 vfs_list_unlock();
4481 4481 }
4482 4482
4483 4483 #else /* x86 NEWBOOT */
4484 4484
4485 4485 #if defined(__x86)
4486 4486 extern int hvmboot_rootconf();
4487 4487 #endif /* __x86 */
4488 4488
4489 +extern char *aoepath_prop;
4489 4490 extern ib_boot_prop_t *iscsiboot_prop;
4490 4491
4491 4492 int
4492 4493 rootconf()
4493 4494 {
4494 4495 int error;
4495 4496 struct vfssw *vsw;
4496 4497 extern void pm_init();
4497 4498 char *fstyp, *fsmod;
4498 4499 int ret = -1;
4499 4500
4500 4501 getrootfs(&fstyp, &fsmod);
4501 4502
4502 4503 #if defined(__x86)
4503 4504 /*
4504 4505 * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4505 4506 * which lives in /platform/i86hvm, and hence is only available when
4506 4507 * booted in an x86 hvm environment. If the hvm_bootstrap misc module
4507 4508 * is not available then the modstub for this function will return 0.
4508 4509 * If the hvm_bootstrap misc module is available it will be loaded
4509 4510 * and hvmboot_rootconf() will be invoked.
4510 4511 */
4511 4512 if (error = hvmboot_rootconf())
4512 4513 return (error);
4513 4514 #endif /* __x86 */
4514 4515
4515 4516 if (error = clboot_rootconf())
4516 4517 return (error);
4517 4518
4518 4519 if (modload("fs", fsmod) == -1)
4519 4520 panic("Cannot _init %s module", fsmod);
4520 4521
4521 4522 RLOCK_VFSSW();
4522 4523 vsw = vfs_getvfsswbyname(fstyp);
4523 4524 RUNLOCK_VFSSW();
4524 4525 if (vsw == NULL) {
4525 4526 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4526 4527 return (ENXIO);
4527 4528 }
4528 4529 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4529 4530 VFS_HOLD(rootvfs);
4530 4531
4531 - /* always mount readonly first */
4532 + /* Always mount readonly first */
4532 4533 rootvfs->vfs_flag |= VFS_RDONLY;
4533 4534
4534 4535 pm_init();
4535 4536
4536 - if (netboot && iscsiboot_prop) {
4537 - cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4538 - " shouldn't happen in the same time");
4537 + if ((aoepath_prop != NULL && (iscsiboot_prop != NULL || netboot)) ||
4538 + (iscsiboot_prop != NULL && (aoepath_prop != NULL || netboot)) ||
4539 + (netboot && (aoepath_prop != NULL || iscsiboot_prop != NULL))) {
4540 + cmn_err(CE_WARN, "Only one of AoE, iSCSI or NFS boot "
4541 + "can be specified at time");
4539 4542 return (EINVAL);
4540 4543 }
4541 4544
4542 - if (netboot || iscsiboot_prop) {
4545 + if (aoepath_prop != NULL || iscsiboot_prop != NULL || netboot) {
4543 4546 ret = strplumb();
4544 4547 if (ret != 0) {
4545 4548 cmn_err(CE_WARN, "Cannot plumb network device %d", ret);
4546 4549 return (EFAULT);
4547 4550 }
4548 4551 }
4549 4552
4550 - if ((ret == 0) && iscsiboot_prop) {
4551 - ret = modload("drv", "iscsi");
4552 - /* -1 indicates fail */
4553 - if (ret == -1) {
4553 + if (aoepath_prop != NULL) {
4554 + if (modload("drv", "aoe") == -1 ||
4555 + modload("drv", "aoeblk") == -1) {
4556 + cmn_err(CE_WARN, "Failed to load aoe modules");
4557 + return (EINVAL);
4558 + }
4559 + if (i_ddi_attach_pseudo_node("aoe") == 0) {
4560 + cmn_err(CE_WARN, "Failed to attach aoe driver");
4561 + return (ENODEV);
4562 + }
4563 + }
4564 +
4565 + if (iscsiboot_prop != NULL) {
4566 + if (modload("drv", "iscsi") == -1) {
4554 4567 cmn_err(CE_WARN, "Failed to load iscsi module");
4555 4568 iscsi_boot_prop_free();
4556 4569 return (EINVAL);
4557 - } else {
4558 - if (!i_ddi_attach_pseudo_node("iscsi")) {
4559 - cmn_err(CE_WARN,
4560 - "Failed to attach iscsi driver");
4561 - iscsi_boot_prop_free();
4562 - return (ENODEV);
4563 - }
4564 4570 }
4571 + if (i_ddi_attach_pseudo_node("iscsi") == 0) {
4572 + cmn_err(CE_WARN, "Failed to attach iscsi driver");
4573 + iscsi_boot_prop_free();
4574 + return (ENODEV);
4575 + }
4565 4576 }
4566 4577
4567 4578 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4568 4579 vfs_unrefvfssw(vsw);
4569 4580 rootdev = rootvfs->vfs_dev;
4570 4581
4571 - if (error)
4582 + if (error != 0)
4572 4583 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4573 4584 rootfs.bo_name, fstyp);
4574 4585 else
4575 4586 cmn_err(CE_CONT, "?root on %s fstype %s\n",
4576 4587 rootfs.bo_name, fstyp);
4577 4588 return (error);
4578 4589 }
4579 4590
4580 4591 /*
4581 4592 * XXX this is called by nfs only and should probably be removed
4582 4593 * If booted with ASKNAME, prompt on the console for a filesystem
4583 4594 * name and return it.
4584 4595 */
4585 4596 void
4586 4597 getfsname(char *askfor, char *name, size_t namelen)
4587 4598 {
4588 4599 if (boothowto & RB_ASKNAME) {
4589 4600 printf("%s name: ", askfor);
4590 4601 console_gets(name, namelen);
4591 4602 }
4592 4603 }
4593 4604
4594 4605 /*
4595 4606 * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4596 4607 * property.
4597 4608 *
4598 4609 * Filesystem types starting with the prefix "nfs" are diskless clients;
4599 4610 * init the root filename name (rootfs.bo_name), too.
4600 4611 *
4601 4612 * If we are booting via NFS we currently have these options:
4602 4613 * nfs - dynamically choose NFS V2, V3, or V4 (default)
4603 4614 * nfs2 - force NFS V2
4604 4615 * nfs3 - force NFS V3
4605 4616 * nfs4 - force NFS V4
4606 4617 * Because we need to maintain backward compatibility with the naming
4607 4618 * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4608 4619 * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs". The dynamic
4609 4620 * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4610 4621 * This is only for root filesystems, all other uses will expect
4611 4622 * that "nfs" == NFS V2.
4612 4623 */
4613 4624 static void
4614 4625 getrootfs(char **fstypp, char **fsmodp)
4615 4626 {
4616 4627 char *propstr = NULL;
4617 4628
4618 4629 /*
4619 4630 * Check fstype property; for diskless it should be one of "nfs",
4620 4631 * "nfs2", "nfs3" or "nfs4".
4621 4632 */
4622 4633 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4623 4634 DDI_PROP_DONTPASS, "fstype", &propstr)
4624 4635 == DDI_SUCCESS) {
4625 4636 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4626 4637 ddi_prop_free(propstr);
4627 4638
4628 4639 /*
4629 4640 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4630 4641 * assume the type of this root filesystem is 'zfs'.
4631 4642 */
4632 4643 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4633 4644 DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4634 4645 == DDI_SUCCESS) {
4635 4646 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4636 4647 ddi_prop_free(propstr);
4637 4648 }
4638 4649
4639 4650 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4640 4651 *fstypp = *fsmodp = rootfs.bo_fstype;
4641 4652 return;
4642 4653 }
4643 4654
4644 4655 ++netboot;
4645 4656
4646 4657 if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4647 4658 (void) strcpy(rootfs.bo_fstype, "nfs");
4648 4659 else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4649 4660 (void) strcpy(rootfs.bo_fstype, "nfsdyn");
4650 4661
4651 4662 /*
4652 4663 * check if path to network interface is specified in bootpath
4653 4664 * or by a hypervisor domain configuration file.
4654 4665 * XXPV - enable strlumb_get_netdev_path()
4655 4666 */
4656 4667 if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4657 4668 "xpv-nfsroot")) {
4658 4669 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4659 4670 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4660 4671 DDI_PROP_DONTPASS, "bootpath", &propstr)
4661 4672 == DDI_SUCCESS) {
4662 4673 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4663 4674 ddi_prop_free(propstr);
4664 4675 } else {
4665 4676 rootfs.bo_name[0] = '\0';
4666 4677 }
4667 4678 *fstypp = rootfs.bo_fstype;
4668 4679 *fsmodp = "nfs";
4669 4680 }
4670 4681 #endif
4671 4682
4672 4683 /*
4673 4684 * VFS feature routines
4674 4685 */
4675 4686
4676 4687 #define VFTINDEX(feature) (((feature) >> 32) & 0xFFFFFFFF)
4677 4688 #define VFTBITS(feature) ((feature) & 0xFFFFFFFFLL)
4678 4689
4679 4690 /* Register a feature in the vfs */
4680 4691 void
4681 4692 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4682 4693 {
4683 4694 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4684 4695 if (vfsp->vfs_implp == NULL)
4685 4696 return;
4686 4697
4687 4698 vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4688 4699 }
4689 4700
4690 4701 void
4691 4702 vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature)
4692 4703 {
4693 4704 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4694 4705 if (vfsp->vfs_implp == NULL)
4695 4706 return;
4696 4707 vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature);
4697 4708 }
4698 4709
4699 4710 /*
4700 4711 * Query a vfs for a feature.
4701 4712 * Returns 1 if feature is present, 0 if not
4702 4713 */
4703 4714 int
4704 4715 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4705 4716 {
4706 4717 int ret = 0;
4707 4718
4708 4719 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4709 4720 if (vfsp->vfs_implp == NULL)
4710 4721 return (ret);
4711 4722
4712 4723 if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4713 4724 ret = 1;
4714 4725
4715 4726 return (ret);
4716 4727 }
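The VFTINDEX and VFTBITS macros above split a 64-bit vfs_feature_t into an index into vfs_featureset[] (the upper 32 bits) and a bit mask within that word (the lower 32 bits). A small illustration using a made-up feature value rather than one of the real VFSFT_* constants:

	vfs_feature_t example = 0x100000004ULL;	/* hypothetical: word 1, bit 0x4 */

	ASSERT(VFTINDEX(example) == 1);
	ASSERT(VFTBITS(example) == 0x4);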
4717 4728
4718 4729 /*
4719 4730 * Propagate feature set from one vfs to another
4720 4731 */
4721 4732 void
4722 4733 vfs_propagate_features(vfs_t *from, vfs_t *to)
4723 4734 {
4724 4735 int i;
4725 4736
4726 4737 if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4727 4738 return;
4728 4739
4729 4740 for (i = 1; i <= to->vfs_featureset[0]; i++) {
4730 4741 to->vfs_featureset[i] = from->vfs_featureset[i];
4731 4742 }
4732 4743 }
4733 4744
4734 4745 #define LOFINODE_PATH "/dev/lofi/%d"
4735 4746
4736 4747 /*
4737 4748 * Return the vnode for the lofi node if there's a lofi mount in place.
4738 4749 * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4739 4750 * failure.
4740 4751 */
4741 4752 int
4742 4753 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4743 4754 {
4744 4755 char *path = NULL;
4745 4756 int strsize;
4746 4757 int err;
4747 4758
4748 4759 if (vfsp->vfs_lofi_id == 0) {
4749 4760 *vpp = NULL;
4750 4761 return (-1);
4751 4762 }
4752 4763
4753 4764 strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_id);
4754 4765 path = kmem_alloc(strsize + 1, KM_SLEEP);
4755 4766 (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_id);
4756 4767
4757 4768 /*
4758 4769 * We may be inside a zone, so we need to use the /dev path, but
4759 4770 * it's created asynchronously, so we wait here.
4760 4771 */
4761 4772 for (;;) {
4762 4773 err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4763 4774
4764 4775 if (err != ENOENT)
4765 4776 break;
4766 4777
4767 4778 if ((err = delay_sig(hz / 8)) == EINTR)
4768 4779 break;
4769 4780 }
4770 4781
4771 4782 if (err)
4772 4783 *vpp = NULL;
4773 4784
4774 4785 kmem_free(path, strsize + 1);
4775 4786 return (err);
4776 4787 }