1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  24  *
  25  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  26  *      All rights reserved.
  27  */
  28 
  29 #include <sys/param.h>
  30 #include <sys/types.h>
  31 #include <sys/systm.h>
  32 #include <sys/cred.h>
  33 #include <sys/vfs.h>
  34 #include <sys/vfs_opreg.h>
  35 #include <sys/vnode.h>
  36 #include <sys/pathname.h>
  37 #include <sys/sysmacros.h>
  38 #include <sys/kmem.h>
  39 #include <sys/mkdev.h>
  40 #include <sys/mount.h>
  41 #include <sys/mntent.h>
  42 #include <sys/statvfs.h>
  43 #include <sys/errno.h>
  44 #include <sys/debug.h>
  45 #include <sys/cmn_err.h>
  46 #include <sys/utsname.h>
  47 #include <sys/bootconf.h>
  48 #include <sys/modctl.h>
  49 #include <sys/acl.h>
  50 #include <sys/flock.h>
  51 #include <sys/policy.h>
  52 #include <sys/zone.h>
  53 #include <sys/class.h>
  54 #include <sys/socket.h>
  55 #include <sys/netconfig.h>
  56 #include <sys/mntent.h>
  57 #include <sys/tsol/label.h>
  58 
  59 #include <rpc/types.h>
  60 #include <rpc/auth.h>
  61 #include <rpc/clnt.h>
  62 
  63 #include <nfs/nfs.h>
  64 #include <nfs/nfs_clnt.h>
  65 #include <nfs/rnode.h>
  66 #include <nfs/mount.h>
  67 #include <nfs/nfs_acl.h>
  68 
  69 #include <fs/fs_subr.h>
  70 
  71 /*
  72  * From rpcsec module (common/rpcsec).
  73  */
  74 extern int sec_clnt_loadinfo(struct sec_data *, struct sec_data **, model_t);
  75 extern void sec_clnt_freeinfo(struct sec_data *);
  76 
  77 static int pathconf_copyin(struct nfs_args *, struct pathcnf *);
  78 static int pathconf_get(struct mntinfo *, struct nfs_args *);
  79 static void pathconf_rele(struct mntinfo *);
  80 
  81 /*
  82  * The order and contents of this structure must be kept in sync with that of
  83  * rfsreqcnt_v2_tmpl in nfs_stats.c
  84  */
  85 static char *rfsnames_v2[] = {
  86         "null", "getattr", "setattr", "unused", "lookup", "readlink", "read",
  87         "unused", "write", "create", "remove", "rename", "link", "symlink",
  88         "mkdir", "rmdir", "readdir", "fsstat"
  89 };
  90 
  91 /*
  92  * This table maps from NFS protocol number into call type.
  93  * Zero means a "Lookup" type call
  94  * One  means a "Read" type call
  95  * Two  means a "Write" type call
  96  * This is used to select a default time-out.
  97  */
  98 static uchar_t call_type_v2[] = {
  99         0, 0, 1, 0, 0, 0, 1,
 100         0, 2, 2, 2, 2, 2, 2,
 101         2, 2, 1, 0
 102 };
 103 
 104 /*
 105  * Similar table, but to determine which timer to use
 106  * (only real reads and writes!)
 107  */
 108 static uchar_t timer_type_v2[] = {
 109         0, 0, 0, 0, 0, 0, 1,
 110         0, 2, 0, 0, 0, 0, 0,
 111         0, 0, 1, 0
 112 };
 113 
 114 /*
 115  * This table maps from NFS protocol number into a call type
 116  * for the semisoft mount option.
 117  * Zero means do not repeat operation.
 118  * One  means repeat.
 119  */
 120 static uchar_t ss_call_type_v2[] = {
 121         0, 0, 1, 0, 0, 0, 0,
 122         0, 1, 1, 1, 1, 1, 1,
 123         1, 1, 0, 0
 124 };
 125 
 126 /*
 127  * nfs vfs operations.
 128  */
 129 static int      nfs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
 130 static int      nfs_unmount(vfs_t *, int, cred_t *);
 131 static int      nfs_root(vfs_t *, vnode_t **);
 132 static int      nfs_statvfs(vfs_t *, struct statvfs64 *);
 133 static int      nfs_sync(vfs_t *, short, cred_t *);
 134 static int      nfs_vget(vfs_t *, vnode_t **, fid_t *);
 135 static int      nfs_mountroot(vfs_t *, whymountroot_t);
 136 static void     nfs_freevfs(vfs_t *);
 137 
 138 static int      nfsrootvp(vnode_t **, vfs_t *, struct servinfo *,
 139                     int, cred_t *, zone_t *);
 140 
 141 /*
 142  * Initialize the vfs structure
 143  */
 144 
 145 int nfsfstyp;
 146 vfsops_t *nfs_vfsops;
 147 
 148 /*
 149  * Debug variable to check for rdma based
 150  * transport startup and cleanup. Controlled
 151  * through /etc/system. Off by default.
 152  */
 153 int rdma_debug = 0;
 154 
 155 int
 156 nfsinit(int fstyp, char *name)
 157 {
 158         static const fs_operation_def_t nfs_vfsops_template[] = {
 159                 VFSNAME_MOUNT,          { .vfs_mount = nfs_mount },
 160                 VFSNAME_UNMOUNT,        { .vfs_unmount = nfs_unmount },
 161                 VFSNAME_ROOT,           { .vfs_root = nfs_root },
 162                 VFSNAME_STATVFS,        { .vfs_statvfs = nfs_statvfs },
 163                 VFSNAME_SYNC,           { .vfs_sync = nfs_sync },
 164                 VFSNAME_VGET,           { .vfs_vget = nfs_vget },
 165                 VFSNAME_MOUNTROOT,      { .vfs_mountroot = nfs_mountroot },
 166                 VFSNAME_FREEVFS,        { .vfs_freevfs = nfs_freevfs },
 167                 NULL,                   NULL
 168         };
 169         int error;
 170 
 171         error = vfs_setfsops(fstyp, nfs_vfsops_template, &nfs_vfsops);
 172         if (error != 0) {
 173                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 174                     "nfsinit: bad vfs ops template");
 175                 return (error);
 176         }
 177 
 178         error = vn_make_ops(name, nfs_vnodeops_template, &nfs_vnodeops);
 179         if (error != 0) {
 180                 (void) vfs_freevfsops_by_type(fstyp);
 181                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 182                     "nfsinit: bad vnode ops template");
 183                 return (error);
 184         }
 185 
 186 
 187         nfsfstyp = fstyp;
 188 
 189         return (0);
 190 }
 191 
 192 void
 193 nfsfini(void)
 194 {
 195 }
 196 
 197 static void
 198 nfs_free_args(struct nfs_args *nargs, nfs_fhandle *fh)
 199 {
 200 
 201         if (fh)
 202                 kmem_free(fh, sizeof (*fh));
 203 
 204         if (nargs->pathconf) {
 205                 kmem_free(nargs->pathconf, sizeof (struct pathcnf));
 206                 nargs->pathconf = NULL;
 207         }
 208 
 209         if (nargs->knconf) {
 210                 if (nargs->knconf->knc_protofmly)
 211                         kmem_free(nargs->knconf->knc_protofmly, KNC_STRSIZE);
 212                 if (nargs->knconf->knc_proto)
 213                         kmem_free(nargs->knconf->knc_proto, KNC_STRSIZE);
 214                 kmem_free(nargs->knconf, sizeof (*nargs->knconf));
 215                 nargs->knconf = NULL;
 216         }
 217 
 218         if (nargs->fh) {
 219                 kmem_free(nargs->fh, strlen(nargs->fh) + 1);
 220                 nargs->fh = NULL;
 221         }
 222 
 223         if (nargs->hostname) {
 224                 kmem_free(nargs->hostname, strlen(nargs->hostname) + 1);
 225                 nargs->hostname = NULL;
 226         }
 227 
 228         if (nargs->addr) {
 229                 if (nargs->addr->buf) {
 230                         ASSERT(nargs->addr->len);
 231                         kmem_free(nargs->addr->buf, nargs->addr->len);
 232                 }
 233                 kmem_free(nargs->addr, sizeof (struct netbuf));
 234                 nargs->addr = NULL;
 235         }
 236 
 237         if (nargs->syncaddr) {
 238                 ASSERT(nargs->syncaddr->len);
 239                 if (nargs->syncaddr->buf) {
 240                         ASSERT(nargs->syncaddr->len);
 241                         kmem_free(nargs->syncaddr->buf, nargs->syncaddr->len);
 242                 }
 243                 kmem_free(nargs->syncaddr, sizeof (struct netbuf));
 244                 nargs->syncaddr = NULL;
 245         }
 246 
 247         if (nargs->netname) {
 248                 kmem_free(nargs->netname, strlen(nargs->netname) + 1);
 249                 nargs->netname = NULL;
 250         }
 251 
 252         if (nargs->nfs_ext_u.nfs_extA.secdata) {
 253                 sec_clnt_freeinfo(nargs->nfs_ext_u.nfs_extA.secdata);
 254                 nargs->nfs_ext_u.nfs_extA.secdata = NULL;
 255         }
 256 }
 257 
 258 static int
 259 nfs_copyin(char *data, int datalen, struct nfs_args *nargs, nfs_fhandle *fh)
 260 {
 261 
 262         int error;
 263         size_t nlen;                    /* length of netname */
 264         size_t hlen;                    /* length of hostname */
 265         char netname[MAXNETNAMELEN+1];  /* server's netname */
 266         struct netbuf addr;             /* server's address */
 267         struct netbuf syncaddr;         /* AUTH_DES time sync addr */
 268         struct knetconfig *knconf;      /* transport knetconfig structure */
 269         struct sec_data *secdata = NULL;        /* security data */
 270         STRUCT_DECL(nfs_args, args);            /* nfs mount arguments */
 271         STRUCT_DECL(knetconfig, knconf_tmp);
 272         STRUCT_DECL(netbuf, addr_tmp);
 273         int flags;
 274         struct pathcnf  *pc;            /* Pathconf */
 275         char *p, *pf;
 276         char *userbufptr;
 277 
 278 
 279         bzero(nargs, sizeof (*nargs));
 280 
 281         STRUCT_INIT(args, get_udatamodel());
 282         bzero(STRUCT_BUF(args), SIZEOF_STRUCT(nfs_args, DATAMODEL_NATIVE));
 283         if (copyin(data, STRUCT_BUF(args), MIN(datalen, STRUCT_SIZE(args))))
 284                 return (EFAULT);
 285 
 286         nargs->wsize = STRUCT_FGET(args, wsize);
 287         nargs->rsize = STRUCT_FGET(args, rsize);
 288         nargs->timeo = STRUCT_FGET(args, timeo);
 289         nargs->retrans = STRUCT_FGET(args, retrans);
 290         nargs->acregmin = STRUCT_FGET(args, acregmin);
 291         nargs->acregmax = STRUCT_FGET(args, acregmax);
 292         nargs->acdirmin = STRUCT_FGET(args, acdirmin);
 293         nargs->acdirmax = STRUCT_FGET(args, acdirmax);
 294 
 295         flags = STRUCT_FGET(args, flags);
 296         nargs->flags = flags;
 297 
 298 
 299         addr.buf = NULL;
 300         syncaddr.buf = NULL;
 301 
 302         /*
 303          * Allocate space for a knetconfig structure and
 304          * its strings and copy in from user-land.
 305          */
 306         knconf = kmem_zalloc(sizeof (*knconf), KM_SLEEP);
 307         STRUCT_INIT(knconf_tmp, get_udatamodel());
 308         if (copyin(STRUCT_FGETP(args, knconf), STRUCT_BUF(knconf_tmp),
 309             STRUCT_SIZE(knconf_tmp))) {
 310                 kmem_free(knconf, sizeof (*knconf));
 311                 return (EFAULT);
 312         }
 313 
 314         knconf->knc_semantics = STRUCT_FGET(knconf_tmp, knc_semantics);
 315         knconf->knc_protofmly = STRUCT_FGETP(knconf_tmp, knc_protofmly);
 316         knconf->knc_proto = STRUCT_FGETP(knconf_tmp, knc_proto);
 317         if (get_udatamodel() != DATAMODEL_LP64) {
 318                 knconf->knc_rdev = expldev(STRUCT_FGET(knconf_tmp, knc_rdev));
 319         } else {
 320                 knconf->knc_rdev = STRUCT_FGET(knconf_tmp, knc_rdev);
 321         }
 322 
 323         pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 324         p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 325         error = copyinstr(knconf->knc_protofmly, pf, KNC_STRSIZE, NULL);
 326         if (error) {
 327                 kmem_free(pf, KNC_STRSIZE);
 328                 kmem_free(p, KNC_STRSIZE);
 329                 kmem_free(knconf, sizeof (*knconf));
 330                 return (error);
 331         }
 332 
 333         error = copyinstr(knconf->knc_proto, p, KNC_STRSIZE, NULL);
 334         if (error) {
 335                 kmem_free(pf, KNC_STRSIZE);
 336                 kmem_free(p, KNC_STRSIZE);
 337                 kmem_free(knconf, sizeof (*knconf));
 338                 return (error);
 339         }
 340 
 341 
 342         knconf->knc_protofmly = pf;
 343         knconf->knc_proto = p;
 344 
 345         nargs->knconf = knconf;
 346 
 347         /* Copyin pathconf if there is one */
 348         if (STRUCT_FGETP(args, pathconf) != NULL) {
 349                 pc = kmem_alloc(sizeof (*pc), KM_SLEEP);
 350                 error = pathconf_copyin(STRUCT_BUF(args), pc);
 351                 nargs->pathconf = pc;
 352                 if (error)
 353                         goto errout;
 354         }
 355 
 356         /*
 357          * Get server address
 358          */
 359         STRUCT_INIT(addr_tmp, get_udatamodel());
 360         if (copyin(STRUCT_FGETP(args, addr), STRUCT_BUF(addr_tmp),
 361             STRUCT_SIZE(addr_tmp))) {
 362                 error = EFAULT;
 363                 goto errout;
 364         }
 365         nargs->addr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
 366         userbufptr = STRUCT_FGETP(addr_tmp, buf);
 367         addr.len = STRUCT_FGET(addr_tmp, len);
 368         addr.buf = kmem_alloc(addr.len, KM_SLEEP);
 369         addr.maxlen = addr.len;
 370         if (copyin(userbufptr, addr.buf, addr.len)) {
 371                 kmem_free(addr.buf, addr.len);
 372                 error = EFAULT;
 373                 goto errout;
 374         }
 375         bcopy(&addr, nargs->addr, sizeof (struct netbuf));
 376 
 377         /*
 378          * Get the root fhandle
 379          */
 380 
 381         if (copyin(STRUCT_FGETP(args, fh), &fh->fh_buf, NFS_FHSIZE)) {
 382                 error = EFAULT;
 383                 goto errout;
 384         }
 385         fh->fh_len = NFS_FHSIZE;
 386 
 387         /*
 388          * Get server's hostname
 389          */
 390         if (flags & NFSMNT_HOSTNAME) {
 391                 error = copyinstr(STRUCT_FGETP(args, hostname), netname,
 392                     sizeof (netname), &hlen);
 393                 if (error)
 394                         goto errout;
 395                 nargs->hostname = kmem_zalloc(hlen, KM_SLEEP);
 396                 (void) strcpy(nargs->hostname, netname);
 397 
 398         } else {
 399                 nargs->hostname = NULL;
 400         }
 401 
 402 
 403         /*
 404          * If there are syncaddr and netname data, load them in. This is
 405          * to support data needed for NFSV4 when AUTH_DH is the negotiated
 406          * flavor via SECINFO. (instead of using MOUNT protocol in V3).
 407          */
 408         netname[0] = '\0';
 409         if (flags & NFSMNT_SECURE) {
 410                 if (STRUCT_FGETP(args, syncaddr) == NULL) {
 411                         error = EINVAL;
 412                         goto errout;
 413                 }
 414                 /* get syncaddr */
 415                 STRUCT_INIT(addr_tmp, get_udatamodel());
 416                 if (copyin(STRUCT_FGETP(args, syncaddr), STRUCT_BUF(addr_tmp),
 417                     STRUCT_SIZE(addr_tmp))) {
 418                         error = EINVAL;
 419                         goto errout;
 420                 }
 421                 userbufptr = STRUCT_FGETP(addr_tmp, buf);
 422                 syncaddr.len = STRUCT_FGET(addr_tmp, len);
 423                 syncaddr.buf = kmem_alloc(syncaddr.len, KM_SLEEP);
 424                 syncaddr.maxlen = syncaddr.len;
 425                 if (copyin(userbufptr, syncaddr.buf, syncaddr.len)) {
 426                         kmem_free(syncaddr.buf, syncaddr.len);
 427                         error = EFAULT;
 428                         goto errout;
 429                 }
 430 
 431                 nargs->syncaddr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
 432                 bcopy(&syncaddr, nargs->syncaddr, sizeof (struct netbuf));
 433 
 434                 ASSERT(STRUCT_FGETP(args, netname));
 435                 if (copyinstr(STRUCT_FGETP(args, netname), netname,
 436                     sizeof (netname), &nlen)) {
 437                         error = EFAULT;
 438                         goto errout;
 439                 }
 440 
 441                 netname[nlen] = '\0';
 442                 nargs->netname = kmem_zalloc(nlen, KM_SLEEP);
 443                 (void) strcpy(nargs->netname, netname);
 444         }
 445 
 446         /*
 447          * Get the extention data which has the security data structure.
 448          * This includes data for AUTH_SYS as well.
 449          */
 450         if (flags & NFSMNT_NEWARGS) {
 451                 nargs->nfs_args_ext = STRUCT_FGET(args, nfs_args_ext);
 452                 if (nargs->nfs_args_ext == NFS_ARGS_EXTA ||
 453                     nargs->nfs_args_ext == NFS_ARGS_EXTB) {
 454                         /*
 455                          * Indicating the application is using the new
 456                          * sec_data structure to pass in the security
 457                          * data.
 458                          */
 459                         if (STRUCT_FGETP(args,
 460                             nfs_ext_u.nfs_extA.secdata) != NULL) {
 461                                 error = sec_clnt_loadinfo(
 462                                     (struct sec_data *)STRUCT_FGETP(args,
 463                                     nfs_ext_u.nfs_extA.secdata), &secdata,
 464                                     get_udatamodel());
 465                         }
 466                         nargs->nfs_ext_u.nfs_extA.secdata = secdata;
 467                 }
 468         }
 469 
 470         if (error)
 471                 goto errout;
 472 
 473         /*
 474          * Failover support:
 475          *
 476          * We may have a linked list of nfs_args structures,
 477          * which means the user is looking for failover.  If
 478          * the mount is either not "read-only" or "soft",
 479          * we want to bail out with EINVAL.
 480          */
 481         if (nargs->nfs_args_ext == NFS_ARGS_EXTB)
 482                 nargs->nfs_ext_u.nfs_extB.next =
 483                     STRUCT_FGETP(args, nfs_ext_u.nfs_extB.next);
 484 
 485 errout:
 486         if (error)
 487                 nfs_free_args(nargs, fh);
 488 
 489         return (error);
 490 }
 491 
 492 
 493 /*
 494  * nfs mount vfsop
 495  * Set up mount info record and attach it to vfs struct.
 496  */
 497 static int
 498 nfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 499 {
 500         char *data = uap->dataptr;
 501         int error;
 502         vnode_t *rtvp;                  /* the server's root */
 503         mntinfo_t *mi;                  /* mount info, pointed at by vfs */
 504         size_t nlen;                    /* length of netname */
 505         struct knetconfig *knconf;      /* transport knetconfig structure */
 506         struct knetconfig *rdma_knconf; /* rdma transport structure */
 507         rnode_t *rp;
 508         struct servinfo *svp;           /* nfs server info */
 509         struct servinfo *svp_tail = NULL; /* previous nfs server info */
 510         struct servinfo *svp_head;      /* first nfs server info */
 511         struct servinfo *svp_2ndlast;   /* 2nd last in the server info list */
 512         struct sec_data *secdata;       /* security data */
 513         struct nfs_args *args = NULL;
 514         int flags, addr_type;
 515         zone_t *zone = nfs_zone();
 516         zone_t *mntzone = NULL;
 517         nfs_fhandle     *fhandle = NULL;
 518 
 519         if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
 520                 return (error);
 521 
 522         if (mvp->v_type != VDIR)
 523                 return (ENOTDIR);
 524 
 525         /*
 526          * get arguments
 527          *
 528          * nfs_args is now versioned and is extensible, so
 529          * uap->datalen might be different from sizeof (args)
 530          * in a compatible situation.
 531          */
 532 more:
 533 
 534         if (!(uap->flags & MS_SYSSPACE)) {
 535                 if (args == NULL)
 536                         args = kmem_alloc(sizeof (struct nfs_args), KM_SLEEP);
 537                 else {
 538                         nfs_free_args(args, fhandle);
 539                         fhandle = NULL;
 540                 }
 541                 if (fhandle == NULL)
 542                         fhandle = kmem_zalloc(sizeof (nfs_fhandle), KM_SLEEP);
 543                 error = nfs_copyin(data, uap->datalen, args, fhandle);
 544                 if (error)  {
 545                         if (args)
 546                                 kmem_free(args, sizeof (*args));
 547                         return (error);
 548                 }
 549         } else {
 550                 args = (struct nfs_args *)data;
 551                 fhandle = (nfs_fhandle *)args->fh;
 552         }
 553 
 554 
 555         flags = args->flags;
 556 
 557         if (uap->flags & MS_REMOUNT) {
 558                 size_t n;
 559                 char name[FSTYPSZ];
 560 
 561                 if (uap->flags & MS_SYSSPACE)
 562                         error = copystr(uap->fstype, name, FSTYPSZ, &n);
 563                 else
 564                         error = copyinstr(uap->fstype, name, FSTYPSZ, &n);
 565 
 566                 if (error) {
 567                         if (error == ENAMETOOLONG)
 568                                 return (EINVAL);
 569                         return (error);
 570                 }
 571 
 572 
 573                 /*
 574                  * This check is to ensure that the request is a
 575                  * genuine nfs remount request.
 576                  */
 577 
 578                 if (strncmp(name, "nfs", 3) != 0)
 579                         return (EINVAL);
 580 
 581                 /*
 582                  * If the request changes the locking type, disallow the
 583                  * remount,
 584                  * because it's questionable whether we can transfer the
 585                  * locking state correctly.
 586                  *
 587                  * Remounts need to save the pathconf information.
 588                  * Part of the infamous static kludge.
 589                  */
 590 
 591                 if ((mi = VFTOMI(vfsp)) != NULL) {
 592                         uint_t new_mi_llock;
 593                         uint_t old_mi_llock;
 594 
 595                         new_mi_llock = (flags & NFSMNT_LLOCK) ? 1 : 0;
 596                         old_mi_llock = (mi->mi_flags & MI_LLOCK) ? 1 : 0;
 597                         if (old_mi_llock != new_mi_llock)
 598                                 return (EBUSY);
 599                 }
 600                 error = pathconf_get((struct mntinfo *)vfsp->vfs_data, args);
 601 
 602                 if (!(uap->flags & MS_SYSSPACE)) {
 603                         nfs_free_args(args, fhandle);
 604                         kmem_free(args, sizeof (*args));
 605                 }
 606 
 607                 return (error);
 608         }
 609 
 610         mutex_enter(&mvp->v_lock);
 611         if (!(uap->flags & MS_OVERLAY) &&
 612             (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 613                 mutex_exit(&mvp->v_lock);
 614                 if (!(uap->flags & MS_SYSSPACE)) {
 615                         nfs_free_args(args, fhandle);
 616                         kmem_free(args, sizeof (*args));
 617                 }
 618                 return (EBUSY);
 619         }
 620         mutex_exit(&mvp->v_lock);
 621 
 622         /* make sure things are zeroed for errout: */
 623         rtvp = NULL;
 624         mi = NULL;
 625         secdata = NULL;
 626 
 627         /*
 628          * A valid knetconfig structure is required.
 629          */
 630         if (!(flags & NFSMNT_KNCONF)) {
 631                 if (!(uap->flags & MS_SYSSPACE)) {
 632                         nfs_free_args(args, fhandle);
 633                         kmem_free(args, sizeof (*args));
 634                 }
 635                 return (EINVAL);
 636         }
 637 
 638         if ((strlen(args->knconf->knc_protofmly) >= KNC_STRSIZE) ||
 639             (strlen(args->knconf->knc_proto) >= KNC_STRSIZE)) {
 640                 if (!(uap->flags & MS_SYSSPACE)) {
 641                         nfs_free_args(args, fhandle);
 642                         kmem_free(args, sizeof (*args));
 643                 }
 644                 return (EINVAL);
 645         }
 646 
 647 
 648         /*
 649          * Allocate a servinfo struct.
 650          */
 651         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
 652         mutex_init(&svp->sv_lock, NULL, MUTEX_DEFAULT, NULL);
 653         if (svp_tail) {
 654                 svp_2ndlast = svp_tail;
 655                 svp_tail->sv_next = svp;
 656         } else {
 657                 svp_head = svp;
 658                 svp_2ndlast = svp;
 659         }
 660 
 661         svp_tail = svp;
 662 
 663         /*
 664          * Get knetconfig and server address
 665          */
 666         svp->sv_knconf = args->knconf;
 667         args->knconf = NULL;
 668 
 669         if (args->addr == NULL || args->addr->buf == NULL) {
 670                 error = EINVAL;
 671                 goto errout;
 672         }
 673 
 674         svp->sv_addr.maxlen = args->addr->maxlen;
 675         svp->sv_addr.len = args->addr->len;
 676         svp->sv_addr.buf = args->addr->buf;
 677         args->addr->buf = NULL;
 678 
 679         /*
 680          * Get the root fhandle
 681          */
 682         ASSERT(fhandle);
 683 
 684         bcopy(&fhandle->fh_buf, &svp->sv_fhandle.fh_buf, fhandle->fh_len);
 685         svp->sv_fhandle.fh_len = fhandle->fh_len;
 686 
 687         /*
 688          * Get server's hostname
 689          */
 690         if (flags & NFSMNT_HOSTNAME) {
 691                 if (args->hostname == NULL) {
 692                         error = EINVAL;
 693                         goto errout;
 694                 }
 695                 svp->sv_hostnamelen = strlen(args->hostname) + 1;
 696                 svp->sv_hostname = args->hostname;
 697                 args->hostname = NULL;
 698         } else {
 699                 char *p = "unknown-host";
 700                 svp->sv_hostnamelen = strlen(p) + 1;
 701                 svp->sv_hostname = kmem_zalloc(svp->sv_hostnamelen, KM_SLEEP);
 702                 (void) strcpy(svp->sv_hostname, p);
 703         }
 704 
 705 
 706         /*
 707          * RDMA MOUNT SUPPORT FOR NFS v2:
 708          * Establish, is it possible to use RDMA, if so overload the
  709          * knconf with rdma specific knconf and free the original.
 710          */
 711         if ((flags & NFSMNT_TRYRDMA) || (flags & NFSMNT_DORDMA)) {
 712                 /*
 713                  * Determine the addr type for RDMA, IPv4 or v6.
 714                  */
 715                 if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET) == 0)
 716                         addr_type = AF_INET;
 717                 else if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET6) == 0)
 718                         addr_type = AF_INET6;
 719 
 720                 if (rdma_reachable(addr_type, &svp->sv_addr,
 721                     &rdma_knconf) == 0) {
 722                         /*
  723                          * If successful, hijack, the original knconf and
 724                          * replace with a new one, depending on the flags.
 725                          */
 726                         svp->sv_origknconf = svp->sv_knconf;
 727                         svp->sv_knconf = rdma_knconf;
 728                         knconf = rdma_knconf;
 729                 } else {
 730                         if (flags & NFSMNT_TRYRDMA) {
 731 #ifdef  DEBUG
 732                                 if (rdma_debug)
 733                                         zcmn_err(getzoneid(), CE_WARN,
 734                                             "no RDMA onboard, revert\n");
 735 #endif
 736                         }
 737 
 738                         if (flags & NFSMNT_DORDMA) {
 739                                 /*
 740                                  * If proto=rdma is specified and no RDMA
  741                                  * path to this server is available then
 742                                  * ditch this server.
 743                                  * This is not included in the mountable
 744                                  * server list or the replica list.
 745                                  * Check if more servers are specified;
 746                                  * Failover case, otherwise bail out of mount.
 747                                  */
 748                                 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
 749                                     args->nfs_ext_u.nfs_extB.next != NULL) {
 750                                         data = (char *)
 751                                             args->nfs_ext_u.nfs_extB.next;
 752                                         if (uap->flags & MS_RDONLY &&
 753                                             !(flags & NFSMNT_SOFT)) {
 754                                                 if (svp_head->sv_next == NULL) {
 755                                                         svp_tail = NULL;
 756                                                         svp_2ndlast = NULL;
 757                                                         sv_free(svp_head);
 758                                                         goto more;
 759                                                 } else {
 760                                                         svp_tail = svp_2ndlast;
 761                                                         svp_2ndlast->sv_next =
 762                                                             NULL;
 763                                                         sv_free(svp);
 764                                                         goto more;
 765                                                 }
 766                                         }
 767                                 } else {
 768                                         /*
 769                                          * This is the last server specified
 770                                          * in the nfs_args list passed down
 771                                          * and its not rdma capable.
 772                                          */
 773                                         if (svp_head->sv_next == NULL) {
 774                                                 /*
 775                                                  * Is this the only one
 776                                                  */
 777                                                 error = EINVAL;
 778 #ifdef  DEBUG
 779                                                 if (rdma_debug)
 780                                                         zcmn_err(getzoneid(),
 781                                                             CE_WARN,
 782                                                             "No RDMA srv");
 783 #endif
 784                                                 goto errout;
 785                                         } else {
 786                                                 /*
 787                                                  * There is list, since some
 788                                                  * servers specified before
 789                                                  * this passed all requirements
 790                                                  */
 791                                                 svp_tail = svp_2ndlast;
 792                                                 svp_2ndlast->sv_next = NULL;
 793                                                 sv_free(svp);
 794                                                 goto proceed;
 795                                         }
 796                                 }
 797                         }
 798                 }
 799         }
 800 
 801         /*
 802          * Get the extension data which has the new security data structure.
 803          */
 804         if (flags & NFSMNT_NEWARGS) {
 805                 switch (args->nfs_args_ext) {
 806                 case NFS_ARGS_EXTA:
 807                 case NFS_ARGS_EXTB:
 808                         /*
 809                          * Indicating the application is using the new
 810                          * sec_data structure to pass in the security
 811                          * data.
 812                          */
 813                         secdata = args->nfs_ext_u.nfs_extA.secdata;
 814                         if (secdata == NULL) {
 815                                 error = EINVAL;
 816                         } else {
 817                                 /*
 818                                  * Need to validate the flavor here if
 819                                  * sysspace; userspace was already
 820                                  * validated by the nfs_copyin function.
 821                                  */
 822                                 switch (secdata->rpcflavor) {
 823                                         case AUTH_NONE:
 824                                         case AUTH_UNIX:
 825                                         case AUTH_LOOPBACK:
 826                                         case AUTH_DES:
 827                                         case RPCSEC_GSS:
 828                                                 break;
 829                                         default:
 830                                                 error = EINVAL;
 831                                                 goto errout;
 832                                 }
 833                         }
 834                         args->nfs_ext_u.nfs_extA.secdata = NULL;
 835                         break;
 836 
 837                 default:
 838                         error = EINVAL;
 839                         break;
 840                 }
 841         } else if (flags & NFSMNT_SECURE) {
 842                 /*
 843                  * Keep this for backward compatibility to support
 844                  * NFSMNT_SECURE/NFSMNT_RPCTIMESYNC flags.
 845                  */
 846                 if (args->syncaddr == NULL || args->syncaddr->buf == NULL) {
 847                         error = EINVAL;
 848                         goto errout;
 849                 }
 850 
 851                 /*
 852                  * get time sync address.
 853                  */
 854                 if (args->syncaddr == NULL) {
 855                         error = EFAULT;
 856                         goto errout;
 857                 }
 858 
 859                 /*
 860                  * Move security related data to the sec_data structure.
 861                  */
 862                 {
 863                         dh_k4_clntdata_t *data;
 864                         char *pf, *p;
 865 
 866                         secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
 867                         if (flags & NFSMNT_RPCTIMESYNC)
 868                                 secdata->flags |= AUTH_F_RPCTIMESYNC;
 869                         data = kmem_alloc(sizeof (*data), KM_SLEEP);
 870                         bcopy(args->syncaddr, &data->syncaddr,
 871                             sizeof (*args->syncaddr));
 872 
 873 
 874                         /*
 875                          * duplicate the knconf information for the
 876                          * new opaque data.
 877                          */
 878                         data->knconf = kmem_alloc(sizeof (*knconf), KM_SLEEP);
 879                         *data->knconf = *knconf;
 880                         pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 881                         p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 882                         bcopy(knconf->knc_protofmly, pf, KNC_STRSIZE);
 883                         bcopy(knconf->knc_proto, pf, KNC_STRSIZE);
 884                         data->knconf->knc_protofmly = pf;
 885                         data->knconf->knc_proto = p;
 886 
 887                         /* move server netname to the sec_data structure */
 888                         nlen = strlen(args->hostname) + 1;
 889                         if (nlen != 0) {
 890                                 data->netname = kmem_alloc(nlen, KM_SLEEP);
 891                                 bcopy(args->hostname, data->netname, nlen);
 892                                 data->netnamelen = (int)nlen;
 893                         }
 894                         secdata->secmod = secdata->rpcflavor = AUTH_DES;
 895                         secdata->data = (caddr_t)data;
 896                 }
 897         } else {
 898                 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
 899                 secdata->secmod = secdata->rpcflavor = AUTH_UNIX;
 900                 secdata->data = NULL;
 901         }
 902         svp->sv_secdata = secdata;
 903 
 904         /*
 905          * See bug 1180236.
 906          * If mount secure failed, we will fall back to AUTH_NONE
 907          * and try again.  nfs3rootvp() will turn this back off.
 908          *
 909          * The NFS Version 2 mount uses GETATTR and STATFS procedures.
 910          * The server does not care if these procedures have the proper
 911          * authentication flavor, so if mount retries using AUTH_NONE
 912          * that does not require a credential setup for root then the
 913          * automounter would work without requiring root to be
 914          * keylogged into AUTH_DES.
 915          */
 916         if (secdata->rpcflavor != AUTH_UNIX &&
 917             secdata->rpcflavor != AUTH_LOOPBACK)
 918                 secdata->flags |= AUTH_F_TRYNONE;
 919 
 920         /*
 921          * Failover support:
 922          *
 923          * We may have a linked list of nfs_args structures,
 924          * which means the user is looking for failover.  If
 925          * the mount is either not "read-only" or "soft",
 926          * we want to bail out with EINVAL.
 927          */
 928         if (args->nfs_args_ext == NFS_ARGS_EXTB &&
 929             args->nfs_ext_u.nfs_extB.next != NULL) {
 930                 if (uap->flags & MS_RDONLY && !(flags & NFSMNT_SOFT)) {
 931                         data = (char *)args->nfs_ext_u.nfs_extB.next;
 932                         goto more;
 933                 }
 934                 error = EINVAL;
 935                 goto errout;
 936         }
 937 
 938         /*
 939          * Determine the zone we're being mounted into.
 940          */
 941         zone_hold(mntzone = zone);              /* start with this assumption */
 942         if (getzoneid() == GLOBAL_ZONEID) {
 943                 zone_rele(mntzone);
 944                 mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
 945                 ASSERT(mntzone != NULL);
 946                 if (mntzone != zone) {
 947                         error = EBUSY;
 948                         goto errout;
 949                 }
 950         }
 951 
 952         if (is_system_labeled()) {
 953                 error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
 954                     svp->sv_knconf, cr);
 955 
 956                 if (error > 0)
 957                         goto errout;
 958 
 959                 if (error == -1) {
 960                         /* change mount to read-only to prevent write-down */
 961                         vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 962                 }
 963         }
 964 
 965         /*
 966          * Stop the mount from going any further if the zone is going away.
 967          */
 968         if (zone_status_get(mntzone) >= ZONE_IS_SHUTTING_DOWN) {
 969                 error = EBUSY;
 970                 goto errout;
 971         }
 972 
 973         /*
 974          * Get root vnode.
 975          */
 976 proceed:
 977         error = nfsrootvp(&rtvp, vfsp, svp_head, flags, cr, mntzone);
 978 
 979         if (error)
 980                 goto errout;
 981 
 982         /*
 983          * Set option fields in the mount info record
 984          */
 985         mi = VTOMI(rtvp);
 986 
 987         if (svp_head->sv_next)
 988                 mi->mi_flags |= MI_LLOCK;
 989 
 990         error = nfs_setopts(rtvp, DATAMODEL_NATIVE, args);
 991         if (!error) {
 992                 /* static pathconf kludge */
 993                 error = pathconf_get(mi, args);
 994         }
 995 
 996 errout:
 997         if (rtvp != NULL) {
 998                 if (error) {
 999                         rp = VTOR(rtvp);
1000                         if (rp->r_flags & RHASHED)
1001                                 rp_rmhash(rp);
1002                 }
1003                 VN_RELE(rtvp);
1004         }
1005 
1006         if (error) {
1007                 sv_free(svp_head);
1008                 if (mi != NULL) {
1009                         nfs_async_stop(vfsp);
1010                         nfs_async_manager_stop(vfsp);
1011                         if (mi->mi_io_kstats) {
1012                                 kstat_delete(mi->mi_io_kstats);
1013                                 mi->mi_io_kstats = NULL;
1014                         }
1015                         if (mi->mi_ro_kstats) {
1016                                 kstat_delete(mi->mi_ro_kstats);
1017                                 mi->mi_ro_kstats = NULL;
1018                         }
1019                         nfs_free_mi(mi);
1020                 }
1021         }
1022 
1023         if (!(uap->flags & MS_SYSSPACE)) {
1024                 nfs_free_args(args, fhandle);
1025                 kmem_free(args, sizeof (*args));
1026         }
1027 
1028         if (mntzone != NULL)
1029                 zone_rele(mntzone);
1030 
1031         return (error);
1032 }
1033 
1034 /*
1035  * The pathconf information is kept on a linked list of kmem_alloc'ed
1036  * structs. We search the list & add a new struct iff there is no other
1037  * struct with the same information.
1038  * See sys/pathconf.h for ``the rest of the story.''
1039  */
1040 static struct pathcnf *allpc = NULL;
1041 
1042 static int
1043 pathconf_copyin(struct nfs_args *args, struct pathcnf *pc)
1044 {
1045         STRUCT_DECL(pathcnf, pc_tmp);
1046         STRUCT_HANDLE(nfs_args, ap);
1047         int i;
1048         model_t model;
1049 
1050         model = get_udatamodel();
1051         STRUCT_INIT(pc_tmp, model);
1052         STRUCT_SET_HANDLE(ap, model, args);
1053 
1054         if ((STRUCT_FGET(ap, flags) & NFSMNT_POSIX) &&
1055             STRUCT_FGETP(ap, pathconf) != NULL) {
1056                 if (copyin(STRUCT_FGETP(ap, pathconf), STRUCT_BUF(pc_tmp),
1057                     STRUCT_SIZE(pc_tmp)))
1058                         return (EFAULT);
1059                 if (_PC_ISSET(_PC_ERROR, STRUCT_FGET(pc_tmp, pc_mask)))
1060                         return (EINVAL);
1061 
1062                 pc->pc_link_max = STRUCT_FGET(pc_tmp, pc_link_max);
1063                 pc->pc_max_canon = STRUCT_FGET(pc_tmp, pc_max_canon);
1064                 pc->pc_max_input = STRUCT_FGET(pc_tmp, pc_max_input);
1065                 pc->pc_name_max = STRUCT_FGET(pc_tmp, pc_name_max);
1066                 pc->pc_path_max = STRUCT_FGET(pc_tmp, pc_path_max);
1067                 pc->pc_pipe_buf = STRUCT_FGET(pc_tmp, pc_pipe_buf);
1068                 pc->pc_vdisable = STRUCT_FGET(pc_tmp, pc_vdisable);
1069                 pc->pc_xxx = STRUCT_FGET(pc_tmp, pc_xxx);
1070                 for (i = 0; i < _PC_N; i++)
1071                         pc->pc_mask[i] = STRUCT_FGET(pc_tmp, pc_mask[i]);
1072         }
1073         return (0);
1074 }
1075 
1076 static int
1077 pathconf_get(struct mntinfo *mi, struct nfs_args *args)
1078 {
1079         struct pathcnf *p, *pc;
1080 
1081         pc = args->pathconf;
1082         if (mi->mi_pathconf != NULL) {
1083                 pathconf_rele(mi);
1084                 mi->mi_pathconf = NULL;
1085         }
1086 
1087         if (args->flags & NFSMNT_POSIX && args->pathconf != NULL) {
1088                 if (_PC_ISSET(_PC_ERROR, pc->pc_mask))
1089                         return (EINVAL);
1090 
1091                 for (p = allpc; p != NULL; p = p->pc_next) {
1092                         if (PCCMP(p, pc) == 0)
1093                                 break;
1094                 }
1095                 if (p != NULL) {
1096                         mi->mi_pathconf = p;
1097                         p->pc_refcnt++;
1098                 } else {
1099                         p = kmem_alloc(sizeof (*p), KM_SLEEP);
1100                         bcopy(pc, p, sizeof (struct pathcnf));
1101                         p->pc_next = allpc;
1102                         p->pc_refcnt = 1;
1103                         allpc = mi->mi_pathconf = p;
1104                 }
1105         }
1106         return (0);
1107 }
1108 
1109 /*
1110  * release the static pathconf information
1111  */
1112 static void
1113 pathconf_rele(struct mntinfo *mi)
1114 {
1115         if (mi->mi_pathconf != NULL) {
1116                 if (--mi->mi_pathconf->pc_refcnt == 0) {
1117                         struct pathcnf *p;
1118                         struct pathcnf *p2;
1119 
1120                         p2 = p = allpc;
1121                         while (p != NULL && p != mi->mi_pathconf) {
1122                                 p2 = p;
1123                                 p = p->pc_next;
1124                         }
1125                         if (p == NULL) {
1126                                 panic("mi->pathconf");
1127                                 /*NOTREACHED*/
1128                         }
1129                         if (p == allpc)
1130                                 allpc = p->pc_next;
1131                         else
1132                                 p2->pc_next = p->pc_next;
1133                         kmem_free(p, sizeof (*p));
1134                         mi->mi_pathconf = NULL;
1135                 }
1136         }
1137 }
1138 
1139 volatile int nfs_dynamic = 1;   /* global variable to enable dynamic retrans. */
1140 volatile ushort_t nfs_max_threads = 8;  /* max number of active async threads */
1141 volatile uint_t nfs_async_clusters = 1; /* # of reqs from each async queue */
1142 volatile uint_t nfs_cots_timeo = NFS_COTS_TIMEO;
1143 
1144 static int
1145 nfsrootvp(vnode_t **rtvpp, vfs_t *vfsp, struct servinfo *svp,
1146         int flags, cred_t *cr, zone_t *zone)
1147 {
1148         vnode_t *rtvp;
1149         mntinfo_t *mi;
1150         dev_t nfs_dev;
1151         struct vattr va;
1152         int error;
1153         rnode_t *rp;
1154         int i;
1155         struct nfs_stats *nfsstatsp;
1156         cred_t *lcr = NULL, *tcr = cr;
1157 
1158         nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
1159         ASSERT(nfsstatsp != NULL);
1160 
1161         /*
1162          * Create a mount record and link it to the vfs struct.
1163          */
1164         mi = kmem_zalloc(sizeof (*mi), KM_SLEEP);
1165         mutex_init(&mi->mi_lock, NULL, MUTEX_DEFAULT, NULL);
1166         mutex_init(&mi->mi_remap_lock, NULL, MUTEX_DEFAULT, NULL);
1167         mi->mi_flags = MI_ACL | MI_EXTATTR;
1168         if (!(flags & NFSMNT_SOFT))
1169                 mi->mi_flags |= MI_HARD;
1170         if ((flags & NFSMNT_SEMISOFT))
1171                 mi->mi_flags |= MI_SEMISOFT;
1172         if ((flags & NFSMNT_NOPRINT))
1173                 mi->mi_flags |= MI_NOPRINT;
1174         if (flags & NFSMNT_INT)
1175                 mi->mi_flags |= MI_INT;
1176         mi->mi_retrans = NFS_RETRIES;
1177         if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1178             svp->sv_knconf->knc_semantics == NC_TPI_COTS)
1179                 mi->mi_timeo = nfs_cots_timeo;
1180         else
1181                 mi->mi_timeo = NFS_TIMEO;
1182         mi->mi_prog = NFS_PROGRAM;
1183         mi->mi_vers = NFS_VERSION;
1184         mi->mi_rfsnames = rfsnames_v2;
1185         mi->mi_reqs = nfsstatsp->nfs_stats_v2.rfsreqcnt_ptr;
1186         mi->mi_call_type = call_type_v2;
1187         mi->mi_ss_call_type = ss_call_type_v2;
1188         mi->mi_timer_type = timer_type_v2;
1189         mi->mi_aclnames = aclnames_v2;
1190         mi->mi_aclreqs = nfsstatsp->nfs_stats_v2.aclreqcnt_ptr;
1191         mi->mi_acl_call_type = acl_call_type_v2;
1192         mi->mi_acl_ss_call_type = acl_ss_call_type_v2;
1193         mi->mi_acl_timer_type = acl_timer_type_v2;
1194         cv_init(&mi->mi_failover_cv, NULL, CV_DEFAULT, NULL);
1195         mi->mi_servers = svp;
1196         mi->mi_curr_serv = svp;
1197         mi->mi_acregmin = SEC2HR(ACREGMIN);
1198         mi->mi_acregmax = SEC2HR(ACREGMAX);
1199         mi->mi_acdirmin = SEC2HR(ACDIRMIN);
1200         mi->mi_acdirmax = SEC2HR(ACDIRMAX);
1201 
1202         if (nfs_dynamic)
1203                 mi->mi_flags |= MI_DYNAMIC;
1204 
1205         if (flags & NFSMNT_DIRECTIO)
1206                 mi->mi_flags |= MI_DIRECTIO;
1207 
1208         /*
1209          * Make a vfs struct for nfs.  We do this here instead of below
1210          * because rtvp needs a vfs before we can do a getattr on it.
1211          *
1212          * Assign a unique device id to the mount
1213          */
1214         mutex_enter(&nfs_minor_lock);
1215         do {
1216                 nfs_minor = (nfs_minor + 1) & MAXMIN32;
1217                 nfs_dev = makedevice(nfs_major, nfs_minor);
1218         } while (vfs_devismounted(nfs_dev));
1219         mutex_exit(&nfs_minor_lock);
1220 
1221         vfsp->vfs_dev = nfs_dev;
1222         vfs_make_fsid(&vfsp->vfs_fsid, nfs_dev, nfsfstyp);
1223         vfsp->vfs_data = (caddr_t)mi;
1224         vfsp->vfs_fstype = nfsfstyp;
1225         vfsp->vfs_bsize = NFS_MAXDATA;
1226 
1227         /*
1228          * Initialize fields used to support async putpage operations.
1229          */
1230         for (i = 0; i < NFS_ASYNC_TYPES; i++)
1231                 mi->mi_async_clusters[i] = nfs_async_clusters;
1232         mi->mi_async_init_clusters = nfs_async_clusters;
1233         mi->mi_async_curr[NFS_ASYNC_QUEUE] =
1234             mi->mi_async_curr[NFS_ASYNC_PGOPS_QUEUE] = &mi->mi_async_reqs[0];
1235         mi->mi_max_threads = nfs_max_threads;
1236         mutex_init(&mi->mi_async_lock, NULL, MUTEX_DEFAULT, NULL);
1237         cv_init(&mi->mi_async_reqs_cv, NULL, CV_DEFAULT, NULL);
1238         cv_init(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE], NULL, CV_DEFAULT, NULL);
1239         cv_init(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE], NULL,
1240             CV_DEFAULT, NULL);
1241         cv_init(&mi->mi_async_cv, NULL, CV_DEFAULT, NULL);
1242 
1243         mi->mi_vfsp = vfsp;
1244         mi->mi_zone = zone;
1245         zone_init_ref(&mi->mi_zone_ref);
1246         zone_hold_ref(zone, &mi->mi_zone_ref, ZONE_REF_NFS);
1247         nfs_mi_zonelist_add(mi);
1248 
1249         /*
1250          * Make the root vnode, use it to get attributes,
1251          * then remake it with the attributes.
1252          */
1253         rtvp = makenfsnode((fhandle_t *)svp->sv_fhandle.fh_buf,
1254             NULL, vfsp, gethrtime(), cr, NULL, NULL);
1255 
1256         va.va_mask = AT_ALL;
1257 
1258         /*
1259          * If the uid is set then set the creds for secure mounts
1260          * by proxy processes such as automountd.
1261          */
1262         if (svp->sv_secdata->uid != 0 &&
1263             svp->sv_secdata->rpcflavor == RPCSEC_GSS) {
1264                 lcr = crdup(cr);
1265                 (void) crsetugid(lcr, svp->sv_secdata->uid, crgetgid(cr));
1266                 tcr = lcr;
1267         }
1268 
1269         error = nfsgetattr(rtvp, &va, tcr);
1270         if (error)
1271                 goto bad;
1272         rtvp->v_type = va.va_type;
1273 
1274         /*
1275          * Poll every server to get the filesystem stats; we're
1276          * only interested in the server's transfer size, and we
1277          * want the minimum.
1278          *
1279          * While we're looping, we'll turn off AUTH_F_TRYNONE,
1280          * which is only for the mount operation.
1281          */
1282 
1283         mi->mi_tsize = MIN(NFS_MAXDATA, nfstsize());
1284         mi->mi_stsize = MIN(NFS_MAXDATA, nfstsize());
1285 
1286         for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
1287                 struct nfsstatfs fs;
1288                 int douprintf;
1289 
1290                 douprintf = 1;
1291                 mi->mi_curr_serv = svp;
1292 
1293                 error = rfs2call(mi, RFS_STATFS, xdr_fhandle,
1294                     (caddr_t)svp->sv_fhandle.fh_buf, xdr_statfs, (caddr_t)&fs,
1295                     tcr, &douprintf, &fs.fs_status, 0, NULL);
1296                 if (error)
1297                         goto bad;
1298                 mi->mi_stsize = MIN(mi->mi_stsize, fs.fs_tsize);
1299                 svp->sv_secdata->flags &= ~AUTH_F_TRYNONE;
1300         }
1301         mi->mi_curr_serv = mi->mi_servers;
1302         mi->mi_curread = mi->mi_tsize;
1303         mi->mi_curwrite = mi->mi_stsize;
1304 
1305         /*
1306          * Start the manager thread responsible for handling async worker
1307          * threads.
1308          */
1309         VFS_HOLD(vfsp); /* add reference for thread */
1310         mi->mi_manager_thread = zthread_create(NULL, 0, nfs_async_manager,
1311             vfsp, 0, minclsyspri);
1312         ASSERT(mi->mi_manager_thread != NULL);
1313 
1314         /*
1315          * Initialize kstats
1316          */
1317         nfs_mnt_kstat_init(vfsp);
1318 
1319         mi->mi_type = rtvp->v_type;
1320 
1321         *rtvpp = rtvp;
1322         if (lcr != NULL)
1323                 crfree(lcr);
1324 
1325         return (0);
1326 bad:
1327         /*
1328          * An error occurred somewhere, need to clean up...
1329          * We need to release our reference to the root vnode and
1330          * destroy the mntinfo struct that we just created.
1331          */
1332         if (lcr != NULL)
1333                 crfree(lcr);
1334         rp = VTOR(rtvp);
1335         if (rp->r_flags & RHASHED)
1336                 rp_rmhash(rp);
1337         VN_RELE(rtvp);
1338         nfs_async_stop(vfsp);
1339         nfs_async_manager_stop(vfsp);
1340         if (mi->mi_io_kstats) {
1341                 kstat_delete(mi->mi_io_kstats);
1342                 mi->mi_io_kstats = NULL;
1343         }
1344         if (mi->mi_ro_kstats) {
1345                 kstat_delete(mi->mi_ro_kstats);
1346                 mi->mi_ro_kstats = NULL;
1347         }
1348         nfs_free_mi(mi);
1349         *rtvpp = NULL;
1350         return (error);
1351 }
1352 
1353 /*
1354  * vfs operations
1355  */
1356 static int
1357 nfs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
1358 {
1359         mntinfo_t *mi;
1360         ushort_t omax;
1361 
1362         if (secpolicy_fs_unmount(cr, vfsp) != 0)
1363                 return (EPERM);
1364 
1365         mi = VFTOMI(vfsp);
1366         if (flag & MS_FORCE) {
1367 
1368                 vfsp->vfs_flag |= VFS_UNMOUNTED;
1369 
1370                 /*
1371                  * We are about to stop the async manager.
1372                  * Let every one know not to schedule any
1373                  * more async requests.
1374                  */
1375                 mutex_enter(&mi->mi_async_lock);
1376                 mi->mi_max_threads = 0;
1377                 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1378                 mutex_exit(&mi->mi_async_lock);
1379 
1380                 /*
1381                  * We need to stop the manager thread explicitly; the worker
1382                  * threads can time out and exit on their own.
1383                  */
1384                 nfs_async_manager_stop(vfsp);
1385                 destroy_rtable(vfsp, cr);
1386                 if (mi->mi_io_kstats) {
1387                         kstat_delete(mi->mi_io_kstats);
1388                         mi->mi_io_kstats = NULL;
1389                 }
1390                 if (mi->mi_ro_kstats) {
1391                         kstat_delete(mi->mi_ro_kstats);
1392                         mi->mi_ro_kstats = NULL;
1393                 }
1394                 return (0);
1395         }
1396         /*
1397          * Wait until all asynchronous putpage operations on
1398          * this file system are complete before flushing rnodes
1399          * from the cache.
1400          */
1401         omax = mi->mi_max_threads;
1402         if (nfs_async_stop_sig(vfsp)) {
1403                 return (EINTR);
1404         }
1405         rflush(vfsp, cr);
1406         /*
1407          * If there are any active vnodes on this file system,
1408          * then the file system is busy and can't be umounted.
1409          */
1410         if (check_rtable(vfsp)) {
1411                 mutex_enter(&mi->mi_async_lock);
1412                 mi->mi_max_threads = omax;
1413                 mutex_exit(&mi->mi_async_lock);
1414                 return (EBUSY);
1415         }
1416         /*
1417          * The unmount can't fail from now on; stop the manager thread.
1418          */
1419         nfs_async_manager_stop(vfsp);
1420         /*
1421          * Destroy all rnodes belonging to this file system from the
1422          * rnode hash queues and purge any resources allocated to
1423          * them.
1424          */
1425         destroy_rtable(vfsp, cr);
1426         if (mi->mi_io_kstats) {
1427                 kstat_delete(mi->mi_io_kstats);
1428                 mi->mi_io_kstats = NULL;
1429         }
1430         if (mi->mi_ro_kstats) {
1431                 kstat_delete(mi->mi_ro_kstats);
1432                 mi->mi_ro_kstats = NULL;
1433         }
1434         return (0);
1435 }
1436 
1437 /*
1438  * find root of nfs
1439  */
1440 static int
1441 nfs_root(vfs_t *vfsp, vnode_t **vpp)
1442 {
1443         mntinfo_t *mi;
1444         vnode_t *vp;
1445         servinfo_t *svp;
1446         rnode_t *rp;
1447         int error = 0;
1448 
1449         mi = VFTOMI(vfsp);
1450 
1451         if (nfs_zone() != mi->mi_zone)
1452                 return (EPERM);
1453 
1454         svp = mi->mi_curr_serv;
1455         if (svp && (svp->sv_flags & SV_ROOT_STALE)) {
1456                 mutex_enter(&svp->sv_lock);
1457                 svp->sv_flags &= ~SV_ROOT_STALE;
1458                 mutex_exit(&svp->sv_lock);
1459                 error = ENOENT;
1460         }
1461 
1462         vp = makenfsnode((fhandle_t *)mi->mi_curr_serv->sv_fhandle.fh_buf,
1463             NULL, vfsp, gethrtime(), CRED(), NULL, NULL);
1464 
1465         /*
1466          * if the SV_ROOT_STALE flag was reset above, reset the
1467          * RSTALE flag if needed and return an error
1468          */
1469         if (error == ENOENT) {
1470                 rp = VTOR(vp);
1471                 if (svp && rp->r_flags & RSTALE) {
1472                         mutex_enter(&rp->r_statelock);
1473                         rp->r_flags &= ~RSTALE;
1474                         mutex_exit(&rp->r_statelock);
1475                 }
1476                 VN_RELE(vp);
1477                 return (error);
1478         }
1479 
1480         ASSERT(vp->v_type == VNON || vp->v_type == mi->mi_type);
1481 
1482         vp->v_type = mi->mi_type;
1483 
1484         *vpp = vp;
1485 
1486         return (0);
1487 }
1488 
1489 /*
1490  * Get file system statistics.
1491  */
1492 static int
1493 nfs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
1494 {
1495         int error;
1496         mntinfo_t *mi;
1497         struct nfsstatfs fs;
1498         int douprintf;
1499         failinfo_t fi;
1500         vnode_t *vp;
1501 
1502         error = nfs_root(vfsp, &vp);
1503         if (error)
1504                 return (error);
1505 
1506         mi = VFTOMI(vfsp);
1507         douprintf = 1;
1508         fi.vp = vp;
1509         fi.fhp = NULL;          /* no need to update, filehandle not copied */
1510         fi.copyproc = nfscopyfh;
1511         fi.lookupproc = nfslookup;
1512         fi.xattrdirproc = acl_getxattrdir2;
1513 
1514         error = rfs2call(mi, RFS_STATFS, xdr_fhandle, (caddr_t)VTOFH(vp),
1515             xdr_statfs, (caddr_t)&fs, CRED(), &douprintf, &fs.fs_status, 0,
1516             &fi);
1517 
1518         if (!error) {
1519                 error = geterrno(fs.fs_status);
1520                 if (!error) {
1521                         mutex_enter(&mi->mi_lock);
1522                         if (mi->mi_stsize) {
1523                                 mi->mi_stsize = MIN(mi->mi_stsize, fs.fs_tsize);
1524                         } else {
1525                                 mi->mi_stsize = fs.fs_tsize;
1526                                 mi->mi_curwrite = mi->mi_stsize;
1527                         }
1528                         mutex_exit(&mi->mi_lock);
1529                         sbp->f_bsize = fs.fs_bsize;
1530                         sbp->f_frsize = fs.fs_bsize;
1531                         sbp->f_blocks = (fsblkcnt64_t)fs.fs_blocks;
1532                         sbp->f_bfree = (fsblkcnt64_t)fs.fs_bfree;
1533                         /*
1534                          * Some servers may return negative available
1535                          * block counts.  They may do this because they
1536                          * calculate the number of available blocks by
1537                          * subtracting the number of used blocks from
1538                          * the total number of blocks modified by the
1539                          * minimum free value.  For example, if the
1540                          * minumum free percentage is 10 and the file
1541                          * system is greater than 90 percent full, then
1542                          * 90 percent of the total blocks minus the
1543                          * actual number of used blocks may be a
1544                          * negative number.
1545                          *
1546                          * In this case, we need to sign extend the
1547                          * negative number through the assignment from
1548                          * the 32 bit bavail count to the 64 bit bavail
1549                          * count.
1550                          *
1551                          * We need to be able to discern between there
1552                          * just being a lot of available blocks on the
1553                          * file system and the case described above.
1554                          * We are making the assumption that it does
1555                          * not make sense to have more available blocks
1556                          * than there are free blocks.  So, if there
1557                          * are, then we treat the number as if it were
1558                          * a negative number and arrange to have it
1559                          * sign extended when it is converted from 32
1560                          * bits to 64 bits.
1561                          */
1562                         if (fs.fs_bavail <= fs.fs_bfree)
1563                                 sbp->f_bavail = (fsblkcnt64_t)fs.fs_bavail;
1564                         else {
1565                                 sbp->f_bavail =
1566                                     (fsblkcnt64_t)((long)fs.fs_bavail);
1567                         }
1568                         sbp->f_files = (fsfilcnt64_t)-1;
1569                         sbp->f_ffree = (fsfilcnt64_t)-1;
1570                         sbp->f_favail = (fsfilcnt64_t)-1;
1571                         sbp->f_fsid = (unsigned long)vfsp->vfs_fsid.val[0];
1572                         (void) strncpy(sbp->f_basetype,
1573                             vfssw[vfsp->vfs_fstype].vsw_name, FSTYPSZ);
1574                         sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
1575                         sbp->f_namemax = (uint32_t)-1;
1576                 } else {
1577                         PURGE_STALE_FH(error, vp, CRED());
1578                 }
1579         }
1580 
1581         VN_RELE(vp);
1582 
1583         return (error);
1584 }
1585 
1586 static kmutex_t nfs_syncbusy;
1587 
1588 /*
1589  * Flush dirty nfs files for file system vfsp.
1590  * If vfsp == NULL, all nfs files are flushed.
1591  */
1592 /* ARGSUSED */
1593 static int
1594 nfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
1595 {
1596         /*
1597          * Cross-zone calls are OK here, since this translates to a
1598          * VOP_PUTPAGE(B_ASYNC), which gets picked up by the right zone.
1599          */
1600         if (!(flag & SYNC_ATTR) && mutex_tryenter(&nfs_syncbusy) != 0) {
1601                 rflush(vfsp, cr);
1602                 mutex_exit(&nfs_syncbusy);
1603         }
1604         return (0);
1605 }
1606 
1607 /* ARGSUSED */
1608 static int
1609 nfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
1610 {
1611         int error;
1612         vnode_t *vp;
1613         struct vattr va;
1614         struct nfs_fid *nfsfidp = (struct nfs_fid *)fidp;
1615         zoneid_t zoneid = VFTOMI(vfsp)->mi_zone->zone_id;
1616 
1617         if (nfs_zone() != VFTOMI(vfsp)->mi_zone)
1618                 return (EPERM);
1619         if (fidp->fid_len != (sizeof (*nfsfidp) - sizeof (short))) {
1620 #ifdef DEBUG
1621                 zcmn_err(zoneid, CE_WARN,
1622                     "nfs_vget: bad fid len, %d/%d", fidp->fid_len,
1623                     (int)(sizeof (*nfsfidp) - sizeof (short)));
1624 #endif
1625                 *vpp = NULL;
1626                 return (ESTALE);
1627         }
1628 
1629         vp = makenfsnode((fhandle_t *)(nfsfidp->nf_data), NULL, vfsp,
1630             gethrtime(), CRED(), NULL, NULL);
1631 
1632         if (VTOR(vp)->r_flags & RSTALE) {
1633                 VN_RELE(vp);
1634                 *vpp = NULL;
1635                 return (ENOENT);
1636         }
1637 
1638         if (vp->v_type == VNON) {
1639                 va.va_mask = AT_ALL;
1640                 error = nfsgetattr(vp, &va, CRED());
1641                 if (error) {
1642                         VN_RELE(vp);
1643                         *vpp = NULL;
1644                         return (error);
1645                 }
1646                 vp->v_type = va.va_type;
1647         }
1648 
1649         *vpp = vp;
1650 
1651         return (0);
1652 }
1653 
1654 /* ARGSUSED */
1655 static int
1656 nfs_mountroot(vfs_t *vfsp, whymountroot_t why)
1657 {
1658         vnode_t *rtvp;
1659         char root_hostname[SYS_NMLN+1];
1660         struct servinfo *svp;
1661         int error;
1662         int vfsflags;
1663         size_t size;
1664         char *root_path;
1665         struct pathname pn;
1666         char *name;
1667         cred_t *cr;
1668         struct nfs_args args;           /* nfs mount arguments */
1669         static char token[10];
1670 
1671         bzero(&args, sizeof (args));
1672 
1673         /* do this BEFORE getfile which causes xid stamps to be initialized */
1674         clkset(-1L);            /* hack for now - until we get time svc? */
1675 
1676         if (why == ROOT_REMOUNT) {
1677                 /*
1678                  * Shouldn't happen.
1679                  */
1680                 panic("nfs_mountroot: why == ROOT_REMOUNT");
1681         }
1682 
1683         if (why == ROOT_UNMOUNT) {
1684                 /*
1685                  * Nothing to do for NFS.
1686                  */
1687                 return (0);
1688         }
1689 
1690         /*
1691          * why == ROOT_INIT
1692          */
1693 
1694         name = token;
1695         *name = 0;
1696         getfsname("root", name, sizeof (token));
1697 
1698         pn_alloc(&pn);
1699         root_path = pn.pn_path;
1700 
1701         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
1702         svp->sv_knconf = kmem_zalloc(sizeof (*svp->sv_knconf), KM_SLEEP);
1703         svp->sv_knconf->knc_protofmly = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
1704         svp->sv_knconf->knc_proto = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
1705 
1706         /*
1707          * Get server address
1708          * Get the root fhandle
1709          * Get server's transport
1710          * Get server's hostname
1711          * Get options
1712          */
1713         args.addr = &svp->sv_addr;
1714         args.fh = (char *)&svp->sv_fhandle.fh_buf;
1715         args.knconf = svp->sv_knconf;
1716         args.hostname = root_hostname;
1717         vfsflags = 0;
1718         if (error = mount_root(*name ? name : "root", root_path, NFS_VERSION,
1719             &args, &vfsflags)) {
1720                 nfs_cmn_err(error, CE_WARN,
1721                     "nfs_mountroot: mount_root failed: %m");
1722                 sv_free(svp);
1723                 pn_free(&pn);
1724                 return (error);
1725         }
1726         svp->sv_fhandle.fh_len = NFS_FHSIZE;
1727         svp->sv_hostnamelen = (int)(strlen(root_hostname) + 1);
1728         svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
1729         (void) strcpy(svp->sv_hostname, root_hostname);
1730 
1731         /*
1732          * Force root partition to always be mounted with AUTH_UNIX for now
1733          */
1734         svp->sv_secdata = kmem_alloc(sizeof (*svp->sv_secdata), KM_SLEEP);
1735         svp->sv_secdata->secmod = AUTH_UNIX;
1736         svp->sv_secdata->rpcflavor = AUTH_UNIX;
1737         svp->sv_secdata->data = NULL;
1738 
1739         cr = crgetcred();
1740         rtvp = NULL;
1741 
1742         error = nfsrootvp(&rtvp, vfsp, svp, args.flags, cr, global_zone);
1743 
1744         crfree(cr);
1745 
1746         if (error) {
1747                 pn_free(&pn);
1748                 sv_free(svp);
1749                 return (error);
1750         }
1751 
1752         error = nfs_setopts(rtvp, DATAMODEL_NATIVE, &args);
1753         if (error) {
1754                 nfs_cmn_err(error, CE_WARN,
1755                     "nfs_mountroot: invalid root mount options");
1756                 pn_free(&pn);
1757                 goto errout;
1758         }
1759 
1760         (void) vfs_lock_wait(vfsp);
1761         vfs_add(NULL, vfsp, vfsflags);
1762         vfs_unlock(vfsp);
1763 
1764         size = strlen(svp->sv_hostname);
1765         (void) strcpy(rootfs.bo_name, svp->sv_hostname);
1766         rootfs.bo_name[size] = ':';
1767         (void) strcpy(&rootfs.bo_name[size + 1], root_path);
1768 
1769         pn_free(&pn);
1770 
1771 errout:
1772         if (error) {
1773                 sv_free(svp);
1774                 nfs_async_stop(vfsp);
1775                 nfs_async_manager_stop(vfsp);
1776         }
1777 
1778         if (rtvp != NULL)
1779                 VN_RELE(rtvp);
1780 
1781         return (error);
1782 }
1783 
/*
 * Initialization routine for VFS routines.  Should only be called once.
 *
 * Initializes the nfs_syncbusy mutex used by nfs_sync().  Always
 * returns 0.
 */
int
nfs_vfsinit(void)
{
        mutex_init(&nfs_syncbusy, NULL, MUTEX_DEFAULT, NULL);
        return (0);
}
1793 
/*
 * Tear-down counterpart of nfs_vfsinit(): destroys the nfs_syncbusy
 * mutex.
 */
void
nfs_vfsfini(void)
{
        mutex_destroy(&nfs_syncbusy);
}
1799 
1800 void
1801 nfs_freevfs(vfs_t *vfsp)
1802 {
1803         mntinfo_t *mi;
1804         servinfo_t *svp;
1805 
1806         /* free up the resources */
1807         mi = VFTOMI(vfsp);
1808         pathconf_rele(mi);
1809         svp = mi->mi_servers;
1810         mi->mi_servers = mi->mi_curr_serv = NULL;
1811         sv_free(svp);
1812 
1813         /*
1814          * By this time we should have already deleted the
1815          * mi kstats in the unmount code. If they are still around
1816          * somethings wrong
1817          */
1818         ASSERT(mi->mi_io_kstats == NULL);
1819         nfs_free_mi(mi);
1820 }