1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  28  *      All rights reserved.
  29  */
  30 
  31 #include <sys/param.h>
  32 #include <sys/types.h>
  33 #include <sys/systm.h>
  34 #include <sys/cred.h>
  35 #include <sys/vfs.h>
  36 #include <sys/vfs_opreg.h>
  37 #include <sys/vnode.h>
  38 #include <sys/pathname.h>
  39 #include <sys/sysmacros.h>
  40 #include <sys/kmem.h>
  41 #include <sys/mkdev.h>
  42 #include <sys/mount.h>
  43 #include <sys/mntent.h>
  44 #include <sys/statvfs.h>
  45 #include <sys/errno.h>
  46 #include <sys/debug.h>
  47 #include <sys/cmn_err.h>
  48 #include <sys/utsname.h>
  49 #include <sys/bootconf.h>
  50 #include <sys/modctl.h>
  51 #include <sys/acl.h>
  52 #include <sys/flock.h>
  53 #include <sys/policy.h>
  54 #include <sys/zone.h>
  55 #include <sys/class.h>
  56 #include <sys/socket.h>
  57 #include <sys/netconfig.h>
  58 #include <sys/tsol/tnet.h>
  59 
  60 #include <rpc/types.h>
  61 #include <rpc/auth.h>
  62 #include <rpc/clnt.h>
  63 
  64 #include <nfs/nfs.h>
  65 #include <nfs/nfs_clnt.h>
  66 #include <nfs/rnode.h>
  67 #include <nfs/mount.h>
  68 #include <nfs/nfs_acl.h>
  69 
  70 #include <fs/fs_subr.h>
  71 
  72 /*
  73  * From rpcsec module (common/rpcsec).
  74  */
/* Called by nfs3_copyin() to load the security data named in the mount args. */
extern int sec_clnt_loadinfo(struct sec_data *, struct sec_data **, model_t);
/* Called by nfs3_free_args() to release the security data held in the args. */
extern void sec_clnt_freeinfo(struct sec_data *);
  77 
  78 /*
  79  * The order and contents of this structure must be kept in sync with that of
  80  * rfsreqcnt_v3_tmpl in nfs_stats.c
  81  */
  82 static char *rfsnames_v3[] = {
  83         "null", "getattr", "setattr", "lookup", "access", "readlink", "read",
  84         "write", "create", "mkdir", "symlink", "mknod", "remove", "rmdir",
  85         "rename", "link", "readdir", "readdirplus", "fsstat", "fsinfo",
  86         "pathconf", "commit"
  87 };
  88 
  89 /*
  90  * This table maps from NFS protocol number into call type.
  91  * Zero means a "Lookup" type call
  92  * One  means a "Read" type call
  93  * Two  means a "Write" type call
  94  * This is used to select a default time-out.
  95  */
  96 static uchar_t call_type_v3[] = {
  97         0, 0, 1, 0, 0, 0, 1,
  98         2, 2, 2, 2, 2, 2, 2,
  99         2, 2, 1, 2, 0, 0, 0,
 100         2 };
 101 
 102 /*
 103  * Similar table, but to determine which timer to use
 104  * (only real reads and writes!)
 105  */
 106 static uchar_t timer_type_v3[] = {
 107         0, 0, 0, 0, 0, 0, 1,
 108         2, 0, 0, 0, 0, 0, 0,
 109         0, 0, 1, 1, 0, 0, 0,
 110         0 };
 111 
 112 /*
 113  * This table maps from NFS protocol number into a call type
 114  * for the semisoft mount option.
 115  * Zero means do not repeat operation.
 116  * One  means repeat.
 117  */
 118 static uchar_t ss_call_type_v3[] = {
 119         0, 0, 1, 0, 0, 0, 0,
 120         1, 1, 1, 1, 1, 1, 1,
 121         1, 1, 0, 0, 0, 0, 0,
 122         1 };
 123 
 124 /*
 125  * nfs3 vfs operations.
 126  */
/* The eight entry points below are installed by nfs3init()'s ops template. */
static int      nfs3_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
static int      nfs3_unmount(vfs_t *, int, cred_t *);
static int      nfs3_root(vfs_t *, vnode_t **);
static int      nfs3_statvfs(vfs_t *, struct statvfs64 *);
static int      nfs3_sync(vfs_t *, short, cred_t *);
static int      nfs3_vget(vfs_t *, vnode_t **, fid_t *);
static int      nfs3_mountroot(vfs_t *, whymountroot_t);
static void     nfs3_freevfs(vfs_t *);

/* File-local helper, forward-declared so it can be used before its body. */
static int      nfs3rootvp(vnode_t **, vfs_t *, struct servinfo *,
                    int, cred_t *, zone_t *);
 138 
 139 /*
 140  * Initialize the vfs structure
 141  */
 142 
/* File system type number passed to nfs3init(); stored once init succeeds. */
static int nfs3fstyp;
/* vfs operations vector; filled in by vfs_setfsops() from nfs3init(). */
vfsops_t *nfs3_vfsops;

/*
 * Debug variable to check for rdma based
 * transport startup and cleanup. Controlled
 * through /etc/system. Off by default.
 */
extern int rdma_debug;
 152 
 153 int
 154 nfs3init(int fstyp, char *name)
 155 {
 156         static const fs_operation_def_t nfs3_vfsops_template[] = {
 157                 VFSNAME_MOUNT,          { .vfs_mount = nfs3_mount },
 158                 VFSNAME_UNMOUNT,        { .vfs_unmount = nfs3_unmount },
 159                 VFSNAME_ROOT,           { .vfs_root = nfs3_root },
 160                 VFSNAME_STATVFS,        { .vfs_statvfs = nfs3_statvfs },
 161                 VFSNAME_SYNC,           { .vfs_sync = nfs3_sync },
 162                 VFSNAME_VGET,           { .vfs_vget = nfs3_vget },
 163                 VFSNAME_MOUNTROOT,      { .vfs_mountroot = nfs3_mountroot },
 164                 VFSNAME_FREEVFS,        { .vfs_freevfs = nfs3_freevfs },
 165                 NULL,                   NULL
 166         };
 167         int error;
 168 
 169         error = vfs_setfsops(fstyp, nfs3_vfsops_template, &nfs3_vfsops);
 170         if (error != 0) {
 171                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 172                     "nfs3init: bad vfs ops template");
 173                 return (error);
 174         }
 175 
 176         error = vn_make_ops(name, nfs3_vnodeops_template, &nfs3_vnodeops);
 177         if (error != 0) {
 178                 (void) vfs_freevfsops_by_type(fstyp);
 179                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 180                     "nfs3init: bad vnode ops template");
 181                 return (error);
 182         }
 183 
 184         nfs3fstyp = fstyp;
 185 
 186         return (0);
 187 }
 188 
/*
 * NFSv3 teardown hook (presumably the counterpart of nfs3init(); confirm
 * against the module unload path).  Intentionally empty: it performs no work.
 */
void
nfs3fini(void)
{
}
 193 
 194 static void
 195 nfs3_free_args(struct nfs_args *nargs, nfs_fhandle *fh)
 196 {
 197 
 198         if (fh)
 199                 kmem_free(fh, sizeof (*fh));
 200 
 201         if (nargs->knconf) {
 202                 if (nargs->knconf->knc_protofmly)
 203                         kmem_free(nargs->knconf->knc_protofmly, KNC_STRSIZE);
 204                 if (nargs->knconf->knc_proto)
 205                         kmem_free(nargs->knconf->knc_proto, KNC_STRSIZE);
 206                 kmem_free(nargs->knconf, sizeof (*nargs->knconf));
 207                 nargs->knconf = NULL;
 208         }
 209 
 210         if (nargs->fh) {
 211                 kmem_free(nargs->fh, strlen(nargs->fh) + 1);
 212                 nargs->fh = NULL;
 213         }
 214 
 215         if (nargs->hostname) {
 216                 kmem_free(nargs->hostname, strlen(nargs->hostname) + 1);
 217                 nargs->hostname = NULL;
 218         }
 219 
 220         if (nargs->addr) {
 221                 if (nargs->addr->buf) {
 222                         ASSERT(nargs->addr->len);
 223                         kmem_free(nargs->addr->buf, nargs->addr->len);
 224                 }
 225                 kmem_free(nargs->addr, sizeof (struct netbuf));
 226                 nargs->addr = NULL;
 227         }
 228 
 229         if (nargs->syncaddr) {
 230                 ASSERT(nargs->syncaddr->len);
 231                 if (nargs->syncaddr->buf) {
 232                         ASSERT(nargs->syncaddr->len);
 233                         kmem_free(nargs->syncaddr->buf, nargs->syncaddr->len);
 234                 }
 235                 kmem_free(nargs->syncaddr, sizeof (struct netbuf));
 236                 nargs->syncaddr = NULL;
 237         }
 238 
 239         if (nargs->netname) {
 240                 kmem_free(nargs->netname, strlen(nargs->netname) + 1);
 241                 nargs->netname = NULL;
 242         }
 243 
 244         if (nargs->nfs_ext_u.nfs_extA.secdata) {
 245                 sec_clnt_freeinfo(nargs->nfs_ext_u.nfs_extA.secdata);
 246                 nargs->nfs_ext_u.nfs_extA.secdata = NULL;
 247         }
 248 }
 249 
 250 static int
 251 nfs3_copyin(char *data, int datalen, struct nfs_args *nargs, nfs_fhandle *fh)
 252 {
 253 
 254         int error;
 255         size_t nlen;                    /* length of netname */
 256         size_t hlen;                    /* length of hostname */
 257         char netname[MAXNETNAMELEN+1];  /* server's netname */
 258         struct netbuf addr;             /* server's address */
 259         struct netbuf syncaddr;         /* AUTH_DES time sync addr */
 260         struct knetconfig *knconf;      /* transport knetconfig structure */
 261         struct sec_data *secdata = NULL;        /* security data */
 262         STRUCT_DECL(nfs_args, args);            /* nfs mount arguments */
 263         STRUCT_DECL(knetconfig, knconf_tmp);
 264         STRUCT_DECL(netbuf, addr_tmp);
 265         int flags;
 266         char *p, *pf;
 267         char *userbufptr;
 268 
 269 
 270         bzero(nargs, sizeof (*nargs));
 271 
 272         STRUCT_INIT(args, get_udatamodel());
 273         bzero(STRUCT_BUF(args), SIZEOF_STRUCT(nfs_args, DATAMODEL_NATIVE));
 274         if (copyin(data, STRUCT_BUF(args), MIN(datalen, STRUCT_SIZE(args))))
 275                 return (EFAULT);
 276 
 277         nargs->wsize = STRUCT_FGET(args, wsize);
 278         nargs->rsize = STRUCT_FGET(args, rsize);
 279         nargs->timeo = STRUCT_FGET(args, timeo);
 280         nargs->retrans = STRUCT_FGET(args, retrans);
 281         nargs->acregmin = STRUCT_FGET(args, acregmin);
 282         nargs->acregmax = STRUCT_FGET(args, acregmax);
 283         nargs->acdirmin = STRUCT_FGET(args, acdirmin);
 284         nargs->acdirmax = STRUCT_FGET(args, acdirmax);
 285 
 286         flags = STRUCT_FGET(args, flags);
 287         nargs->flags = flags;
 288 
 289         addr.buf = NULL;
 290         syncaddr.buf = NULL;
 291 
 292         /*
 293          * Allocate space for a knetconfig structure and
 294          * its strings and copy in from user-land.
 295          */
 296         knconf = kmem_zalloc(sizeof (*knconf), KM_SLEEP);
 297         STRUCT_INIT(knconf_tmp, get_udatamodel());
 298         if (copyin(STRUCT_FGETP(args, knconf), STRUCT_BUF(knconf_tmp),
 299             STRUCT_SIZE(knconf_tmp))) {
 300                 kmem_free(knconf, sizeof (*knconf));
 301                 return (EFAULT);
 302         }
 303 
 304         knconf->knc_semantics = STRUCT_FGET(knconf_tmp, knc_semantics);
 305         knconf->knc_protofmly = STRUCT_FGETP(knconf_tmp, knc_protofmly);
 306         knconf->knc_proto = STRUCT_FGETP(knconf_tmp, knc_proto);
 307         if (get_udatamodel() != DATAMODEL_LP64) {
 308                 knconf->knc_rdev = expldev(STRUCT_FGET(knconf_tmp, knc_rdev));
 309         } else {
 310                 knconf->knc_rdev = STRUCT_FGET(knconf_tmp, knc_rdev);
 311         }
 312 
 313         pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 314         p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 315         error = copyinstr(knconf->knc_protofmly, pf, KNC_STRSIZE, NULL);
 316         if (error) {
 317                 kmem_free(pf, KNC_STRSIZE);
 318                 kmem_free(p, KNC_STRSIZE);
 319                 kmem_free(knconf, sizeof (*knconf));
 320                 return (error);
 321         }
 322 
 323         error = copyinstr(knconf->knc_proto, p, KNC_STRSIZE, NULL);
 324         if (error) {
 325                 kmem_free(pf, KNC_STRSIZE);
 326                 kmem_free(p, KNC_STRSIZE);
 327                 kmem_free(knconf, sizeof (*knconf));
 328                 return (error);
 329         }
 330 
 331 
 332         knconf->knc_protofmly = pf;
 333         knconf->knc_proto = p;
 334 
 335         nargs->knconf = knconf;
 336         /*
 337          * Get server address
 338          */
 339         STRUCT_INIT(addr_tmp, get_udatamodel());
 340         if (copyin(STRUCT_FGETP(args, addr), STRUCT_BUF(addr_tmp),
 341             STRUCT_SIZE(addr_tmp))) {
 342                 error = EFAULT;
 343                 goto errout;
 344         }
 345 
 346         nargs->addr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
 347         userbufptr = STRUCT_FGETP(addr_tmp, buf);
 348         addr.len = STRUCT_FGET(addr_tmp, len);
 349         addr.buf = kmem_alloc(addr.len, KM_SLEEP);
 350         addr.maxlen = addr.len;
 351         if (copyin(userbufptr, addr.buf, addr.len)) {
 352                 kmem_free(addr.buf, addr.len);
 353                 error = EFAULT;
 354                 goto errout;
 355         }
 356         bcopy(&addr, nargs->addr, sizeof (struct netbuf));
 357 
 358         /*
 359          * Get the root fhandle
 360          */
 361 
 362         if (copyin(STRUCT_FGETP(args, fh), fh, sizeof (nfs_fhandle))) {
 363                 error = EFAULT;
 364                 goto errout;
 365         }
 366 
 367 
 368         /*
 369          * Get server's hostname
 370          */
 371         if (flags & NFSMNT_HOSTNAME) {
 372                 error = copyinstr(STRUCT_FGETP(args, hostname), netname,
 373                     sizeof (netname), &hlen);
 374         if (error)
 375                 goto errout;
 376         nargs->hostname = kmem_zalloc(hlen, KM_SLEEP);
 377         (void) strcpy(nargs->hostname, netname);
 378         } else {
 379         nargs->hostname = NULL;
 380         }
 381 
 382 
 383         /*
 384          * If there are syncaddr and netname data, load them in. This is
 385          * to support data needed for NFSV4 when AUTH_DH is the negotiated
 386          * flavor via SECINFO. (instead of using MOUNT protocol in V3).
 387          */
 388         netname[0] = '\0';
 389         if (flags & NFSMNT_SECURE) {
 390                 if (STRUCT_FGETP(args, syncaddr) == NULL) {
 391                         error = EINVAL;
 392                         goto errout;
 393                 }
 394                 /* get syncaddr */
 395                 STRUCT_INIT(addr_tmp, get_udatamodel());
 396                 if (copyin(STRUCT_FGETP(args, syncaddr), STRUCT_BUF(addr_tmp),
 397                     STRUCT_SIZE(addr_tmp))) {
 398                         error = EINVAL;
 399                         goto errout;
 400                 }
 401                 userbufptr = STRUCT_FGETP(addr_tmp, buf);
 402                 syncaddr.len = STRUCT_FGET(addr_tmp, len);
 403                 syncaddr.buf = kmem_alloc(syncaddr.len, KM_SLEEP);
 404                 syncaddr.maxlen = syncaddr.len;
 405                 if (copyin(userbufptr, syncaddr.buf, syncaddr.len)) {
 406                         kmem_free(syncaddr.buf, syncaddr.len);
 407                         error = EFAULT;
 408                         goto errout;
 409                 }
 410 
 411                 nargs->syncaddr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
 412                 bcopy(&syncaddr, nargs->syncaddr, sizeof (struct netbuf));
 413 
 414                 ASSERT(STRUCT_FGETP(args, netname));
 415 
 416                 if (copyinstr(STRUCT_FGETP(args, netname), netname,
 417                     sizeof (netname), &nlen)) {
 418                         error = EFAULT;
 419                         goto errout;
 420                 }
 421 
 422                 netname[nlen] = '\0';
 423                 nargs->netname = kmem_zalloc(nlen, KM_SLEEP);
 424                 (void) strcpy(nargs->netname, netname);
 425         }
 426 
 427         /*
 428          * Get the extention data which has the security data structure.
 429          * This includes data for AUTH_SYS as well.
 430          */
 431         if (flags & NFSMNT_NEWARGS) {
 432                 nargs->nfs_args_ext = STRUCT_FGET(args, nfs_args_ext);
 433                 if (nargs->nfs_args_ext == NFS_ARGS_EXTA ||
 434                     nargs->nfs_args_ext == NFS_ARGS_EXTB) {
 435                         /*
 436                          * Indicating the application is using the new
 437                          * sec_data structure to pass in the security
 438                          * data.
 439                          */
 440                         if (STRUCT_FGETP(args,
 441                             nfs_ext_u.nfs_extA.secdata) != NULL) {
 442                                 error = sec_clnt_loadinfo(
 443                                     (struct sec_data *)STRUCT_FGETP(args,
 444                                     nfs_ext_u.nfs_extA.secdata), &secdata,
 445                                     get_udatamodel());
 446                         }
 447                         nargs->nfs_ext_u.nfs_extA.secdata = secdata;
 448                 }
 449         }
 450 
 451         if (error)
 452                 goto errout;
 453 
 454         /*
 455          * Failover support:
 456          *
 457          * We may have a linked list of nfs_args structures,
 458          * which means the user is looking for failover.  If
 459          * the mount is either not "read-only" or "soft",
 460          * we want to bail out with EINVAL.
 461          */
 462         if (nargs->nfs_args_ext == NFS_ARGS_EXTB)
 463                 nargs->nfs_ext_u.nfs_extB.next =
 464                     STRUCT_FGETP(args, nfs_ext_u.nfs_extB.next);
 465 
 466 errout:
 467         if (error)
 468                 nfs3_free_args(nargs, fh);
 469 
 470         return (error);
 471 }
 472 
 473 
 474 /*
 475  * nfs mount vfsop
 476  * Set up mount info record and attach it to vfs struct.
 477  */
 478 static int
 479 nfs3_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 480 {
 481         struct nfs_args *args = NULL;
 482         nfs_fhandle     *fhandle = NULL;
 483         char *data = uap->dataptr;
 484         int error;
 485         vnode_t *rtvp;                  /* the server's root */
 486         mntinfo_t *mi;                  /* mount info, pointed at by vfs */
 487         size_t nlen;                    /* length of netname */
 488         struct knetconfig *knconf;      /* transport knetconfig structure */
 489         struct knetconfig *rdma_knconf; /* rdma transport structure */
 490         rnode_t *rp;
 491         struct servinfo *svp;           /* nfs server info */
 492         struct servinfo *svp_tail = NULL; /* previous nfs server info */
 493         struct servinfo *svp_head;      /* first nfs server info */
 494         struct servinfo *svp_2ndlast;   /* 2nd last in server info list */
 495         struct sec_data *secdata;       /* security data */
 496         int flags, addr_type;
 497         zone_t *zone = nfs_zone();
 498         zone_t *mntzone = NULL;
 499 
 500 
 501         if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
 502                 return (EPERM);
 503 
 504         if (mvp->v_type != VDIR)
 505                 return (ENOTDIR);
 506 
 507         /*
 508          * get arguments
 509          *
 510          * nfs_args is now versioned and is extensible, so
 511          * uap->datalen might be different from sizeof (args)
 512          * in a compatible situation.
 513          */
 514 
 515 more:
 516 
 517         if (!(uap->flags & MS_SYSSPACE)) {
 518                 if (args == NULL)
 519                         args = kmem_alloc(sizeof (struct nfs_args), KM_SLEEP);
 520                 else {
 521                         nfs3_free_args(args, fhandle);
 522                         fhandle = NULL;
 523                 }
 524                 if (fhandle == NULL)
 525                         fhandle = kmem_alloc(sizeof (nfs_fhandle), KM_SLEEP);
 526                 error = nfs3_copyin(data, uap->datalen, args, fhandle);
 527                 if (error) {
 528                         if (args)
 529                                 kmem_free(args, sizeof (*args));
 530                         return (error);
 531                 }
 532         } else {
 533                 args = (struct nfs_args *)data;
 534                 fhandle = (nfs_fhandle *)args->fh;
 535         }
 536 
 537 
 538         flags = args->flags;
 539 
 540         if (uap->flags & MS_REMOUNT) {
 541                 size_t  n;
 542                 char    name[FSTYPSZ];
 543 
 544                 if (uap->flags & MS_SYSSPACE) {
 545                         error = copystr(uap->fstype, name, FSTYPSZ, &n);
 546                 } else {
 547                         nfs3_free_args(args, fhandle);
 548                         kmem_free(args, sizeof (*args));
 549                         error = copyinstr(uap->fstype, name, FSTYPSZ, &n);
 550                 }
 551                 if (error) {
 552                         if (error == ENAMETOOLONG)
 553                                 return (EINVAL);
 554                         return (error);
 555                 }
 556 
 557                 /*
 558                  * This check is to ensure that the request is a
 559                  * genuine nfs remount request.
 560                  */
 561 
 562                 if (strncmp(name, "nfs", 3) != 0)
 563                         return (EINVAL);
 564 
 565                 /*
 566                  * If the request changes the locking type, disallow the
 567                  * remount,
 568                  * because it's questionable whether we can transfer the
 569                  * locking state correctly.
 570                  */
 571 
 572                 if ((mi = VFTOMI(vfsp)) != NULL) {
 573                         uint_t new_mi_llock;
 574                         uint_t old_mi_llock;
 575 
 576                         new_mi_llock = (flags & NFSMNT_LLOCK) ? 1 : 0;
 577                         old_mi_llock = (mi->mi_flags & MI_LLOCK) ? 1 : 0;
 578                         if (old_mi_llock != new_mi_llock)
 579                                 return (EBUSY);
 580                 }
 581                 return (0);
 582         }
 583 
 584         mutex_enter(&mvp->v_lock);
 585         if (!(uap->flags & MS_OVERLAY) &&
 586             (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 587                 mutex_exit(&mvp->v_lock);
 588                 if (!(uap->flags & MS_SYSSPACE)) {
 589                         nfs3_free_args(args, fhandle);
 590                         kmem_free(args, sizeof (*args));
 591                 }
 592                 return (EBUSY);
 593         }
 594         mutex_exit(&mvp->v_lock);
 595 
 596         /* make sure things are zeroed for errout: */
 597         rtvp = NULL;
 598         mi = NULL;
 599         secdata = NULL;
 600 
 601         /*
 602          * A valid knetconfig structure is required.
 603          */
 604         if (!(flags & NFSMNT_KNCONF)) {
 605                 if (!(uap->flags & MS_SYSSPACE)) {
 606                         nfs3_free_args(args, fhandle);
 607                         kmem_free(args, sizeof (*args));
 608                 }
 609                 return (EINVAL);
 610         }
 611 
 612         if ((strlen(args->knconf->knc_protofmly) >= KNC_STRSIZE) ||
 613             (strlen(args->knconf->knc_proto) >= KNC_STRSIZE)) {
 614                 if (!(uap->flags & MS_SYSSPACE)) {
 615                         nfs3_free_args(args, fhandle);
 616                         kmem_free(args, sizeof (*args));
 617                 }
 618                 return (EINVAL);
 619         }
 620 
 621         /*
 622          * Allocate a servinfo struct.
 623          */
 624         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
 625         mutex_init(&svp->sv_lock, NULL, MUTEX_DEFAULT, NULL);
 626         if (svp_tail) {
 627                 svp_2ndlast = svp_tail;
 628                 svp_tail->sv_next = svp;
 629         } else {
 630                 svp_head = svp;
 631                 svp_2ndlast = svp;
 632         }
 633 
 634         svp_tail = svp;
 635 
 636         svp->sv_knconf = args->knconf;
 637         args->knconf = NULL;
 638 
 639         if (args->addr == NULL || args->addr->buf == NULL) {
 640                 error = EINVAL;
 641                 goto errout;
 642         }
 643 
 644         svp->sv_addr.maxlen = args->addr->maxlen;
 645         svp->sv_addr.len = args->addr->len;
 646         svp->sv_addr.buf = args->addr->buf;
 647         args->addr->buf = NULL;
 648 
 649         /*
 650          * Check the root fhandle length
 651          */
 652         ASSERT(fhandle);
 653         if (fhandle->fh_len > NFS3_FHSIZE || fhandle->fh_len == 0) {
 654                 error = EINVAL;
 655 #ifdef DEBUG
 656                 zcmn_err(getzoneid(), CE_WARN,
 657                     "nfs3_mount: got an invalid fhandle. fh_len = %d",
 658                     fhandle->fh_len);
 659                 fhandle->fh_len = NFS_FHANDLE_LEN;
 660                 nfs_printfhandle(fhandle);
 661 #endif
 662                 goto errout;
 663         }
 664 
 665         bcopy(&fhandle->fh_buf, &svp->sv_fhandle.fh_buf, fhandle->fh_len);
 666         svp->sv_fhandle.fh_len = fhandle->fh_len;
 667 
 668         /*
 669          * Get server's hostname
 670          */
 671         if (flags & NFSMNT_HOSTNAME) {
 672                 if (args->hostname == NULL) {
 673                         error = EINVAL;
 674                         goto errout;
 675                 }
 676                 svp->sv_hostnamelen = strlen(args->hostname) + 1;
 677                 svp->sv_hostname = args->hostname;
 678                 args->hostname = NULL;
 679         } else {
 680                 char *p = "unknown-host";
 681                 svp->sv_hostnamelen = strlen(p) + 1;
 682                 svp->sv_hostname = kmem_zalloc(svp->sv_hostnamelen, KM_SLEEP);
 683                 (void) strcpy(svp->sv_hostname, p);
 684         }
 685 
 686 
 687         /*
         * RDMA MOUNT SUPPORT FOR NFS v3:
         * Establish whether it is possible to use RDMA; if so, override the
         * knconf with an RDMA-specific knconf and save the original.
 691          */
 692         if ((flags & NFSMNT_TRYRDMA) || (flags & NFSMNT_DORDMA)) {
 693                 /*
 694                  * Determine the addr type for RDMA, IPv4 or v6.
 695                  */
 696                 if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET) == 0)
 697                         addr_type = AF_INET;
 698                 else if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET6) == 0)
 699                         addr_type = AF_INET6;
 700 
 701                 if (rdma_reachable(addr_type, &svp->sv_addr,
 702                     &rdma_knconf) == 0) {
 703                         /*
                         * If successful, hijack the original knconf and
 705                          * replace with a new one, depending on the flags.
 706                          */
 707                         svp->sv_origknconf = svp->sv_knconf;
 708                         svp->sv_knconf = rdma_knconf;
 709                         knconf = rdma_knconf;
 710                 } else {
 711                         if (flags & NFSMNT_TRYRDMA) {
 712 #ifdef  DEBUG
 713                                 if (rdma_debug)
 714                                         zcmn_err(getzoneid(), CE_WARN,
 715                                             "no RDMA onboard, revert\n");
 716 #endif
 717                         }
 718 
 719                         if (flags & NFSMNT_DORDMA) {
 720                                 /*
 721                                  * If proto=rdma is specified and no RDMA
                                 * path to this server is available then
 723                                  * ditch this server.
 724                                  * This is not included in the mountable
 725                                  * server list or the replica list.
 726                                  * Check if more servers are specified;
 727                                  * Failover case, otherwise bail out of mount.
 728                                  */
 729                                 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
 730                                     args->nfs_ext_u.nfs_extB.next != NULL) {
 731                                         data = (char *)
 732                                             args->nfs_ext_u.nfs_extB.next;
 733                                         if (uap->flags & MS_RDONLY &&
 734                                             !(flags & NFSMNT_SOFT)) {
 735                                                 if (svp_head->sv_next == NULL) {
 736                                                         svp_tail = NULL;
 737                                                         svp_2ndlast = NULL;
 738                                                         sv_free(svp_head);
 739                                                         goto more;
 740                                                 } else {
 741                                                         svp_tail = svp_2ndlast;
 742                                                         svp_2ndlast->sv_next =
 743                                                             NULL;
 744                                                         sv_free(svp);
 745                                                         goto more;
 746                                                 }
 747                                         }
 748                                 } else {
 749                                         /*
 750                                          * This is the last server specified
 751                                          * in the nfs_args list passed down
 752                                          * and its not rdma capable.
 753                                          */
 754                                         if (svp_head->sv_next == NULL) {
 755                                                 /*
 756                                                  * Is this the only one
 757                                                  */
 758                                                 error = EINVAL;
 759 #ifdef  DEBUG
 760                                                 if (rdma_debug)
 761                                                         zcmn_err(getzoneid(),
 762                                                             CE_WARN,
 763                                                             "No RDMA srv");
 764 #endif
 765                                                 goto errout;
 766                                         } else {
 767                                                 /*
 768                                                  * There is list, since some
 769                                                  * servers specified before
 770                                                  * this passed all requirements
 771                                                  */
 772                                                 svp_tail = svp_2ndlast;
 773                                                 svp_2ndlast->sv_next = NULL;
 774                                                 sv_free(svp);
 775                                                 goto proceed;
 776                                         }
 777                                 }
 778                         }
 779                 }
 780         }
 781 
 782         /*
         * Get the extension data which has the new security data structure.
 784          */
 785         if (flags & NFSMNT_NEWARGS) {
 786                 switch (args->nfs_args_ext) {
 787                 case NFS_ARGS_EXTA:
 788                 case NFS_ARGS_EXTB:
 789                         /*
 790                          * Indicating the application is using the new
 791                          * sec_data structure to pass in the security
 792                          * data.
 793                          */
 794                         secdata = args->nfs_ext_u.nfs_extA.secdata;
 795                         if (args->nfs_ext_u.nfs_extA.secdata == NULL) {
 796                                 error = EINVAL;
 797                         } else {
 798                                 /*
 799                                  * Need to validate the flavor here if
 800                                  * sysspace, userspace was already
 801                                  * validate from the nfs_copyin function.
 802                                  */
 803                                 switch (secdata->rpcflavor) {
 804                                 case AUTH_NONE:
 805                                 case AUTH_UNIX:
 806                                 case AUTH_LOOPBACK:
 807                                 case AUTH_DES:
 808                                 case RPCSEC_GSS:
 809                                         args->nfs_ext_u.nfs_extA.secdata = NULL;
 810                                         break;
 811                                 default:
 812                                         error = EINVAL;
 813                                         goto errout;
 814                                 }
 815                         }
 816                         break;
 817 
 818                 default:
 819                         error = EINVAL;
 820                         break;
 821                 }
 822         } else if (flags & NFSMNT_SECURE) {
 823                 /*
 824                  * Keep this for backward compatibility to support
 825                  * NFSMNT_SECURE/NFSMNT_RPCTIMESYNC flags.
 826                  */
 827                 if (args->syncaddr == NULL || args->syncaddr->buf == NULL) {
 828                         error = EINVAL;
 829                         goto errout;
 830                 }
 831                 /*
 832                  * Move security related data to the sec_data structure.
 833                  */
 834                 {
 835                         dh_k4_clntdata_t *data;
 836                         char *pf, *p;
 837                         secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
 838                         if (flags & NFSMNT_RPCTIMESYNC)
 839                                 secdata->flags |= AUTH_F_RPCTIMESYNC;
 840                         data = kmem_alloc(sizeof (*data), KM_SLEEP);
 841                         bcopy(args->syncaddr, &data->syncaddr,
 842                             sizeof (*args->syncaddr));
 843 
 844                         /*
 845                          * duplicate the knconf information for the
 846                          * new opaque data.
 847                          */
 848                         data->knconf = kmem_alloc(sizeof (*knconf), KM_SLEEP);
 849                         *data->knconf = *knconf;
 850                         pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 851                         p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 852                         bcopy(knconf->knc_protofmly, pf, KNC_STRSIZE);
 853                         bcopy(knconf->knc_proto, pf, KNC_STRSIZE);
 854                         data->knconf->knc_protofmly = pf;
 855                         data->knconf->knc_proto = p;
 856 
 857                         nlen = strlen(args->hostname) + 1;
 858                         /* move server netname to the sec_data structure */
 859                         if (nlen != 0) {
 860                                 data->netname = kmem_alloc(nlen, KM_SLEEP);
 861                                 bcopy(args->hostname, data->netname, nlen);
 862                                 data->netnamelen = nlen;
 863                         }
 864                         secdata->secmod = secdata->rpcflavor = AUTH_DES;
 865                         secdata->data = (caddr_t)data;
 866                 }
 867         } else  {
 868                 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
 869                 secdata->secmod = secdata->rpcflavor = AUTH_UNIX;
 870                 secdata->data = NULL;
 871         }
 872 
 873         svp->sv_secdata = secdata;
 874         if (error)
 875                 goto errout;
 876 
 877         /*
 878          * See bug 1180236.
 879          * If mount secure failed, we will fall back to AUTH_NONE
 880          * and try again.  nfs3rootvp() will turn this back off.
 881          *
 882          * The NFS Version 3 mount uses the FSINFO and GETATTR
 883          * procedures.  The server should not care if these procedures
 884          * have the proper security flavor, so if mount retries using
 885          * AUTH_NONE that does not require a credential setup for root
 886          * then the automounter would work without requiring root to be
 887          * keylogged into AUTH_DES.
 888          */
 889         if (secdata->rpcflavor != AUTH_UNIX &&
 890             secdata->rpcflavor != AUTH_LOOPBACK)
 891                 secdata->flags |= AUTH_F_TRYNONE;
 892 
 893         /*
 894          * Failover support:
 895          *
 896          * We may have a linked list of nfs_args structures,
 897          * which means the user is looking for failover.  If
 898          * the mount is either not "read-only" or "soft",
 899          * we want to bail out with EINVAL.
 900          */
 901         if (args->nfs_args_ext == NFS_ARGS_EXTB &&
 902             args->nfs_ext_u.nfs_extB.next != NULL) {
 903                 if (uap->flags & MS_RDONLY && !(flags & NFSMNT_SOFT)) {
 904                         data = (char *)args->nfs_ext_u.nfs_extB.next;
 905                         goto more;
 906                 }
 907                 error = EINVAL;
 908                 goto errout;
 909         }
 910 
 911         /*
 912          * Determine the zone we're being mounted into.
 913          */
 914         zone_hold(mntzone = zone);              /* start with this assumption */
 915         if (getzoneid() == GLOBAL_ZONEID) {
 916                 zone_rele(mntzone);
 917                 mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
 918                 ASSERT(mntzone != NULL);
 919                 if (mntzone != zone) {
 920                         error = EBUSY;
 921                         goto errout;
 922                 }
 923         }
 924 
 925         if (is_system_labeled()) {
 926                 error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
 927                     svp->sv_knconf, cr);
 928 
 929                 if (error > 0)
 930                         goto errout;
 931 
 932                 if (error == -1) {
 933                         /* change mount to read-only to prevent write-down */
 934                         vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 935                 }
 936         }
 937 
 938         /*
 939          * Stop the mount from going any further if the zone is going away.
 940          */
 941         if (zone_status_get(mntzone) >= ZONE_IS_SHUTTING_DOWN) {
 942                 error = EBUSY;
 943                 goto errout;
 944         }
 945 
 946         /*
 947          * Get root vnode.
 948          */
 949 proceed:
 950         error = nfs3rootvp(&rtvp, vfsp, svp_head, flags, cr, mntzone);
 951 
 952         if (error)
 953                 goto errout;
 954 
 955         /*
 956          * Set option fields in the mount info record
 957          */
 958         mi = VTOMI(rtvp);
 959 
 960         if (svp_head->sv_next)
 961                 mi->mi_flags |= MI_LLOCK;
 962 
 963         error = nfs_setopts(rtvp, DATAMODEL_NATIVE, args);
 964 
 965 errout:
 966         if (rtvp != NULL) {
 967                 if (error) {
 968                         rp = VTOR(rtvp);
 969                         if (rp->r_flags & RHASHED)
 970                                 rp_rmhash(rp);
 971                 }
 972                 VN_RELE(rtvp);
 973         }
 974 
 975         if (error) {
 976                 sv_free(svp_head);
 977                 if (mi != NULL) {
 978                         nfs_async_stop(vfsp);
 979                         nfs_async_manager_stop(vfsp);
 980                         if (mi->mi_io_kstats) {
 981                                 kstat_delete(mi->mi_io_kstats);
 982                                 mi->mi_io_kstats = NULL;
 983                         }
 984                         if (mi->mi_ro_kstats) {
 985                                 kstat_delete(mi->mi_ro_kstats);
 986                                 mi->mi_ro_kstats = NULL;
 987                         }
 988                         nfs_free_mi(mi);
 989                 }
 990         }
 991 
 992 
 993         if (!(uap->flags & MS_SYSSPACE)) {
 994                 nfs3_free_args(args, fhandle);
 995                 kmem_free(args, sizeof (*args));
 996         }
 997 
 998         if (mntzone != NULL)
 999                 zone_rele(mntzone);
1000 
1001         return (error);
1002 }
1003 
1004 volatile int nfs3_dynamic = 0;  /* global variable to enable dynamic retrans. */
1005 volatile ushort_t nfs3_max_threads = 8; /* max number of active async threads */
1006 volatile uint_t nfs3_bsize = 32 * 1024; /* client `block' size */
1007 volatile uint_t nfs3_async_clusters = 1; /* # of reqs from each async queue */
1008 volatile uint_t nfs3_cots_timeo = NFS_COTS_TIMEO;
1009 
1010 static int
1011 nfs3rootvp(vnode_t **rtvpp, vfs_t *vfsp, struct servinfo *svp,
1012         int flags, cred_t *cr, zone_t *zone)
1013 {
1014         vnode_t *rtvp;
1015         mntinfo_t *mi;
1016         dev_t nfs_dev;
1017         struct vattr va;
1018         struct FSINFO3args args;
1019         struct FSINFO3res res;
1020         int error;
1021         int douprintf;
1022         rnode_t *rp;
1023         int i;
1024         uint_t max_transfer_size;
1025         struct nfs_stats *nfsstatsp;
1026         cred_t *lcr = NULL, *tcr = cr;
1027 
1028         nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
1029         ASSERT(nfsstatsp != NULL);
1030 
1031         ASSERT(nfs_zone() == zone);
1032         /*
1033          * Create a mount record and link it to the vfs struct.
1034          */
1035         mi = kmem_zalloc(sizeof (*mi), KM_SLEEP);
1036         mutex_init(&mi->mi_lock, NULL, MUTEX_DEFAULT, NULL);
1037         mutex_init(&mi->mi_remap_lock, NULL, MUTEX_DEFAULT, NULL);
1038         mi->mi_flags = MI_ACL | MI_EXTATTR;
1039         if (!(flags & NFSMNT_SOFT))
1040                 mi->mi_flags |= MI_HARD;
1041         if ((flags & NFSMNT_SEMISOFT))
1042                 mi->mi_flags |= MI_SEMISOFT;
1043         if ((flags & NFSMNT_NOPRINT))
1044                 mi->mi_flags |= MI_NOPRINT;
1045         if (flags & NFSMNT_INT)
1046                 mi->mi_flags |= MI_INT;
1047         mi->mi_retrans = NFS_RETRIES;
1048         if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1049             svp->sv_knconf->knc_semantics == NC_TPI_COTS)
1050                 mi->mi_timeo = nfs3_cots_timeo;
1051         else
1052                 mi->mi_timeo = NFS_TIMEO;
1053         mi->mi_prog = NFS_PROGRAM;
1054         mi->mi_vers = NFS_V3;
1055         mi->mi_rfsnames = rfsnames_v3;
1056         mi->mi_reqs = nfsstatsp->nfs_stats_v3.rfsreqcnt_ptr;
1057         mi->mi_call_type = call_type_v3;
1058         mi->mi_ss_call_type = ss_call_type_v3;
1059         mi->mi_timer_type = timer_type_v3;
1060         mi->mi_aclnames = aclnames_v3;
1061         mi->mi_aclreqs = nfsstatsp->nfs_stats_v3.aclreqcnt_ptr;
1062         mi->mi_acl_call_type = acl_call_type_v3;
1063         mi->mi_acl_ss_call_type = acl_ss_call_type_v3;
1064         mi->mi_acl_timer_type = acl_timer_type_v3;
1065         cv_init(&mi->mi_failover_cv, NULL, CV_DEFAULT, NULL);
1066         mi->mi_servers = svp;
1067         mi->mi_curr_serv = svp;
1068         mi->mi_acregmin = SEC2HR(ACREGMIN);
1069         mi->mi_acregmax = SEC2HR(ACREGMAX);
1070         mi->mi_acdirmin = SEC2HR(ACDIRMIN);
1071         mi->mi_acdirmax = SEC2HR(ACDIRMAX);
1072 
1073         if (nfs3_dynamic)
1074                 mi->mi_flags |= MI_DYNAMIC;
1075 
1076         if (flags & NFSMNT_DIRECTIO)
1077                 mi->mi_flags |= MI_DIRECTIO;
1078 
1079         /*
1080          * Make a vfs struct for nfs.  We do this here instead of below
1081          * because rtvp needs a vfs before we can do a getattr on it.
1082          *
1083          * Assign a unique device id to the mount
1084          */
1085         mutex_enter(&nfs_minor_lock);
1086         do {
1087                 nfs_minor = (nfs_minor + 1) & MAXMIN32;
1088                 nfs_dev = makedevice(nfs_major, nfs_minor);
1089         } while (vfs_devismounted(nfs_dev));
1090         mutex_exit(&nfs_minor_lock);
1091 
1092         vfsp->vfs_dev = nfs_dev;
1093         vfs_make_fsid(&vfsp->vfs_fsid, nfs_dev, nfs3fstyp);
1094         vfsp->vfs_data = (caddr_t)mi;
1095         vfsp->vfs_fstype = nfsfstyp;
1096 
1097         /*
1098          * Verify that nfs3_bsize tuneable is set to an
1099          * acceptable value.  It be a multiple of PAGESIZE or
1100          * file corruption can occur.
1101          */
1102         if (nfs3_bsize & PAGEOFFSET)
1103                 nfs3_bsize &= PAGEMASK;
1104         if (nfs3_bsize < PAGESIZE)
1105                 nfs3_bsize = PAGESIZE;
1106         vfsp->vfs_bsize = nfs3_bsize;
1107 
1108         /*
1109          * Initialize fields used to support async putpage operations.
1110          */
1111         for (i = 0; i < NFS_ASYNC_TYPES; i++)
1112                 mi->mi_async_clusters[i] = nfs3_async_clusters;
1113         mi->mi_async_init_clusters = nfs3_async_clusters;
1114         mi->mi_async_curr[NFS_ASYNC_QUEUE] =
1115             mi->mi_async_curr[NFS_ASYNC_PGOPS_QUEUE] = &mi->mi_async_reqs[0];
1116         mi->mi_max_threads = nfs3_max_threads;
1117         mutex_init(&mi->mi_async_lock, NULL, MUTEX_DEFAULT, NULL);
1118         cv_init(&mi->mi_async_reqs_cv, NULL, CV_DEFAULT, NULL);
1119         cv_init(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE], NULL, CV_DEFAULT, NULL);
1120         cv_init(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE], NULL,
1121             CV_DEFAULT, NULL);
1122         cv_init(&mi->mi_async_cv, NULL, CV_DEFAULT, NULL);
1123 
1124         mi->mi_vfsp = vfsp;
1125         mi->mi_zone = zone;
1126         zone_init_ref(&mi->mi_zone_ref);
1127         zone_hold_ref(zone, &mi->mi_zone_ref, ZONE_REF_NFS);
1128         nfs_mi_zonelist_add(mi);
1129 
1130         /*
1131          * Make the root vnode, use it to get attributes,
1132          * then remake it with the attributes.
1133          */
1134         rtvp = makenfs3node((nfs_fh3 *)&svp->sv_fhandle,
1135             NULL, vfsp, gethrtime(), cr, NULL, NULL);
1136 
1137         /*
1138          * Make the FSINFO calls, primarily at this point to
1139          * determine the transfer size.  For client failover,
1140          * we'll want this to be the minimum bid from any
1141          * server, so that we don't overrun stated limits.
1142          *
1143          * While we're looping, we'll turn off AUTH_F_TRYNONE,
1144          * which is only for the mount operation.
1145          */
1146 
1147         mi->mi_tsize = nfs3_tsize(svp->sv_knconf);
1148         mi->mi_stsize = mi->mi_tsize;
1149 
1150         mi->mi_curread = nfs3_bsize;
1151         mi->mi_curwrite = mi->mi_curread;
1152 
1153         /*
1154          * If the uid is set then set the creds for secure mounts
1155          * by proxy processes such as automountd.
1156          */
1157         if (svp->sv_secdata->uid != 0 &&
1158             svp->sv_secdata->rpcflavor == RPCSEC_GSS) {
1159                 lcr = crdup(cr);
1160                 (void) crsetugid(lcr, svp->sv_secdata->uid, crgetgid(cr));
1161                 tcr = lcr;
1162         }
1163 
1164         for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
1165                 douprintf = 1;
1166                 mi->mi_curr_serv = svp;
1167                 max_transfer_size = nfs3_tsize(svp->sv_knconf);
1168                 mi->mi_tsize = MIN(max_transfer_size, mi->mi_tsize);
1169                 mi->mi_stsize = MIN(max_transfer_size, mi->mi_stsize);
1170                 mi->mi_curread = MIN(max_transfer_size, mi->mi_curread);
1171                 mi->mi_curwrite = MIN(max_transfer_size, mi->mi_curwrite);
1172                 args.fsroot = *(nfs_fh3 *)&svp->sv_fhandle;
1173 
1174                 error = rfs3call(mi, NFSPROC3_FSINFO,
1175                     xdr_nfs_fh3, (caddr_t)&args,
1176                     xdr_FSINFO3res, (caddr_t)&res, tcr,
1177                     &douprintf, &res.status, 0, NULL);
1178                 if (error)
1179                         goto bad;
1180                 error = geterrno3(res.status);
1181                 if (error)
1182                         goto bad;
1183 
1184                 /* get type of root node */
1185                 if (res.resok.obj_attributes.attributes) {
1186                         if (res.resok.obj_attributes.attr.type < NF3REG ||
1187                             res.resok.obj_attributes.attr.type > NF3FIFO) {
1188 #ifdef DEBUG
1189                                 zcmn_err(getzoneid(), CE_WARN,
1190                             "NFS3 server %s returned a bad file type for root",
1191                                     svp->sv_hostname);
1192 #else
1193                                 zcmn_err(getzoneid(), CE_WARN,
1194                             "NFS server %s returned a bad file type for root",
1195                                     svp->sv_hostname);
1196 #endif
1197                                 error = EINVAL;
1198                                 goto bad;
1199                         } else {
1200                                 if (rtvp->v_type != VNON && rtvp->v_type !=
1201                                     nf3_to_vt[res.resok.obj_attributes.attr.
1202                                     type]) {
1203 #ifdef DEBUG
1204                                         zcmn_err(getzoneid(), CE_WARN,
1205                 "NFS3 server %s returned a different file type for root",
1206                                             svp->sv_hostname);
1207 #else
1208                                         zcmn_err(getzoneid(), CE_WARN,
1209                 "NFS server %s returned a different file type for root",
1210                                             svp->sv_hostname);
1211 #endif
1212                                         error = EINVAL;
1213                                         goto bad;
1214                                 }
1215                                 rtvp->v_type =
1216                                     nf3_to_vt[res.resok.obj_attributes.attr.
1217                                     type];
1218                         }
1219                 }
1220 
1221                 if (res.resok.rtmax != 0) {
1222                         mi->mi_tsize = MIN(res.resok.rtmax, mi->mi_tsize);
1223                         if (res.resok.rtpref != 0) {
1224                                 mi->mi_curread = MIN(res.resok.rtpref,
1225                                     mi->mi_curread);
1226                         } else {
1227                                 mi->mi_curread = MIN(res.resok.rtmax,
1228                                     mi->mi_curread);
1229                         }
1230                 } else if (res.resok.rtpref != 0) {
1231                         mi->mi_tsize = MIN(res.resok.rtpref, mi->mi_tsize);
1232                         mi->mi_curread = MIN(res.resok.rtpref, mi->mi_curread);
1233                 } else {
1234 #ifdef DEBUG
1235                         zcmn_err(getzoneid(), CE_WARN,
1236                             "NFS3 server %s returned 0 for read transfer sizes",
1237                             svp->sv_hostname);
1238 #else
1239                         zcmn_err(getzoneid(), CE_WARN,
1240                             "NFS server %s returned 0 for read transfer sizes",
1241                             svp->sv_hostname);
1242 #endif
1243                         error = EIO;
1244                         goto bad;
1245                 }
1246                 if (res.resok.wtmax != 0) {
1247                         mi->mi_stsize = MIN(res.resok.wtmax, mi->mi_stsize);
1248                         if (res.resok.wtpref != 0) {
1249                                 mi->mi_curwrite = MIN(res.resok.wtpref,
1250                                     mi->mi_curwrite);
1251                         } else {
1252                                 mi->mi_curwrite = MIN(res.resok.wtmax,
1253                                     mi->mi_curwrite);
1254                         }
1255                 } else if (res.resok.wtpref != 0) {
1256                         mi->mi_stsize = MIN(res.resok.wtpref, mi->mi_stsize);
1257                         mi->mi_curwrite = MIN(res.resok.wtpref,
1258                             mi->mi_curwrite);
1259                 } else {
1260 #ifdef DEBUG
1261                         zcmn_err(getzoneid(), CE_WARN,
1262                         "NFS3 server %s returned 0 for write transfer sizes",
1263                             svp->sv_hostname);
1264 #else
1265                         zcmn_err(getzoneid(), CE_WARN,
1266                         "NFS server %s returned 0 for write transfer sizes",
1267                             svp->sv_hostname);
1268 #endif
1269                         error = EIO;
1270                         goto bad;
1271                 }
1272 
1273                 /*
1274                  * These signal the ability of the server to create
1275                  * hard links and symbolic links, so they really
1276                  * aren't relevant if there is more than one server.
1277                  * We'll set them here, though it probably looks odd.
1278                  */
1279                 if (res.resok.properties & FSF3_LINK)
1280                         mi->mi_flags |= MI_LINK;
1281                 if (res.resok.properties & FSF3_SYMLINK)
1282                         mi->mi_flags |= MI_SYMLINK;
1283 
1284                 /* Pick up smallest non-zero maxfilesize value */
1285                 if (res.resok.maxfilesize) {
1286                         if (mi->mi_maxfilesize) {
1287                                 mi->mi_maxfilesize = MIN(mi->mi_maxfilesize,
1288                                     res.resok.maxfilesize);
1289                         } else
1290                                 mi->mi_maxfilesize = res.resok.maxfilesize;
1291                 }
1292 
1293                 /*
1294                  * AUTH_F_TRYNONE is only for the mount operation,
1295                  * so turn it back off.
1296                  */
1297                 svp->sv_secdata->flags &= ~AUTH_F_TRYNONE;
1298         }
1299         mi->mi_curr_serv = mi->mi_servers;
1300 
1301         /*
1302          * Start the thread responsible for handling async worker threads.
1303          */
1304         VFS_HOLD(vfsp); /* add reference for thread */
1305         mi->mi_manager_thread = zthread_create(NULL, 0, nfs_async_manager,
1306             vfsp, 0, minclsyspri);
1307         ASSERT(mi->mi_manager_thread != NULL);
1308 
1309         /*
1310          * Initialize kstats
1311          */
1312         nfs_mnt_kstat_init(vfsp);
1313 
1314         /* If we didn't get a type, get one now */
1315         if (rtvp->v_type == VNON) {
1316                 va.va_mask = AT_ALL;
1317 
1318                 error = nfs3getattr(rtvp, &va, tcr);
1319                 if (error)
1320                         goto bad;
1321                 rtvp->v_type = va.va_type;
1322         }
1323 
1324         mi->mi_type = rtvp->v_type;
1325 
1326         *rtvpp = rtvp;
1327         if (lcr != NULL)
1328                 crfree(lcr);
1329 
1330         return (0);
1331 bad:
1332         /*
1333          * An error occurred somewhere, need to clean up...
1334          * We need to release our reference to the root vnode and
1335          * destroy the mntinfo struct that we just created.
1336          */
1337         if (lcr != NULL)
1338                 crfree(lcr);
1339         rp = VTOR(rtvp);
1340         if (rp->r_flags & RHASHED)
1341                 rp_rmhash(rp);
1342         VN_RELE(rtvp);
1343         nfs_async_stop(vfsp);
1344         nfs_async_manager_stop(vfsp);
1345         if (mi->mi_io_kstats) {
1346                 kstat_delete(mi->mi_io_kstats);
1347                 mi->mi_io_kstats = NULL;
1348         }
1349         if (mi->mi_ro_kstats) {
1350                 kstat_delete(mi->mi_ro_kstats);
1351                 mi->mi_ro_kstats = NULL;
1352         }
1353         nfs_free_mi(mi);
1354         *rtvpp = NULL;
1355         return (error);
1356 }
1357 
1358 /*
1359  * vfs operations
1360  */
1361 static int
1362 nfs3_unmount(vfs_t *vfsp, int flag, cred_t *cr)
1363 {
1364         mntinfo_t *mi;
1365         ushort_t omax;
1366 
1367         if (secpolicy_fs_unmount(cr, vfsp) != 0)
1368                 return (EPERM);
1369 
1370         mi = VFTOMI(vfsp);
1371         if (flag & MS_FORCE) {
1372 
1373                 vfsp->vfs_flag |= VFS_UNMOUNTED;
1374 
1375                 /*
1376                  * We are about to stop the async manager.
1377                  * Let every one know not to schedule any
1378                  * more async requests
1379                  */
1380                 mutex_enter(&mi->mi_async_lock);
1381                 mi->mi_max_threads = 0;
1382                 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1383                 mutex_exit(&mi->mi_async_lock);
1384 
1385                 /*
1386                  * We need to stop the manager thread explicitly; the worker
1387                  * threads can time out and exit on their own.
1388                  */
1389                 nfs_async_manager_stop(vfsp);
1390                 destroy_rtable(vfsp, cr);
1391                 if (mi->mi_io_kstats) {
1392                         kstat_delete(mi->mi_io_kstats);
1393                         mi->mi_io_kstats = NULL;
1394                 }
1395                 if (mi->mi_ro_kstats) {
1396                         kstat_delete(mi->mi_ro_kstats);
1397                         mi->mi_ro_kstats = NULL;
1398                 }
1399                 return (0);
1400         }
1401         /*
1402          * Wait until all asynchronous putpage operations on
1403          * this file system are complete before flushing rnodes
1404          * from the cache.
1405          */
1406         omax = mi->mi_max_threads;
1407         if (nfs_async_stop_sig(vfsp)) {
1408                 return (EINTR);
1409         }
1410         rflush(vfsp, cr);
1411         /*
1412          * If there are any active vnodes on this file system,
1413          * then the file system is busy and can't be umounted.
1414          */
1415         if (check_rtable(vfsp)) {
1416                 mutex_enter(&mi->mi_async_lock);
1417                 mi->mi_max_threads = omax;
1418                 mutex_exit(&mi->mi_async_lock);
1419                 return (EBUSY);
1420         }
1421         /*
1422          * The unmount can't fail from now on; stop the worker thread manager.
1423          */
1424         nfs_async_manager_stop(vfsp);
1425         /*
1426          * Destroy all rnodes belonging to this file system from the
1427          * rnode hash queues and purge any resources allocated to
1428          * them.
1429          */
1430         destroy_rtable(vfsp, cr);
1431         if (mi->mi_io_kstats) {
1432                 kstat_delete(mi->mi_io_kstats);
1433                 mi->mi_io_kstats = NULL;
1434         }
1435         if (mi->mi_ro_kstats) {
1436                 kstat_delete(mi->mi_ro_kstats);
1437                 mi->mi_ro_kstats = NULL;
1438         }
1439         return (0);
1440 }
1441 
1442 /*
1443  * find root of nfs
1444  */
1445 static int
1446 nfs3_root(vfs_t *vfsp, vnode_t **vpp)
1447 {
1448         mntinfo_t *mi;
1449         vnode_t *vp;
1450         servinfo_t *svp;
1451         rnode_t *rp;
1452         int error = 0;
1453 
1454         mi = VFTOMI(vfsp);
1455 
1456         if (nfs_zone() != mi->mi_zone)
1457                 return (EPERM);
1458 
1459         svp = mi->mi_curr_serv;
1460         if (svp && (svp->sv_flags & SV_ROOT_STALE)) {
1461                 mutex_enter(&svp->sv_lock);
1462                 svp->sv_flags &= ~SV_ROOT_STALE;
1463                 mutex_exit(&svp->sv_lock);
1464                 error = ENOENT;
1465         }
1466 
1467         vp = makenfs3node((nfs_fh3 *)&mi->mi_curr_serv->sv_fhandle,
1468             NULL, vfsp, gethrtime(), CRED(), NULL, NULL);
1469 
1470         /*
1471          * if the SV_ROOT_STALE flag was reset above, reset the
1472          * RSTALE flag if needed and return an error
1473          */
1474         if (error == ENOENT) {
1475                 rp = VTOR(vp);
1476                 if (svp && rp->r_flags & RSTALE) {
1477                         mutex_enter(&rp->r_statelock);
1478                         rp->r_flags &= ~RSTALE;
1479                         mutex_exit(&rp->r_statelock);
1480                 }
1481                 VN_RELE(vp);
1482                 return (error);
1483         }
1484 
1485         ASSERT(vp->v_type == VNON || vp->v_type == mi->mi_type);
1486 
1487         vp->v_type = mi->mi_type;
1488 
1489         *vpp = vp;
1490 
1491         return (0);
1492 }
1493 
1494 /*
1495  * Get file system statistics.
1496  */
1497 static int
1498 nfs3_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
1499 {
1500         int error;
1501         struct mntinfo *mi;
1502         struct FSSTAT3args args;
1503         struct FSSTAT3res res;
1504         int douprintf;
1505         failinfo_t fi;
1506         vnode_t *vp;
1507         cred_t *cr;
1508         hrtime_t t;
1509 
1510         mi = VFTOMI(vfsp);
1511         if (nfs_zone() != mi->mi_zone)
1512                 return (EPERM);
1513         error = nfs3_root(vfsp, &vp);
1514         if (error)
1515                 return (error);
1516 
1517         cr = CRED();
1518 
1519         args.fsroot = *VTOFH3(vp);
1520         fi.vp = vp;
1521         fi.fhp = (caddr_t)&args.fsroot;
1522         fi.copyproc = nfs3copyfh;
1523         fi.lookupproc = nfs3lookup;
1524         fi.xattrdirproc = acl_getxattrdir3;
1525 
1526         douprintf = 1;
1527 
1528         t = gethrtime();
1529 
1530         error = rfs3call(mi, NFSPROC3_FSSTAT,
1531             xdr_nfs_fh3, (caddr_t)&args,
1532             xdr_FSSTAT3res, (caddr_t)&res, cr,
1533             &douprintf, &res.status, 0, &fi);
1534 
1535         if (error) {
1536                 VN_RELE(vp);
1537                 return (error);
1538         }
1539 
1540         error = geterrno3(res.status);
1541         if (!error) {
1542                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
1543                 sbp->f_bsize = MAXBSIZE;
1544                 sbp->f_frsize = DEV_BSIZE;
1545                 /*
1546                  * Allow -1 fields to pass through unconverted.  These
1547                  * indicate "don't know" fields.
1548                  */
1549                 if (res.resok.tbytes == (size3)-1)
1550                         sbp->f_blocks = (fsblkcnt64_t)res.resok.tbytes;
1551                 else {
1552                         sbp->f_blocks = (fsblkcnt64_t)
1553                             (res.resok.tbytes / DEV_BSIZE);
1554                 }
1555                 if (res.resok.fbytes == (size3)-1)
1556                         sbp->f_bfree = (fsblkcnt64_t)res.resok.fbytes;
1557                 else {
1558                         sbp->f_bfree = (fsblkcnt64_t)
1559                             (res.resok.fbytes / DEV_BSIZE);
1560                 }
1561                 if (res.resok.abytes == (size3)-1)
1562                         sbp->f_bavail = (fsblkcnt64_t)res.resok.abytes;
1563                 else {
1564                         sbp->f_bavail = (fsblkcnt64_t)
1565                             (res.resok.abytes / DEV_BSIZE);
1566                 }
1567                 sbp->f_files = (fsfilcnt64_t)res.resok.tfiles;
1568                 sbp->f_ffree = (fsfilcnt64_t)res.resok.ffiles;
1569                 sbp->f_favail = (fsfilcnt64_t)res.resok.afiles;
1570                 sbp->f_fsid = (unsigned long)vfsp->vfs_fsid.val[0];
1571                 (void) strncpy(sbp->f_basetype,
1572                     vfssw[vfsp->vfs_fstype].vsw_name, FSTYPSZ);
1573                 sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
1574                 sbp->f_namemax = (ulong_t)-1;
1575         } else {
1576                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
1577                 PURGE_STALE_FH(error, vp, cr);
1578         }
1579 
1580         VN_RELE(vp);
1581 
1582         return (error);
1583 }
1584 
1585 static kmutex_t nfs3_syncbusy;
1586 
1587 /*
1588  * Flush dirty nfs files for file system vfsp.
1589  * If vfsp == NULL, all nfs files are flushed.
1590  */
1591 /* ARGSUSED */
1592 static int
1593 nfs3_sync(vfs_t *vfsp, short flag, cred_t *cr)
1594 {
1595         /*
1596          * Cross-zone calls are OK here, since this translates to a
1597          * VOP_PUTPAGE(B_ASYNC), which gets picked up by the right zone.
1598          */
1599         if (!(flag & SYNC_ATTR) && mutex_tryenter(&nfs3_syncbusy) != 0) {
1600                 rflush(vfsp, cr);
1601                 mutex_exit(&nfs3_syncbusy);
1602         }
1603         return (0);
1604 }
1605 
1606 /* ARGSUSED */
1607 static int
1608 nfs3_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
1609 {
1610         int error;
1611         nfs_fh3 fh;
1612         vnode_t *vp;
1613         struct vattr va;
1614 
1615         if (fidp->fid_len > NFS3_FHSIZE) {
1616                 *vpp = NULL;
1617                 return (ESTALE);
1618         }
1619 
1620         if (nfs_zone() != VFTOMI(vfsp)->mi_zone)
1621                 return (EPERM);
1622         fh.fh3_length = fidp->fid_len;
1623         bcopy(fidp->fid_data, fh.fh3_u.data, fh.fh3_length);
1624 
1625         vp = makenfs3node(&fh, NULL, vfsp, gethrtime(), CRED(), NULL, NULL);
1626 
1627         if (VTOR(vp)->r_flags & RSTALE) {
1628                 VN_RELE(vp);
1629                 *vpp = NULL;
1630                 return (ENOENT);
1631         }
1632 
1633         if (vp->v_type == VNON) {
1634                 va.va_mask = AT_ALL;
1635                 error = nfs3getattr(vp, &va, CRED());
1636                 if (error) {
1637                         VN_RELE(vp);
1638                         *vpp = NULL;
1639                         return (error);
1640                 }
1641                 vp->v_type = va.va_type;
1642         }
1643 
1644         *vpp = vp;
1645 
1646         return (0);
1647 }
1648 
1649 /* ARGSUSED */
1650 static int
1651 nfs3_mountroot(vfs_t *vfsp, whymountroot_t why)
1652 {
1653         vnode_t *rtvp;
1654         char root_hostname[SYS_NMLN+1];
1655         struct servinfo *svp;
1656         int error;
1657         int vfsflags;
1658         size_t size;
1659         char *root_path;
1660         struct pathname pn;
1661         char *name;
1662         cred_t *cr;
1663         struct nfs_args args;           /* nfs mount arguments */
1664         static char token[10];
1665 
1666         bzero(&args, sizeof (args));
1667 
1668         /* do this BEFORE getfile which causes xid stamps to be initialized */
1669         clkset(-1L);            /* hack for now - until we get time svc? */
1670 
1671         if (why == ROOT_REMOUNT) {
1672                 /*
1673                  * Shouldn't happen.
1674                  */
1675                 panic("nfs3_mountroot: why == ROOT_REMOUNT");
1676         }
1677 
1678         if (why == ROOT_UNMOUNT) {
1679                 /*
1680                  * Nothing to do for NFS.
1681                  */
1682                 return (0);
1683         }
1684 
1685         /*
1686          * why == ROOT_INIT
1687          */
1688 
1689         name = token;
1690         *name = 0;
1691         getfsname("root", name, sizeof (token));
1692 
1693         pn_alloc(&pn);
1694         root_path = pn.pn_path;
1695 
1696         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
1697         svp->sv_knconf = kmem_zalloc(sizeof (*svp->sv_knconf), KM_SLEEP);
1698         svp->sv_knconf->knc_protofmly = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
1699         svp->sv_knconf->knc_proto = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
1700 
1701         /*
1702          * Get server address
1703          * Get the root fhandle
1704          * Get server's transport
1705          * Get server's hostname
1706          * Get options
1707          */
1708         args.addr = &svp->sv_addr;
1709         args.fh = (char *)&svp->sv_fhandle;
1710         args.knconf = svp->sv_knconf;
1711         args.hostname = root_hostname;
1712         vfsflags = 0;
1713         if (error = mount_root(*name ? name : "root", root_path, NFS_V3,
1714             &args, &vfsflags)) {
1715                 if (error == EPROTONOSUPPORT)
1716                         nfs_cmn_err(error, CE_WARN, "nfs3_mountroot: "
1717                             "mount_root failed: server doesn't support NFS V3");
1718                 else
1719                         nfs_cmn_err(error, CE_WARN,
1720                             "nfs3_mountroot: mount_root failed: %m");
1721                 sv_free(svp);
1722                 pn_free(&pn);
1723                 return (error);
1724         }
1725         svp->sv_hostnamelen = (int)(strlen(root_hostname) + 1);
1726         svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
1727         (void) strcpy(svp->sv_hostname, root_hostname);
1728 
1729         /*
1730          * Force root partition to always be mounted with AUTH_UNIX for now
1731          */
1732         svp->sv_secdata = kmem_alloc(sizeof (*svp->sv_secdata), KM_SLEEP);
1733         svp->sv_secdata->secmod = AUTH_UNIX;
1734         svp->sv_secdata->rpcflavor = AUTH_UNIX;
1735         svp->sv_secdata->data = NULL;
1736 
1737         cr = crgetcred();
1738         rtvp = NULL;
1739 
1740         error = nfs3rootvp(&rtvp, vfsp, svp, args.flags, cr, global_zone);
1741 
1742         crfree(cr);
1743 
1744         if (error) {
1745                 pn_free(&pn);
1746                 sv_free(svp);
1747                 return (error);
1748         }
1749 
1750         error = nfs_setopts(rtvp, DATAMODEL_NATIVE, &args);
1751         if (error) {
1752                 nfs_cmn_err(error, CE_WARN,
1753                     "nfs3_mountroot: invalid root mount options");
1754                 pn_free(&pn);
1755                 goto errout;
1756         }
1757 
1758         (void) vfs_lock_wait(vfsp);
1759         vfs_add(NULL, vfsp, vfsflags);
1760         vfs_unlock(vfsp);
1761 
1762         size = strlen(svp->sv_hostname);
1763         (void) strcpy(rootfs.bo_name, svp->sv_hostname);
1764         rootfs.bo_name[size] = ':';
1765         (void) strcpy(&rootfs.bo_name[size + 1], root_path);
1766 
1767         pn_free(&pn);
1768 
1769 errout:
1770         if (error) {
1771                 sv_free(svp);
1772                 nfs_async_stop(vfsp);
1773                 nfs_async_manager_stop(vfsp);
1774         }
1775 
1776         if (rtvp != NULL)
1777                 VN_RELE(rtvp);
1778 
1779         return (error);
1780 }
1781 
1782 /*
1783  * Initialization routine for VFS routines.  Should only be called once
1784  */
1785 int
1786 nfs3_vfsinit(void)
1787 {
1788         mutex_init(&nfs3_syncbusy, NULL, MUTEX_DEFAULT, NULL);
1789         return (0);
1790 }
1791 
1792 void
1793 nfs3_vfsfini(void)
1794 {
1795         mutex_destroy(&nfs3_syncbusy);
1796 }
1797 
1798 void
1799 nfs3_freevfs(vfs_t *vfsp)
1800 {
1801         mntinfo_t *mi;
1802         servinfo_t *svp;
1803 
1804         /* free up the resources */
1805         mi = VFTOMI(vfsp);
1806         svp = mi->mi_servers;
1807         mi->mi_servers = mi->mi_curr_serv = NULL;
1808         sv_free(svp);
1809 
1810         /*
1811          * By this time we should have already deleted the
1812          * mi kstats in the unmount code. If they are still around
1813          * somethings wrong
1814          */
1815         ASSERT(mi->mi_io_kstats == NULL);
1816         nfs_free_mi(mi);
1817 }