1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2016 by Delphix. All rights reserved.
  25  */
  26 
  27 /*
  28  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  29  *      All rights reserved.
  30  */
  31 
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/systm.h>
  35 #include <sys/cred.h>
  36 #include <sys/buf.h>
  37 #include <sys/vfs.h>
  38 #include <sys/vnode.h>
  39 #include <sys/uio.h>
  40 #include <sys/stat.h>
  41 #include <sys/errno.h>
  42 #include <sys/sysmacros.h>
  43 #include <sys/statvfs.h>
  44 #include <sys/kmem.h>
  45 #include <sys/kstat.h>
  46 #include <sys/dirent.h>
  47 #include <sys/cmn_err.h>
  48 #include <sys/debug.h>
  49 #include <sys/vtrace.h>
  50 #include <sys/mode.h>
  51 #include <sys/acl.h>
  52 #include <sys/nbmlock.h>
  53 #include <sys/policy.h>
  54 #include <sys/sdt.h>
  55 
  56 #include <rpc/types.h>
  57 #include <rpc/auth.h>
  58 #include <rpc/svc.h>
  59 
  60 #include <nfs/nfs.h>
  61 #include <nfs/export.h>
  62 #include <nfs/nfs_cmd.h>
  63 
  64 #include <vm/hat.h>
  65 #include <vm/as.h>
  66 #include <vm/seg.h>
  67 #include <vm/seg_map.h>
  68 #include <vm/seg_kmem.h>
  69 
  70 #include <sys/strsubr.h>
  71 
  72 /*
  73  * These are the interface routines for the server side of the
  74  * Network File System.  See the NFS version 2 protocol specification
  75  * for a description of this interface.
  76  */
  77 
  78 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  79 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
  80                         cred_t *);
  81 
  82 /*
  83  * Some "over the wire" UNIX file types.  These are encoded
  84  * into the mode.  This needs to be fixed in the next rev.
  85  */
  86 #define IFMT            0170000         /* type of file */
  87 #define IFCHR           0020000         /* character special */
  88 #define IFBLK           0060000         /* block special */
  89 #define IFSOCK          0140000         /* socket */
  90 
  91 u_longlong_t nfs2_srv_caller_id;
  92 
  93 /*
  94  * Get file attributes.
  95  * Returns the current attributes of the file with the given fhandle.
  96  */
  97 /* ARGSUSED */
  98 void
  99 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 100     struct svc_req *req, cred_t *cr, bool_t ro)
 101 {
 102         int error;
 103         vnode_t *vp;
 104         struct vattr va;
 105 
 106         vp = nfs_fhtovp(fhp, exi);
 107         if (vp == NULL) {
 108                 ns->ns_status = NFSERR_STALE;
 109                 return;
 110         }
 111 
 112         /*
 113          * Do the getattr.
 114          */
 115         va.va_mask = AT_ALL;    /* we want all the attributes */
 116 
 117         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 118 
 119         /* check for overflows */
 120         if (!error) {
 121                 /* Lie about the object type for a referral */
 122                 if (vn_is_nfs_reparse(vp, cr))
 123                         va.va_type = VLNK;
 124 
 125                 acl_perm(vp, exi, &va, cr);
 126                 error = vattr_to_nattr(&va, &ns->ns_attr);
 127         }
 128 
 129         VN_RELE(vp);
 130 
 131         ns->ns_status = puterrno(error);
 132 }
 133 void *
 134 rfs_getattr_getfh(fhandle_t *fhp)
 135 {
 136         return (fhp);
 137 }
 138 
 139 /*
 140  * Set file attributes.
 141  * Sets the attributes of the file with the given fhandle.  Returns
 142  * the new attributes.
 143  */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* ATTR_UTIME when client sent explicit times */
	int in_crit = 0;	/* nonzero while in the NBMAND critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes the client asked us to set */
	struct vattr bva;	/* attributes of the file before any change */
	struct flock64 bf;	/* region description handed to VOP_SPACE */
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* Refuse attribute changes on read-only exports. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the over-the-wire sattr into a vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/* Fail with EAGAIN rather than blocking on a delegation conflict. */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		/* Serialize with non-blocking mandatory locking. */
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		/* Need owner and current size for the checks below. */
		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/* The affected range lies between old and new size. */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * The owner may resize regardless of the mode bits;
		 * VOP_SPACE skips the access check VOP_SETATTR applies.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			/* Size is handled here; VOP_SETATTR must not redo it. */
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			/* l_len == 0: free from l_start to end of file. */
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* On success, fetch the post-change attributes for the reply. */
	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
 326 void *
 327 rfs_setattr_getfh(struct nfssaargs *args)
 328 {
 329         return (&args->saa_fh);
 330 }
 331 
 332 /*
 333  * Directory lookup.
 334  * Returns an fhandle and file attributes for file name in a directory.
 335  */
 336 /* ARGSUSED */
 337 void
 338 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 339     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 340 {
 341         int error;
 342         vnode_t *dvp;
 343         vnode_t *vp;
 344         struct vattr va;
 345         fhandle_t *fhp = da->da_fhandle;
 346         struct sec_ol sec = {0, 0};
 347         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 348         char *name;
 349         struct sockaddr *ca;
 350 
 351         /*
 352          * Trusted Extension doesn't support NFSv2. MOUNT
 353          * will reject v2 clients. Need to prevent v2 client
 354          * access via WebNFS here.
 355          */
 356         if (is_system_labeled() && req->rq_vers == 2) {
 357                 dr->dr_status = NFSERR_ACCES;
 358                 return;
 359         }
 360 
 361         /*
 362          * Disallow NULL paths
 363          */
 364         if (da->da_name == NULL || *da->da_name == '\0') {
 365                 dr->dr_status = NFSERR_ACCES;
 366                 return;
 367         }
 368 
 369         /*
 370          * Allow lookups from the root - the default
 371          * location of the public filehandle.
 372          */
 373         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 374                 dvp = rootdir;
 375                 VN_HOLD(dvp);
 376         } else {
 377                 dvp = nfs_fhtovp(fhp, exi);
 378                 if (dvp == NULL) {
 379                         dr->dr_status = NFSERR_STALE;
 380                         return;
 381                 }
 382         }
 383 
 384         /*
 385          * Not allow lookup beyond root.
 386          * If the filehandle matches a filehandle of the exi,
 387          * then the ".." refers beyond the root of an exported filesystem.
 388          */
 389         if (strcmp(da->da_name, "..") == 0 &&
 390             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 391                 VN_RELE(dvp);
 392                 dr->dr_status = NFSERR_NOENT;
 393                 return;
 394         }
 395 
 396         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 397         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 398             MAXPATHLEN);
 399 
 400         if (name == NULL) {
 401                 dr->dr_status = NFSERR_ACCES;
 402                 return;
 403         }
 404 
 405         /*
 406          * If the public filehandle is used then allow
 407          * a multi-component lookup, i.e. evaluate
 408          * a pathname and follow symbolic links if
 409          * necessary.
 410          *
 411          * This may result in a vnode in another filesystem
 412          * which is OK as long as the filesystem is exported.
 413          */
 414         if (PUBLIC_FH2(fhp)) {
 415                 publicfh_flag = TRUE;
 416                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 417                     &sec);
 418         } else {
 419                 /*
 420                  * Do a normal single component lookup.
 421                  */
 422                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 423                     NULL, NULL, NULL);
 424         }
 425 
 426         if (name != da->da_name)
 427                 kmem_free(name, MAXPATHLEN);
 428 
 429 
 430         if (!error) {
 431                 va.va_mask = AT_ALL;    /* we want everything */
 432 
 433                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 434 
 435                 /* check for overflows */
 436                 if (!error) {
 437                         acl_perm(vp, exi, &va, cr);
 438                         error = vattr_to_nattr(&va, &dr->dr_attr);
 439                         if (!error) {
 440                                 if (sec.sec_flags & SEC_QUERY)
 441                                         error = makefh_ol(&dr->dr_fhandle, exi,
 442                                             sec.sec_index);
 443                                 else {
 444                                         error = makefh(&dr->dr_fhandle, vp,
 445                                             exi);
 446                                         if (!error && publicfh_flag &&
 447                                             !chk_clnt_sec(exi, req))
 448                                                 auth_weak = TRUE;
 449                                 }
 450                         }
 451                 }
 452                 VN_RELE(vp);
 453         }
 454 
 455         VN_RELE(dvp);
 456 
 457         /*
 458          * If publicfh_flag is true then we have called rfs_publicfh_mclookup
 459          * and have obtained a new exportinfo in exi which needs to be
 460          * released. Note the the original exportinfo pointed to by exi
 461          * will be released by the caller, comon_dispatch.
 462          */
 463         if (publicfh_flag && exi != NULL)
 464                 exi_rele(exi);
 465 
 466         /*
 467          * If it's public fh, no 0x81, and client's flavor is
 468          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 469          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 470          */
 471         if (auth_weak)
 472                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 473         else
 474                 dr->dr_status = puterrno(error);
 475 }
 476 void *
 477 rfs_lookup_getfh(struct nfsdiropargs *da)
 478 {
 479         return (da->da_fhandle);
 480 }
 481 
 482 /*
 483  * Read symbolic link.
 484  * Returns the string in the symbolic link at the given fhandle.
 485  */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;	/* nonzero if vp is a reparse point */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	/* Need the mode bits for the mandatory-locking check below. */
	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse access to objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		/* Count this referral-to-symlink conversion. */
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		/* Bytes actually read is the count consumed from resid. */
		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link text is exactly NFS_MAXPATHLEN
		 * bytes, this writes one byte past rl_data -- confirm
		 * VOP_READLINK cannot fill the entire buffer.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/*
	 * Convert the link text to the client's character set; a newly
	 * allocated buffer may be returned, replacing rl_data.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
 612 void *
 613 rfs_readlink_getfh(fhandle_t *fhp)
 614 {
 615         return (fhp);
 616 }
 617 /*
 618  * Free data allocated by rfs_readlink
 619  */
 620 void
 621 rfs_rlfree(struct nfsrdlnres *rl)
 622 {
 623         if (rl->rl_data != NULL)
 624                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 625 }
 626 
 627 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 628 
 629 /*
 630  * Read data.
 631  * Returns some data read from the file at the given fhandle.
 632  */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;		/* reply data block; freed by rfs_rdfree */
	int alloc_err = 0;
	int in_crit = 0;	/* nonzero while in the NBMAND critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	/* v2 READ is only defined on regular files. */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	/* Fail with EAGAIN rather than blocking on a delegation conflict. */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	/* Take the vnode's read/write lock as a reader. */
	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	/* Refuse access to objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* Reads at or beyond EOF return success with zero bytes. */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA path: read directly into the client's write chunk. */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/* Bytes actually read is the count consumed from resid. */
	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				/*
				 * NOTE(review): puterrno() expects a UNIX
				 * errno; NFSERR_INVAL (22) happens to equal
				 * EINVAL so this maps correctly -- confirm.
				 */
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	/* Drop the rwlock and critical region taken above. */
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
 881 
 882 /*
 883  * Free data allocated by rfs_read
 884  */
 885 void
 886 rfs_rdfree(struct nfsrdresult *rr)
 887 {
 888         mblk_t *mp;
 889 
 890         if (rr->rr_status == NFS_OK) {
 891                 mp = rr->rr_mp;
 892                 if (mp != NULL)
 893                         freeb(mp);
 894         }
 895 }
 896 
 897 void *
 898 rfs_read_getfh(struct nfsreadargs *ra)
 899 {
 900         return (&ra->ra_fhandle);
 901 }
 902 
/*
 * Number of iovec entries kept on the stack in rfs_write_sync(); mblk
 * chains needing more entries fall back to a kmem_alloc()'d array.
 */
#define MAX_IOVECS      12

#ifdef DEBUG
/* counts of writes served by the on-stack iovec array vs. kmem_alloc() */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
 909 
 910 /*
 911  * Write data to file.
 912  * Returns attributes of a file after writing some data to it.
 913  *
 914  * Any changes made here, especially in error handling might have
 915  * to also be done in rfs_write (which clusters write requests).
 916  */
 917 /* ARGSUSED */
 918 void
 919 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
 920     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 921 {
 922         int error;
 923         vnode_t *vp;
 924         rlim64_t rlimit;
 925         struct vattr va;
 926         struct uio uio;
 927         struct iovec iov[MAX_IOVECS];
 928         mblk_t *m;
 929         struct iovec *iovp;
 930         int iovcnt;
 931         cred_t *savecred;
 932         int in_crit = 0;
 933         caller_context_t ct;
 934 
 935         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
 936         if (vp == NULL) {
 937                 ns->ns_status = NFSERR_STALE;
 938                 return;
 939         }
 940 
 941         if (rdonly(ro, vp)) {
 942                 VN_RELE(vp);
 943                 ns->ns_status = NFSERR_ROFS;
 944                 return;
 945         }
 946 
 947         if (vp->v_type != VREG) {
 948                 VN_RELE(vp);
 949                 ns->ns_status = NFSERR_ISDIR;
 950                 return;
 951         }
 952 
 953         ct.cc_sysid = 0;
 954         ct.cc_pid = 0;
 955         ct.cc_caller_id = nfs2_srv_caller_id;
 956         ct.cc_flags = CC_DONTBLOCK;
 957 
 958         va.va_mask = AT_UID|AT_MODE;
 959 
 960         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 961 
 962         if (error) {
 963                 VN_RELE(vp);
 964                 ns->ns_status = puterrno(error);
 965 
 966                 return;
 967         }
 968 
 969         if (crgetuid(cr) != va.va_uid) {
 970                 /*
 971                  * This is a kludge to allow writes of files created
 972                  * with read only permission.  The owner of the file
 973                  * is always allowed to write it.
 974                  */
 975                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
 976 
 977                 if (error) {
 978                         VN_RELE(vp);
 979                         ns->ns_status = puterrno(error);
 980                         return;
 981                 }
 982         }
 983 
 984         /*
 985          * Can't access a mandatory lock file.  This might cause
 986          * the NFS service thread to block forever waiting for a
 987          * lock to be released that will never be released.
 988          */
 989         if (MANDLOCK(vp, va.va_mode)) {
 990                 VN_RELE(vp);
 991                 ns->ns_status = NFSERR_ACCES;
 992                 return;
 993         }
 994 
 995         /*
 996          * We have to enter the critical region before calling VOP_RWLOCK
 997          * to avoid a deadlock with ufs.
 998          */
 999         if (nbl_need_check(vp)) {
1000                 nbl_start_crit(vp, RW_READER);
1001                 in_crit = 1;
1002                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1003                     wa->wa_count, 0, NULL)) {
1004                         error = EACCES;
1005                         goto out;
1006                 }
1007         }
1008 
1009         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1010 
1011         /* check if a monitor detected a delegation conflict */
1012         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1013                 VN_RELE(vp);
1014                 /* mark as wouldblock so response is dropped */
1015                 curthread->t_flag |= T_WOULDBLOCK;
1016                 return;
1017         }
1018 
1019         if (wa->wa_data || wa->wa_rlist) {
1020                 /* Do the RDMA thing if necessary */
1021                 if (wa->wa_rlist) {
1022                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1023                         iov[0].iov_len = wa->wa_count;
1024                 } else  {
1025                         iov[0].iov_base = wa->wa_data;
1026                         iov[0].iov_len = wa->wa_count;
1027                 }
1028                 uio.uio_iov = iov;
1029                 uio.uio_iovcnt = 1;
1030                 uio.uio_segflg = UIO_SYSSPACE;
1031                 uio.uio_extflg = UIO_COPY_DEFAULT;
1032                 uio.uio_loffset = (offset_t)wa->wa_offset;
1033                 uio.uio_resid = wa->wa_count;
1034                 /*
1035                  * The limit is checked on the client. We
1036                  * should allow any size writes here.
1037                  */
1038                 uio.uio_llimit = curproc->p_fsz_ctl;
1039                 rlimit = uio.uio_llimit - wa->wa_offset;
1040                 if (rlimit < (rlim64_t)uio.uio_resid)
1041                         uio.uio_resid = (uint_t)rlimit;
1042 
1043                 /*
1044                  * for now we assume no append mode
1045                  */
1046                 /*
1047                  * We're changing creds because VM may fault and we need
1048                  * the cred of the current thread to be used if quota
1049                  * checking is enabled.
1050                  */
1051                 savecred = curthread->t_cred;
1052                 curthread->t_cred = cr;
1053                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1054                 curthread->t_cred = savecred;
1055         } else {
1056                 iovcnt = 0;
1057                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1058                         iovcnt++;
1059                 if (iovcnt <= MAX_IOVECS) {
1060 #ifdef DEBUG
1061                         rfs_write_sync_hits++;
1062 #endif
1063                         iovp = iov;
1064                 } else {
1065 #ifdef DEBUG
1066                         rfs_write_sync_misses++;
1067 #endif
1068                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1069                 }
1070                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1071                 uio.uio_iov = iovp;
1072                 uio.uio_iovcnt = iovcnt;
1073                 uio.uio_segflg = UIO_SYSSPACE;
1074                 uio.uio_extflg = UIO_COPY_DEFAULT;
1075                 uio.uio_loffset = (offset_t)wa->wa_offset;
1076                 uio.uio_resid = wa->wa_count;
1077                 /*
1078                  * The limit is checked on the client. We
1079                  * should allow any size writes here.
1080                  */
1081                 uio.uio_llimit = curproc->p_fsz_ctl;
1082                 rlimit = uio.uio_llimit - wa->wa_offset;
1083                 if (rlimit < (rlim64_t)uio.uio_resid)
1084                         uio.uio_resid = (uint_t)rlimit;
1085 
1086                 /*
1087                  * For now we assume no append mode.
1088                  */
1089                 /*
1090                  * We're changing creds because VM may fault and we need
1091                  * the cred of the current thread to be used if quota
1092                  * checking is enabled.
1093                  */
1094                 savecred = curthread->t_cred;
1095                 curthread->t_cred = cr;
1096                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1097                 curthread->t_cred = savecred;
1098 
1099                 if (iovp != iov)
1100                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1101         }
1102 
1103         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1104 
1105         if (!error) {
1106                 /*
1107                  * Get attributes again so we send the latest mod
1108                  * time to the client side for its cache.
1109                  */
1110                 va.va_mask = AT_ALL;    /* now we want everything */
1111 
1112                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1113 
1114                 /* check for overflows */
1115                 if (!error) {
1116                         acl_perm(vp, exi, &va, cr);
1117                         error = vattr_to_nattr(&va, &ns->ns_attr);
1118                 }
1119         }
1120 
1121 out:
1122         if (in_crit)
1123                 nbl_end_crit(vp);
1124         VN_RELE(vp);
1125 
1126         /* check if a monitor detected a delegation conflict */
1127         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1128                 /* mark as wouldblock so response is dropped */
1129                 curthread->t_flag |= T_WOULDBLOCK;
1130         else
1131                 ns->ns_status = puterrno(error);
1132 
1133 }
1134 
/*
 * One pending v2 write request, queued so rfs_write() can cluster it
 * with other writes to the same file.  Entries live on the issuing
 * thread's stack (see nrpsp in rfs_write()).
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* client's write arguments */
	struct nfsattrstat *ns;		/* reply; ns_status marks completion */
	struct svc_req *req;		/* originating RPC request */
	cred_t *cr;			/* credentials for this request */
	bool_t ro;			/* export is read-only to this client */
	kthread_t *thread;		/* thread waiting on this request */
	struct rfs_async_write *list;	/* next request, sorted by offset */
};
1144 
/*
 * Per-file cluster of pending writes; one entry per distinct file
 * handle on the global rfs_async_write_head list.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle these writes target */
	kcondvar_t cv;			/* broadcast when cluster completes */
	struct rfs_async_write *list;	/* requests, sorted by offset */
	struct rfs_async_write_list *next;	/* next file's cluster */
};
1151 
/* Active write clusters; both protected by rfs_async_write_lock. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* On-stack iovec capacity for clustered writes in rfs_write(). */
#define MAXCLIOVECS     42
/* Sentinel "not yet completed" status; 0 would read as NFS_OK. */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* counts of clusters served by the on-stack iovec array vs. kmem_alloc() */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1163 
1164 /*
1165  * Write data to file.
1166  * Returns attributes of a file after writing some data to it.
1167  */
1168 void
1169 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1170     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1171 {
1172         int error;
1173         vnode_t *vp;
1174         rlim64_t rlimit;
1175         struct vattr va;
1176         struct uio uio;
1177         struct rfs_async_write_list *lp;
1178         struct rfs_async_write_list *nlp;
1179         struct rfs_async_write *rp;
1180         struct rfs_async_write *nrp;
1181         struct rfs_async_write *trp;
1182         struct rfs_async_write *lrp;
1183         int data_written;
1184         int iovcnt;
1185         mblk_t *m;
1186         struct iovec *iovp;
1187         struct iovec *niovp;
1188         struct iovec iov[MAXCLIOVECS];
1189         int count;
1190         int rcount;
1191         uint_t off;
1192         uint_t len;
1193         struct rfs_async_write nrpsp;
1194         struct rfs_async_write_list nlpsp;
1195         ushort_t t_flag;
1196         cred_t *savecred;
1197         int in_crit = 0;
1198         caller_context_t ct;
1199 
1200         if (!rfs_write_async) {
1201                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1202                 return;
1203         }
1204 
1205         /*
1206          * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1207          * is considered an OK.
1208          */
1209         ns->ns_status = RFSWRITE_INITVAL;
1210 
1211         nrp = &nrpsp;
1212         nrp->wa = wa;
1213         nrp->ns = ns;
1214         nrp->req = req;
1215         nrp->cr = cr;
1216         nrp->ro = ro;
1217         nrp->thread = curthread;
1218 
1219         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1220 
1221         /*
1222          * Look to see if there is already a cluster started
1223          * for this file.
1224          */
1225         mutex_enter(&rfs_async_write_lock);
1226         for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1227                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1228                     sizeof (fhandle_t)) == 0)
1229                         break;
1230         }
1231 
1232         /*
1233          * If lp is non-NULL, then there is already a cluster
1234          * started.  We need to place ourselves in the cluster
1235          * list in the right place as determined by starting
1236          * offset.  Conflicts with non-blocking mandatory locked
1237          * regions will be checked when the cluster is processed.
1238          */
1239         if (lp != NULL) {
1240                 rp = lp->list;
1241                 trp = NULL;
1242                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1243                         trp = rp;
1244                         rp = rp->list;
1245                 }
1246                 nrp->list = rp;
1247                 if (trp == NULL)
1248                         lp->list = nrp;
1249                 else
1250                         trp->list = nrp;
1251                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1252                         cv_wait(&lp->cv, &rfs_async_write_lock);
1253                 mutex_exit(&rfs_async_write_lock);
1254 
1255                 return;
1256         }
1257 
1258         /*
1259          * No cluster started yet, start one and add ourselves
1260          * to the list of clusters.
1261          */
1262         nrp->list = NULL;
1263 
1264         nlp = &nlpsp;
1265         nlp->fhp = &wa->wa_fhandle;
1266         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1267         nlp->list = nrp;
1268         nlp->next = NULL;
1269 
1270         if (rfs_async_write_head == NULL) {
1271                 rfs_async_write_head = nlp;
1272         } else {
1273                 lp = rfs_async_write_head;
1274                 while (lp->next != NULL)
1275                         lp = lp->next;
1276                 lp->next = nlp;
1277         }
1278         mutex_exit(&rfs_async_write_lock);
1279 
1280         /*
1281          * Convert the file handle common to all of the requests
1282          * in this cluster to a vnode.
1283          */
1284         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1285         if (vp == NULL) {
1286                 mutex_enter(&rfs_async_write_lock);
1287                 if (rfs_async_write_head == nlp)
1288                         rfs_async_write_head = nlp->next;
1289                 else {
1290                         lp = rfs_async_write_head;
1291                         while (lp->next != nlp)
1292                                 lp = lp->next;
1293                         lp->next = nlp->next;
1294                 }
1295                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1296                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1297                         rp->ns->ns_status = NFSERR_STALE;
1298                         rp->thread->t_flag |= t_flag;
1299                 }
1300                 cv_broadcast(&nlp->cv);
1301                 mutex_exit(&rfs_async_write_lock);
1302 
1303                 return;
1304         }
1305 
1306         /*
1307          * Can only write regular files.  Attempts to write any
1308          * other file types fail with EISDIR.
1309          */
1310         if (vp->v_type != VREG) {
1311                 VN_RELE(vp);
1312                 mutex_enter(&rfs_async_write_lock);
1313                 if (rfs_async_write_head == nlp)
1314                         rfs_async_write_head = nlp->next;
1315                 else {
1316                         lp = rfs_async_write_head;
1317                         while (lp->next != nlp)
1318                                 lp = lp->next;
1319                         lp->next = nlp->next;
1320                 }
1321                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1322                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1323                         rp->ns->ns_status = NFSERR_ISDIR;
1324                         rp->thread->t_flag |= t_flag;
1325                 }
1326                 cv_broadcast(&nlp->cv);
1327                 mutex_exit(&rfs_async_write_lock);
1328 
1329                 return;
1330         }
1331 
1332         /*
1333          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1334          * deadlock with ufs.
1335          */
1336         if (nbl_need_check(vp)) {
1337                 nbl_start_crit(vp, RW_READER);
1338                 in_crit = 1;
1339         }
1340 
1341         ct.cc_sysid = 0;
1342         ct.cc_pid = 0;
1343         ct.cc_caller_id = nfs2_srv_caller_id;
1344         ct.cc_flags = CC_DONTBLOCK;
1345 
1346         /*
1347          * Lock the file for writing.  This operation provides
1348          * the delay which allows clusters to grow.
1349          */
1350         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1351 
1352         /* check if a monitor detected a delegation conflict */
1353         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1354                 if (in_crit)
1355                         nbl_end_crit(vp);
1356                 VN_RELE(vp);
1357                 /* mark as wouldblock so response is dropped */
1358                 curthread->t_flag |= T_WOULDBLOCK;
1359                 mutex_enter(&rfs_async_write_lock);
1360                 if (rfs_async_write_head == nlp)
1361                         rfs_async_write_head = nlp->next;
1362                 else {
1363                         lp = rfs_async_write_head;
1364                         while (lp->next != nlp)
1365                                 lp = lp->next;
1366                         lp->next = nlp->next;
1367                 }
1368                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1369                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1370                                 rp->ns->ns_status = puterrno(error);
1371                                 rp->thread->t_flag |= T_WOULDBLOCK;
1372                         }
1373                 }
1374                 cv_broadcast(&nlp->cv);
1375                 mutex_exit(&rfs_async_write_lock);
1376 
1377                 return;
1378         }
1379 
1380         /*
1381          * Disconnect this cluster from the list of clusters.
1382          * The cluster that is being dealt with must be fixed
1383          * in size after this point, so there is no reason
1384          * to leave it on the list so that new requests can
1385          * find it.
1386          *
1387          * The algorithm is that the first write request will
1388          * create a cluster, convert the file handle to a
1389          * vnode pointer, and then lock the file for writing.
1390          * This request is not likely to be clustered with
1391          * any others.  However, the next request will create
1392          * a new cluster and be blocked in VOP_RWLOCK while
1393          * the first request is being processed.  This delay
1394          * will allow more requests to be clustered in this
1395          * second cluster.
1396          */
1397         mutex_enter(&rfs_async_write_lock);
1398         if (rfs_async_write_head == nlp)
1399                 rfs_async_write_head = nlp->next;
1400         else {
1401                 lp = rfs_async_write_head;
1402                 while (lp->next != nlp)
1403                         lp = lp->next;
1404                 lp->next = nlp->next;
1405         }
1406         mutex_exit(&rfs_async_write_lock);
1407 
1408         /*
1409          * Step through the list of requests in this cluster.
1410          * We need to check permissions to make sure that all
1411          * of the requests have sufficient permission to write
1412          * the file.  A cluster can be composed of requests
1413          * from different clients and different users on each
1414          * client.
1415          *
1416          * As a side effect, we also calculate the size of the
1417          * byte range that this cluster encompasses.
1418          */
1419         rp = nlp->list;
1420         off = rp->wa->wa_offset;
1421         len = (uint_t)0;
1422         do {
1423                 if (rdonly(rp->ro, vp)) {
1424                         rp->ns->ns_status = NFSERR_ROFS;
1425                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1426                         rp->thread->t_flag |= t_flag;
1427                         continue;
1428                 }
1429 
1430                 va.va_mask = AT_UID|AT_MODE;
1431 
1432                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1433 
1434                 if (!error) {
1435                         if (crgetuid(rp->cr) != va.va_uid) {
1436                                 /*
1437                                  * This is a kludge to allow writes of files
1438                                  * created with read only permission.  The
1439                                  * owner of the file is always allowed to
1440                                  * write it.
1441                                  */
1442                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1443                         }
1444                         if (!error && MANDLOCK(vp, va.va_mode))
1445                                 error = EACCES;
1446                 }
1447 
1448                 /*
1449                  * Check for a conflict with a nbmand-locked region.
1450                  */
1451                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1452                     rp->wa->wa_count, 0, NULL)) {
1453                         error = EACCES;
1454                 }
1455 
1456                 if (error) {
1457                         rp->ns->ns_status = puterrno(error);
1458                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1459                         rp->thread->t_flag |= t_flag;
1460                         continue;
1461                 }
1462                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1463                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1464         } while ((rp = rp->list) != NULL);
1465 
1466         /*
1467          * Step through the cluster attempting to gather as many
1468          * requests which are contiguous as possible.  These
1469          * contiguous requests are handled via one call to VOP_WRITE
1470          * instead of different calls to VOP_WRITE.  We also keep
1471          * track of the fact that any data was written.
1472          */
1473         rp = nlp->list;
1474         data_written = 0;
1475         do {
1476                 /*
1477                  * Skip any requests which are already marked as having an
1478                  * error.
1479                  */
1480                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1481                         rp = rp->list;
1482                         continue;
1483                 }
1484 
1485                 /*
1486                  * Count the number of iovec's which are required
1487                  * to handle this set of requests.  One iovec is
1488                  * needed for each data buffer, whether addressed
1489                  * by wa_data or by the b_rptr pointers in the
1490                  * mblk chains.
1491                  */
1492                 iovcnt = 0;
1493                 lrp = rp;
1494                 for (;;) {
1495                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1496                                 iovcnt++;
1497                         else {
1498                                 m = lrp->wa->wa_mblk;
1499                                 while (m != NULL) {
1500                                         iovcnt++;
1501                                         m = m->b_cont;
1502                                 }
1503                         }
1504                         if (lrp->list == NULL ||
1505                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1506                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1507                             lrp->list->wa->wa_offset) {
1508                                 lrp = lrp->list;
1509                                 break;
1510                         }
1511                         lrp = lrp->list;
1512                 }
1513 
1514                 if (iovcnt <= MAXCLIOVECS) {
1515 #ifdef DEBUG
1516                         rfs_write_hits++;
1517 #endif
1518                         niovp = iov;
1519                 } else {
1520 #ifdef DEBUG
1521                         rfs_write_misses++;
1522 #endif
1523                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1524                 }
1525                 /*
1526                  * Put together the scatter/gather iovecs.
1527                  */
1528                 iovp = niovp;
1529                 trp = rp;
1530                 count = 0;
1531                 do {
1532                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1533                                 if (trp->wa->wa_rlist) {
1534                                         iovp->iov_base =
1535                                             (char *)((trp->wa->wa_rlist)->
1536                                             u.c_daddr3);
1537                                         iovp->iov_len = trp->wa->wa_count;
1538                                 } else  {
1539                                         iovp->iov_base = trp->wa->wa_data;
1540                                         iovp->iov_len = trp->wa->wa_count;
1541                                 }
1542                                 iovp++;
1543                         } else {
1544                                 m = trp->wa->wa_mblk;
1545                                 rcount = trp->wa->wa_count;
1546                                 while (m != NULL) {
1547                                         iovp->iov_base = (caddr_t)m->b_rptr;
1548                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1549                                         rcount -= iovp->iov_len;
1550                                         if (rcount < 0)
1551                                                 iovp->iov_len += rcount;
1552                                         iovp++;
1553                                         if (rcount <= 0)
1554                                                 break;
1555                                         m = m->b_cont;
1556                                 }
1557                         }
1558                         count += trp->wa->wa_count;
1559                         trp = trp->list;
1560                 } while (trp != lrp);
1561 
1562                 uio.uio_iov = niovp;
1563                 uio.uio_iovcnt = iovcnt;
1564                 uio.uio_segflg = UIO_SYSSPACE;
1565                 uio.uio_extflg = UIO_COPY_DEFAULT;
1566                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1567                 uio.uio_resid = count;
1568                 /*
1569                  * The limit is checked on the client. We
1570                  * should allow any size writes here.
1571                  */
1572                 uio.uio_llimit = curproc->p_fsz_ctl;
1573                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1574                 if (rlimit < (rlim64_t)uio.uio_resid)
1575                         uio.uio_resid = (uint_t)rlimit;
1576 
1577                 /*
1578                  * For now we assume no append mode.
1579                  */
1580 
1581                 /*
1582                  * We're changing creds because VM may fault
1583                  * and we need the cred of the current
1584                  * thread to be used if quota * checking is
1585                  * enabled.
1586                  */
1587                 savecred = curthread->t_cred;
1588                 curthread->t_cred = cr;
1589                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1590                 curthread->t_cred = savecred;
1591 
1592                 /* check if a monitor detected a delegation conflict */
1593                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1594                         /* mark as wouldblock so response is dropped */
1595                         curthread->t_flag |= T_WOULDBLOCK;
1596 
1597                 if (niovp != iov)
1598                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1599 
1600                 if (!error) {
1601                         data_written = 1;
1602                         /*
1603                          * Get attributes again so we send the latest mod
1604                          * time to the client side for its cache.
1605                          */
1606                         va.va_mask = AT_ALL;    /* now we want everything */
1607 
1608                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1609 
1610                         if (!error)
1611                                 acl_perm(vp, exi, &va, rp->cr);
1612                 }
1613 
1614                 /*
1615                  * Fill in the status responses for each request
1616                  * which was just handled.  Also, copy the latest
1617                  * attributes in to the attribute responses if
1618                  * appropriate.
1619                  */
1620                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1621                 do {
1622                         rp->thread->t_flag |= t_flag;
1623                         /* check for overflows */
1624                         if (!error) {
1625                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1626                         }
1627                         rp->ns->ns_status = puterrno(error);
1628                         rp = rp->list;
1629                 } while (rp != lrp);
1630         } while (rp != NULL);
1631 
1632         /*
1633          * If any data was written at all, then we need to flush
1634          * the data and metadata to stable storage.
1635          */
1636         if (data_written) {
1637                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1638 
1639                 if (!error) {
1640                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1641                 }
1642         }
1643 
1644         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1645 
1646         if (in_crit)
1647                 nbl_end_crit(vp);
1648         VN_RELE(vp);
1649 
1650         t_flag = curthread->t_flag & T_WOULDBLOCK;
1651         mutex_enter(&rfs_async_write_lock);
1652         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1653                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1654                         rp->ns->ns_status = puterrno(error);
1655                         rp->thread->t_flag |= t_flag;
1656                 }
1657         }
1658         cv_broadcast(&nlp->cv);
1659         mutex_exit(&rfs_async_write_lock);
1660 
1661 }
1662 
1663 void *
1664 rfs_write_getfh(struct nfswriteargs *wa)
1665 {
1666         return (&wa->wa_fhandle);
1667 }
1668 
1669 /*
1670  * Create a file.
1671  * Creates a file with given attributes and returns those attributes
1672  * and an fhandle for the new file.
1673  */
1674 void
1675 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1676     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1677 {
1678         int error;
1679         int lookuperr;
1680         int in_crit = 0;
1681         struct vattr va;
1682         vnode_t *vp;
1683         vnode_t *realvp;
1684         vnode_t *dvp;
1685         char *name = args->ca_da.da_name;
1686         vnode_t *tvp = NULL;
1687         int mode;
1688         int lookup_ok;
1689         bool_t trunc;
1690         struct sockaddr *ca;
1691 
1692         /*
1693          * Disallow NULL paths
1694          */
1695         if (name == NULL || *name == '\0') {
1696                 dr->dr_status = NFSERR_ACCES;
1697                 return;
1698         }
1699 
1700         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1701         if (dvp == NULL) {
1702                 dr->dr_status = NFSERR_STALE;
1703                 return;
1704         }
1705 
1706         error = sattr_to_vattr(args->ca_sa, &va);
1707         if (error) {
1708                 dr->dr_status = puterrno(error);
1709                 return;
1710         }
1711 
1712         /*
1713          * Must specify the mode.
1714          */
1715         if (!(va.va_mask & AT_MODE)) {
1716                 VN_RELE(dvp);
1717                 dr->dr_status = NFSERR_INVAL;
1718                 return;
1719         }
1720 
1721         /*
1722          * This is a completely gross hack to make mknod
1723          * work over the wire until we can wack the protocol
1724          */
1725         if ((va.va_mode & IFMT) == IFCHR) {
1726                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1727                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1728                 else {
1729                         va.va_type = VCHR;
1730                         /*
1731                          * uncompress the received dev_t
1732                          * if the top half is zero indicating a request
1733                          * from an `older style' OS.
1734                          */
1735                         if ((va.va_size & 0xffff0000) == 0)
1736                                 va.va_rdev = nfsv2_expdev(va.va_size);
1737                         else
1738                                 va.va_rdev = (dev_t)va.va_size;
1739                 }
1740                 va.va_mask &= ~AT_SIZE;
1741         } else if ((va.va_mode & IFMT) == IFBLK) {
1742                 va.va_type = VBLK;
1743                 /*
1744                  * uncompress the received dev_t
1745                  * if the top half is zero indicating a request
1746                  * from an `older style' OS.
1747                  */
1748                 if ((va.va_size & 0xffff0000) == 0)
1749                         va.va_rdev = nfsv2_expdev(va.va_size);
1750                 else
1751                         va.va_rdev = (dev_t)va.va_size;
1752                 va.va_mask &= ~AT_SIZE;
1753         } else if ((va.va_mode & IFMT) == IFSOCK) {
1754                 va.va_type = VSOCK;
1755         } else {
1756                 va.va_type = VREG;
1757         }
1758         va.va_mode &= ~IFMT;
1759         va.va_mask |= AT_TYPE;
1760 
1761         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1762         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1763             MAXPATHLEN);
1764         if (name == NULL) {
1765                 dr->dr_status = puterrno(EINVAL);
1766                 return;
1767         }
1768 
1769         /*
1770          * Why was the choice made to use VWRITE as the mode to the
1771          * call to VOP_CREATE ? This results in a bug.  When a client
1772          * opens a file that already exists and is RDONLY, the second
1773          * open fails with an EACESS because of the mode.
1774          * bug ID 1054648.
1775          */
1776         lookup_ok = 0;
1777         mode = VWRITE;
1778         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1779                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1780                     NULL, NULL, NULL);
1781                 if (!error) {
1782                         struct vattr at;
1783 
1784                         lookup_ok = 1;
1785                         at.va_mask = AT_MODE;
1786                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1787                         if (!error)
1788                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1789                         VN_RELE(tvp);
1790                         tvp = NULL;
1791                 }
1792         }
1793 
1794         if (!lookup_ok) {
1795                 if (rdonly(ro, dvp)) {
1796                         error = EROFS;
1797                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1798                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1799                         error = EPERM;
1800                 } else {
1801                         error = 0;
1802                 }
1803         }
1804 
1805         /*
1806          * If file size is being modified on an already existing file
1807          * make sure that there are no conflicting non-blocking mandatory
1808          * locks in the region being manipulated. Return EACCES if there
1809          * are conflicting locks.
1810          */
1811         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1812                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1813                     NULL, NULL, NULL);
1814 
1815                 if (!lookuperr &&
1816                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1817                         VN_RELE(tvp);
1818                         curthread->t_flag |= T_WOULDBLOCK;
1819                         goto out;
1820                 }
1821 
1822                 if (!lookuperr && nbl_need_check(tvp)) {
1823                         /*
1824                          * The file exists. Now check if it has any
1825                          * conflicting non-blocking mandatory locks
1826                          * in the region being changed.
1827                          */
1828                         struct vattr bva;
1829                         u_offset_t offset;
1830                         ssize_t length;
1831 
1832                         nbl_start_crit(tvp, RW_READER);
1833                         in_crit = 1;
1834 
1835                         bva.va_mask = AT_SIZE;
1836                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1837                         if (!error) {
1838                                 if (va.va_size < bva.va_size) {
1839                                         offset = va.va_size;
1840                                         length = bva.va_size - va.va_size;
1841                                 } else {
1842                                         offset = bva.va_size;
1843                                         length = va.va_size - bva.va_size;
1844                                 }
1845                                 if (length) {
1846                                         if (nbl_conflict(tvp, NBL_WRITE,
1847                                             offset, length, 0, NULL)) {
1848                                                 error = EACCES;
1849                                         }
1850                                 }
1851                         }
1852                         if (error) {
1853                                 nbl_end_crit(tvp);
1854                                 VN_RELE(tvp);
1855                                 in_crit = 0;
1856                         }
1857                 } else if (tvp != NULL) {
1858                         VN_RELE(tvp);
1859                 }
1860         }
1861 
1862         if (!error) {
1863                 /*
1864                  * If filesystem is shared with nosuid the remove any
1865                  * setuid/setgid bits on create.
1866                  */
1867                 if (va.va_type == VREG &&
1868                     exi->exi_export.ex_flags & EX_NOSUID)
1869                         va.va_mode &= ~(VSUID | VSGID);
1870 
1871                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1872                     NULL, NULL);
1873 
1874                 if (!error) {
1875 
1876                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1877                                 trunc = TRUE;
1878                         else
1879                                 trunc = FALSE;
1880 
1881                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1882                                 VN_RELE(vp);
1883                                 curthread->t_flag |= T_WOULDBLOCK;
1884                                 goto out;
1885                         }
1886                         va.va_mask = AT_ALL;
1887 
1888                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1889 
1890                         /* check for overflows */
1891                         if (!error) {
1892                                 acl_perm(vp, exi, &va, cr);
1893                                 error = vattr_to_nattr(&va, &dr->dr_attr);
1894                                 if (!error) {
1895                                         error = makefh(&dr->dr_fhandle, vp,
1896                                             exi);
1897                                 }
1898                         }
1899                         /*
1900                          * Force modified metadata out to stable storage.
1901                          *
1902                          * if a underlying vp exists, pass it to VOP_FSYNC
1903                          */
1904                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
1905                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1906                         else
1907                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1908                         VN_RELE(vp);
1909                 }
1910 
1911                 if (in_crit) {
1912                         nbl_end_crit(tvp);
1913                         VN_RELE(tvp);
1914                 }
1915         }
1916 
1917         /*
1918          * Force modified data and metadata out to stable storage.
1919          */
1920         (void) VOP_FSYNC(dvp, 0, cr, NULL);
1921 
1922 out:
1923 
1924         VN_RELE(dvp);
1925 
1926         dr->dr_status = puterrno(error);
1927 
1928         if (name != args->ca_da.da_name)
1929                 kmem_free(name, MAXPATHLEN);
1930 }
1931 void *
1932 rfs_create_getfh(struct nfscreatargs *args)
1933 {
1934         return (args->ca_da.da_fhandle);
1935 }
1936 
1937 /*
1938  * Remove a file.
1939  * Remove named file from parent directory.
1940  */
1941 /* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* parent directory */
	vnode_t *targvp;	/* the entry being removed */
	int in_crit = 0;	/* nonzero while inside the nbmand crit region */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.  The target vnode is looked up first because the
	 * delegation and nbmand checks below operate on the file itself,
	 * not on the directory entry.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	/* Common exit: leave the critical region and drop both holds. */
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2022 
2023 void *
2024 rfs_remove_getfh(struct nfsdiropargs *da)
2025 {
2026         return (da->da_fhandle);
2027 }
2028 
2029 /*
2030  * rename a file
2031  * Give a file (from) a new name (to).
2032  */
2033 /* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source parent directory */
	vnode_t *tovp;		/* target parent directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* the file being renamed */
	vnode_t *targvp;	/* existing file being renamed over, if any */
	int in_crit = 0;	/* nonzero while inside the nbmand crit region */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target directory's handle must belong to an exported
	 * filesystem.  The reference on to_exi is dropped right away:
	 * the pointer is only compared against exi below and is never
	 * dereferenced after the exi_rele().
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	/* A rename may not cross exports. */
	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	/* Both handles must name directories. */
	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/*
	 * Check for delegations on the source file.  A delegated file
	 * causes the request to be dropped (T_WOULDBLOCK set) so the v4
	 * client has a chance to return the delegation first.
	 */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Keep the vnode's cached pathname in sync with the new name. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	/* Common exit: leave the critical region and drop all holds. */
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2172 void *
2173 rfs_rename_getfh(struct nfsrnmargs *args)
2174 {
2175         return (args->rna_from.da_fhandle);
2176 }
2177 
2178 /*
2179  * Link to a file.
2180  * Create a file (to) which is a hard link to the given file (from).
2181  */
2182 /* ARGSUSED */
2183 void
2184 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2185     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2186 {
2187         int error;
2188         vnode_t *fromvp;
2189         vnode_t *tovp;
2190         struct exportinfo *to_exi;
2191         fhandle_t *fh;
2192 
2193         fromvp = nfs_fhtovp(args->la_from, exi);
2194         if (fromvp == NULL) {
2195                 *status = NFSERR_STALE;
2196                 return;
2197         }
2198 
2199         fh = args->la_to.da_fhandle;
2200         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2201         if (to_exi == NULL) {
2202                 VN_RELE(fromvp);
2203                 *status = NFSERR_ACCES;
2204                 return;
2205         }
2206         exi_rele(to_exi);
2207 
2208         if (to_exi != exi) {
2209                 VN_RELE(fromvp);
2210                 *status = NFSERR_XDEV;
2211                 return;
2212         }
2213 
2214         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2215         if (tovp == NULL) {
2216                 VN_RELE(fromvp);
2217                 *status = NFSERR_STALE;
2218                 return;
2219         }
2220 
2221         if (tovp->v_type != VDIR) {
2222                 VN_RELE(tovp);
2223                 VN_RELE(fromvp);
2224                 *status = NFSERR_NOTDIR;
2225                 return;
2226         }
2227         /*
2228          * Disallow NULL paths
2229          */
2230         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2231                 VN_RELE(tovp);
2232                 VN_RELE(fromvp);
2233                 *status = NFSERR_ACCES;
2234                 return;
2235         }
2236 
2237         if (rdonly(ro, tovp)) {
2238                 VN_RELE(tovp);
2239                 VN_RELE(fromvp);
2240                 *status = NFSERR_ROFS;
2241                 return;
2242         }
2243 
2244         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2245 
2246         /*
2247          * Force modified data and metadata out to stable storage.
2248          */
2249         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2250         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2251 
2252         VN_RELE(tovp);
2253         VN_RELE(fromvp);
2254 
2255         *status = puterrno(error);
2256 
2257 }
2258 void *
2259 rfs_link_getfh(struct nfslinkargs *args)
2260 {
2261         return (args->la_from);
2262 }
2263 
2264 /*
2265  * Symbolicly link to a file.
2266  * Create a file (to) with the given attributes which is a symbolic link
2267  * to the given path name (to).
2268  */
2269 void
2270 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2271     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2272 {
2273         int error;
2274         struct vattr va;
2275         vnode_t *vp;
2276         vnode_t *svp;
2277         int lerror;
2278         struct sockaddr *ca;
2279         char *name = NULL;
2280 
2281         /*
2282          * Disallow NULL paths
2283          */
2284         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2285                 *status = NFSERR_ACCES;
2286                 return;
2287         }
2288 
2289         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2290         if (vp == NULL) {
2291                 *status = NFSERR_STALE;
2292                 return;
2293         }
2294 
2295         if (rdonly(ro, vp)) {
2296                 VN_RELE(vp);
2297                 *status = NFSERR_ROFS;
2298                 return;
2299         }
2300 
2301         error = sattr_to_vattr(args->sla_sa, &va);
2302         if (error) {
2303                 VN_RELE(vp);
2304                 *status = puterrno(error);
2305                 return;
2306         }
2307 
2308         if (!(va.va_mask & AT_MODE)) {
2309                 VN_RELE(vp);
2310                 *status = NFSERR_INVAL;
2311                 return;
2312         }
2313 
2314         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2315         name = nfscmd_convname(ca, exi, args->sla_tnm,
2316             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2317 
2318         if (name == NULL) {
2319                 *status = NFSERR_ACCES;
2320                 return;
2321         }
2322 
2323         va.va_type = VLNK;
2324         va.va_mask |= AT_TYPE;
2325 
2326         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2327 
2328         /*
2329          * Force new data and metadata out to stable storage.
2330          */
2331         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2332             NULL, cr, NULL, NULL, NULL);
2333 
2334         if (!lerror) {
2335                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2336                 VN_RELE(svp);
2337         }
2338 
2339         /*
2340          * Force modified data and metadata out to stable storage.
2341          */
2342         (void) VOP_FSYNC(vp, 0, cr, NULL);
2343 
2344         VN_RELE(vp);
2345 
2346         *status = puterrno(error);
2347         if (name != args->sla_tnm)
2348                 kmem_free(name, MAXPATHLEN);
2349 
2350 }
2351 void *
2352 rfs_symlink_getfh(struct nfsslargs *args)
2353 {
2354         return (args->sla_from.da_fhandle);
2355 }
2356 
2357 /*
2358  * Make a directory.
2359  * Create a directory with the given name, parent directory, and attributes.
2360  * Returns a file handle and attributes for the new directory.
2361  */
2362 /* ARGSUSED */
2363 void
2364 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2365     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2366 {
2367         int error;
2368         struct vattr va;
2369         vnode_t *dvp = NULL;
2370         vnode_t *vp;
2371         char *name = args->ca_da.da_name;
2372 
2373         /*
2374          * Disallow NULL paths
2375          */
2376         if (name == NULL || *name == '\0') {
2377                 dr->dr_status = NFSERR_ACCES;
2378                 return;
2379         }
2380 
2381         vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2382         if (vp == NULL) {
2383                 dr->dr_status = NFSERR_STALE;
2384                 return;
2385         }
2386 
2387         if (rdonly(ro, vp)) {
2388                 VN_RELE(vp);
2389                 dr->dr_status = NFSERR_ROFS;
2390                 return;
2391         }
2392 
2393         error = sattr_to_vattr(args->ca_sa, &va);
2394         if (error) {
2395                 VN_RELE(vp);
2396                 dr->dr_status = puterrno(error);
2397                 return;
2398         }
2399 
2400         if (!(va.va_mask & AT_MODE)) {
2401                 VN_RELE(vp);
2402                 dr->dr_status = NFSERR_INVAL;
2403                 return;
2404         }
2405 
2406         va.va_type = VDIR;
2407         va.va_mask |= AT_TYPE;
2408 
2409         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2410 
2411         if (!error) {
2412                 /*
2413                  * Attribtutes of the newly created directory should
2414                  * be returned to the client.
2415                  */
2416                 va.va_mask = AT_ALL; /* We want everything */
2417                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2418 
2419                 /* check for overflows */
2420                 if (!error) {
2421                         acl_perm(vp, exi, &va, cr);
2422                         error = vattr_to_nattr(&va, &dr->dr_attr);
2423                         if (!error) {
2424                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2425                         }
2426                 }
2427                 /*
2428                  * Force new data and metadata out to stable storage.
2429                  */
2430                 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2431                 VN_RELE(dvp);
2432         }
2433 
2434         /*
2435          * Force modified data and metadata out to stable storage.
2436          */
2437         (void) VOP_FSYNC(vp, 0, cr, NULL);
2438 
2439         VN_RELE(vp);
2440 
2441         dr->dr_status = puterrno(error);
2442 
2443 }
2444 void *
2445 rfs_mkdir_getfh(struct nfscreatargs *args)
2446 {
2447         return (args->ca_da.da_fhandle);
2448 }
2449 
2450 /*
2451  * Remove a directory.
2452  * Remove the given directory name from the given parent directory.
2453  */
2454 /* ARGSUSED */
2455 void
2456 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2457     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2458 {
2459         int error;
2460         vnode_t *vp;
2461 
2462         /*
2463          * Disallow NULL paths
2464          */
2465         if (da->da_name == NULL || *da->da_name == '\0') {
2466                 *status = NFSERR_ACCES;
2467                 return;
2468         }
2469 
2470         vp = nfs_fhtovp(da->da_fhandle, exi);
2471         if (vp == NULL) {
2472                 *status = NFSERR_STALE;
2473                 return;
2474         }
2475 
2476         if (rdonly(ro, vp)) {
2477                 VN_RELE(vp);
2478                 *status = NFSERR_ROFS;
2479                 return;
2480         }
2481 
2482         /*
2483          * VOP_RMDIR takes a third argument (the current
2484          * directory of the process).  That's because someone
2485          * wants to return EINVAL if one tries to remove ".".
2486          * Of course, NFS servers have no idea what their
2487          * clients' current directories are.  We fake it by
2488          * supplying a vnode known to exist and illegal to
2489          * remove.
2490          */
2491         error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2492 
2493         /*
2494          * Force modified data and metadata out to stable storage.
2495          */
2496         (void) VOP_FSYNC(vp, 0, cr, NULL);
2497 
2498         VN_RELE(vp);
2499 
2500         /*
2501          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2502          * if the directory is not empty.  A System V NFS server
2503          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2504          * over the wire.
2505          */
2506         if (error == EEXIST)
2507                 *status = NFSERR_NOTEMPTY;
2508         else
2509                 *status = puterrno(error);
2510 
2511 }
2512 void *
2513 rfs_rmdir_getfh(struct nfsdiropargs *da)
2514 {
2515         return (da->da_fhandle);
2516 }
2517 
2518 /* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;		/* set by VOP_READDIR at end of directory */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* character-set converted entries, if any */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Take the vnode rwlock as a reader for the duration. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-length request gets an empty, non-EOF reply. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the transfer size to the server's maximum. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 *
		 * If VOP_READDIR consumed nothing, the directory is
		 * exhausted; otherwise the reply size is the number of
		 * bytes actually produced into the buffer.
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * Convert the entry names for the client as needed.
	 * nfscmd_convdirplus() may hand back a new buffer via ndata
	 * and reports (via ret) entries that had to be dropped to fit
	 * after conversion.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	if (ndata == NULL) {
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		/* Conversion produced a fresh buffer; swap it in. */
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2651 void *
2652 rfs_readdir_getfh(struct nfsrddirargs *rda)
2653 {
2654         return (&rda->rda_fh);
2655 }
2656 void
2657 rfs_rddirfree(struct nfsrddirres *rd)
2658 {
2659         if (rd->rd_entries != NULL)
2660                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2661 }
2662 
2663 /* ARGSUSED */
2664 void
2665 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2666     struct svc_req *req, cred_t *cr, bool_t ro)
2667 {
2668         int error;
2669         struct statvfs64 sb;
2670         vnode_t *vp;
2671 
2672         vp = nfs_fhtovp(fh, exi);
2673         if (vp == NULL) {
2674                 fs->fs_status = NFSERR_STALE;
2675                 return;
2676         }
2677 
2678         error = VFS_STATVFS(vp->v_vfsp, &sb);
2679 
2680         if (!error) {
2681                 fs->fs_tsize = nfstsize();
2682                 fs->fs_bsize = sb.f_frsize;
2683                 fs->fs_blocks = sb.f_blocks;
2684                 fs->fs_bfree = sb.f_bfree;
2685                 fs->fs_bavail = sb.f_bavail;
2686         }
2687 
2688         VN_RELE(vp);
2689 
2690         fs->fs_status = puterrno(error);
2691 
2692 }
2693 void *
2694 rfs_statfs_getfh(fhandle_t *fh)
2695 {
2696         return (fh);
2697 }
2698 
/*
 * Convert an NFSv2 over-the-wire settable-attributes structure (sattr)
 * into a vattr suitable for VOP_SETATTR.  A wire value of (uint32_t)-1
 * means "do not set this attribute"; only fields that differ from the
 * sentinel are merged into vap->va_mask.
 *
 * Returns 0 on success, or EOVERFLOW (32-bit kernels only) when a
 * client-supplied time cannot be represented in a 32-bit time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both halves of the timestamp must be set for it to count. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2765 
/*
 * Map the kernel vnode type (vtype_t, used as the index) to the NFSv2
 * over-the-wire file type.  Types with no NFSv2 representation
 * (VNON, VFIFO, VDOOR, VPROC, VBAD) map to 0; VFIFO is special-cased
 * separately in vattr_to_nattr() via NA_SETFIFO.
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2769 
2770 /*
2771  * check the following fields for overflow: nodeid, size, and time.
2772  * There could be a problem when converting 64-bit LP64 fields
2773  * into 32-bit ones.  Return an error if there is an overflow.
2774  */
/*
 * Convert a vattr into the NFSv2 over-the-wire attribute structure.
 * Returns 0 on success, EFBIG when the nodeid or a regular-file/
 * directory size will not fit in 32 bits, or EOVERFLOW when a
 * timestamp will not fit in the 32-bit wire format.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* Preserve the "unset" sentinel; otherwise fold in the IFMT bits. */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* Map the local "nobody" ids to their NFS wire equivalents. */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* NFSv2 carries microseconds on the wire; vattr holds nanoseconds. */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone. See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2876 
2877 /*
2878  * acl v2 support: returns approximate permission.
2879  *      default: returns minimal permission (more restrictive)
2880  *      aclok: returns maximal permission (less restrictive)
2881  *      This routine changes the permissions that are alaredy in *va.
2882  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2883  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
2884  */
2885 static void
2886 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2887 {
2888         vsecattr_t      vsa;
2889         int             aclcnt;
2890         aclent_t        *aclentp;
2891         mode_t          mask_perm;
2892         mode_t          grp_perm;
2893         mode_t          other_perm;
2894         mode_t          other_orig;
2895         int             error;
2896 
2897         /* dont care default acl */
2898         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2899         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2900 
2901         if (!error) {
2902                 aclcnt = vsa.vsa_aclcnt;
2903                 if (aclcnt > MIN_ACL_ENTRIES) {
2904                         /* non-trivial ACL */
2905                         aclentp = vsa.vsa_aclentp;
2906                         if (exi->exi_export.ex_flags & EX_ACLOK) {
2907                                 /* maximal permissions */
2908                                 grp_perm = 0;
2909                                 other_perm = 0;
2910                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2911                                         switch (aclentp->a_type) {
2912                                         case USER_OBJ:
2913                                                 break;
2914                                         case USER:
2915                                                 grp_perm |=
2916                                                     aclentp->a_perm << 3;
2917                                                 other_perm |= aclentp->a_perm;
2918                                                 break;
2919                                         case GROUP_OBJ:
2920                                                 grp_perm |=
2921                                                     aclentp->a_perm << 3;
2922                                                 break;
2923                                         case GROUP:
2924                                                 other_perm |= aclentp->a_perm;
2925                                                 break;
2926                                         case OTHER_OBJ:
2927                                                 other_orig = aclentp->a_perm;
2928                                                 break;
2929                                         case CLASS_OBJ:
2930                                                 mask_perm = aclentp->a_perm;
2931                                                 break;
2932                                         default:
2933                                                 break;
2934                                         }
2935                                 }
2936                                 grp_perm &= mask_perm << 3;
2937                                 other_perm &= mask_perm;
2938                                 other_perm |= other_orig;
2939 
2940                         } else {
2941                                 /* minimal permissions */
2942                                 grp_perm = 070;
2943                                 other_perm = 07;
2944                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2945                                         switch (aclentp->a_type) {
2946                                         case USER_OBJ:
2947                                                 break;
2948                                         case USER:
2949                                         case CLASS_OBJ:
2950                                                 grp_perm &=
2951                                                     aclentp->a_perm << 3;
2952                                                 other_perm &=
2953                                                     aclentp->a_perm;
2954                                                 break;
2955                                         case GROUP_OBJ:
2956                                                 grp_perm &=
2957                                                     aclentp->a_perm << 3;
2958                                                 break;
2959                                         case GROUP:
2960                                                 other_perm &=
2961                                                     aclentp->a_perm;
2962                                                 break;
2963                                         case OTHER_OBJ:
2964                                                 other_perm &=
2965                                                     aclentp->a_perm;
2966                                                 break;
2967                                         default:
2968                                                 break;
2969                                         }
2970                                 }
2971                         }
2972                         /* copy to va */
2973                         va->va_mode &= ~077;
2974                         va->va_mode |= grp_perm | other_perm;
2975                 }
2976                 if (vsa.vsa_aclcnt)
2977                         kmem_free(vsa.vsa_aclentp,
2978                             vsa.vsa_aclcnt * sizeof (aclent_t));
2979         }
2980 }
2981 
2982 void
2983 rfs_srvrinit(void)
2984 {
2985         mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2986         nfs2_srv_caller_id = fs_new_caller_id();
2987 }
2988 
2989 void
2990 rfs_srvrfini(void)
2991 {
2992         mutex_destroy(&rfs_async_write_lock);
2993 }
2994 
2995 static int
2996 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2997 {
2998         struct clist    *wcl;
2999         int             wlist_len;
3000         uint32_t        count = rr->rr_count;
3001 
3002         wcl = ra->ra_wlist;
3003 
3004         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3005                 return (FALSE);
3006         }
3007 
3008         wcl = ra->ra_wlist;
3009         rr->rr_ok.rrok_wlist_len = wlist_len;
3010         rr->rr_ok.rrok_wlist = wcl;
3011 
3012         return (TRUE);
3013 }