vgusev-nfsstat New usr/src/uts/common/fs/nfs/nfs

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*
  29  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  30  *      All rights reserved.
  31  */
  32 
  33 /*
  34  * Copyright 2018 Nexenta Systems, Inc.
  35  * Copyright (c) 2016 by Delphix. All rights reserved.
  36  */
  37 
  38 #include <sys/param.h>
  39 #include <sys/types.h>
  40 #include <sys/systm.h>
  41 #include <sys/cred.h>
  42 #include <sys/buf.h>
  43 #include <sys/vfs.h>
  44 #include <sys/vnode.h>
  45 #include <sys/uio.h>
  46 #include <sys/stat.h>
  47 #include <sys/errno.h>
  48 #include <sys/sysmacros.h>
  49 #include <sys/statvfs.h>
  50 #include <sys/kmem.h>
  51 #include <sys/kstat.h>
  52 #include <sys/dirent.h>
  53 #include <sys/cmn_err.h>
  54 #include <sys/debug.h>
  55 #include <sys/vtrace.h>
  56 #include <sys/mode.h>
  57 #include <sys/acl.h>
  58 #include <sys/nbmlock.h>
  59 #include <sys/policy.h>
  60 #include <sys/sdt.h>
  61 
  62 #include <rpc/types.h>
  63 #include <rpc/auth.h>
  64 #include <rpc/svc.h>
  65 
  66 #include <nfs/nfs.h>
  67 #include <nfs/export.h>
  68 #include <nfs/nfs_cmd.h>
  69 
  70 #include <vm/hat.h>
  71 #include <vm/as.h>
  72 #include <vm/seg.h>
  73 #include <vm/seg_map.h>
  74 #include <vm/seg_kmem.h>
  75 
  76 #include <sys/strsubr.h>
  77 
  78 struct rfs_async_write_list;
  79 
  80 /*
  81  * Zone globals of NFSv2 server
  82  */
  83 typedef struct nfs_srv {
  84         kmutex_t                        async_write_lock;
  85         struct rfs_async_write_list     *async_write_head;
  86 
  87         /*
  88          * enables write clustering if == 1
  89          */
  90         int             write_async;
  91 } nfs_srv_t;
  92 
  93 /*
  94  * These are the interface routines for the server side of the
  95  * Network File System.  See the NFS version 2 protocol specification
  96  * for a description of this interface.
  97  */
  98 
  99 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
 100 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
 101                         cred_t *);
 102 
 103 
 104 /*
 105  * Some "over the wire" UNIX file types.  These are encoded
 106  * into the mode.  This needs to be fixed in the next rev.
 107  */
 108 #define IFMT            0170000         /* type of file */
 109 #define IFCHR           0020000         /* character special */
 110 #define IFBLK           0060000         /* block special */
 111 #define IFSOCK          0140000         /* socket */
 112 
 113 u_longlong_t nfs2_srv_caller_id;
 114 
 115 static nfs_srv_t *
 116 nfs_get_srv(void)
 117 {
 118         nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
 119         nfs_srv_t *srv = ng->nfs_srv;
 120         ASSERT(srv != NULL);
 121         return (srv);
 122 }
 123 
 124 /*
 125  * Get file attributes.
 126  * Returns the current attributes of the file with the given fhandle.
 127  */
 128 /* ARGSUSED */
 129 void
 130 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 131     struct svc_req *req, cred_t *cr, bool_t ro)
 132 {
 133         int error;
 134         vnode_t *vp;
 135         struct vattr va;
 136 
 137         vp = nfs_fhtovp(fhp, exi);
 138         if (vp == NULL) {
 139                 ns->ns_status = NFSERR_STALE;
 140                 return;
 141         }
 142 
 143         /*
 144          * Do the getattr.
 145          */
 146         va.va_mask = AT_ALL;    /* we want all the attributes */
 147 
 148         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 149 
 150         /* check for overflows */
 151         if (!error) {
 152                 /* Lie about the object type for a referral */
 153                 if (vn_is_nfs_reparse(vp, cr))
 154                         va.va_type = VLNK;
 155 
 156                 acl_perm(vp, exi, &va, cr);
 157                 error = vattr_to_nattr(&va, &ns->ns_attr);
 158         }
 159 
 160         VN_RELE(vp);
 161 
 162         ns->ns_status = puterrno(error);
 163 }
 164 void *
 165 rfs_getattr_getfh(fhandle_t *fhp)
 166 {
 167         return (fhp);
 168 }
 169 
 170 /*
 171  * Set file attributes.
 172  * Sets the attributes of the file with the given fhandle.  Returns
 173  * the new attributes.
 174  */
 175 /* ARGSUSED */
 176 void
 177 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
 178     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 179 {
 180         int error;
 181         int flag;
 182         int in_crit = 0;
 183         vnode_t *vp;
 184         struct vattr va;
 185         struct vattr bva;
 186         struct flock64 bf;
 187         caller_context_t ct;
 188 
 189 
 190         vp = nfs_fhtovp(&args->saa_fh, exi);
 191         if (vp == NULL) {
 192                 ns->ns_status = NFSERR_STALE;
 193                 return;
 194         }
 195 
 196         if (rdonly(ro, vp)) {
 197                 VN_RELE(vp);
 198                 ns->ns_status = NFSERR_ROFS;
 199                 return;
 200         }
 201 
 202         error = sattr_to_vattr(&args->saa_sa, &va);
 203         if (error) {
 204                 VN_RELE(vp);
 205                 ns->ns_status = puterrno(error);
 206                 return;
 207         }
 208 
 209         /*
 210          * If the client is requesting a change to the mtime,
 211          * but the nanosecond field is set to 1 billion, then
 212          * this is a flag to the server that it should set the
 213          * atime and mtime fields to the server's current time.
 214          * The 1 billion number actually came from the client
 215          * as 1 million, but the units in the over the wire
 216          * request are microseconds instead of nanoseconds.
 217          *
 218          * This is an overload of the protocol and should be
 219          * documented in the NFS Version 2 protocol specification.
 220          */
 221         if (va.va_mask & AT_MTIME) {
 222                 if (va.va_mtime.tv_nsec == 1000000000) {
 223                         gethrestime(&va.va_mtime);
 224                         va.va_atime = va.va_mtime;
 225                         va.va_mask |= AT_ATIME;
 226                         flag = 0;
 227                 } else
 228                         flag = ATTR_UTIME;
 229         } else
 230                 flag = 0;
 231 
 232         /*
 233          * If the filesystem is exported with nosuid, then mask off
 234          * the setuid and setgid bits.
 235          */
 236         if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
 237             (exi->exi_export.ex_flags & EX_NOSUID))
 238                 va.va_mode &= ~(VSUID | VSGID);
 239 
 240         ct.cc_sysid = 0;
 241         ct.cc_pid = 0;
 242         ct.cc_caller_id = nfs2_srv_caller_id;
 243         ct.cc_flags = CC_DONTBLOCK;
 244 
 245         /*
 246          * We need to specially handle size changes because it is
 247          * possible for the client to create a file with modes
 248          * which indicate read-only, but with the file opened for
 249          * writing.  If the client then tries to set the size of
 250          * the file, then the normal access checking done in
 251          * VOP_SETATTR would prevent the client from doing so,
 252          * although it should be legal for it to do so.  To get
 253          * around this, we do the access checking for ourselves
 254          * and then use VOP_SPACE which doesn't do the access
 255          * checking which VOP_SETATTR does. VOP_SPACE can only
 256          * operate on VREG files, let VOP_SETATTR handle the other
 257          * extremely rare cases.
 258          * Also the client should not be allowed to change the
 259          * size of the file if there is a conflicting non-blocking
 260          * mandatory lock in the region of change.
 261          */
 262         if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
 263                 if (nbl_need_check(vp)) {
 264                         nbl_start_crit(vp, RW_READER);
 265                         in_crit = 1;
 266                 }
 267 
 268                 bva.va_mask = AT_UID | AT_SIZE;
 269 
 270                 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
 271 
 272                 if (error) {
 273                         if (in_crit)
 274                                 nbl_end_crit(vp);
 275                         VN_RELE(vp);
 276                         ns->ns_status = puterrno(error);
 277                         return;
 278                 }
 279 
 280                 if (in_crit) {
 281                         u_offset_t offset;
 282                         ssize_t length;
 283 
 284                         if (va.va_size < bva.va_size) {
 285                                 offset = va.va_size;
 286                                 length = bva.va_size - va.va_size;
 287                         } else {
 288                                 offset = bva.va_size;
 289                                 length = va.va_size - bva.va_size;
 290                         }
 291                         if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
 292                             NULL)) {
 293                                 error = EACCES;
 294                         }
 295                 }
 296 
 297                 if (crgetuid(cr) == bva.va_uid && !error &&
 298                     va.va_size != bva.va_size) {
 299                         va.va_mask &= ~AT_SIZE;
 300                         bf.l_type = F_WRLCK;
 301                         bf.l_whence = 0;
 302                         bf.l_start = (off64_t)va.va_size;
 303                         bf.l_len = 0;
 304                         bf.l_sysid = 0;
 305                         bf.l_pid = 0;
 306 
 307                         error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
 308                             (offset_t)va.va_size, cr, &ct);
 309                 }
 310                 if (in_crit)
 311                         nbl_end_crit(vp);
 312         } else
 313                 error = 0;
 314 
 315         /*
 316          * Do the setattr.
 317          */
 318         if (!error && va.va_mask) {
 319                 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
 320         }
 321 
 322         /*
 323          * check if the monitor on either vop_space or vop_setattr detected
 324          * a delegation conflict and if so, mark the thread flag as
 325          * wouldblock so that the response is dropped and the client will
 326          * try again.
 327          */
 328         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 329                 VN_RELE(vp);
 330                 curthread->t_flag |= T_WOULDBLOCK;
 331                 return;
 332         }
 333 
 334         if (!error) {
 335                 va.va_mask = AT_ALL;    /* get everything */
 336 
 337                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 338 
 339                 /* check for overflows */
 340                 if (!error) {
 341                         acl_perm(vp, exi, &va, cr);
 342                         error = vattr_to_nattr(&va, &ns->ns_attr);
 343                 }
 344         }
 345 
 346         ct.cc_flags = 0;
 347 
 348         /*
 349          * Force modified metadata out to stable storage.
 350          */
 351         (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
 352 
 353         VN_RELE(vp);
 354 
 355         ns->ns_status = puterrno(error);
 356 }
 357 void *
 358 rfs_setattr_getfh(struct nfssaargs *args)
 359 {
 360         return (&args->saa_fh);
 361 }
 362 
 363 /* Change and release @exip and @vpp only in success */
 364 int
 365 rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
 366 {
 367         struct exportinfo *exi;
 368         vnode_t *vp = *vpp;
 369         fid_t fid;
 370         int error;
 371 
 372         VN_HOLD(vp);
 373 
 374         if ((error = traverse(&vp)) != 0) {
 375                 VN_RELE(vp);
 376                 return (error);
 377         }
 378 
 379         bzero(&fid, sizeof (fid));
 380         fid.fid_len = MAXFIDSZ;
 381         error = VOP_FID(vp, &fid, NULL);
 382         if (error) {
 383                 VN_RELE(vp);
 384                 return (error);
 385         }
 386 
 387         exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
 388         if (exi == NULL ||
 389             (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
 390                 /*
 391                  * It is not error, just subdir is not exported
 392                  * or "nohide" is not set
 393                  */
 394                 if (exi != NULL)
 395                         exi_rele(exi);
 396                 VN_RELE(vp);
 397         } else {
 398                 /* go to submount */
 399                 exi_rele(*exip);
 400                 *exip = exi;
 401 
 402                 VN_RELE(*vpp);
 403                 *vpp = vp;
 404         }
 405 
 406         return (0);
 407 }
 408 
 409 /*
 410  * Given mounted "dvp" and "exi", go upper mountpoint
 411  * with dvp/exi correction
 412  * Return 0 in success
 413  */
 414 int
 415 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
 416 {
 417         struct exportinfo *exi;
 418         vnode_t *dvp = *dvpp;
 419         vnode_t *zone_rootvp;
 420 
 421         zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
 422         ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));
 423 
 424         VN_HOLD(dvp);
 425         dvp = untraverse(dvp, zone_rootvp);
 426         exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
 427         if (exi == NULL) {
 428                 VN_RELE(dvp);
 429                 return (-1);
 430         }
 431 
 432         ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
 433         exi_rele(*exip);
 434         *exip = exi;
 435         VN_RELE(*dvpp);
 436         *dvpp = dvp;
 437 
 438         return (0);
 439 }
 440 /*
 441  * Directory lookup.
 442  * Returns an fhandle and file attributes for file name in a directory.
 443  */
 444 /* ARGSUSED */
 445 void
 446 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 447     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 448 {
 449         int error;
 450         vnode_t *dvp;
 451         vnode_t *vp;
 452         struct vattr va;
 453         fhandle_t *fhp = da->da_fhandle;
 454         struct sec_ol sec = {0, 0};
 455         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 456         char *name;
 457         struct sockaddr *ca;
 458 
 459         /*
 460          * Trusted Extension doesn't support NFSv2. MOUNT
 461          * will reject v2 clients. Need to prevent v2 client
 462          * access via WebNFS here.
 463          */
 464         if (is_system_labeled() && req->rq_vers == 2) {
 465                 dr->dr_status = NFSERR_ACCES;
 466                 return;
 467         }
 468 
 469         /*
 470          * Disallow NULL paths
 471          */
 472         if (da->da_name == NULL || *da->da_name == '\0') {
 473                 dr->dr_status = NFSERR_ACCES;
 474                 return;
 475         }
 476 
 477         /*
 478          * Allow lookups from the root - the default
 479          * location of the public filehandle.
 480          */
 481         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 482                 dvp = ZONE_ROOTVP();
 483                 VN_HOLD(dvp);
 484         } else {
 485                 dvp = nfs_fhtovp(fhp, exi);
 486                 if (dvp == NULL) {
 487                         dr->dr_status = NFSERR_STALE;
 488                         return;
 489                 }
 490         }
 491 
 492         exi_hold(exi);
 493         ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);
 494 
 495         /*
 496          * Not allow lookup beyond root.
 497          * If the filehandle matches a filehandle of the exi,
 498          * then the ".." refers beyond the root of an exported filesystem.
 499          */
 500         if (strcmp(da->da_name, "..") == 0 &&
 501             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 502                 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
 503                     ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
 504                         /*
 505                          * special case for ".." and 'nohide'exported root
 506                          */
 507                         if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
 508                                 error = NFSERR_ACCES;
 509                                 goto out;
 510                         }
 511                 } else  {
 512                         error = NFSERR_NOENT;
 513                         goto out;
 514                 }
 515         }
 516 
 517         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 518         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 519             MAXPATHLEN);
 520 
 521         if (name == NULL) {
 522                 error = NFSERR_ACCES;
 523                 goto out;
 524         }
 525 
 526         /*
 527          * If the public filehandle is used then allow
 528          * a multi-component lookup, i.e. evaluate
 529          * a pathname and follow symbolic links if
 530          * necessary.
 531          *
 532          * This may result in a vnode in another filesystem
 533          * which is OK as long as the filesystem is exported.
 534          */
 535         if (PUBLIC_FH2(fhp)) {
 536                 publicfh_flag = TRUE;
 537 
 538                 exi_rele(exi);
 539                 exi = NULL;
 540 
 541                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 542                     &sec);
 543         } else {
 544                 /*
 545                  * Do a normal single component lookup.
 546                  */
 547                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 548                     NULL, NULL, NULL);
 549         }
 550 
 551         if (name != da->da_name)
 552                 kmem_free(name, MAXPATHLEN);
 553 
 554         if (error == 0 && vn_ismntpt(vp)) {
 555                 error = rfs_cross_mnt(&vp, &exi);
 556                 if (error)
 557                         VN_RELE(vp);
 558         }
 559 
 560         if (!error) {
 561                 va.va_mask = AT_ALL;    /* we want everything */
 562 
 563                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 564 
 565                 /* check for overflows */
 566                 if (!error) {
 567                         acl_perm(vp, exi, &va, cr);
 568                         error = vattr_to_nattr(&va, &dr->dr_attr);
 569                         if (!error) {
 570                                 if (sec.sec_flags & SEC_QUERY)
 571                                         error = makefh_ol(&dr->dr_fhandle, exi,
 572                                             sec.sec_index);
 573                                 else {
 574                                         error = makefh(&dr->dr_fhandle, vp,
 575                                             exi);
 576                                         if (!error && publicfh_flag &&
 577                                             !chk_clnt_sec(exi, req))
 578                                                 auth_weak = TRUE;
 579                                 }
 580                         }
 581                 }
 582                 VN_RELE(vp);
 583         }
 584 
 585 out:
 586         VN_RELE(dvp);
 587 
 588         if (exi != NULL)
 589                 exi_rele(exi);
 590 
 591         /*
 592          * If it's public fh, no 0x81, and client's flavor is
 593          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 594          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 595          */
 596         if (auth_weak)
 597                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 598         else
 599                 dr->dr_status = puterrno(error);
 600 }
 601 void *
 602 rfs_lookup_getfh(struct nfsdiropargs *da)
 603 {
 604         return (da->da_fhandle);
 605 }
 606 
 607 /*
 608  * Read symbolic link.
 609  * Returns the string in the symbolic link at the given fhandle.
 610  */
 611 /* ARGSUSED */
 612 void
 613 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
 614     struct svc_req *req, cred_t *cr, bool_t ro)
 615 {
 616         int error;
 617         struct iovec iov;
 618         struct uio uio;
 619         vnode_t *vp;
 620         struct vattr va;
 621         struct sockaddr *ca;
 622         char *name = NULL;
 623         int is_referral = 0;
 624 
 625         vp = nfs_fhtovp(fhp, exi);
 626         if (vp == NULL) {
 627                 rl->rl_data = NULL;
 628                 rl->rl_status = NFSERR_STALE;
 629                 return;
 630         }
 631 
 632         va.va_mask = AT_MODE;
 633 
 634         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
 635 
 636         if (error) {
 637                 VN_RELE(vp);
 638                 rl->rl_data = NULL;
 639                 rl->rl_status = puterrno(error);
 640                 return;
 641         }
 642 
 643         if (MANDLOCK(vp, va.va_mode)) {
 644                 VN_RELE(vp);
 645                 rl->rl_data = NULL;
 646                 rl->rl_status = NFSERR_ACCES;
 647                 return;
 648         }
 649 
 650         /* We lied about the object type for a referral */
 651         if (vn_is_nfs_reparse(vp, cr))
 652                 is_referral = 1;
 653 
 654         /*
 655          * XNFS and RFC1094 require us to return ENXIO if argument
 656          * is not a link. BUGID 1138002.
 657          */
 658         if (vp->v_type != VLNK && !is_referral) {
 659                 VN_RELE(vp);
 660                 rl->rl_data = NULL;
 661                 rl->rl_status = NFSERR_NXIO;
 662                 return;
 663         }
 664 
 665         /*
 666          * Allocate data for pathname.  This will be freed by rfs_rlfree.
 667          */
 668         rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
 669 
 670         if (is_referral) {
 671                 char *s;
 672                 size_t strsz;
 673                 kstat_named_t *stat =
 674                     exi->exi_ne->ne_globals->svstat[NFS_VERSION];
 675 
 676                 /* Get an artificial symlink based on a referral */
 677                 s = build_symlink(vp, cr, &strsz);
 678                 stat[NFS_REFERLINKS].value.ui64++;
 679                 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
 680                     vnode_t *, vp, char *, s);
 681                 if (s == NULL)
 682                         error = EINVAL;
 683                 else {
 684                         error = 0;
 685                         (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
 686                         rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
 687                         kmem_free(s, strsz);
 688                 }
 689 
 690         } else {
 691 
 692                 /*
 693                  * Set up io vector to read sym link data
 694                  */
 695                 iov.iov_base = rl->rl_data;
 696                 iov.iov_len = NFS_MAXPATHLEN;
 697                 uio.uio_iov = &iov;
 698                 uio.uio_iovcnt = 1;
 699                 uio.uio_segflg = UIO_SYSSPACE;
 700                 uio.uio_extflg = UIO_COPY_CACHED;
 701                 uio.uio_loffset = (offset_t)0;
 702                 uio.uio_resid = NFS_MAXPATHLEN;
 703 
 704                 /*
 705                  * Do the readlink.
 706                  */
 707                 error = VOP_READLINK(vp, &uio, cr, NULL);
 708 
 709                 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
 710 
 711                 if (!error)
 712                         rl->rl_data[rl->rl_count] = '\0';
 713 
 714         }
 715 
 716 
 717         VN_RELE(vp);
 718 
 719         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 720         name = nfscmd_convname(ca, exi, rl->rl_data,
 721             NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
 722 
 723         if (name != NULL && name != rl->rl_data) {
 724                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 725                 rl->rl_data = name;
 726         }
 727 
 728         /*
 729          * XNFS and RFC1094 require us to return ENXIO if argument
 730          * is not a link. UFS returns EINVAL if this is the case,
 731          * so we do the mapping here. BUGID 1138002.
 732          */
 733         if (error == EINVAL)
 734                 rl->rl_status = NFSERR_NXIO;
 735         else
 736                 rl->rl_status = puterrno(error);
 737 
 738 }
 739 void *
 740 rfs_readlink_getfh(fhandle_t *fhp)
 741 {
 742         return (fhp);
 743 }
 744 /*
 745  * Free data allocated by rfs_readlink
 746  */
 747 void
 748 rfs_rlfree(struct nfsrdlnres *rl)
 749 {
 750         if (rl->rl_data != NULL)
 751                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 752 }
 753 
 754 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 755 
 756 /*
 757  * Read data.
 758  * Returns some data read from the file at the given fhandle.
 759  */
 760 /* ARGSUSED */
 761 void
 762 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 763     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 764 {
 765         vnode_t *vp;
 766         int error;
 767         struct vattr va;
 768         struct iovec iov;
 769         struct uio uio;
 770         mblk_t *mp;
 771         int alloc_err = 0;
 772         int in_crit = 0;
 773         caller_context_t ct;
 774 
 775         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 776         if (vp == NULL) {
 777                 rr->rr_data = NULL;
 778                 rr->rr_status = NFSERR_STALE;
 779                 return;
 780         }
 781 
 782         if (vp->v_type != VREG) {
 783                 VN_RELE(vp);
 784                 rr->rr_data = NULL;
 785                 rr->rr_status = NFSERR_ISDIR;
 786                 return;
 787         }
 788 
 789         ct.cc_sysid = 0;
 790         ct.cc_pid = 0;
 791         ct.cc_caller_id = nfs2_srv_caller_id;
 792         ct.cc_flags = CC_DONTBLOCK;
 793 
 794         /*
 795          * Enter the critical region before calling VOP_RWLOCK
 796          * to avoid a deadlock with write requests.
 797          */
 798         if (nbl_need_check(vp)) {
 799                 nbl_start_crit(vp, RW_READER);
 800                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 801                     0, NULL)) {
 802                         nbl_end_crit(vp);
 803                         VN_RELE(vp);
 804                         rr->rr_data = NULL;
 805                         rr->rr_status = NFSERR_ACCES;
 806                         return;
 807                 }
 808                 in_crit = 1;
 809         }
 810 
 811         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 812 
 813         /* check if a monitor detected a delegation conflict */
 814         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 815                 if (in_crit)
 816                         nbl_end_crit(vp);
 817                 VN_RELE(vp);
 818                 /* mark as wouldblock so response is dropped */
 819                 curthread->t_flag |= T_WOULDBLOCK;
 820 
 821                 rr->rr_data = NULL;
 822                 return;
 823         }
 824 
 825         va.va_mask = AT_ALL;
 826 
 827         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 828 
 829         if (error) {
 830                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 831                 if (in_crit)
 832                         nbl_end_crit(vp);
 833 
 834                 VN_RELE(vp);
 835                 rr->rr_data = NULL;
 836                 rr->rr_status = puterrno(error);
 837 
 838                 return;
 839         }
 840 
 841         /*
 842          * This is a kludge to allow reading of files created
 843          * with no read permission.  The owner of the file
 844          * is always allowed to read it.
 845          */
 846         if (crgetuid(cr) != va.va_uid) {
 847                 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 848 
 849                 if (error) {
 850                         /*
 851                          * Exec is the same as read over the net because
 852                          * of demand loading.
 853                          */
 854                         error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 855                 }
 856                 if (error) {
 857                         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 858                         if (in_crit)
 859                                 nbl_end_crit(vp);
 860                         VN_RELE(vp);
 861                         rr->rr_data = NULL;
 862                         rr->rr_status = puterrno(error);
 863 
 864                         return;
 865                 }
 866         }
 867 
 868         if (MANDLOCK(vp, va.va_mode)) {
 869                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 870                 if (in_crit)
 871                         nbl_end_crit(vp);
 872 
 873                 VN_RELE(vp);
 874                 rr->rr_data = NULL;
 875                 rr->rr_status = NFSERR_ACCES;
 876 
 877                 return;
 878         }
 879 
 880         rr->rr_ok.rrok_wlist_len = 0;
 881         rr->rr_ok.rrok_wlist = NULL;
 882 
 883         if ((u_offset_t)ra->ra_offset >= va.va_size) {
 884                 rr->rr_count = 0;
 885                 rr->rr_data = NULL;
 886                 /*
 887                  * In this case, status is NFS_OK, but there is no data
 888                  * to encode. So set rr_mp to NULL.
 889                  */
 890                 rr->rr_mp = NULL;
 891                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 892                 if (rr->rr_ok.rrok_wlist)
 893                         clist_zero_len(rr->rr_ok.rrok_wlist);
 894                 goto done;
 895         }
 896 
 897         if (ra->ra_wlist) {
 898                 mp = NULL;
 899                 rr->rr_mp = NULL;
 900                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 901                 if (ra->ra_count > iov.iov_len) {
 902                         rr->rr_data = NULL;
 903                         rr->rr_status = NFSERR_INVAL;
 904                         goto done;
 905                 }
 906         } else {
 907                 /*
 908                  * mp will contain the data to be sent out in the read reply.
 909                  * This will be freed after the reply has been sent out (by the
 910                  * driver).
 911                  * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
 912                  * that the call to xdrmblk_putmblk() never fails.
 913                  */
 914                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 915                     &alloc_err);
 916                 ASSERT(mp != NULL);
 917                 ASSERT(alloc_err == 0);
 918 
 919                 rr->rr_mp = mp;
 920 
 921                 /*
 922                  * Set up io vector
 923                  */
 924                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 925                 iov.iov_len = ra->ra_count;
 926         }
 927 
 928         uio.uio_iov = &iov;
 929         uio.uio_iovcnt = 1;
 930         uio.uio_segflg = UIO_SYSSPACE;
 931         uio.uio_extflg = UIO_COPY_CACHED;
 932         uio.uio_loffset = (offset_t)ra->ra_offset;
 933         uio.uio_resid = ra->ra_count;
 934 
 935         error = VOP_READ(vp, &uio, 0, cr, &ct);
 936 
 937         if (error) {
 938                 if (mp)
 939                         freeb(mp);
 940 
 941                 /*
 942                  * check if a monitor detected a delegation conflict and
 943                  * mark as wouldblock so response is dropped
 944                  */
 945                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 946                         curthread->t_flag |= T_WOULDBLOCK;
 947                 else
 948                         rr->rr_status = puterrno(error);
 949 
 950                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 951                 if (in_crit)
 952                         nbl_end_crit(vp);
 953 
 954                 VN_RELE(vp);
 955                 rr->rr_data = NULL;
 956 
 957                 return;
 958         }
 959 
 960         /*
 961          * Get attributes again so we can send the latest access
 962          * time to the client side for its cache.
 963          */
 964         va.va_mask = AT_ALL;
 965 
 966         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 967 
 968         if (error) {
 969                 if (mp)
 970                         freeb(mp);
 971 
 972                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 973                 if (in_crit)
 974                         nbl_end_crit(vp);
 975 
 976                 VN_RELE(vp);
 977                 rr->rr_data = NULL;
 978                 rr->rr_status = puterrno(error);
 979 
 980                 return;
 981         }
 982 
 983         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 984 
 985         if (mp) {
 986                 rr->rr_data = (char *)mp->b_datap->db_base;
 987         } else {
 988                 if (ra->ra_wlist) {
 989                         rr->rr_data = (caddr_t)iov.iov_base;
 990                         if (!rdma_setup_read_data2(ra, rr)) {
 991                                 rr->rr_data = NULL;
 992                                 rr->rr_status = puterrno(NFSERR_INVAL);
 993                         }
 994                 }
 995         }
 996 done:
 997         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 998         if (in_crit)
 999                 nbl_end_crit(vp);
1000 
1001         acl_perm(vp, exi, &va, cr);
1002 
1003         /* check for overflows */
1004         error = vattr_to_nattr(&va, &rr->rr_attr);
1005 
1006         VN_RELE(vp);
1007 
1008         rr->rr_status = puterrno(error);
1009 }
1010 
1011 /*
1012  * Free data allocated by rfs_read
1013  */
1014 void
1015 rfs_rdfree(struct nfsrdresult *rr)
1016 {
1017         mblk_t *mp;
1018 
1019         if (rr->rr_status == NFS_OK) {
1020                 mp = rr->rr_mp;
1021                 if (mp != NULL)
1022                         freeb(mp);
1023         }
1024 }
1025 
1026 void *
1027 rfs_read_getfh(struct nfsreadargs *ra)
1028 {
1029         return (&ra->ra_fhandle);
1030 }
1031 
/*
 * Size of the on-stack iovec array in rfs_write_sync; longer mblk chains
 * get a kmem_alloc'ed array instead.
 */
#define MAX_IOVECS      12

#ifdef DEBUG
/*
 * rfs_write_sync mblk-chain writes that fit in MAX_IOVECS (hits) and
 * those that needed a heap iovec array (misses).
 */
static int rfs_write_sync_hits;
static int rfs_write_sync_misses;
#endif
1038 
1039 /*
1040  * Write data to file.
1041  * Returns attributes of a file after writing some data to it.
1042  *
1043  * Any changes made here, especially in error handling might have
1044  * to also be done in rfs_write (which clusters write requests).
1045  */
1046 /* ARGSUSED */
1047 void
1048 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1049     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1050 {
1051         int error;
1052         vnode_t *vp;
1053         rlim64_t rlimit;
1054         struct vattr va;
1055         struct uio uio;
1056         struct iovec iov[MAX_IOVECS];
1057         mblk_t *m;
1058         struct iovec *iovp;
1059         int iovcnt;
1060         cred_t *savecred;
1061         int in_crit = 0;
1062         caller_context_t ct;
1063 
1064         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1065         if (vp == NULL) {
1066                 ns->ns_status = NFSERR_STALE;
1067                 return;
1068         }
1069 
1070         if (rdonly(ro, vp)) {
1071                 VN_RELE(vp);
1072                 ns->ns_status = NFSERR_ROFS;
1073                 return;
1074         }
1075 
1076         if (vp->v_type != VREG) {
1077                 VN_RELE(vp);
1078                 ns->ns_status = NFSERR_ISDIR;
1079                 return;
1080         }
1081 
1082         ct.cc_sysid = 0;
1083         ct.cc_pid = 0;
1084         ct.cc_caller_id = nfs2_srv_caller_id;
1085         ct.cc_flags = CC_DONTBLOCK;
1086 
1087         va.va_mask = AT_UID|AT_MODE;
1088 
1089         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1090 
1091         if (error) {
1092                 VN_RELE(vp);
1093                 ns->ns_status = puterrno(error);
1094 
1095                 return;
1096         }
1097 
1098         if (crgetuid(cr) != va.va_uid) {
1099                 /*
1100                  * This is a kludge to allow writes of files created
1101                  * with read only permission.  The owner of the file
1102                  * is always allowed to write it.
1103                  */
1104                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1105 
1106                 if (error) {
1107                         VN_RELE(vp);
1108                         ns->ns_status = puterrno(error);
1109                         return;
1110                 }
1111         }
1112 
1113         /*
1114          * Can't access a mandatory lock file.  This might cause
1115          * the NFS service thread to block forever waiting for a
1116          * lock to be released that will never be released.
1117          */
1118         if (MANDLOCK(vp, va.va_mode)) {
1119                 VN_RELE(vp);
1120                 ns->ns_status = NFSERR_ACCES;
1121                 return;
1122         }
1123 
1124         /*
1125          * We have to enter the critical region before calling VOP_RWLOCK
1126          * to avoid a deadlock with ufs.
1127          */
1128         if (nbl_need_check(vp)) {
1129                 nbl_start_crit(vp, RW_READER);
1130                 in_crit = 1;
1131                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1132                     wa->wa_count, 0, NULL)) {
1133                         error = EACCES;
1134                         goto out;
1135                 }
1136         }
1137 
1138         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1139 
1140         /* check if a monitor detected a delegation conflict */
1141         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1142                 goto out;
1143         }
1144 
1145         if (wa->wa_data || wa->wa_rlist) {
1146                 /* Do the RDMA thing if necessary */
1147                 if (wa->wa_rlist) {
1148                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1149                         iov[0].iov_len = wa->wa_count;
1150                 } else  {
1151                         iov[0].iov_base = wa->wa_data;
1152                         iov[0].iov_len = wa->wa_count;
1153                 }
1154                 uio.uio_iov = iov;
1155                 uio.uio_iovcnt = 1;
1156                 uio.uio_segflg = UIO_SYSSPACE;
1157                 uio.uio_extflg = UIO_COPY_DEFAULT;
1158                 uio.uio_loffset = (offset_t)wa->wa_offset;
1159                 uio.uio_resid = wa->wa_count;
1160                 /*
1161                  * The limit is checked on the client. We
1162                  * should allow any size writes here.
1163                  */
1164                 uio.uio_llimit = curproc->p_fsz_ctl;
1165                 rlimit = uio.uio_llimit - wa->wa_offset;
1166                 if (rlimit < (rlim64_t)uio.uio_resid)
1167                         uio.uio_resid = (uint_t)rlimit;
1168 
1169                 /*
1170                  * for now we assume no append mode
1171                  */
1172                 /*
1173                  * We're changing creds because VM may fault and we need
1174                  * the cred of the current thread to be used if quota
1175                  * checking is enabled.
1176                  */
1177                 savecred = curthread->t_cred;
1178                 curthread->t_cred = cr;
1179                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1180                 curthread->t_cred = savecred;
1181         } else {
1182 
1183                 iovcnt = 0;
1184                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1185                         iovcnt++;
1186                 if (iovcnt <= MAX_IOVECS) {
1187 #ifdef DEBUG
1188                         rfs_write_sync_hits++;
1189 #endif
1190                         iovp = iov;
1191                 } else {
1192 #ifdef DEBUG
1193                         rfs_write_sync_misses++;
1194 #endif
1195                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1196                 }
1197                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1198                 uio.uio_iov = iovp;
1199                 uio.uio_iovcnt = iovcnt;
1200                 uio.uio_segflg = UIO_SYSSPACE;
1201                 uio.uio_extflg = UIO_COPY_DEFAULT;
1202                 uio.uio_loffset = (offset_t)wa->wa_offset;
1203                 uio.uio_resid = wa->wa_count;
1204                 /*
1205                  * The limit is checked on the client. We
1206                  * should allow any size writes here.
1207                  */
1208                 uio.uio_llimit = curproc->p_fsz_ctl;
1209                 rlimit = uio.uio_llimit - wa->wa_offset;
1210                 if (rlimit < (rlim64_t)uio.uio_resid)
1211                         uio.uio_resid = (uint_t)rlimit;
1212 
1213                 /*
1214                  * For now we assume no append mode.
1215                  */
1216                 /*
1217                  * We're changing creds because VM may fault and we need
1218                  * the cred of the current thread to be used if quota
1219                  * checking is enabled.
1220                  */
1221                 savecred = curthread->t_cred;
1222                 curthread->t_cred = cr;
1223                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1224                 curthread->t_cred = savecred;
1225 
1226                 if (iovp != iov)
1227                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1228         }
1229 
1230         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1231 
1232         if (!error) {
1233                 /*
1234                  * Get attributes again so we send the latest mod
1235                  * time to the client side for its cache.
1236                  */
1237                 va.va_mask = AT_ALL;    /* now we want everything */
1238 
1239                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1240 
1241                 /* check for overflows */
1242                 if (!error) {
1243                         acl_perm(vp, exi, &va, cr);
1244                         error = vattr_to_nattr(&va, &ns->ns_attr);
1245                 }
1246         }
1247 
1248 out:
1249         if (in_crit)
1250                 nbl_end_crit(vp);
1251         VN_RELE(vp);
1252 
1253         /* check if a monitor detected a delegation conflict */
1254         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1255                 /* mark as wouldblock so response is dropped */
1256                 curthread->t_flag |= T_WOULDBLOCK;
1257         else
1258                 ns->ns_status = puterrno(error);
1259 
1260 }
1261 
1262 struct rfs_async_write {
1263         struct nfswriteargs *wa;
1264         struct nfsattrstat *ns;
1265         struct svc_req *req;
1266         cred_t *cr;
1267         bool_t ro;
1268         kthread_t *thread;
1269         struct rfs_async_write *list;
1270 };
1271 
1272 struct rfs_async_write_list {
1273         fhandle_t *fhp;
1274         kcondvar_t cv;
1275         struct rfs_async_write *list;
1276         struct rfs_async_write_list *next;
1277 };
1278 
1279 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1280 static kmutex_t rfs_async_write_lock;
1281 static int rfs_write_async = 1; /* enables write clustering if == 1 */
1282 
/*
 * Size of the on-stack iovec array rfs_write uses when gathering a
 * cluster; larger gathers get a kmem_alloc'ed array.
 */
#define MAXCLIOVECS     42

/*
 * Value parked in ns_status while a clustered write has not been given a
 * result yet.  0 cannot be used because it is considered an OK status.
 * The replacement list is fully parenthesized so the cast and its operand
 * stay together wherever the macro is expanded.
 */
#define RFSWRITE_INITVAL        ((enum nfsstat)-1)

#ifdef DEBUG
/*
 * rfs_write gathers that fit in MAXCLIOVECS (hits) and those that needed
 * a heap iovec array (misses).
 */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1290 
1291 /*
1292  * Write data to file.
1293  * Returns attributes of a file after writing some data to it.
1294  */
1295 void
1296 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1297     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1298 {
1299         int error;
1300         vnode_t *vp;
1301         rlim64_t rlimit;
1302         struct vattr va;
1303         struct uio uio;
1304         struct rfs_async_write_list *lp;
1305         struct rfs_async_write_list *nlp;
1306         struct rfs_async_write *rp;
1307         struct rfs_async_write *nrp;
1308         struct rfs_async_write *trp;
1309         struct rfs_async_write *lrp;
1310         int data_written;
1311         int iovcnt;
1312         mblk_t *m;
1313         struct iovec *iovp;
1314         struct iovec *niovp;
1315         struct iovec iov[MAXCLIOVECS];
1316         int count;
1317         int rcount;
1318         uint_t off;
1319         uint_t len;
1320         struct rfs_async_write nrpsp;
1321         struct rfs_async_write_list nlpsp;
1322         ushort_t t_flag;
1323         cred_t *savecred;
1324         int in_crit = 0;
1325         caller_context_t ct;
1326         nfs_srv_t *nsrv;
1327 
1328         ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
1329         nsrv = nfs_get_srv();
1330         if (!nsrv->write_async) {
1331                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1332                 return;
1333         }
1334 
1335         /*
1336          * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1337          * is considered an OK.
1338          */
1339         ns->ns_status = RFSWRITE_INITVAL;
1340 
1341         nrp = &nrpsp;
1342         nrp->wa = wa;
1343         nrp->ns = ns;
1344         nrp->req = req;
1345         nrp->cr = cr;
1346         nrp->ro = ro;
1347         nrp->thread = curthread;
1348 
1349         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1350 
1351         /*
1352          * Look to see if there is already a cluster started
1353          * for this file.
1354          */
1355         mutex_enter(&nsrv->async_write_lock);
1356         for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1357                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1358                     sizeof (fhandle_t)) == 0)
1359                         break;
1360         }
1361 
1362         /*
1363          * If lp is non-NULL, then there is already a cluster
1364          * started.  We need to place ourselves in the cluster
1365          * list in the right place as determined by starting
1366          * offset.  Conflicts with non-blocking mandatory locked
1367          * regions will be checked when the cluster is processed.
1368          */
1369         if (lp != NULL) {
1370                 rp = lp->list;
1371                 trp = NULL;
1372                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1373                         trp = rp;
1374                         rp = rp->list;
1375                 }
1376                 nrp->list = rp;
1377                 if (trp == NULL)
1378                         lp->list = nrp;
1379                 else
1380                         trp->list = nrp;
1381                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1382                         cv_wait(&lp->cv, &nsrv->async_write_lock);
1383                 mutex_exit(&nsrv->async_write_lock);
1384 
1385                 return;
1386         }
1387 
1388         /*
1389          * No cluster started yet, start one and add ourselves
1390          * to the list of clusters.
1391          */
1392         nrp->list = NULL;
1393 
1394         nlp = &nlpsp;
1395         nlp->fhp = &wa->wa_fhandle;
1396         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1397         nlp->list = nrp;
1398         nlp->next = NULL;
1399 
1400         if (nsrv->async_write_head == NULL) {
1401                 nsrv->async_write_head = nlp;
1402         } else {
1403                 lp = nsrv->async_write_head;
1404                 while (lp->next != NULL)
1405                         lp = lp->next;
1406                 lp->next = nlp;
1407         }
1408         mutex_exit(&nsrv->async_write_lock);
1409 
1410         /*
1411          * Convert the file handle common to all of the requests
1412          * in this cluster to a vnode.
1413          */
1414         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1415         if (vp == NULL) {
1416                 mutex_enter(&nsrv->async_write_lock);
1417                 if (nsrv->async_write_head == nlp)
1418                         nsrv->async_write_head = nlp->next;
1419                 else {
1420                         lp = nsrv->async_write_head;
1421                         while (lp->next != nlp)
1422                                 lp = lp->next;
1423                         lp->next = nlp->next;
1424                 }
1425                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1426                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1427                         rp->ns->ns_status = NFSERR_STALE;
1428                         rp->thread->t_flag |= t_flag;
1429                 }
1430                 cv_broadcast(&nlp->cv);
1431                 mutex_exit(&nsrv->async_write_lock);
1432 
1433                 return;
1434         }
1435 
1436         /*
1437          * Can only write regular files.  Attempts to write any
1438          * other file types fail with EISDIR.
1439          */
1440         if (vp->v_type != VREG) {
1441                 VN_RELE(vp);
1442                 mutex_enter(&nsrv->async_write_lock);
1443                 if (nsrv->async_write_head == nlp)
1444                         nsrv->async_write_head = nlp->next;
1445                 else {
1446                         lp = nsrv->async_write_head;
1447                         while (lp->next != nlp)
1448                                 lp = lp->next;
1449                         lp->next = nlp->next;
1450                 }
1451                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1452                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1453                         rp->ns->ns_status = NFSERR_ISDIR;
1454                         rp->thread->t_flag |= t_flag;
1455                 }
1456                 cv_broadcast(&nlp->cv);
1457                 mutex_exit(&nsrv->async_write_lock);
1458 
1459                 return;
1460         }
1461 
1462         /*
1463          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1464          * deadlock with ufs.
1465          */
1466         if (nbl_need_check(vp)) {
1467                 nbl_start_crit(vp, RW_READER);
1468                 in_crit = 1;
1469         }
1470 
1471         ct.cc_sysid = 0;
1472         ct.cc_pid = 0;
1473         ct.cc_caller_id = nfs2_srv_caller_id;
1474         ct.cc_flags = CC_DONTBLOCK;
1475 
1476         /*
1477          * Lock the file for writing.  This operation provides
1478          * the delay which allows clusters to grow.
1479          */
1480         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1481 
1482         /* check if a monitor detected a delegation conflict */
1483         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1484                 if (in_crit)
1485                         nbl_end_crit(vp);
1486                 VN_RELE(vp);
1487                 /* mark as wouldblock so response is dropped */
1488                 curthread->t_flag |= T_WOULDBLOCK;
1489                 mutex_enter(&nsrv->async_write_lock);
1490                 if (nsrv->async_write_head == nlp)
1491                         nsrv->async_write_head = nlp->next;
1492                 else {
1493                         lp = nsrv->async_write_head;
1494                         while (lp->next != nlp)
1495                                 lp = lp->next;
1496                         lp->next = nlp->next;
1497                 }
1498                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1499                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1500                                 rp->ns->ns_status = puterrno(error);
1501                                 rp->thread->t_flag |= T_WOULDBLOCK;
1502                         }
1503                 }
1504                 cv_broadcast(&nlp->cv);
1505                 mutex_exit(&nsrv->async_write_lock);
1506 
1507                 return;
1508         }
1509 
1510         /*
1511          * Disconnect this cluster from the list of clusters.
1512          * The cluster that is being dealt with must be fixed
1513          * in size after this point, so there is no reason
1514          * to leave it on the list so that new requests can
1515          * find it.
1516          *
1517          * The algorithm is that the first write request will
1518          * create a cluster, convert the file handle to a
1519          * vnode pointer, and then lock the file for writing.
1520          * This request is not likely to be clustered with
1521          * any others.  However, the next request will create
1522          * a new cluster and be blocked in VOP_RWLOCK while
1523          * the first request is being processed.  This delay
1524          * will allow more requests to be clustered in this
1525          * second cluster.
1526          */
1527         mutex_enter(&nsrv->async_write_lock);
1528         if (nsrv->async_write_head == nlp)
1529                 nsrv->async_write_head = nlp->next;
1530         else {
1531                 lp = nsrv->async_write_head;
1532                 while (lp->next != nlp)
1533                         lp = lp->next;
1534                 lp->next = nlp->next;
1535         }
1536         mutex_exit(&nsrv->async_write_lock);
1537 
1538         /*
1539          * Step through the list of requests in this cluster.
1540          * We need to check permissions to make sure that all
1541          * of the requests have sufficient permission to write
1542          * the file.  A cluster can be composed of requests
1543          * from different clients and different users on each
1544          * client.
1545          *
1546          * As a side effect, we also calculate the size of the
1547          * byte range that this cluster encompasses.
1548          */
1549         rp = nlp->list;
1550         off = rp->wa->wa_offset;
1551         len = (uint_t)0;
1552         do {
1553                 if (rdonly(rp->ro, vp)) {
1554                         rp->ns->ns_status = NFSERR_ROFS;
1555                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1556                         rp->thread->t_flag |= t_flag;
1557                         continue;
1558                 }
1559 
1560                 va.va_mask = AT_UID|AT_MODE;
1561 
1562                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1563 
1564                 if (!error) {
1565                         if (crgetuid(rp->cr) != va.va_uid) {
1566                                 /*
1567                                  * This is a kludge to allow writes of files
1568                                  * created with read only permission.  The
1569                                  * owner of the file is always allowed to
1570                                  * write it.
1571                                  */
1572                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1573                         }
1574                         if (!error && MANDLOCK(vp, va.va_mode))
1575                                 error = EACCES;
1576                 }
1577 
1578                 /*
1579                  * Check for a conflict with a nbmand-locked region.
1580                  */
1581                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1582                     rp->wa->wa_count, 0, NULL)) {
1583                         error = EACCES;
1584                 }
1585 
1586                 if (error) {
1587                         rp->ns->ns_status = puterrno(error);
1588                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1589                         rp->thread->t_flag |= t_flag;
1590                         continue;
1591                 }
1592                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1593                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1594         } while ((rp = rp->list) != NULL);
1595 
1596         /*
1597          * Step through the cluster attempting to gather as many
1598          * requests which are contiguous as possible.  These
1599          * contiguous requests are handled via one call to VOP_WRITE
1600          * instead of different calls to VOP_WRITE.  We also keep
1601          * track of the fact that any data was written.
1602          */
1603         rp = nlp->list;
1604         data_written = 0;
1605         do {
1606                 /*
1607                  * Skip any requests which are already marked as having an
1608                  * error.
1609                  */
1610                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1611                         rp = rp->list;
1612                         continue;
1613                 }
1614 
1615                 /*
1616                  * Count the number of iovec's which are required
1617                  * to handle this set of requests.  One iovec is
1618                  * needed for each data buffer, whether addressed
1619                  * by wa_data or by the b_rptr pointers in the
1620                  * mblk chains.
1621                  */
1622                 iovcnt = 0;
1623                 lrp = rp;
1624                 for (;;) {
1625                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1626                                 iovcnt++;
1627                         else {
1628                                 m = lrp->wa->wa_mblk;
1629                                 while (m != NULL) {
1630                                         iovcnt++;
1631                                         m = m->b_cont;
1632                                 }
1633                         }
1634                         if (lrp->list == NULL ||
1635                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1636                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1637                             lrp->list->wa->wa_offset) {
1638                                 lrp = lrp->list;
1639                                 break;
1640                         }
1641                         lrp = lrp->list;
1642                 }
1643 
1644                 if (iovcnt <= MAXCLIOVECS) {
1645 #ifdef DEBUG
1646                         rfs_write_hits++;
1647 #endif
1648                         niovp = iov;
1649                 } else {
1650 #ifdef DEBUG
1651                         rfs_write_misses++;
1652 #endif
1653                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1654                 }
1655                 /*
1656                  * Put together the scatter/gather iovecs.
1657                  */
1658                 iovp = niovp;
1659                 trp = rp;
1660                 count = 0;
1661                 do {
1662                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1663                                 if (trp->wa->wa_rlist) {
1664                                         iovp->iov_base =
1665                                             (char *)((trp->wa->wa_rlist)->
1666                                             u.c_daddr3);
1667                                         iovp->iov_len = trp->wa->wa_count;
1668                                 } else  {
1669                                         iovp->iov_base = trp->wa->wa_data;
1670                                         iovp->iov_len = trp->wa->wa_count;
1671                                 }
1672                                 iovp++;
1673                         } else {
1674                                 m = trp->wa->wa_mblk;
1675                                 rcount = trp->wa->wa_count;
1676                                 while (m != NULL) {
1677                                         iovp->iov_base = (caddr_t)m->b_rptr;
1678                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1679                                         rcount -= iovp->iov_len;
1680                                         if (rcount < 0)
1681                                                 iovp->iov_len += rcount;
1682                                         iovp++;
1683                                         if (rcount <= 0)
1684                                                 break;
1685                                         m = m->b_cont;
1686                                 }
1687                         }
1688                         count += trp->wa->wa_count;
1689                         trp = trp->list;
1690                 } while (trp != lrp);
1691 
1692                 uio.uio_iov = niovp;
1693                 uio.uio_iovcnt = iovcnt;
1694                 uio.uio_segflg = UIO_SYSSPACE;
1695                 uio.uio_extflg = UIO_COPY_DEFAULT;
1696                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1697                 uio.uio_resid = count;
1698                 /*
1699                  * The limit is checked on the client. We
1700                  * should allow any size writes here.
1701                  */
1702                 uio.uio_llimit = curproc->p_fsz_ctl;
1703                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1704                 if (rlimit < (rlim64_t)uio.uio_resid)
1705                         uio.uio_resid = (uint_t)rlimit;
1706 
1707                 /*
1708                  * For now we assume no append mode.
1709                  */
1710 
1711                 /*
1712                  * We're changing creds because VM may fault
1713                  * and we need the cred of the current
1714                  * thread to be used if quota * checking is
1715                  * enabled.
1716                  */
1717                 savecred = curthread->t_cred;
1718                 curthread->t_cred = cr;
1719                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1720                 curthread->t_cred = savecred;
1721 
1722                 /* check if a monitor detected a delegation conflict */
1723                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1724                         /* mark as wouldblock so response is dropped */
1725                         curthread->t_flag |= T_WOULDBLOCK;
1726 
1727                 if (niovp != iov)
1728                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1729 
1730                 if (!error) {
1731                         data_written = 1;
1732                         /*
1733                          * Get attributes again so we send the latest mod
1734                          * time to the client side for its cache.
1735                          */
1736                         va.va_mask = AT_ALL;    /* now we want everything */
1737 
1738                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1739 
1740                         if (!error)
1741                                 acl_perm(vp, exi, &va, rp->cr);
1742                 }
1743 
1744                 /*
1745                  * Fill in the status responses for each request
1746                  * which was just handled.  Also, copy the latest
1747                  * attributes in to the attribute responses if
1748                  * appropriate.
1749                  */
1750                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1751                 do {
1752                         rp->thread->t_flag |= t_flag;
1753                         /* check for overflows */
1754                         if (!error) {
1755                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1756                         }
1757                         rp->ns->ns_status = puterrno(error);
1758                         rp = rp->list;
1759                 } while (rp != lrp);
1760         } while (rp != NULL);
1761 
1762         /*
1763          * If any data was written at all, then we need to flush
1764          * the data and metadata to stable storage.
1765          */
1766         if (data_written) {
1767                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1768 
1769                 if (!error) {
1770                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1771                 }
1772         }
1773 
1774         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1775 
1776         if (in_crit)
1777                 nbl_end_crit(vp);
1778         VN_RELE(vp);
1779 
1780         t_flag = curthread->t_flag & T_WOULDBLOCK;
1781         mutex_enter(&nsrv->async_write_lock);
1782         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1783                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1784                         rp->ns->ns_status = puterrno(error);
1785                         rp->thread->t_flag |= t_flag;
1786                 }
1787         }
1788         cv_broadcast(&nlp->cv);
1789         mutex_exit(&nsrv->async_write_lock);
1790 
1791 }
1792 
1793 void *
1794 rfs_write_getfh(struct nfswriteargs *wa)
1795 {
1796         return (&wa->wa_fhandle);
1797 }
1798 
1799 /*
1800  * Create a file.
1801  * Creates a file with given attributes and returns those attributes
1802  * and an fhandle for the new file.
1803  */
1804 void
1805 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1806     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1807 {
1808         int error;
1809         int lookuperr;
1810         int in_crit = 0;
1811         struct vattr va;
1812         vnode_t *vp;
1813         vnode_t *realvp;
1814         vnode_t *dvp;
1815         char *name = args->ca_da.da_name;
1816         vnode_t *tvp = NULL;
1817         int mode;
1818         int lookup_ok;
1819         bool_t trunc;
1820         struct sockaddr *ca;
1821 
1822         /*
1823          * Disallow NULL paths
1824          */
1825         if (name == NULL || *name == '\0') {
1826                 dr->dr_status = NFSERR_ACCES;
1827                 return;
1828         }
1829 
1830         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1831         if (dvp == NULL) {
1832                 dr->dr_status = NFSERR_STALE;
1833                 return;
1834         }
1835 
1836         error = sattr_to_vattr(args->ca_sa, &va);
1837         if (error) {
1838                 dr->dr_status = puterrno(error);
1839                 return;
1840         }
1841 
1842         /*
1843          * Must specify the mode.
1844          */
1845         if (!(va.va_mask & AT_MODE)) {
1846                 VN_RELE(dvp);
1847                 dr->dr_status = NFSERR_INVAL;
1848                 return;
1849         }
1850 
1851         /*
1852          * This is a completely gross hack to make mknod
1853          * work over the wire until we can wack the protocol
1854          */
1855         if ((va.va_mode & IFMT) == IFCHR) {
1856                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1857                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1858                 else {
1859                         va.va_type = VCHR;
1860                         /*
1861                          * uncompress the received dev_t
1862                          * if the top half is zero indicating a request
1863                          * from an `older style' OS.
1864                          */
1865                         if ((va.va_size & 0xffff0000) == 0)
1866                                 va.va_rdev = nfsv2_expdev(va.va_size);
1867                         else
1868                                 va.va_rdev = (dev_t)va.va_size;
1869                 }
1870                 va.va_mask &= ~AT_SIZE;
1871         } else if ((va.va_mode & IFMT) == IFBLK) {
1872                 va.va_type = VBLK;
1873                 /*
1874                  * uncompress the received dev_t
1875                  * if the top half is zero indicating a request
1876                  * from an `older style' OS.
1877                  */
1878                 if ((va.va_size & 0xffff0000) == 0)
1879                         va.va_rdev = nfsv2_expdev(va.va_size);
1880                 else
1881                         va.va_rdev = (dev_t)va.va_size;
1882                 va.va_mask &= ~AT_SIZE;
1883         } else if ((va.va_mode & IFMT) == IFSOCK) {
1884                 va.va_type = VSOCK;
1885         } else {
1886                 va.va_type = VREG;
1887         }
1888         va.va_mode &= ~IFMT;
1889         va.va_mask |= AT_TYPE;
1890 
1891         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1892         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1893             MAXPATHLEN);
1894         if (name == NULL) {
1895                 dr->dr_status = puterrno(EINVAL);
1896                 return;
1897         }
1898 
1899         /*
1900          * Why was the choice made to use VWRITE as the mode to the
1901          * call to VOP_CREATE ? This results in a bug.  When a client
1902          * opens a file that already exists and is RDONLY, the second
1903          * open fails with an EACESS because of the mode.
1904          * bug ID 1054648.
1905          */
1906         lookup_ok = 0;
1907         mode = VWRITE;
1908         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1909                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1910                     NULL, NULL, NULL);
1911                 if (!error) {
1912                         struct vattr at;
1913 
1914                         lookup_ok = 1;
1915                         at.va_mask = AT_MODE;
1916                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1917                         if (!error)
1918                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1919                         VN_RELE(tvp);
1920                         tvp = NULL;
1921                 }
1922         }
1923 
1924         if (!lookup_ok) {
1925                 if (rdonly(ro, dvp)) {
1926                         error = EROFS;
1927                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1928                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1929                         error = EPERM;
1930                 } else {
1931                         error = 0;
1932                 }
1933         }
1934 
1935         /*
1936          * If file size is being modified on an already existing file
1937          * make sure that there are no conflicting non-blocking mandatory
1938          * locks in the region being manipulated. Return EACCES if there
1939          * are conflicting locks.
1940          */
1941         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1942                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1943                     NULL, NULL, NULL);
1944 
1945                 if (!lookuperr &&
1946                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1947                         VN_RELE(tvp);
1948                         curthread->t_flag |= T_WOULDBLOCK;
1949                         goto out;
1950                 }
1951 
1952                 if (!lookuperr && nbl_need_check(tvp)) {
1953                         /*
1954                          * The file exists. Now check if it has any
1955                          * conflicting non-blocking mandatory locks
1956                          * in the region being changed.
1957                          */
1958                         struct vattr bva;
1959                         u_offset_t offset;
1960                         ssize_t length;
1961 
1962                         nbl_start_crit(tvp, RW_READER);
1963                         in_crit = 1;
1964 
1965                         bva.va_mask = AT_SIZE;
1966                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1967                         if (!error) {
1968                                 if (va.va_size < bva.va_size) {
1969                                         offset = va.va_size;
1970                                         length = bva.va_size - va.va_size;
1971                                 } else {
1972                                         offset = bva.va_size;
1973                                         length = va.va_size - bva.va_size;
1974                                 }
1975                                 if (length) {
1976                                         if (nbl_conflict(tvp, NBL_WRITE,
1977                                             offset, length, 0, NULL)) {
1978                                                 error = EACCES;
1979                                         }
1980                                 }
1981                         }
1982                         if (error) {
1983                                 nbl_end_crit(tvp);
1984                                 VN_RELE(tvp);
1985                                 in_crit = 0;
1986                         }
1987                 } else if (tvp != NULL) {
1988                         VN_RELE(tvp);
1989                 }
1990         }
1991 
1992         if (!error) {
1993                 /*
1994                  * If filesystem is shared with nosuid the remove any
1995                  * setuid/setgid bits on create.
1996                  */
1997                 if (va.va_type == VREG &&
1998                     exi->exi_export.ex_flags & EX_NOSUID)
1999                         va.va_mode &= ~(VSUID | VSGID);
2000 
2001                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
2002                     NULL, NULL);
2003 
2004                 if (!error) {
2005 
2006                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2007                                 trunc = TRUE;
2008                         else
2009                                 trunc = FALSE;
2010 
2011                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2012                                 VN_RELE(vp);
2013                                 curthread->t_flag |= T_WOULDBLOCK;
2014                                 goto out;
2015                         }
2016                         va.va_mask = AT_ALL;
2017 
2018                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2019 
2020                         /* check for overflows */
2021                         if (!error) {
2022                                 acl_perm(vp, exi, &va, cr);
2023                                 error = vattr_to_nattr(&va, &dr->dr_attr);
2024                                 if (!error) {
2025                                         error = makefh(&dr->dr_fhandle, vp,
2026                                             exi);
2027                                 }
2028                         }
2029                         /*
2030                          * Force modified metadata out to stable storage.
2031                          *
2032                          * if a underlying vp exists, pass it to VOP_FSYNC
2033                          */
2034                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
2035                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2036                         else
2037                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2038                         VN_RELE(vp);
2039                 }
2040 
2041                 if (in_crit) {
2042                         nbl_end_crit(tvp);
2043                         VN_RELE(tvp);
2044                 }
2045         }
2046 
2047         /*
2048          * Force modified data and metadata out to stable storage.
2049          */
2050         (void) VOP_FSYNC(dvp, 0, cr, NULL);
2051 
2052 out:
2053 
2054         VN_RELE(dvp);
2055 
2056         dr->dr_status = puterrno(error);
2057 
2058         if (name != args->ca_da.da_name)
2059                 kmem_free(name, MAXPATHLEN);
2060 }
2061 void *
2062 rfs_create_getfh(struct nfscreatargs *args)
2063 {
2064         return (args->ca_da.da_fhandle);
2065 }
2066 
2067 /*
2068  * Remove a file.
2069  * Remove named file from parent directory.
2070  */
2071 /* ARGSUSED */
2072 void
2073 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2074     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2075 {
2076         int error = 0;
2077         vnode_t *vp;
2078         vnode_t *targvp;
2079         int in_crit = 0;
2080 
2081         /*
2082          * Disallow NULL paths
2083          */
2084         if (da->da_name == NULL || *da->da_name == '\0') {
2085                 *status = NFSERR_ACCES;
2086                 return;
2087         }
2088 
2089         vp = nfs_fhtovp(da->da_fhandle, exi);
2090         if (vp == NULL) {
2091                 *status = NFSERR_STALE;
2092                 return;
2093         }
2094 
2095         if (rdonly(ro, vp)) {
2096                 VN_RELE(vp);
2097                 *status = NFSERR_ROFS;
2098                 return;
2099         }
2100 
2101         /*
2102          * Check for a conflict with a non-blocking mandatory share reservation.
2103          */
2104         error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2105             NULL, cr, NULL, NULL, NULL);
2106         if (error != 0) {
2107                 VN_RELE(vp);
2108                 *status = puterrno(error);
2109                 return;
2110         }
2111 
2112         /*
2113          * If the file is delegated to an v4 client, then initiate
2114          * recall and drop this request (by setting T_WOULDBLOCK).
2115          * The client will eventually re-transmit the request and
2116          * (hopefully), by then, the v4 client will have returned
2117          * the delegation.
2118          */
2119 
2120         if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2121                 VN_RELE(vp);
2122                 VN_RELE(targvp);
2123                 curthread->t_flag |= T_WOULDBLOCK;
2124                 return;
2125         }
2126 
2127         if (nbl_need_check(targvp)) {
2128                 nbl_start_crit(targvp, RW_READER);
2129                 in_crit = 1;
2130                 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2131                         error = EACCES;
2132                         goto out;
2133                 }
2134         }
2135 
2136         error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2137 
2138         /*
2139          * Force modified data and metadata out to stable storage.
2140          */
2141         (void) VOP_FSYNC(vp, 0, cr, NULL);
2142 
2143 out:
2144         if (in_crit)
2145                 nbl_end_crit(targvp);
2146         VN_RELE(targvp);
2147         VN_RELE(vp);
2148 
2149         *status = puterrno(error);
2150 
2151 }
2152 
2153 void *
2154 rfs_remove_getfh(struct nfsdiropargs *da)
2155 {
2156         return (da->da_fhandle);
2157 }
2158 
2159 /*
2160  * rename a file
2161  * Give a file (from) a new name (to).
2162  */
2163 /* ARGSUSED */
2164 void
2165 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2166     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2167 {
2168         int error = 0;
2169         vnode_t *fromvp;
2170         vnode_t *tovp;
2171         struct exportinfo *to_exi;
2172         fhandle_t *fh;
2173         vnode_t *srcvp;
2174         vnode_t *targvp;
2175         int in_crit = 0;
2176 
2177         fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2178         if (fromvp == NULL) {
2179                 *status = NFSERR_STALE;
2180                 return;
2181         }
2182 
2183         fh = args->rna_to.da_fhandle;
2184         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2185         if (to_exi == NULL) {
2186                 VN_RELE(fromvp);
2187                 *status = NFSERR_ACCES;
2188                 return;
2189         }
2190         exi_rele(to_exi);
2191 
2192         if (to_exi != exi) {
2193                 VN_RELE(fromvp);
2194                 *status = NFSERR_XDEV;
2195                 return;
2196         }
2197 
2198         tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2199         if (tovp == NULL) {
2200                 VN_RELE(fromvp);
2201                 *status = NFSERR_STALE;
2202                 return;
2203         }
2204 
2205         if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2206                 VN_RELE(tovp);
2207                 VN_RELE(fromvp);
2208                 *status = NFSERR_NOTDIR;
2209                 return;
2210         }
2211 
2212         /*
2213          * Disallow NULL paths
2214          */
2215         if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2216             args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2217                 VN_RELE(tovp);
2218                 VN_RELE(fromvp);
2219                 *status = NFSERR_ACCES;
2220                 return;
2221         }
2222 
2223         if (rdonly(ro, tovp)) {
2224                 VN_RELE(tovp);
2225                 VN_RELE(fromvp);
2226                 *status = NFSERR_ROFS;
2227                 return;
2228         }
2229 
2230         /*
2231          * Check for a conflict with a non-blocking mandatory share reservation.
2232          */
2233         error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2234             NULL, cr, NULL, NULL, NULL);
2235         if (error != 0) {
2236                 VN_RELE(tovp);
2237                 VN_RELE(fromvp);
2238                 *status = puterrno(error);
2239                 return;
2240         }
2241 
2242         /* Check for delegations on the source file */
2243 
2244         if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2245                 VN_RELE(tovp);
2246                 VN_RELE(fromvp);
2247                 VN_RELE(srcvp);
2248                 curthread->t_flag |= T_WOULDBLOCK;
2249                 return;
2250         }
2251 
2252         /* Check for delegation on the file being renamed over, if it exists */
2253 
2254         if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2255             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2256             NULL, NULL, NULL) == 0) {
2257 
2258                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2259                         VN_RELE(tovp);
2260                         VN_RELE(fromvp);
2261                         VN_RELE(srcvp);
2262                         VN_RELE(targvp);
2263                         curthread->t_flag |= T_WOULDBLOCK;
2264                         return;
2265                 }
2266                 VN_RELE(targvp);
2267         }
2268 
2269 
2270         if (nbl_need_check(srcvp)) {
2271                 nbl_start_crit(srcvp, RW_READER);
2272                 in_crit = 1;
2273                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2274                         error = EACCES;
2275                         goto out;
2276                 }
2277         }
2278 
2279         error = VOP_RENAME(fromvp, args->rna_from.da_name,
2280             tovp, args->rna_to.da_name, cr, NULL, 0);
2281 
2282         if (error == 0)
2283                 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2284                     strlen(args->rna_to.da_name));
2285 
2286         /*
2287          * Force modified data and metadata out to stable storage.
2288          */
2289         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2290         (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2291 
2292 out:
2293         if (in_crit)
2294                 nbl_end_crit(srcvp);
2295         VN_RELE(srcvp);
2296         VN_RELE(tovp);
2297         VN_RELE(fromvp);
2298 
2299         *status = puterrno(error);
2300 
2301 }
2302 void *
2303 rfs_rename_getfh(struct nfsrnmargs *args)
2304 {
2305         return (args->rna_from.da_fhandle);
2306 }
2307 
2308 /*
2309  * Link to a file.
2310  * Create a file (to) which is a hard link to the given file (from).
2311  */
2312 /* ARGSUSED */
2313 void
2314 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2315     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2316 {
2317         int error;
2318         vnode_t *fromvp;
2319         vnode_t *tovp;
2320         struct exportinfo *to_exi;
2321         fhandle_t *fh;
2322 
2323         fromvp = nfs_fhtovp(args->la_from, exi);
2324         if (fromvp == NULL) {
2325                 *status = NFSERR_STALE;
2326                 return;
2327         }
2328 
2329         fh = args->la_to.da_fhandle;
2330         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2331         if (to_exi == NULL) {
2332                 VN_RELE(fromvp);
2333                 *status = NFSERR_ACCES;
2334                 return;
2335         }
2336         exi_rele(to_exi);
2337 
2338         if (to_exi != exi) {
2339                 VN_RELE(fromvp);
2340                 *status = NFSERR_XDEV;
2341                 return;
2342         }
2343 
2344         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2345         if (tovp == NULL) {
2346                 VN_RELE(fromvp);
2347                 *status = NFSERR_STALE;
2348                 return;
2349         }
2350 
2351         if (tovp->v_type != VDIR) {
2352                 VN_RELE(tovp);
2353                 VN_RELE(fromvp);
2354                 *status = NFSERR_NOTDIR;
2355                 return;
2356         }
2357         /*
2358          * Disallow NULL paths
2359          */
2360         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2361                 VN_RELE(tovp);
2362                 VN_RELE(fromvp);
2363                 *status = NFSERR_ACCES;
2364                 return;
2365         }
2366 
2367         if (rdonly(ro, tovp)) {
2368                 VN_RELE(tovp);
2369                 VN_RELE(fromvp);
2370                 *status = NFSERR_ROFS;
2371                 return;
2372         }
2373 
2374         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2375 
2376         /*
2377          * Force modified data and metadata out to stable storage.
2378          */
2379         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2380         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2381 
2382         VN_RELE(tovp);
2383         VN_RELE(fromvp);
2384 
2385         *status = puterrno(error);
2386 
2387 }
2388 void *
2389 rfs_link_getfh(struct nfslinkargs *args)
2390 {
2391         return (args->la_from);
2392 }
2393 
2394 /*
2395  * Symbolicly link to a file.
2396  * Create a file (to) with the given attributes which is a symbolic link
2397  * to the given path name (to).
2398  */
2399 void
2400 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2401     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2402 {
2403         int error;
2404         struct vattr va;
2405         vnode_t *vp;
2406         vnode_t *svp;
2407         int lerror;
2408         struct sockaddr *ca;
2409         char *name = NULL;
2410 
2411         /*
2412          * Disallow NULL paths
2413          */
2414         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2415                 *status = NFSERR_ACCES;
2416                 return;
2417         }
2418 
2419         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2420         if (vp == NULL) {
2421                 *status = NFSERR_STALE;
2422                 return;
2423         }
2424 
2425         if (rdonly(ro, vp)) {
2426                 VN_RELE(vp);
2427                 *status = NFSERR_ROFS;
2428                 return;
2429         }
2430 
2431         error = sattr_to_vattr(args->sla_sa, &va);
2432         if (error) {
2433                 VN_RELE(vp);
2434                 *status = puterrno(error);
2435                 return;
2436         }
2437 
2438         if (!(va.va_mask & AT_MODE)) {
2439                 VN_RELE(vp);
2440                 *status = NFSERR_INVAL;
2441                 return;
2442         }
2443 
2444         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2445         name = nfscmd_convname(ca, exi, args->sla_tnm,
2446             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2447 
2448         if (name == NULL) {
2449                 *status = NFSERR_ACCES;
2450                 return;
2451         }
2452 
2453         va.va_type = VLNK;
2454         va.va_mask |= AT_TYPE;
2455 
2456         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2457 
2458         /*
2459          * Force new data and metadata out to stable storage.
2460          */
2461         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2462             NULL, cr, NULL, NULL, NULL);
2463 
2464         if (!lerror) {
2465                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2466                 VN_RELE(svp);
2467         }
2468 
2469         /*
2470          * Force modified data and metadata out to stable storage.
2471          */
2472         (void) VOP_FSYNC(vp, 0, cr, NULL);
2473 
2474         VN_RELE(vp);
2475 
2476         *status = puterrno(error);
2477         if (name != args->sla_tnm)
2478                 kmem_free(name, MAXPATHLEN);
2479 
2480 }
2481 void *
2482 rfs_symlink_getfh(struct nfsslargs *args)
2483 {
2484         return (args->sla_from.da_fhandle);
2485 }
2486 
2487 /*
2488  * Make a directory.
2489  * Create a directory with the given name, parent directory, and attributes.
2490  * Returns a file handle and attributes for the new directory.
2491  */
2492 /* ARGSUSED */
2493 void
2494 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2495     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2496 {
2497         int error;
2498         struct vattr va;
2499         vnode_t *dvp = NULL;
2500         vnode_t *vp;
2501         char *name = args->ca_da.da_name;
2502 
2503         /*
2504          * Disallow NULL paths
2505          */
2506         if (name == NULL || *name == '\0') {
2507                 dr->dr_status = NFSERR_ACCES;
2508                 return;
2509         }
2510 
2511         vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2512         if (vp == NULL) {
2513                 dr->dr_status = NFSERR_STALE;
2514                 return;
2515         }
2516 
2517         if (rdonly(ro, vp)) {
2518                 VN_RELE(vp);
2519                 dr->dr_status = NFSERR_ROFS;
2520                 return;
2521         }
2522 
2523         error = sattr_to_vattr(args->ca_sa, &va);
2524         if (error) {
2525                 VN_RELE(vp);
2526                 dr->dr_status = puterrno(error);
2527                 return;
2528         }
2529 
2530         if (!(va.va_mask & AT_MODE)) {
2531                 VN_RELE(vp);
2532                 dr->dr_status = NFSERR_INVAL;
2533                 return;
2534         }
2535 
2536         va.va_type = VDIR;
2537         va.va_mask |= AT_TYPE;
2538 
2539         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2540 
2541         if (!error) {
2542                 /*
2543                  * Attribtutes of the newly created directory should
2544                  * be returned to the client.
2545                  */
2546                 va.va_mask = AT_ALL; /* We want everything */
2547                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2548 
2549                 /* check for overflows */
2550                 if (!error) {
2551                         acl_perm(vp, exi, &va, cr);
2552                         error = vattr_to_nattr(&va, &dr->dr_attr);
2553                         if (!error) {
2554                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2555                         }
2556                 }
2557                 /*
2558                  * Force new data and metadata out to stable storage.
2559                  */
2560                 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2561                 VN_RELE(dvp);
2562         }
2563 
2564         /*
2565          * Force modified data and metadata out to stable storage.
2566          */
2567         (void) VOP_FSYNC(vp, 0, cr, NULL);
2568 
2569         VN_RELE(vp);
2570 
2571         dr->dr_status = puterrno(error);
2572 
2573 }
2574 void *
2575 rfs_mkdir_getfh(struct nfscreatargs *args)
2576 {
2577         return (args->ca_da.da_fhandle);
2578 }
2579 
2580 /*
2581  * Remove a directory.
2582  * Remove the given directory name from the given parent directory.
2583  */
2584 /* ARGSUSED */
2585 void
2586 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2587     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2588 {
2589         int error;
2590         vnode_t *vp;
2591 
2592         /*
2593          * Disallow NULL paths
2594          */
2595         if (da->da_name == NULL || *da->da_name == '\0') {
2596                 *status = NFSERR_ACCES;
2597                 return;
2598         }
2599 
2600         vp = nfs_fhtovp(da->da_fhandle, exi);
2601         if (vp == NULL) {
2602                 *status = NFSERR_STALE;
2603                 return;
2604         }
2605 
2606         if (rdonly(ro, vp)) {
2607                 VN_RELE(vp);
2608                 *status = NFSERR_ROFS;
2609                 return;
2610         }
2611 
2612         /*
2613          * VOP_RMDIR takes a third argument (the current
2614          * directory of the process).  That's because someone
2615          * wants to return EINVAL if one tries to remove ".".
2616          * Of course, NFS servers have no idea what their
2617          * clients' current directories are.  We fake it by
2618          * supplying a vnode known to exist and illegal to
2619          * remove.
2620          */
2621         error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2622 
2623         /*
2624          * Force modified data and metadata out to stable storage.
2625          */
2626         (void) VOP_FSYNC(vp, 0, cr, NULL);
2627 
2628         VN_RELE(vp);
2629 
2630         /*
2631          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2632          * if the directory is not empty.  A System V NFS server
2633          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2634          * over the wire.
2635          */
2636         if (error == EEXIST)
2637                 *status = NFSERR_NOTEMPTY;
2638         else
2639                 *status = puterrno(error);
2640 
2641 }
/*
 * Return the file handle carried in the RMDIR arguments, i.e. the same
 * da_fhandle that rfs_rmdir() resolves to the parent directory vnode.
 */
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
        return (da->da_fhandle);
}
2647 
2648 /* ARGSUSED */
2649 void
2650 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2651     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2652 {
2653         int error;
2654         int iseof;
2655         struct iovec iov;
2656         struct uio uio;
2657         vnode_t *vp;
2658         char *ndata = NULL;
2659         struct sockaddr *ca;
2660         size_t nents;
2661         int ret;
2662 
2663         vp = nfs_fhtovp(&rda->rda_fh, exi);
2664         if (vp == NULL) {
2665                 rd->rd_entries = NULL;
2666                 rd->rd_status = NFSERR_STALE;
2667                 return;
2668         }
2669 
2670         if (vp->v_type != VDIR) {
2671                 VN_RELE(vp);
2672                 rd->rd_entries = NULL;
2673                 rd->rd_status = NFSERR_NOTDIR;
2674                 return;
2675         }
2676 
2677         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2678 
2679         error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2680 
2681         if (error) {
2682                 rd->rd_entries = NULL;
2683                 goto bad;
2684         }
2685 
2686         if (rda->rda_count == 0) {
2687                 rd->rd_entries = NULL;
2688                 rd->rd_size = 0;
2689                 rd->rd_eof = FALSE;
2690                 goto bad;
2691         }
2692 
2693         rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2694 
2695         /*
2696          * Allocate data for entries.  This will be freed by rfs_rddirfree.
2697          */
2698         rd->rd_bufsize = (uint_t)rda->rda_count;
2699         rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2700 
2701         /*
2702          * Set up io vector to read directory data
2703          */
2704         iov.iov_base = (caddr_t)rd->rd_entries;
2705         iov.iov_len = rda->rda_count;
2706         uio.uio_iov = &iov;
2707         uio.uio_iovcnt = 1;
2708         uio.uio_segflg = UIO_SYSSPACE;
2709         uio.uio_extflg = UIO_COPY_CACHED;
2710         uio.uio_loffset = (offset_t)rda->rda_offset;
2711         uio.uio_resid = rda->rda_count;
2712 
2713         /*
2714          * read directory
2715          */
2716         error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2717 
2718         /*
2719          * Clean up
2720          */
2721         if (!error) {
2722                 /*
2723                  * set size and eof
2724                  */
2725                 if (uio.uio_resid == rda->rda_count) {
2726                         rd->rd_size = 0;
2727                         rd->rd_eof = TRUE;
2728                 } else {
2729                         rd->rd_size = (uint32_t)(rda->rda_count -
2730                             uio.uio_resid);
2731                         rd->rd_eof = iseof ? TRUE : FALSE;
2732                 }
2733         }
2734 
2735         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2736         nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2737         ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2738             rda->rda_count, &ndata);
2739 
2740         if (ret != 0) {
2741                 size_t dropbytes;
2742                 /*
2743                  * We had to drop one or more entries in order to fit
2744                  * during the character conversion.  We need to patch
2745                  * up the size and eof info.
2746                  */
2747                 if (rd->rd_eof)
2748                         rd->rd_eof = FALSE;
2749                 dropbytes = nfscmd_dropped_entrysize(
2750                     (struct dirent64 *)rd->rd_entries, nents, ret);
2751                 rd->rd_size -= dropbytes;
2752         }
2753         if (ndata == NULL) {
2754                 ndata = (char *)rd->rd_entries;
2755         } else if (ndata != (char *)rd->rd_entries) {
2756                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2757                 rd->rd_entries = (void *)ndata;
2758                 rd->rd_bufsize = rda->rda_count;
2759         }
2760 
2761 bad:
2762         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2763 
2764 #if 0 /* notyet */
2765         /*
2766          * Don't do this.  It causes local disk writes when just
2767          * reading the file and the overhead is deemed larger
2768          * than the benefit.
2769          */
2770         /*
2771          * Force modified metadata out to stable storage.
2772          */
2773         (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2774 #endif
2775 
2776         VN_RELE(vp);
2777 
2778         rd->rd_status = puterrno(error);
2779 
2780 }
/*
 * Return the address of the file handle embedded in the READDIR
 * arguments (the same rda_fh that rfs_readdir() resolves to a vnode).
 */
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
        return (&rda->rda_fh);
}
2786 void
2787 rfs_rddirfree(struct nfsrddirres *rd)
2788 {
2789         if (rd->rd_entries != NULL)
2790                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2791 }
2792 
2793 /* ARGSUSED */
2794 void
2795 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2796     struct svc_req *req, cred_t *cr, bool_t ro)
2797 {
2798         int error;
2799         struct statvfs64 sb;
2800         vnode_t *vp;
2801 
2802         vp = nfs_fhtovp(fh, exi);
2803         if (vp == NULL) {
2804                 fs->fs_status = NFSERR_STALE;
2805                 return;
2806         }
2807 
2808         error = VFS_STATVFS(vp->v_vfsp, &sb);
2809 
2810         if (!error) {
2811                 fs->fs_tsize = nfstsize();
2812                 fs->fs_bsize = sb.f_frsize;
2813                 fs->fs_blocks = sb.f_blocks;
2814                 fs->fs_bfree = sb.f_bfree;
2815                 fs->fs_bavail = sb.f_bavail;
2816         }
2817 
2818         VN_RELE(vp);
2819 
2820         fs->fs_status = puterrno(error);
2821 
2822 }
/*
 * Return the file handle of a STATFS request.  The argument already is
 * the file handle, so it is handed back unchanged.
 */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
        return (fh);
}
2828 
2829 static int
2830 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2831 {
2832         vap->va_mask = 0;
2833 
2834         /*
2835          * There was a sign extension bug in some VFS based systems
2836          * which stored the mode as a short.  When it would get
2837          * assigned to a u_long, no sign extension would occur.
2838          * It needed to, but this wasn't noticed because sa_mode
2839          * would then get assigned back to the short, thus ignoring
2840          * the upper 16 bits of sa_mode.
2841          *
2842          * To make this implementation work for both broken
2843          * clients and good clients, we check for both versions
2844          * of the mode.
2845          */
2846         if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2847             sa->sa_mode != (uint32_t)-1) {
2848                 vap->va_mask |= AT_MODE;
2849                 vap->va_mode = sa->sa_mode;
2850         }
2851         if (sa->sa_uid != (uint32_t)-1) {
2852                 vap->va_mask |= AT_UID;
2853                 vap->va_uid = sa->sa_uid;
2854         }
2855         if (sa->sa_gid != (uint32_t)-1) {
2856                 vap->va_mask |= AT_GID;
2857                 vap->va_gid = sa->sa_gid;
2858         }
2859         if (sa->sa_size != (uint32_t)-1) {
2860                 vap->va_mask |= AT_SIZE;
2861                 vap->va_size = sa->sa_size;
2862         }
2863         if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2864             sa->sa_atime.tv_usec != (int32_t)-1) {
2865 #ifndef _LP64
2866                 /* return error if time overflow */
2867                 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2868                         return (EOVERFLOW);
2869 #endif
2870                 vap->va_mask |= AT_ATIME;
2871                 /*
2872                  * nfs protocol defines times as unsigned so don't extend sign,
2873                  * unless sysadmin set nfs_allow_preepoch_time.
2874                  */
2875                 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2876                 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2877         }
2878         if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2879             sa->sa_mtime.tv_usec != (int32_t)-1) {
2880 #ifndef _LP64
2881                 /* return error if time overflow */
2882                 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2883                         return (EOVERFLOW);
2884 #endif
2885                 vap->va_mask |= AT_MTIME;
2886                 /*
2887                  * nfs protocol defines times as unsigned so don't extend sign,
2888                  * unless sysadmin set nfs_allow_preepoch_time.
2889                  */
2890                 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2891                 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2892         }
2893         return (0);
2894 }
2895 
/*
 * Over-the-wire NFS file type for each vnode type.  vattr_to_nattr()
 * indexes this table with va_type.  Slots holding 0 carry no dedicated
 * NF* constant.
 */
static const enum nfsftype vt_to_nf[] = {
        0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2899 
2900 /*
2901  * check the following fields for overflow: nodeid, size, and time.
2902  * There could be a problem when converting 64-bit LP64 fields
2903  * into 32-bit ones.  Return an error if there is an overflow.
2904  */
2905 int
2906 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2907 {
2908         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2909         na->na_type = vt_to_nf[vap->va_type];
2910 
2911         if (vap->va_mode == (unsigned short) -1)
2912                 na->na_mode = (uint32_t)-1;
2913         else
2914                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2915 
2916         if (vap->va_uid == (unsigned short)(-1))
2917                 na->na_uid = (uint32_t)(-1);
2918         else if (vap->va_uid == UID_NOBODY)
2919                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2920         else
2921                 na->na_uid = vap->va_uid;
2922 
2923         if (vap->va_gid == (unsigned short)(-1))
2924                 na->na_gid = (uint32_t)-1;
2925         else if (vap->va_gid == GID_NOBODY)
2926                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2927         else
2928                 na->na_gid = vap->va_gid;
2929 
2930         /*
2931          * Do we need to check fsid for overflow?  It is 64-bit in the
2932          * vattr, but are bigger than 32 bit values supported?
2933          */
2934         na->na_fsid = vap->va_fsid;
2935 
2936         na->na_nodeid = vap->va_nodeid;
2937 
2938         /*
2939          * Check to make sure that the nodeid is representable over the
2940          * wire without losing bits.
2941          */
2942         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2943                 return (EFBIG);
2944         na->na_nlink = vap->va_nlink;
2945 
2946         /*
2947          * Check for big files here, instead of at the caller.  See
2948          * comments in cstat for large special file explanation.
2949          */
2950         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2951                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2952                         return (EFBIG);
2953                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2954                         /* UNKNOWN_SIZE | OVERFLOW */
2955                         na->na_size = MAXOFF32_T;
2956                 } else
2957                         na->na_size = vap->va_size;
2958         } else
2959                 na->na_size = vap->va_size;
2960 
2961         /*
2962          * If the vnode times overflow the 32-bit times that NFS2
2963          * uses on the wire then return an error.
2964          */
2965         if (!NFS_VAP_TIME_OK(vap)) {
2966                 return (EOVERFLOW);
2967         }
2968         na->na_atime.tv_sec = vap->va_atime.tv_sec;
2969         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2970 
2971         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2972         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2973 
2974         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2975         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2976 
2977         /*
2978          * If the dev_t will fit into 16 bits then compress
2979          * it, otherwise leave it alone. See comments in
2980          * nfs_client.c.
2981          */
2982         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2983             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2984                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2985         else
2986                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2987 
2988         na->na_blocks = vap->va_nblocks;
2989         na->na_blocksize = vap->va_blksize;
2990 
2991         /*
2992          * This bit of ugliness is a *TEMPORARY* hack to preserve the
2993          * over-the-wire protocols for named-pipe vnodes.  It remaps the
2994          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2995          *
2996          * BUYER BEWARE:
2997          *  If you are porting the NFS to a non-Sun server, you probably
2998          *  don't want to include the following block of code.  The
2999          *  over-the-wire special file types will be changing with the
3000          *  NFS Protocol Revision.
3001          */
3002         if (vap->va_type == VFIFO)
3003                 NA_SETFIFO(na);
3004         return (0);
3005 }
3006 
3007 /*
3008  * acl v2 support: returns approximate permission.
3009  *      default: returns minimal permission (more restrictive)
3010  *      aclok: returns maximal permission (less restrictive)
3011  *      This routine changes the permissions that are alaredy in *va.
3012  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3013  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
3014  */
3015 static void
3016 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3017 {
3018         vsecattr_t      vsa;
3019         int             aclcnt;
3020         aclent_t        *aclentp;
3021         mode_t          mask_perm;
3022         mode_t          grp_perm;
3023         mode_t          other_perm;
3024         mode_t          other_orig;
3025         int             error;
3026 
3027         /* dont care default acl */
3028         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3029         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3030 
3031         if (!error) {
3032                 aclcnt = vsa.vsa_aclcnt;
3033                 if (aclcnt > MIN_ACL_ENTRIES) {
3034                         /* non-trivial ACL */
3035                         aclentp = vsa.vsa_aclentp;
3036                         if (exi->exi_export.ex_flags & EX_ACLOK) {
3037                                 /* maximal permissions */
3038                                 grp_perm = 0;
3039                                 other_perm = 0;
3040                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3041                                         switch (aclentp->a_type) {
3042                                         case USER_OBJ:
3043                                                 break;
3044                                         case USER:
3045                                                 grp_perm |=
3046                                                     aclentp->a_perm << 3;
3047                                                 other_perm |= aclentp->a_perm;
3048                                                 break;
3049                                         case GROUP_OBJ:
3050                                                 grp_perm |=
3051                                                     aclentp->a_perm << 3;
3052                                                 break;
3053                                         case GROUP:
3054                                                 other_perm |= aclentp->a_perm;
3055                                                 break;
3056                                         case OTHER_OBJ:
3057                                                 other_orig = aclentp->a_perm;
3058                                                 break;
3059                                         case CLASS_OBJ:
3060                                                 mask_perm = aclentp->a_perm;
3061                                                 break;
3062                                         default:
3063                                                 break;
3064                                         }
3065                                 }
3066                                 grp_perm &= mask_perm << 3;
3067                                 other_perm &= mask_perm;
3068                                 other_perm |= other_orig;
3069 
3070                         } else {
3071                                 /* minimal permissions */
3072                                 grp_perm = 070;
3073                                 other_perm = 07;
3074                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3075                                         switch (aclentp->a_type) {
3076                                         case USER_OBJ:
3077                                                 break;
3078                                         case USER:
3079                                         case CLASS_OBJ:
3080                                                 grp_perm &=
3081                                                     aclentp->a_perm << 3;
3082                                                 other_perm &=
3083                                                     aclentp->a_perm;
3084                                                 break;
3085                                         case GROUP_OBJ:
3086                                                 grp_perm &=
3087                                                     aclentp->a_perm << 3;
3088                                                 break;
3089                                         case GROUP:
3090                                                 other_perm &=
3091                                                     aclentp->a_perm;
3092                                                 break;
3093                                         case OTHER_OBJ:
3094                                                 other_perm &=
3095                                                     aclentp->a_perm;
3096                                                 break;
3097                                         default:
3098                                                 break;
3099                                         }
3100                                 }
3101                         }
3102                         /* copy to va */
3103                         va->va_mode &= ~077;
3104                         va->va_mode |= grp_perm | other_perm;
3105                 }
3106                 if (vsa.vsa_aclcnt)
3107                         kmem_free(vsa.vsa_aclentp,
3108                             vsa.vsa_aclcnt * sizeof (aclent_t));
3109         }
3110 }
3111 
/*
 * One-time setup for this file: obtain a caller id from
 * fs_new_caller_id() and keep it in nfs2_srv_caller_id.
 */
void
rfs_srvrinit(void)
{
        nfs2_srv_caller_id = fs_new_caller_id();
}
3117 
/*
 * Counterpart of rfs_srvrinit().  There is currently nothing to undo,
 * so the body is empty.
 */
void
rfs_srvrfini(void)
{
}
3122 
3123 /* ARGSUSED */
3124 void
3125 rfs_srv_zone_init(nfs_globals_t *ng)
3126 {
3127         nfs_srv_t *ns;
3128 
3129         ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3130 
3131         mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3132         ns->write_async = 1;
3133 
3134         ng->nfs_srv = ns;
3135 }
3136 
3137 /* ARGSUSED */
3138 void
3139 rfs_srv_zone_fini(nfs_globals_t *ng)
3140 {
3141         nfs_srv_t *ns = ng->nfs_srv;
3142 
3143         ng->nfs_srv = NULL;
3144 
3145         mutex_destroy(&ns->async_write_lock);
3146         kmem_free(ns, sizeof (*ns));
3147 }
3148 
3149 static int
3150 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3151 {
3152         struct clist    *wcl;
3153         int             wlist_len;
3154         uint32_t        count = rr->rr_count;
3155 
3156         wcl = ra->ra_wlist;
3157 
3158         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3159                 return (FALSE);
3160         }
3161 
3162         wcl = ra->ra_wlist;
3163         rr->rr_ok.rrok_wlist_len = wlist_len;
3164         rr->rr_ok.rrok_wlist = wcl;
3165 
3166         return (TRUE);
3167 }