1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*
  29  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  30  *      All rights reserved.
  31  */
  32 
  33 /*
  34  * Copyright 2018 Nexenta Systems, Inc.
  35  * Copyright (c) 2016 by Delphix. All rights reserved.
  36  */
  37 
  38 #include <sys/param.h>
  39 #include <sys/types.h>
  40 #include <sys/systm.h>
  41 #include <sys/cred.h>
  42 #include <sys/buf.h>
  43 #include <sys/vfs.h>
  44 #include <sys/vnode.h>
  45 #include <sys/uio.h>
  46 #include <sys/stat.h>
  47 #include <sys/errno.h>
  48 #include <sys/sysmacros.h>
  49 #include <sys/statvfs.h>
  50 #include <sys/kmem.h>
  51 #include <sys/kstat.h>
  52 #include <sys/dirent.h>
  53 #include <sys/cmn_err.h>
  54 #include <sys/debug.h>
  55 #include <sys/vtrace.h>
  56 #include <sys/mode.h>
  57 #include <sys/acl.h>
  58 #include <sys/nbmlock.h>
  59 #include <sys/policy.h>
  60 #include <sys/sdt.h>
  61 
  62 #include <rpc/types.h>
  63 #include <rpc/auth.h>
  64 #include <rpc/svc.h>
  65 
  66 #include <nfs/nfs.h>
  67 #include <nfs/export.h>
  68 #include <nfs/nfs_cmd.h>
  69 
  70 #include <vm/hat.h>
  71 #include <vm/as.h>
  72 #include <vm/seg.h>
  73 #include <vm/seg_map.h>
  74 #include <vm/seg_kmem.h>
  75 
  76 #include <sys/strsubr.h>
  77 
  78 struct rfs_async_write_list;
  79 
  80 /*
  81  * Zone globals of NFSv2 server
  82  */
typedef struct nfs_srv {
	/* Presumably guards async_write_head below — TODO confirm. */
	kmutex_t			async_write_lock;
	/* Head of the list of pending async writes (type opaque here). */
	struct rfs_async_write_list	*async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int		write_async;
} nfs_srv_t;
  92 
  93 /*
  94  * These are the interface routines for the server side of the
  95  * Network File System.  See the NFS version 2 protocol specification
  96  * for a description of this interface.
  97  */
  98 
  99 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
 100 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
 101                         cred_t *);
 102 
 103 
 104 /*
 105  * Some "over the wire" UNIX file types.  These are encoded
 106  * into the mode.  This needs to be fixed in the next rev.
 107  */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Caller id placed in caller_context_t (cc_caller_id) for VOP calls
 * issued by the NFSv2 server (see the ct initializations in
 * rfs_setattr() and rfs_read()).
 */
u_longlong_t nfs2_srv_caller_id;
 114 
 115 static nfs_srv_t *
 116 nfs_get_srv(void)
 117 {
 118         nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
 119         nfs_srv_t *srv = ng->nfs_srv;
 120         ASSERT(srv != NULL);
 121         return (srv);
 122 }
 123 
 124 /*
 125  * Get file attributes.
 126  * Returns the current attributes of the file with the given fhandle.
 127  */
 128 /* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	/* Translate the file handle into a held vnode. */
	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		/* Lie about the object type for a referral */
		if (vn_is_nfs_reparse(vp, cr))
			va.va_type = VLNK;

		/* acl_perm() may adjust va before it is encoded OTW. */
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
/*
 * Dispatch helper: the getattr argument is itself the file handle.
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
 169 
 170 /*
 171  * Set file attributes.
 172  * Sets the attributes of the file with the given fhandle.  Returns
 173  * the new attributes.
 174  */
 175 /* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* flags passed to VOP_SETATTR */
	int in_crit = 0;	/* nonzero while inside nbl critical region */
	vnode_t *vp;
	struct vattr va;	/* requested attribute changes */
	struct vattr bva;	/* attributes before the change */
	struct flock64 bf;	/* F_FREESP descriptor for size changes */
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* No attribute changes on a read-only export. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Decode the over-the-wire sattr into a vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/* Identify ourselves to the VOPs; don't block on delegations. */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		/* Need the owner and current size for the checks below. */
		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/* The affected region is [min, max) of old/new size. */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Owner gets the VOP_SPACE shortcut; AT_SIZE is removed
		 * from va so VOP_SETATTR below won't redo the truncation.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
/*
 * Dispatch helper: extract the file handle from the setattr arguments.
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
 362 
 363 /* Change and release @exip and @vpp only in success */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/*
	 * Hold a private reference; traverse() may replace vp with the
	 * root vnode of the mounted filesystem.
	 */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	/* Build the fid of the submount root to look up its export. */
	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
 408 
 409 /*
 410  * Given mounted "dvp" and "exi", go upper mountpoint
 411  * with dvp/exi correction
 412  * Return 0 in success
 413  */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;
	vnode_t *zone_rootvp;

	zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
	/* Caller must pass a filesystem root (or the zone's root vnode). */
	ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));

	VN_HOLD(dvp);
	/* Step down to the covered vnode in the underlying filesystem. */
	dvp = untraverse(dvp, zone_rootvp);
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	/* The crossing must stay within the zone of the original export. */
	ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
 440 /*
 441  * Directory lookup.
 442  * Returns an fhandle and file attributes for file name in a directory.
 443  */
 444 /* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;		/* directory being searched */
	vnode_t *vp;		/* result of the lookup */
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/* Take our own reference on exi; dropped at "out" below. */
	exi_hold(exi);
	ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);

	/*
	 * Not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
			/*
			 * special case for ".." and 'nohide'exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else  {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	/*
	 * nfscmd_convname() may hand back a translated copy of the name
	 * (charset conversion); such a copy is freed below.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		/* rfs_publicfh_mclookup() returns the target export in exi. */
		exi_rele(exi);
		exi = NULL;

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	/* Follow a mountpoint into a 'nohide' submount, if any. */
	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	if (exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
/*
 * Dispatch helper: extract the directory file handle from the lookup args.
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
 606 
 607 /*
 608  * Read symbolic link.
 609  * Returns the string in the symbolic link at the given fhandle.
 610  */
 611 /* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	/* Need the mode to check for mandatory locking. */
	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		/* Bump the server-wide referral-link statistic. */
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link exactly fills the buffer
		 * (uio_resid == 0), this stores the NUL one byte past the
		 * NFS_MAXPATHLEN allocation -- confirm link lengths are
		 * bounded below NFS_MAXPATHLEN.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/*
	 * Outbound charset conversion; a translated copy (if any)
	 * replaces rl_data and the original buffer is freed.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
/*
 * Dispatch helper: the readlink argument is itself the file handle.
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
 742 /*
 743  * Free data allocated by rfs_readlink
 744  */
 745 void
 746 rfs_rlfree(struct nfsrdlnres *rl)
 747 {
 748         if (rl->rl_data != NULL)
 749                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 750 }
 751 
 752 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 753 
 754 /*
 755  * Read data.
 756  * Returns some data read from the file at the given fhandle.
 757  */
 758 /* ARGSUSED */
 759 void
 760 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 761     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 762 {
 763         vnode_t *vp;
 764         int error;
 765         struct vattr va;
 766         struct iovec iov;
 767         struct uio uio;
 768         mblk_t *mp;
 769         int alloc_err = 0;
 770         int in_crit = 0;
 771         caller_context_t ct;
 772 
 773         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 774         if (vp == NULL) {
 775                 rr->rr_data = NULL;
 776                 rr->rr_status = NFSERR_STALE;
 777                 return;
 778         }
 779 
 780         if (vp->v_type != VREG) {
 781                 VN_RELE(vp);
 782                 rr->rr_data = NULL;
 783                 rr->rr_status = NFSERR_ISDIR;
 784                 return;
 785         }
 786 
 787         ct.cc_sysid = 0;
 788         ct.cc_pid = 0;
 789         ct.cc_caller_id = nfs2_srv_caller_id;
 790         ct.cc_flags = CC_DONTBLOCK;
 791 
 792         /*
 793          * Enter the critical region before calling VOP_RWLOCK
 794          * to avoid a deadlock with write requests.
 795          */
 796         if (nbl_need_check(vp)) {
 797                 nbl_start_crit(vp, RW_READER);
 798                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 799                     0, NULL)) {
 800                         nbl_end_crit(vp);
 801                         VN_RELE(vp);
 802                         rr->rr_data = NULL;
 803                         rr->rr_status = NFSERR_ACCES;
 804                         return;
 805                 }
 806                 in_crit = 1;
 807         }
 808 
 809         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 810 
 811         /* check if a monitor detected a delegation conflict */
 812         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 813                 if (in_crit)
 814                         nbl_end_crit(vp);
 815                 VN_RELE(vp);
 816                 /* mark as wouldblock so response is dropped */
 817                 curthread->t_flag |= T_WOULDBLOCK;
 818 
 819                 rr->rr_data = NULL;
 820                 return;
 821         }
 822 
 823         va.va_mask = AT_ALL;
 824 
 825         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 826 
 827         if (error) {
 828                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 829                 if (in_crit)
 830                         nbl_end_crit(vp);
 831 
 832                 VN_RELE(vp);
 833                 rr->rr_data = NULL;
 834                 rr->rr_status = puterrno(error);
 835 
 836                 return;
 837         }
 838 
 839         /*
 840          * This is a kludge to allow reading of files created
 841          * with no read permission.  The owner of the file
 842          * is always allowed to read it.
 843          */
 844         if (crgetuid(cr) != va.va_uid) {
 845                 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 846 
 847                 if (error) {
 848                         /*
 849                          * Exec is the same as read over the net because
 850                          * of demand loading.
 851                          */
 852                         error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 853                 }
 854                 if (error) {
 855                         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 856                         if (in_crit)
 857                                 nbl_end_crit(vp);
 858                         VN_RELE(vp);
 859                         rr->rr_data = NULL;
 860                         rr->rr_status = puterrno(error);
 861 
 862                         return;
 863                 }
 864         }
 865 
 866         if (MANDLOCK(vp, va.va_mode)) {
 867                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 868                 if (in_crit)
 869                         nbl_end_crit(vp);
 870 
 871                 VN_RELE(vp);
 872                 rr->rr_data = NULL;
 873                 rr->rr_status = NFSERR_ACCES;
 874 
 875                 return;
 876         }
 877 
 878         rr->rr_ok.rrok_wlist_len = 0;
 879         rr->rr_ok.rrok_wlist = NULL;
 880 
 881         if ((u_offset_t)ra->ra_offset >= va.va_size) {
 882                 rr->rr_count = 0;
 883                 rr->rr_data = NULL;
 884                 /*
 885                  * In this case, status is NFS_OK, but there is no data
 886                  * to encode. So set rr_mp to NULL.
 887                  */
 888                 rr->rr_mp = NULL;
 889                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 890                 if (rr->rr_ok.rrok_wlist)
 891                         clist_zero_len(rr->rr_ok.rrok_wlist);
 892                 goto done;
 893         }
 894 
 895         if (ra->ra_wlist) {
 896                 mp = NULL;
 897                 rr->rr_mp = NULL;
 898                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 899                 if (ra->ra_count > iov.iov_len) {
 900                         rr->rr_data = NULL;
 901                         rr->rr_status = NFSERR_INVAL;
 902                         goto done;
 903                 }
 904         } else {
 905                 /*
 906                  * mp will contain the data to be sent out in the read reply.
 907                  * This will be freed after the reply has been sent out (by the
 908                  * driver).
 909                  * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
 910                  * that the call to xdrmblk_putmblk() never fails.
 911                  */
 912                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 913                     &alloc_err);
 914                 ASSERT(mp != NULL);
 915                 ASSERT(alloc_err == 0);
 916 
 917                 rr->rr_mp = mp;
 918 
 919                 /*
 920                  * Set up io vector
 921                  */
 922                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 923                 iov.iov_len = ra->ra_count;
 924         }
 925 
 926         uio.uio_iov = &iov;
 927         uio.uio_iovcnt = 1;
 928         uio.uio_segflg = UIO_SYSSPACE;
 929         uio.uio_extflg = UIO_COPY_CACHED;
 930         uio.uio_loffset = (offset_t)ra->ra_offset;
 931         uio.uio_resid = ra->ra_count;
 932 
 933         error = VOP_READ(vp, &uio, 0, cr, &ct);
 934 
 935         if (error) {
 936                 if (mp)
 937                         freeb(mp);
 938 
 939                 /*
 940                  * check if a monitor detected a delegation conflict and
 941                  * mark as wouldblock so response is dropped
 942                  */
 943                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 944                         curthread->t_flag |= T_WOULDBLOCK;
 945                 else
 946                         rr->rr_status = puterrno(error);
 947 
 948                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 949                 if (in_crit)
 950                         nbl_end_crit(vp);
 951 
 952                 VN_RELE(vp);
 953                 rr->rr_data = NULL;
 954 
 955                 return;
 956         }
 957 
 958         /*
 959          * Get attributes again so we can send the latest access
 960          * time to the client side for its cache.
 961          */
 962         va.va_mask = AT_ALL;
 963 
 964         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 965 
 966         if (error) {
 967                 if (mp)
 968                         freeb(mp);
 969 
 970                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 971                 if (in_crit)
 972                         nbl_end_crit(vp);
 973 
 974                 VN_RELE(vp);
 975                 rr->rr_data = NULL;
 976                 rr->rr_status = puterrno(error);
 977 
 978                 return;
 979         }
 980 
 981         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 982 
 983         if (mp) {
 984                 rr->rr_data = (char *)mp->b_datap->db_base;
 985         } else {
 986                 if (ra->ra_wlist) {
 987                         rr->rr_data = (caddr_t)iov.iov_base;
 988                         if (!rdma_setup_read_data2(ra, rr)) {
 989                                 rr->rr_data = NULL;
 990                                 rr->rr_status = puterrno(NFSERR_INVAL);
 991                         }
 992                 }
 993         }
 994 done:
 995         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 996         if (in_crit)
 997                 nbl_end_crit(vp);
 998 
 999         acl_perm(vp, exi, &va, cr);
1000 
1001         /* check for overflows */
1002         error = vattr_to_nattr(&va, &rr->rr_attr);
1003 
1004         VN_RELE(vp);
1005 
1006         rr->rr_status = puterrno(error);
1007 }
1008 
1009 /*
1010  * Free data allocated by rfs_read
1011  */
1012 void
1013 rfs_rdfree(struct nfsrdresult *rr)
1014 {
1015         mblk_t *mp;
1016 
1017         if (rr->rr_status == NFS_OK) {
1018                 mp = rr->rr_mp;
1019                 if (mp != NULL)
1020                         freeb(mp);
1021         }
1022 }
1023 
1024 void *
1025 rfs_read_getfh(struct nfsreadargs *ra)
1026 {
1027         return (&ra->ra_fhandle);
1028 }
1029 
/* Number of iovec slots kept on the stack by rfs_write_sync() */
#define	MAX_IOVECS	12

#ifdef DEBUG
/* Counters: how often the on-stack iovec array sufficed (or did not) */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1036 
1037 /*
1038  * Write data to file.
1039  * Returns attributes of a file after writing some data to it.
1040  *
1041  * Any changes made here, especially in error handling might have
1042  * to also be done in rfs_write (which clusters write requests).
1043  */
1044 /* ARGSUSED */
1045 void
1046 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1047     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1048 {
1049         int error;
1050         vnode_t *vp;
1051         rlim64_t rlimit;
1052         struct vattr va;
1053         struct uio uio;
1054         struct iovec iov[MAX_IOVECS];
1055         mblk_t *m;
1056         struct iovec *iovp;
1057         int iovcnt;
1058         cred_t *savecred;
1059         int in_crit = 0;
1060         caller_context_t ct;
1061 
1062         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1063         if (vp == NULL) {
1064                 ns->ns_status = NFSERR_STALE;
1065                 return;
1066         }
1067 
1068         if (rdonly(ro, vp)) {
1069                 VN_RELE(vp);
1070                 ns->ns_status = NFSERR_ROFS;
1071                 return;
1072         }
1073 
1074         if (vp->v_type != VREG) {
1075                 VN_RELE(vp);
1076                 ns->ns_status = NFSERR_ISDIR;
1077                 return;
1078         }
1079 
1080         ct.cc_sysid = 0;
1081         ct.cc_pid = 0;
1082         ct.cc_caller_id = nfs2_srv_caller_id;
1083         ct.cc_flags = CC_DONTBLOCK;
1084 
1085         va.va_mask = AT_UID|AT_MODE;
1086 
1087         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1088 
1089         if (error) {
1090                 VN_RELE(vp);
1091                 ns->ns_status = puterrno(error);
1092 
1093                 return;
1094         }
1095 
1096         if (crgetuid(cr) != va.va_uid) {
1097                 /*
1098                  * This is a kludge to allow writes of files created
1099                  * with read only permission.  The owner of the file
1100                  * is always allowed to write it.
1101                  */
1102                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1103 
1104                 if (error) {
1105                         VN_RELE(vp);
1106                         ns->ns_status = puterrno(error);
1107                         return;
1108                 }
1109         }
1110 
1111         /*
1112          * Can't access a mandatory lock file.  This might cause
1113          * the NFS service thread to block forever waiting for a
1114          * lock to be released that will never be released.
1115          */
1116         if (MANDLOCK(vp, va.va_mode)) {
1117                 VN_RELE(vp);
1118                 ns->ns_status = NFSERR_ACCES;
1119                 return;
1120         }
1121 
1122         /*
1123          * We have to enter the critical region before calling VOP_RWLOCK
1124          * to avoid a deadlock with ufs.
1125          */
1126         if (nbl_need_check(vp)) {
1127                 nbl_start_crit(vp, RW_READER);
1128                 in_crit = 1;
1129                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1130                     wa->wa_count, 0, NULL)) {
1131                         error = EACCES;
1132                         goto out;
1133                 }
1134         }
1135 
1136         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1137 
1138         /* check if a monitor detected a delegation conflict */
1139         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1140                 goto out;
1141         }
1142 
1143         if (wa->wa_data || wa->wa_rlist) {
1144                 /* Do the RDMA thing if necessary */
1145                 if (wa->wa_rlist) {
1146                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1147                         iov[0].iov_len = wa->wa_count;
1148                 } else  {
1149                         iov[0].iov_base = wa->wa_data;
1150                         iov[0].iov_len = wa->wa_count;
1151                 }
1152                 uio.uio_iov = iov;
1153                 uio.uio_iovcnt = 1;
1154                 uio.uio_segflg = UIO_SYSSPACE;
1155                 uio.uio_extflg = UIO_COPY_DEFAULT;
1156                 uio.uio_loffset = (offset_t)wa->wa_offset;
1157                 uio.uio_resid = wa->wa_count;
1158                 /*
1159                  * The limit is checked on the client. We
1160                  * should allow any size writes here.
1161                  */
1162                 uio.uio_llimit = curproc->p_fsz_ctl;
1163                 rlimit = uio.uio_llimit - wa->wa_offset;
1164                 if (rlimit < (rlim64_t)uio.uio_resid)
1165                         uio.uio_resid = (uint_t)rlimit;
1166 
1167                 /*
1168                  * for now we assume no append mode
1169                  */
1170                 /*
1171                  * We're changing creds because VM may fault and we need
1172                  * the cred of the current thread to be used if quota
1173                  * checking is enabled.
1174                  */
1175                 savecred = curthread->t_cred;
1176                 curthread->t_cred = cr;
1177                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1178                 curthread->t_cred = savecred;
1179         } else {
1180 
1181                 iovcnt = 0;
1182                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1183                         iovcnt++;
1184                 if (iovcnt <= MAX_IOVECS) {
1185 #ifdef DEBUG
1186                         rfs_write_sync_hits++;
1187 #endif
1188                         iovp = iov;
1189                 } else {
1190 #ifdef DEBUG
1191                         rfs_write_sync_misses++;
1192 #endif
1193                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1194                 }
1195                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1196                 uio.uio_iov = iovp;
1197                 uio.uio_iovcnt = iovcnt;
1198                 uio.uio_segflg = UIO_SYSSPACE;
1199                 uio.uio_extflg = UIO_COPY_DEFAULT;
1200                 uio.uio_loffset = (offset_t)wa->wa_offset;
1201                 uio.uio_resid = wa->wa_count;
1202                 /*
1203                  * The limit is checked on the client. We
1204                  * should allow any size writes here.
1205                  */
1206                 uio.uio_llimit = curproc->p_fsz_ctl;
1207                 rlimit = uio.uio_llimit - wa->wa_offset;
1208                 if (rlimit < (rlim64_t)uio.uio_resid)
1209                         uio.uio_resid = (uint_t)rlimit;
1210 
1211                 /*
1212                  * For now we assume no append mode.
1213                  */
1214                 /*
1215                  * We're changing creds because VM may fault and we need
1216                  * the cred of the current thread to be used if quota
1217                  * checking is enabled.
1218                  */
1219                 savecred = curthread->t_cred;
1220                 curthread->t_cred = cr;
1221                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1222                 curthread->t_cred = savecred;
1223 
1224                 if (iovp != iov)
1225                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1226         }
1227 
1228         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1229 
1230         if (!error) {
1231                 /*
1232                  * Get attributes again so we send the latest mod
1233                  * time to the client side for its cache.
1234                  */
1235                 va.va_mask = AT_ALL;    /* now we want everything */
1236 
1237                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1238 
1239                 /* check for overflows */
1240                 if (!error) {
1241                         acl_perm(vp, exi, &va, cr);
1242                         error = vattr_to_nattr(&va, &ns->ns_attr);
1243                 }
1244         }
1245 
1246 out:
1247         if (in_crit)
1248                 nbl_end_crit(vp);
1249         VN_RELE(vp);
1250 
1251         /* check if a monitor detected a delegation conflict */
1252         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1253                 /* mark as wouldblock so response is dropped */
1254                 curthread->t_flag |= T_WOULDBLOCK;
1255         else
1256                 ns->ns_status = puterrno(error);
1257 
1258 }
1259 
/*
 * One queued NFSv2 write request, as linked into a per-file cluster by
 * rfs_write().  The issuing service thread sleeps on the cluster's
 * condition variable until ns->ns_status has been filled in.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* decoded write arguments */
	struct nfsattrstat *ns;		/* where reply status/attrs go */
	struct svc_req *req;		/* originating RPC request */
	cred_t *cr;			/* credentials of this request */
	bool_t ro;			/* export is read-only for caller */
	kthread_t *thread;		/* service thread waiting on this */
	struct rfs_async_write *list;	/* next request in the cluster */
};
1269 
/*
 * A cluster of pending write requests against a single file handle.
 * Clusters are kept on a singly-linked list while open for new
 * arrivals; rfs_write() removes a cluster before processing it.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by cluster */
	kcondvar_t cv;			/* waiters signaled when processed */
	struct rfs_async_write *list;	/* requests, sorted by offset */
	struct rfs_async_write_list *next;	/* next active cluster */
};
1276 
/*
 * NOTE(review): rfs_write() below uses the per-zone copies of this
 * clustering state in nfs_srv_t (async_write_head, async_write_lock,
 * write_async); these file-scope statics appear unused within this
 * section -- confirm against the rest of the file before removing.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Number of on-stack iovecs available to a clustered write */
#define	MAXCLIOVECS	42
/* "Not yet processed" sentinel for ns_status (0 would read as NFS_OK) */
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* Counters: clustered writes that fit (or not) in the on-stack iovecs */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1288 
1289 /*
1290  * Write data to file.
1291  * Returns attributes of a file after writing some data to it.
1292  */
1293 void
1294 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1295     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1296 {
1297         int error;
1298         vnode_t *vp;
1299         rlim64_t rlimit;
1300         struct vattr va;
1301         struct uio uio;
1302         struct rfs_async_write_list *lp;
1303         struct rfs_async_write_list *nlp;
1304         struct rfs_async_write *rp;
1305         struct rfs_async_write *nrp;
1306         struct rfs_async_write *trp;
1307         struct rfs_async_write *lrp;
1308         int data_written;
1309         int iovcnt;
1310         mblk_t *m;
1311         struct iovec *iovp;
1312         struct iovec *niovp;
1313         struct iovec iov[MAXCLIOVECS];
1314         int count;
1315         int rcount;
1316         uint_t off;
1317         uint_t len;
1318         struct rfs_async_write nrpsp;
1319         struct rfs_async_write_list nlpsp;
1320         ushort_t t_flag;
1321         cred_t *savecred;
1322         int in_crit = 0;
1323         caller_context_t ct;
1324         nfs_srv_t *nsrv;
1325 
1326         ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
1327         nsrv = nfs_get_srv();
1328         if (!nsrv->write_async) {
1329                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1330                 return;
1331         }
1332 
1333         /*
1334          * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1335          * is considered an OK.
1336          */
1337         ns->ns_status = RFSWRITE_INITVAL;
1338 
1339         nrp = &nrpsp;
1340         nrp->wa = wa;
1341         nrp->ns = ns;
1342         nrp->req = req;
1343         nrp->cr = cr;
1344         nrp->ro = ro;
1345         nrp->thread = curthread;
1346 
1347         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1348 
1349         /*
1350          * Look to see if there is already a cluster started
1351          * for this file.
1352          */
1353         mutex_enter(&nsrv->async_write_lock);
1354         for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1355                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1356                     sizeof (fhandle_t)) == 0)
1357                         break;
1358         }
1359 
1360         /*
1361          * If lp is non-NULL, then there is already a cluster
1362          * started.  We need to place ourselves in the cluster
1363          * list in the right place as determined by starting
1364          * offset.  Conflicts with non-blocking mandatory locked
1365          * regions will be checked when the cluster is processed.
1366          */
1367         if (lp != NULL) {
1368                 rp = lp->list;
1369                 trp = NULL;
1370                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1371                         trp = rp;
1372                         rp = rp->list;
1373                 }
1374                 nrp->list = rp;
1375                 if (trp == NULL)
1376                         lp->list = nrp;
1377                 else
1378                         trp->list = nrp;
1379                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1380                         cv_wait(&lp->cv, &nsrv->async_write_lock);
1381                 mutex_exit(&nsrv->async_write_lock);
1382 
1383                 return;
1384         }
1385 
1386         /*
1387          * No cluster started yet, start one and add ourselves
1388          * to the list of clusters.
1389          */
1390         nrp->list = NULL;
1391 
1392         nlp = &nlpsp;
1393         nlp->fhp = &wa->wa_fhandle;
1394         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1395         nlp->list = nrp;
1396         nlp->next = NULL;
1397 
1398         if (nsrv->async_write_head == NULL) {
1399                 nsrv->async_write_head = nlp;
1400         } else {
1401                 lp = nsrv->async_write_head;
1402                 while (lp->next != NULL)
1403                         lp = lp->next;
1404                 lp->next = nlp;
1405         }
1406         mutex_exit(&nsrv->async_write_lock);
1407 
1408         /*
1409          * Convert the file handle common to all of the requests
1410          * in this cluster to a vnode.
1411          */
1412         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1413         if (vp == NULL) {
1414                 mutex_enter(&nsrv->async_write_lock);
1415                 if (nsrv->async_write_head == nlp)
1416                         nsrv->async_write_head = nlp->next;
1417                 else {
1418                         lp = nsrv->async_write_head;
1419                         while (lp->next != nlp)
1420                                 lp = lp->next;
1421                         lp->next = nlp->next;
1422                 }
1423                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1424                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1425                         rp->ns->ns_status = NFSERR_STALE;
1426                         rp->thread->t_flag |= t_flag;
1427                 }
1428                 cv_broadcast(&nlp->cv);
1429                 mutex_exit(&nsrv->async_write_lock);
1430 
1431                 return;
1432         }
1433 
1434         /*
1435          * Can only write regular files.  Attempts to write any
1436          * other file types fail with EISDIR.
1437          */
1438         if (vp->v_type != VREG) {
1439                 VN_RELE(vp);
1440                 mutex_enter(&nsrv->async_write_lock);
1441                 if (nsrv->async_write_head == nlp)
1442                         nsrv->async_write_head = nlp->next;
1443                 else {
1444                         lp = nsrv->async_write_head;
1445                         while (lp->next != nlp)
1446                                 lp = lp->next;
1447                         lp->next = nlp->next;
1448                 }
1449                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1450                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1451                         rp->ns->ns_status = NFSERR_ISDIR;
1452                         rp->thread->t_flag |= t_flag;
1453                 }
1454                 cv_broadcast(&nlp->cv);
1455                 mutex_exit(&nsrv->async_write_lock);
1456 
1457                 return;
1458         }
1459 
1460         /*
1461          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1462          * deadlock with ufs.
1463          */
1464         if (nbl_need_check(vp)) {
1465                 nbl_start_crit(vp, RW_READER);
1466                 in_crit = 1;
1467         }
1468 
1469         ct.cc_sysid = 0;
1470         ct.cc_pid = 0;
1471         ct.cc_caller_id = nfs2_srv_caller_id;
1472         ct.cc_flags = CC_DONTBLOCK;
1473 
1474         /*
1475          * Lock the file for writing.  This operation provides
1476          * the delay which allows clusters to grow.
1477          */
1478         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1479 
1480         /* check if a monitor detected a delegation conflict */
1481         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1482                 if (in_crit)
1483                         nbl_end_crit(vp);
1484                 VN_RELE(vp);
1485                 /* mark as wouldblock so response is dropped */
1486                 curthread->t_flag |= T_WOULDBLOCK;
1487                 mutex_enter(&nsrv->async_write_lock);
1488                 if (nsrv->async_write_head == nlp)
1489                         nsrv->async_write_head = nlp->next;
1490                 else {
1491                         lp = nsrv->async_write_head;
1492                         while (lp->next != nlp)
1493                                 lp = lp->next;
1494                         lp->next = nlp->next;
1495                 }
1496                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1497                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1498                                 rp->ns->ns_status = puterrno(error);
1499                                 rp->thread->t_flag |= T_WOULDBLOCK;
1500                         }
1501                 }
1502                 cv_broadcast(&nlp->cv);
1503                 mutex_exit(&nsrv->async_write_lock);
1504 
1505                 return;
1506         }
1507 
1508         /*
1509          * Disconnect this cluster from the list of clusters.
1510          * The cluster that is being dealt with must be fixed
1511          * in size after this point, so there is no reason
1512          * to leave it on the list so that new requests can
1513          * find it.
1514          *
1515          * The algorithm is that the first write request will
1516          * create a cluster, convert the file handle to a
1517          * vnode pointer, and then lock the file for writing.
1518          * This request is not likely to be clustered with
1519          * any others.  However, the next request will create
1520          * a new cluster and be blocked in VOP_RWLOCK while
1521          * the first request is being processed.  This delay
1522          * will allow more requests to be clustered in this
1523          * second cluster.
1524          */
1525         mutex_enter(&nsrv->async_write_lock);
1526         if (nsrv->async_write_head == nlp)
1527                 nsrv->async_write_head = nlp->next;
1528         else {
1529                 lp = nsrv->async_write_head;
1530                 while (lp->next != nlp)
1531                         lp = lp->next;
1532                 lp->next = nlp->next;
1533         }
1534         mutex_exit(&nsrv->async_write_lock);
1535 
1536         /*
1537          * Step through the list of requests in this cluster.
1538          * We need to check permissions to make sure that all
1539          * of the requests have sufficient permission to write
1540          * the file.  A cluster can be composed of requests
1541          * from different clients and different users on each
1542          * client.
1543          *
1544          * As a side effect, we also calculate the size of the
1545          * byte range that this cluster encompasses.
1546          */
1547         rp = nlp->list;
1548         off = rp->wa->wa_offset;
1549         len = (uint_t)0;
1550         do {
1551                 if (rdonly(rp->ro, vp)) {
1552                         rp->ns->ns_status = NFSERR_ROFS;
1553                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1554                         rp->thread->t_flag |= t_flag;
1555                         continue;
1556                 }
1557 
1558                 va.va_mask = AT_UID|AT_MODE;
1559 
1560                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1561 
1562                 if (!error) {
1563                         if (crgetuid(rp->cr) != va.va_uid) {
1564                                 /*
1565                                  * This is a kludge to allow writes of files
1566                                  * created with read only permission.  The
1567                                  * owner of the file is always allowed to
1568                                  * write it.
1569                                  */
1570                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1571                         }
1572                         if (!error && MANDLOCK(vp, va.va_mode))
1573                                 error = EACCES;
1574                 }
1575 
1576                 /*
1577                  * Check for a conflict with a nbmand-locked region.
1578                  */
1579                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1580                     rp->wa->wa_count, 0, NULL)) {
1581                         error = EACCES;
1582                 }
1583 
1584                 if (error) {
1585                         rp->ns->ns_status = puterrno(error);
1586                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1587                         rp->thread->t_flag |= t_flag;
1588                         continue;
1589                 }
1590                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1591                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1592         } while ((rp = rp->list) != NULL);
1593 
1594         /*
1595          * Step through the cluster attempting to gather as many
1596          * requests which are contiguous as possible.  These
1597          * contiguous requests are handled via one call to VOP_WRITE
1598          * instead of different calls to VOP_WRITE.  We also keep
1599          * track of the fact that any data was written.
1600          */
1601         rp = nlp->list;
1602         data_written = 0;
1603         do {
1604                 /*
1605                  * Skip any requests which are already marked as having an
1606                  * error.
1607                  */
1608                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1609                         rp = rp->list;
1610                         continue;
1611                 }
1612 
1613                 /*
1614                  * Count the number of iovec's which are required
1615                  * to handle this set of requests.  One iovec is
1616                  * needed for each data buffer, whether addressed
1617                  * by wa_data or by the b_rptr pointers in the
1618                  * mblk chains.
1619                  */
1620                 iovcnt = 0;
1621                 lrp = rp;
1622                 for (;;) {
1623                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1624                                 iovcnt++;
1625                         else {
1626                                 m = lrp->wa->wa_mblk;
1627                                 while (m != NULL) {
1628                                         iovcnt++;
1629                                         m = m->b_cont;
1630                                 }
1631                         }
1632                         if (lrp->list == NULL ||
1633                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1634                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1635                             lrp->list->wa->wa_offset) {
1636                                 lrp = lrp->list;
1637                                 break;
1638                         }
1639                         lrp = lrp->list;
1640                 }
1641 
1642                 if (iovcnt <= MAXCLIOVECS) {
1643 #ifdef DEBUG
1644                         rfs_write_hits++;
1645 #endif
1646                         niovp = iov;
1647                 } else {
1648 #ifdef DEBUG
1649                         rfs_write_misses++;
1650 #endif
1651                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1652                 }
1653                 /*
1654                  * Put together the scatter/gather iovecs.
1655                  */
1656                 iovp = niovp;
1657                 trp = rp;
1658                 count = 0;
1659                 do {
1660                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1661                                 if (trp->wa->wa_rlist) {
1662                                         iovp->iov_base =
1663                                             (char *)((trp->wa->wa_rlist)->
1664                                             u.c_daddr3);
1665                                         iovp->iov_len = trp->wa->wa_count;
1666                                 } else  {
1667                                         iovp->iov_base = trp->wa->wa_data;
1668                                         iovp->iov_len = trp->wa->wa_count;
1669                                 }
1670                                 iovp++;
1671                         } else {
1672                                 m = trp->wa->wa_mblk;
1673                                 rcount = trp->wa->wa_count;
1674                                 while (m != NULL) {
1675                                         iovp->iov_base = (caddr_t)m->b_rptr;
1676                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1677                                         rcount -= iovp->iov_len;
1678                                         if (rcount < 0)
1679                                                 iovp->iov_len += rcount;
1680                                         iovp++;
1681                                         if (rcount <= 0)
1682                                                 break;
1683                                         m = m->b_cont;
1684                                 }
1685                         }
1686                         count += trp->wa->wa_count;
1687                         trp = trp->list;
1688                 } while (trp != lrp);
1689 
1690                 uio.uio_iov = niovp;
1691                 uio.uio_iovcnt = iovcnt;
1692                 uio.uio_segflg = UIO_SYSSPACE;
1693                 uio.uio_extflg = UIO_COPY_DEFAULT;
1694                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1695                 uio.uio_resid = count;
1696                 /*
1697                  * The limit is checked on the client. We
1698                  * should allow any size writes here.
1699                  */
1700                 uio.uio_llimit = curproc->p_fsz_ctl;
1701                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1702                 if (rlimit < (rlim64_t)uio.uio_resid)
1703                         uio.uio_resid = (uint_t)rlimit;
1704 
1705                 /*
1706                  * For now we assume no append mode.
1707                  */
1708 
1709                 /*
1710                  * We're changing creds because VM may fault
1711                  * and we need the cred of the current
1712                  * thread to be used if quota * checking is
1713                  * enabled.
1714                  */
1715                 savecred = curthread->t_cred;
1716                 curthread->t_cred = cr;
1717                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1718                 curthread->t_cred = savecred;
1719 
1720                 /* check if a monitor detected a delegation conflict */
1721                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1722                         /* mark as wouldblock so response is dropped */
1723                         curthread->t_flag |= T_WOULDBLOCK;
1724 
1725                 if (niovp != iov)
1726                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1727 
1728                 if (!error) {
1729                         data_written = 1;
1730                         /*
1731                          * Get attributes again so we send the latest mod
1732                          * time to the client side for its cache.
1733                          */
1734                         va.va_mask = AT_ALL;    /* now we want everything */
1735 
1736                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1737 
1738                         if (!error)
1739                                 acl_perm(vp, exi, &va, rp->cr);
1740                 }
1741 
1742                 /*
1743                  * Fill in the status responses for each request
1744                  * which was just handled.  Also, copy the latest
1745                  * attributes in to the attribute responses if
1746                  * appropriate.
1747                  */
1748                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1749                 do {
1750                         rp->thread->t_flag |= t_flag;
1751                         /* check for overflows */
1752                         if (!error) {
1753                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1754                         }
1755                         rp->ns->ns_status = puterrno(error);
1756                         rp = rp->list;
1757                 } while (rp != lrp);
1758         } while (rp != NULL);
1759 
1760         /*
1761          * If any data was written at all, then we need to flush
1762          * the data and metadata to stable storage.
1763          */
1764         if (data_written) {
1765                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1766 
1767                 if (!error) {
1768                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1769                 }
1770         }
1771 
1772         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1773 
1774         if (in_crit)
1775                 nbl_end_crit(vp);
1776         VN_RELE(vp);
1777 
1778         t_flag = curthread->t_flag & T_WOULDBLOCK;
1779         mutex_enter(&nsrv->async_write_lock);
1780         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1781                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1782                         rp->ns->ns_status = puterrno(error);
1783                         rp->thread->t_flag |= t_flag;
1784                 }
1785         }
1786         cv_broadcast(&nlp->cv);
1787         mutex_exit(&nsrv->async_write_lock);
1788 
1789 }
1790 
1791 void *
1792 rfs_write_getfh(struct nfswriteargs *wa)
1793 {
1794         return (&wa->wa_fhandle);
1795 }
1796 
1797 /*
1798  * Create a file.
1799  * Creates a file with given attributes and returns those attributes
1800  * and an fhandle for the new file.
1801  */
1802 void
1803 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1804     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1805 {
1806         int error;
1807         int lookuperr;
1808         int in_crit = 0;
1809         struct vattr va;
1810         vnode_t *vp;
1811         vnode_t *realvp;
1812         vnode_t *dvp;
1813         char *name = args->ca_da.da_name;
1814         vnode_t *tvp = NULL;
1815         int mode;
1816         int lookup_ok;
1817         bool_t trunc;
1818         struct sockaddr *ca;
1819 
1820         /*
1821          * Disallow NULL paths
1822          */
1823         if (name == NULL || *name == '\0') {
1824                 dr->dr_status = NFSERR_ACCES;
1825                 return;
1826         }
1827 
1828         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1829         if (dvp == NULL) {
1830                 dr->dr_status = NFSERR_STALE;
1831                 return;
1832         }
1833 
1834         error = sattr_to_vattr(args->ca_sa, &va);
1835         if (error) {
1836                 dr->dr_status = puterrno(error);
1837                 return;
1838         }
1839 
1840         /*
1841          * Must specify the mode.
1842          */
1843         if (!(va.va_mask & AT_MODE)) {
1844                 VN_RELE(dvp);
1845                 dr->dr_status = NFSERR_INVAL;
1846                 return;
1847         }
1848 
1849         /*
1850          * This is a completely gross hack to make mknod
1851          * work over the wire until we can wack the protocol
1852          */
1853         if ((va.va_mode & IFMT) == IFCHR) {
1854                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1855                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1856                 else {
1857                         va.va_type = VCHR;
1858                         /*
1859                          * uncompress the received dev_t
1860                          * if the top half is zero indicating a request
1861                          * from an `older style' OS.
1862                          */
1863                         if ((va.va_size & 0xffff0000) == 0)
1864                                 va.va_rdev = nfsv2_expdev(va.va_size);
1865                         else
1866                                 va.va_rdev = (dev_t)va.va_size;
1867                 }
1868                 va.va_mask &= ~AT_SIZE;
1869         } else if ((va.va_mode & IFMT) == IFBLK) {
1870                 va.va_type = VBLK;
1871                 /*
1872                  * uncompress the received dev_t
1873                  * if the top half is zero indicating a request
1874                  * from an `older style' OS.
1875                  */
1876                 if ((va.va_size & 0xffff0000) == 0)
1877                         va.va_rdev = nfsv2_expdev(va.va_size);
1878                 else
1879                         va.va_rdev = (dev_t)va.va_size;
1880                 va.va_mask &= ~AT_SIZE;
1881         } else if ((va.va_mode & IFMT) == IFSOCK) {
1882                 va.va_type = VSOCK;
1883         } else {
1884                 va.va_type = VREG;
1885         }
1886         va.va_mode &= ~IFMT;
1887         va.va_mask |= AT_TYPE;
1888 
1889         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1890         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1891             MAXPATHLEN);
1892         if (name == NULL) {
1893                 dr->dr_status = puterrno(EINVAL);
1894                 return;
1895         }
1896 
1897         /*
1898          * Why was the choice made to use VWRITE as the mode to the
1899          * call to VOP_CREATE ? This results in a bug.  When a client
1900          * opens a file that already exists and is RDONLY, the second
1901          * open fails with an EACESS because of the mode.
1902          * bug ID 1054648.
1903          */
1904         lookup_ok = 0;
1905         mode = VWRITE;
1906         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1907                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1908                     NULL, NULL, NULL);
1909                 if (!error) {
1910                         struct vattr at;
1911 
1912                         lookup_ok = 1;
1913                         at.va_mask = AT_MODE;
1914                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1915                         if (!error)
1916                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1917                         VN_RELE(tvp);
1918                         tvp = NULL;
1919                 }
1920         }
1921 
1922         if (!lookup_ok) {
1923                 if (rdonly(ro, dvp)) {
1924                         error = EROFS;
1925                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1926                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1927                         error = EPERM;
1928                 } else {
1929                         error = 0;
1930                 }
1931         }
1932 
1933         /*
1934          * If file size is being modified on an already existing file
1935          * make sure that there are no conflicting non-blocking mandatory
1936          * locks in the region being manipulated. Return EACCES if there
1937          * are conflicting locks.
1938          */
1939         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1940                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1941                     NULL, NULL, NULL);
1942 
1943                 if (!lookuperr &&
1944                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1945                         VN_RELE(tvp);
1946                         curthread->t_flag |= T_WOULDBLOCK;
1947                         goto out;
1948                 }
1949 
1950                 if (!lookuperr && nbl_need_check(tvp)) {
1951                         /*
1952                          * The file exists. Now check if it has any
1953                          * conflicting non-blocking mandatory locks
1954                          * in the region being changed.
1955                          */
1956                         struct vattr bva;
1957                         u_offset_t offset;
1958                         ssize_t length;
1959 
1960                         nbl_start_crit(tvp, RW_READER);
1961                         in_crit = 1;
1962 
1963                         bva.va_mask = AT_SIZE;
1964                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1965                         if (!error) {
1966                                 if (va.va_size < bva.va_size) {
1967                                         offset = va.va_size;
1968                                         length = bva.va_size - va.va_size;
1969                                 } else {
1970                                         offset = bva.va_size;
1971                                         length = va.va_size - bva.va_size;
1972                                 }
1973                                 if (length) {
1974                                         if (nbl_conflict(tvp, NBL_WRITE,
1975                                             offset, length, 0, NULL)) {
1976                                                 error = EACCES;
1977                                         }
1978                                 }
1979                         }
1980                         if (error) {
1981                                 nbl_end_crit(tvp);
1982                                 VN_RELE(tvp);
1983                                 in_crit = 0;
1984                         }
1985                 } else if (tvp != NULL) {
1986                         VN_RELE(tvp);
1987                 }
1988         }
1989 
1990         if (!error) {
1991                 /*
1992                  * If filesystem is shared with nosuid the remove any
1993                  * setuid/setgid bits on create.
1994                  */
1995                 if (va.va_type == VREG &&
1996                     exi->exi_export.ex_flags & EX_NOSUID)
1997                         va.va_mode &= ~(VSUID | VSGID);
1998 
1999                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
2000                     NULL, NULL);
2001 
2002                 if (!error) {
2003 
2004                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2005                                 trunc = TRUE;
2006                         else
2007                                 trunc = FALSE;
2008 
2009                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2010                                 VN_RELE(vp);
2011                                 curthread->t_flag |= T_WOULDBLOCK;
2012                                 goto out;
2013                         }
2014                         va.va_mask = AT_ALL;
2015 
2016                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2017 
2018                         /* check for overflows */
2019                         if (!error) {
2020                                 acl_perm(vp, exi, &va, cr);
2021                                 error = vattr_to_nattr(&va, &dr->dr_attr);
2022                                 if (!error) {
2023                                         error = makefh(&dr->dr_fhandle, vp,
2024                                             exi);
2025                                 }
2026                         }
2027                         /*
2028                          * Force modified metadata out to stable storage.
2029                          *
2030                          * if a underlying vp exists, pass it to VOP_FSYNC
2031                          */
2032                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
2033                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2034                         else
2035                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2036                         VN_RELE(vp);
2037                 }
2038 
2039                 if (in_crit) {
2040                         nbl_end_crit(tvp);
2041                         VN_RELE(tvp);
2042                 }
2043         }
2044 
2045         /*
2046          * Force modified data and metadata out to stable storage.
2047          */
2048         (void) VOP_FSYNC(dvp, 0, cr, NULL);
2049 
2050 out:
2051 
2052         VN_RELE(dvp);
2053 
2054         dr->dr_status = puterrno(error);
2055 
2056         if (name != args->ca_da.da_name)
2057                 kmem_free(name, MAXPATHLEN);
2058 }
2059 void *
2060 rfs_create_getfh(struct nfscreatargs *args)
2061 {
2062         return (args->ca_da.da_fhandle);
2063 }
2064 
2065 /*
2066  * Remove a file.
2067  * Remove named file from parent directory.
2068  */
2069 /* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* directory containing the entry to remove */
	vnode_t *targvp;	/* the file being removed */
	int in_crit = 0;	/* non-zero iff targvp's nbmand crit region held */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 * The lookup also gives us a hold on the file itself (targvp);
	 * the file handle in the arguments only names the directory.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/*
	 * Enter the non-blocking mandatory lock critical region before
	 * the conflict check so the answer stays valid across VOP_REMOVE.
	 */
	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2150 
2151 void *
2152 rfs_remove_getfh(struct nfsdiropargs *da)
2153 {
2154         return (da->da_fhandle);
2155 }
2156 
2157 /*
2158  * rename a file
2159  * Give a file (from) a new name (to).
2160  */
2161 /* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* the file being renamed */
	vnode_t *targvp;	/* existing file being renamed over, if any */
	int in_crit = 0;	/* non-zero iff srcvp's nbmand crit region held */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target directory must belong to the same export as the
	 * source; a cross-export rename is rejected with NFSERR_XDEV.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/*
	 * The reference is dropped right away; to_exi is only compared
	 * against exi below, never dereferenced again.
	 */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 * The lookup also gives us a hold on the source file itself (srcvp).
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	/*
	 * Enter the non-blocking mandatory lock critical region before
	 * the conflict check so the answer stays valid across VOP_RENAME.
	 */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Keep the cached v_path of the renamed vnode up to date. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2300 void *
2301 rfs_rename_getfh(struct nfsrnmargs *args)
2302 {
2303         return (args->rna_from.da_fhandle);
2304 }
2305 
2306 /*
2307  * Link to a file.
2308  * Create a file (to) which is a hard link to the given file (from).
2309  */
2310 /* ARGSUSED */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *fromvp;	/* existing file being linked to */
	vnode_t *tovp;		/* directory in which to create the link */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The link's directory must belong to the same export as the
	 * source file; a cross-export link is rejected with NFSERR_XDEV.
	 */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/*
	 * The reference is dropped right away; to_exi is only compared
	 * against exi below, never dereferenced again.
	 */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 * FNODSYNC is used on the source, whose data was not modified
	 * by the link operation.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2386 void *
2387 rfs_link_getfh(struct nfslinkargs *args)
2388 {
2389         return (args->la_from);
2390 }
2391 
2392 /*
2393  * Symbolicly link to a file.
2394  * Create a file (to) with the given attributes which is a symbolic link
2395  * to the given path name (to).
2396  */
2397 void
2398 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2399     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2400 {
2401         int error;
2402         struct vattr va;
2403         vnode_t *vp;
2404         vnode_t *svp;
2405         int lerror;
2406         struct sockaddr *ca;
2407         char *name = NULL;
2408 
2409         /*
2410          * Disallow NULL paths
2411          */
2412         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2413                 *status = NFSERR_ACCES;
2414                 return;
2415         }
2416 
2417         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2418         if (vp == NULL) {
2419                 *status = NFSERR_STALE;
2420                 return;
2421         }
2422 
2423         if (rdonly(ro, vp)) {
2424                 VN_RELE(vp);
2425                 *status = NFSERR_ROFS;
2426                 return;
2427         }
2428 
2429         error = sattr_to_vattr(args->sla_sa, &va);
2430         if (error) {
2431                 VN_RELE(vp);
2432                 *status = puterrno(error);
2433                 return;
2434         }
2435 
2436         if (!(va.va_mask & AT_MODE)) {
2437                 VN_RELE(vp);
2438                 *status = NFSERR_INVAL;
2439                 return;
2440         }
2441 
2442         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2443         name = nfscmd_convname(ca, exi, args->sla_tnm,
2444             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2445 
2446         if (name == NULL) {
2447                 *status = NFSERR_ACCES;
2448                 return;
2449         }
2450 
2451         va.va_type = VLNK;
2452         va.va_mask |= AT_TYPE;
2453 
2454         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2455 
2456         /*
2457          * Force new data and metadata out to stable storage.
2458          */
2459         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2460             NULL, cr, NULL, NULL, NULL);
2461 
2462         if (!lerror) {
2463                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2464                 VN_RELE(svp);
2465         }
2466 
2467         /*
2468          * Force modified data and metadata out to stable storage.
2469          */
2470         (void) VOP_FSYNC(vp, 0, cr, NULL);
2471 
2472         VN_RELE(vp);
2473 
2474         *status = puterrno(error);
2475         if (name != args->sla_tnm)
2476                 kmem_free(name, MAXPATHLEN);
2477 
2478 }
2479 void *
2480 rfs_symlink_getfh(struct nfsslargs *args)
2481 {
2482         return (args->sla_from.da_fhandle);
2483 }
2484 
2485 /*
2486  * Make a directory.
2487  * Create a directory with the given name, parent directory, and attributes.
2488  * Returns a file handle and attributes for the new directory.
2489  */
2490 /* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	/*
	 * Note the naming is reversed from the usual convention here:
	 * vp is the parent directory and dvp receives the newly
	 * created directory from VOP_MKDIR().
	 */
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is passed the parent
			 * (vp) while va describes the new directory (dvp);
			 * rfs_create() passes the new object — confirm
			 * this is intentional.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2572 void *
2573 rfs_mkdir_getfh(struct nfscreatargs *args)
2574 {
2575         return (args->ca_da.da_fhandle);
2576 }
2577 
2578 /*
2579  * Remove a directory.
2580  * Remove the given directory name from the given parent directory.
2581  */
2582 /* ARGSUSED */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;		/* parent directory of the one being removed */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR takes a third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove (the zone's root vnode).
	 */
	error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
2640 void *
2641 rfs_rmdir_getfh(struct nfsdiropargs *da)
2642 {
2643         return (da->da_fhandle);
2644 }
2645 
/*
 * Read directory entries on behalf of an NFS v2 READDIR request.
 * Fills in *rd (status, entry buffer, size, EOF flag); the entry
 * buffer is later released by rfs_rddirfree().
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;
	struct sockaddr *ca;
	size_t nents;
	int ret;

	/* Translate the file handle to a vnode. */
	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	/* READDIR is only meaningful on a directory. */
	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Hold the directory's reader lock while the entries are read. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	/* The caller must have read permission on the directory. */
	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/*
	 * A zero-byte request gets an empty, non-EOF reply with an OK
	 * status (error is still 0 at the "bad" label).
	 */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the request to the v2 maximum transfer size. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* nothing was read: report EOF */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * Convert the entry names into the client's character set, if
	 * one applies (nfscmd_*).  NOTE(review): if VOP_READDIR failed,
	 * rd_size is consumed here without having been set; the reply
	 * status below will carry the error, but confirm the conversion
	 * routines tolerate the stale size.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	/*
	 * If the conversion produced a new buffer, release the original
	 * and hand the converted one back in the result.
	 */
	if (ndata == NULL) {
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2779 void *
2780 rfs_readdir_getfh(struct nfsrddirargs *rda)
2781 {
2782         return (&rda->rda_fh);
2783 }
2784 void
2785 rfs_rddirfree(struct nfsrddirres *rd)
2786 {
2787         if (rd->rd_entries != NULL)
2788                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2789 }
2790 
2791 /* ARGSUSED */
2792 void
2793 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2794     struct svc_req *req, cred_t *cr, bool_t ro)
2795 {
2796         int error;
2797         struct statvfs64 sb;
2798         vnode_t *vp;
2799 
2800         vp = nfs_fhtovp(fh, exi);
2801         if (vp == NULL) {
2802                 fs->fs_status = NFSERR_STALE;
2803                 return;
2804         }
2805 
2806         error = VFS_STATVFS(vp->v_vfsp, &sb);
2807 
2808         if (!error) {
2809                 fs->fs_tsize = nfstsize();
2810                 fs->fs_bsize = sb.f_frsize;
2811                 fs->fs_blocks = sb.f_blocks;
2812                 fs->fs_bfree = sb.f_bfree;
2813                 fs->fs_bavail = sb.f_bavail;
2814         }
2815 
2816         VN_RELE(vp);
2817 
2818         fs->fs_status = puterrno(error);
2819 
2820 }
2821 void *
2822 rfs_statfs_getfh(fhandle_t *fh)
2823 {
2824         return (fh);
2825 }
2826 
/*
 * Convert NFS v2 settable attributes (nfssattr) into a vattr suitable
 * for VOP_SETATTR.  A wire field whose value is all-ones means "do not
 * set"; every field that is actually set turns on the corresponding
 * AT_* bit in va_mask.  Returns 0 on success, or EOVERFLOW on a
 * 32-bit kernel when a client-supplied time does not fit in time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* A time is "not set" only when both halves are all-ones. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2893 
/*
 * Map vnode types (vtype_t values, used as the index) to NFS v2
 * over-the-wire file types.  Types with no v2 counterpart map to 0.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2897 
2898 /*
2899  * check the following fields for overflow: nodeid, size, and time.
2900  * There could be a problem when converting 64-bit LP64 fields
2901  * into 32-bit ones.  Return an error if there is an overflow.
2902  */
2903 int
2904 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2905 {
2906         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2907         na->na_type = vt_to_nf[vap->va_type];
2908 
2909         if (vap->va_mode == (unsigned short) -1)
2910                 na->na_mode = (uint32_t)-1;
2911         else
2912                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2913 
2914         if (vap->va_uid == (unsigned short)(-1))
2915                 na->na_uid = (uint32_t)(-1);
2916         else if (vap->va_uid == UID_NOBODY)
2917                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2918         else
2919                 na->na_uid = vap->va_uid;
2920 
2921         if (vap->va_gid == (unsigned short)(-1))
2922                 na->na_gid = (uint32_t)-1;
2923         else if (vap->va_gid == GID_NOBODY)
2924                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2925         else
2926                 na->na_gid = vap->va_gid;
2927 
2928         /*
2929          * Do we need to check fsid for overflow?  It is 64-bit in the
2930          * vattr, but are bigger than 32 bit values supported?
2931          */
2932         na->na_fsid = vap->va_fsid;
2933 
2934         na->na_nodeid = vap->va_nodeid;
2935 
2936         /*
2937          * Check to make sure that the nodeid is representable over the
2938          * wire without losing bits.
2939          */
2940         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2941                 return (EFBIG);
2942         na->na_nlink = vap->va_nlink;
2943 
2944         /*
2945          * Check for big files here, instead of at the caller.  See
2946          * comments in cstat for large special file explanation.
2947          */
2948         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2949                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2950                         return (EFBIG);
2951                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2952                         /* UNKNOWN_SIZE | OVERFLOW */
2953                         na->na_size = MAXOFF32_T;
2954                 } else
2955                         na->na_size = vap->va_size;
2956         } else
2957                 na->na_size = vap->va_size;
2958 
2959         /*
2960          * If the vnode times overflow the 32-bit times that NFS2
2961          * uses on the wire then return an error.
2962          */
2963         if (!NFS_VAP_TIME_OK(vap)) {
2964                 return (EOVERFLOW);
2965         }
2966         na->na_atime.tv_sec = vap->va_atime.tv_sec;
2967         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2968 
2969         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2970         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2971 
2972         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2973         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2974 
2975         /*
2976          * If the dev_t will fit into 16 bits then compress
2977          * it, otherwise leave it alone. See comments in
2978          * nfs_client.c.
2979          */
2980         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2981             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2982                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2983         else
2984                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2985 
2986         na->na_blocks = vap->va_nblocks;
2987         na->na_blocksize = vap->va_blksize;
2988 
2989         /*
2990          * This bit of ugliness is a *TEMPORARY* hack to preserve the
2991          * over-the-wire protocols for named-pipe vnodes.  It remaps the
2992          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2993          *
2994          * BUYER BEWARE:
2995          *  If you are porting the NFS to a non-Sun server, you probably
2996          *  don't want to include the following block of code.  The
2997          *  over-the-wire special file types will be changing with the
2998          *  NFS Protocol Revision.
2999          */
3000         if (vap->va_type == VFIFO)
3001                 NA_SETFIFO(na);
3002         return (0);
3003 }
3004 
/*
 * acl v2 support: returns approximate permission.
 *      default: returns minimal permission (more restrictive)
 *      aclok: returns maximal permission (less restrictive)
 *      This routine changes the permissions that are already in *va.
 *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 *      CLASS_OBJ is always the same as GROUP_OBJ entry.
 */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t	vsa;
	int		aclcnt;
	aclent_t	*aclentp;
	mode_t		mask_perm;
	mode_t		grp_perm;
	mode_t		other_perm;
	mode_t		other_orig;
	int		error;

	/* dont care default acl */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/* non-trivial ACL */
			aclentp = vsa.vsa_aclentp;
			/*
			 * NOTE(review): mask_perm and other_orig are only
			 * assigned when CLASS_OBJ/OTHER_OBJ entries appear
			 * in the list below; this relies on VOP_GETSECATTR
			 * returning a well-formed ACL that always contains
			 * them -- confirm.
			 */
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/*
				 * maximal permissions: OR together every
				 * grant that could apply, then limit the
				 * group bits by the mask (CLASS_OBJ).
				 */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/*
				 * minimal permissions: AND together every
				 * restriction that could apply, starting
				 * from full group/other permissions.
				 */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy the approximated group/other bits to va */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		/* release the ACL entries returned by VOP_GETSECATTR */
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}
3109 
/*
 * One-time NFS v2 server initialization: obtain a caller id for this
 * module (presumably used to identify the server when it calls back
 * into the file systems -- see fs_new_caller_id()).
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
}
3115 
/*
 * Teardown counterpart to rfs_srvrinit(); there is currently no
 * global state that needs to be released.
 */
void
rfs_srvrfini(void)
{
}
3120 
3121 /* ARGSUSED */
3122 void
3123 rfs_srv_zone_init(nfs_globals_t *ng)
3124 {
3125         nfs_srv_t *ns;
3126 
3127         ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3128 
3129         mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3130         ns->write_async = 1;
3131 
3132         ng->nfs_srv = ns;
3133 }
3134 
3135 /* ARGSUSED */
3136 void
3137 rfs_srv_zone_fini(nfs_globals_t *ng)
3138 {
3139         nfs_srv_t *ns = ng->nfs_srv;
3140 
3141         ng->nfs_srv = NULL;
3142 
3143         mutex_destroy(&ns->async_write_lock);
3144         kmem_free(ns, sizeof (*ns));
3145 }
3146 
3147 static int
3148 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3149 {
3150         struct clist    *wcl;
3151         int             wlist_len;
3152         uint32_t        count = rr->rr_count;
3153 
3154         wcl = ra->ra_wlist;
3155 
3156         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3157                 return (FALSE);
3158         }
3159 
3160         wcl = ra->ra_wlist;
3161         rr->rr_ok.rrok_wlist_len = wlist_len;
3162         rr->rr_ok.rrok_wlist = wcl;
3163 
3164         return (TRUE);
3165 }