1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /*
29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 * All rights reserved.
31 */
32
33 /*
34 * Copyright 2018 Nexenta Systems, Inc.
35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 */
37
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/stat.h>
47 #include <sys/errno.h>
48 #include <sys/sysmacros.h>
49 #include <sys/statvfs.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/dirent.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/vtrace.h>
56 #include <sys/mode.h>
57 #include <sys/acl.h>
58 #include <sys/nbmlock.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/svc.h>
65
66 #include <nfs/nfs.h>
67 #include <nfs/export.h>
68 #include <nfs/nfs_cmd.h>
69
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_kmem.h>
75
76 #include <sys/strsubr.h>
77
78 struct rfs_async_write_list;
79
80 /*
81 * Zone globals of NFSv2 server
82 */
83 typedef struct nfs_srv {
84 kmutex_t async_write_lock;
85 struct rfs_async_write_list *async_write_head;
86
87 /*
88 * enables write clustering if == 1
89 */
90 int write_async;
91 } nfs_srv_t;
92
93 /*
94 * These are the interface routines for the server side of the
95 * Network File System. See the NFS version 2 protocol specification
96 * for a description of this interface.
97 */
98
99 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
100 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
101 cred_t *);
102 static void *rfs_zone_init(zoneid_t zoneid);
103 static void rfs_zone_fini(zoneid_t zoneid, void *data);
104
105
106 /*
107 * Some "over the wire" UNIX file types. These are encoded
108 * into the mode. This needs to be fixed in the next rev.
109 */
110 #define IFMT 0170000 /* type of file */
111 #define IFCHR 0020000 /* character special */
112 #define IFBLK 0060000 /* block special */
113 #define IFSOCK 0140000 /* socket */
114
115 u_longlong_t nfs2_srv_caller_id;
116 static zone_key_t rfs_zone_key;
117
118 /*
119 * Get file attributes.
120 * Returns the current attributes of the file with the given fhandle.
121 */
122 /* ARGSUSED */
123 void
124 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
125 struct svc_req *req, cred_t *cr, bool_t ro)
126 {
127 int error;
128 vnode_t *vp;
129 struct vattr va;
130
131 vp = nfs_fhtovp(fhp, exi);
132 if (vp == NULL) {
133 ns->ns_status = NFSERR_STALE;
134 return;
135 }
136
137 /*
138 * Do the getattr.
139 */
140 va.va_mask = AT_ALL; /* we want all the attributes */
141
142 error = rfs4_delegated_getattr(vp, &va, 0, cr);
143
144 /* check for overflows */
145 if (!error) {
146 /* Lie about the object type for a referral */
147 if (vn_is_nfs_reparse(vp, cr))
148 va.va_type = VLNK;
149
150 acl_perm(vp, exi, &va, cr);
151 error = vattr_to_nattr(&va, &ns->ns_attr);
152 }
153
154 VN_RELE(vp);
155
156 ns->ns_status = puterrno(error);
157 }
/*
 * Dispatch helper: return the filehandle argument of a GETATTR call
 * so common dispatch code can locate the export for the request.
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
163
164 /*
165 * Set file attributes.
166 * Sets the attributes of the file with the given fhandle. Returns
167 * the new attributes.
168 */
169 /* ARGSUSED */
170 void
171 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
172 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
173 {
174 int error;
175 int flag;
176 int in_crit = 0;
177 vnode_t *vp;
178 struct vattr va;
179 struct vattr bva;
180 struct flock64 bf;
181 caller_context_t ct;
182
183
184 vp = nfs_fhtovp(&args->saa_fh, exi);
185 if (vp == NULL) {
186 ns->ns_status = NFSERR_STALE;
187 return;
188 }
189
190 if (rdonly(ro, vp)) {
191 VN_RELE(vp);
192 ns->ns_status = NFSERR_ROFS;
193 return;
194 }
195
196 error = sattr_to_vattr(&args->saa_sa, &va);
197 if (error) {
198 VN_RELE(vp);
199 ns->ns_status = puterrno(error);
200 return;
201 }
202
203 /*
204 * If the client is requesting a change to the mtime,
205 * but the nanosecond field is set to 1 billion, then
206 * this is a flag to the server that it should set the
207 * atime and mtime fields to the server's current time.
208 * The 1 billion number actually came from the client
209 * as 1 million, but the units in the over the wire
210 * request are microseconds instead of nanoseconds.
211 *
212 * This is an overload of the protocol and should be
213 * documented in the NFS Version 2 protocol specification.
214 */
215 if (va.va_mask & AT_MTIME) {
216 if (va.va_mtime.tv_nsec == 1000000000) {
217 gethrestime(&va.va_mtime);
218 va.va_atime = va.va_mtime;
219 va.va_mask |= AT_ATIME;
220 flag = 0;
221 } else
222 flag = ATTR_UTIME;
223 } else
224 flag = 0;
225
226 /*
227 * If the filesystem is exported with nosuid, then mask off
228 * the setuid and setgid bits.
229 */
230 if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
231 (exi->exi_export.ex_flags & EX_NOSUID))
232 va.va_mode &= ~(VSUID | VSGID);
233
234 ct.cc_sysid = 0;
235 ct.cc_pid = 0;
236 ct.cc_caller_id = nfs2_srv_caller_id;
237 ct.cc_flags = CC_DONTBLOCK;
238
239 /*
240 * We need to specially handle size changes because it is
241 * possible for the client to create a file with modes
242 * which indicate read-only, but with the file opened for
243 * writing. If the client then tries to set the size of
244 * the file, then the normal access checking done in
245 * VOP_SETATTR would prevent the client from doing so,
246 * although it should be legal for it to do so. To get
247 * around this, we do the access checking for ourselves
248 * and then use VOP_SPACE which doesn't do the access
249 * checking which VOP_SETATTR does. VOP_SPACE can only
250 * operate on VREG files, let VOP_SETATTR handle the other
251 * extremely rare cases.
252 * Also the client should not be allowed to change the
253 * size of the file if there is a conflicting non-blocking
254 * mandatory lock in the region of change.
255 */
256 if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
257 if (nbl_need_check(vp)) {
258 nbl_start_crit(vp, RW_READER);
259 in_crit = 1;
260 }
261
262 bva.va_mask = AT_UID | AT_SIZE;
263
264 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
265
266 if (error) {
267 if (in_crit)
268 nbl_end_crit(vp);
269 VN_RELE(vp);
270 ns->ns_status = puterrno(error);
271 return;
272 }
273
274 if (in_crit) {
275 u_offset_t offset;
276 ssize_t length;
277
278 if (va.va_size < bva.va_size) {
279 offset = va.va_size;
280 length = bva.va_size - va.va_size;
281 } else {
282 offset = bva.va_size;
283 length = va.va_size - bva.va_size;
284 }
285 if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
286 NULL)) {
287 error = EACCES;
288 }
289 }
290
291 if (crgetuid(cr) == bva.va_uid && !error &&
292 va.va_size != bva.va_size) {
293 va.va_mask &= ~AT_SIZE;
294 bf.l_type = F_WRLCK;
295 bf.l_whence = 0;
296 bf.l_start = (off64_t)va.va_size;
297 bf.l_len = 0;
298 bf.l_sysid = 0;
299 bf.l_pid = 0;
300
301 error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
302 (offset_t)va.va_size, cr, &ct);
303 }
304 if (in_crit)
305 nbl_end_crit(vp);
306 } else
307 error = 0;
308
309 /*
310 * Do the setattr.
311 */
312 if (!error && va.va_mask) {
313 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
314 }
315
316 /*
317 * check if the monitor on either vop_space or vop_setattr detected
318 * a delegation conflict and if so, mark the thread flag as
319 * wouldblock so that the response is dropped and the client will
320 * try again.
321 */
322 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
323 VN_RELE(vp);
324 curthread->t_flag |= T_WOULDBLOCK;
325 return;
326 }
327
328 if (!error) {
329 va.va_mask = AT_ALL; /* get everything */
330
331 error = rfs4_delegated_getattr(vp, &va, 0, cr);
332
333 /* check for overflows */
334 if (!error) {
335 acl_perm(vp, exi, &va, cr);
336 error = vattr_to_nattr(&va, &ns->ns_attr);
337 }
338 }
339
340 ct.cc_flags = 0;
341
342 /*
343 * Force modified metadata out to stable storage.
344 */
345 (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
346
347 VN_RELE(vp);
348
349 ns->ns_status = puterrno(error);
350 }
/*
 * Dispatch helper: return the filehandle embedded in SETATTR arguments.
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
356
/*
 * Cross a mount point during a lookup.
 *
 * *vpp is a mount point vnode; traverse into the filesystem mounted
 * on it.  If the submount is exported with "nohide", swap *vpp/*exip
 * to the submount root and its exportinfo, releasing the old
 * references.  If the submount is not exported or lacks "nohide",
 * *vpp and *exip are left untouched (and that is not an error).
 *
 * Returns 0 on success or an errno from traverse()/VOP_FID().
 * @exip and @vpp are changed and released only in success.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Extra hold: traverse() takes over this reference. */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
402
403 /*
404 * Given mounted "dvp" and "exi", go upper mountpoint
405 * with dvp/exi correction
406 * Return 0 in success
407 */
408 int
409 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
410 {
411 struct exportinfo *exi;
412 vnode_t *dvp = *dvpp;
413
414 ASSERT(dvp->v_flag & VROOT);
415
416 VN_HOLD(dvp);
417 dvp = untraverse(dvp);
418 exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
419 if (exi == NULL) {
420 VN_RELE(dvp);
421 return (-1);
422 }
423
424 exi_rele(*exip);
425 *exip = exi;
426 VN_RELE(*dvpp);
427 *dvpp = dvp;
428
429 return (0);
430 }
431 /*
432 * Directory lookup.
433 * Returns an fhandle and file attributes for file name in a directory.
434 */
435 /* ARGSUSED */
436 void
437 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
438 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
439 {
440 int error;
441 vnode_t *dvp;
442 vnode_t *vp;
443 struct vattr va;
444 fhandle_t *fhp = da->da_fhandle;
445 struct sec_ol sec = {0, 0};
446 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
447 char *name;
448 struct sockaddr *ca;
449
450 /*
451 * Trusted Extension doesn't support NFSv2. MOUNT
452 * will reject v2 clients. Need to prevent v2 client
453 * access via WebNFS here.
454 */
455 if (is_system_labeled() && req->rq_vers == 2) {
456 dr->dr_status = NFSERR_ACCES;
457 return;
458 }
459
460 /*
461 * Disallow NULL paths
462 */
463 if (da->da_name == NULL || *da->da_name == '\0') {
464 dr->dr_status = NFSERR_ACCES;
465 return;
466 }
467
468 /*
469 * Allow lookups from the root - the default
470 * location of the public filehandle.
471 */
472 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
473 dvp = ZONE_ROOTVP();
474 VN_HOLD(dvp);
475 } else {
476 dvp = nfs_fhtovp(fhp, exi);
477 if (dvp == NULL) {
478 dr->dr_status = NFSERR_STALE;
479 return;
480 }
481 }
482
483 exi_hold(exi);
484
485 /*
486 * Not allow lookup beyond root.
487 * If the filehandle matches a filehandle of the exi,
488 * then the ".." refers beyond the root of an exported filesystem.
489 */
490 if (strcmp(da->da_name, "..") == 0 &&
491 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
492 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
493 (dvp->v_flag & VROOT)) {
494 /*
495 * special case for ".." and 'nohide'exported root
496 */
497 if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
498 error = NFSERR_ACCES;
499 goto out;
500 }
501 } else {
502 error = NFSERR_NOENT;
503 goto out;
504 }
505 }
506
507 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
508 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
509 MAXPATHLEN);
510
511 if (name == NULL) {
512 error = NFSERR_ACCES;
513 goto out;
514 }
515
516 /*
517 * If the public filehandle is used then allow
518 * a multi-component lookup, i.e. evaluate
519 * a pathname and follow symbolic links if
520 * necessary.
521 *
522 * This may result in a vnode in another filesystem
523 * which is OK as long as the filesystem is exported.
524 */
525 if (PUBLIC_FH2(fhp)) {
526 publicfh_flag = TRUE;
527
528 exi_rele(exi);
529
530 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
531 &sec);
532 } else {
533 /*
534 * Do a normal single component lookup.
535 */
536 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
537 NULL, NULL, NULL);
538 }
539
540 if (name != da->da_name)
541 kmem_free(name, MAXPATHLEN);
542
543 if (error == 0 && vn_ismntpt(vp)) {
544 error = rfs_cross_mnt(&vp, &exi);
545 if (error)
546 VN_RELE(vp);
547 }
548
549 if (!error) {
550 va.va_mask = AT_ALL; /* we want everything */
551
552 error = rfs4_delegated_getattr(vp, &va, 0, cr);
553
554 /* check for overflows */
555 if (!error) {
556 acl_perm(vp, exi, &va, cr);
557 error = vattr_to_nattr(&va, &dr->dr_attr);
558 if (!error) {
559 if (sec.sec_flags & SEC_QUERY)
560 error = makefh_ol(&dr->dr_fhandle, exi,
561 sec.sec_index);
562 else {
563 error = makefh(&dr->dr_fhandle, vp,
564 exi);
565 if (!error && publicfh_flag &&
566 !chk_clnt_sec(exi, req))
567 auth_weak = TRUE;
568 }
569 }
570 }
571 VN_RELE(vp);
572 }
573
574 out:
575 VN_RELE(dvp);
576
577 if (exi != NULL)
578 exi_rele(exi);
579
580 /*
581 * If it's public fh, no 0x81, and client's flavor is
582 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
583 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
584 */
585 if (auth_weak)
586 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
587 else
588 dr->dr_status = puterrno(error);
589 }
/*
 * Dispatch helper: return the directory filehandle of a LOOKUP call.
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
595
596 /*
597 * Read symbolic link.
598 * Returns the string in the symbolic link at the given fhandle.
599 */
600 /* ARGSUSED */
601 void
602 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
603 struct svc_req *req, cred_t *cr, bool_t ro)
604 {
605 int error;
606 struct iovec iov;
607 struct uio uio;
608 vnode_t *vp;
609 struct vattr va;
610 struct sockaddr *ca;
611 char *name = NULL;
612 int is_referral = 0;
613
614 vp = nfs_fhtovp(fhp, exi);
615 if (vp == NULL) {
616 rl->rl_data = NULL;
617 rl->rl_status = NFSERR_STALE;
618 return;
619 }
620
621 va.va_mask = AT_MODE;
622
623 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
624
625 if (error) {
626 VN_RELE(vp);
627 rl->rl_data = NULL;
628 rl->rl_status = puterrno(error);
629 return;
630 }
631
632 if (MANDLOCK(vp, va.va_mode)) {
633 VN_RELE(vp);
634 rl->rl_data = NULL;
635 rl->rl_status = NFSERR_ACCES;
636 return;
637 }
638
639 /* We lied about the object type for a referral */
640 if (vn_is_nfs_reparse(vp, cr))
641 is_referral = 1;
642
643 /*
644 * XNFS and RFC1094 require us to return ENXIO if argument
645 * is not a link. BUGID 1138002.
646 */
647 if (vp->v_type != VLNK && !is_referral) {
648 VN_RELE(vp);
649 rl->rl_data = NULL;
650 rl->rl_status = NFSERR_NXIO;
651 return;
652 }
653
654 /*
655 * Allocate data for pathname. This will be freed by rfs_rlfree.
656 */
657 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
658
659 if (is_referral) {
660 char *s;
661 size_t strsz;
662
663 /* Get an artificial symlink based on a referral */
664 s = build_symlink(vp, cr, &strsz);
665 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
666 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
667 vnode_t *, vp, char *, s);
668 if (s == NULL)
669 error = EINVAL;
670 else {
671 error = 0;
672 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
673 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
674 kmem_free(s, strsz);
675 }
676
677 } else {
678
679 /*
680 * Set up io vector to read sym link data
681 */
682 iov.iov_base = rl->rl_data;
683 iov.iov_len = NFS_MAXPATHLEN;
684 uio.uio_iov = &iov;
685 uio.uio_iovcnt = 1;
686 uio.uio_segflg = UIO_SYSSPACE;
687 uio.uio_extflg = UIO_COPY_CACHED;
688 uio.uio_loffset = (offset_t)0;
689 uio.uio_resid = NFS_MAXPATHLEN;
690
691 /*
692 * Do the readlink.
693 */
694 error = VOP_READLINK(vp, &uio, cr, NULL);
695
696 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
697
698 if (!error)
699 rl->rl_data[rl->rl_count] = '\0';
700
701 }
702
703
704 VN_RELE(vp);
705
706 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
707 name = nfscmd_convname(ca, exi, rl->rl_data,
708 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
709
710 if (name != NULL && name != rl->rl_data) {
711 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
712 rl->rl_data = name;
713 }
714
715 /*
716 * XNFS and RFC1094 require us to return ENXIO if argument
717 * is not a link. UFS returns EINVAL if this is the case,
718 * so we do the mapping here. BUGID 1138002.
719 */
720 if (error == EINVAL)
721 rl->rl_status = NFSERR_NXIO;
722 else
723 rl->rl_status = puterrno(error);
724
725 }
/*
 * Dispatch helper: return the filehandle argument of a READLINK call.
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
731 /*
732 * Free data allocated by rfs_readlink
733 */
734 void
735 rfs_rlfree(struct nfsrdlnres *rl)
736 {
737 if (rl->rl_data != NULL)
738 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
739 }
740
741 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
742
743 /*
744 * Read data.
745 * Returns some data read from the file at the given fhandle.
746 */
747 /* ARGSUSED */
748 void
749 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
750 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
751 {
752 vnode_t *vp;
753 int error;
754 struct vattr va;
755 struct iovec iov;
756 struct uio uio;
757 mblk_t *mp;
758 int alloc_err = 0;
759 int in_crit = 0;
760 caller_context_t ct;
761
762 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
763 if (vp == NULL) {
764 rr->rr_data = NULL;
765 rr->rr_status = NFSERR_STALE;
766 return;
767 }
768
769 if (vp->v_type != VREG) {
770 VN_RELE(vp);
771 rr->rr_data = NULL;
772 rr->rr_status = NFSERR_ISDIR;
773 return;
774 }
775
776 ct.cc_sysid = 0;
777 ct.cc_pid = 0;
778 ct.cc_caller_id = nfs2_srv_caller_id;
779 ct.cc_flags = CC_DONTBLOCK;
780
781 /*
782 * Enter the critical region before calling VOP_RWLOCK
783 * to avoid a deadlock with write requests.
784 */
785 if (nbl_need_check(vp)) {
786 nbl_start_crit(vp, RW_READER);
787 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
788 0, NULL)) {
789 nbl_end_crit(vp);
790 VN_RELE(vp);
791 rr->rr_data = NULL;
792 rr->rr_status = NFSERR_ACCES;
793 return;
794 }
795 in_crit = 1;
796 }
797
798 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
799
800 /* check if a monitor detected a delegation conflict */
801 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
802 if (in_crit)
803 nbl_end_crit(vp);
804 VN_RELE(vp);
805 /* mark as wouldblock so response is dropped */
806 curthread->t_flag |= T_WOULDBLOCK;
807
808 rr->rr_data = NULL;
809 return;
810 }
811
812 va.va_mask = AT_ALL;
813
814 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
815
816 if (error) {
817 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
818 if (in_crit)
819 nbl_end_crit(vp);
820
821 VN_RELE(vp);
822 rr->rr_data = NULL;
823 rr->rr_status = puterrno(error);
824
825 return;
826 }
827
828 /*
829 * This is a kludge to allow reading of files created
830 * with no read permission. The owner of the file
831 * is always allowed to read it.
832 */
833 if (crgetuid(cr) != va.va_uid) {
834 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
835
836 if (error) {
837 /*
838 * Exec is the same as read over the net because
839 * of demand loading.
840 */
841 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
842 }
843 if (error) {
844 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
845 if (in_crit)
846 nbl_end_crit(vp);
847 VN_RELE(vp);
848 rr->rr_data = NULL;
849 rr->rr_status = puterrno(error);
850
851 return;
852 }
853 }
854
855 if (MANDLOCK(vp, va.va_mode)) {
856 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
857 if (in_crit)
858 nbl_end_crit(vp);
859
860 VN_RELE(vp);
861 rr->rr_data = NULL;
862 rr->rr_status = NFSERR_ACCES;
863
864 return;
865 }
866
867 rr->rr_ok.rrok_wlist_len = 0;
868 rr->rr_ok.rrok_wlist = NULL;
869
870 if ((u_offset_t)ra->ra_offset >= va.va_size) {
871 rr->rr_count = 0;
872 rr->rr_data = NULL;
873 /*
874 * In this case, status is NFS_OK, but there is no data
875 * to encode. So set rr_mp to NULL.
876 */
877 rr->rr_mp = NULL;
878 rr->rr_ok.rrok_wlist = ra->ra_wlist;
879 if (rr->rr_ok.rrok_wlist)
880 clist_zero_len(rr->rr_ok.rrok_wlist);
881 goto done;
882 }
883
884 if (ra->ra_wlist) {
885 mp = NULL;
886 rr->rr_mp = NULL;
887 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
888 if (ra->ra_count > iov.iov_len) {
889 rr->rr_data = NULL;
890 rr->rr_status = NFSERR_INVAL;
891 goto done;
892 }
893 } else {
894 /*
895 * mp will contain the data to be sent out in the read reply.
896 * This will be freed after the reply has been sent out (by the
897 * driver).
898 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
899 * that the call to xdrmblk_putmblk() never fails.
900 */
901 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
902 &alloc_err);
903 ASSERT(mp != NULL);
904 ASSERT(alloc_err == 0);
905
906 rr->rr_mp = mp;
907
908 /*
909 * Set up io vector
910 */
911 iov.iov_base = (caddr_t)mp->b_datap->db_base;
912 iov.iov_len = ra->ra_count;
913 }
914
915 uio.uio_iov = &iov;
916 uio.uio_iovcnt = 1;
917 uio.uio_segflg = UIO_SYSSPACE;
918 uio.uio_extflg = UIO_COPY_CACHED;
919 uio.uio_loffset = (offset_t)ra->ra_offset;
920 uio.uio_resid = ra->ra_count;
921
922 error = VOP_READ(vp, &uio, 0, cr, &ct);
923
924 if (error) {
925 if (mp)
926 freeb(mp);
927
928 /*
929 * check if a monitor detected a delegation conflict and
930 * mark as wouldblock so response is dropped
931 */
932 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
933 curthread->t_flag |= T_WOULDBLOCK;
934 else
935 rr->rr_status = puterrno(error);
936
937 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
938 if (in_crit)
939 nbl_end_crit(vp);
940
941 VN_RELE(vp);
942 rr->rr_data = NULL;
943
944 return;
945 }
946
947 /*
948 * Get attributes again so we can send the latest access
949 * time to the client side for its cache.
950 */
951 va.va_mask = AT_ALL;
952
953 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
954
955 if (error) {
956 if (mp)
957 freeb(mp);
958
959 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
960 if (in_crit)
961 nbl_end_crit(vp);
962
963 VN_RELE(vp);
964 rr->rr_data = NULL;
965 rr->rr_status = puterrno(error);
966
967 return;
968 }
969
970 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
971
972 if (mp) {
973 rr->rr_data = (char *)mp->b_datap->db_base;
974 } else {
975 if (ra->ra_wlist) {
976 rr->rr_data = (caddr_t)iov.iov_base;
977 if (!rdma_setup_read_data2(ra, rr)) {
978 rr->rr_data = NULL;
979 rr->rr_status = puterrno(NFSERR_INVAL);
980 }
981 }
982 }
983 done:
984 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
985 if (in_crit)
986 nbl_end_crit(vp);
987
988 acl_perm(vp, exi, &va, cr);
989
990 /* check for overflows */
991 error = vattr_to_nattr(&va, &rr->rr_attr);
992
993 VN_RELE(vp);
994
995 rr->rr_status = puterrno(error);
996 }
997
998 /*
999 * Free data allocated by rfs_read
1000 */
1001 void
1002 rfs_rdfree(struct nfsrdresult *rr)
1003 {
1004 mblk_t *mp;
1005
1006 if (rr->rr_status == NFS_OK) {
1007 mp = rr->rr_mp;
1008 if (mp != NULL)
1009 freeb(mp);
1010 }
1011 }
1012
/*
 * Dispatch helper: return the filehandle embedded in READ arguments.
 */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
1018
/* Iovec entries kept on the stack; longer mblk chains use kmem_alloc */
#define MAX_IOVECS 12

#ifdef DEBUG
/* Counters: write requests that fit the stack iovec array vs. not */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1025
1026 /*
1027 * Write data to file.
1028 * Returns attributes of a file after writing some data to it.
1029 *
1030 * Any changes made here, especially in error handling might have
1031 * to also be done in rfs_write (which clusters write requests).
1032 */
1033 /* ARGSUSED */
1034 void
1035 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1036 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1037 {
1038 int error;
1039 vnode_t *vp;
1040 rlim64_t rlimit;
1041 struct vattr va;
1042 struct uio uio;
1043 struct iovec iov[MAX_IOVECS];
1044 mblk_t *m;
1045 struct iovec *iovp;
1046 int iovcnt;
1047 cred_t *savecred;
1048 int in_crit = 0;
1049 caller_context_t ct;
1050
1051 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1052 if (vp == NULL) {
1053 ns->ns_status = NFSERR_STALE;
1054 return;
1055 }
1056
1057 if (rdonly(ro, vp)) {
1058 VN_RELE(vp);
1059 ns->ns_status = NFSERR_ROFS;
1060 return;
1061 }
1062
1063 if (vp->v_type != VREG) {
1064 VN_RELE(vp);
1065 ns->ns_status = NFSERR_ISDIR;
1066 return;
1067 }
1068
1069 ct.cc_sysid = 0;
1070 ct.cc_pid = 0;
1071 ct.cc_caller_id = nfs2_srv_caller_id;
1072 ct.cc_flags = CC_DONTBLOCK;
1073
1074 va.va_mask = AT_UID|AT_MODE;
1075
1076 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1077
1078 if (error) {
1079 VN_RELE(vp);
1080 ns->ns_status = puterrno(error);
1081
1082 return;
1083 }
1084
1085 if (crgetuid(cr) != va.va_uid) {
1086 /*
1087 * This is a kludge to allow writes of files created
1088 * with read only permission. The owner of the file
1089 * is always allowed to write it.
1090 */
1091 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1092
1093 if (error) {
1094 VN_RELE(vp);
1095 ns->ns_status = puterrno(error);
1096 return;
1097 }
1098 }
1099
1100 /*
1101 * Can't access a mandatory lock file. This might cause
1102 * the NFS service thread to block forever waiting for a
1103 * lock to be released that will never be released.
1104 */
1105 if (MANDLOCK(vp, va.va_mode)) {
1106 VN_RELE(vp);
1107 ns->ns_status = NFSERR_ACCES;
1108 return;
1109 }
1110
1111 /*
1112 * We have to enter the critical region before calling VOP_RWLOCK
1113 * to avoid a deadlock with ufs.
1114 */
1115 if (nbl_need_check(vp)) {
1116 nbl_start_crit(vp, RW_READER);
1117 in_crit = 1;
1118 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1119 wa->wa_count, 0, NULL)) {
1120 error = EACCES;
1121 goto out;
1122 }
1123 }
1124
1125 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1126
1127 /* check if a monitor detected a delegation conflict */
1128 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1129 goto out;
1130 }
1131
1132 if (wa->wa_data || wa->wa_rlist) {
1133 /* Do the RDMA thing if necessary */
1134 if (wa->wa_rlist) {
1135 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1136 iov[0].iov_len = wa->wa_count;
1137 } else {
1138 iov[0].iov_base = wa->wa_data;
1139 iov[0].iov_len = wa->wa_count;
1140 }
1141 uio.uio_iov = iov;
1142 uio.uio_iovcnt = 1;
1143 uio.uio_segflg = UIO_SYSSPACE;
1144 uio.uio_extflg = UIO_COPY_DEFAULT;
1145 uio.uio_loffset = (offset_t)wa->wa_offset;
1146 uio.uio_resid = wa->wa_count;
1147 /*
1148 * The limit is checked on the client. We
1149 * should allow any size writes here.
1150 */
1151 uio.uio_llimit = curproc->p_fsz_ctl;
1152 rlimit = uio.uio_llimit - wa->wa_offset;
1153 if (rlimit < (rlim64_t)uio.uio_resid)
1154 uio.uio_resid = (uint_t)rlimit;
1155
1156 /*
1157 * for now we assume no append mode
1158 */
1159 /*
1160 * We're changing creds because VM may fault and we need
1161 * the cred of the current thread to be used if quota
1162 * checking is enabled.
1163 */
1164 savecred = curthread->t_cred;
1165 curthread->t_cred = cr;
1166 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1167 curthread->t_cred = savecred;
1168 } else {
1169
1170 iovcnt = 0;
1171 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1172 iovcnt++;
1173 if (iovcnt <= MAX_IOVECS) {
1174 #ifdef DEBUG
1175 rfs_write_sync_hits++;
1176 #endif
1177 iovp = iov;
1178 } else {
1179 #ifdef DEBUG
1180 rfs_write_sync_misses++;
1181 #endif
1182 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1183 }
1184 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1185 uio.uio_iov = iovp;
1186 uio.uio_iovcnt = iovcnt;
1187 uio.uio_segflg = UIO_SYSSPACE;
1188 uio.uio_extflg = UIO_COPY_DEFAULT;
1189 uio.uio_loffset = (offset_t)wa->wa_offset;
1190 uio.uio_resid = wa->wa_count;
1191 /*
1192 * The limit is checked on the client. We
1193 * should allow any size writes here.
1194 */
1195 uio.uio_llimit = curproc->p_fsz_ctl;
1196 rlimit = uio.uio_llimit - wa->wa_offset;
1197 if (rlimit < (rlim64_t)uio.uio_resid)
1198 uio.uio_resid = (uint_t)rlimit;
1199
1200 /*
1201 * For now we assume no append mode.
1202 */
1203 /*
1204 * We're changing creds because VM may fault and we need
1205 * the cred of the current thread to be used if quota
1206 * checking is enabled.
1207 */
1208 savecred = curthread->t_cred;
1209 curthread->t_cred = cr;
1210 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1211 curthread->t_cred = savecred;
1212
1213 if (iovp != iov)
1214 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1215 }
1216
1217 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1218
1219 if (!error) {
1220 /*
1221 * Get attributes again so we send the latest mod
1222 * time to the client side for its cache.
1223 */
1224 va.va_mask = AT_ALL; /* now we want everything */
1225
1226 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1227
1228 /* check for overflows */
1229 if (!error) {
1230 acl_perm(vp, exi, &va, cr);
1231 error = vattr_to_nattr(&va, &ns->ns_attr);
1232 }
1233 }
1234
1235 out:
1236 if (in_crit)
1237 nbl_end_crit(vp);
1238 VN_RELE(vp);
1239
1240 /* check if a monitor detected a delegation conflict */
1241 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1242 /* mark as wouldblock so response is dropped */
1243 curthread->t_flag |= T_WOULDBLOCK;
1244 else
1245 ns->ns_status = puterrno(error);
1246
1247 }
1248
/*
 * One NFSv2 WRITE request parked on a write cluster; the thread that
 * owns the cluster performs the combined write and the waiters learn
 * completion when their ns_status leaves RFSWRITE_INITVAL.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* write arguments from the client */
	struct nfsattrstat *ns;		/* reply; also the completion flag */
	struct svc_req *req;
	cred_t *cr;
	bool_t ro;			/* export mounted read-only */
	kthread_t *thread;		/* requesting service thread */
	struct rfs_async_write *list;	/* next request in this cluster */
};
1258
/*
 * A cluster of pending WRITEs to a single file (identified by fhp),
 * linked on the zone's async_write_head list.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file all entries write to */
	kcondvar_t cv;			/* broadcast when the writes finish */
	struct rfs_async_write *list;	/* requests, sorted by offset */
	struct rfs_async_write_list *next;	/* cluster for another file */
};
1265
/*
 * NOTE(review): these file-scope globals appear superseded by the
 * per-zone state in nfs_srv_t (async_write_lock, async_write_head,
 * write_async), which is what rfs_write() actually consults; confirm
 * no remaining users before removing them.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1; /* enables write clustering if == 1 */

/* Iovec entries kept on the stack by rfs_write's clustered path */
#define MAXCLIOVECS 42
/* Sentinel: reply not yet filled in (0 would read as NFS_OK) */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* Counters: clustered writes that fit the stack iovec array vs. not */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1277
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * WRITE requests against the same file handle are gathered into a
 * "cluster" so that contiguous requests can be issued with a single
 * VOP_WRITE.  The first thread to start a cluster becomes responsible
 * for processing every request queued on it; the other threads simply
 * queue their request and sleep on the cluster's condition variable
 * until their response status has been filled in.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];	/* on-stack iovecs for common case */
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;	/* this thread's request entry */
	struct rfs_async_write_list nlpsp;	/* cluster head, if we start one */
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;
	nfs_srv_t *nsrv;

	/* Per-zone server state; clustering can be disabled per zone. */
	nsrv = zone_getspecific(rfs_zone_key, curzone);
	if (!nsrv->write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	/* Stack entries are handed to other threads; we must not be swapped. */
	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		/* Insertion sort by starting offset. */
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/* Sleep until the cluster owner fills in our status. */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/* Unlink the cluster and fail every queued request. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		/* Extend the cluster's byte range to cover this request. */
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			/*
			 * Stop growing the run at the first request that
			 * errored out or is not byte-contiguous with the
			 * previous one.  lrp ends up one past the run.
			 */
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				/*
				 * Walk the mblk chain, clamping the last
				 * fragment so no more than wa_count bytes
				 * are gathered.
				 */
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota * checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/* Fail any stragglers (e.g. flush errors), then wake all waiters. */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}
1778
1779 void *
1780 rfs_write_getfh(struct nfswriteargs *wa)
1781 {
1782 return (&wa->wa_fhandle);
1783 }
1784
1785 /*
1786 * Create a file.
1787 * Creates a file with given attributes and returns those attributes
1788 * and an fhandle for the new file.
1789 */
1790 void
1791 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1792 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1793 {
1794 int error;
1795 int lookuperr;
1796 int in_crit = 0;
1797 struct vattr va;
1798 vnode_t *vp;
1799 vnode_t *realvp;
1800 vnode_t *dvp;
1801 char *name = args->ca_da.da_name;
1802 vnode_t *tvp = NULL;
1803 int mode;
1804 int lookup_ok;
1805 bool_t trunc;
1806 struct sockaddr *ca;
1807
1808 /*
1809 * Disallow NULL paths
1810 */
1811 if (name == NULL || *name == '\0') {
1812 dr->dr_status = NFSERR_ACCES;
1813 return;
1814 }
1815
1816 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1817 if (dvp == NULL) {
1818 dr->dr_status = NFSERR_STALE;
1819 return;
1820 }
1821
1822 error = sattr_to_vattr(args->ca_sa, &va);
1823 if (error) {
1824 dr->dr_status = puterrno(error);
1825 return;
1826 }
1827
1828 /*
1829 * Must specify the mode.
1830 */
1831 if (!(va.va_mask & AT_MODE)) {
1832 VN_RELE(dvp);
1833 dr->dr_status = NFSERR_INVAL;
1834 return;
1835 }
1836
1837 /*
1838 * This is a completely gross hack to make mknod
1839 * work over the wire until we can wack the protocol
1840 */
1841 if ((va.va_mode & IFMT) == IFCHR) {
1842 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1843 va.va_type = VFIFO; /* xtra kludge for named pipe */
1844 else {
1845 va.va_type = VCHR;
1846 /*
1847 * uncompress the received dev_t
1848 * if the top half is zero indicating a request
1849 * from an `older style' OS.
1850 */
1851 if ((va.va_size & 0xffff0000) == 0)
1852 va.va_rdev = nfsv2_expdev(va.va_size);
1853 else
1854 va.va_rdev = (dev_t)va.va_size;
1855 }
1856 va.va_mask &= ~AT_SIZE;
1857 } else if ((va.va_mode & IFMT) == IFBLK) {
1858 va.va_type = VBLK;
1859 /*
1860 * uncompress the received dev_t
1861 * if the top half is zero indicating a request
1862 * from an `older style' OS.
1863 */
1864 if ((va.va_size & 0xffff0000) == 0)
1865 va.va_rdev = nfsv2_expdev(va.va_size);
1866 else
1867 va.va_rdev = (dev_t)va.va_size;
1868 va.va_mask &= ~AT_SIZE;
1869 } else if ((va.va_mode & IFMT) == IFSOCK) {
1870 va.va_type = VSOCK;
1871 } else {
1872 va.va_type = VREG;
1873 }
1874 va.va_mode &= ~IFMT;
1875 va.va_mask |= AT_TYPE;
1876
1877 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1878 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1879 MAXPATHLEN);
1880 if (name == NULL) {
1881 dr->dr_status = puterrno(EINVAL);
1882 return;
1883 }
1884
1885 /*
1886 * Why was the choice made to use VWRITE as the mode to the
1887 * call to VOP_CREATE ? This results in a bug. When a client
1888 * opens a file that already exists and is RDONLY, the second
1889 * open fails with an EACESS because of the mode.
1890 * bug ID 1054648.
1891 */
1892 lookup_ok = 0;
1893 mode = VWRITE;
1894 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1895 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1896 NULL, NULL, NULL);
1897 if (!error) {
1898 struct vattr at;
1899
1900 lookup_ok = 1;
1901 at.va_mask = AT_MODE;
1902 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1903 if (!error)
1904 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1905 VN_RELE(tvp);
1906 tvp = NULL;
1907 }
1908 }
1909
1910 if (!lookup_ok) {
1911 if (rdonly(ro, dvp)) {
1912 error = EROFS;
1913 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1914 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1915 error = EPERM;
1916 } else {
1917 error = 0;
1918 }
1919 }
1920
1921 /*
1922 * If file size is being modified on an already existing file
1923 * make sure that there are no conflicting non-blocking mandatory
1924 * locks in the region being manipulated. Return EACCES if there
1925 * are conflicting locks.
1926 */
1927 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1928 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1929 NULL, NULL, NULL);
1930
1931 if (!lookuperr &&
1932 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1933 VN_RELE(tvp);
1934 curthread->t_flag |= T_WOULDBLOCK;
1935 goto out;
1936 }
1937
1938 if (!lookuperr && nbl_need_check(tvp)) {
1939 /*
1940 * The file exists. Now check if it has any
1941 * conflicting non-blocking mandatory locks
1942 * in the region being changed.
1943 */
1944 struct vattr bva;
1945 u_offset_t offset;
1946 ssize_t length;
1947
1948 nbl_start_crit(tvp, RW_READER);
1949 in_crit = 1;
1950
1951 bva.va_mask = AT_SIZE;
1952 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1953 if (!error) {
1954 if (va.va_size < bva.va_size) {
1955 offset = va.va_size;
1956 length = bva.va_size - va.va_size;
1957 } else {
1958 offset = bva.va_size;
1959 length = va.va_size - bva.va_size;
1960 }
1961 if (length) {
1962 if (nbl_conflict(tvp, NBL_WRITE,
1963 offset, length, 0, NULL)) {
1964 error = EACCES;
1965 }
1966 }
1967 }
1968 if (error) {
1969 nbl_end_crit(tvp);
1970 VN_RELE(tvp);
1971 in_crit = 0;
1972 }
1973 } else if (tvp != NULL) {
1974 VN_RELE(tvp);
1975 }
1976 }
1977
1978 if (!error) {
1979 /*
1980 * If filesystem is shared with nosuid the remove any
1981 * setuid/setgid bits on create.
1982 */
1983 if (va.va_type == VREG &&
1984 exi->exi_export.ex_flags & EX_NOSUID)
1985 va.va_mode &= ~(VSUID | VSGID);
1986
1987 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1988 NULL, NULL);
1989
1990 if (!error) {
1991
1992 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1993 trunc = TRUE;
1994 else
1995 trunc = FALSE;
1996
1997 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1998 VN_RELE(vp);
1999 curthread->t_flag |= T_WOULDBLOCK;
2000 goto out;
2001 }
2002 va.va_mask = AT_ALL;
2003
2004 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2005
2006 /* check for overflows */
2007 if (!error) {
2008 acl_perm(vp, exi, &va, cr);
2009 error = vattr_to_nattr(&va, &dr->dr_attr);
2010 if (!error) {
2011 error = makefh(&dr->dr_fhandle, vp,
2012 exi);
2013 }
2014 }
2015 /*
2016 * Force modified metadata out to stable storage.
2017 *
2018 * if a underlying vp exists, pass it to VOP_FSYNC
2019 */
2020 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2021 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2022 else
2023 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2024 VN_RELE(vp);
2025 }
2026
2027 if (in_crit) {
2028 nbl_end_crit(tvp);
2029 VN_RELE(tvp);
2030 }
2031 }
2032
2033 /*
2034 * Force modified data and metadata out to stable storage.
2035 */
2036 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2037
2038 out:
2039
2040 VN_RELE(dvp);
2041
2042 dr->dr_status = puterrno(error);
2043
2044 if (name != args->ca_da.da_name)
2045 kmem_free(name, MAXPATHLEN);
2046 }
2047 void *
2048 rfs_create_getfh(struct nfscreatargs *args)
2049 {
2050 return (args->ca_da.da_fhandle);
2051 }
2052
/*
 * Remove a file.
 * Remove named file from parent directory.
 *
 * The target is looked up first so that NFSv4 delegations and
 * non-blocking mandatory (nbmand) share reservations can be checked
 * before the actual VOP_REMOVE.
 */
/* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* parent directory */
	vnode_t *targvp;	/* file being removed */
	int in_crit = 0;

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Hold the nbmand critical region across the remove. */
	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2138
2139 void *
2140 rfs_remove_getfh(struct nfsdiropargs *da)
2141 {
2142 return (da->da_fhandle);
2143 }
2144
/*
 * rename a file
 * Give a file (from) a new name (to).
 *
 * Both directories must belong to the same export (NFSv2 has no
 * cross-filesystem rename); delegation and nbmand conflicts on both
 * the source and any existing target are resolved before VOP_RENAME.
 */
/* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* file being renamed */
	vnode_t *targvp;	/* existing file at the target name, if any */
	int in_crit = 0;

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/* The target directory must be in the same export as the source. */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		/* drop the request; client retries after delegation recall */
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	/* Hold the nbmand critical region on the source across the rename. */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Keep the cached v_path of the renamed vnode up to date. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2288 void *
2289 rfs_rename_getfh(struct nfsrnmargs *args)
2290 {
2291 return (args->rna_from.da_fhandle);
2292 }
2293
2294 /*
2295 * Link to a file.
2296 * Create a file (to) which is a hard link to the given file (from).
2297 */
2298 /* ARGSUSED */
2299 void
2300 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2301 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2302 {
2303 int error;
2304 vnode_t *fromvp;
2305 vnode_t *tovp;
2306 struct exportinfo *to_exi;
2307 fhandle_t *fh;
2308
2309 fromvp = nfs_fhtovp(args->la_from, exi);
2310 if (fromvp == NULL) {
2311 *status = NFSERR_STALE;
2312 return;
2313 }
2314
2315 fh = args->la_to.da_fhandle;
2316 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2317 if (to_exi == NULL) {
2318 VN_RELE(fromvp);
2319 *status = NFSERR_ACCES;
2320 return;
2321 }
2322 exi_rele(to_exi);
2323
2324 if (to_exi != exi) {
2325 VN_RELE(fromvp);
2326 *status = NFSERR_XDEV;
2327 return;
2328 }
2329
2330 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2331 if (tovp == NULL) {
2332 VN_RELE(fromvp);
2333 *status = NFSERR_STALE;
2334 return;
2335 }
2336
2337 if (tovp->v_type != VDIR) {
2338 VN_RELE(tovp);
2339 VN_RELE(fromvp);
2340 *status = NFSERR_NOTDIR;
2341 return;
2342 }
2343 /*
2344 * Disallow NULL paths
2345 */
2346 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2347 VN_RELE(tovp);
2348 VN_RELE(fromvp);
2349 *status = NFSERR_ACCES;
2350 return;
2351 }
2352
2353 if (rdonly(ro, tovp)) {
2354 VN_RELE(tovp);
2355 VN_RELE(fromvp);
2356 *status = NFSERR_ROFS;
2357 return;
2358 }
2359
2360 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2361
2362 /*
2363 * Force modified data and metadata out to stable storage.
2364 */
2365 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2366 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2367
2368 VN_RELE(tovp);
2369 VN_RELE(fromvp);
2370
2371 *status = puterrno(error);
2372
2373 }
2374 void *
2375 rfs_link_getfh(struct nfslinkargs *args)
2376 {
2377 return (args->la_from);
2378 }
2379
2380 /*
2381 * Symbolicly link to a file.
2382 * Create a file (to) with the given attributes which is a symbolic link
2383 * to the given path name (to).
2384 */
2385 void
2386 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2387 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2388 {
2389 int error;
2390 struct vattr va;
2391 vnode_t *vp;
2392 vnode_t *svp;
2393 int lerror;
2394 struct sockaddr *ca;
2395 char *name = NULL;
2396
2397 /*
2398 * Disallow NULL paths
2399 */
2400 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2401 *status = NFSERR_ACCES;
2402 return;
2403 }
2404
2405 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2406 if (vp == NULL) {
2407 *status = NFSERR_STALE;
2408 return;
2409 }
2410
2411 if (rdonly(ro, vp)) {
2412 VN_RELE(vp);
2413 *status = NFSERR_ROFS;
2414 return;
2415 }
2416
2417 error = sattr_to_vattr(args->sla_sa, &va);
2418 if (error) {
2419 VN_RELE(vp);
2420 *status = puterrno(error);
2421 return;
2422 }
2423
2424 if (!(va.va_mask & AT_MODE)) {
2425 VN_RELE(vp);
2426 *status = NFSERR_INVAL;
2427 return;
2428 }
2429
2430 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2431 name = nfscmd_convname(ca, exi, args->sla_tnm,
2432 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2433
2434 if (name == NULL) {
2435 *status = NFSERR_ACCES;
2436 return;
2437 }
2438
2439 va.va_type = VLNK;
2440 va.va_mask |= AT_TYPE;
2441
2442 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2443
2444 /*
2445 * Force new data and metadata out to stable storage.
2446 */
2447 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2448 NULL, cr, NULL, NULL, NULL);
2449
2450 if (!lerror) {
2451 (void) VOP_FSYNC(svp, 0, cr, NULL);
2452 VN_RELE(svp);
2453 }
2454
2455 /*
2456 * Force modified data and metadata out to stable storage.
2457 */
2458 (void) VOP_FSYNC(vp, 0, cr, NULL);
2459
2460 VN_RELE(vp);
2461
2462 *status = puterrno(error);
2463 if (name != args->sla_tnm)
2464 kmem_free(name, MAXPATHLEN);
2465
2466 }
2467 void *
2468 rfs_symlink_getfh(struct nfsslargs *args)
2469 {
2470 return (args->sla_from.da_fhandle);
2471 }
2472
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
/* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* The mode is mandatory for directory creation. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attribtutes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL;	/* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is passed the parent
			 * vnode (vp) while va holds the new directory's
			 * (dvp's) attributes; rfs_create() passes the
			 * created vnode here — confirm whether dvp was
			 * intended.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2560 void *
2561 rfs_mkdir_getfh(struct nfscreatargs *args)
2562 {
2563 return (args->ca_da.da_fhandle);
2564 }
2565
2566 /*
2567 * Remove a directory.
2568 * Remove the given directory name from the given parent directory.
2569 */
2570 /* ARGSUSED */
2571 void
2572 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2573 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2574 {
2575 int error;
2576 vnode_t *vp;
2577
2578 /*
2579 * Disallow NULL paths
2580 */
2581 if (da->da_name == NULL || *da->da_name == '\0') {
2582 *status = NFSERR_ACCES;
2583 return;
2584 }
2585
2586 vp = nfs_fhtovp(da->da_fhandle, exi);
2587 if (vp == NULL) {
2588 *status = NFSERR_STALE;
2589 return;
2590 }
2591
2592 if (rdonly(ro, vp)) {
2593 VN_RELE(vp);
2594 *status = NFSERR_ROFS;
2595 return;
2596 }
2597
2598 /*
2599 * VOP_RMDIR takes a third argument (the current
2600 * directory of the process). That's because someone
2601 * wants to return EINVAL if one tries to remove ".".
2602 * Of course, NFS servers have no idea what their
2603 * clients' current directories are. We fake it by
2604 * supplying a vnode known to exist and illegal to
2605 * remove.
2606 */
2607 error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2608
2609 /*
2610 * Force modified data and metadata out to stable storage.
2611 */
2612 (void) VOP_FSYNC(vp, 0, cr, NULL);
2613
2614 VN_RELE(vp);
2615
2616 /*
2617 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2618 * if the directory is not empty. A System V NFS server
2619 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2620 * over the wire.
2621 */
2622 if (error == EEXIST)
2623 *status = NFSERR_NOTEMPTY;
2624 else
2625 *status = puterrno(error);
2626
2627 }
2628 void *
2629 rfs_rmdir_getfh(struct nfsdiropargs *da)
2630 {
2631 return (da->da_fhandle);
2632 }
2633
2634 /* ARGSUSED */
2635 void
2636 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2637 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2638 {
2639 int error;
2640 int iseof;
2641 struct iovec iov;
2642 struct uio uio;
2643 vnode_t *vp;
2644 char *ndata = NULL;
2645 struct sockaddr *ca;
2646 size_t nents;
2647 int ret;
2648
2649 vp = nfs_fhtovp(&rda->rda_fh, exi);
2650 if (vp == NULL) {
2651 rd->rd_entries = NULL;
2652 rd->rd_status = NFSERR_STALE;
2653 return;
2654 }
2655
2656 if (vp->v_type != VDIR) {
2657 VN_RELE(vp);
2658 rd->rd_entries = NULL;
2659 rd->rd_status = NFSERR_NOTDIR;
2660 return;
2661 }
2662
2663 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2664
2665 error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2666
2667 if (error) {
2668 rd->rd_entries = NULL;
2669 goto bad;
2670 }
2671
2672 if (rda->rda_count == 0) {
2673 rd->rd_entries = NULL;
2674 rd->rd_size = 0;
2675 rd->rd_eof = FALSE;
2676 goto bad;
2677 }
2678
2679 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2680
2681 /*
2682 * Allocate data for entries. This will be freed by rfs_rddirfree.
2683 */
2684 rd->rd_bufsize = (uint_t)rda->rda_count;
2685 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2686
2687 /*
2688 * Set up io vector to read directory data
2689 */
2690 iov.iov_base = (caddr_t)rd->rd_entries;
2691 iov.iov_len = rda->rda_count;
2692 uio.uio_iov = &iov;
2693 uio.uio_iovcnt = 1;
2694 uio.uio_segflg = UIO_SYSSPACE;
2695 uio.uio_extflg = UIO_COPY_CACHED;
2696 uio.uio_loffset = (offset_t)rda->rda_offset;
2697 uio.uio_resid = rda->rda_count;
2698
2699 /*
2700 * read directory
2701 */
2702 error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2703
2704 /*
2705 * Clean up
2706 */
2707 if (!error) {
2708 /*
2709 * set size and eof
2710 */
2711 if (uio.uio_resid == rda->rda_count) {
2712 rd->rd_size = 0;
2713 rd->rd_eof = TRUE;
2714 } else {
2715 rd->rd_size = (uint32_t)(rda->rda_count -
2716 uio.uio_resid);
2717 rd->rd_eof = iseof ? TRUE : FALSE;
2718 }
2719 }
2720
2721 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2722 nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2723 ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2724 rda->rda_count, &ndata);
2725
2726 if (ret != 0) {
2727 size_t dropbytes;
2728 /*
2729 * We had to drop one or more entries in order to fit
2730 * during the character conversion. We need to patch
2731 * up the size and eof info.
2732 */
2733 if (rd->rd_eof)
2734 rd->rd_eof = FALSE;
2735 dropbytes = nfscmd_dropped_entrysize(
2736 (struct dirent64 *)rd->rd_entries, nents, ret);
2737 rd->rd_size -= dropbytes;
2738 }
2739 if (ndata == NULL) {
2740 ndata = (char *)rd->rd_entries;
2741 } else if (ndata != (char *)rd->rd_entries) {
2742 kmem_free(rd->rd_entries, rd->rd_bufsize);
2743 rd->rd_entries = (void *)ndata;
2744 rd->rd_bufsize = rda->rda_count;
2745 }
2746
2747 bad:
2748 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2749
2750 #if 0 /* notyet */
2751 /*
2752 * Don't do this. It causes local disk writes when just
2753 * reading the file and the overhead is deemed larger
2754 * than the benefit.
2755 */
2756 /*
2757 * Force modified metadata out to stable storage.
2758 */
2759 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2760 #endif
2761
2762 VN_RELE(vp);
2763
2764 rd->rd_status = puterrno(error);
2765
2766 }
2767 void *
2768 rfs_readdir_getfh(struct nfsrddirargs *rda)
2769 {
2770 return (&rda->rda_fh);
2771 }
2772 void
2773 rfs_rddirfree(struct nfsrddirres *rd)
2774 {
2775 if (rd->rd_entries != NULL)
2776 kmem_free(rd->rd_entries, rd->rd_bufsize);
2777 }
2778
2779 /* ARGSUSED */
2780 void
2781 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2782 struct svc_req *req, cred_t *cr, bool_t ro)
2783 {
2784 int error;
2785 struct statvfs64 sb;
2786 vnode_t *vp;
2787
2788 vp = nfs_fhtovp(fh, exi);
2789 if (vp == NULL) {
2790 fs->fs_status = NFSERR_STALE;
2791 return;
2792 }
2793
2794 error = VFS_STATVFS(vp->v_vfsp, &sb);
2795
2796 if (!error) {
2797 fs->fs_tsize = nfstsize();
2798 fs->fs_bsize = sb.f_frsize;
2799 fs->fs_blocks = sb.f_blocks;
2800 fs->fs_bfree = sb.f_bfree;
2801 fs->fs_bavail = sb.f_bavail;
2802 }
2803
2804 VN_RELE(vp);
2805
2806 fs->fs_status = puterrno(error);
2807
2808 }
2809 void *
2810 rfs_statfs_getfh(fhandle_t *fh)
2811 {
2812 return (fh);
2813 }
2814
/*
 * Convert the over-the-wire NFSv2 settable attributes ("sa") into a
 * vattr ("vap") for the VFS layer.  A wire value of all-ones (-1)
 * means "do not change this attribute" and the corresponding bit is
 * left out of va_mask.  Returns 0 on success, or EOVERFLOW (32-bit
 * kernels only) when a wire time cannot fit in time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short. When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both tv_sec and tv_usec must be set for the time to count. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		/* wire times are microseconds; vattr wants nanoseconds */
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		/* wire times are microseconds; vattr wants nanoseconds */
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2881
/*
 * Map vnode types (vtype_t, used as the index) to over-the-wire
 * NFSv2 file types.  Types with no NFSv2 equivalent map to 0.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2885
2886 /*
2887 * check the following fields for overflow: nodeid, size, and time.
2888 * There could be a problem when converting 64-bit LP64 fields
2889 * into 32-bit ones. Return an error if there is an overflow.
2890 */
2891 int
2892 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2893 {
2894 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2895 na->na_type = vt_to_nf[vap->va_type];
2896
2897 if (vap->va_mode == (unsigned short) -1)
2898 na->na_mode = (uint32_t)-1;
2899 else
2900 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2901
2902 if (vap->va_uid == (unsigned short)(-1))
2903 na->na_uid = (uint32_t)(-1);
2904 else if (vap->va_uid == UID_NOBODY)
2905 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2906 else
2907 na->na_uid = vap->va_uid;
2908
2909 if (vap->va_gid == (unsigned short)(-1))
2910 na->na_gid = (uint32_t)-1;
2911 else if (vap->va_gid == GID_NOBODY)
2912 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2913 else
2914 na->na_gid = vap->va_gid;
2915
2916 /*
2917 * Do we need to check fsid for overflow? It is 64-bit in the
2918 * vattr, but are bigger than 32 bit values supported?
2919 */
2920 na->na_fsid = vap->va_fsid;
2921
2922 na->na_nodeid = vap->va_nodeid;
2923
2924 /*
2925 * Check to make sure that the nodeid is representable over the
2926 * wire without losing bits.
2927 */
2928 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2929 return (EFBIG);
2930 na->na_nlink = vap->va_nlink;
2931
2932 /*
2933 * Check for big files here, instead of at the caller. See
2934 * comments in cstat for large special file explanation.
2935 */
2936 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2937 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2938 return (EFBIG);
2939 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2940 /* UNKNOWN_SIZE | OVERFLOW */
2941 na->na_size = MAXOFF32_T;
2942 } else
2943 na->na_size = vap->va_size;
2944 } else
2945 na->na_size = vap->va_size;
2946
2947 /*
2948 * If the vnode times overflow the 32-bit times that NFS2
2949 * uses on the wire then return an error.
2950 */
2951 if (!NFS_VAP_TIME_OK(vap)) {
2952 return (EOVERFLOW);
2953 }
2954 na->na_atime.tv_sec = vap->va_atime.tv_sec;
2955 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2956
2957 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2958 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2959
2960 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2961 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2962
2963 /*
2964 * If the dev_t will fit into 16 bits then compress
2965 * it, otherwise leave it alone. See comments in
2966 * nfs_client.c.
2967 */
2968 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2969 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2970 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2971 else
2972 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2973
2974 na->na_blocks = vap->va_nblocks;
2975 na->na_blocksize = vap->va_blksize;
2976
2977 /*
2978 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2979 * over-the-wire protocols for named-pipe vnodes. It remaps the
2980 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2981 *
2982 * BUYER BEWARE:
2983 * If you are porting the NFS to a non-Sun server, you probably
2984 * don't want to include the following block of code. The
2985 * over-the-wire special file types will be changing with the
2986 * NFS Protocol Revision.
2987 */
2988 if (vap->va_type == VFIFO)
2989 NA_SETFIFO(na);
2990 return (0);
2991 }
2992
2993 /*
2994 * acl v2 support: returns approximate permission.
2995 * default: returns minimal permission (more restrictive)
2996 * aclok: returns maximal permission (less restrictive)
2997 * This routine changes the permissions that are alaredy in *va.
2998 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2999 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3000 */
3001 static void
3002 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3003 {
3004 vsecattr_t vsa;
3005 int aclcnt;
3006 aclent_t *aclentp;
3007 mode_t mask_perm;
3008 mode_t grp_perm;
3009 mode_t other_perm;
3010 mode_t other_orig;
3011 int error;
3012
3013 /* dont care default acl */
3014 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3015 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3016
3017 if (!error) {
3018 aclcnt = vsa.vsa_aclcnt;
3019 if (aclcnt > MIN_ACL_ENTRIES) {
3020 /* non-trivial ACL */
3021 aclentp = vsa.vsa_aclentp;
3022 if (exi->exi_export.ex_flags & EX_ACLOK) {
3023 /* maximal permissions */
3024 grp_perm = 0;
3025 other_perm = 0;
3026 for (; aclcnt > 0; aclcnt--, aclentp++) {
3027 switch (aclentp->a_type) {
3028 case USER_OBJ:
3029 break;
3030 case USER:
3031 grp_perm |=
3032 aclentp->a_perm << 3;
3033 other_perm |= aclentp->a_perm;
3034 break;
3035 case GROUP_OBJ:
3036 grp_perm |=
3037 aclentp->a_perm << 3;
3038 break;
3039 case GROUP:
3040 other_perm |= aclentp->a_perm;
3041 break;
3042 case OTHER_OBJ:
3043 other_orig = aclentp->a_perm;
3044 break;
3045 case CLASS_OBJ:
3046 mask_perm = aclentp->a_perm;
3047 break;
3048 default:
3049 break;
3050 }
3051 }
3052 grp_perm &= mask_perm << 3;
3053 other_perm &= mask_perm;
3054 other_perm |= other_orig;
3055
3056 } else {
3057 /* minimal permissions */
3058 grp_perm = 070;
3059 other_perm = 07;
3060 for (; aclcnt > 0; aclcnt--, aclentp++) {
3061 switch (aclentp->a_type) {
3062 case USER_OBJ:
3063 break;
3064 case USER:
3065 case CLASS_OBJ:
3066 grp_perm &=
3067 aclentp->a_perm << 3;
3068 other_perm &=
3069 aclentp->a_perm;
3070 break;
3071 case GROUP_OBJ:
3072 grp_perm &=
3073 aclentp->a_perm << 3;
3074 break;
3075 case GROUP:
3076 other_perm &=
3077 aclentp->a_perm;
3078 break;
3079 case OTHER_OBJ:
3080 other_perm &=
3081 aclentp->a_perm;
3082 break;
3083 default:
3084 break;
3085 }
3086 }
3087 }
3088 /* copy to va */
3089 va->va_mode &= ~077;
3090 va->va_mode |= grp_perm | other_perm;
3091 }
3092 if (vsa.vsa_aclcnt)
3093 kmem_free(vsa.vsa_aclentp,
3094 vsa.vsa_aclcnt * sizeof (aclent_t));
3095 }
3096 }
3097
/*
 * One-time NFSv2 server initialization: allocate the caller id used
 * for VOP calls and register the per-zone state constructor and
 * destructor with the zone framework.
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
	zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
}
3104
/*
 * Teardown counterpart of rfs_srvrinit().  Nothing to do here:
 * per-zone state is released by rfs_zone_fini() via the zone key
 * registered in rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
}
3109
3110 /* ARGSUSED */
3111 static void *
3112 rfs_zone_init(zoneid_t zoneid)
3113 {
3114 nfs_srv_t *ns;
3115
3116 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3117
3118 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3119 ns->write_async = 1;
3120
3121 return (ns);
3122 }
3123
3124 /* ARGSUSED */
3125 static void
3126 rfs_zone_fini(zoneid_t zoneid, void *data)
3127 {
3128 nfs_srv_t *ns;
3129
3130 ns = (nfs_srv_t *)data;
3131 mutex_destroy(&ns->async_write_lock);
3132 kmem_free(ns, sizeof (*ns));
3133 }
3134
3135 static int
3136 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3137 {
3138 struct clist *wcl;
3139 int wlist_len;
3140 uint32_t count = rr->rr_count;
3141
3142 wcl = ra->ra_wlist;
3143
3144 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3145 return (FALSE);
3146 }
3147
3148 wcl = ra->ra_wlist;
3149 rr->rr_ok.rrok_wlist_len = wlist_len;
3150 rr->rr_ok.rrok_wlist = wcl;
3151
3152 return (TRUE);
3153 }