1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*
  29  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  30  *      All rights reserved.
  31  */
  32 
  33 /*
  34  * Copyright 2018 Nexenta Systems, Inc.
  35  * Copyright (c) 2016 by Delphix. All rights reserved.
  36  */
  37 
  38 #include <sys/param.h>
  39 #include <sys/types.h>
  40 #include <sys/systm.h>
  41 #include <sys/cred.h>
  42 #include <sys/buf.h>
  43 #include <sys/vfs.h>
  44 #include <sys/vnode.h>
  45 #include <sys/uio.h>
  46 #include <sys/stat.h>
  47 #include <sys/errno.h>
  48 #include <sys/sysmacros.h>
  49 #include <sys/statvfs.h>
  50 #include <sys/kmem.h>
  51 #include <sys/kstat.h>
  52 #include <sys/dirent.h>
  53 #include <sys/cmn_err.h>
  54 #include <sys/debug.h>
  55 #include <sys/vtrace.h>
  56 #include <sys/mode.h>
  57 #include <sys/acl.h>
  58 #include <sys/nbmlock.h>
  59 #include <sys/policy.h>
  60 #include <sys/sdt.h>
  61 
  62 #include <rpc/types.h>
  63 #include <rpc/auth.h>
  64 #include <rpc/svc.h>
  65 
  66 #include <nfs/nfs.h>
  67 #include <nfs/export.h>
  68 #include <nfs/nfs_cmd.h>
  69 
  70 #include <vm/hat.h>
  71 #include <vm/as.h>
  72 #include <vm/seg.h>
  73 #include <vm/seg_map.h>
  74 #include <vm/seg_kmem.h>
  75 
  76 #include <sys/strsubr.h>
  77 
  78 struct rfs_async_write_list;
  79 
   80 /*
   81  * Per-zone globals of the NFSv2 server
   82  */
  83 typedef struct nfs_srv {
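        /*
         * List of in-progress clustered (async) write requests, chained
         * off async_write_head and serialized by async_write_lock.
         */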
  84         kmutex_t                        async_write_lock;
  85         struct rfs_async_write_list     *async_write_head;
  86 
  87         /*
  88          * enables write clustering if == 1
  89          */
  90         int             write_async;
  91 } nfs_srv_t;
  92 
  93 /*
  94  * These are the interface routines for the server side of the
  95  * Network File System.  See the NFS version 2 protocol specification
  96  * for a description of this interface.
  97  */
  98 
  99 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
 100 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
 101                         cred_t *);
 102 
 103 
 104 /*
 105  * Some "over the wire" UNIX file types.  These are encoded
 106  * into the mode.  This needs to be fixed in the next rev.
 107  */
 108 #define IFMT            0170000         /* type of file */
 109 #define IFCHR           0020000         /* character special */
 110 #define IFBLK           0060000         /* block special */
 111 #define IFSOCK          0140000         /* socket */
 112 
 113 u_longlong_t nfs2_srv_caller_id;
 114 
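/*
 * Return the per-zone NFSv2 server state for the current zone.
 */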
 115 static nfs_srv_t *
 116 nfs_get_srv(void)
 117 {
 118         nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
 119         nfs_srv_t *srv = ng->nfs_srv;
 120         ASSERT(srv != NULL);
 121         return (srv);
 122 }
 123 
 124 /*
 125  * Get file attributes.
 126  * Returns the current attributes of the file with the given fhandle.
 127  */
 128 /* ARGSUSED */
 129 void
 130 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 131     struct svc_req *req, cred_t *cr, bool_t ro)
 132 {
 133         int error;
 134         vnode_t *vp;
 135         struct vattr va;
 136 
 137         vp = nfs_fhtovp(fhp, exi);
 138         if (vp == NULL) {
 139                 ns->ns_status = NFSERR_STALE;
 140                 return;
 141         }
 142 
 143         /*
 144          * Do the getattr.
 145          */
 146         va.va_mask = AT_ALL;    /* we want all the attributes */
 147 
 148         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 149 
 150         /* check for overflows */
 151         if (!error) {
 152                 /* Lie about the object type for a referral */
 153                 if (vn_is_nfs_reparse(vp, cr))
 154                         va.va_type = VLNK;
 155 
 156                 acl_perm(vp, exi, &va, cr);
 157                 error = vattr_to_nattr(&va, &ns->ns_attr);
 158         }
 159 
 160         VN_RELE(vp);
 161 
 162         ns->ns_status = puterrno(error);
 163 }
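
/*
 * Each rfs_*_getfh() routine below simply returns a pointer to the
 * filehandle embedded in its decoded arguments, so that common
 * dispatch code can locate the filehandle for a request.
 */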
 164 void *
 165 rfs_getattr_getfh(fhandle_t *fhp)
 166 {
 167         return (fhp);
 168 }
 169 
 170 /*
 171  * Set file attributes.
 172  * Sets the attributes of the file with the given fhandle.  Returns
 173  * the new attributes.
 174  */
 175 /* ARGSUSED */
 176 void
 177 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
 178     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 179 {
 180         int error;
 181         int flag;
 182         int in_crit = 0;
 183         vnode_t *vp;
 184         struct vattr va;
 185         struct vattr bva;
 186         struct flock64 bf;
 187         caller_context_t ct;
 188 
 189 
 190         vp = nfs_fhtovp(&args->saa_fh, exi);
 191         if (vp == NULL) {
 192                 ns->ns_status = NFSERR_STALE;
 193                 return;
 194         }
 195 
 196         if (rdonly(ro, vp)) {
 197                 VN_RELE(vp);
 198                 ns->ns_status = NFSERR_ROFS;
 199                 return;
 200         }
 201 
 202         error = sattr_to_vattr(&args->saa_sa, &va);
 203         if (error) {
 204                 VN_RELE(vp);
 205                 ns->ns_status = puterrno(error);
 206                 return;
 207         }
 208 
 209         /*
 210          * If the client is requesting a change to the mtime,
 211          * but the nanosecond field is set to 1 billion, then
 212          * this is a flag to the server that it should set the
 213          * atime and mtime fields to the server's current time.
 214          * The 1 billion number actually came from the client
 215          * as 1 million, but the units in the over the wire
 216          * request are microseconds instead of nanoseconds.
 217          *
 218          * This is an overload of the protocol and should be
 219          * documented in the NFS Version 2 protocol specification.
 220          */
 221         if (va.va_mask & AT_MTIME) {
 222                 if (va.va_mtime.tv_nsec == 1000000000) {
 223                         gethrestime(&va.va_mtime);
 224                         va.va_atime = va.va_mtime;
 225                         va.va_mask |= AT_ATIME;
 226                         flag = 0;
 227                 } else
 228                         flag = ATTR_UTIME;
 229         } else
 230                 flag = 0;
 231 
 232         /*
 233          * If the filesystem is exported with nosuid, then mask off
 234          * the setuid and setgid bits.
 235          */
 236         if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
 237             (exi->exi_export.ex_flags & EX_NOSUID))
 238                 va.va_mode &= ~(VSUID | VSGID);
 239 
 240         ct.cc_sysid = 0;
 241         ct.cc_pid = 0;
 242         ct.cc_caller_id = nfs2_srv_caller_id;
 243         ct.cc_flags = CC_DONTBLOCK;
 244 
 245         /*
 246          * We need to specially handle size changes because it is
 247          * possible for the client to create a file with modes
 248          * which indicate read-only, but with the file opened for
 249          * writing.  If the client then tries to set the size of
 250          * the file, then the normal access checking done in
 251          * VOP_SETATTR would prevent the client from doing so,
 252          * although it should be legal for it to do so.  To get
 253          * around this, we do the access checking for ourselves
 254          * and then use VOP_SPACE which doesn't do the access
  255          * checking that VOP_SETATTR does.  VOP_SPACE can only
  256          * operate on VREG files, so let VOP_SETATTR handle the
  257          * other, extremely rare, cases.
 258          * Also the client should not be allowed to change the
 259          * size of the file if there is a conflicting non-blocking
 260          * mandatory lock in the region of change.
 261          */
 262         if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
 263                 if (nbl_need_check(vp)) {
 264                         nbl_start_crit(vp, RW_READER);
 265                         in_crit = 1;
 266                 }
 267 
 268                 bva.va_mask = AT_UID | AT_SIZE;
 269 
 270                 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
 271 
 272                 if (error) {
 273                         if (in_crit)
 274                                 nbl_end_crit(vp);
 275                         VN_RELE(vp);
 276                         ns->ns_status = puterrno(error);
 277                         return;
 278                 }
 279 
 280                 if (in_crit) {
 281                         u_offset_t offset;
 282                         ssize_t length;
 283 
 284                         if (va.va_size < bva.va_size) {
 285                                 offset = va.va_size;
 286                                 length = bva.va_size - va.va_size;
 287                         } else {
 288                                 offset = bva.va_size;
 289                                 length = va.va_size - bva.va_size;
 290                         }
 291                         if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
 292                             NULL)) {
 293                                 error = EACCES;
 294                         }
 295                 }
 296 
 297                 if (crgetuid(cr) == bva.va_uid && !error &&
 298                     va.va_size != bva.va_size) {
 299                         va.va_mask &= ~AT_SIZE;
 300                         bf.l_type = F_WRLCK;
 301                         bf.l_whence = 0;
 302                         bf.l_start = (off64_t)va.va_size;
 303                         bf.l_len = 0;
 304                         bf.l_sysid = 0;
 305                         bf.l_pid = 0;
 306 
 307                         error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
 308                             (offset_t)va.va_size, cr, &ct);
 309                 }
 310                 if (in_crit)
 311                         nbl_end_crit(vp);
 312         } else
 313                 error = 0;
 314 
 315         /*
 316          * Do the setattr.
 317          */
 318         if (!error && va.va_mask) {
 319                 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
 320         }
 321 
 322         /*
 323          * check if the monitor on either vop_space or vop_setattr detected
 324          * a delegation conflict and if so, mark the thread flag as
 325          * wouldblock so that the response is dropped and the client will
 326          * try again.
 327          */
 328         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 329                 VN_RELE(vp);
 330                 curthread->t_flag |= T_WOULDBLOCK;
 331                 return;
 332         }
 333 
 334         if (!error) {
 335                 va.va_mask = AT_ALL;    /* get everything */
 336 
 337                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 338 
 339                 /* check for overflows */
 340                 if (!error) {
 341                         acl_perm(vp, exi, &va, cr);
 342                         error = vattr_to_nattr(&va, &ns->ns_attr);
 343                 }
 344         }
 345 
 346         ct.cc_flags = 0;
 347 
 348         /*
 349          * Force modified metadata out to stable storage.
 350          */
 351         (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
 352 
 353         VN_RELE(vp);
 354 
 355         ns->ns_status = puterrno(error);
 356 }
 357 void *
 358 rfs_setattr_getfh(struct nfssaargs *args)
 359 {
 360         return (&args->saa_fh);
 361 }
 362 
  363 /* Change and release @exip and @vpp only on success */
 364 int
 365 rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
 366 {
 367         struct exportinfo *exi;
 368         vnode_t *vp = *vpp;
 369         fid_t fid;
 370         int error;
 371 
 372         VN_HOLD(vp);
 373 
 374         if ((error = traverse(&vp)) != 0) {
 375                 VN_RELE(vp);
 376                 return (error);
 377         }
 378 
 379         bzero(&fid, sizeof (fid));
 380         fid.fid_len = MAXFIDSZ;
 381         error = VOP_FID(vp, &fid, NULL);
 382         if (error) {
 383                 VN_RELE(vp);
 384                 return (error);
 385         }
 386 
 387         exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
 388         if (exi == NULL ||
 389             (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
  390                 /*
  391                  * Not an error:  the subdirectory is simply not
  392                  * exported, or "nohide" is not set.
  393                  */
 394                 if (exi != NULL)
 395                         exi_rele(exi);
 396                 VN_RELE(vp);
 397         } else {
 398                 /* go to submount */
 399                 exi_rele(*exip);
 400                 *exip = exi;
 401 
 402                 VN_RELE(*vpp);
 403                 *vpp = vp;
 404         }
 405 
 406         return (0);
 407 }
 408 
  409 /*
  410  * Given a mounted "dvp" and "exi", climb to the covering (upper)
  411  * mountpoint, correcting dvp and exi accordingly.
  412  * Returns 0 on success.
  413  */
 414 int
 415 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
 416 {
 417         struct exportinfo *exi;
 418         vnode_t *dvp = *dvpp;
 419 
 420         ASSERT(dvp->v_flag & VROOT);
 421 
 422         VN_HOLD(dvp);
 423         dvp = untraverse(dvp);
 424         exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
 425         if (exi == NULL) {
 426                 VN_RELE(dvp);
 427                 return (-1);
 428         }
 429 
 430         exi_rele(*exip);
 431         *exip = exi;
 432         VN_RELE(*dvpp);
 433         *dvpp = dvp;
 434 
 435         return (0);
 436 }
 437 /*
 438  * Directory lookup.
 439  * Returns an fhandle and file attributes for file name in a directory.
 440  */
 441 /* ARGSUSED */
 442 void
 443 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 444     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 445 {
 446         int error;
 447         vnode_t *dvp;
 448         vnode_t *vp;
 449         struct vattr va;
 450         fhandle_t *fhp = da->da_fhandle;
 451         struct sec_ol sec = {0, 0};
 452         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 453         char *name;
 454         struct sockaddr *ca;
 455 
  456         /*
  457          * Trusted Extensions doesn't support NFSv2. MOUNT
  458          * will reject v2 clients. We need to prevent v2 client
  459          * access via WebNFS here.
  460          */
 461         if (is_system_labeled() && req->rq_vers == 2) {
 462                 dr->dr_status = NFSERR_ACCES;
 463                 return;
 464         }
 465 
 466         /*
 467          * Disallow NULL paths
 468          */
 469         if (da->da_name == NULL || *da->da_name == '\0') {
 470                 dr->dr_status = NFSERR_ACCES;
 471                 return;
 472         }
 473 
 474         /*
 475          * Allow lookups from the root - the default
 476          * location of the public filehandle.
 477          */
 478         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 479                 dvp = ZONE_ROOTVP();
 480                 VN_HOLD(dvp);
 481         } else {
 482                 dvp = nfs_fhtovp(fhp, exi);
 483                 if (dvp == NULL) {
 484                         dr->dr_status = NFSERR_STALE;
 485                         return;
 486                 }
 487         }
 488 
 489         exi_hold(exi);
 490 
 491         /*
  492          * Do not allow lookups beyond the root.
 493          * If the filehandle matches a filehandle of the exi,
 494          * then the ".." refers beyond the root of an exported filesystem.
 495          */
 496         if (strcmp(da->da_name, "..") == 0 &&
 497             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 498                 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
 499                     (dvp->v_flag & VROOT)) {
 500                         /*
  501                          * Special case for ".." and a 'nohide' exported root.
 502                          */
 503                         if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
 504                                 error = NFSERR_ACCES;
 505                                 goto out;
 506                         }
 507                 } else  {
 508                         error = NFSERR_NOENT;
 509                         goto out;
 510                 }
 511         }
 512 
 513         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 514         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 515             MAXPATHLEN);
 516 
 517         if (name == NULL) {
 518                 error = NFSERR_ACCES;
 519                 goto out;
 520         }
 521 
 522         /*
 523          * If the public filehandle is used then allow
 524          * a multi-component lookup, i.e. evaluate
 525          * a pathname and follow symbolic links if
 526          * necessary.
 527          *
 528          * This may result in a vnode in another filesystem
 529          * which is OK as long as the filesystem is exported.
 530          */
 531         if (PUBLIC_FH2(fhp)) {
 532                 publicfh_flag = TRUE;
 533 
  534                 exi_rele(exi);
                      exi = NULL;
 535 
 536                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 537                     &sec);
 538         } else {
 539                 /*
 540                  * Do a normal single component lookup.
 541                  */
 542                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 543                     NULL, NULL, NULL);
 544         }
 545 
 546         if (name != da->da_name)
 547                 kmem_free(name, MAXPATHLEN);
 548 
 549         if (error == 0 && vn_ismntpt(vp)) {
 550                 error = rfs_cross_mnt(&vp, &exi);
 551                 if (error)
 552                         VN_RELE(vp);
 553         }
 554 
 555         if (!error) {
 556                 va.va_mask = AT_ALL;    /* we want everything */
 557 
 558                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 559 
 560                 /* check for overflows */
 561                 if (!error) {
 562                         acl_perm(vp, exi, &va, cr);
 563                         error = vattr_to_nattr(&va, &dr->dr_attr);
 564                         if (!error) {
 565                                 if (sec.sec_flags & SEC_QUERY)
 566                                         error = makefh_ol(&dr->dr_fhandle, exi,
 567                                             sec.sec_index);
 568                                 else {
 569                                         error = makefh(&dr->dr_fhandle, vp,
 570                                             exi);
 571                                         if (!error && publicfh_flag &&
 572                                             !chk_clnt_sec(exi, req))
 573                                                 auth_weak = TRUE;
 574                                 }
 575                         }
 576                 }
 577                 VN_RELE(vp);
 578         }
 579 
 580 out:
 581         VN_RELE(dvp);
 582 
 583         if (exi != NULL)
 584                 exi_rele(exi);
 585 
 586         /*
 587          * If it's public fh, no 0x81, and client's flavor is
 588          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 589          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 590          */
 591         if (auth_weak)
 592                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 593         else
 594                 dr->dr_status = puterrno(error);
 595 }
 596 void *
 597 rfs_lookup_getfh(struct nfsdiropargs *da)
 598 {
 599         return (da->da_fhandle);
 600 }
 601 
 602 /*
 603  * Read symbolic link.
 604  * Returns the string in the symbolic link at the given fhandle.
 605  */
 606 /* ARGSUSED */
 607 void
 608 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
 609     struct svc_req *req, cred_t *cr, bool_t ro)
 610 {
 611         int error;
 612         struct iovec iov;
 613         struct uio uio;
 614         vnode_t *vp;
 615         struct vattr va;
 616         struct sockaddr *ca;
 617         char *name = NULL;
 618         int is_referral = 0;
 619 
 620         vp = nfs_fhtovp(fhp, exi);
 621         if (vp == NULL) {
 622                 rl->rl_data = NULL;
 623                 rl->rl_status = NFSERR_STALE;
 624                 return;
 625         }
 626 
 627         va.va_mask = AT_MODE;
 628 
 629         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
 630 
 631         if (error) {
 632                 VN_RELE(vp);
 633                 rl->rl_data = NULL;
 634                 rl->rl_status = puterrno(error);
 635                 return;
 636         }
 637 
 638         if (MANDLOCK(vp, va.va_mode)) {
 639                 VN_RELE(vp);
 640                 rl->rl_data = NULL;
 641                 rl->rl_status = NFSERR_ACCES;
 642                 return;
 643         }
 644 
 645         /* We lied about the object type for a referral */
 646         if (vn_is_nfs_reparse(vp, cr))
 647                 is_referral = 1;
 648 
 649         /*
 650          * XNFS and RFC1094 require us to return ENXIO if argument
 651          * is not a link. BUGID 1138002.
 652          */
 653         if (vp->v_type != VLNK && !is_referral) {
 654                 VN_RELE(vp);
 655                 rl->rl_data = NULL;
 656                 rl->rl_status = NFSERR_NXIO;
 657                 return;
 658         }
 659 
 660         /*
 661          * Allocate data for pathname.  This will be freed by rfs_rlfree.
 662          */
 663         rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
 664 
 665         if (is_referral) {
 666                 char *s;
 667                 size_t strsz;
 668 
 669                 /* Get an artificial symlink based on a referral */
 670                 s = build_symlink(vp, cr, &strsz);
 671                 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
 672                 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
 673                     vnode_t *, vp, char *, s);
 674                 if (s == NULL)
 675                         error = EINVAL;
 676                 else {
 677                         error = 0;
 678                         (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
 679                         rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
 680                         kmem_free(s, strsz);
 681                 }
 682 
 683         } else {
 684 
 685                 /*
 686                  * Set up io vector to read sym link data
 687                  */
 688                 iov.iov_base = rl->rl_data;
 689                 iov.iov_len = NFS_MAXPATHLEN;
 690                 uio.uio_iov = &iov;
 691                 uio.uio_iovcnt = 1;
 692                 uio.uio_segflg = UIO_SYSSPACE;
 693                 uio.uio_extflg = UIO_COPY_CACHED;
 694                 uio.uio_loffset = (offset_t)0;
 695                 uio.uio_resid = NFS_MAXPATHLEN;
 696 
 697                 /*
 698                  * Do the readlink.
 699                  */
 700                 error = VOP_READLINK(vp, &uio, cr, NULL);
 701 
 702                 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
 703 
 704                 if (!error)
 705                         rl->rl_data[rl->rl_count] = '\0';
 706 
 707         }
 708 
 709 
 710         VN_RELE(vp);
 711 
 712         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 713         name = nfscmd_convname(ca, exi, rl->rl_data,
 714             NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
 715 
 716         if (name != NULL && name != rl->rl_data) {
 717                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 718                 rl->rl_data = name;
 719         }
 720 
 721         /*
 722          * XNFS and RFC1094 require us to return ENXIO if argument
 723          * is not a link. UFS returns EINVAL if this is the case,
 724          * so we do the mapping here. BUGID 1138002.
 725          */
 726         if (error == EINVAL)
 727                 rl->rl_status = NFSERR_NXIO;
 728         else
 729                 rl->rl_status = puterrno(error);
 730 
 731 }
 732 void *
 733 rfs_readlink_getfh(fhandle_t *fhp)
 734 {
 735         return (fhp);
 736 }
 737 /*
 738  * Free data allocated by rfs_readlink
 739  */
 740 void
 741 rfs_rlfree(struct nfsrdlnres *rl)
 742 {
 743         if (rl->rl_data != NULL)
 744                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 745 }
 746 
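/*
 * Helper, defined later in this file, that finishes a read reply when
 * the client supplied an RDMA write list (ra_wlist); it returns FALSE
 * on failure.
 */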
 747 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 748 
 749 /*
 750  * Read data.
 751  * Returns some data read from the file at the given fhandle.
 752  */
 753 /* ARGSUSED */
 754 void
 755 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 756     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 757 {
 758         vnode_t *vp;
 759         int error;
 760         struct vattr va;
 761         struct iovec iov;
 762         struct uio uio;
 763         mblk_t *mp;
 764         int alloc_err = 0;
 765         int in_crit = 0;
 766         caller_context_t ct;
 767 
 768         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 769         if (vp == NULL) {
 770                 rr->rr_data = NULL;
 771                 rr->rr_status = NFSERR_STALE;
 772                 return;
 773         }
 774 
 775         if (vp->v_type != VREG) {
 776                 VN_RELE(vp);
 777                 rr->rr_data = NULL;
 778                 rr->rr_status = NFSERR_ISDIR;
 779                 return;
 780         }
 781 
 782         ct.cc_sysid = 0;
 783         ct.cc_pid = 0;
 784         ct.cc_caller_id = nfs2_srv_caller_id;
 785         ct.cc_flags = CC_DONTBLOCK;
 786 
 787         /*
 788          * Enter the critical region before calling VOP_RWLOCK
 789          * to avoid a deadlock with write requests.
 790          */
 791         if (nbl_need_check(vp)) {
 792                 nbl_start_crit(vp, RW_READER);
 793                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 794                     0, NULL)) {
 795                         nbl_end_crit(vp);
 796                         VN_RELE(vp);
 797                         rr->rr_data = NULL;
 798                         rr->rr_status = NFSERR_ACCES;
 799                         return;
 800                 }
 801                 in_crit = 1;
 802         }
 803 
 804         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 805 
 806         /* check if a monitor detected a delegation conflict */
 807         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 808                 if (in_crit)
 809                         nbl_end_crit(vp);
 810                 VN_RELE(vp);
 811                 /* mark as wouldblock so response is dropped */
 812                 curthread->t_flag |= T_WOULDBLOCK;
 813 
 814                 rr->rr_data = NULL;
 815                 return;
 816         }
 817 
 818         va.va_mask = AT_ALL;
 819 
 820         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 821 
 822         if (error) {
 823                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 824                 if (in_crit)
 825                         nbl_end_crit(vp);
 826 
 827                 VN_RELE(vp);
 828                 rr->rr_data = NULL;
 829                 rr->rr_status = puterrno(error);
 830 
 831                 return;
 832         }
 833 
 834         /*
 835          * This is a kludge to allow reading of files created
 836          * with no read permission.  The owner of the file
 837          * is always allowed to read it.
 838          */
 839         if (crgetuid(cr) != va.va_uid) {
 840                 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 841 
 842                 if (error) {
 843                         /*
 844                          * Exec is the same as read over the net because
 845                          * of demand loading.
 846                          */
 847                         error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 848                 }
 849                 if (error) {
 850                         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 851                         if (in_crit)
 852                                 nbl_end_crit(vp);
 853                         VN_RELE(vp);
 854                         rr->rr_data = NULL;
 855                         rr->rr_status = puterrno(error);
 856 
 857                         return;
 858                 }
 859         }
 860 
 861         if (MANDLOCK(vp, va.va_mode)) {
 862                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 863                 if (in_crit)
 864                         nbl_end_crit(vp);
 865 
 866                 VN_RELE(vp);
 867                 rr->rr_data = NULL;
 868                 rr->rr_status = NFSERR_ACCES;
 869 
 870                 return;
 871         }
 872 
 873         rr->rr_ok.rrok_wlist_len = 0;
 874         rr->rr_ok.rrok_wlist = NULL;
 875 
 876         if ((u_offset_t)ra->ra_offset >= va.va_size) {
 877                 rr->rr_count = 0;
 878                 rr->rr_data = NULL;
 879                 /*
 880                  * In this case, status is NFS_OK, but there is no data
 881                  * to encode. So set rr_mp to NULL.
 882                  */
 883                 rr->rr_mp = NULL;
 884                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 885                 if (rr->rr_ok.rrok_wlist)
 886                         clist_zero_len(rr->rr_ok.rrok_wlist);
 887                 goto done;
 888         }
 889 
 890         if (ra->ra_wlist) {
 891                 mp = NULL;
 892                 rr->rr_mp = NULL;
 893                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 894                 if (ra->ra_count > iov.iov_len) {
 895                         rr->rr_data = NULL;
 896                         rr->rr_status = NFSERR_INVAL;
 897                         goto done;
 898                 }
 899         } else {
 900                 /*
 901                  * mp will contain the data to be sent out in the read reply.
 902                  * This will be freed after the reply has been sent out (by the
 903                  * driver).
  904                  * Let's round up the data to a BYTES_PER_XDR_UNIT multiple, so
 905                  * that the call to xdrmblk_putmblk() never fails.
 906                  */
 907                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 908                     &alloc_err);
 909                 ASSERT(mp != NULL);
 910                 ASSERT(alloc_err == 0);
 911 
 912                 rr->rr_mp = mp;
 913 
 914                 /*
 915                  * Set up io vector
 916                  */
 917                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 918                 iov.iov_len = ra->ra_count;
 919         }
 920 
 921         uio.uio_iov = &iov;
 922         uio.uio_iovcnt = 1;
 923         uio.uio_segflg = UIO_SYSSPACE;
 924         uio.uio_extflg = UIO_COPY_CACHED;
 925         uio.uio_loffset = (offset_t)ra->ra_offset;
 926         uio.uio_resid = ra->ra_count;
 927 
 928         error = VOP_READ(vp, &uio, 0, cr, &ct);
 929 
 930         if (error) {
 931                 if (mp)
 932                         freeb(mp);
 933 
 934                 /*
 935                  * check if a monitor detected a delegation conflict and
 936                  * mark as wouldblock so response is dropped
 937                  */
 938                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 939                         curthread->t_flag |= T_WOULDBLOCK;
 940                 else
 941                         rr->rr_status = puterrno(error);
 942 
 943                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 944                 if (in_crit)
 945                         nbl_end_crit(vp);
 946 
 947                 VN_RELE(vp);
 948                 rr->rr_data = NULL;
 949 
 950                 return;
 951         }
 952 
 953         /*
 954          * Get attributes again so we can send the latest access
 955          * time to the client side for its cache.
 956          */
 957         va.va_mask = AT_ALL;
 958 
 959         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 960 
 961         if (error) {
 962                 if (mp)
 963                         freeb(mp);
 964 
 965                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 966                 if (in_crit)
 967                         nbl_end_crit(vp);
 968 
 969                 VN_RELE(vp);
 970                 rr->rr_data = NULL;
 971                 rr->rr_status = puterrno(error);
 972 
 973                 return;
 974         }
 975 
 976         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 977 
 978         if (mp) {
 979                 rr->rr_data = (char *)mp->b_datap->db_base;
 980         } else {
 981                 if (ra->ra_wlist) {
 982                         rr->rr_data = (caddr_t)iov.iov_base;
 983                         if (!rdma_setup_read_data2(ra, rr)) {
 984                                 rr->rr_data = NULL;
 985                                 rr->rr_status = puterrno(NFSERR_INVAL);
 986                         }
 987                 }
 988         }
 989 done:
 990         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 991         if (in_crit)
 992                 nbl_end_crit(vp);
 993 
 994         acl_perm(vp, exi, &va, cr);
 995 
 996         /* check for overflows */
 997         error = vattr_to_nattr(&va, &rr->rr_attr);
 998 
 999         VN_RELE(vp);
1000 
1001         rr->rr_status = puterrno(error);
1002 }
1003 
1004 /*
1005  * Free data allocated by rfs_read
1006  */
1007 void
1008 rfs_rdfree(struct nfsrdresult *rr)
1009 {
1010         mblk_t *mp;
1011 
1012         if (rr->rr_status == NFS_OK) {
1013                 mp = rr->rr_mp;
1014                 if (mp != NULL)
1015                         freeb(mp);
1016         }
1017 }
1018 
1019 void *
1020 rfs_read_getfh(struct nfsreadargs *ra)
1021 {
1022         return (&ra->ra_fhandle);
1023 }
1024 
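/* Size of the on-stack iovec array used by rfs_write_sync(). */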
1025 #define MAX_IOVECS      12
1026 
1027 #ifdef DEBUG
1028 static int rfs_write_sync_hits = 0;
1029 static int rfs_write_sync_misses = 0;
1030 #endif
1031 
1032 /*
1033  * Write data to file.
1034  * Returns attributes of a file after writing some data to it.
1035  *
 1036  * Any changes made here, especially in error handling, might also
 1037  * have to be made in rfs_write (which clusters write requests).
1038  */
1039 /* ARGSUSED */
1040 void
1041 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1042     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1043 {
1044         int error;
1045         vnode_t *vp;
1046         rlim64_t rlimit;
1047         struct vattr va;
1048         struct uio uio;
1049         struct iovec iov[MAX_IOVECS];
1050         mblk_t *m;
1051         struct iovec *iovp;
1052         int iovcnt;
1053         cred_t *savecred;
1054         int in_crit = 0;
1055         caller_context_t ct;
1056 
1057         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1058         if (vp == NULL) {
1059                 ns->ns_status = NFSERR_STALE;
1060                 return;
1061         }
1062 
1063         if (rdonly(ro, vp)) {
1064                 VN_RELE(vp);
1065                 ns->ns_status = NFSERR_ROFS;
1066                 return;
1067         }
1068 
1069         if (vp->v_type != VREG) {
1070                 VN_RELE(vp);
1071                 ns->ns_status = NFSERR_ISDIR;
1072                 return;
1073         }
1074 
1075         ct.cc_sysid = 0;
1076         ct.cc_pid = 0;
1077         ct.cc_caller_id = nfs2_srv_caller_id;
1078         ct.cc_flags = CC_DONTBLOCK;
1079 
1080         va.va_mask = AT_UID|AT_MODE;
1081 
1082         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1083 
1084         if (error) {
1085                 VN_RELE(vp);
1086                 ns->ns_status = puterrno(error);
1087 
1088                 return;
1089         }
1090 
1091         if (crgetuid(cr) != va.va_uid) {
1092                 /*
1093                  * This is a kludge to allow writes of files created
1094                  * with read only permission.  The owner of the file
1095                  * is always allowed to write it.
1096                  */
1097                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1098 
1099                 if (error) {
1100                         VN_RELE(vp);
1101                         ns->ns_status = puterrno(error);
1102                         return;
1103                 }
1104         }
1105 
1106         /*
1107          * Can't access a mandatory lock file.  This might cause
1108          * the NFS service thread to block forever waiting for a
1109          * lock to be released that will never be released.
1110          */
1111         if (MANDLOCK(vp, va.va_mode)) {
1112                 VN_RELE(vp);
1113                 ns->ns_status = NFSERR_ACCES;
1114                 return;
1115         }
1116 
1117         /*
1118          * We have to enter the critical region before calling VOP_RWLOCK
1119          * to avoid a deadlock with ufs.
1120          */
1121         if (nbl_need_check(vp)) {
1122                 nbl_start_crit(vp, RW_READER);
1123                 in_crit = 1;
1124                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1125                     wa->wa_count, 0, NULL)) {
1126                         error = EACCES;
1127                         goto out;
1128                 }
1129         }
1130 
1131         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1132 
1133         /* check if a monitor detected a delegation conflict */
1134         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1135                 goto out;
1136         }
1137 
1138         if (wa->wa_data || wa->wa_rlist) {
1139                 /* Do the RDMA thing if necessary */
1140                 if (wa->wa_rlist) {
1141                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1142                         iov[0].iov_len = wa->wa_count;
1143                 } else  {
1144                         iov[0].iov_base = wa->wa_data;
1145                         iov[0].iov_len = wa->wa_count;
1146                 }
1147                 uio.uio_iov = iov;
1148                 uio.uio_iovcnt = 1;
1149                 uio.uio_segflg = UIO_SYSSPACE;
1150                 uio.uio_extflg = UIO_COPY_DEFAULT;
1151                 uio.uio_loffset = (offset_t)wa->wa_offset;
1152                 uio.uio_resid = wa->wa_count;
1153                 /*
1154                  * The limit is checked on the client. We
1155                  * should allow any size writes here.
1156                  */
1157                 uio.uio_llimit = curproc->p_fsz_ctl;
1158                 rlimit = uio.uio_llimit - wa->wa_offset;
1159                 if (rlimit < (rlim64_t)uio.uio_resid)
1160                         uio.uio_resid = (uint_t)rlimit;
1161 
1162                 /*
 1163                  * For now we assume no append mode.
1164                  */
1165                 /*
1166                  * We're changing creds because VM may fault and we need
1167                  * the cred of the current thread to be used if quota
1168                  * checking is enabled.
1169                  */
1170                 savecred = curthread->t_cred;
1171                 curthread->t_cred = cr;
1172                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1173                 curthread->t_cred = savecred;
1174         } else {
1175 
1176                 iovcnt = 0;
1177                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1178                         iovcnt++;
1179                 if (iovcnt <= MAX_IOVECS) {
1180 #ifdef DEBUG
1181                         rfs_write_sync_hits++;
1182 #endif
1183                         iovp = iov;
1184                 } else {
1185 #ifdef DEBUG
1186                         rfs_write_sync_misses++;
1187 #endif
1188                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1189                 }
1190                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1191                 uio.uio_iov = iovp;
1192                 uio.uio_iovcnt = iovcnt;
1193                 uio.uio_segflg = UIO_SYSSPACE;
1194                 uio.uio_extflg = UIO_COPY_DEFAULT;
1195                 uio.uio_loffset = (offset_t)wa->wa_offset;
1196                 uio.uio_resid = wa->wa_count;
1197                 /*
1198                  * The limit is checked on the client. We
1199                  * should allow any size writes here.
1200                  */
1201                 uio.uio_llimit = curproc->p_fsz_ctl;
1202                 rlimit = uio.uio_llimit - wa->wa_offset;
1203                 if (rlimit < (rlim64_t)uio.uio_resid)
1204                         uio.uio_resid = (uint_t)rlimit;
1205 
1206                 /*
1207                  * For now we assume no append mode.
1208                  */
1209                 /*
1210                  * We're changing creds because VM may fault and we need
1211                  * the cred of the current thread to be used if quota
1212                  * checking is enabled.
1213                  */
1214                 savecred = curthread->t_cred;
1215                 curthread->t_cred = cr;
1216                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1217                 curthread->t_cred = savecred;
1218 
1219                 if (iovp != iov)
1220                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1221         }
1222 
1223         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1224 
1225         if (!error) {
1226                 /*
1227                  * Get attributes again so we send the latest mod
1228                  * time to the client side for its cache.
1229                  */
1230                 va.va_mask = AT_ALL;    /* now we want everything */
1231 
1232                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1233 
1234                 /* check for overflows */
1235                 if (!error) {
1236                         acl_perm(vp, exi, &va, cr);
1237                         error = vattr_to_nattr(&va, &ns->ns_attr);
1238                 }
1239         }
1240 
1241 out:
1242         if (in_crit)
1243                 nbl_end_crit(vp);
1244         VN_RELE(vp);
1245 
1246         /* check if a monitor detected a delegation conflict */
1247         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1248                 /* mark as wouldblock so response is dropped */
1249                 curthread->t_flag |= T_WOULDBLOCK;
1250         else
1251                 ns->ns_status = puterrno(error);
1252 
1253 }
1254 
1255 struct rfs_async_write {
1256         struct nfswriteargs *wa;
1257         struct nfsattrstat *ns;
1258         struct svc_req *req;
1259         cred_t *cr;
1260         bool_t ro;
1261         kthread_t *thread;
1262         struct rfs_async_write *list;
1263 };
1264 
1265 struct rfs_async_write_list {
1266         fhandle_t *fhp;
1267         kcondvar_t cv;
1268         struct rfs_async_write *list;
1269         struct rfs_async_write_list *next;
1270 };
1271 
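/*
 * MAXCLIOVECS bounds the number of iovecs gathered for a single clustered
 * VOP_WRITE; RFSWRITE_INITVAL marks a request whose status has not yet
 * been decided (a value of 0 would read as NFS_OK).
 */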
1276 #define MAXCLIOVECS     42
1277 #define RFSWRITE_INITVAL (enum nfsstat) -1
1278 
1279 #ifdef DEBUG
1280 static int rfs_write_hits = 0;
1281 static int rfs_write_misses = 0;
1282 #endif
1283 
1284 /*
1285  * Write data to file.
1286  * Returns attributes of a file after writing some data to it.
1287  */
1288 void
1289 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1290     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1291 {
1292         int error;
1293         vnode_t *vp;
1294         rlim64_t rlimit;
1295         struct vattr va;
1296         struct uio uio;
1297         struct rfs_async_write_list *lp;
1298         struct rfs_async_write_list *nlp;
1299         struct rfs_async_write *rp;
1300         struct rfs_async_write *nrp;
1301         struct rfs_async_write *trp;
1302         struct rfs_async_write *lrp;
1303         int data_written;
1304         int iovcnt;
1305         mblk_t *m;
1306         struct iovec *iovp;
1307         struct iovec *niovp;
1308         struct iovec iov[MAXCLIOVECS];
1309         int count;
1310         int rcount;
1311         uint_t off;
1312         uint_t len;
1313         struct rfs_async_write nrpsp;
1314         struct rfs_async_write_list nlpsp;
1315         ushort_t t_flag;
1316         cred_t *savecred;
1317         int in_crit = 0;
1318         caller_context_t ct;
1319         nfs_srv_t *nsrv;
1320 
1321         nsrv = nfs_get_srv();
1322         if (!nsrv->write_async) {
1323                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1324                 return;
1325         }
1326 
1327         /*
 1328          * Initialize status to RFSWRITE_INITVAL instead of 0, since a
 1329          * value of 0 would be taken as NFS_OK.
1330          */
1331         ns->ns_status = RFSWRITE_INITVAL;
1332 
1333         nrp = &nrpsp;
1334         nrp->wa = wa;
1335         nrp->ns = ns;
1336         nrp->req = req;
1337         nrp->cr = cr;
1338         nrp->ro = ro;
1339         nrp->thread = curthread;
1340 
1341         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1342 
1343         /*
1344          * Look to see if there is already a cluster started
1345          * for this file.
1346          */
1347         mutex_enter(&nsrv->async_write_lock);
1348         for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1349                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1350                     sizeof (fhandle_t)) == 0)
1351                         break;
1352         }
1353 
1354         /*
1355          * If lp is non-NULL, then there is already a cluster
1356          * started.  We need to place ourselves in the cluster
1357          * list in the right place as determined by starting
1358          * offset.  Conflicts with non-blocking mandatory locked
1359          * regions will be checked when the cluster is processed.
1360          */
1361         if (lp != NULL) {
1362                 rp = lp->list;
1363                 trp = NULL;
1364                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1365                         trp = rp;
1366                         rp = rp->list;
1367                 }
1368                 nrp->list = rp;
1369                 if (trp == NULL)
1370                         lp->list = nrp;
1371                 else
1372                         trp->list = nrp;
1373                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1374                         cv_wait(&lp->cv, &nsrv->async_write_lock);
1375                 mutex_exit(&nsrv->async_write_lock);
1376 
1377                 return;
1378         }
1379 
1380         /*
1381          * No cluster started yet, start one and add ourselves
1382          * to the list of clusters.
1383          */
1384         nrp->list = NULL;
1385 
1386         nlp = &nlpsp;
1387         nlp->fhp = &wa->wa_fhandle;
1388         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1389         nlp->list = nrp;
1390         nlp->next = NULL;
1391 
1392         if (nsrv->async_write_head == NULL) {
1393                 nsrv->async_write_head = nlp;
1394         } else {
1395                 lp = nsrv->async_write_head;
1396                 while (lp->next != NULL)
1397                         lp = lp->next;
1398                 lp->next = nlp;
1399         }
1400         mutex_exit(&nsrv->async_write_lock);
1401 
1402         /*
1403          * Convert the file handle common to all of the requests
1404          * in this cluster to a vnode.
1405          */
1406         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1407         if (vp == NULL) {
1408                 mutex_enter(&nsrv->async_write_lock);
1409                 if (nsrv->async_write_head == nlp)
1410                         nsrv->async_write_head = nlp->next;
1411                 else {
1412                         lp = nsrv->async_write_head;
1413                         while (lp->next != nlp)
1414                                 lp = lp->next;
1415                         lp->next = nlp->next;
1416                 }
1417                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1418                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1419                         rp->ns->ns_status = NFSERR_STALE;
1420                         rp->thread->t_flag |= t_flag;
1421                 }
1422                 cv_broadcast(&nlp->cv);
1423                 mutex_exit(&nsrv->async_write_lock);
1424 
1425                 return;
1426         }
1427 
1428         /*
1429          * Can only write regular files.  Attempts to write any
1430          * other file types fail with EISDIR.
1431          */
1432         if (vp->v_type != VREG) {
1433                 VN_RELE(vp);
1434                 mutex_enter(&nsrv->async_write_lock);
1435                 if (nsrv->async_write_head == nlp)
1436                         nsrv->async_write_head = nlp->next;
1437                 else {
1438                         lp = nsrv->async_write_head;
1439                         while (lp->next != nlp)
1440                                 lp = lp->next;
1441                         lp->next = nlp->next;
1442                 }
1443                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1444                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1445                         rp->ns->ns_status = NFSERR_ISDIR;
1446                         rp->thread->t_flag |= t_flag;
1447                 }
1448                 cv_broadcast(&nlp->cv);
1449                 mutex_exit(&nsrv->async_write_lock);
1450 
1451                 return;
1452         }
1453 
1454         /*
1455          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1456          * deadlock with ufs.
1457          */
1458         if (nbl_need_check(vp)) {
1459                 nbl_start_crit(vp, RW_READER);
1460                 in_crit = 1;
1461         }
1462 
1463         ct.cc_sysid = 0;
1464         ct.cc_pid = 0;
1465         ct.cc_caller_id = nfs2_srv_caller_id;
1466         ct.cc_flags = CC_DONTBLOCK;
1467 
1468         /*
1469          * Lock the file for writing.  This operation provides
1470          * the delay which allows clusters to grow.
1471          */
1472         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1473 
1474         /* check if a monitor detected a delegation conflict */
1475         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1476                 if (in_crit)
1477                         nbl_end_crit(vp);
1478                 VN_RELE(vp);
1479                 /* mark as wouldblock so response is dropped */
1480                 curthread->t_flag |= T_WOULDBLOCK;
1481                 mutex_enter(&nsrv->async_write_lock);
1482                 if (nsrv->async_write_head == nlp)
1483                         nsrv->async_write_head = nlp->next;
1484                 else {
1485                         lp = nsrv->async_write_head;
1486                         while (lp->next != nlp)
1487                                 lp = lp->next;
1488                         lp->next = nlp->next;
1489                 }
1490                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1491                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1492                                 rp->ns->ns_status = puterrno(error);
1493                                 rp->thread->t_flag |= T_WOULDBLOCK;
1494                         }
1495                 }
1496                 cv_broadcast(&nlp->cv);
1497                 mutex_exit(&nsrv->async_write_lock);
1498 
1499                 return;
1500         }
1501 
1502         /*
1503          * Disconnect this cluster from the list of clusters.
1504          * The cluster that is being dealt with must be fixed
1505          * in size after this point, so there is no reason
1506          * to leave it on the list so that new requests can
1507          * find it.
1508          *
1509          * The algorithm is that the first write request will
1510          * create a cluster, convert the file handle to a
1511          * vnode pointer, and then lock the file for writing.
1512          * This request is not likely to be clustered with
1513          * any others.  However, the next request will create
1514          * a new cluster and be blocked in VOP_RWLOCK while
1515          * the first request is being processed.  This delay
1516          * will allow more requests to be clustered in this
1517          * second cluster.
1518          */
1519         mutex_enter(&nsrv->async_write_lock);
1520         if (nsrv->async_write_head == nlp)
1521                 nsrv->async_write_head = nlp->next;
1522         else {
1523                 lp = nsrv->async_write_head;
1524                 while (lp->next != nlp)
1525                         lp = lp->next;
1526                 lp->next = nlp->next;
1527         }
1528         mutex_exit(&nsrv->async_write_lock);
1529 
1530         /*
1531          * Step through the list of requests in this cluster.
1532          * We need to check permissions to make sure that all
1533          * of the requests have sufficient permission to write
1534          * the file.  A cluster can be composed of requests
1535          * from different clients and different users on each
1536          * client.
1537          *
1538          * As a side effect, we also calculate the size of the
1539          * byte range that this cluster encompasses.
1540          */
1541         rp = nlp->list;
1542         off = rp->wa->wa_offset;
1543         len = (uint_t)0;
1544         do {
1545                 if (rdonly(rp->ro, vp)) {
1546                         rp->ns->ns_status = NFSERR_ROFS;
1547                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1548                         rp->thread->t_flag |= t_flag;
1549                         continue;
1550                 }
1551 
1552                 va.va_mask = AT_UID|AT_MODE;
1553 
1554                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1555 
1556                 if (!error) {
1557                         if (crgetuid(rp->cr) != va.va_uid) {
1558                                 /*
1559                                  * This is a kludge to allow writes of files
1560                                  * created with read only permission.  The
1561                                  * owner of the file is always allowed to
1562                                  * write it.
1563                                  */
1564                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1565                         }
1566                         if (!error && MANDLOCK(vp, va.va_mode))
1567                                 error = EACCES;
1568                 }
1569 
1570                 /*
1571                  * Check for a conflict with a nbmand-locked region.
1572                  */
1573                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1574                     rp->wa->wa_count, 0, NULL)) {
1575                         error = EACCES;
1576                 }
1577 
1578                 if (error) {
1579                         rp->ns->ns_status = puterrno(error);
1580                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1581                         rp->thread->t_flag |= t_flag;
1582                         continue;
1583                 }
1584                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1585                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1586         } while ((rp = rp->list) != NULL);
1587 
1588         /*
1589          * Step through the cluster attempting to gather as many
1590          * contiguous requests as possible.  These contiguous
1591          * requests are handled with one call to VOP_WRITE instead
1592          * of separate calls to VOP_WRITE.  We also keep track
1593          * of whether any data was written.
1594          */
1595         rp = nlp->list;
1596         data_written = 0;
1597         do {
1598                 /*
1599                  * Skip any requests which are already marked as having an
1600                  * error.
1601                  */
1602                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1603                         rp = rp->list;
1604                         continue;
1605                 }
1606 
1607                 /*
1608                  * Count the number of iovec's which are required
1609                  * to handle this set of requests.  One iovec is
1610                  * needed for each data buffer, whether addressed
1611                  * by wa_data or by the b_rptr pointers in the
1612                  * mblk chains.
1613                  */
1614                 iovcnt = 0;
1615                 lrp = rp;
1616                 for (;;) {
1617                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1618                                 iovcnt++;
1619                         else {
1620                                 m = lrp->wa->wa_mblk;
1621                                 while (m != NULL) {
1622                                         iovcnt++;
1623                                         m = m->b_cont;
1624                                 }
1625                         }
1626                         if (lrp->list == NULL ||
1627                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1628                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1629                             lrp->list->wa->wa_offset) {
1630                                 lrp = lrp->list;
1631                                 break;
1632                         }
1633                         lrp = lrp->list;
1634                 }
1635 
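                     /*
                      * Use the preallocated iov[] array when it is large
                      * enough; otherwise fall back to a temporary kmem
                      * allocation that is freed once this batch of
                      * requests has been written.
                      */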
1636                 if (iovcnt <= MAXCLIOVECS) {
1637 #ifdef DEBUG
1638                         rfs_write_hits++;
1639 #endif
1640                         niovp = iov;
1641                 } else {
1642 #ifdef DEBUG
1643                         rfs_write_misses++;
1644 #endif
1645                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1646                 }
1647                 /*
1648                  * Put together the scatter/gather iovecs.
1649                  */
1650                 iovp = niovp;
1651                 trp = rp;
1652                 count = 0;
1653                 do {
1654                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1655                                 if (trp->wa->wa_rlist) {
1656                                         iovp->iov_base =
1657                                             (char *)((trp->wa->wa_rlist)->
1658                                             u.c_daddr3);
1659                                         iovp->iov_len = trp->wa->wa_count;
1660                                 } else  {
1661                                         iovp->iov_base = trp->wa->wa_data;
1662                                         iovp->iov_len = trp->wa->wa_count;
1663                                 }
1664                                 iovp++;
1665                         } else {
1666                                 m = trp->wa->wa_mblk;
1667                                 rcount = trp->wa->wa_count;
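                                    /*
                                     * Walk the mblk chain.  rcount tracks how
                                     * much of wa_count remains; if the chain
                                     * holds more data than wa_count, the final
                                     * iovec is trimmed so the total gathered
                                     * never exceeds wa_count.
                                     */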
1668                                 while (m != NULL) {
1669                                         iovp->iov_base = (caddr_t)m->b_rptr;
1670                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1671                                         rcount -= iovp->iov_len;
1672                                         if (rcount < 0)
1673                                                 iovp->iov_len += rcount;
1674                                         iovp++;
1675                                         if (rcount <= 0)
1676                                                 break;
1677                                         m = m->b_cont;
1678                                 }
1679                         }
1680                         count += trp->wa->wa_count;
1681                         trp = trp->list;
1682                 } while (trp != lrp);
1683 
1684                 uio.uio_iov = niovp;
1685                 uio.uio_iovcnt = iovcnt;
1686                 uio.uio_segflg = UIO_SYSSPACE;
1687                 uio.uio_extflg = UIO_COPY_DEFAULT;
1688                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1689                 uio.uio_resid = count;
1690                 /*
1691                  * The limit is checked on the client. We
1692                  * should allow any size writes here.
1693                  */
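                     /*
                      * The residual count is still clamped so that this
                      * transfer cannot extend the file past the process
                      * file-size limit (p_fsz_ctl).
                      */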
1694                 uio.uio_llimit = curproc->p_fsz_ctl;
1695                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1696                 if (rlimit < (rlim64_t)uio.uio_resid)
1697                         uio.uio_resid = (uint_t)rlimit;
1698 
1699                 /*
1700                  * For now we assume no append mode.
1701                  */
1702 
1703                 /*
1704                  * We're changing creds because VM may fault
1705                  * and we need the cred of the current
1706                  * thread to be used if quota * checking is
1707                  * thread to be used if quota checking is
1708                  */
1709                 savecred = curthread->t_cred;
1710                 curthread->t_cred = cr;
1711                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1712                 curthread->t_cred = savecred;
1713 
1714                 /* check if a monitor detected a delegation conflict */
1715                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1716                         /* mark as wouldblock so response is dropped */
1717                         curthread->t_flag |= T_WOULDBLOCK;
1718 
1719                 if (niovp != iov)
1720                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1721 
1722                 if (!error) {
1723                         data_written = 1;
1724                         /*
1725                          * Get attributes again so we send the latest mod
1726                          * time to the client side for its cache.
1727                          */
1728                         va.va_mask = AT_ALL;    /* now we want everything */
1729 
1730                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1731 
1732                         if (!error)
1733                                 acl_perm(vp, exi, &va, rp->cr);
1734                 }
1735 
1736                 /*
1737                  * Fill in the status responses for each request
1738                  * which was just handled.  Also, copy the latest
1739                  * attributes in to the attribute responses if
1740                  * appropriate.
1741                  */
1742                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1743                 do {
1744                         rp->thread->t_flag |= t_flag;
1745                         /* check for overflows */
1746                         if (!error) {
1747                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1748                         }
1749                         rp->ns->ns_status = puterrno(error);
1750                         rp = rp->list;
1751                 } while (rp != lrp);
1752         } while (rp != NULL);
1753 
1754         /*
1755          * If any data was written at all, then we need to flush
1756          * the data and metadata to stable storage.
1757          */
1758         if (data_written) {
1759                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1760 
1761                 if (!error) {
1762                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1763                 }
1764         }
1765 
1766         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1767 
1768         if (in_crit)
1769                 nbl_end_crit(vp);
1770         VN_RELE(vp);
1771 
1772         t_flag = curthread->t_flag & T_WOULDBLOCK;
1773         mutex_enter(&nsrv->async_write_lock);
1774         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1775                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1776                         rp->ns->ns_status = puterrno(error);
1777                         rp->thread->t_flag |= t_flag;
1778                 }
1779         }
1780         cv_broadcast(&nlp->cv);
1781         mutex_exit(&nsrv->async_write_lock);
1782 
1783 }
1784 
1785 void *
1786 rfs_write_getfh(struct nfswriteargs *wa)
1787 {
1788         return (&wa->wa_fhandle);
1789 }
1790 
1791 /*
1792  * Create a file.
1793  * Creates a file with given attributes and returns those attributes
1794  * and an fhandle for the new file.
1795  */
1796 void
1797 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1798     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1799 {
1800         int error;
1801         int lookuperr;
1802         int in_crit = 0;
1803         struct vattr va;
1804         vnode_t *vp;
1805         vnode_t *realvp;
1806         vnode_t *dvp;
1807         char *name = args->ca_da.da_name;
1808         vnode_t *tvp = NULL;
1809         int mode;
1810         int lookup_ok;
1811         bool_t trunc;
1812         struct sockaddr *ca;
1813 
1814         /*
1815          * Disallow NULL paths
1816          */
1817         if (name == NULL || *name == '\0') {
1818                 dr->dr_status = NFSERR_ACCES;
1819                 return;
1820         }
1821 
1822         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1823         if (dvp == NULL) {
1824                 dr->dr_status = NFSERR_STALE;
1825                 return;
1826         }
1827 
1828         error = sattr_to_vattr(args->ca_sa, &va);
1829         if (error) {
                     VN_RELE(dvp);
1830                 dr->dr_status = puterrno(error);
1831                 return;
1832         }
1833 
1834         /*
1835          * Must specify the mode.
1836          */
1837         if (!(va.va_mask & AT_MODE)) {
1838                 VN_RELE(dvp);
1839                 dr->dr_status = NFSERR_INVAL;
1840                 return;
1841         }
1842 
1843         /*
1844          * This is a completely gross hack to make mknod
1845          * work over the wire until we can whack the protocol
1846          */
1847         if ((va.va_mode & IFMT) == IFCHR) {
1848                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1849                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1850                 else {
1851                         va.va_type = VCHR;
1852                         /*
1853                          * uncompress the received dev_t
1854                          * if the top half is zero indicating a request
1855                          * from an `older style' OS.
1856                          */
1857                         if ((va.va_size & 0xffff0000) == 0)
1858                                 va.va_rdev = nfsv2_expdev(va.va_size);
1859                         else
1860                                 va.va_rdev = (dev_t)va.va_size;
1861                 }
1862                 va.va_mask &= ~AT_SIZE;
1863         } else if ((va.va_mode & IFMT) == IFBLK) {
1864                 va.va_type = VBLK;
1865                 /*
1866                  * uncompress the received dev_t
1867                  * if the top half is zero indicating a request
1868                  * from an `older style' OS.
1869                  */
1870                 if ((va.va_size & 0xffff0000) == 0)
1871                         va.va_rdev = nfsv2_expdev(va.va_size);
1872                 else
1873                         va.va_rdev = (dev_t)va.va_size;
1874                 va.va_mask &= ~AT_SIZE;
1875         } else if ((va.va_mode & IFMT) == IFSOCK) {
1876                 va.va_type = VSOCK;
1877         } else {
1878                 va.va_type = VREG;
1879         }
1880         va.va_mode &= ~IFMT;
1881         va.va_mask |= AT_TYPE;
1882 
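             /*
              * Convert the client-supplied name for this export if name
              * translation is configured; when a converted copy is
              * returned, it is freed at the end of this routine.
              */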
1883         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1884         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1885             MAXPATHLEN);
1886         if (name == NULL) {
                     VN_RELE(dvp);
1887                 dr->dr_status = puterrno(EINVAL);
1888                 return;
1889         }
1890 
1891         /*
1892          * Why was the choice made to use VWRITE as the mode to the
1893          * call to VOP_CREATE ? This results in a bug.  When a client
1894          * call to VOP_CREATE?  This results in a bug.  When a client
1895          * opens a file that already exists and is RDONLY, the second
1896          * open fails with EACCES because of the mode.
1897          */
1898         lookup_ok = 0;
1899         mode = VWRITE;
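             /*
              * When the create does not set the size of a regular file,
              * look the name up first: if the file already exists and
              * its owner write bit is clear, drop the access mode to
              * VREAD so re-creating a read-only file does not fail
              * (see the note above).
              */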
1900         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1901                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1902                     NULL, NULL, NULL);
1903                 if (!error) {
1904                         struct vattr at;
1905 
1906                         lookup_ok = 1;
1907                         at.va_mask = AT_MODE;
1908                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1909                         if (!error)
1910                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1911                         VN_RELE(tvp);
1912                         tvp = NULL;
1913                 }
1914         }
1915 
1916         if (!lookup_ok) {
1917                 if (rdonly(ro, dvp)) {
1918                         error = EROFS;
1919                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1920                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1921                         error = EPERM;
1922                 } else {
1923                         error = 0;
1924                 }
1925         }
1926 
1927         /*
1928          * If file size is being modified on an already existing file
1929          * make sure that there are no conflicting non-blocking mandatory
1930          * locks in the region being manipulated. Return EACCES if there
1931          * are conflicting locks.
1932          */
1933         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1934                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1935                     NULL, NULL, NULL);
1936 
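                     /*
                      * If the existing file is delegated to a v4 client,
                      * initiate a recall and drop this request by setting
                      * T_WOULDBLOCK; the client will retransmit after the
                      * delegation is returned.
                      */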
1937                 if (!lookuperr &&
1938                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1939                         VN_RELE(tvp);
1940                         curthread->t_flag |= T_WOULDBLOCK;
1941                         goto out;
1942                 }
1943 
1944                 if (!lookuperr && nbl_need_check(tvp)) {
1945                         /*
1946                          * The file exists. Now check if it has any
1947                          * conflicting non-blocking mandatory locks
1948                          * in the region being changed.
1949                          */
1950                         struct vattr bva;
1951                         u_offset_t offset;
1952                         ssize_t length;
1953 
1954                         nbl_start_crit(tvp, RW_READER);
1955                         in_crit = 1;
1956 
1957                         bva.va_mask = AT_SIZE;
1958                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1959                         if (!error) {
1960                                 if (va.va_size < bva.va_size) {
1961                                         offset = va.va_size;
1962                                         length = bva.va_size - va.va_size;
1963                                 } else {
1964                                         offset = bva.va_size;
1965                                         length = va.va_size - bva.va_size;
1966                                 }
1967                                 if (length) {
1968                                         if (nbl_conflict(tvp, NBL_WRITE,
1969                                             offset, length, 0, NULL)) {
1970                                                 error = EACCES;
1971                                         }
1972                                 }
1973                         }
1974                         if (error) {
1975                                 nbl_end_crit(tvp);
1976                                 VN_RELE(tvp);
1977                                 in_crit = 0;
1978                         }
1979                 } else if (tvp != NULL) {
1980                         VN_RELE(tvp);
1981                 }
1982         }
1983 
1984         if (!error) {
1985                 /*
1986                  * If the filesystem is shared with nosuid, then remove any
1987                  * setuid/setgid bits on create.
1988                  */
1989                 if (va.va_type == VREG &&
1990                     exi->exi_export.ex_flags & EX_NOSUID)
1991                         va.va_mode &= ~(VSUID | VSGID);
1992 
1993                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1994                     NULL, NULL);
1995 
1996                 if (!error) {
1997 
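                             /*
                              * Creating with a size of zero truncates an
                              * existing file, so treat that case as a
                              * truncating open for the delegation check.
                              */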
1998                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1999                                 trunc = TRUE;
2000                         else
2001                                 trunc = FALSE;
2002 
2003                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2004                                 VN_RELE(vp);
2005                                 curthread->t_flag |= T_WOULDBLOCK;
2006                                 goto out;
2007                         }
2008                         va.va_mask = AT_ALL;
2009 
2010                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2011 
2012                         /* check for overflows */
2013                         if (!error) {
2014                                 acl_perm(vp, exi, &va, cr);
2015                                 error = vattr_to_nattr(&va, &dr->dr_attr);
2016                                 if (!error) {
2017                                         error = makefh(&dr->dr_fhandle, vp,
2018                                             exi);
2019                                 }
2020                         }
2021                         /*
2022                          * Force modified metadata out to stable storage.
2023                          *
2024                          * If an underlying vp exists, pass it to VOP_FSYNC.
2025                          */
2026                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
2027                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2028                         else
2029                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2030                         VN_RELE(vp);
2031                 }
2032 
2033                 if (in_crit) {
2034                         nbl_end_crit(tvp);
2035                         VN_RELE(tvp);
2036                 }
2037         }
2038 
2039         /*
2040          * Force modified data and metadata out to stable storage.
2041          */
2042         (void) VOP_FSYNC(dvp, 0, cr, NULL);
2043 
2044 out:
2045 
2046         VN_RELE(dvp);
2047 
2048         dr->dr_status = puterrno(error);
2049 
2050         if (name != args->ca_da.da_name)
2051                 kmem_free(name, MAXPATHLEN);
2052 }
2053 void *
2054 rfs_create_getfh(struct nfscreatargs *args)
2055 {
2056         return (args->ca_da.da_fhandle);
2057 }
2058 
2059 /*
2060  * Remove a file.
2061  * Remove named file from parent directory.
2062  */
2063 /* ARGSUSED */
2064 void
2065 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2066     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2067 {
2068         int error = 0;
2069         vnode_t *vp;
2070         vnode_t *targvp;
2071         int in_crit = 0;
2072 
2073         /*
2074          * Disallow NULL paths
2075          */
2076         if (da->da_name == NULL || *da->da_name == '\0') {
2077                 *status = NFSERR_ACCES;
2078                 return;
2079         }
2080 
2081         vp = nfs_fhtovp(da->da_fhandle, exi);
2082         if (vp == NULL) {
2083                 *status = NFSERR_STALE;
2084                 return;
2085         }
2086 
2087         if (rdonly(ro, vp)) {
2088                 VN_RELE(vp);
2089                 *status = NFSERR_ROFS;
2090                 return;
2091         }
2092 
2093         /*
2094          * Check for a conflict with a non-blocking mandatory share reservation.
2095          */
2096         error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2097             NULL, cr, NULL, NULL, NULL);
2098         if (error != 0) {
2099                 VN_RELE(vp);
2100                 *status = puterrno(error);
2101                 return;
2102         }
2103 
2104         /*
2105          * If the file is delegated to a v4 client, then initiate
2106          * recall and drop this request (by setting T_WOULDBLOCK).
2107          * The client will eventually re-transmit the request and
2108          * (hopefully), by then, the v4 client will have returned
2109          * the delegation.
2110          */
2111 
2112         if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2113                 VN_RELE(vp);
2114                 VN_RELE(targvp);
2115                 curthread->t_flag |= T_WOULDBLOCK;
2116                 return;
2117         }
2118 
2119         if (nbl_need_check(targvp)) {
2120                 nbl_start_crit(targvp, RW_READER);
2121                 in_crit = 1;
2122                 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2123                         error = EACCES;
2124                         goto out;
2125                 }
2126         }
2127 
2128         error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2129 
2130         /*
2131          * Force modified data and metadata out to stable storage.
2132          */
2133         (void) VOP_FSYNC(vp, 0, cr, NULL);
2134 
2135 out:
2136         if (in_crit)
2137                 nbl_end_crit(targvp);
2138         VN_RELE(targvp);
2139         VN_RELE(vp);
2140 
2141         *status = puterrno(error);
2142 
2143 }
2144 
2145 void *
2146 rfs_remove_getfh(struct nfsdiropargs *da)
2147 {
2148         return (da->da_fhandle);
2149 }
2150 
2151 /*
2152  * Rename a file.
2153  * Give a file (from) a new name (to).
2154  */
2155 /* ARGSUSED */
2156 void
2157 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2158     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2159 {
2160         int error = 0;
2161         vnode_t *fromvp;
2162         vnode_t *tovp;
2163         struct exportinfo *to_exi;
2164         fhandle_t *fh;
2165         vnode_t *srcvp;
2166         vnode_t *targvp;
2167         int in_crit = 0;
2168 
2169         fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2170         if (fromvp == NULL) {
2171                 *status = NFSERR_STALE;
2172                 return;
2173         }
2174 
2175         fh = args->rna_to.da_fhandle;
2176         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2177         if (to_exi == NULL) {
2178                 VN_RELE(fromvp);
2179                 *status = NFSERR_ACCES;
2180                 return;
2181         }
2182         exi_rele(to_exi);
2183 
2184         if (to_exi != exi) {
2185                 VN_RELE(fromvp);
2186                 *status = NFSERR_XDEV;
2187                 return;
2188         }
2189 
2190         tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2191         if (tovp == NULL) {
2192                 VN_RELE(fromvp);
2193                 *status = NFSERR_STALE;
2194                 return;
2195         }
2196 
2197         if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2198                 VN_RELE(tovp);
2199                 VN_RELE(fromvp);
2200                 *status = NFSERR_NOTDIR;
2201                 return;
2202         }
2203 
2204         /*
2205          * Disallow NULL paths
2206          */
2207         if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2208             args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2209                 VN_RELE(tovp);
2210                 VN_RELE(fromvp);
2211                 *status = NFSERR_ACCES;
2212                 return;
2213         }
2214 
2215         if (rdonly(ro, tovp)) {
2216                 VN_RELE(tovp);
2217                 VN_RELE(fromvp);
2218                 *status = NFSERR_ROFS;
2219                 return;
2220         }
2221 
2222         /*
2223          * Check for a conflict with a non-blocking mandatory share reservation.
2224          */
2225         error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2226             NULL, cr, NULL, NULL, NULL);
2227         if (error != 0) {
2228                 VN_RELE(tovp);
2229                 VN_RELE(fromvp);
2230                 *status = puterrno(error);
2231                 return;
2232         }
2233 
2234         /* Check for delegations on the source file */
2235 
2236         if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2237                 VN_RELE(tovp);
2238                 VN_RELE(fromvp);
2239                 VN_RELE(srcvp);
2240                 curthread->t_flag |= T_WOULDBLOCK;
2241                 return;
2242         }
2243 
2244         /* Check for delegation on the file being renamed over, if it exists */
2245 
2246         if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2247             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2248             NULL, NULL, NULL) == 0) {
2249 
2250                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2251                         VN_RELE(tovp);
2252                         VN_RELE(fromvp);
2253                         VN_RELE(srcvp);
2254                         VN_RELE(targvp);
2255                         curthread->t_flag |= T_WOULDBLOCK;
2256                         return;
2257                 }
2258                 VN_RELE(targvp);
2259         }
2260 
2261 
2262         if (nbl_need_check(srcvp)) {
2263                 nbl_start_crit(srcvp, RW_READER);
2264                 in_crit = 1;
2265                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2266                         error = EACCES;
2267                         goto out;
2268                 }
2269         }
2270 
2271         error = VOP_RENAME(fromvp, args->rna_from.da_name,
2272             tovp, args->rna_to.da_name, cr, NULL, 0);
2273 
2274         if (error == 0)
2275                 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2276                     strlen(args->rna_to.da_name));
2277 
2278         /*
2279          * Force modified data and metadata out to stable storage.
2280          */
2281         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2282         (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2283 
2284 out:
2285         if (in_crit)
2286                 nbl_end_crit(srcvp);
2287         VN_RELE(srcvp);
2288         VN_RELE(tovp);
2289         VN_RELE(fromvp);
2290 
2291         *status = puterrno(error);
2292 
2293 }
2294 void *
2295 rfs_rename_getfh(struct nfsrnmargs *args)
2296 {
2297         return (args->rna_from.da_fhandle);
2298 }
2299 
2300 /*
2301  * Link to a file.
2302  * Create a file (to) which is a hard link to the given file (from).
2303  */
2304 /* ARGSUSED */
2305 void
2306 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2307     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2308 {
2309         int error;
2310         vnode_t *fromvp;
2311         vnode_t *tovp;
2312         struct exportinfo *to_exi;
2313         fhandle_t *fh;
2314 
2315         fromvp = nfs_fhtovp(args->la_from, exi);
2316         if (fromvp == NULL) {
2317                 *status = NFSERR_STALE;
2318                 return;
2319         }
2320 
2321         fh = args->la_to.da_fhandle;
2322         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2323         if (to_exi == NULL) {
2324                 VN_RELE(fromvp);
2325                 *status = NFSERR_ACCES;
2326                 return;
2327         }
2328         exi_rele(to_exi);
2329 
2330         if (to_exi != exi) {
2331                 VN_RELE(fromvp);
2332                 *status = NFSERR_XDEV;
2333                 return;
2334         }
2335 
2336         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2337         if (tovp == NULL) {
2338                 VN_RELE(fromvp);
2339                 *status = NFSERR_STALE;
2340                 return;
2341         }
2342 
2343         if (tovp->v_type != VDIR) {
2344                 VN_RELE(tovp);
2345                 VN_RELE(fromvp);
2346                 *status = NFSERR_NOTDIR;
2347                 return;
2348         }
2349         /*
2350          * Disallow NULL paths
2351          */
2352         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2353                 VN_RELE(tovp);
2354                 VN_RELE(fromvp);
2355                 *status = NFSERR_ACCES;
2356                 return;
2357         }
2358 
2359         if (rdonly(ro, tovp)) {
2360                 VN_RELE(tovp);
2361                 VN_RELE(fromvp);
2362                 *status = NFSERR_ROFS;
2363                 return;
2364         }
2365 
2366         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2367 
2368         /*
2369          * Force modified data and metadata out to stable storage.
2370          */
2371         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2372         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2373 
2374         VN_RELE(tovp);
2375         VN_RELE(fromvp);
2376 
2377         *status = puterrno(error);
2378 
2379 }
2380 void *
2381 rfs_link_getfh(struct nfslinkargs *args)
2382 {
2383         return (args->la_from);
2384 }
2385 
2386 /*
2387  * Symbolically link to a file.
2388  * Create a file (from) with the given attributes which is a symbolic link
2389  * to the given path name (tnm).
2390  */
2391 void
2392 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2393     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2394 {
2395         int error;
2396         struct vattr va;
2397         vnode_t *vp;
2398         vnode_t *svp;
2399         int lerror;
2400         struct sockaddr *ca;
2401         char *name = NULL;
2402 
2403         /*
2404          * Disallow NULL paths
2405          */
2406         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2407                 *status = NFSERR_ACCES;
2408                 return;
2409         }
2410 
2411         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2412         if (vp == NULL) {
2413                 *status = NFSERR_STALE;
2414                 return;
2415         }
2416 
2417         if (rdonly(ro, vp)) {
2418                 VN_RELE(vp);
2419                 *status = NFSERR_ROFS;
2420                 return;
2421         }
2422 
2423         error = sattr_to_vattr(args->sla_sa, &va);
2424         if (error) {
2425                 VN_RELE(vp);
2426                 *status = puterrno(error);
2427                 return;
2428         }
2429 
2430         if (!(va.va_mask & AT_MODE)) {
2431                 VN_RELE(vp);
2432                 *status = NFSERR_INVAL;
2433                 return;
2434         }
2435 
2436         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2437         name = nfscmd_convname(ca, exi, args->sla_tnm,
2438             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2439 
2440         if (name == NULL) {
                     VN_RELE(vp);
2441                 *status = NFSERR_ACCES;
2442                 return;
2443         }
2444 
2445         va.va_type = VLNK;
2446         va.va_mask |= AT_TYPE;
2447 
2448         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2449 
2450         /*
2451          * Force new data and metadata out to stable storage.
2452          */
2453         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2454             NULL, cr, NULL, NULL, NULL);
2455 
2456         if (!lerror) {
2457                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2458                 VN_RELE(svp);
2459         }
2460 
2461         /*
2462          * Force modified data and metadata out to stable storage.
2463          */
2464         (void) VOP_FSYNC(vp, 0, cr, NULL);
2465 
2466         VN_RELE(vp);
2467 
2468         *status = puterrno(error);
2469         if (name != args->sla_tnm)
2470                 kmem_free(name, MAXPATHLEN);
2471 
2472 }
2473 void *
2474 rfs_symlink_getfh(struct nfsslargs *args)
2475 {
2476         return (args->sla_from.da_fhandle);
2477 }
2478 
2479 /*
2480  * Make a directory.
2481  * Create a directory with the given name, parent directory, and attributes.
2482  * Returns a file handle and attributes for the new directory.
2483  */
2484 /* ARGSUSED */
2485 void
2486 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2487     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2488 {
2489         int error;
2490         struct vattr va;
2491         vnode_t *dvp = NULL;
2492         vnode_t *vp;
2493         char *name = args->ca_da.da_name;
2494 
2495         /*
2496          * Disallow NULL paths
2497          */
2498         if (name == NULL || *name == '\0') {
2499                 dr->dr_status = NFSERR_ACCES;
2500                 return;
2501         }
2502 
2503         vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2504         if (vp == NULL) {
2505                 dr->dr_status = NFSERR_STALE;
2506                 return;
2507         }
2508 
2509         if (rdonly(ro, vp)) {
2510                 VN_RELE(vp);
2511                 dr->dr_status = NFSERR_ROFS;
2512                 return;
2513         }
2514 
2515         error = sattr_to_vattr(args->ca_sa, &va);
2516         if (error) {
2517                 VN_RELE(vp);
2518                 dr->dr_status = puterrno(error);
2519                 return;
2520         }
2521 
2522         if (!(va.va_mask & AT_MODE)) {
2523                 VN_RELE(vp);
2524                 dr->dr_status = NFSERR_INVAL;
2525                 return;
2526         }
2527 
2528         va.va_type = VDIR;
2529         va.va_mask |= AT_TYPE;
2530 
2531         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2532 
2533         if (!error) {
2534                 /*
2535                  * Attributes of the newly created directory should
2536                  * be returned to the client.
2537                  */
2538                 va.va_mask = AT_ALL; /* We want everything */
2539                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2540 
2541                 /* check for overflows */
2542                 if (!error) {
2543                         acl_perm(vp, exi, &va, cr);
2544                         error = vattr_to_nattr(&va, &dr->dr_attr);
2545                         if (!error) {
2546                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2547                         }
2548                 }
2549                 /*
2550                  * Force new data and metadata out to stable storage.
2551                  */
2552                 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2553                 VN_RELE(dvp);
2554         }
2555 
2556         /*
2557          * Force modified data and metadata out to stable storage.
2558          */
2559         (void) VOP_FSYNC(vp, 0, cr, NULL);
2560 
2561         VN_RELE(vp);
2562 
2563         dr->dr_status = puterrno(error);
2564 
2565 }
2566 void *
2567 rfs_mkdir_getfh(struct nfscreatargs *args)
2568 {
2569         return (args->ca_da.da_fhandle);
2570 }
2571 
2572 /*
2573  * Remove a directory.
2574  * Remove the given directory name from the given parent directory.
2575  */
2576 /* ARGSUSED */
2577 void
2578 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2579     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2580 {
2581         int error;
2582         vnode_t *vp;
2583 
2584         /*
2585          * Disallow NULL paths
2586          */
2587         if (da->da_name == NULL || *da->da_name == '\0') {
2588                 *status = NFSERR_ACCES;
2589                 return;
2590         }
2591 
2592         vp = nfs_fhtovp(da->da_fhandle, exi);
2593         if (vp == NULL) {
2594                 *status = NFSERR_STALE;
2595                 return;
2596         }
2597 
2598         if (rdonly(ro, vp)) {
2599                 VN_RELE(vp);
2600                 *status = NFSERR_ROFS;
2601                 return;
2602         }
2603 
2604         /*
2605          * VOP_RMDIR takes a third argument (the current
2606          * directory of the process).  That's because someone
2607          * wants to return EINVAL if one tries to remove ".".
2608          * Of course, NFS servers have no idea what their
2609          * clients' current directories are.  We fake it by
2610          * supplying a vnode known to exist and illegal to
2611          * remove.
2612          */
2613         error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2614 
2615         /*
2616          * Force modified data and metadata out to stable storage.
2617          */
2618         (void) VOP_FSYNC(vp, 0, cr, NULL);
2619 
2620         VN_RELE(vp);
2621 
2622         /*
2623          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2624          * if the directory is not empty.  A System V NFS server
2625          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2626          * over the wire.
2627          */
2628         if (error == EEXIST)
2629                 *status = NFSERR_NOTEMPTY;
2630         else
2631                 *status = puterrno(error);
2632 
2633 }
2634 void *
2635 rfs_rmdir_getfh(struct nfsdiropargs *da)
2636 {
2637         return (da->da_fhandle);
2638 }
2639 
2640 /* ARGSUSED */
2641 void
2642 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2643     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2644 {
2645         int error;
2646         int iseof;
2647         struct iovec iov;
2648         struct uio uio;
2649         vnode_t *vp;
2650         char *ndata = NULL;
2651         struct sockaddr *ca;
2652         size_t nents;
2653         int ret;
2654 
2655         vp = nfs_fhtovp(&rda->rda_fh, exi);
2656         if (vp == NULL) {
2657                 rd->rd_entries = NULL;
2658                 rd->rd_status = NFSERR_STALE;
2659                 return;
2660         }
2661 
2662         if (vp->v_type != VDIR) {
2663                 VN_RELE(vp);
2664                 rd->rd_entries = NULL;
2665                 rd->rd_status = NFSERR_NOTDIR;
2666                 return;
2667         }
2668 
2669         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2670 
2671         error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2672 
2673         if (error) {
2674                 rd->rd_entries = NULL;
2675                 goto bad;
2676         }
2677 
2678         if (rda->rda_count == 0) {
2679                 rd->rd_entries = NULL;
2680                 rd->rd_size = 0;
2681                 rd->rd_eof = FALSE;
2682                 goto bad;
2683         }
2684 
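             /*
              * Never transfer more than the NFSv2 maximum data size in
              * one call.
              */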
2685         rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2686 
2687         /*
2688          * Allocate data for entries.  This will be freed by rfs_rddirfree.
2689          */
2690         rd->rd_bufsize = (uint_t)rda->rda_count;
2691         rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2692 
2693         /*
2694          * Set up io vector to read directory data
2695          */
2696         iov.iov_base = (caddr_t)rd->rd_entries;
2697         iov.iov_len = rda->rda_count;
2698         uio.uio_iov = &iov;
2699         uio.uio_iovcnt = 1;
2700         uio.uio_segflg = UIO_SYSSPACE;
2701         uio.uio_extflg = UIO_COPY_CACHED;
2702         uio.uio_loffset = (offset_t)rda->rda_offset;
2703         uio.uio_resid = rda->rda_count;
2704 
2705         /*
2706          * read directory
2707          */
2708         error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2709 
2710         /*
2711          * Clean up
2712          */
2713         if (!error) {
2714                 /*
2715                  * set size and eof
2716                  */
2717                 if (uio.uio_resid == rda->rda_count) {
2718                         rd->rd_size = 0;
2719                         rd->rd_eof = TRUE;
2720                 } else {
2721                         rd->rd_size = (uint32_t)(rda->rda_count -
2722                             uio.uio_resid);
2723                         rd->rd_eof = iseof ? TRUE : FALSE;
2724                 }
2725         }
2726 
2727         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2728         nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2729         ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2730             rda->rda_count, &ndata);
2731 
2732         if (ret != 0) {
2733                 size_t dropbytes;
2734                 /*
2735                  * We had to drop one or more entries in order to fit
2736                  * during the character conversion.  We need to patch
2737                  * up the size and eof info.
2738                  */
2739                 if (rd->rd_eof)
2740                         rd->rd_eof = FALSE;
2741                 dropbytes = nfscmd_dropped_entrysize(
2742                     (struct dirent64 *)rd->rd_entries, nents, ret);
2743                 rd->rd_size -= dropbytes;
2744         }
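             /*
              * If the conversion produced a new buffer, return that one
              * instead and free the original entries buffer.
              */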
2745         if (ndata == NULL) {
2746                 ndata = (char *)rd->rd_entries;
2747         } else if (ndata != (char *)rd->rd_entries) {
2748                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2749                 rd->rd_entries = (void *)ndata;
2750                 rd->rd_bufsize = rda->rda_count;
2751         }
2752 
2753 bad:
2754         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2755 
2756 #if 0 /* notyet */
2757         /*
2758          * Don't do this.  It causes local disk writes when just
2759          * reading the file and the overhead is deemed larger
2760          * than the benefit.
2761          */
2762         /*
2763          * Force modified metadata out to stable storage.
2764          */
2765         (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2766 #endif
2767 
2768         VN_RELE(vp);
2769 
2770         rd->rd_status = puterrno(error);
2771 
2772 }
2773 void *
2774 rfs_readdir_getfh(struct nfsrddirargs *rda)
2775 {
2776         return (&rda->rda_fh);
2777 }
2778 void
2779 rfs_rddirfree(struct nfsrddirres *rd)
2780 {
2781         if (rd->rd_entries != NULL)
2782                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2783 }
2784 
2785 /* ARGSUSED */
2786 void
2787 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2788     struct svc_req *req, cred_t *cr, bool_t ro)
2789 {
2790         int error;
2791         struct statvfs64 sb;
2792         vnode_t *vp;
2793 
2794         vp = nfs_fhtovp(fh, exi);
2795         if (vp == NULL) {
2796                 fs->fs_status = NFSERR_STALE;
2797                 return;
2798         }
2799 
2800         error = VFS_STATVFS(vp->v_vfsp, &sb);
2801 
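             /*
              * Translate the statvfs results into the NFSv2 statfs
              * reply; fs_tsize is the server's preferred transfer size.
              */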
2802         if (!error) {
2803                 fs->fs_tsize = nfstsize();
2804                 fs->fs_bsize = sb.f_frsize;
2805                 fs->fs_blocks = sb.f_blocks;
2806                 fs->fs_bfree = sb.f_bfree;
2807                 fs->fs_bavail = sb.f_bavail;
2808         }
2809 
2810         VN_RELE(vp);
2811 
2812         fs->fs_status = puterrno(error);
2813 
2814 }
2815 void *
2816 rfs_statfs_getfh(fhandle_t *fh)
2817 {
2818         return (fh);
2819 }
2820 
2821 static int
2822 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2823 {
2824         vap->va_mask = 0;
2825 
2826         /*
2827          * There was a sign extension bug in some VFS based systems
2828          * which stored the mode as a short.  When it would get
2829          * assigned to a u_long, no sign extension would occur.
2830          * It needed to, but this wasn't noticed because sa_mode
2831          * would then get assigned back to the short, thus ignoring
2832          * the upper 16 bits of sa_mode.
2833          *
2834          * To make this implementation work for both broken
2835          * clients and good clients, we check for both versions
2836          * of the mode.
2837          */
2838         if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2839             sa->sa_mode != (uint32_t)-1) {
2840                 vap->va_mask |= AT_MODE;
2841                 vap->va_mode = sa->sa_mode;
2842         }
2843         if (sa->sa_uid != (uint32_t)-1) {
2844                 vap->va_mask |= AT_UID;
2845                 vap->va_uid = sa->sa_uid;
2846         }
2847         if (sa->sa_gid != (uint32_t)-1) {
2848                 vap->va_mask |= AT_GID;
2849                 vap->va_gid = sa->sa_gid;
2850         }
2851         if (sa->sa_size != (uint32_t)-1) {
2852                 vap->va_mask |= AT_SIZE;
2853                 vap->va_size = sa->sa_size;
2854         }
2855         if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2856             sa->sa_atime.tv_usec != (int32_t)-1) {
2857 #ifndef _LP64
2858                 /* return error if time overflow */
2859                 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2860                         return (EOVERFLOW);
2861 #endif
2862                 vap->va_mask |= AT_ATIME;
2863                 /*
2864                  * nfs protocol defines times as unsigned so don't extend sign,
2865                  * unless sysadmin set nfs_allow_preepoch_time.
2866                  */
2867                 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2868                 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2869         }
2870         if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2871             sa->sa_mtime.tv_usec != (int32_t)-1) {
2872 #ifndef _LP64
2873                 /* return error if time overflow */
2874                 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2875                         return (EOVERFLOW);
2876 #endif
2877                 vap->va_mask |= AT_MTIME;
2878                 /*
2879                  * nfs protocol defines times as unsigned so don't extend sign,
2880                  * unless sysadmin set nfs_allow_preepoch_time.
2881                  */
2882                 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2883                 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2884         }
2885         return (0);
2886 }
2887 
2888 static const enum nfsftype vt_to_nf[] = {
2889         0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2890 };
2891 
2892 /*
2893  * Check the following fields for overflow: nodeid, size, and time.
2894  * There could be a problem when converting 64-bit LP64 fields
2895  * into 32-bit ones.  Return an error if there is an overflow.
2896  */
2897 int
2898 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2899 {
2900         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2901         na->na_type = vt_to_nf[vap->va_type];
2902 
2903         if (vap->va_mode == (unsigned short) -1)
2904                 na->na_mode = (uint32_t)-1;
2905         else
2906                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2907 
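             /*
              * Pass -1 (unknown) through unchanged and map the local
              * "nobody" uid/gid to the values NFS clients expect.
              */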
2908         if (vap->va_uid == (unsigned short)(-1))
2909                 na->na_uid = (uint32_t)(-1);
2910         else if (vap->va_uid == UID_NOBODY)
2911                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2912         else
2913                 na->na_uid = vap->va_uid;
2914 
2915         if (vap->va_gid == (unsigned short)(-1))
2916                 na->na_gid = (uint32_t)-1;
2917         else if (vap->va_gid == GID_NOBODY)
2918                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2919         else
2920                 na->na_gid = vap->va_gid;
2921 
2922         /*
2923          * Do we need to check fsid for overflow?  It is 64-bit in the
2924  * vattr, but are values bigger than 32 bits supported?
2925          */
2926         na->na_fsid = vap->va_fsid;
2927 
2928         na->na_nodeid = vap->va_nodeid;
2929 
2930         /*
2931          * Check to make sure that the nodeid is representable over the
2932          * wire without losing bits.
2933          */
2934         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2935                 return (EFBIG);
2936         na->na_nlink = vap->va_nlink;
2937 
2938         /*
2939          * Check for big files here, instead of at the caller.  See
2940          * comments in cstat for large special file explanation.
2941          */
2942         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2943                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2944                         return (EFBIG);
2945                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2946                         /* UNKNOWN_SIZE | OVERFLOW */
2947                         na->na_size = MAXOFF32_T;
2948                 } else
2949                         na->na_size = vap->va_size;
2950         } else
2951                 na->na_size = vap->va_size;
2952 
2953         /*
2954          * If the vnode times overflow the 32-bit times that NFS2
2955          * uses on the wire then return an error.
2956          */
2957         if (!NFS_VAP_TIME_OK(vap)) {
2958                 return (EOVERFLOW);
2959         }
2960         na->na_atime.tv_sec = vap->va_atime.tv_sec;
2961         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2962 
2963         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2964         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2965 
2966         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2967         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2968 
2969         /*
2970          * If the dev_t will fit into 16 bits then compress
2971          * it, otherwise leave it alone. See comments in
2972          * nfs_client.c.
2973          */
2974         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2975             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2976                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2977         else
2978                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2979 
2980         na->na_blocks = vap->va_nblocks;
2981         na->na_blocksize = vap->va_blksize;
2982 
2983         /*
2984          * This bit of ugliness is a *TEMPORARY* hack to preserve the
2985          * over-the-wire protocols for named-pipe vnodes.  It remaps the
2986          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2987          *
2988          * BUYER BEWARE:
2989          *  If you are porting the NFS to a non-Sun server, you probably
2990          *  don't want to include the following block of code.  The
2991          *  over-the-wire special file types will be changing with the
2992          *  NFS Protocol Revision.
2993          */
2994         if (vap->va_type == VFIFO)
2995                 NA_SETFIFO(na);
2996         return (0);
2997 }
2998 
2999 /*
3000  * acl v2 support: returns approximate permission.
3001  *      default: returns minimal permission (more restrictive)
3002  *      aclok: returns maximal permission (less restrictive)
3003  *      This routine changes the permissions that are already in *va.
3004  *      If a file has a minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3005  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
3006  */
3007 static void
3008 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3009 {
3010         vsecattr_t      vsa;
3011         int             aclcnt;
3012         aclent_t        *aclentp;
3013         mode_t          mask_perm;
3014         mode_t          grp_perm;
3015         mode_t          other_perm;
3016         mode_t          other_orig;
3017         int             error;
3018 
3019         /* don't care about the default ACL */
3020         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3021         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3022 
3023         if (!error) {
3024                 aclcnt = vsa.vsa_aclcnt;
3025                 if (aclcnt > MIN_ACL_ENTRIES) {
3026                         /* non-trivial ACL */
3027                         aclentp = vsa.vsa_aclentp;
3028                         if (exi->exi_export.ex_flags & EX_ACLOK) {
3029                                 /* maximal permissions */
3030                                 grp_perm = 0;
3031                                 other_perm = 0;
3032                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3033                                         switch (aclentp->a_type) {
3034                                         case USER_OBJ:
3035                                                 break;
3036                                         case USER:
3037                                                 grp_perm |=
3038                                                     aclentp->a_perm << 3;
3039                                                 other_perm |= aclentp->a_perm;
3040                                                 break;
3041                                         case GROUP_OBJ:
3042                                                 grp_perm |=
3043                                                     aclentp->a_perm << 3;
3044                                                 break;
3045                                         case GROUP:
3046                                                 other_perm |= aclentp->a_perm;
3047                                                 break;
3048                                         case OTHER_OBJ:
3049                                                 other_orig = aclentp->a_perm;
3050                                                 break;
3051                                         case CLASS_OBJ:
3052                                                 mask_perm = aclentp->a_perm;
3053                                                 break;
3054                                         default:
3055                                                 break;
3056                                         }
3057                                 }
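                                    /*
                                     * Apply the CLASS_OBJ mask to the group
                                     * and other bits gathered above, then
                                     * fold the unmasked OTHER_OBJ bits back
                                     * in.
                                     */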
3058                                 grp_perm &= mask_perm << 3;
3059                                 other_perm &= mask_perm;
3060                                 other_perm |= other_orig;
3061 
3062                         } else {
3063                                 /* minimal permissions */
3064                                 grp_perm = 070;
3065                                 other_perm = 07;
3066                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3067                                         switch (aclentp->a_type) {
3068                                         case USER_OBJ:
3069                                                 break;
3070                                         case USER:
3071                                         case CLASS_OBJ:
3072                                                 grp_perm &=
3073                                                     aclentp->a_perm << 3;
3074                                                 other_perm &=
3075                                                     aclentp->a_perm;
3076                                                 break;
3077                                         case GROUP_OBJ:
3078                                                 grp_perm &=
3079                                                     aclentp->a_perm << 3;
3080                                                 break;
3081                                         case GROUP:
3082                                                 other_perm &=
3083                                                     aclentp->a_perm;
3084                                                 break;
3085                                         case OTHER_OBJ:
3086                                                 other_perm &=
3087                                                     aclentp->a_perm;
3088                                                 break;
3089                                         default:
3090                                                 break;
3091                                         }
3092                                 }
3093                         }
3094                         /* copy to va */
3095                         va->va_mode &= ~077;
3096                         va->va_mode |= grp_perm | other_perm;
3097                 }
3098                 if (vsa.vsa_aclcnt)
3099                         kmem_free(vsa.vsa_aclentp,
3100                             vsa.vsa_aclcnt * sizeof (aclent_t));
3101         }
3102 }
3103 
3104 void
3105 rfs_srvrinit(void)
3106 {
3107         nfs2_srv_caller_id = fs_new_caller_id();
3108 }
3109 
3110 void
3111 rfs_srvrfini(void)
3112 {
3113 }
3114 
3115 /* ARGSUSED */
3116 void
3117 rfs_srv_zone_init(nfs_globals_t *ng)
3118 {
3119         nfs_srv_t *ns;
3120 
3121         ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3122 
3123         mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3124         ns->write_async = 1;
3125 
3126         ng->nfs_srv = ns;
3127 }
3128 
3129 /* ARGSUSED */
3130 void
3131 rfs_srv_zone_fini(nfs_globals_t *ng)
3132 {
3133         nfs_srv_t *ns = ng->nfs_srv;
3134 
3135         ng->nfs_srv = NULL;
3136 
3137         mutex_destroy(&ns->async_write_lock);
3138         kmem_free(ns, sizeof (*ns));
3139 }
3140 
3141 static int
3142 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3143 {
3144         struct clist    *wcl;
3145         int             wlist_len;
3146         uint32_t        count = rr->rr_count;
3147 
3148         wcl = ra->ra_wlist;
3149 
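             /*
              * Describe the read data with the client-provided RDMA
              * write list; on success record the list and its total
              * length in the reply.
              */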
3150         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3151                 return (FALSE);
3152         }
3153 
3154         wcl = ra->ra_wlist;
3155         rr->rr_ok.rrok_wlist_len = wlist_len;
3156         rr->rr_ok.rrok_wlist = wcl;
3157 
3158         return (TRUE);
3159 }