1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*
  29  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  30  *      All rights reserved.
  31  */
  32 
  33 #include <sys/param.h>
  34 #include <sys/types.h>
  35 #include <sys/systm.h>
  36 #include <sys/cred.h>
  37 #include <sys/buf.h>
  38 #include <sys/vfs.h>
  39 #include <sys/vnode.h>
  40 #include <sys/uio.h>
  41 #include <sys/stat.h>
  42 #include <sys/errno.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/statvfs.h>
  45 #include <sys/kmem.h>
  46 #include <sys/kstat.h>
  47 #include <sys/dirent.h>
  48 #include <sys/cmn_err.h>
  49 #include <sys/debug.h>
  50 #include <sys/vtrace.h>
  51 #include <sys/mode.h>
  52 #include <sys/acl.h>
  53 #include <sys/nbmlock.h>
  54 #include <sys/policy.h>
  55 #include <sys/sdt.h>
  56 
  57 #include <rpc/types.h>
  58 #include <rpc/auth.h>
  59 #include <rpc/svc.h>
  60 
  61 #include <nfs/nfs.h>
  62 #include <nfs/export.h>
  63 #include <nfs/nfs_cmd.h>
  64 
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/seg_map.h>
  69 #include <vm/seg_kmem.h>
  70 
  71 #include <sys/strsubr.h>
  72 
  73 /*
  74  * These are the interface routines for the server side of the
  75  * Network File System.  See the NFS version 2 protocol specification
  76  * for a description of this interface.
  77  */
  78 
  79 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  80 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
  81                         cred_t *);
  82 
  83 /*
  84  * Some "over the wire" UNIX file types.  These are encoded
  85  * into the mode.  This needs to be fixed in the next rev.
  86  */
  87 #define IFMT            0170000         /* type of file */
  88 #define IFCHR           0020000         /* character special */
  89 #define IFBLK           0060000         /* block special */
  90 #define IFSOCK          0140000         /* socket */
  91 
  92 u_longlong_t nfs2_srv_caller_id;
  93 
  94 /*
  95  * Get file attributes.
  96  * Returns the current attributes of the file with the given fhandle.
  97  */
  98 /* ARGSUSED */
  99 void
 100 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 101     struct svc_req *req, cred_t *cr, bool_t ro)
 102 {
 103         int error;
 104         vnode_t *vp;
 105         struct vattr va;
 106 
 107         vp = nfs_fhtovp(fhp, exi);
 108         if (vp == NULL) {
 109                 ns->ns_status = NFSERR_STALE;
 110                 return;
 111         }
 112 
 113         /*
 114          * Do the getattr.
 115          */
 116         va.va_mask = AT_ALL;    /* we want all the attributes */
 117 
 118         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 119 
 120         /* check for overflows */
 121         if (!error) {
 122                 /* Lie about the object type for a referral */
 123                 if (vn_is_nfs_reparse(vp, cr))
 124                         va.va_type = VLNK;
 125 
 126                 acl_perm(vp, exi, &va, cr);
 127                 error = vattr_to_nattr(&va, &ns->ns_attr);
 128         }
 129 
 130         VN_RELE(vp);
 131 
 132         ns->ns_status = puterrno(error);
 133 }
/*
 * Dispatch helper: hand back the GETATTR argument's file handle so the
 * common RPC dispatch code can locate the export for the request.
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
 139 
 140 /*
 141  * Set file attributes.
 142  * Sets the attributes of the file with the given fhandle.  Returns
 143  * the new attributes.
 144  */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;			/* flags handed to VOP_SETATTR */
	int in_crit = 0;		/* nonzero while in NBMAND critical region */
	vnode_t *vp;
	struct vattr va;		/* attributes requested by the client */
	struct vattr bva;		/* attributes of the file before the change */
	struct flock64 bf;		/* region descriptor for VOP_SPACE */
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* Refuse attribute changes on read-only exports. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the over-the-wire sattr into a kernel vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/* Caller context so VOPs can detect delegation conflicts (no block). */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		/* Need the owner and current size to decide how to proceed. */
		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region being changed is the span between the
			 * old size and the new size, whichever way it moves.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner gets the VOP_SPACE shortcut; the size is
		 * then removed from va_mask so VOP_SETATTR below won't try
		 * (and fail) to set it again.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		/* Fetch the post-change attributes for the reply. */
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
/*
 * Dispatch helper: extract the file handle from the SETATTR arguments
 * so the common RPC dispatch code can locate the export.
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
 332 
/*
 * Cross into a filesystem mounted on top of *vpp.
 *
 * On success with an exported "nohide" submount, the caller's holds on
 * *vpp and *exip are released and replaced with the submount's root
 * vnode and export; otherwise *vpp/*exip are left untouched.  A nonzero
 * return indicates a failure from traverse() or VOP_FID(); in that case
 * nothing is changed and no caller-owned references are consumed.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Take our own hold; traverse() swallows it and returns a new one. */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	/* Build the fid of the covered root to look up its export entry. */
	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
 378 
 379 /*
 380  * Given mounted "dvp" and "exi", go upper mountpoint
 381  * with dvp/exi correction
 382  * Return 0 in success
 383  */
 384 int
 385 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
 386 {
 387         struct exportinfo *exi;
 388         vnode_t *dvp = *dvpp;
 389 
 390         ASSERT(dvp->v_flag & VROOT);
 391 
 392         VN_HOLD(dvp);
 393         dvp = untraverse(dvp);
 394         exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
 395         if (exi == NULL) {
 396                 VN_RELE(dvp);
 397                 return (-1);
 398         }
 399 
 400         exi_rele(*exip);
 401         *exip = exi;
 402         VN_RELE(*dvpp);
 403         *dvpp = dvp;
 404 
 405         return (0);
 406 }
 407 /*
 408  * Directory lookup.
 409  * Returns an fhandle and file attributes for file name in a directory.
 410  */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;			/* directory being searched */
	vnode_t *vp;			/* result of the lookup */
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};	/* WebNFS security override info */
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/* Hold the export; it may be swapped below by crossmnt/publicfh. */
	exi_hold(exi);

	/*
	 * Not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    (dvp->v_flag & VROOT)) {
			/*
			 * special case for ".." and 'nohide' exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else  {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	/* Translate the name to the export's character set, if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		/* mclookup returns its own hold on exi on success */
		exi_rele(exi);

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	/* Free the converted name if nfscmd_convname allocated one. */
	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	/* Cross into a submount covering the result, if one exists. */
	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	if (exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
/*
 * Dispatch helper: return the directory file handle from the LOOKUP
 * arguments so the common RPC dispatch code can locate the export.
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
 571 
 572 /*
 573  * Read symbolic link.
 574  * Returns the string in the symbolic link at the given fhandle.
 575  */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;	/* nonzero if vp is an NFS referral point */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	/* Only the mode is needed for the MANDLOCK check below. */
	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse objects under mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): assumes the link contents are shorter than
		 * NFS_MAXPATHLEN (so uio_resid >= 1); if a filesystem ever
		 * returned exactly NFS_MAXPATHLEN bytes this store would be
		 * one past the buffer — confirm against MAXPATHLEN limits.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Convert outbound name to the client's character set, if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	/* On conversion, swap in the new buffer and free the original. */
	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
/*
 * Dispatch helper: hand back the READLINK argument's file handle so the
 * common RPC dispatch code can locate the export.
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
 707 /*
 708  * Free data allocated by rfs_readlink
 709  */
 710 void
 711 rfs_rlfree(struct nfsrdlnres *rl)
 712 {
 713         if (rl->rl_data != NULL)
 714                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 715 }
 716 
 717 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 718 
 719 /*
 720  * Read data.
 721  * Returns some data read from the file at the given fhandle.
 722  */
 723 /* ARGSUSED */
 724 void
 725 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 726     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 727 {
 728         vnode_t *vp;
 729         int error;
 730         struct vattr va;
 731         struct iovec iov;
 732         struct uio uio;
 733         mblk_t *mp;
 734         int alloc_err = 0;
 735         int in_crit = 0;
 736         caller_context_t ct;
 737 
 738         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 739         if (vp == NULL) {
 740                 rr->rr_data = NULL;
 741                 rr->rr_status = NFSERR_STALE;
 742                 return;
 743         }
 744 
 745         if (vp->v_type != VREG) {
 746                 VN_RELE(vp);
 747                 rr->rr_data = NULL;
 748                 rr->rr_status = NFSERR_ISDIR;
 749                 return;
 750         }
 751 
 752         ct.cc_sysid = 0;
 753         ct.cc_pid = 0;
 754         ct.cc_caller_id = nfs2_srv_caller_id;
 755         ct.cc_flags = CC_DONTBLOCK;
 756 
 757         /*
 758          * Enter the critical region before calling VOP_RWLOCK
 759          * to avoid a deadlock with write requests.
 760          */
 761         if (nbl_need_check(vp)) {
 762                 nbl_start_crit(vp, RW_READER);
 763                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 764                     0, NULL)) {
 765                         nbl_end_crit(vp);
 766                         VN_RELE(vp);
 767                         rr->rr_data = NULL;
 768                         rr->rr_status = NFSERR_ACCES;
 769                         return;
 770                 }
 771                 in_crit = 1;
 772         }
 773 
 774         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 775 
 776         /* check if a monitor detected a delegation conflict */
 777         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 778                 VN_RELE(vp);
 779                 /* mark as wouldblock so response is dropped */
 780                 curthread->t_flag |= T_WOULDBLOCK;
 781 
 782                 rr->rr_data = NULL;
 783                 return;
 784         }
 785 
 786         va.va_mask = AT_ALL;
 787 
 788         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 789 
 790         if (error) {
 791                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 792                 if (in_crit)
 793                         nbl_end_crit(vp);
 794 
 795                 VN_RELE(vp);
 796                 rr->rr_data = NULL;
 797                 rr->rr_status = puterrno(error);
 798 
 799                 return;
 800         }
 801 
 802         /*
 803          * This is a kludge to allow reading of files created
 804          * with no read permission.  The owner of the file
 805          * is always allowed to read it.
 806          */
 807         if (crgetuid(cr) != va.va_uid) {
 808                 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 809 
 810                 if (error) {
 811                         /*
 812                          * Exec is the same as read over the net because
 813                          * of demand loading.
 814                          */
 815                         error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 816                 }
 817                 if (error) {
 818                         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 819                         if (in_crit)
 820                                 nbl_end_crit(vp);
 821                         VN_RELE(vp);
 822                         rr->rr_data = NULL;
 823                         rr->rr_status = puterrno(error);
 824 
 825                         return;
 826                 }
 827         }
 828 
 829         if (MANDLOCK(vp, va.va_mode)) {
 830                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 831                 if (in_crit)
 832                         nbl_end_crit(vp);
 833 
 834                 VN_RELE(vp);
 835                 rr->rr_data = NULL;
 836                 rr->rr_status = NFSERR_ACCES;
 837 
 838                 return;
 839         }
 840 
 841         rr->rr_ok.rrok_wlist_len = 0;
 842         rr->rr_ok.rrok_wlist = NULL;
 843 
 844         if ((u_offset_t)ra->ra_offset >= va.va_size) {
 845                 rr->rr_count = 0;
 846                 rr->rr_data = NULL;
 847                 /*
 848                  * In this case, status is NFS_OK, but there is no data
 849                  * to encode. So set rr_mp to NULL.
 850                  */
 851                 rr->rr_mp = NULL;
 852                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 853                 if (rr->rr_ok.rrok_wlist)
 854                         clist_zero_len(rr->rr_ok.rrok_wlist);
 855                 goto done;
 856         }
 857 
 858         if (ra->ra_wlist) {
 859                 mp = NULL;
 860                 rr->rr_mp = NULL;
 861                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 862                 if (ra->ra_count > iov.iov_len) {
 863                         rr->rr_data = NULL;
 864                         rr->rr_status = NFSERR_INVAL;
 865                         goto done;
 866                 }
 867         } else {
 868                 /*
 869                  * mp will contain the data to be sent out in the read reply.
 870                  * This will be freed after the reply has been sent out (by the
 871                  * driver).
 872                  * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
 873                  * that the call to xdrmblk_putmblk() never fails.
 874                  */
 875                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 876                     &alloc_err);
 877                 ASSERT(mp != NULL);
 878                 ASSERT(alloc_err == 0);
 879 
 880                 rr->rr_mp = mp;
 881 
 882                 /*
 883                  * Set up io vector
 884                  */
 885                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 886                 iov.iov_len = ra->ra_count;
 887         }
 888 
 889         uio.uio_iov = &iov;
 890         uio.uio_iovcnt = 1;
 891         uio.uio_segflg = UIO_SYSSPACE;
 892         uio.uio_extflg = UIO_COPY_CACHED;
 893         uio.uio_loffset = (offset_t)ra->ra_offset;
 894         uio.uio_resid = ra->ra_count;
 895 
 896         error = VOP_READ(vp, &uio, 0, cr, &ct);
 897 
 898         if (error) {
 899                 if (mp)
 900                         freeb(mp);
 901 
 902                 /*
 903                  * check if a monitor detected a delegation conflict and
 904                  * mark as wouldblock so response is dropped
 905                  */
 906                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 907                         curthread->t_flag |= T_WOULDBLOCK;
 908                 else
 909                         rr->rr_status = puterrno(error);
 910 
 911                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 912                 if (in_crit)
 913                         nbl_end_crit(vp);
 914 
 915                 VN_RELE(vp);
 916                 rr->rr_data = NULL;
 917 
 918                 return;
 919         }
 920 
 921         /*
 922          * Get attributes again so we can send the latest access
 923          * time to the client side for its cache.
 924          */
 925         va.va_mask = AT_ALL;
 926 
 927         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 928 
 929         if (error) {
 930                 if (mp)
 931                         freeb(mp);
 932 
 933                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 934                 if (in_crit)
 935                         nbl_end_crit(vp);
 936 
 937                 VN_RELE(vp);
 938                 rr->rr_data = NULL;
 939                 rr->rr_status = puterrno(error);
 940 
 941                 return;
 942         }
 943 
 944         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 945 
 946         if (mp) {
 947                 rr->rr_data = (char *)mp->b_datap->db_base;
 948         } else {
 949                 if (ra->ra_wlist) {
 950                         rr->rr_data = (caddr_t)iov.iov_base;
 951                         if (!rdma_setup_read_data2(ra, rr)) {
 952                                 rr->rr_data = NULL;
 953                                 rr->rr_status = puterrno(NFSERR_INVAL);
 954                         }
 955                 }
 956         }
 957 done:
 958         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 959         if (in_crit)
 960                 nbl_end_crit(vp);
 961 
 962         acl_perm(vp, exi, &va, cr);
 963 
 964         /* check for overflows */
 965         error = vattr_to_nattr(&va, &rr->rr_attr);
 966 
 967         VN_RELE(vp);
 968 
 969         rr->rr_status = puterrno(error);
 970 }
 971 
 972 /*
 973  * Free data allocated by rfs_read
 974  */
 975 void
 976 rfs_rdfree(struct nfsrdresult *rr)
 977 {
 978         mblk_t *mp;
 979 
 980         if (rr->rr_status == NFS_OK) {
 981                 mp = rr->rr_mp;
 982                 if (mp != NULL)
 983                         freeb(mp);
 984         }
 985 }
 986 
 987 void *
 988 rfs_read_getfh(struct nfsreadargs *ra)
 989 {
 990         return (&ra->ra_fhandle);
 991 }
 992 
/* Number of iovec entries kept on the stack in rfs_write_sync(). */
#define MAX_IOVECS      12

#ifdef DEBUG
/* How often the mblk chain fit in the stack iovec array vs. needed kmem. */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
 999 
1000 /*
1001  * Write data to file.
1002  * Returns attributes of a file after writing some data to it.
1003  *
1004  * Any changes made here, especially in error handling might have
1005  * to also be done in rfs_write (which clusters write requests).
1006  */
1007 /* ARGSUSED */
1008 void
1009 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1010     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1011 {
1012         int error;
1013         vnode_t *vp;
1014         rlim64_t rlimit;
1015         struct vattr va;
1016         struct uio uio;
1017         struct iovec iov[MAX_IOVECS];
1018         mblk_t *m;
1019         struct iovec *iovp;
1020         int iovcnt;
1021         cred_t *savecred;
1022         int in_crit = 0;
1023         caller_context_t ct;
1024 
1025         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1026         if (vp == NULL) {
1027                 ns->ns_status = NFSERR_STALE;
1028                 return;
1029         }
1030 
1031         if (rdonly(ro, vp)) {
1032                 VN_RELE(vp);
1033                 ns->ns_status = NFSERR_ROFS;
1034                 return;
1035         }
1036 
1037         if (vp->v_type != VREG) {
1038                 VN_RELE(vp);
1039                 ns->ns_status = NFSERR_ISDIR;
1040                 return;
1041         }
1042 
1043         ct.cc_sysid = 0;
1044         ct.cc_pid = 0;
1045         ct.cc_caller_id = nfs2_srv_caller_id;
1046         ct.cc_flags = CC_DONTBLOCK;
1047 
1048         va.va_mask = AT_UID|AT_MODE;
1049 
1050         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1051 
1052         if (error) {
1053                 VN_RELE(vp);
1054                 ns->ns_status = puterrno(error);
1055 
1056                 return;
1057         }
1058 
1059         if (crgetuid(cr) != va.va_uid) {
1060                 /*
1061                  * This is a kludge to allow writes of files created
1062                  * with read only permission.  The owner of the file
1063                  * is always allowed to write it.
1064                  */
1065                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1066 
1067                 if (error) {
1068                         VN_RELE(vp);
1069                         ns->ns_status = puterrno(error);
1070                         return;
1071                 }
1072         }
1073 
1074         /*
1075          * Can't access a mandatory lock file.  This might cause
1076          * the NFS service thread to block forever waiting for a
1077          * lock to be released that will never be released.
1078          */
1079         if (MANDLOCK(vp, va.va_mode)) {
1080                 VN_RELE(vp);
1081                 ns->ns_status = NFSERR_ACCES;
1082                 return;
1083         }
1084 
1085         /*
1086          * We have to enter the critical region before calling VOP_RWLOCK
1087          * to avoid a deadlock with ufs.
1088          */
1089         if (nbl_need_check(vp)) {
1090                 nbl_start_crit(vp, RW_READER);
1091                 in_crit = 1;
1092                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1093                     wa->wa_count, 0, NULL)) {
1094                         error = EACCES;
1095                         goto out;
1096                 }
1097         }
1098 
1099         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1100 
1101         /* check if a monitor detected a delegation conflict */
1102         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1103                 VN_RELE(vp);
1104                 /* mark as wouldblock so response is dropped */
1105                 curthread->t_flag |= T_WOULDBLOCK;
1106                 return;
1107         }
1108 
1109         if (wa->wa_data || wa->wa_rlist) {
1110                 /* Do the RDMA thing if necessary */
1111                 if (wa->wa_rlist) {
1112                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1113                         iov[0].iov_len = wa->wa_count;
1114                 } else  {
1115                         iov[0].iov_base = wa->wa_data;
1116                         iov[0].iov_len = wa->wa_count;
1117                 }
1118                 uio.uio_iov = iov;
1119                 uio.uio_iovcnt = 1;
1120                 uio.uio_segflg = UIO_SYSSPACE;
1121                 uio.uio_extflg = UIO_COPY_DEFAULT;
1122                 uio.uio_loffset = (offset_t)wa->wa_offset;
1123                 uio.uio_resid = wa->wa_count;
1124                 /*
1125                  * The limit is checked on the client. We
1126                  * should allow any size writes here.
1127                  */
1128                 uio.uio_llimit = curproc->p_fsz_ctl;
1129                 rlimit = uio.uio_llimit - wa->wa_offset;
1130                 if (rlimit < (rlim64_t)uio.uio_resid)
1131                         uio.uio_resid = (uint_t)rlimit;
1132 
1133                 /*
1134                  * for now we assume no append mode
1135                  */
1136                 /*
1137                  * We're changing creds because VM may fault and we need
1138                  * the cred of the current thread to be used if quota
1139                  * checking is enabled.
1140                  */
1141                 savecred = curthread->t_cred;
1142                 curthread->t_cred = cr;
1143                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1144                 curthread->t_cred = savecred;
1145         } else {
1146                 iovcnt = 0;
1147                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1148                         iovcnt++;
1149                 if (iovcnt <= MAX_IOVECS) {
1150 #ifdef DEBUG
1151                         rfs_write_sync_hits++;
1152 #endif
1153                         iovp = iov;
1154                 } else {
1155 #ifdef DEBUG
1156                         rfs_write_sync_misses++;
1157 #endif
1158                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1159                 }
1160                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1161                 uio.uio_iov = iovp;
1162                 uio.uio_iovcnt = iovcnt;
1163                 uio.uio_segflg = UIO_SYSSPACE;
1164                 uio.uio_extflg = UIO_COPY_DEFAULT;
1165                 uio.uio_loffset = (offset_t)wa->wa_offset;
1166                 uio.uio_resid = wa->wa_count;
1167                 /*
1168                  * The limit is checked on the client. We
1169                  * should allow any size writes here.
1170                  */
1171                 uio.uio_llimit = curproc->p_fsz_ctl;
1172                 rlimit = uio.uio_llimit - wa->wa_offset;
1173                 if (rlimit < (rlim64_t)uio.uio_resid)
1174                         uio.uio_resid = (uint_t)rlimit;
1175 
1176                 /*
1177                  * For now we assume no append mode.
1178                  */
1179                 /*
1180                  * We're changing creds because VM may fault and we need
1181                  * the cred of the current thread to be used if quota
1182                  * checking is enabled.
1183                  */
1184                 savecred = curthread->t_cred;
1185                 curthread->t_cred = cr;
1186                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1187                 curthread->t_cred = savecred;
1188 
1189                 if (iovp != iov)
1190                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1191         }
1192 
1193         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1194 
1195         if (!error) {
1196                 /*
1197                  * Get attributes again so we send the latest mod
1198                  * time to the client side for its cache.
1199                  */
1200                 va.va_mask = AT_ALL;    /* now we want everything */
1201 
1202                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1203 
1204                 /* check for overflows */
1205                 if (!error) {
1206                         acl_perm(vp, exi, &va, cr);
1207                         error = vattr_to_nattr(&va, &ns->ns_attr);
1208                 }
1209         }
1210 
1211 out:
1212         if (in_crit)
1213                 nbl_end_crit(vp);
1214         VN_RELE(vp);
1215 
1216         /* check if a monitor detected a delegation conflict */
1217         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1218                 /* mark as wouldblock so response is dropped */
1219                 curthread->t_flag |= T_WOULDBLOCK;
1220         else
1221                 ns->ns_status = puterrno(error);
1222 
1223 }
1224 
/*
 * One queued WRITE request, parked while it waits to be processed as
 * part of a cluster of writes to the same file.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* client's write arguments */
	struct nfsattrstat *ns;		/* response to fill in for this req */
	struct svc_req *req;		/* transport request handle */
	cred_t *cr;			/* credentials of the requester */
	bool_t ro;			/* export is read-only for this req */
	kthread_t *thread;		/* service thread waiting on entry */
	struct rfs_async_write *list;	/* next request in this cluster */
};

/*
 * A cluster of write requests to a single file, identified by file
 * handle.  The owning thread broadcasts on cv once every request in
 * the cluster has had its status filled in.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* waiters sleep here until done */
	struct rfs_async_write *list;	/* requests, sorted by offset */
	struct rfs_async_write_list *next;	/* next active cluster */
};

/* Head of the global list of forming clusters; guarded by the lock below. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Stack iovec capacity in rfs_write(); larger clusters use kmem_alloc(). */
#define MAXCLIOVECS     42
/* Sentinel for "status not yet set"; 0 would be mistaken for NFS_OK. */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* How often the cluster's iovecs fit on the stack vs. needed kmem. */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1253 
1254 /*
1255  * Write data to file.
1256  * Returns attributes of a file after writing some data to it.
1257  */
1258 void
1259 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1260     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1261 {
1262         int error;
1263         vnode_t *vp;
1264         rlim64_t rlimit;
1265         struct vattr va;
1266         struct uio uio;
1267         struct rfs_async_write_list *lp;
1268         struct rfs_async_write_list *nlp;
1269         struct rfs_async_write *rp;
1270         struct rfs_async_write *nrp;
1271         struct rfs_async_write *trp;
1272         struct rfs_async_write *lrp;
1273         int data_written;
1274         int iovcnt;
1275         mblk_t *m;
1276         struct iovec *iovp;
1277         struct iovec *niovp;
1278         struct iovec iov[MAXCLIOVECS];
1279         int count;
1280         int rcount;
1281         uint_t off;
1282         uint_t len;
1283         struct rfs_async_write nrpsp;
1284         struct rfs_async_write_list nlpsp;
1285         ushort_t t_flag;
1286         cred_t *savecred;
1287         int in_crit = 0;
1288         caller_context_t ct;
1289 
1290         if (!rfs_write_async) {
1291                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1292                 return;
1293         }
1294 
1295         /*
1296          * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1297          * is considered an OK.
1298          */
1299         ns->ns_status = RFSWRITE_INITVAL;
1300 
1301         nrp = &nrpsp;
1302         nrp->wa = wa;
1303         nrp->ns = ns;
1304         nrp->req = req;
1305         nrp->cr = cr;
1306         nrp->ro = ro;
1307         nrp->thread = curthread;
1308 
1309         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1310 
1311         /*
1312          * Look to see if there is already a cluster started
1313          * for this file.
1314          */
1315         mutex_enter(&rfs_async_write_lock);
1316         for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1317                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1318                     sizeof (fhandle_t)) == 0)
1319                         break;
1320         }
1321 
1322         /*
1323          * If lp is non-NULL, then there is already a cluster
1324          * started.  We need to place ourselves in the cluster
1325          * list in the right place as determined by starting
1326          * offset.  Conflicts with non-blocking mandatory locked
1327          * regions will be checked when the cluster is processed.
1328          */
1329         if (lp != NULL) {
1330                 rp = lp->list;
1331                 trp = NULL;
1332                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1333                         trp = rp;
1334                         rp = rp->list;
1335                 }
1336                 nrp->list = rp;
1337                 if (trp == NULL)
1338                         lp->list = nrp;
1339                 else
1340                         trp->list = nrp;
1341                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1342                         cv_wait(&lp->cv, &rfs_async_write_lock);
1343                 mutex_exit(&rfs_async_write_lock);
1344 
1345                 return;
1346         }
1347 
1348         /*
1349          * No cluster started yet, start one and add ourselves
1350          * to the list of clusters.
1351          */
1352         nrp->list = NULL;
1353 
1354         nlp = &nlpsp;
1355         nlp->fhp = &wa->wa_fhandle;
1356         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1357         nlp->list = nrp;
1358         nlp->next = NULL;
1359 
1360         if (rfs_async_write_head == NULL) {
1361                 rfs_async_write_head = nlp;
1362         } else {
1363                 lp = rfs_async_write_head;
1364                 while (lp->next != NULL)
1365                         lp = lp->next;
1366                 lp->next = nlp;
1367         }
1368         mutex_exit(&rfs_async_write_lock);
1369 
1370         /*
1371          * Convert the file handle common to all of the requests
1372          * in this cluster to a vnode.
1373          */
1374         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1375         if (vp == NULL) {
1376                 mutex_enter(&rfs_async_write_lock);
1377                 if (rfs_async_write_head == nlp)
1378                         rfs_async_write_head = nlp->next;
1379                 else {
1380                         lp = rfs_async_write_head;
1381                         while (lp->next != nlp)
1382                                 lp = lp->next;
1383                         lp->next = nlp->next;
1384                 }
1385                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1386                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1387                         rp->ns->ns_status = NFSERR_STALE;
1388                         rp->thread->t_flag |= t_flag;
1389                 }
1390                 cv_broadcast(&nlp->cv);
1391                 mutex_exit(&rfs_async_write_lock);
1392 
1393                 return;
1394         }
1395 
1396         /*
1397          * Can only write regular files.  Attempts to write any
1398          * other file types fail with EISDIR.
1399          */
1400         if (vp->v_type != VREG) {
1401                 VN_RELE(vp);
1402                 mutex_enter(&rfs_async_write_lock);
1403                 if (rfs_async_write_head == nlp)
1404                         rfs_async_write_head = nlp->next;
1405                 else {
1406                         lp = rfs_async_write_head;
1407                         while (lp->next != nlp)
1408                                 lp = lp->next;
1409                         lp->next = nlp->next;
1410                 }
1411                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1412                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1413                         rp->ns->ns_status = NFSERR_ISDIR;
1414                         rp->thread->t_flag |= t_flag;
1415                 }
1416                 cv_broadcast(&nlp->cv);
1417                 mutex_exit(&rfs_async_write_lock);
1418 
1419                 return;
1420         }
1421 
1422         /*
1423          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1424          * deadlock with ufs.
1425          */
1426         if (nbl_need_check(vp)) {
1427                 nbl_start_crit(vp, RW_READER);
1428                 in_crit = 1;
1429         }
1430 
1431         ct.cc_sysid = 0;
1432         ct.cc_pid = 0;
1433         ct.cc_caller_id = nfs2_srv_caller_id;
1434         ct.cc_flags = CC_DONTBLOCK;
1435 
1436         /*
1437          * Lock the file for writing.  This operation provides
1438          * the delay which allows clusters to grow.
1439          */
1440         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1441 
1442         /* check if a monitor detected a delegation conflict */
1443         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1444                 if (in_crit)
1445                         nbl_end_crit(vp);
1446                 VN_RELE(vp);
1447                 /* mark as wouldblock so response is dropped */
1448                 curthread->t_flag |= T_WOULDBLOCK;
1449                 mutex_enter(&rfs_async_write_lock);
1450                 if (rfs_async_write_head == nlp)
1451                         rfs_async_write_head = nlp->next;
1452                 else {
1453                         lp = rfs_async_write_head;
1454                         while (lp->next != nlp)
1455                                 lp = lp->next;
1456                         lp->next = nlp->next;
1457                 }
1458                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1459                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1460                                 rp->ns->ns_status = puterrno(error);
1461                                 rp->thread->t_flag |= T_WOULDBLOCK;
1462                         }
1463                 }
1464                 cv_broadcast(&nlp->cv);
1465                 mutex_exit(&rfs_async_write_lock);
1466 
1467                 return;
1468         }
1469 
1470         /*
1471          * Disconnect this cluster from the list of clusters.
1472          * The cluster that is being dealt with must be fixed
1473          * in size after this point, so there is no reason
1474          * to leave it on the list so that new requests can
1475          * find it.
1476          *
1477          * The algorithm is that the first write request will
1478          * create a cluster, convert the file handle to a
1479          * vnode pointer, and then lock the file for writing.
1480          * This request is not likely to be clustered with
1481          * any others.  However, the next request will create
1482          * a new cluster and be blocked in VOP_RWLOCK while
1483          * the first request is being processed.  This delay
1484          * will allow more requests to be clustered in this
1485          * second cluster.
1486          */
1487         mutex_enter(&rfs_async_write_lock);
1488         if (rfs_async_write_head == nlp)
1489                 rfs_async_write_head = nlp->next;
1490         else {
1491                 lp = rfs_async_write_head;
1492                 while (lp->next != nlp)
1493                         lp = lp->next;
1494                 lp->next = nlp->next;
1495         }
1496         mutex_exit(&rfs_async_write_lock);
1497 
1498         /*
1499          * Step through the list of requests in this cluster.
1500          * We need to check permissions to make sure that all
1501          * of the requests have sufficient permission to write
1502          * the file.  A cluster can be composed of requests
1503          * from different clients and different users on each
1504          * client.
1505          *
1506          * As a side effect, we also calculate the size of the
1507          * byte range that this cluster encompasses.
1508          */
1509         rp = nlp->list;
1510         off = rp->wa->wa_offset;
1511         len = (uint_t)0;
1512         do {
1513                 if (rdonly(rp->ro, vp)) {
1514                         rp->ns->ns_status = NFSERR_ROFS;
1515                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1516                         rp->thread->t_flag |= t_flag;
1517                         continue;
1518                 }
1519 
1520                 va.va_mask = AT_UID|AT_MODE;
1521 
1522                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1523 
1524                 if (!error) {
1525                         if (crgetuid(rp->cr) != va.va_uid) {
1526                                 /*
1527                                  * This is a kludge to allow writes of files
1528                                  * created with read only permission.  The
1529                                  * owner of the file is always allowed to
1530                                  * write it.
1531                                  */
1532                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1533                         }
1534                         if (!error && MANDLOCK(vp, va.va_mode))
1535                                 error = EACCES;
1536                 }
1537 
1538                 /*
1539                  * Check for a conflict with a nbmand-locked region.
1540                  */
1541                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1542                     rp->wa->wa_count, 0, NULL)) {
1543                         error = EACCES;
1544                 }
1545 
1546                 if (error) {
1547                         rp->ns->ns_status = puterrno(error);
1548                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1549                         rp->thread->t_flag |= t_flag;
1550                         continue;
1551                 }
1552                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1553                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1554         } while ((rp = rp->list) != NULL);
1555 
1556         /*
1557          * Step through the cluster attempting to gather as many
1558          * requests which are contiguous as possible.  These
1559          * contiguous requests are handled via one call to VOP_WRITE
1560          * instead of different calls to VOP_WRITE.  We also keep
1561          * track of the fact that any data was written.
1562          */
1563         rp = nlp->list;
1564         data_written = 0;
1565         do {
1566                 /*
1567                  * Skip any requests which are already marked as having an
1568                  * error.
1569                  */
1570                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1571                         rp = rp->list;
1572                         continue;
1573                 }
1574 
1575                 /*
1576                  * Count the number of iovec's which are required
1577                  * to handle this set of requests.  One iovec is
1578                  * needed for each data buffer, whether addressed
1579                  * by wa_data or by the b_rptr pointers in the
1580                  * mblk chains.
1581                  */
1582                 iovcnt = 0;
1583                 lrp = rp;
1584                 for (;;) {
1585                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1586                                 iovcnt++;
1587                         else {
1588                                 m = lrp->wa->wa_mblk;
1589                                 while (m != NULL) {
1590                                         iovcnt++;
1591                                         m = m->b_cont;
1592                                 }
1593                         }
1594                         if (lrp->list == NULL ||
1595                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1596                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1597                             lrp->list->wa->wa_offset) {
1598                                 lrp = lrp->list;
1599                                 break;
1600                         }
1601                         lrp = lrp->list;
1602                 }
1603 
1604                 if (iovcnt <= MAXCLIOVECS) {
1605 #ifdef DEBUG
1606                         rfs_write_hits++;
1607 #endif
1608                         niovp = iov;
1609                 } else {
1610 #ifdef DEBUG
1611                         rfs_write_misses++;
1612 #endif
1613                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1614                 }
1615                 /*
1616                  * Put together the scatter/gather iovecs.
1617                  */
1618                 iovp = niovp;
1619                 trp = rp;
1620                 count = 0;
1621                 do {
1622                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1623                                 if (trp->wa->wa_rlist) {
1624                                         iovp->iov_base =
1625                                             (char *)((trp->wa->wa_rlist)->
1626                                             u.c_daddr3);
1627                                         iovp->iov_len = trp->wa->wa_count;
1628                                 } else  {
1629                                         iovp->iov_base = trp->wa->wa_data;
1630                                         iovp->iov_len = trp->wa->wa_count;
1631                                 }
1632                                 iovp++;
1633                         } else {
1634                                 m = trp->wa->wa_mblk;
1635                                 rcount = trp->wa->wa_count;
1636                                 while (m != NULL) {
1637                                         iovp->iov_base = (caddr_t)m->b_rptr;
1638                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1639                                         rcount -= iovp->iov_len;
1640                                         if (rcount < 0)
1641                                                 iovp->iov_len += rcount;
1642                                         iovp++;
1643                                         if (rcount <= 0)
1644                                                 break;
1645                                         m = m->b_cont;
1646                                 }
1647                         }
1648                         count += trp->wa->wa_count;
1649                         trp = trp->list;
1650                 } while (trp != lrp);
1651 
1652                 uio.uio_iov = niovp;
1653                 uio.uio_iovcnt = iovcnt;
1654                 uio.uio_segflg = UIO_SYSSPACE;
1655                 uio.uio_extflg = UIO_COPY_DEFAULT;
1656                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1657                 uio.uio_resid = count;
1658                 /*
1659                  * The limit is checked on the client. We
1660                  * should allow any size writes here.
1661                  */
1662                 uio.uio_llimit = curproc->p_fsz_ctl;
1663                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1664                 if (rlimit < (rlim64_t)uio.uio_resid)
1665                         uio.uio_resid = (uint_t)rlimit;
1666 
1667                 /*
1668                  * For now we assume no append mode.
1669                  */
1670 
1671                 /*
1672                  * We're changing creds because VM may fault
1673                  * and we need the cred of the current
1674                  * thread to be used if quota * checking is
1675                  * enabled.
1676                  */
1677                 savecred = curthread->t_cred;
1678                 curthread->t_cred = cr;
1679                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1680                 curthread->t_cred = savecred;
1681 
1682                 /* check if a monitor detected a delegation conflict */
1683                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1684                         /* mark as wouldblock so response is dropped */
1685                         curthread->t_flag |= T_WOULDBLOCK;
1686 
1687                 if (niovp != iov)
1688                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1689 
1690                 if (!error) {
1691                         data_written = 1;
1692                         /*
1693                          * Get attributes again so we send the latest mod
1694                          * time to the client side for its cache.
1695                          */
1696                         va.va_mask = AT_ALL;    /* now we want everything */
1697 
1698                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1699 
1700                         if (!error)
1701                                 acl_perm(vp, exi, &va, rp->cr);
1702                 }
1703 
1704                 /*
1705                  * Fill in the status responses for each request
1706                  * which was just handled.  Also, copy the latest
1707                  * attributes in to the attribute responses if
1708                  * appropriate.
1709                  */
1710                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1711                 do {
1712                         rp->thread->t_flag |= t_flag;
1713                         /* check for overflows */
1714                         if (!error) {
1715                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1716                         }
1717                         rp->ns->ns_status = puterrno(error);
1718                         rp = rp->list;
1719                 } while (rp != lrp);
1720         } while (rp != NULL);
1721 
1722         /*
1723          * If any data was written at all, then we need to flush
1724          * the data and metadata to stable storage.
1725          */
1726         if (data_written) {
1727                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1728 
1729                 if (!error) {
1730                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1731                 }
1732         }
1733 
1734         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1735 
1736         if (in_crit)
1737                 nbl_end_crit(vp);
1738         VN_RELE(vp);
1739 
1740         t_flag = curthread->t_flag & T_WOULDBLOCK;
1741         mutex_enter(&rfs_async_write_lock);
1742         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1743                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1744                         rp->ns->ns_status = puterrno(error);
1745                         rp->thread->t_flag |= t_flag;
1746                 }
1747         }
1748         cv_broadcast(&nlp->cv);
1749         mutex_exit(&rfs_async_write_lock);
1750 
1751 }
1752 
1753 void *
1754 rfs_write_getfh(struct nfswriteargs *wa)
1755 {
1756         return (&wa->wa_fhandle);
1757 }
1758 
1759 /*
1760  * Create a file.
1761  * Creates a file with given attributes and returns those attributes
1762  * and an fhandle for the new file.
1763  */
1764 void
1765 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1766     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1767 {
1768         int error;
1769         int lookuperr;
1770         int in_crit = 0;
1771         struct vattr va;
1772         vnode_t *vp;
1773         vnode_t *realvp;
1774         vnode_t *dvp;
1775         char *name = args->ca_da.da_name;
1776         vnode_t *tvp = NULL;
1777         int mode;
1778         int lookup_ok;
1779         bool_t trunc;
1780         struct sockaddr *ca;
1781 
1782         /*
1783          * Disallow NULL paths
1784          */
1785         if (name == NULL || *name == '\0') {
1786                 dr->dr_status = NFSERR_ACCES;
1787                 return;
1788         }
1789 
1790         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1791         if (dvp == NULL) {
1792                 dr->dr_status = NFSERR_STALE;
1793                 return;
1794         }
1795 
1796         error = sattr_to_vattr(args->ca_sa, &va);
1797         if (error) {
1798                 dr->dr_status = puterrno(error);
1799                 return;
1800         }
1801 
1802         /*
1803          * Must specify the mode.
1804          */
1805         if (!(va.va_mask & AT_MODE)) {
1806                 VN_RELE(dvp);
1807                 dr->dr_status = NFSERR_INVAL;
1808                 return;
1809         }
1810 
1811         /*
1812          * This is a completely gross hack to make mknod
1813          * work over the wire until we can wack the protocol
1814          */
1815         if ((va.va_mode & IFMT) == IFCHR) {
1816                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1817                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1818                 else {
1819                         va.va_type = VCHR;
1820                         /*
1821                          * uncompress the received dev_t
1822                          * if the top half is zero indicating a request
1823                          * from an `older style' OS.
1824                          */
1825                         if ((va.va_size & 0xffff0000) == 0)
1826                                 va.va_rdev = nfsv2_expdev(va.va_size);
1827                         else
1828                                 va.va_rdev = (dev_t)va.va_size;
1829                 }
1830                 va.va_mask &= ~AT_SIZE;
1831         } else if ((va.va_mode & IFMT) == IFBLK) {
1832                 va.va_type = VBLK;
1833                 /*
1834                  * uncompress the received dev_t
1835                  * if the top half is zero indicating a request
1836                  * from an `older style' OS.
1837                  */
1838                 if ((va.va_size & 0xffff0000) == 0)
1839                         va.va_rdev = nfsv2_expdev(va.va_size);
1840                 else
1841                         va.va_rdev = (dev_t)va.va_size;
1842                 va.va_mask &= ~AT_SIZE;
1843         } else if ((va.va_mode & IFMT) == IFSOCK) {
1844                 va.va_type = VSOCK;
1845         } else {
1846                 va.va_type = VREG;
1847         }
1848         va.va_mode &= ~IFMT;
1849         va.va_mask |= AT_TYPE;
1850 
1851         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1852         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1853             MAXPATHLEN);
1854         if (name == NULL) {
1855                 dr->dr_status = puterrno(EINVAL);
1856                 return;
1857         }
1858 
1859         /*
1860          * Why was the choice made to use VWRITE as the mode to the
1861          * call to VOP_CREATE ? This results in a bug.  When a client
1862          * opens a file that already exists and is RDONLY, the second
1863          * open fails with an EACESS because of the mode.
1864          * bug ID 1054648.
1865          */
1866         lookup_ok = 0;
1867         mode = VWRITE;
1868         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1869                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1870                     NULL, NULL, NULL);
1871                 if (!error) {
1872                         struct vattr at;
1873 
1874                         lookup_ok = 1;
1875                         at.va_mask = AT_MODE;
1876                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1877                         if (!error)
1878                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1879                         VN_RELE(tvp);
1880                         tvp = NULL;
1881                 }
1882         }
1883 
1884         if (!lookup_ok) {
1885                 if (rdonly(ro, dvp)) {
1886                         error = EROFS;
1887                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1888                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1889                         error = EPERM;
1890                 } else {
1891                         error = 0;
1892                 }
1893         }
1894 
1895         /*
1896          * If file size is being modified on an already existing file
1897          * make sure that there are no conflicting non-blocking mandatory
1898          * locks in the region being manipulated. Return EACCES if there
1899          * are conflicting locks.
1900          */
1901         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1902                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1903                     NULL, NULL, NULL);
1904 
1905                 if (!lookuperr &&
1906                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1907                         VN_RELE(tvp);
1908                         curthread->t_flag |= T_WOULDBLOCK;
1909                         goto out;
1910                 }
1911 
1912                 if (!lookuperr && nbl_need_check(tvp)) {
1913                         /*
1914                          * The file exists. Now check if it has any
1915                          * conflicting non-blocking mandatory locks
1916                          * in the region being changed.
1917                          */
1918                         struct vattr bva;
1919                         u_offset_t offset;
1920                         ssize_t length;
1921 
1922                         nbl_start_crit(tvp, RW_READER);
1923                         in_crit = 1;
1924 
1925                         bva.va_mask = AT_SIZE;
1926                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1927                         if (!error) {
1928                                 if (va.va_size < bva.va_size) {
1929                                         offset = va.va_size;
1930                                         length = bva.va_size - va.va_size;
1931                                 } else {
1932                                         offset = bva.va_size;
1933                                         length = va.va_size - bva.va_size;
1934                                 }
1935                                 if (length) {
1936                                         if (nbl_conflict(tvp, NBL_WRITE,
1937                                             offset, length, 0, NULL)) {
1938                                                 error = EACCES;
1939                                         }
1940                                 }
1941                         }
1942                         if (error) {
1943                                 nbl_end_crit(tvp);
1944                                 VN_RELE(tvp);
1945                                 in_crit = 0;
1946                         }
1947                 } else if (tvp != NULL) {
1948                         VN_RELE(tvp);
1949                 }
1950         }
1951 
1952         if (!error) {
1953                 /*
1954                  * If filesystem is shared with nosuid the remove any
1955                  * setuid/setgid bits on create.
1956                  */
1957                 if (va.va_type == VREG &&
1958                     exi->exi_export.ex_flags & EX_NOSUID)
1959                         va.va_mode &= ~(VSUID | VSGID);
1960 
1961                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1962                     NULL, NULL);
1963 
1964                 if (!error) {
1965 
1966                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1967                                 trunc = TRUE;
1968                         else
1969                                 trunc = FALSE;
1970 
1971                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1972                                 VN_RELE(vp);
1973                                 curthread->t_flag |= T_WOULDBLOCK;
1974                                 goto out;
1975                         }
1976                         va.va_mask = AT_ALL;
1977 
1978                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1979 
1980                         /* check for overflows */
1981                         if (!error) {
1982                                 acl_perm(vp, exi, &va, cr);
1983                                 error = vattr_to_nattr(&va, &dr->dr_attr);
1984                                 if (!error) {
1985                                         error = makefh(&dr->dr_fhandle, vp,
1986                                             exi);
1987                                 }
1988                         }
1989                         /*
1990                          * Force modified metadata out to stable storage.
1991                          *
1992                          * if a underlying vp exists, pass it to VOP_FSYNC
1993                          */
1994                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
1995                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1996                         else
1997                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1998                         VN_RELE(vp);
1999                 }
2000 
2001                 if (in_crit) {
2002                         nbl_end_crit(tvp);
2003                         VN_RELE(tvp);
2004                 }
2005         }
2006 
2007         /*
2008          * Force modified data and metadata out to stable storage.
2009          */
2010         (void) VOP_FSYNC(dvp, 0, cr, NULL);
2011 
2012 out:
2013 
2014         VN_RELE(dvp);
2015 
2016         dr->dr_status = puterrno(error);
2017 
2018         if (name != args->ca_da.da_name)
2019                 kmem_free(name, MAXPATHLEN);
2020 }
2021 void *
2022 rfs_create_getfh(struct nfscreatargs *args)
2023 {
2024         return (args->ca_da.da_fhandle);
2025 }
2026 
2027 /*
2028  * Remove a file.
2029  * Remove named file from parent directory.
2030  */
2031 /* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* directory containing the target entry */
	vnode_t *targvp;	/* the entry being removed */
	int in_crit = 0;	/* non-zero while inside nbl_start_crit() */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 * We must look the target up ourselves to get a vnode to check
	 * delegations and mandatory locks against.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Hold the non-blocking-lock critical region across the remove. */
	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2112 
2113 void *
2114 rfs_remove_getfh(struct nfsdiropargs *da)
2115 {
2116         return (da->da_fhandle);
2117 }
2118 
2119 /*
2120  * rename a file
2121  * Give a file (from) a new name (to).
2122  */
2123 /* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* the file being renamed */
	vnode_t *targvp;	/* existing file being renamed over, if any */
	int in_crit = 0;	/* non-zero while inside nbl_start_crit() */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * Both directories must be in the same export, otherwise fail
	 * with NFSERR_XDEV.  The hold on to_exi can be dropped right
	 * away: the pointer is only compared below, never dereferenced.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	/* Both file handles must name directories. */
	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	/* Guard against non-blocking mandatory locks on the source. */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* On success, record the new name on the renamed vnode. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2262 void *
2263 rfs_rename_getfh(struct nfsrnmargs *args)
2264 {
2265         return (args->rna_from.da_fhandle);
2266 }
2267 
2268 /*
2269  * Link to a file.
2270  * Create a file (to) which is a hard link to the given file (from).
2271  */
2272 /* ARGSUSED */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *fromvp;	/* existing file to link to */
	vnode_t *tovp;		/* directory in which to create the link */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The link target directory must live in the same export as the
	 * source, otherwise fail with NFSERR_XDEV.  The hold on to_exi
	 * can be dropped right away: the pointer is only compared below,
	 * never dereferenced.
	 */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2348 void *
2349 rfs_link_getfh(struct nfslinkargs *args)
2350 {
2351         return (args->la_from);
2352 }
2353 
2354 /*
2355  * Symbolicly link to a file.
2356  * Create a file (to) with the given attributes which is a symbolic link
2357  * to the given path name (to).
2358  */
2359 void
2360 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2361     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2362 {
2363         int error;
2364         struct vattr va;
2365         vnode_t *vp;
2366         vnode_t *svp;
2367         int lerror;
2368         struct sockaddr *ca;
2369         char *name = NULL;
2370 
2371         /*
2372          * Disallow NULL paths
2373          */
2374         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2375                 *status = NFSERR_ACCES;
2376                 return;
2377         }
2378 
2379         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2380         if (vp == NULL) {
2381                 *status = NFSERR_STALE;
2382                 return;
2383         }
2384 
2385         if (rdonly(ro, vp)) {
2386                 VN_RELE(vp);
2387                 *status = NFSERR_ROFS;
2388                 return;
2389         }
2390 
2391         error = sattr_to_vattr(args->sla_sa, &va);
2392         if (error) {
2393                 VN_RELE(vp);
2394                 *status = puterrno(error);
2395                 return;
2396         }
2397 
2398         if (!(va.va_mask & AT_MODE)) {
2399                 VN_RELE(vp);
2400                 *status = NFSERR_INVAL;
2401                 return;
2402         }
2403 
2404         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2405         name = nfscmd_convname(ca, exi, args->sla_tnm,
2406             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2407 
2408         if (name == NULL) {
2409                 *status = NFSERR_ACCES;
2410                 return;
2411         }
2412 
2413         va.va_type = VLNK;
2414         va.va_mask |= AT_TYPE;
2415 
2416         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2417 
2418         /*
2419          * Force new data and metadata out to stable storage.
2420          */
2421         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2422             NULL, cr, NULL, NULL, NULL);
2423 
2424         if (!lerror) {
2425                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2426                 VN_RELE(svp);
2427         }
2428 
2429         /*
2430          * Force modified data and metadata out to stable storage.
2431          */
2432         (void) VOP_FSYNC(vp, 0, cr, NULL);
2433 
2434         VN_RELE(vp);
2435 
2436         *status = puterrno(error);
2437         if (name != args->sla_tnm)
2438                 kmem_free(name, MAXPATHLEN);
2439 
2440 }
2441 void *
2442 rfs_symlink_getfh(struct nfsslargs *args)
2443 {
2444         return (args->sla_from.da_fhandle);
2445 }
2446 
2447 /*
2448  * Make a directory.
2449  * Create a directory with the given name, parent directory, and attributes.
2450  * Returns a file handle and attributes for the new directory.
2451  */
2452 /* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is passed the parent vp
			 * here while the attributes are those of the new
			 * directory dvp; rfs_create() passes the newly
			 * created vnode instead -- confirm this is
			 * intentional.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2534 void *
2535 rfs_mkdir_getfh(struct nfscreatargs *args)
2536 {
2537         return (args->ca_da.da_fhandle);
2538 }
2539 
2540 /*
2541  * Remove a directory.
2542  * Remove the given directory name from the given parent directory.
2543  */
2544 /* ARGSUSED */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;		/* parent directory */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR takes a third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
2602 void *
2603 rfs_rmdir_getfh(struct nfsdiropargs *da)
2604 {
2605         return (da->da_fhandle);
2606 }
2607 
2608 /* ARGSUSED */
2609 void
2610 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2611     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2612 {
2613         int error;
2614         int iseof;
2615         struct iovec iov;
2616         struct uio uio;
2617         vnode_t *vp;
2618         char *ndata = NULL;
2619         struct sockaddr *ca;
2620         size_t nents;
2621         int ret;
2622 
2623         vp = nfs_fhtovp(&rda->rda_fh, exi);
2624         if (vp == NULL) {
2625                 rd->rd_entries = NULL;
2626                 rd->rd_status = NFSERR_STALE;
2627                 return;
2628         }
2629 
2630         if (vp->v_type != VDIR) {
2631                 VN_RELE(vp);
2632                 rd->rd_entries = NULL;
2633                 rd->rd_status = NFSERR_NOTDIR;
2634                 return;
2635         }
2636 
2637         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2638 
2639         error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2640 
2641         if (error) {
2642                 rd->rd_entries = NULL;
2643                 goto bad;
2644         }
2645 
2646         if (rda->rda_count == 0) {
2647                 rd->rd_entries = NULL;
2648                 rd->rd_size = 0;
2649                 rd->rd_eof = FALSE;
2650                 goto bad;
2651         }
2652 
2653         rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2654 
2655         /*
2656          * Allocate data for entries.  This will be freed by rfs_rddirfree.
2657          */
2658         rd->rd_bufsize = (uint_t)rda->rda_count;
2659         rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2660 
2661         /*
2662          * Set up io vector to read directory data
2663          */
2664         iov.iov_base = (caddr_t)rd->rd_entries;
2665         iov.iov_len = rda->rda_count;
2666         uio.uio_iov = &iov;
2667         uio.uio_iovcnt = 1;
2668         uio.uio_segflg = UIO_SYSSPACE;
2669         uio.uio_extflg = UIO_COPY_CACHED;
2670         uio.uio_loffset = (offset_t)rda->rda_offset;
2671         uio.uio_resid = rda->rda_count;
2672 
2673         /*
2674          * read directory
2675          */
2676         error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2677 
2678         /*
2679          * Clean up
2680          */
2681         if (!error) {
2682                 /*
2683                  * set size and eof
2684                  */
2685                 if (uio.uio_resid == rda->rda_count) {
2686                         rd->rd_size = 0;
2687                         rd->rd_eof = TRUE;
2688                 } else {
2689                         rd->rd_size = (uint32_t)(rda->rda_count -
2690                             uio.uio_resid);
2691                         rd->rd_eof = iseof ? TRUE : FALSE;
2692                 }
2693         }
2694 
2695         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2696         nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2697         ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2698             rda->rda_count, &ndata);
2699 
2700         if (ret != 0) {
2701                 size_t dropbytes;
2702                 /*
2703                  * We had to drop one or more entries in order to fit
2704                  * during the character conversion.  We need to patch
2705                  * up the size and eof info.
2706                  */
2707                 if (rd->rd_eof)
2708                         rd->rd_eof = FALSE;
2709                 dropbytes = nfscmd_dropped_entrysize(
2710                     (struct dirent64 *)rd->rd_entries, nents, ret);
2711                 rd->rd_size -= dropbytes;
2712         }
2713         if (ndata == NULL) {
2714                 ndata = (char *)rd->rd_entries;
2715         } else if (ndata != (char *)rd->rd_entries) {
2716                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2717                 rd->rd_entries = (void *)ndata;
2718                 rd->rd_bufsize = rda->rda_count;
2719         }
2720 
2721 bad:
2722         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2723 
2724 #if 0 /* notyet */
2725         /*
2726          * Don't do this.  It causes local disk writes when just
2727          * reading the file and the overhead is deemed larger
2728          * than the benefit.
2729          */
2730         /*
2731          * Force modified metadata out to stable storage.
2732          */
2733         (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2734 #endif
2735 
2736         VN_RELE(vp);
2737 
2738         rd->rd_status = puterrno(error);
2739 
2740 }
2741 void *
2742 rfs_readdir_getfh(struct nfsrddirargs *rda)
2743 {
2744         return (&rda->rda_fh);
2745 }
2746 void
2747 rfs_rddirfree(struct nfsrddirres *rd)
2748 {
2749         if (rd->rd_entries != NULL)
2750                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2751 }
2752 
2753 /* ARGSUSED */
2754 void
2755 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2756     struct svc_req *req, cred_t *cr, bool_t ro)
2757 {
2758         int error;
2759         struct statvfs64 sb;
2760         vnode_t *vp;
2761 
2762         vp = nfs_fhtovp(fh, exi);
2763         if (vp == NULL) {
2764                 fs->fs_status = NFSERR_STALE;
2765                 return;
2766         }
2767 
2768         error = VFS_STATVFS(vp->v_vfsp, &sb);
2769 
2770         if (!error) {
2771                 fs->fs_tsize = nfstsize();
2772                 fs->fs_bsize = sb.f_frsize;
2773                 fs->fs_blocks = sb.f_blocks;
2774                 fs->fs_bfree = sb.f_bfree;
2775                 fs->fs_bavail = sb.f_bavail;
2776         }
2777 
2778         VN_RELE(vp);
2779 
2780         fs->fs_status = puterrno(error);
2781 
2782 }
2783 void *
2784 rfs_statfs_getfh(fhandle_t *fh)
2785 {
2786         return (fh);
2787 }
2788 
/*
 * Convert an over-the-wire NFSv2 settable-attributes structure
 * (nfssattr) into a vattr suitable for VOP_SETATTR().  Only fields
 * the client actually supplied are copied; the protocol marks a
 * "don't change" field with an all-ones sentinel.  va_mask is built
 * up to record exactly which fields are valid.
 *
 * Returns 0 on success, or EOVERFLOW if a client-supplied time will
 * not fit in a 32-bit time_t (non-LP64 kernels only).
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both halves of the time must be supplied for it to count. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		/* NFSv2 carries microseconds; vattr wants nanoseconds. */
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		/* NFSv2 carries microseconds; vattr wants nanoseconds. */
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2855 
/*
 * Map vnode types (vtype_t, used as the index) to the corresponding
 * over-the-wire NFSv2 file types.  Entries with no NFSv2 equivalent
 * (VNON, VFIFO, VDOOR, VPROC, VBAD) map to 0; VFIFO is remapped
 * separately in vattr_to_nattr() via NA_SETFIFO.
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2859 
2860 /*
2861  * check the following fields for overflow: nodeid, size, and time.
2862  * There could be a problem when converting 64-bit LP64 fields
2863  * into 32-bit ones.  Return an error if there is an overflow.
2864  */
2865 int
2866 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2867 {
2868         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2869         na->na_type = vt_to_nf[vap->va_type];
2870 
2871         if (vap->va_mode == (unsigned short) -1)
2872                 na->na_mode = (uint32_t)-1;
2873         else
2874                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2875 
2876         if (vap->va_uid == (unsigned short)(-1))
2877                 na->na_uid = (uint32_t)(-1);
2878         else if (vap->va_uid == UID_NOBODY)
2879                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2880         else
2881                 na->na_uid = vap->va_uid;
2882 
2883         if (vap->va_gid == (unsigned short)(-1))
2884                 na->na_gid = (uint32_t)-1;
2885         else if (vap->va_gid == GID_NOBODY)
2886                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2887         else
2888                 na->na_gid = vap->va_gid;
2889 
2890         /*
2891          * Do we need to check fsid for overflow?  It is 64-bit in the
2892          * vattr, but are bigger than 32 bit values supported?
2893          */
2894         na->na_fsid = vap->va_fsid;
2895 
2896         na->na_nodeid = vap->va_nodeid;
2897 
2898         /*
2899          * Check to make sure that the nodeid is representable over the
2900          * wire without losing bits.
2901          */
2902         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2903                 return (EFBIG);
2904         na->na_nlink = vap->va_nlink;
2905 
2906         /*
2907          * Check for big files here, instead of at the caller.  See
2908          * comments in cstat for large special file explanation.
2909          */
2910         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2911                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2912                         return (EFBIG);
2913                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2914                         /* UNKNOWN_SIZE | OVERFLOW */
2915                         na->na_size = MAXOFF32_T;
2916                 } else
2917                         na->na_size = vap->va_size;
2918         } else
2919                 na->na_size = vap->va_size;
2920 
2921         /*
2922          * If the vnode times overflow the 32-bit times that NFS2
2923          * uses on the wire then return an error.
2924          */
2925         if (!NFS_VAP_TIME_OK(vap)) {
2926                 return (EOVERFLOW);
2927         }
2928         na->na_atime.tv_sec = vap->va_atime.tv_sec;
2929         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2930 
2931         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2932         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2933 
2934         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2935         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2936 
2937         /*
2938          * If the dev_t will fit into 16 bits then compress
2939          * it, otherwise leave it alone. See comments in
2940          * nfs_client.c.
2941          */
2942         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2943             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2944                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2945         else
2946                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2947 
2948         na->na_blocks = vap->va_nblocks;
2949         na->na_blocksize = vap->va_blksize;
2950 
2951         /*
2952          * This bit of ugliness is a *TEMPORARY* hack to preserve the
2953          * over-the-wire protocols for named-pipe vnodes.  It remaps the
2954          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2955          *
2956          * BUYER BEWARE:
2957          *  If you are porting the NFS to a non-Sun server, you probably
2958          *  don't want to include the following block of code.  The
2959          *  over-the-wire special file types will be changing with the
2960          *  NFS Protocol Revision.
2961          */
2962         if (vap->va_type == VFIFO)
2963                 NA_SETFIFO(na);
2964         return (0);
2965 }
2966 
2967 /*
2968  * acl v2 support: returns approximate permission.
2969  *      default: returns minimal permission (more restrictive)
2970  *      aclok: returns maximal permission (less restrictive)
2971  *      This routine changes the permissions that are alaredy in *va.
2972  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2973  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
2974  */
2975 static void
2976 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2977 {
2978         vsecattr_t      vsa;
2979         int             aclcnt;
2980         aclent_t        *aclentp;
2981         mode_t          mask_perm;
2982         mode_t          grp_perm;
2983         mode_t          other_perm;
2984         mode_t          other_orig;
2985         int             error;
2986 
2987         /* dont care default acl */
2988         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2989         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2990 
2991         if (!error) {
2992                 aclcnt = vsa.vsa_aclcnt;
2993                 if (aclcnt > MIN_ACL_ENTRIES) {
2994                         /* non-trivial ACL */
2995                         aclentp = vsa.vsa_aclentp;
2996                         if (exi->exi_export.ex_flags & EX_ACLOK) {
2997                                 /* maximal permissions */
2998                                 grp_perm = 0;
2999                                 other_perm = 0;
3000                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3001                                         switch (aclentp->a_type) {
3002                                         case USER_OBJ:
3003                                                 break;
3004                                         case USER:
3005                                                 grp_perm |=
3006                                                     aclentp->a_perm << 3;
3007                                                 other_perm |= aclentp->a_perm;
3008                                                 break;
3009                                         case GROUP_OBJ:
3010                                                 grp_perm |=
3011                                                     aclentp->a_perm << 3;
3012                                                 break;
3013                                         case GROUP:
3014                                                 other_perm |= aclentp->a_perm;
3015                                                 break;
3016                                         case OTHER_OBJ:
3017                                                 other_orig = aclentp->a_perm;
3018                                                 break;
3019                                         case CLASS_OBJ:
3020                                                 mask_perm = aclentp->a_perm;
3021                                                 break;
3022                                         default:
3023                                                 break;
3024                                         }
3025                                 }
3026                                 grp_perm &= mask_perm << 3;
3027                                 other_perm &= mask_perm;
3028                                 other_perm |= other_orig;
3029 
3030                         } else {
3031                                 /* minimal permissions */
3032                                 grp_perm = 070;
3033                                 other_perm = 07;
3034                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3035                                         switch (aclentp->a_type) {
3036                                         case USER_OBJ:
3037                                                 break;
3038                                         case USER:
3039                                         case CLASS_OBJ:
3040                                                 grp_perm &=
3041                                                     aclentp->a_perm << 3;
3042                                                 other_perm &=
3043                                                     aclentp->a_perm;
3044                                                 break;
3045                                         case GROUP_OBJ:
3046                                                 grp_perm &=
3047                                                     aclentp->a_perm << 3;
3048                                                 break;
3049                                         case GROUP:
3050                                                 other_perm &=
3051                                                     aclentp->a_perm;
3052                                                 break;
3053                                         case OTHER_OBJ:
3054                                                 other_perm &=
3055                                                     aclentp->a_perm;
3056                                                 break;
3057                                         default:
3058                                                 break;
3059                                         }
3060                                 }
3061                         }
3062                         /* copy to va */
3063                         va->va_mode &= ~077;
3064                         va->va_mode |= grp_perm | other_perm;
3065                 }
3066                 if (vsa.vsa_aclcnt)
3067                         kmem_free(vsa.vsa_aclentp,
3068                             vsa.vsa_aclcnt * sizeof (aclent_t));
3069         }
3070 }
3071 
3072 void
3073 rfs_srvrinit(void)
3074 {
3075         mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3076         nfs2_srv_caller_id = fs_new_caller_id();
3077 }
3078 
/*
 * Tear down the NFS v2 server module state set up by rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
3084 
3085 static int
3086 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3087 {
3088         struct clist    *wcl;
3089         int             wlist_len;
3090         uint32_t        count = rr->rr_count;
3091 
3092         wcl = ra->ra_wlist;
3093 
3094         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3095                 return (FALSE);
3096         }
3097 
3098         wcl = ra->ra_wlist;
3099         rr->rr_ok.rrok_wlist_len = wlist_len;
3100         rr->rr_ok.rrok_wlist = wcl;
3101 
3102         return (TRUE);
3103 }