1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  28  *      All rights reserved.
  29  */
  30 
  31 /*
  32  * Copyright 2018 Nexenta Systems, Inc.
  33  * Copyright (c) 2016 by Delphix. All rights reserved.
  34  */
  35 
  36 #include <sys/param.h>
  37 #include <sys/types.h>
  38 #include <sys/systm.h>
  39 #include <sys/cred.h>
  40 #include <sys/buf.h>
  41 #include <sys/vfs.h>
  42 #include <sys/vnode.h>
  43 #include <sys/uio.h>
  44 #include <sys/stat.h>
  45 #include <sys/errno.h>
  46 #include <sys/sysmacros.h>
  47 #include <sys/statvfs.h>
  48 #include <sys/kmem.h>
  49 #include <sys/kstat.h>
  50 #include <sys/dirent.h>
  51 #include <sys/cmn_err.h>
  52 #include <sys/debug.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/mode.h>
  55 #include <sys/acl.h>
  56 #include <sys/nbmlock.h>
  57 #include <sys/policy.h>
  58 #include <sys/sdt.h>
  59 
  60 #include <rpc/types.h>
  61 #include <rpc/auth.h>
  62 #include <rpc/svc.h>
  63 
  64 #include <nfs/nfs.h>
  65 #include <nfs/export.h>
  66 #include <nfs/nfs_cmd.h>
  67 
  68 #include <vm/hat.h>
  69 #include <vm/as.h>
  70 #include <vm/seg.h>
  71 #include <vm/seg_map.h>
  72 #include <vm/seg_kmem.h>
  73 
  74 #include <sys/strsubr.h>
  75 
struct rfs_async_write_list;

/*
 * Zone globals of NFSv2 server
 */
typedef struct nfs_srv {
	/* Protects async_write_head. */
	kmutex_t			async_write_lock;
	/* List of pending async (clustered) write requests. */
	struct rfs_async_write_list	*async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int		write_async;
} nfs_srv_t;
  90 
  91 /*
  92  * These are the interface routines for the server side of the
  93  * Network File System.  See the NFS version 2 protocol specification
  94  * for a description of this interface.
  95  */
  96 
/* Map an over-the-wire NFSv2 sattr onto a vattr. */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
/*
 * Post-process attributes before they are returned to the client;
 * called after every getattr in this file (defined later).
 */
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);
/*
 * Zone create/destroy callbacks registered on rfs_zone_key —
 * presumably manage the per-zone nfs_srv_t state above; see their
 * definitions later in this file.
 */
static void	*rfs_zone_init(zoneid_t zoneid);
static void	rfs_zone_fini(zoneid_t zoneid, void *data);
 102 
 103 
 104 /*
 105  * Some "over the wire" UNIX file types.  These are encoded
 106  * into the mode.  This needs to be fixed in the next rev.
 107  */
 108 #define IFMT            0170000         /* type of file */
 109 #define IFCHR           0020000         /* character special */
 110 #define IFBLK           0060000         /* block special */
 111 #define IFSOCK          0140000         /* socket */
 112 
 113 u_longlong_t nfs2_srv_caller_id;
 114 static zone_key_t rfs_zone_key;
 115 
 116 /*
 117  * Get file attributes.
 118  * Returns the current attributes of the file with the given fhandle.
 119  */
 120 /* ARGSUSED */
 121 void
 122 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 123     struct svc_req *req, cred_t *cr, bool_t ro)
 124 {
 125         int error;
 126         vnode_t *vp;
 127         struct vattr va;
 128 
 129         vp = nfs_fhtovp(fhp, exi);
 130         if (vp == NULL) {
 131                 ns->ns_status = NFSERR_STALE;
 132                 return;
 133         }
 134 
 135         /*
 136          * Do the getattr.
 137          */
 138         va.va_mask = AT_ALL;    /* we want all the attributes */
 139 
 140         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 141 
 142         /* check for overflows */
 143         if (!error) {
 144                 /* Lie about the object type for a referral */
 145                 if (vn_is_nfs_reparse(vp, cr))
 146                         va.va_type = VLNK;
 147 
 148                 acl_perm(vp, exi, &va, cr);
 149                 error = vattr_to_nattr(&va, &ns->ns_attr);
 150         }
 151 
 152         VN_RELE(vp);
 153 
 154         ns->ns_status = puterrno(error);
 155 }
/* Return a pointer to the filehandle embedded in the GETATTR args. */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	/* The argument itself is the filehandle. */
	return (fhp);
}
 161 
 162 /*
 163  * Set file attributes.
 164  * Sets the attributes of the file with the given fhandle.  Returns
 165  * the new attributes.
 166  */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* VOP_SETATTR flag: 0 or ATTR_UTIME */
	int in_crit = 0;	/* nonzero while in the nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;	/* range argument for VOP_SPACE */
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * CC_DONTBLOCK asks the VOPs below to fail with EAGAIN (and set
	 * CC_WOULDBLOCK) instead of blocking on a delegation conflict.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/* The affected range lies between old and new EOF. */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner takes the VOP_SPACE shortcut; anyone
		 * else keeps AT_SIZE set and gets VOP_SETATTR's normal
		 * access checking below.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;	/* size handled right here */
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;		/* to end of file */
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* On success, fetch the resulting attributes for the reply. */
	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/* Clear CC_DONTBLOCK so the fsync below may block if it must. */
	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
/* Return a pointer to the filehandle embedded in the SETATTR args. */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
 354 
/*
 * Change and release @exip and @vpp only in success.
 *
 * Cross the mount point at *vpp: if the filesystem mounted there is
 * exported with "nohide", replace *vpp/*exip with the vnode and
 * exportinfo on the far side of the mount, releasing the old holds.
 * Returns 0 both on a successful crossing and when the submount is
 * simply not exported nohide (then *vpp/*exip are left untouched);
 * nonzero only on a real error from traverse() or VOP_FID().
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/*
	 * Take a private hold: traverse() replaces vp with the vnode on
	 * the far side of the mount, consuming this reference.
	 */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	/* Get the far-side file's fid so we can look up its export. */
	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(&exi);
		VN_RELE(vp);
	} else {
		/* go to submount: swap in new exi/vp, drop old holds */
		exi_rele(exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
 400 
 401 /*
 402  * Given mounted "dvp" and "exi", go upper mountpoint
 403  * with dvp/exi correction
 404  * Return 0 in success
 405  */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;

	/* Caller guarantees dvp is the root of a mounted filesystem. */
	ASSERT(dvp->v_flag & VROOT);

	VN_HOLD(dvp);
	/* Step down to the covered vnode underneath the mount. */
	dvp = untraverse(dvp);
	/* Find the export the underlying directory belongs to. */
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	/* Success: swap in the new exportinfo/vnode, drop the old holds. */
	exi_rele(exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
 429 /*
 430  * Directory lookup.
 431  * Returns an fhandle and file attributes for file name in a directory.
 432  */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;		/* directory being searched */
	vnode_t *vp;		/* lookup result */
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;	/* caller's address, for name conversion */

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/*
	 * Take our own hold on exi: the code below may replace it
	 * (public-filehandle lookup, mount-point crossing), and the
	 * out: path releases it unconditionally.
	 * NOTE(review): this assumes exi is non-NULL here — presumably
	 * nfs_fhtovp(fhp, NULL) above cannot return a vnode; confirm.
	 */
	exi_hold(exi);

	/*
	 * Not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    (dvp->v_flag & VROOT)) {
			/*
			 * special case for ".." and 'nohide'exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else  {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	/* Run the name through the export's inbound conversion hook. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		/* mclookup replaces exi; release our hold first. */
		exi_rele(&exi);

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	/* A converted name is a separate allocation; free it. */
	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	/*
	 * If the result is a mount point, cross into the mounted
	 * filesystem when it is exported with "nohide".
	 */
	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	if (exi != NULL)
		exi_rele(&exi);

	/*
	 * NOTE(review): error may hold either an errno or an NFSERR_*
	 * code (NFSERR_ACCES/NFSERR_NOENT assigned above); this relies
	 * on the v2 NFSERR_* values coinciding with the corresponding
	 * errnos so puterrno() maps both — confirm.
	 */

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
/* Return a pointer to the directory filehandle in the LOOKUP args. */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
 593 
 594 /*
 595  * Read symbolic link.
 596  * Returns the string in the symbolic link at the given fhandle.
 597  */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;	/* caller's address, for name conversion */
	char *name = NULL;
	int is_referral = 0;	/* nonzero if vp is an NFS referral point */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	/* Only the mode is needed, for the mandatory-locking check. */
	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse access when mandatory locking is in effect. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		/* Bump the v2 server's referral-link statistic. */
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link text fills the entire buffer
		 * (uio_resid == 0, so rl_count == NFS_MAXPATHLEN), this
		 * store lands one byte past the NFS_MAXPATHLEN
		 * allocation — confirm VOP_READLINK can never return
		 * that much here.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Run the link text through the export's outbound conversion. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	/*
	 * NOTE(review): a converted name replaces rl_data but is later
	 * freed by rfs_rlfree() with size NFS_MAXPATHLEN; this assumes
	 * NFS_MAXPATHLEN == MAXPATHLEN — confirm.
	 */
	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
/* Return a pointer to the filehandle in the READLINK args. */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
 729 /*
 730  * Free data allocated by rfs_readlink
 731  */
 732 void
 733 rfs_rlfree(struct nfsrdlnres *rl)
 734 {
 735         if (rl->rl_data != NULL)
 736                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 737 }
 738 
 739 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 740 
 741 /*
 742  * Read data.
 743  * Returns some data read from the file at the given fhandle.
 744  */
 745 /* ARGSUSED */
 746 void
 747 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 748     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 749 {
 750         vnode_t *vp;
 751         int error;
 752         struct vattr va;
 753         struct iovec iov;
 754         struct uio uio;
 755         mblk_t *mp;
 756         int alloc_err = 0;
 757         int in_crit = 0;
 758         caller_context_t ct;
 759 
 760         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 761         if (vp == NULL) {
 762                 rr->rr_data = NULL;
 763                 rr->rr_status = NFSERR_STALE;
 764                 return;
 765         }
 766 
 767         if (vp->v_type != VREG) {
 768                 VN_RELE(vp);
 769                 rr->rr_data = NULL;
 770                 rr->rr_status = NFSERR_ISDIR;
 771                 return;
 772         }
 773 
 774         ct.cc_sysid = 0;
 775         ct.cc_pid = 0;
 776         ct.cc_caller_id = nfs2_srv_caller_id;
 777         ct.cc_flags = CC_DONTBLOCK;
 778 
 779         /*
 780          * Enter the critical region before calling VOP_RWLOCK
 781          * to avoid a deadlock with write requests.
 782          */
 783         if (nbl_need_check(vp)) {
 784                 nbl_start_crit(vp, RW_READER);
 785                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 786                     0, NULL)) {
 787                         nbl_end_crit(vp);
 788                         VN_RELE(vp);
 789                         rr->rr_data = NULL;
 790                         rr->rr_status = NFSERR_ACCES;
 791                         return;
 792                 }
 793                 in_crit = 1;
 794         }
 795 
 796         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 797 
 798         /* check if a monitor detected a delegation conflict */
 799         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 800                 if (in_crit)
 801                         nbl_end_crit(vp);
 802                 VN_RELE(vp);
 803                 /* mark as wouldblock so response is dropped */
 804                 curthread->t_flag |= T_WOULDBLOCK;
 805 
 806                 rr->rr_data = NULL;
 807                 return;
 808         }
 809 
 810         va.va_mask = AT_ALL;
 811 
 812         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 813 
 814         if (error) {
 815                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 816                 if (in_crit)
 817                         nbl_end_crit(vp);
 818 
 819                 VN_RELE(vp);
 820                 rr->rr_data = NULL;
 821                 rr->rr_status = puterrno(error);
 822 
 823                 return;
 824         }
 825 
 826         /*
 827          * This is a kludge to allow reading of files created
 828          * with no read permission.  The owner of the file
 829          * is always allowed to read it.
 830          */
 831         if (crgetuid(cr) != va.va_uid) {
 832                 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 833 
 834                 if (error) {
 835                         /*
 836                          * Exec is the same as read over the net because
 837                          * of demand loading.
 838                          */
 839                         error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 840                 }
 841                 if (error) {
 842                         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 843                         if (in_crit)
 844                                 nbl_end_crit(vp);
 845                         VN_RELE(vp);
 846                         rr->rr_data = NULL;
 847                         rr->rr_status = puterrno(error);
 848 
 849                         return;
 850                 }
 851         }
 852 
 853         if (MANDLOCK(vp, va.va_mode)) {
 854                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 855                 if (in_crit)
 856                         nbl_end_crit(vp);
 857 
 858                 VN_RELE(vp);
 859                 rr->rr_data = NULL;
 860                 rr->rr_status = NFSERR_ACCES;
 861 
 862                 return;
 863         }
 864 
 865         rr->rr_ok.rrok_wlist_len = 0;
 866         rr->rr_ok.rrok_wlist = NULL;
 867 
 868         if ((u_offset_t)ra->ra_offset >= va.va_size) {
 869                 rr->rr_count = 0;
 870                 rr->rr_data = NULL;
 871                 /*
 872                  * In this case, status is NFS_OK, but there is no data
 873                  * to encode. So set rr_mp to NULL.
 874                  */
 875                 rr->rr_mp = NULL;
 876                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 877                 if (rr->rr_ok.rrok_wlist)
 878                         clist_zero_len(rr->rr_ok.rrok_wlist);
 879                 goto done;
 880         }
 881 
 882         if (ra->ra_wlist) {
 883                 mp = NULL;
 884                 rr->rr_mp = NULL;
 885                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 886                 if (ra->ra_count > iov.iov_len) {
 887                         rr->rr_data = NULL;
 888                         rr->rr_status = NFSERR_INVAL;
 889                         goto done;
 890                 }
 891         } else {
 892                 /*
 893                  * mp will contain the data to be sent out in the read reply.
 894                  * This will be freed after the reply has been sent out (by the
 895                  * driver).
 896                  * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
 897                  * that the call to xdrmblk_putmblk() never fails.
 898                  */
 899                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 900                     &alloc_err);
 901                 ASSERT(mp != NULL);
 902                 ASSERT(alloc_err == 0);
 903 
 904                 rr->rr_mp = mp;
 905 
 906                 /*
 907                  * Set up io vector
 908                  */
 909                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 910                 iov.iov_len = ra->ra_count;
 911         }
 912 
 913         uio.uio_iov = &iov;
 914         uio.uio_iovcnt = 1;
 915         uio.uio_segflg = UIO_SYSSPACE;
 916         uio.uio_extflg = UIO_COPY_CACHED;
 917         uio.uio_loffset = (offset_t)ra->ra_offset;
 918         uio.uio_resid = ra->ra_count;
 919 
 920         error = VOP_READ(vp, &uio, 0, cr, &ct);
 921 
 922         if (error) {
 923                 if (mp)
 924                         freeb(mp);
 925 
 926                 /*
 927                  * check if a monitor detected a delegation conflict and
 928                  * mark as wouldblock so response is dropped
 929                  */
 930                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 931                         curthread->t_flag |= T_WOULDBLOCK;
 932                 else
 933                         rr->rr_status = puterrno(error);
 934 
 935                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 936                 if (in_crit)
 937                         nbl_end_crit(vp);
 938 
 939                 VN_RELE(vp);
 940                 rr->rr_data = NULL;
 941 
 942                 return;
 943         }
 944 
 945         /*
 946          * Get attributes again so we can send the latest access
 947          * time to the client side for its cache.
 948          */
 949         va.va_mask = AT_ALL;
 950 
 951         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 952 
 953         if (error) {
 954                 if (mp)
 955                         freeb(mp);
 956 
 957                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 958                 if (in_crit)
 959                         nbl_end_crit(vp);
 960 
 961                 VN_RELE(vp);
 962                 rr->rr_data = NULL;
 963                 rr->rr_status = puterrno(error);
 964 
 965                 return;
 966         }
 967 
 968         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 969 
 970         if (mp) {
 971                 rr->rr_data = (char *)mp->b_datap->db_base;
 972         } else {
 973                 if (ra->ra_wlist) {
 974                         rr->rr_data = (caddr_t)iov.iov_base;
 975                         if (!rdma_setup_read_data2(ra, rr)) {
 976                                 rr->rr_data = NULL;
 977                                 rr->rr_status = puterrno(NFSERR_INVAL);
 978                         }
 979                 }
 980         }
 981 done:
 982         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 983         if (in_crit)
 984                 nbl_end_crit(vp);
 985 
 986         acl_perm(vp, exi, &va, cr);
 987 
 988         /* check for overflows */
 989         error = vattr_to_nattr(&va, &rr->rr_attr);
 990 
 991         VN_RELE(vp);
 992 
 993         rr->rr_status = puterrno(error);
 994 }
 995 
 996 /*
 997  * Free data allocated by rfs_read
 998  */
 999 void
1000 rfs_rdfree(struct nfsrdresult *rr)
1001 {
1002         mblk_t *mp;
1003 
1004         if (rr->rr_status == NFS_OK) {
1005                 mp = rr->rr_mp;
1006                 if (mp != NULL)
1007                         freeb(mp);
1008         }
1009 }
1010 
1011 void *
1012 rfs_read_getfh(struct nfsreadargs *ra)
1013 {
1014         return (&ra->ra_fhandle);
1015 }
1016 
/*
 * Maximum number of iovecs kept on the stack in rfs_write_sync();
 * larger mblk chains fall back to kmem_alloc().
 */
#define MAX_IOVECS      12

#ifdef DEBUG
static int rfs_write_sync_hits = 0;	/* requests that fit in iov[] */
static int rfs_write_sync_misses = 0;	/* requests needing kmem_alloc */
#endif
1023 
1024 /*
1025  * Write data to file.
1026  * Returns attributes of a file after writing some data to it.
1027  *
1028  * Any changes made here, especially in error handling might have
1029  * to also be done in rfs_write (which clusters write requests).
1030  */
1031 /* ARGSUSED */
1032 void
1033 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1034     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1035 {
1036         int error;
1037         vnode_t *vp;
1038         rlim64_t rlimit;
1039         struct vattr va;
1040         struct uio uio;
1041         struct iovec iov[MAX_IOVECS];
1042         mblk_t *m;
1043         struct iovec *iovp;
1044         int iovcnt;
1045         cred_t *savecred;
1046         int in_crit = 0;
1047         caller_context_t ct;
1048 
1049         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1050         if (vp == NULL) {
1051                 ns->ns_status = NFSERR_STALE;
1052                 return;
1053         }
1054 
1055         if (rdonly(ro, vp)) {
1056                 VN_RELE(vp);
1057                 ns->ns_status = NFSERR_ROFS;
1058                 return;
1059         }
1060 
1061         if (vp->v_type != VREG) {
1062                 VN_RELE(vp);
1063                 ns->ns_status = NFSERR_ISDIR;
1064                 return;
1065         }
1066 
1067         ct.cc_sysid = 0;
1068         ct.cc_pid = 0;
1069         ct.cc_caller_id = nfs2_srv_caller_id;
1070         ct.cc_flags = CC_DONTBLOCK;
1071 
1072         va.va_mask = AT_UID|AT_MODE;
1073 
1074         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1075 
1076         if (error) {
1077                 VN_RELE(vp);
1078                 ns->ns_status = puterrno(error);
1079 
1080                 return;
1081         }
1082 
1083         if (crgetuid(cr) != va.va_uid) {
1084                 /*
1085                  * This is a kludge to allow writes of files created
1086                  * with read only permission.  The owner of the file
1087                  * is always allowed to write it.
1088                  */
1089                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1090 
1091                 if (error) {
1092                         VN_RELE(vp);
1093                         ns->ns_status = puterrno(error);
1094                         return;
1095                 }
1096         }
1097 
1098         /*
1099          * Can't access a mandatory lock file.  This might cause
1100          * the NFS service thread to block forever waiting for a
1101          * lock to be released that will never be released.
1102          */
1103         if (MANDLOCK(vp, va.va_mode)) {
1104                 VN_RELE(vp);
1105                 ns->ns_status = NFSERR_ACCES;
1106                 return;
1107         }
1108 
1109         /*
1110          * We have to enter the critical region before calling VOP_RWLOCK
1111          * to avoid a deadlock with ufs.
1112          */
1113         if (nbl_need_check(vp)) {
1114                 nbl_start_crit(vp, RW_READER);
1115                 in_crit = 1;
1116                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1117                     wa->wa_count, 0, NULL)) {
1118                         error = EACCES;
1119                         goto out;
1120                 }
1121         }
1122 
1123         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1124 
1125         /* check if a monitor detected a delegation conflict */
1126         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1127                 goto out;
1128         }
1129 
1130         if (wa->wa_data || wa->wa_rlist) {
1131                 /* Do the RDMA thing if necessary */
1132                 if (wa->wa_rlist) {
1133                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1134                         iov[0].iov_len = wa->wa_count;
1135                 } else  {
1136                         iov[0].iov_base = wa->wa_data;
1137                         iov[0].iov_len = wa->wa_count;
1138                 }
1139                 uio.uio_iov = iov;
1140                 uio.uio_iovcnt = 1;
1141                 uio.uio_segflg = UIO_SYSSPACE;
1142                 uio.uio_extflg = UIO_COPY_DEFAULT;
1143                 uio.uio_loffset = (offset_t)wa->wa_offset;
1144                 uio.uio_resid = wa->wa_count;
1145                 /*
1146                  * The limit is checked on the client. We
1147                  * should allow any size writes here.
1148                  */
1149                 uio.uio_llimit = curproc->p_fsz_ctl;
1150                 rlimit = uio.uio_llimit - wa->wa_offset;
1151                 if (rlimit < (rlim64_t)uio.uio_resid)
1152                         uio.uio_resid = (uint_t)rlimit;
1153 
1154                 /*
1155                  * for now we assume no append mode
1156                  */
1157                 /*
1158                  * We're changing creds because VM may fault and we need
1159                  * the cred of the current thread to be used if quota
1160                  * checking is enabled.
1161                  */
1162                 savecred = curthread->t_cred;
1163                 curthread->t_cred = cr;
1164                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1165                 curthread->t_cred = savecred;
1166         } else {
1167 
1168                 iovcnt = 0;
1169                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1170                         iovcnt++;
1171                 if (iovcnt <= MAX_IOVECS) {
1172 #ifdef DEBUG
1173                         rfs_write_sync_hits++;
1174 #endif
1175                         iovp = iov;
1176                 } else {
1177 #ifdef DEBUG
1178                         rfs_write_sync_misses++;
1179 #endif
1180                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1181                 }
1182                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1183                 uio.uio_iov = iovp;
1184                 uio.uio_iovcnt = iovcnt;
1185                 uio.uio_segflg = UIO_SYSSPACE;
1186                 uio.uio_extflg = UIO_COPY_DEFAULT;
1187                 uio.uio_loffset = (offset_t)wa->wa_offset;
1188                 uio.uio_resid = wa->wa_count;
1189                 /*
1190                  * The limit is checked on the client. We
1191                  * should allow any size writes here.
1192                  */
1193                 uio.uio_llimit = curproc->p_fsz_ctl;
1194                 rlimit = uio.uio_llimit - wa->wa_offset;
1195                 if (rlimit < (rlim64_t)uio.uio_resid)
1196                         uio.uio_resid = (uint_t)rlimit;
1197 
1198                 /*
1199                  * For now we assume no append mode.
1200                  */
1201                 /*
1202                  * We're changing creds because VM may fault and we need
1203                  * the cred of the current thread to be used if quota
1204                  * checking is enabled.
1205                  */
1206                 savecred = curthread->t_cred;
1207                 curthread->t_cred = cr;
1208                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1209                 curthread->t_cred = savecred;
1210 
1211                 if (iovp != iov)
1212                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1213         }
1214 
1215         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1216 
1217         if (!error) {
1218                 /*
1219                  * Get attributes again so we send the latest mod
1220                  * time to the client side for its cache.
1221                  */
1222                 va.va_mask = AT_ALL;    /* now we want everything */
1223 
1224                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1225 
1226                 /* check for overflows */
1227                 if (!error) {
1228                         acl_perm(vp, exi, &va, cr);
1229                         error = vattr_to_nattr(&va, &ns->ns_attr);
1230                 }
1231         }
1232 
1233 out:
1234         if (in_crit)
1235                 nbl_end_crit(vp);
1236         VN_RELE(vp);
1237 
1238         /* check if a monitor detected a delegation conflict */
1239         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1240                 /* mark as wouldblock so response is dropped */
1241                 curthread->t_flag |= T_WOULDBLOCK;
1242         else
1243                 ns->ns_status = puterrno(error);
1244 
1245 }
1246 
/*
 * One queued WRITE request in a write cluster.  Each service thread
 * that joins a cluster contributes one of these (stack-allocated in
 * rfs_write) and sleeps until the cluster owner fills in its status.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* client's WRITE arguments */
	struct nfsattrstat *ns;		/* reply to be filled in */
	struct svc_req *req;		/* RPC request handle */
	cred_t *cr;			/* credentials of this requester */
	bool_t ro;			/* read-only export for this req */
	kthread_t *thread;		/* thread waiting on this request */
	struct rfs_async_write *list;	/* next request, ordered by offset */
};
1256 
/*
 * A write cluster: the set of pending WRITE requests for a single
 * file handle.  Waiters are woken via cv once their status is set.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle common to the cluster */
	kcondvar_t cv;			/* signalled when statuses are set */
	struct rfs_async_write *list;	/* requests, sorted by offset */
	struct rfs_async_write_list *next;	/* next cluster */
};
1263 
/*
 * NOTE(review): rfs_write() below operates on the per-zone state in
 * nfs_srv_t (nsrv->async_write_head / async_write_lock / write_async);
 * these file-scope variables appear to be the pre-zone globals —
 * confirm no remaining references before removing them.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
volatile int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs kept on the stack per clustered VOP_WRITE. */
#define MAXCLIOVECS     42
/* "Status not yet set" sentinel; 0 would read as NFS_OK. */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;		/* clusters that fit in iov[] */
static int rfs_write_misses = 0;	/* clusters needing kmem_alloc */
#endif
1275 
1276 /*
1277  * Write data to file.
1278  * Returns attributes of a file after writing some data to it.
1279  */
1280 void
1281 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1282     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1283 {
1284         int error;
1285         vnode_t *vp;
1286         rlim64_t rlimit;
1287         struct vattr va;
1288         struct uio uio;
1289         struct rfs_async_write_list *lp;
1290         struct rfs_async_write_list *nlp;
1291         struct rfs_async_write *rp;
1292         struct rfs_async_write *nrp;
1293         struct rfs_async_write *trp;
1294         struct rfs_async_write *lrp;
1295         int data_written;
1296         int iovcnt;
1297         mblk_t *m;
1298         struct iovec *iovp;
1299         struct iovec *niovp;
1300         struct iovec iov[MAXCLIOVECS];
1301         int count;
1302         int rcount;
1303         uint_t off;
1304         uint_t len;
1305         struct rfs_async_write nrpsp;
1306         struct rfs_async_write_list nlpsp;
1307         ushort_t t_flag;
1308         cred_t *savecred;
1309         int in_crit = 0;
1310         caller_context_t ct;
1311         nfs_srv_t *nsrv;
1312 
1313         nsrv = zone_getspecific(rfs_zone_key, curzone);
1314         if (!nsrv->write_async) {
1315                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1316                 return;
1317         }
1318 
1319         /*
1320          * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1321          * is considered an OK.
1322          */
1323         ns->ns_status = RFSWRITE_INITVAL;
1324 
1325         nrp = &nrpsp;
1326         nrp->wa = wa;
1327         nrp->ns = ns;
1328         nrp->req = req;
1329         nrp->cr = cr;
1330         nrp->ro = ro;
1331         nrp->thread = curthread;
1332 
1333         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1334 
1335         /*
1336          * Look to see if there is already a cluster started
1337          * for this file.
1338          */
1339         mutex_enter(&nsrv->async_write_lock);
1340         for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1341                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1342                     sizeof (fhandle_t)) == 0)
1343                         break;
1344         }
1345 
1346         /*
1347          * If lp is non-NULL, then there is already a cluster
1348          * started.  We need to place ourselves in the cluster
1349          * list in the right place as determined by starting
1350          * offset.  Conflicts with non-blocking mandatory locked
1351          * regions will be checked when the cluster is processed.
1352          */
1353         if (lp != NULL) {
1354                 rp = lp->list;
1355                 trp = NULL;
1356                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1357                         trp = rp;
1358                         rp = rp->list;
1359                 }
1360                 nrp->list = rp;
1361                 if (trp == NULL)
1362                         lp->list = nrp;
1363                 else
1364                         trp->list = nrp;
1365                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1366                         cv_wait(&lp->cv, &nsrv->async_write_lock);
1367                 mutex_exit(&nsrv->async_write_lock);
1368 
1369                 return;
1370         }
1371 
1372         /*
1373          * No cluster started yet, start one and add ourselves
1374          * to the list of clusters.
1375          */
1376         nrp->list = NULL;
1377 
1378         nlp = &nlpsp;
1379         nlp->fhp = &wa->wa_fhandle;
1380         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1381         nlp->list = nrp;
1382         nlp->next = NULL;
1383 
1384         if (nsrv->async_write_head == NULL) {
1385                 nsrv->async_write_head = nlp;
1386         } else {
1387                 lp = nsrv->async_write_head;
1388                 while (lp->next != NULL)
1389                         lp = lp->next;
1390                 lp->next = nlp;
1391         }
1392         mutex_exit(&nsrv->async_write_lock);
1393 
1394         /*
1395          * Convert the file handle common to all of the requests
1396          * in this cluster to a vnode.
1397          */
1398         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1399         if (vp == NULL) {
1400                 mutex_enter(&nsrv->async_write_lock);
1401                 if (nsrv->async_write_head == nlp)
1402                         nsrv->async_write_head = nlp->next;
1403                 else {
1404                         lp = nsrv->async_write_head;
1405                         while (lp->next != nlp)
1406                                 lp = lp->next;
1407                         lp->next = nlp->next;
1408                 }
1409                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1410                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1411                         rp->ns->ns_status = NFSERR_STALE;
1412                         rp->thread->t_flag |= t_flag;
1413                 }
1414                 cv_broadcast(&nlp->cv);
1415                 mutex_exit(&nsrv->async_write_lock);
1416 
1417                 return;
1418         }
1419 
1420         /*
1421          * Can only write regular files.  Attempts to write any
1422          * other file types fail with EISDIR.
1423          */
1424         if (vp->v_type != VREG) {
1425                 VN_RELE(vp);
1426                 mutex_enter(&nsrv->async_write_lock);
1427                 if (nsrv->async_write_head == nlp)
1428                         nsrv->async_write_head = nlp->next;
1429                 else {
1430                         lp = nsrv->async_write_head;
1431                         while (lp->next != nlp)
1432                                 lp = lp->next;
1433                         lp->next = nlp->next;
1434                 }
1435                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1436                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1437                         rp->ns->ns_status = NFSERR_ISDIR;
1438                         rp->thread->t_flag |= t_flag;
1439                 }
1440                 cv_broadcast(&nlp->cv);
1441                 mutex_exit(&nsrv->async_write_lock);
1442 
1443                 return;
1444         }
1445 
1446         /*
1447          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1448          * deadlock with ufs.
1449          */
1450         if (nbl_need_check(vp)) {
1451                 nbl_start_crit(vp, RW_READER);
1452                 in_crit = 1;
1453         }
1454 
1455         ct.cc_sysid = 0;
1456         ct.cc_pid = 0;
1457         ct.cc_caller_id = nfs2_srv_caller_id;
1458         ct.cc_flags = CC_DONTBLOCK;
1459 
1460         /*
1461          * Lock the file for writing.  This operation provides
1462          * the delay which allows clusters to grow.
1463          */
1464         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1465 
1466         /* check if a monitor detected a delegation conflict */
1467         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1468                 if (in_crit)
1469                         nbl_end_crit(vp);
1470                 VN_RELE(vp);
1471                 /* mark as wouldblock so response is dropped */
1472                 curthread->t_flag |= T_WOULDBLOCK;
1473                 mutex_enter(&nsrv->async_write_lock);
1474                 if (nsrv->async_write_head == nlp)
1475                         nsrv->async_write_head = nlp->next;
1476                 else {
1477                         lp = nsrv->async_write_head;
1478                         while (lp->next != nlp)
1479                                 lp = lp->next;
1480                         lp->next = nlp->next;
1481                 }
1482                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1483                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1484                                 rp->ns->ns_status = puterrno(error);
1485                                 rp->thread->t_flag |= T_WOULDBLOCK;
1486                         }
1487                 }
1488                 cv_broadcast(&nlp->cv);
1489                 mutex_exit(&nsrv->async_write_lock);
1490 
1491                 return;
1492         }
1493 
1494         /*
1495          * Disconnect this cluster from the list of clusters.
1496          * The cluster that is being dealt with must be fixed
1497          * in size after this point, so there is no reason
1498          * to leave it on the list so that new requests can
1499          * find it.
1500          *
1501          * The algorithm is that the first write request will
1502          * create a cluster, convert the file handle to a
1503          * vnode pointer, and then lock the file for writing.
1504          * This request is not likely to be clustered with
1505          * any others.  However, the next request will create
1506          * a new cluster and be blocked in VOP_RWLOCK while
1507          * the first request is being processed.  This delay
1508          * will allow more requests to be clustered in this
1509          * second cluster.
1510          */
1511         mutex_enter(&nsrv->async_write_lock);
1512         if (nsrv->async_write_head == nlp)
1513                 nsrv->async_write_head = nlp->next;
1514         else {
1515                 lp = nsrv->async_write_head;
1516                 while (lp->next != nlp)
1517                         lp = lp->next;
1518                 lp->next = nlp->next;
1519         }
1520         mutex_exit(&nsrv->async_write_lock);
1521 
1522         /*
1523          * Step through the list of requests in this cluster.
1524          * We need to check permissions to make sure that all
1525          * of the requests have sufficient permission to write
1526          * the file.  A cluster can be composed of requests
1527          * from different clients and different users on each
1528          * client.
1529          *
1530          * As a side effect, we also calculate the size of the
1531          * byte range that this cluster encompasses.
1532          */
1533         rp = nlp->list;
1534         off = rp->wa->wa_offset;
1535         len = (uint_t)0;
1536         do {
1537                 if (rdonly(rp->ro, vp)) {
1538                         rp->ns->ns_status = NFSERR_ROFS;
1539                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1540                         rp->thread->t_flag |= t_flag;
1541                         continue;
1542                 }
1543 
1544                 va.va_mask = AT_UID|AT_MODE;
1545 
1546                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1547 
1548                 if (!error) {
1549                         if (crgetuid(rp->cr) != va.va_uid) {
1550                                 /*
1551                                  * This is a kludge to allow writes of files
1552                                  * created with read only permission.  The
1553                                  * owner of the file is always allowed to
1554                                  * write it.
1555                                  */
1556                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1557                         }
1558                         if (!error && MANDLOCK(vp, va.va_mode))
1559                                 error = EACCES;
1560                 }
1561 
1562                 /*
1563                  * Check for a conflict with a nbmand-locked region.
1564                  */
1565                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1566                     rp->wa->wa_count, 0, NULL)) {
1567                         error = EACCES;
1568                 }
1569 
1570                 if (error) {
1571                         rp->ns->ns_status = puterrno(error);
1572                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1573                         rp->thread->t_flag |= t_flag;
1574                         continue;
1575                 }
1576                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1577                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1578         } while ((rp = rp->list) != NULL);
1579 
1580         /*
1581          * Step through the cluster attempting to gather as many
1582          * requests which are contiguous as possible.  These
1583          * contiguous requests are handled via one call to VOP_WRITE
1584          * instead of different calls to VOP_WRITE.  We also keep
1585          * track of the fact that any data was written.
1586          */
1587         rp = nlp->list;
1588         data_written = 0;
1589         do {
1590                 /*
1591                  * Skip any requests which are already marked as having an
1592                  * error.
1593                  */
1594                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1595                         rp = rp->list;
1596                         continue;
1597                 }
1598 
1599                 /*
1600                  * Count the number of iovec's which are required
1601                  * to handle this set of requests.  One iovec is
1602                  * needed for each data buffer, whether addressed
1603                  * by wa_data or by the b_rptr pointers in the
1604                  * mblk chains.
1605                  */
1606                 iovcnt = 0;
1607                 lrp = rp;
1608                 for (;;) {
1609                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1610                                 iovcnt++;
1611                         else {
1612                                 m = lrp->wa->wa_mblk;
1613                                 while (m != NULL) {
1614                                         iovcnt++;
1615                                         m = m->b_cont;
1616                                 }
1617                         }
1618                         if (lrp->list == NULL ||
1619                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1620                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1621                             lrp->list->wa->wa_offset) {
1622                                 lrp = lrp->list;
1623                                 break;
1624                         }
1625                         lrp = lrp->list;
1626                 }
1627 
1628                 if (iovcnt <= MAXCLIOVECS) {
1629 #ifdef DEBUG
1630                         rfs_write_hits++;
1631 #endif
1632                         niovp = iov;
1633                 } else {
1634 #ifdef DEBUG
1635                         rfs_write_misses++;
1636 #endif
1637                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1638                 }
1639                 /*
1640                  * Put together the scatter/gather iovecs.
1641                  */
1642                 iovp = niovp;
1643                 trp = rp;
1644                 count = 0;
1645                 do {
1646                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1647                                 if (trp->wa->wa_rlist) {
1648                                         iovp->iov_base =
1649                                             (char *)((trp->wa->wa_rlist)->
1650                                             u.c_daddr3);
1651                                         iovp->iov_len = trp->wa->wa_count;
1652                                 } else  {
1653                                         iovp->iov_base = trp->wa->wa_data;
1654                                         iovp->iov_len = trp->wa->wa_count;
1655                                 }
1656                                 iovp++;
1657                         } else {
1658                                 m = trp->wa->wa_mblk;
1659                                 rcount = trp->wa->wa_count;
1660                                 while (m != NULL) {
1661                                         iovp->iov_base = (caddr_t)m->b_rptr;
1662                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1663                                         rcount -= iovp->iov_len;
1664                                         if (rcount < 0)
1665                                                 iovp->iov_len += rcount;
1666                                         iovp++;
1667                                         if (rcount <= 0)
1668                                                 break;
1669                                         m = m->b_cont;
1670                                 }
1671                         }
1672                         count += trp->wa->wa_count;
1673                         trp = trp->list;
1674                 } while (trp != lrp);
1675 
1676                 uio.uio_iov = niovp;
1677                 uio.uio_iovcnt = iovcnt;
1678                 uio.uio_segflg = UIO_SYSSPACE;
1679                 uio.uio_extflg = UIO_COPY_DEFAULT;
1680                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1681                 uio.uio_resid = count;
1682                 /*
1683                  * The limit is checked on the client. We
1684                  * should allow any size writes here.
1685                  */
1686                 uio.uio_llimit = curproc->p_fsz_ctl;
1687                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1688                 if (rlimit < (rlim64_t)uio.uio_resid)
1689                         uio.uio_resid = (uint_t)rlimit;
1690 
1691                 /*
1692                  * For now we assume no append mode.
1693                  */
1694 
1695                 /*
1696                  * We're changing creds because VM may fault
1697                  * and we need the cred of the current
1698                  * thread to be used if quota * checking is
1699                  * enabled.
1700                  */
1701                 savecred = curthread->t_cred;
1702                 curthread->t_cred = cr;
1703                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1704                 curthread->t_cred = savecred;
1705 
1706                 /* check if a monitor detected a delegation conflict */
1707                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1708                         /* mark as wouldblock so response is dropped */
1709                         curthread->t_flag |= T_WOULDBLOCK;
1710 
1711                 if (niovp != iov)
1712                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1713 
1714                 if (!error) {
1715                         data_written = 1;
1716                         /*
1717                          * Get attributes again so we send the latest mod
1718                          * time to the client side for its cache.
1719                          */
1720                         va.va_mask = AT_ALL;    /* now we want everything */
1721 
1722                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1723 
1724                         if (!error)
1725                                 acl_perm(vp, exi, &va, rp->cr);
1726                 }
1727 
1728                 /*
1729                  * Fill in the status responses for each request
1730                  * which was just handled.  Also, copy the latest
1731                  * attributes in to the attribute responses if
1732                  * appropriate.
1733                  */
1734                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1735                 do {
1736                         rp->thread->t_flag |= t_flag;
1737                         /* check for overflows */
1738                         if (!error) {
1739                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1740                         }
1741                         rp->ns->ns_status = puterrno(error);
1742                         rp = rp->list;
1743                 } while (rp != lrp);
1744         } while (rp != NULL);
1745 
1746         /*
1747          * If any data was written at all, then we need to flush
1748          * the data and metadata to stable storage.
1749          */
1750         if (data_written) {
1751                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1752 
1753                 if (!error) {
1754                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1755                 }
1756         }
1757 
1758         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1759 
1760         if (in_crit)
1761                 nbl_end_crit(vp);
1762         VN_RELE(vp);
1763 
1764         t_flag = curthread->t_flag & T_WOULDBLOCK;
1765         mutex_enter(&nsrv->async_write_lock);
1766         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1767                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1768                         rp->ns->ns_status = puterrno(error);
1769                         rp->thread->t_flag |= t_flag;
1770                 }
1771         }
1772         cv_broadcast(&nlp->cv);
1773         mutex_exit(&nsrv->async_write_lock);
1774 
1775 }
1776 
1777 void *
1778 rfs_write_getfh(struct nfswriteargs *wa)
1779 {
1780         return (&wa->wa_fhandle);
1781 }
1782 
1783 /*
1784  * Create a file.
1785  * Creates a file with given attributes and returns those attributes
1786  * and an fhandle for the new file.
1787  */
1788 void
1789 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1790     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1791 {
1792         int error;
1793         int lookuperr;
1794         int in_crit = 0;
1795         struct vattr va;
1796         vnode_t *vp;
1797         vnode_t *realvp;
1798         vnode_t *dvp;
1799         char *name = args->ca_da.da_name;
1800         vnode_t *tvp = NULL;
1801         int mode;
1802         int lookup_ok;
1803         bool_t trunc;
1804         struct sockaddr *ca;
1805 
1806         /*
1807          * Disallow NULL paths
1808          */
1809         if (name == NULL || *name == '\0') {
1810                 dr->dr_status = NFSERR_ACCES;
1811                 return;
1812         }
1813 
1814         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1815         if (dvp == NULL) {
1816                 dr->dr_status = NFSERR_STALE;
1817                 return;
1818         }
1819 
1820         error = sattr_to_vattr(args->ca_sa, &va);
1821         if (error) {
1822                 dr->dr_status = puterrno(error);
1823                 return;
1824         }
1825 
1826         /*
1827          * Must specify the mode.
1828          */
1829         if (!(va.va_mask & AT_MODE)) {
1830                 VN_RELE(dvp);
1831                 dr->dr_status = NFSERR_INVAL;
1832                 return;
1833         }
1834 
1835         if (protect_zfs_mntpt(dvp) != 0) {
1836                 VN_RELE(dvp);
1837                 dr->dr_status = NFSERR_ACCES;
1838                 return;
1839         }
1840 
1841         /*
1842          * This is a completely gross hack to make mknod
1843          * work over the wire until we can wack the protocol
1844          */
1845         if ((va.va_mode & IFMT) == IFCHR) {
1846                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1847                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1848                 else {
1849                         va.va_type = VCHR;
1850                         /*
1851                          * uncompress the received dev_t
1852                          * if the top half is zero indicating a request
1853                          * from an `older style' OS.
1854                          */
1855                         if ((va.va_size & 0xffff0000) == 0)
1856                                 va.va_rdev = nfsv2_expdev(va.va_size);
1857                         else
1858                                 va.va_rdev = (dev_t)va.va_size;
1859                 }
1860                 va.va_mask &= ~AT_SIZE;
1861         } else if ((va.va_mode & IFMT) == IFBLK) {
1862                 va.va_type = VBLK;
1863                 /*
1864                  * uncompress the received dev_t
1865                  * if the top half is zero indicating a request
1866                  * from an `older style' OS.
1867                  */
1868                 if ((va.va_size & 0xffff0000) == 0)
1869                         va.va_rdev = nfsv2_expdev(va.va_size);
1870                 else
1871                         va.va_rdev = (dev_t)va.va_size;
1872                 va.va_mask &= ~AT_SIZE;
1873         } else if ((va.va_mode & IFMT) == IFSOCK) {
1874                 va.va_type = VSOCK;
1875         } else {
1876                 va.va_type = VREG;
1877         }
1878         va.va_mode &= ~IFMT;
1879         va.va_mask |= AT_TYPE;
1880 
1881         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1882         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1883             MAXPATHLEN);
1884         if (name == NULL) {
1885                 dr->dr_status = puterrno(EINVAL);
1886                 return;
1887         }
1888 
1889         /*
1890          * Why was the choice made to use VWRITE as the mode to the
1891          * call to VOP_CREATE ? This results in a bug.  When a client
1892          * opens a file that already exists and is RDONLY, the second
1893          * open fails with an EACESS because of the mode.
1894          * bug ID 1054648.
1895          */
1896         lookup_ok = 0;
1897         mode = VWRITE;
1898         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1899                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1900                     NULL, NULL, NULL);
1901                 if (!error) {
1902                         struct vattr at;
1903 
1904                         lookup_ok = 1;
1905                         at.va_mask = AT_MODE;
1906                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1907                         if (!error)
1908                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1909                         VN_RELE(tvp);
1910                         tvp = NULL;
1911                 }
1912         }
1913 
1914         if (!lookup_ok) {
1915                 if (rdonly(ro, dvp)) {
1916                         error = EROFS;
1917                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1918                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1919                         error = EPERM;
1920                 } else {
1921                         error = 0;
1922                 }
1923         }
1924 
1925         /*
1926          * If file size is being modified on an already existing file
1927          * make sure that there are no conflicting non-blocking mandatory
1928          * locks in the region being manipulated. Return EACCES if there
1929          * are conflicting locks.
1930          */
1931         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1932                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1933                     NULL, NULL, NULL);
1934 
1935                 if (!lookuperr &&
1936                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1937                         VN_RELE(tvp);
1938                         curthread->t_flag |= T_WOULDBLOCK;
1939                         goto out;
1940                 }
1941 
1942                 if (!lookuperr && nbl_need_check(tvp)) {
1943                         /*
1944                          * The file exists. Now check if it has any
1945                          * conflicting non-blocking mandatory locks
1946                          * in the region being changed.
1947                          */
1948                         struct vattr bva;
1949                         u_offset_t offset;
1950                         ssize_t length;
1951 
1952                         nbl_start_crit(tvp, RW_READER);
1953                         in_crit = 1;
1954 
1955                         bva.va_mask = AT_SIZE;
1956                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1957                         if (!error) {
1958                                 if (va.va_size < bva.va_size) {
1959                                         offset = va.va_size;
1960                                         length = bva.va_size - va.va_size;
1961                                 } else {
1962                                         offset = bva.va_size;
1963                                         length = va.va_size - bva.va_size;
1964                                 }
1965                                 if (length) {
1966                                         if (nbl_conflict(tvp, NBL_WRITE,
1967                                             offset, length, 0, NULL)) {
1968                                                 error = EACCES;
1969                                         }
1970                                 }
1971                         }
1972                         if (error) {
1973                                 nbl_end_crit(tvp);
1974                                 VN_RELE(tvp);
1975                                 in_crit = 0;
1976                         }
1977                 } else if (tvp != NULL) {
1978                         VN_RELE(tvp);
1979                 }
1980         }
1981 
1982         if (!error) {
1983                 /*
1984                  * If filesystem is shared with nosuid the remove any
1985                  * setuid/setgid bits on create.
1986                  */
1987                 if (va.va_type == VREG &&
1988                     exi->exi_export.ex_flags & EX_NOSUID)
1989                         va.va_mode &= ~(VSUID | VSGID);
1990 
1991                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1992                     NULL, NULL);
1993 
1994                 if (!error) {
1995 
1996                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1997                                 trunc = TRUE;
1998                         else
1999                                 trunc = FALSE;
2000 
2001                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2002                                 VN_RELE(vp);
2003                                 curthread->t_flag |= T_WOULDBLOCK;
2004                                 goto out;
2005                         }
2006                         va.va_mask = AT_ALL;
2007 
2008                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2009 
2010                         /* check for overflows */
2011                         if (!error) {
2012                                 acl_perm(vp, exi, &va, cr);
2013                                 error = vattr_to_nattr(&va, &dr->dr_attr);
2014                                 if (!error) {
2015                                         error = makefh(&dr->dr_fhandle, vp,
2016                                             exi);
2017                                 }
2018                         }
2019                         /*
2020                          * Force modified metadata out to stable storage.
2021                          *
2022                          * if a underlying vp exists, pass it to VOP_FSYNC
2023                          */
2024                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
2025                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2026                         else
2027                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2028                         VN_RELE(vp);
2029                 }
2030 
2031                 if (in_crit) {
2032                         nbl_end_crit(tvp);
2033                         VN_RELE(tvp);
2034                 }
2035         }
2036 
2037         /*
2038          * Force modified data and metadata out to stable storage.
2039          */
2040         (void) VOP_FSYNC(dvp, 0, cr, NULL);
2041 
2042 out:
2043 
2044         VN_RELE(dvp);
2045 
2046         dr->dr_status = puterrno(error);
2047 
2048         if (name != args->ca_da.da_name)
2049                 kmem_free(name, MAXPATHLEN);
2050 }
2051 void *
2052 rfs_create_getfh(struct nfscreatargs *args)
2053 {
2054         return (args->ca_da.da_fhandle);
2055 }
2056 
2057 /*
2058  * Remove a file.
2059  * Remove named file from parent directory.
2060  */
2061 /* ARGSUSED */
2062 void
2063 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2064     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2065 {
2066         int error = 0;
2067         vnode_t *vp;
2068         vnode_t *targvp;
2069         int in_crit = 0;
2070 
2071         /*
2072          * Disallow NULL paths
2073          */
2074         if (da->da_name == NULL || *da->da_name == '\0') {
2075                 *status = NFSERR_ACCES;
2076                 return;
2077         }
2078 
2079         vp = nfs_fhtovp(da->da_fhandle, exi);
2080         if (vp == NULL) {
2081                 *status = NFSERR_STALE;
2082                 return;
2083         }
2084 
2085         if (rdonly(ro, vp)) {
2086                 VN_RELE(vp);
2087                 *status = NFSERR_ROFS;
2088                 return;
2089         }
2090 
2091         /*
2092          * Check for a conflict with a non-blocking mandatory share reservation.
2093          */
2094         error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2095             NULL, cr, NULL, NULL, NULL);
2096         if (error != 0) {
2097                 VN_RELE(vp);
2098                 *status = puterrno(error);
2099                 return;
2100         }
2101 
2102         /*
2103          * If the file is delegated to an v4 client, then initiate
2104          * recall and drop this request (by setting T_WOULDBLOCK).
2105          * The client will eventually re-transmit the request and
2106          * (hopefully), by then, the v4 client will have returned
2107          * the delegation.
2108          */
2109 
2110         if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2111                 VN_RELE(vp);
2112                 VN_RELE(targvp);
2113                 curthread->t_flag |= T_WOULDBLOCK;
2114                 return;
2115         }
2116 
2117         if (nbl_need_check(targvp)) {
2118                 nbl_start_crit(targvp, RW_READER);
2119                 in_crit = 1;
2120                 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2121                         error = EACCES;
2122                         goto out;
2123                 }
2124         }
2125 
2126         error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2127 
2128         /*
2129          * Force modified data and metadata out to stable storage.
2130          */
2131         (void) VOP_FSYNC(vp, 0, cr, NULL);
2132 
2133 out:
2134         if (in_crit)
2135                 nbl_end_crit(targvp);
2136         VN_RELE(targvp);
2137         VN_RELE(vp);
2138 
2139         *status = puterrno(error);
2140 
2141 }
2142 
2143 void *
2144 rfs_remove_getfh(struct nfsdiropargs *da)
2145 {
2146         return (da->da_fhandle);
2147 }
2148 
2149 /*
2150  * rename a file
2151  * Give a file (from) a new name (to).
2152  */
2153 /* ARGSUSED */
2154 void
2155 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2156     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2157 {
2158         int error = 0;
2159         vnode_t *fromvp;
2160         vnode_t *tovp;
2161         struct exportinfo *to_exi;
2162         fhandle_t *fh;
2163         vnode_t *srcvp;
2164         vnode_t *targvp;
2165         int in_crit = 0;
2166 
2167         fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2168         if (fromvp == NULL) {
2169                 *status = NFSERR_STALE;
2170                 return;
2171         }
2172 
2173         fh = args->rna_to.da_fhandle;
2174         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2175         if (to_exi == NULL) {
2176                 VN_RELE(fromvp);
2177                 *status = NFSERR_ACCES;
2178                 return;
2179         }
2180         exi_rele(&to_exi);
2181 
2182         if (to_exi != exi) {
2183                 VN_RELE(fromvp);
2184                 *status = NFSERR_XDEV;
2185                 return;
2186         }
2187 
2188         tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2189         if (tovp == NULL) {
2190                 VN_RELE(fromvp);
2191                 *status = NFSERR_STALE;
2192                 return;
2193         }
2194 
2195         if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2196                 VN_RELE(tovp);
2197                 VN_RELE(fromvp);
2198                 *status = NFSERR_NOTDIR;
2199                 return;
2200         }
2201 
2202         /*
2203          * Disallow NULL paths
2204          */
2205         if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2206             args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2207                 VN_RELE(tovp);
2208                 VN_RELE(fromvp);
2209                 *status = NFSERR_ACCES;
2210                 return;
2211         }
2212 
2213         if (rdonly(ro, tovp)) {
2214                 VN_RELE(tovp);
2215                 VN_RELE(fromvp);
2216                 *status = NFSERR_ROFS;
2217                 return;
2218         }
2219 
2220         if (protect_zfs_mntpt(tovp) != 0) {
2221                 VN_RELE(tovp);
2222                 VN_RELE(fromvp);
2223                 *status = NFSERR_ACCES;
2224                 return;
2225         }
2226 
2227         /*
2228          * Check for a conflict with a non-blocking mandatory share reservation.
2229          */
2230         error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2231             NULL, cr, NULL, NULL, NULL);
2232         if (error != 0) {
2233                 VN_RELE(tovp);
2234                 VN_RELE(fromvp);
2235                 *status = puterrno(error);
2236                 return;
2237         }
2238 
2239         /* Check for delegations on the source file */
2240 
2241         if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2242                 VN_RELE(tovp);
2243                 VN_RELE(fromvp);
2244                 VN_RELE(srcvp);
2245                 curthread->t_flag |= T_WOULDBLOCK;
2246                 return;
2247         }
2248 
2249         /* Check for delegation on the file being renamed over, if it exists */
2250 
2251         if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2252             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2253             NULL, NULL, NULL) == 0) {
2254 
2255                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2256                         VN_RELE(tovp);
2257                         VN_RELE(fromvp);
2258                         VN_RELE(srcvp);
2259                         VN_RELE(targvp);
2260                         curthread->t_flag |= T_WOULDBLOCK;
2261                         return;
2262                 }
2263                 VN_RELE(targvp);
2264         }
2265 
2266 
2267         if (nbl_need_check(srcvp)) {
2268                 nbl_start_crit(srcvp, RW_READER);
2269                 in_crit = 1;
2270                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2271                         error = EACCES;
2272                         goto out;
2273                 }
2274         }
2275 
2276         error = VOP_RENAME(fromvp, args->rna_from.da_name,
2277             tovp, args->rna_to.da_name, cr, NULL, 0);
2278 
2279         if (error == 0)
2280                 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2281                     strlen(args->rna_to.da_name));
2282 
2283         /*
2284          * Force modified data and metadata out to stable storage.
2285          */
2286         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2287         (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2288 
2289 out:
2290         if (in_crit)
2291                 nbl_end_crit(srcvp);
2292         VN_RELE(srcvp);
2293         VN_RELE(tovp);
2294         VN_RELE(fromvp);
2295 
2296         *status = puterrno(error);
2297 
2298 }
2299 void *
2300 rfs_rename_getfh(struct nfsrnmargs *args)
2301 {
2302         return (args->rna_from.da_fhandle);
2303 }
2304 
2305 /*
2306  * Link to a file.
2307  * Create a file (to) which is a hard link to the given file (from).
2308  */
2309 /* ARGSUSED */
2310 void
2311 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2312     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2313 {
2314         int error;
2315         vnode_t *fromvp;
2316         vnode_t *tovp;
2317         struct exportinfo *to_exi;
2318         fhandle_t *fh;
2319 
2320         fromvp = nfs_fhtovp(args->la_from, exi);
2321         if (fromvp == NULL) {
2322                 *status = NFSERR_STALE;
2323                 return;
2324         }
2325 
2326         fh = args->la_to.da_fhandle;
2327         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2328         if (to_exi == NULL) {
2329                 VN_RELE(fromvp);
2330                 *status = NFSERR_ACCES;
2331                 return;
2332         }
2333         exi_rele(&to_exi);
2334 
2335         if (to_exi != exi) {
2336                 VN_RELE(fromvp);
2337                 *status = NFSERR_XDEV;
2338                 return;
2339         }
2340 
2341         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2342         if (tovp == NULL) {
2343                 VN_RELE(fromvp);
2344                 *status = NFSERR_STALE;
2345                 return;
2346         }
2347 
2348         if (tovp->v_type != VDIR) {
2349                 VN_RELE(tovp);
2350                 VN_RELE(fromvp);
2351                 *status = NFSERR_NOTDIR;
2352                 return;
2353         }
2354         /*
2355          * Disallow NULL paths
2356          */
2357         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2358                 VN_RELE(tovp);
2359                 VN_RELE(fromvp);
2360                 *status = NFSERR_ACCES;
2361                 return;
2362         }
2363 
2364         if (rdonly(ro, tovp)) {
2365                 VN_RELE(tovp);
2366                 VN_RELE(fromvp);
2367                 *status = NFSERR_ROFS;
2368                 return;
2369         }
2370 
2371         if (protect_zfs_mntpt(tovp) != 0) {
2372                 VN_RELE(tovp);
2373                 VN_RELE(fromvp);
2374                 *status = NFSERR_ACCES;
2375                 return;
2376         }
2377 
2378         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2379 
2380         /*
2381          * Force modified data and metadata out to stable storage.
2382          */
2383         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2384         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2385 
2386         VN_RELE(tovp);
2387         VN_RELE(fromvp);
2388 
2389         *status = puterrno(error);
2390 
2391 }
2392 void *
2393 rfs_link_getfh(struct nfslinkargs *args)
2394 {
2395         return (args->la_from);
2396 }
2397 
2398 /*
2399  * Symbolicly link to a file.
2400  * Create a file (from) with the given attributes which is a symbolic link
2401  * to the given path name (to).
2402  */
2403 void
2404 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2405     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2406 {
2407         int error;
2408         struct vattr va;
2409         vnode_t *vp;
2410         vnode_t *svp;
2411         int lerror;
2412         struct sockaddr *ca;
2413         char *name = NULL;
2414 
2415         /*
2416          * Disallow NULL paths
2417          */
2418         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2419                 *status = NFSERR_ACCES;
2420                 return;
2421         }
2422 
2423         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2424         if (vp == NULL) {
2425                 *status = NFSERR_STALE;
2426                 return;
2427         }
2428 
2429         if (rdonly(ro, vp)) {
2430                 VN_RELE(vp);
2431                 *status = NFSERR_ROFS;
2432                 return;
2433         }
2434 
2435         error = sattr_to_vattr(args->sla_sa, &va);
2436         if (error) {
2437                 VN_RELE(vp);
2438                 *status = puterrno(error);
2439                 return;
2440         }
2441 
2442         if (!(va.va_mask & AT_MODE)) {
2443                 VN_RELE(vp);
2444                 *status = NFSERR_INVAL;
2445                 return;
2446         }
2447 
2448         if (protect_zfs_mntpt(vp) != 0) {
2449                 VN_RELE(vp);
2450                 *status = NFSERR_ACCES;
2451                 return;
2452         }
2453 
2454         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2455         name = nfscmd_convname(ca, exi, args->sla_tnm,
2456             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2457 
2458         if (name == NULL) {
2459                 *status = NFSERR_ACCES;
2460                 return;
2461         }
2462 
2463         va.va_type = VLNK;
2464         va.va_mask |= AT_TYPE;
2465 
2466         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2467 
2468         /*
2469          * Force new data and metadata out to stable storage.
2470          */
2471         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2472             NULL, cr, NULL, NULL, NULL);
2473 
2474         if (!lerror) {
2475                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2476                 VN_RELE(svp);
2477         }
2478 
2479         /*
2480          * Force modified data and metadata out to stable storage.
2481          */
2482         (void) VOP_FSYNC(vp, 0, cr, NULL);
2483 
2484         VN_RELE(vp);
2485 
2486         *status = puterrno(error);
2487         if (name != args->sla_tnm)
2488                 kmem_free(name, MAXPATHLEN);
2489 
2490 }
2491 void *
2492 rfs_symlink_getfh(struct nfsslargs *args)
2493 {
2494         return (args->sla_from.da_fhandle);
2495 }
2496 
2497 /*
2498  * Make a directory.
2499  * Create a directory with the given name, parent directory, and attributes.
2500  * Returns a file handle and attributes for the new directory.
2501  */
2502 /* ARGSUSED */
2503 void
2504 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2505     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2506 {
2507         int error;
2508         struct vattr va;
2509         vnode_t *dvp = NULL;
2510         vnode_t *vp;
2511         char *name = args->ca_da.da_name;
2512 
2513         /*
2514          * Disallow NULL paths
2515          */
2516         if (name == NULL || *name == '\0') {
2517                 dr->dr_status = NFSERR_ACCES;
2518                 return;
2519         }
2520 
2521         vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2522         if (vp == NULL) {
2523                 dr->dr_status = NFSERR_STALE;
2524                 return;
2525         }
2526 
2527         if (rdonly(ro, vp)) {
2528                 VN_RELE(vp);
2529                 dr->dr_status = NFSERR_ROFS;
2530                 return;
2531         }
2532 
2533         error = sattr_to_vattr(args->ca_sa, &va);
2534         if (error) {
2535                 VN_RELE(vp);
2536                 dr->dr_status = puterrno(error);
2537                 return;
2538         }
2539 
2540         if (!(va.va_mask & AT_MODE)) {
2541                 VN_RELE(vp);
2542                 dr->dr_status = NFSERR_INVAL;
2543                 return;
2544         }
2545 
2546         if (protect_zfs_mntpt(vp) != 0) {
2547                 VN_RELE(vp);
2548                 dr->dr_status = NFSERR_ACCES;
2549                 return;
2550         }
2551 
2552         va.va_type = VDIR;
2553         va.va_mask |= AT_TYPE;
2554 
2555         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2556 
2557         if (!error) {
2558                 /*
2559                  * Attribtutes of the newly created directory should
2560                  * be returned to the client.
2561                  */
2562                 va.va_mask = AT_ALL; /* We want everything */
2563                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2564 
2565                 /* check for overflows */
2566                 if (!error) {
2567                         acl_perm(vp, exi, &va, cr);
2568                         error = vattr_to_nattr(&va, &dr->dr_attr);
2569                         if (!error) {
2570                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2571                         }
2572                 }
2573                 /*
2574                  * Force new data and metadata out to stable storage.
2575                  */
2576                 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2577                 VN_RELE(dvp);
2578         }
2579 
2580         /*
2581          * Force modified data and metadata out to stable storage.
2582          */
2583         (void) VOP_FSYNC(vp, 0, cr, NULL);
2584 
2585         VN_RELE(vp);
2586 
2587         dr->dr_status = puterrno(error);
2588 
2589 }
2590 void *
2591 rfs_mkdir_getfh(struct nfscreatargs *args)
2592 {
2593         return (args->ca_da.da_fhandle);
2594 }
2595 
2596 /*
2597  * Remove a directory.
2598  * Remove the given directory name from the given parent directory.
2599  */
/*
 * rfs_rmdir - NFSv2 RMDIR service routine.
 *
 * Removes directory da->da_name from the parent directory identified
 * by da->da_fhandle.  The NFS status is returned through *status.
 * 'ro' indicates the client reached us through a read-only export.
 */
/* ARGSUSED */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
        int error;
        vnode_t *vp;

        /*
         * Disallow NULL paths
         */
        if (da->da_name == NULL || *da->da_name == '\0') {
                *status = NFSERR_ACCES;
                return;
        }

        /* Translate the file handle into the parent directory vnode. */
        vp = nfs_fhtovp(da->da_fhandle, exi);
        if (vp == NULL) {
                *status = NFSERR_STALE;
                return;
        }

        /* Reject modifications through a read-only export. */
        if (rdonly(ro, vp)) {
                VN_RELE(vp);
                *status = NFSERR_ROFS;
                return;
        }

        /*
         * VOP_RMDIR takes a third argument (the current
         * directory of the process).  That's because someone
         * wants to return EINVAL if one tries to remove ".".
         * Of course, NFS servers have no idea what their
         * clients' current directories are.  We fake it by
         * supplying a vnode known to exist and illegal to
         * remove.
         */
        error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);

        /*
         * Force modified data and metadata out to stable storage.
         */
        (void) VOP_FSYNC(vp, 0, cr, NULL);

        VN_RELE(vp);

        /*
         * System V defines rmdir to return EEXIST, not ENOTEMPTY,
         * if the directory is not empty.  A System V NFS server
         * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
         * over the wire.
         */
        if (error == EEXIST)
                *status = NFSERR_NOTEMPTY;
        else
                *status = puterrno(error);

}
2658 void *
2659 rfs_rmdir_getfh(struct nfsdiropargs *da)
2660 {
2661         return (da->da_fhandle);
2662 }
2663 
/*
 * Advance to the next dirent64 in a buffer filled by VOP_READDIR();
 * d_reclen is the total record length of the current entry.
 */
#ifdef nextdp
#undef nextdp
#endif
#define nextdp(dp)      ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
2668 
/*
 * rfs_readdir - NFSv2 READDIR service routine.
 *
 * Reads directory entries from the directory identified by rda->rda_fh,
 * starting at cookie rda->rda_offset, and builds the linked list of
 * nfsentry structures returned through rd->rd_entries.  The estimated
 * XDR-encoded size of the reply is kept at or below rda->rda_count
 * bytes.  The entry list is later freed by rfs_rddirfree().
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
        int error;
        vnode_t *vp;
        struct iovec iov;
        struct uio uio;
        int iseof;

        uint32_t count = rda->rda_count;
        uint32_t size;          /* size of the readdirres structure */
        int overflow = 0;       /* set once a cookie exceeds 32 bits */

        size_t datasz;          /* size of the VOP_READDIR scratch buffer */
        char *data = NULL;
        dirent64_t *dp;

        struct sockaddr *ca;
        struct nfsentry **eptr; /* tail pointer of the reply entry list */
        struct nfsentry *entry; /* most recently appended entry */

        vp = nfs_fhtovp(&rda->rda_fh, exi);
        if (vp == NULL) {
                rd->rd_status = NFSERR_STALE;
                return;
        }

        if (vp->v_type != VDIR) {
                VN_RELE(vp);
                rd->rd_status = NFSERR_NOTDIR;
                return;
        }

        /* Shared (reader) lock is enough; we only read the directory. */
        (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

        error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
        if (error)
                goto bad;

        /*
         * Don't allow arbitrary counts for allocation
         */
        if (count > NFS_MAXDATA)
                count = NFS_MAXDATA;

        /*
         * struct readdirres:
         *   status:            1
         *   entries (bool):    1
         *   eof:               1
         */
        size = (1 + 1 + 1) * BYTES_PER_XDR_UNIT;

        /*
         * If even the fixed reply overhead exceeds the client's count,
         * return an empty, non-EOF reply.
         */
        if (size > count) {
                eptr = &rd->rd_entries;
                iseof = 0;
                size = 0;

                goto done;
        }

        /*
         * This is simplification.  The dirent64_t size is not the same as the
         * size of XDR representation of entry, but the sizes are similar so
         * we'll assume they are same.  This assumption should not cause any
         * harm.  In worst case we will need to issue VOP_READDIR() once more.
         */
        datasz = count;

        /*
         * Make sure that there is room to read at least one entry
         * if any are available.
         */
        if (datasz < DIRENT64_RECLEN(MAXNAMELEN))
                datasz = DIRENT64_RECLEN(MAXNAMELEN);

        data = kmem_alloc(datasz, KM_NOSLEEP);
        if (data == NULL) {
                /* The allocation failed; downsize and wait for it this time */
                if (datasz > MAXBSIZE)
                        datasz = MAXBSIZE;
                data = kmem_alloc(datasz, KM_SLEEP);
        }

        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_extflg = UIO_COPY_CACHED;
        uio.uio_loffset = (offset_t)rda->rda_offset;
        uio.uio_resid = datasz;

        /* Caller's address, needed for charset conversion of names. */
        ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
        eptr = &rd->rd_entries;
        entry = NULL;

getmoredents:
        iov.iov_base = data;
        iov.iov_len = datasz;

        error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
        if (error) {
                iseof = 0;
                goto done;
        }

        /* iov_len unchanged means no bytes were returned; we are done. */
        if (iov.iov_len == datasz)
                goto done;

        /*
         * Walk the dirent64 records just read; datasz - iov.iov_len is
         * the number of bytes VOP_READDIR() actually produced.
         */
        for (dp = (dirent64_t *)data;
            (char *)dp - data < datasz - iov.iov_len && !overflow;
            dp = nextdp(dp)) {
                char *name;
                uint32_t esize;
                uint32_t cookie;

                /*
                 * NFSv2 cookies are 32-bit; once an offset no longer
                 * fits, stop here and pretend we hit EOF.
                 */
                overflow = (uint64_t)dp->d_off > UINT32_MAX;
                if (overflow) {
                        cookie = 0;
                        iseof = 1;
                } else
                        cookie = (uint32_t)dp->d_off;

                /*
                 * Skip entries that cannot be represented (deleted, or
                 * inode number wider than 32 bits), but propagate their
                 * cookie to the previous entry so the client resumes
                 * past the skipped record.
                 */
                if (dp->d_ino == 0 || (uint64_t)dp->d_ino > UINT32_MAX) {
                        if (entry != NULL)
                                entry->cookie = cookie;
                        continue;
                }

                /* Convert the name to the client's charset, if needed. */
                name = nfscmd_convname(ca, exi, dp->d_name,
                    NFSCMD_CONV_OUTBOUND, NFS_MAXPATHLEN + 1);
                if (name == NULL) {
                        if (entry != NULL)
                                entry->cookie = cookie;
                        continue;
                }

                /*
                 * struct entry:
                 *   fileid:            1
                 *   name (length):     1
                 *   name (data):       length (rounded up)
                 *   cookie:            1
                 *   nextentry (bool):  1
                 */
                esize = (1 + 1 + 1 + 1) * BYTES_PER_XDR_UNIT +
                    RNDUP(strlen(name));

                /* If the new entry does not fit, discard it */
                if (esize > count - size) {
                        if (name != dp->d_name)
                                kmem_free(name, NFS_MAXPATHLEN + 1);
                        iseof = 0;
                        goto done;
                }

                entry = kmem_alloc(sizeof (struct nfsentry), KM_SLEEP);

                entry->fileid = (uint32_t)dp->d_ino;
                entry->name = strdup(name);
                if (name != dp->d_name)
                        kmem_free(name, NFS_MAXPATHLEN + 1);
                entry->cookie = cookie;

                size += esize;

                /* Add the entry to the linked list */
                *eptr = entry;
                eptr = &entry->nextentry;
        }

        /*
         * Not at EOF and there is still room in the reply: read another
         * buffer full (capped at MAXBSIZE for subsequent passes).
         */
        if (!iseof && size < count) {
                uio.uio_resid = MIN(datasz, MAXBSIZE);
                goto getmoredents;
        }

done:
        /* Terminate the reply list. */
        *eptr = NULL;

        /*
         * The call succeeds if we reached EOF, produced at least one
         * entry, or saw no error at all; a partial result with entries
         * is still reported as success.
         */
        if (iseof || rd->rd_entries != NULL || !error) {
                error = 0;
                rd->rd_eof = iseof ? TRUE : FALSE;

                /* This is for nfslog only */
                rd->rd_offset = rda->rda_offset;
                rd->rd_size = size;
        }

bad:
        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
        /*
         * Don't do this.  It causes local disk writes when just
         * reading the file and the overhead is deemed larger
         * than the benefit.
         */
        /*
         * Force modified metadata out to stable storage.
         */
        (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

        VN_RELE(vp);

        rd->rd_status = puterrno(error);

        if (data != NULL)
                kmem_free(data, datasz);
}
2880 void *
2881 rfs_readdir_getfh(struct nfsrddirargs *rda)
2882 {
2883         return (&rda->rda_fh);
2884 }
2885 void
2886 rfs_rddirfree(struct nfsrddirres *rd)
2887 {
2888         if (rd->rd_status == NFS_OK) {
2889                 struct nfsentry *entry, *nentry;
2890 
2891                 for (entry = rd->rd_entries; entry != NULL; entry = nentry) {
2892                         nentry = entry->nextentry;
2893                         strfree(entry->name);
2894                         kmem_free(entry, sizeof (struct nfsentry));
2895                 }
2896         }
2897 }
2898 
2899 /* ARGSUSED */
2900 void
2901 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2902     struct svc_req *req, cred_t *cr, bool_t ro)
2903 {
2904         int error;
2905         struct statvfs64 sb;
2906         vnode_t *vp;
2907 
2908         vp = nfs_fhtovp(fh, exi);
2909         if (vp == NULL) {
2910                 fs->fs_status = NFSERR_STALE;
2911                 return;
2912         }
2913 
2914         error = VFS_STATVFS(vp->v_vfsp, &sb);
2915 
2916         if (!error) {
2917                 fs->fs_tsize = nfstsize();
2918                 fs->fs_bsize = sb.f_frsize;
2919                 fs->fs_blocks = sb.f_blocks;
2920                 fs->fs_bfree = sb.f_bfree;
2921                 fs->fs_bavail = sb.f_bavail;
2922         }
2923 
2924         VN_RELE(vp);
2925 
2926         fs->fs_status = puterrno(error);
2927 
2928 }
2929 void *
2930 rfs_statfs_getfh(fhandle_t *fh)
2931 {
2932         return (fh);
2933 }
2934 
/*
 * Convert the NFSv2 settable attributes (nfssattr) into a vattr for
 * use with VOP_SETATTR()/VOP_CREATE().  A wire value of all-ones in a
 * field means "don't set"; such fields are left out of va_mask.
 *
 * Returns 0 on success, or EOVERFLOW on 32-bit kernels when a client
 * supplied time that does not fit in time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
        vap->va_mask = 0;

        /*
         * There was a sign extension bug in some VFS based systems
         * which stored the mode as a short.  When it would get
         * assigned to a u_long, no sign extension would occur.
         * It needed to, but this wasn't noticed because sa_mode
         * would then get assigned back to the short, thus ignoring
         * the upper 16 bits of sa_mode.
         *
         * To make this implementation work for both broken
         * clients and good clients, we check for both versions
         * of the mode.
         */
        if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
            sa->sa_mode != (uint32_t)-1) {
                vap->va_mask |= AT_MODE;
                vap->va_mode = sa->sa_mode;
        }
        if (sa->sa_uid != (uint32_t)-1) {
                vap->va_mask |= AT_UID;
                vap->va_uid = sa->sa_uid;
        }
        if (sa->sa_gid != (uint32_t)-1) {
                vap->va_mask |= AT_GID;
                vap->va_gid = sa->sa_gid;
        }
        if (sa->sa_size != (uint32_t)-1) {
                vap->va_mask |= AT_SIZE;
                vap->va_size = sa->sa_size;
        }
        /* Both tv_sec and tv_usec must be set for the time to count. */
        if (sa->sa_atime.tv_sec != (int32_t)-1 &&
            sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
                /* return error if time overflow */
                if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
                        return (EOVERFLOW);
#endif
                vap->va_mask |= AT_ATIME;
                /*
                 * nfs protocol defines times as unsigned so don't extend sign,
                 * unless sysadmin set nfs_allow_preepoch_time.
                 */
                NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
                vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
        }
        if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
            sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
                /* return error if time overflow */
                if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
                        return (EOVERFLOW);
#endif
                vap->va_mask |= AT_MTIME;
                /*
                 * nfs protocol defines times as unsigned so don't extend sign,
                 * unless sysadmin set nfs_allow_preepoch_time.
                 */
                NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
                vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
        }
        return (0);
}
3001 
/*
 * Map vnode types (vtype_t, used as the index) to NFSv2 wire file
 * types; vnode types with no NFSv2 representation map to 0.
 * NOTE(review): VFIFO is handled separately via NA_SETFIFO() in
 * vattr_to_nattr() rather than through this table.
 */
static const enum nfsftype vt_to_nf[] = {
        0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
3005 
3006 /*
3007  * check the following fields for overflow: nodeid, size, and time.
3008  * There could be a problem when converting 64-bit LP64 fields
3009  * into 32-bit ones.  Return an error if there is an overflow.
3010  */
3011 int
3012 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
3013 {
3014         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
3015         na->na_type = vt_to_nf[vap->va_type];
3016 
3017         if (vap->va_mode == (unsigned short) -1)
3018                 na->na_mode = (uint32_t)-1;
3019         else
3020                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
3021 
3022         if (vap->va_uid == (unsigned short)(-1))
3023                 na->na_uid = (uint32_t)(-1);
3024         else if (vap->va_uid == UID_NOBODY)
3025                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
3026         else
3027                 na->na_uid = vap->va_uid;
3028 
3029         if (vap->va_gid == (unsigned short)(-1))
3030                 na->na_gid = (uint32_t)-1;
3031         else if (vap->va_gid == GID_NOBODY)
3032                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
3033         else
3034                 na->na_gid = vap->va_gid;
3035 
3036         /*
3037          * Do we need to check fsid for overflow?  It is 64-bit in the
3038          * vattr, but are bigger than 32 bit values supported?
3039          */
3040         na->na_fsid = vap->va_fsid;
3041 
3042         na->na_nodeid = vap->va_nodeid;
3043 
3044         /*
3045          * Check to make sure that the nodeid is representable over the
3046          * wire without losing bits.
3047          */
3048         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
3049                 return (EFBIG);
3050         na->na_nlink = vap->va_nlink;
3051 
3052         /*
3053          * Check for big files here, instead of at the caller.  See
3054          * comments in cstat for large special file explanation.
3055          */
3056         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
3057                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
3058                         return (EFBIG);
3059                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
3060                         /* UNKNOWN_SIZE | OVERFLOW */
3061                         na->na_size = MAXOFF32_T;
3062                 } else
3063                         na->na_size = vap->va_size;
3064         } else
3065                 na->na_size = vap->va_size;
3066 
3067         /*
3068          * If the vnode times overflow the 32-bit times that NFS2
3069          * uses on the wire then return an error.
3070          */
3071         if (!NFS_VAP_TIME_OK(vap)) {
3072                 return (EOVERFLOW);
3073         }
3074         na->na_atime.tv_sec = vap->va_atime.tv_sec;
3075         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
3076 
3077         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
3078         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
3079 
3080         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
3081         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
3082 
3083         /*
3084          * If the dev_t will fit into 16 bits then compress
3085          * it, otherwise leave it alone. See comments in
3086          * nfs_client.c.
3087          */
3088         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
3089             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
3090                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
3091         else
3092                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
3093 
3094         na->na_blocks = vap->va_nblocks;
3095         na->na_blocksize = vap->va_blksize;
3096 
3097         /*
3098          * This bit of ugliness is a *TEMPORARY* hack to preserve the
3099          * over-the-wire protocols for named-pipe vnodes.  It remaps the
3100          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
3101          *
3102          * BUYER BEWARE:
3103          *  If you are porting the NFS to a non-Sun server, you probably
3104          *  don't want to include the following block of code.  The
3105          *  over-the-wire special file types will be changing with the
3106          *  NFS Protocol Revision.
3107          */
3108         if (vap->va_type == VFIFO)
3109                 NA_SETFIFO(na);
3110         return (0);
3111 }
3112 
3113 /*
3114  * acl v2 support: returns approximate permission.
3115  *      default: returns minimal permission (more restrictive)
3116  *      aclok: returns maximal permission (less restrictive)
3117  *      This routine changes the permissions that are alaredy in *va.
3118  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3119  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
3120  */
3121 static void
3122 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3123 {
3124         vsecattr_t      vsa;
3125         int             aclcnt;
3126         aclent_t        *aclentp;
3127         mode_t          mask_perm;
3128         mode_t          grp_perm;
3129         mode_t          other_perm;
3130         mode_t          other_orig;
3131         int             error;
3132 
3133         /* dont care default acl */
3134         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3135         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3136 
3137         if (!error) {
3138                 aclcnt = vsa.vsa_aclcnt;
3139                 if (aclcnt > MIN_ACL_ENTRIES) {
3140                         /* non-trivial ACL */
3141                         aclentp = vsa.vsa_aclentp;
3142                         if (exi->exi_export.ex_flags & EX_ACLOK) {
3143                                 /* maximal permissions */
3144                                 grp_perm = 0;
3145                                 other_perm = 0;
3146                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3147                                         switch (aclentp->a_type) {
3148                                         case USER_OBJ:
3149                                                 break;
3150                                         case USER:
3151                                                 grp_perm |=
3152                                                     aclentp->a_perm << 3;
3153                                                 other_perm |= aclentp->a_perm;
3154                                                 break;
3155                                         case GROUP_OBJ:
3156                                                 grp_perm |=
3157                                                     aclentp->a_perm << 3;
3158                                                 break;
3159                                         case GROUP:
3160                                                 other_perm |= aclentp->a_perm;
3161                                                 break;
3162                                         case OTHER_OBJ:
3163                                                 other_orig = aclentp->a_perm;
3164                                                 break;
3165                                         case CLASS_OBJ:
3166                                                 mask_perm = aclentp->a_perm;
3167                                                 break;
3168                                         default:
3169                                                 break;
3170                                         }
3171                                 }
3172                                 grp_perm &= mask_perm << 3;
3173                                 other_perm &= mask_perm;
3174                                 other_perm |= other_orig;
3175 
3176                         } else {
3177                                 /* minimal permissions */
3178                                 grp_perm = 070;
3179                                 other_perm = 07;
3180                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3181                                         switch (aclentp->a_type) {
3182                                         case USER_OBJ:
3183                                                 break;
3184                                         case USER:
3185                                         case CLASS_OBJ:
3186                                                 grp_perm &=
3187                                                     aclentp->a_perm << 3;
3188                                                 other_perm &=
3189                                                     aclentp->a_perm;
3190                                                 break;
3191                                         case GROUP_OBJ:
3192                                                 grp_perm &=
3193                                                     aclentp->a_perm << 3;
3194                                                 break;
3195                                         case GROUP:
3196                                                 other_perm &=
3197                                                     aclentp->a_perm;
3198                                                 break;
3199                                         case OTHER_OBJ:
3200                                                 other_perm &=
3201                                                     aclentp->a_perm;
3202                                                 break;
3203                                         default:
3204                                                 break;
3205                                         }
3206                                 }
3207                         }
3208                         /* copy to va */
3209                         va->va_mode &= ~077;
3210                         va->va_mode |= grp_perm | other_perm;
3211                 }
3212                 if (vsa.vsa_aclcnt)
3213                         kmem_free(vsa.vsa_aclentp,
3214                             vsa.vsa_aclcnt * sizeof (aclent_t));
3215         }
3216 }
3217 
/*
 * One-time NFSv2 server initialization: obtain the caller id used by
 * this module (nfs2_srv_caller_id) and register the per-zone state
 * constructor/destructor (rfs_zone_init/rfs_zone_fini) under
 * rfs_zone_key.
 */
void
rfs_srvrinit(void)
{
        nfs2_srv_caller_id = fs_new_caller_id();
        zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
}
3224 
/*
 * NFSv2 server teardown.  Nothing to undo here: per-zone state is
 * released by rfs_zone_fini() through the zone key registered in
 * rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
}
3229 
3230 /* ARGSUSED */
3231 static void *
3232 rfs_zone_init(zoneid_t zoneid)
3233 {
3234         nfs_srv_t *ns;
3235 
3236         ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3237 
3238         mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3239         ns->write_async = 1;
3240 
3241         return (ns);
3242 }
3243 
3244 /* ARGSUSED */
3245 static void
3246 rfs_zone_fini(zoneid_t zoneid, void *data)
3247 {
3248         nfs_srv_t *ns;
3249 
3250         ns = (nfs_srv_t *)data;
3251         mutex_destroy(&ns->async_write_lock);
3252         kmem_free(ns, sizeof (*ns));
3253 }
3254 
3255 static int
3256 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3257 {
3258         struct clist    *wcl;
3259         int             wlist_len;
3260         uint32_t        count = rr->rr_count;
3261 
3262         wcl = ra->ra_wlist;
3263 
3264         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3265                 return (FALSE);
3266         }
3267 
3268         wcl = ra->ra_wlist;
3269         rr->rr_ok.rrok_wlist_len = wlist_len;
3270         rr->rr_ok.rrok_wlist = wcl;
3271 
3272         return (TRUE);
3273 }