1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*
  29  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  30  *      All rights reserved.
  31  */
  32 
  33 /*
  34  * Copyright 2018 Nexenta Systems, Inc.
  35  * Copyright (c) 2016 by Delphix. All rights reserved.
  36  */
  37 
  38 #include <sys/param.h>
  39 #include <sys/types.h>
  40 #include <sys/systm.h>
  41 #include <sys/cred.h>
  42 #include <sys/buf.h>
  43 #include <sys/vfs.h>
  44 #include <sys/vnode.h>
  45 #include <sys/uio.h>
  46 #include <sys/stat.h>
  47 #include <sys/errno.h>
  48 #include <sys/sysmacros.h>
  49 #include <sys/statvfs.h>
  50 #include <sys/kmem.h>
  51 #include <sys/kstat.h>
  52 #include <sys/dirent.h>
  53 #include <sys/cmn_err.h>
  54 #include <sys/debug.h>
  55 #include <sys/vtrace.h>
  56 #include <sys/mode.h>
  57 #include <sys/acl.h>
  58 #include <sys/nbmlock.h>
  59 #include <sys/policy.h>
  60 #include <sys/sdt.h>
  61 
  62 #include <rpc/types.h>
  63 #include <rpc/auth.h>
  64 #include <rpc/svc.h>
  65 
  66 #include <nfs/nfs.h>
  67 #include <nfs/export.h>
  68 #include <nfs/nfs_cmd.h>
  69 
  70 #include <vm/hat.h>
  71 #include <vm/as.h>
  72 #include <vm/seg.h>
  73 #include <vm/seg_map.h>
  74 #include <vm/seg_kmem.h>
  75 
  76 #include <sys/strsubr.h>
  77 
struct rfs_async_write_list;

/*
 * Zone globals of NFSv2 server
 */
typedef struct nfs_srv {
	/* protects async_write_head below */
	kmutex_t			async_write_lock;
	/* list of pending clustered write requests */
	struct rfs_async_write_list	*async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int		write_async;
} nfs_srv_t;

/*
 * These are the interface routines for the server side of the
 * Network File System.  See the NFS version 2 protocol specification
 * for a description of this interface.
 */

/* convert an over-the-wire nfssattr into a kernel vattr */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);
static void	*rfs_zone_init(zoneid_t zoneid);
static void	rfs_zone_fini(zoneid_t zoneid, void *data);


/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/* caller_context_t identifier used by the v2 server's VOP calls */
u_longlong_t nfs2_srv_caller_id;
/* zone key for per-zone nfs_srv_t state (see rfs_zone_init/rfs_zone_fini) */
static zone_key_t rfs_zone_key;
 117 
 118 /*
 119  * Get file attributes.
 120  * Returns the current attributes of the file with the given fhandle.
 121  */
 122 /* ARGSUSED */
 123 void
 124 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 125     struct svc_req *req, cred_t *cr, bool_t ro)
 126 {
 127         int error;
 128         vnode_t *vp;
 129         struct vattr va;
 130 
 131         vp = nfs_fhtovp(fhp, exi);
 132         if (vp == NULL) {
 133                 ns->ns_status = NFSERR_STALE;
 134                 return;
 135         }
 136 
 137         /*
 138          * Do the getattr.
 139          */
 140         va.va_mask = AT_ALL;    /* we want all the attributes */
 141 
 142         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 143 
 144         /* check for overflows */
 145         if (!error) {
 146                 /* Lie about the object type for a referral */
 147                 if (vn_is_nfs_reparse(vp, cr))
 148                         va.va_type = VLNK;
 149 
 150                 acl_perm(vp, exi, &va, cr);
 151                 error = vattr_to_nattr(&va, &ns->ns_attr);
 152         }
 153 
 154         VN_RELE(vp);
 155 
 156         ns->ns_status = puterrno(error);
 157 }
/*
 * Return the file handle from decoded GETATTR arguments; presumably used
 * by the dispatch code to locate the file handle within the request —
 * TODO confirm against the caller.
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
 163 
 164 /*
 165  * Set file attributes.
 166  * Sets the attributes of the file with the given fhandle.  Returns
 167  * the new attributes.
 168  */
 169 /* ARGSUSED */
 170 void
 171 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
 172     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 173 {
 174         int error;
 175         int flag;
 176         int in_crit = 0;
 177         vnode_t *vp;
 178         struct vattr va;
 179         struct vattr bva;
 180         struct flock64 bf;
 181         caller_context_t ct;
 182 
 183 
 184         vp = nfs_fhtovp(&args->saa_fh, exi);
 185         if (vp == NULL) {
 186                 ns->ns_status = NFSERR_STALE;
 187                 return;
 188         }
 189 
 190         if (rdonly(ro, vp)) {
 191                 VN_RELE(vp);
 192                 ns->ns_status = NFSERR_ROFS;
 193                 return;
 194         }
 195 
 196         error = sattr_to_vattr(&args->saa_sa, &va);
 197         if (error) {
 198                 VN_RELE(vp);
 199                 ns->ns_status = puterrno(error);
 200                 return;
 201         }
 202 
 203         /*
 204          * If the client is requesting a change to the mtime,
 205          * but the nanosecond field is set to 1 billion, then
 206          * this is a flag to the server that it should set the
 207          * atime and mtime fields to the server's current time.
 208          * The 1 billion number actually came from the client
 209          * as 1 million, but the units in the over the wire
 210          * request are microseconds instead of nanoseconds.
 211          *
 212          * This is an overload of the protocol and should be
 213          * documented in the NFS Version 2 protocol specification.
 214          */
 215         if (va.va_mask & AT_MTIME) {
 216                 if (va.va_mtime.tv_nsec == 1000000000) {
 217                         gethrestime(&va.va_mtime);
 218                         va.va_atime = va.va_mtime;
 219                         va.va_mask |= AT_ATIME;
 220                         flag = 0;
 221                 } else
 222                         flag = ATTR_UTIME;
 223         } else
 224                 flag = 0;
 225 
 226         /*
 227          * If the filesystem is exported with nosuid, then mask off
 228          * the setuid and setgid bits.
 229          */
 230         if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
 231             (exi->exi_export.ex_flags & EX_NOSUID))
 232                 va.va_mode &= ~(VSUID | VSGID);
 233 
 234         ct.cc_sysid = 0;
 235         ct.cc_pid = 0;
 236         ct.cc_caller_id = nfs2_srv_caller_id;
 237         ct.cc_flags = CC_DONTBLOCK;
 238 
 239         /*
 240          * We need to specially handle size changes because it is
 241          * possible for the client to create a file with modes
 242          * which indicate read-only, but with the file opened for
 243          * writing.  If the client then tries to set the size of
 244          * the file, then the normal access checking done in
 245          * VOP_SETATTR would prevent the client from doing so,
 246          * although it should be legal for it to do so.  To get
 247          * around this, we do the access checking for ourselves
 248          * and then use VOP_SPACE which doesn't do the access
 249          * checking which VOP_SETATTR does. VOP_SPACE can only
 250          * operate on VREG files, let VOP_SETATTR handle the other
 251          * extremely rare cases.
 252          * Also the client should not be allowed to change the
 253          * size of the file if there is a conflicting non-blocking
 254          * mandatory lock in the region of change.
 255          */
 256         if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
 257                 if (nbl_need_check(vp)) {
 258                         nbl_start_crit(vp, RW_READER);
 259                         in_crit = 1;
 260                 }
 261 
 262                 bva.va_mask = AT_UID | AT_SIZE;
 263 
 264                 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
 265 
 266                 if (error) {
 267                         if (in_crit)
 268                                 nbl_end_crit(vp);
 269                         VN_RELE(vp);
 270                         ns->ns_status = puterrno(error);
 271                         return;
 272                 }
 273 
 274                 if (in_crit) {
 275                         u_offset_t offset;
 276                         ssize_t length;
 277 
 278                         if (va.va_size < bva.va_size) {
 279                                 offset = va.va_size;
 280                                 length = bva.va_size - va.va_size;
 281                         } else {
 282                                 offset = bva.va_size;
 283                                 length = va.va_size - bva.va_size;
 284                         }
 285                         if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
 286                             NULL)) {
 287                                 error = EACCES;
 288                         }
 289                 }
 290 
 291                 if (crgetuid(cr) == bva.va_uid && !error &&
 292                     va.va_size != bva.va_size) {
 293                         va.va_mask &= ~AT_SIZE;
 294                         bf.l_type = F_WRLCK;
 295                         bf.l_whence = 0;
 296                         bf.l_start = (off64_t)va.va_size;
 297                         bf.l_len = 0;
 298                         bf.l_sysid = 0;
 299                         bf.l_pid = 0;
 300 
 301                         error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
 302                             (offset_t)va.va_size, cr, &ct);
 303                 }
 304                 if (in_crit)
 305                         nbl_end_crit(vp);
 306         } else
 307                 error = 0;
 308 
 309         /*
 310          * Do the setattr.
 311          */
 312         if (!error && va.va_mask) {
 313                 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
 314         }
 315 
 316         /*
 317          * check if the monitor on either vop_space or vop_setattr detected
 318          * a delegation conflict and if so, mark the thread flag as
 319          * wouldblock so that the response is dropped and the client will
 320          * try again.
 321          */
 322         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 323                 VN_RELE(vp);
 324                 curthread->t_flag |= T_WOULDBLOCK;
 325                 return;
 326         }
 327 
 328         if (!error) {
 329                 va.va_mask = AT_ALL;    /* get everything */
 330 
 331                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 332 
 333                 /* check for overflows */
 334                 if (!error) {
 335                         acl_perm(vp, exi, &va, cr);
 336                         error = vattr_to_nattr(&va, &ns->ns_attr);
 337                 }
 338         }
 339 
 340         ct.cc_flags = 0;
 341 
 342         /*
 343          * Force modified metadata out to stable storage.
 344          */
 345         (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
 346 
 347         VN_RELE(vp);
 348 
 349         ns->ns_status = puterrno(error);
 350 }
/*
 * Return the file handle embedded in decoded SETATTR arguments;
 * presumably used by the dispatch code to locate the file handle in
 * the request — TODO confirm against the caller.
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
 356 
 357 /* Change and release @exip and @vpp only in success */
 358 int
 359 rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
 360 {
 361         struct exportinfo *exi;
 362         vnode_t *vp = *vpp;
 363         fid_t fid;
 364         int error;
 365 
 366         VN_HOLD(vp);
 367 
 368         if ((error = traverse(&vp)) != 0) {
 369                 VN_RELE(vp);
 370                 return (error);
 371         }
 372 
 373         bzero(&fid, sizeof (fid));
 374         fid.fid_len = MAXFIDSZ;
 375         error = VOP_FID(vp, &fid, NULL);
 376         if (error) {
 377                 VN_RELE(vp);
 378                 return (error);
 379         }
 380 
 381         exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
 382         if (exi == NULL ||
 383             (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
 384                 /*
 385                  * It is not error, just subdir is not exported
 386                  * or "nohide" is not set
 387                  */
 388                 if (exi != NULL)
 389                         exi_rele(exi);
 390                 VN_RELE(vp);
 391         } else {
 392                 /* go to submount */
 393                 exi_rele(*exip);
 394                 *exip = exi;
 395 
 396                 VN_RELE(*vpp);
 397                 *vpp = vp;
 398         }
 399 
 400         return (0);
 401 }
 402 
 403 /*
 404  * Given mounted "dvp" and "exi", go upper mountpoint
 405  * with dvp/exi correction
 406  * Return 0 in success
 407  */
 408 int
 409 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
 410 {
 411         struct exportinfo *exi;
 412         vnode_t *dvp = *dvpp;
 413 
 414         ASSERT3P((*exip)->exi_zone, ==, curzone);
 415         ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));
 416 
 417         VN_HOLD(dvp);
 418         dvp = untraverse(dvp);
 419         exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
 420         if (exi == NULL) {
 421                 VN_RELE(dvp);
 422                 return (-1);
 423         }
 424 
 425         ASSERT3P(exi->exi_zone, ==, curzone);
 426         exi_rele(*exip);
 427         *exip = exi;
 428         VN_RELE(*dvpp);
 429         *dvpp = dvp;
 430 
 431         return (0);
 432 }
 433 /*
 434  * Directory lookup.
 435  * Returns an fhandle and file attributes for file name in a directory.
 436  */
 437 /* ARGSUSED */
 438 void
 439 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 440     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 441 {
 442         int error;
 443         vnode_t *dvp;
 444         vnode_t *vp;
 445         struct vattr va;
 446         fhandle_t *fhp = da->da_fhandle;
 447         struct sec_ol sec = {0, 0};
 448         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 449         char *name;
 450         struct sockaddr *ca;
 451 
 452         /*
 453          * Trusted Extension doesn't support NFSv2. MOUNT
 454          * will reject v2 clients. Need to prevent v2 client
 455          * access via WebNFS here.
 456          */
 457         if (is_system_labeled() && req->rq_vers == 2) {
 458                 dr->dr_status = NFSERR_ACCES;
 459                 return;
 460         }
 461 
 462         /*
 463          * Disallow NULL paths
 464          */
 465         if (da->da_name == NULL || *da->da_name == '\0') {
 466                 dr->dr_status = NFSERR_ACCES;
 467                 return;
 468         }
 469 
 470         /*
 471          * Allow lookups from the root - the default
 472          * location of the public filehandle.
 473          */
 474         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 475                 dvp = ZONE_ROOTVP();
 476                 VN_HOLD(dvp);
 477         } else {
 478                 dvp = nfs_fhtovp(fhp, exi);
 479                 if (dvp == NULL) {
 480                         dr->dr_status = NFSERR_STALE;
 481                         return;
 482                 }
 483         }
 484 
 485         exi_hold(exi);
 486         ASSERT3P(exi->exi_zone, ==, curzone);
 487 
 488         /*
 489          * Not allow lookup beyond root.
 490          * If the filehandle matches a filehandle of the exi,
 491          * then the ".." refers beyond the root of an exported filesystem.
 492          */
 493         if (strcmp(da->da_name, "..") == 0 &&
 494             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 495                 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
 496                     ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
 497                         /*
 498                          * special case for ".." and 'nohide'exported root
 499                          */
 500                         if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
 501                                 error = NFSERR_ACCES;
 502                                 goto out;
 503                         }
 504                 } else  {
 505                         error = NFSERR_NOENT;
 506                         goto out;
 507                 }
 508         }
 509 
 510         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 511         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 512             MAXPATHLEN);
 513 
 514         if (name == NULL) {
 515                 error = NFSERR_ACCES;
 516                 goto out;
 517         }
 518 
 519         /*
 520          * If the public filehandle is used then allow
 521          * a multi-component lookup, i.e. evaluate
 522          * a pathname and follow symbolic links if
 523          * necessary.
 524          *
 525          * This may result in a vnode in another filesystem
 526          * which is OK as long as the filesystem is exported.
 527          */
 528         if (PUBLIC_FH2(fhp)) {
 529                 publicfh_flag = TRUE;
 530 
 531                 exi_rele(exi);
 532 
 533                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 534                     &sec);
 535         } else {
 536                 /*
 537                  * Do a normal single component lookup.
 538                  */
 539                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 540                     NULL, NULL, NULL);
 541         }
 542 
 543         if (name != da->da_name)
 544                 kmem_free(name, MAXPATHLEN);
 545 
 546         if (error == 0 && vn_ismntpt(vp)) {
 547                 error = rfs_cross_mnt(&vp, &exi);
 548                 if (error)
 549                         VN_RELE(vp);
 550         }
 551 
 552         if (!error) {
 553                 va.va_mask = AT_ALL;    /* we want everything */
 554 
 555                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 556 
 557                 /* check for overflows */
 558                 if (!error) {
 559                         acl_perm(vp, exi, &va, cr);
 560                         error = vattr_to_nattr(&va, &dr->dr_attr);
 561                         if (!error) {
 562                                 if (sec.sec_flags & SEC_QUERY)
 563                                         error = makefh_ol(&dr->dr_fhandle, exi,
 564                                             sec.sec_index);
 565                                 else {
 566                                         error = makefh(&dr->dr_fhandle, vp,
 567                                             exi);
 568                                         if (!error && publicfh_flag &&
 569                                             !chk_clnt_sec(exi, req))
 570                                                 auth_weak = TRUE;
 571                                 }
 572                         }
 573                 }
 574                 VN_RELE(vp);
 575         }
 576 
 577 out:
 578         VN_RELE(dvp);
 579 
 580         if (exi != NULL)
 581                 exi_rele(exi);
 582 
 583         /*
 584          * If it's public fh, no 0x81, and client's flavor is
 585          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 586          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 587          */
 588         if (auth_weak)
 589                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 590         else
 591                 dr->dr_status = puterrno(error);
 592 }
/*
 * Return the directory file handle from decoded LOOKUP arguments;
 * presumably used by the dispatch code to locate the file handle in
 * the request — TODO confirm against the caller.
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
 598 
 599 /*
 600  * Read symbolic link.
 601  * Returns the string in the symbolic link at the given fhandle.
 602  */
 603 /* ARGSUSED */
 604 void
 605 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
 606     struct svc_req *req, cred_t *cr, bool_t ro)
 607 {
 608         int error;
 609         struct iovec iov;
 610         struct uio uio;
 611         vnode_t *vp;
 612         struct vattr va;
 613         struct sockaddr *ca;
 614         char *name = NULL;
 615         int is_referral = 0;
 616 
 617         vp = nfs_fhtovp(fhp, exi);
 618         if (vp == NULL) {
 619                 rl->rl_data = NULL;
 620                 rl->rl_status = NFSERR_STALE;
 621                 return;
 622         }
 623 
 624         va.va_mask = AT_MODE;
 625 
 626         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
 627 
 628         if (error) {
 629                 VN_RELE(vp);
 630                 rl->rl_data = NULL;
 631                 rl->rl_status = puterrno(error);
 632                 return;
 633         }
 634 
 635         if (MANDLOCK(vp, va.va_mode)) {
 636                 VN_RELE(vp);
 637                 rl->rl_data = NULL;
 638                 rl->rl_status = NFSERR_ACCES;
 639                 return;
 640         }
 641 
 642         /* We lied about the object type for a referral */
 643         if (vn_is_nfs_reparse(vp, cr))
 644                 is_referral = 1;
 645 
 646         /*
 647          * XNFS and RFC1094 require us to return ENXIO if argument
 648          * is not a link. BUGID 1138002.
 649          */
 650         if (vp->v_type != VLNK && !is_referral) {
 651                 VN_RELE(vp);
 652                 rl->rl_data = NULL;
 653                 rl->rl_status = NFSERR_NXIO;
 654                 return;
 655         }
 656 
 657         /*
 658          * Allocate data for pathname.  This will be freed by rfs_rlfree.
 659          */
 660         rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
 661 
 662         if (is_referral) {
 663                 char *s;
 664                 size_t strsz;
 665 
 666                 /* Get an artificial symlink based on a referral */
 667                 s = build_symlink(vp, cr, &strsz);
 668                 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
 669                 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
 670                     vnode_t *, vp, char *, s);
 671                 if (s == NULL)
 672                         error = EINVAL;
 673                 else {
 674                         error = 0;
 675                         (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
 676                         rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
 677                         kmem_free(s, strsz);
 678                 }
 679 
 680         } else {
 681 
 682                 /*
 683                  * Set up io vector to read sym link data
 684                  */
 685                 iov.iov_base = rl->rl_data;
 686                 iov.iov_len = NFS_MAXPATHLEN;
 687                 uio.uio_iov = &iov;
 688                 uio.uio_iovcnt = 1;
 689                 uio.uio_segflg = UIO_SYSSPACE;
 690                 uio.uio_extflg = UIO_COPY_CACHED;
 691                 uio.uio_loffset = (offset_t)0;
 692                 uio.uio_resid = NFS_MAXPATHLEN;
 693 
 694                 /*
 695                  * Do the readlink.
 696                  */
 697                 error = VOP_READLINK(vp, &uio, cr, NULL);
 698 
 699                 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
 700 
 701                 if (!error)
 702                         rl->rl_data[rl->rl_count] = '\0';
 703 
 704         }
 705 
 706 
 707         VN_RELE(vp);
 708 
 709         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 710         name = nfscmd_convname(ca, exi, rl->rl_data,
 711             NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
 712 
 713         if (name != NULL && name != rl->rl_data) {
 714                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 715                 rl->rl_data = name;
 716         }
 717 
 718         /*
 719          * XNFS and RFC1094 require us to return ENXIO if argument
 720          * is not a link. UFS returns EINVAL if this is the case,
 721          * so we do the mapping here. BUGID 1138002.
 722          */
 723         if (error == EINVAL)
 724                 rl->rl_status = NFSERR_NXIO;
 725         else
 726                 rl->rl_status = puterrno(error);
 727 
 728 }
/*
 * Return the file handle from a READLINK request; presumably used by
 * the dispatch code to locate the file handle in the decoded arguments
 * — TODO confirm against the caller.
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
 734 /*
 735  * Free data allocated by rfs_readlink
 736  */
 737 void
 738 rfs_rlfree(struct nfsrdlnres *rl)
 739 {
 740         if (rl->rl_data != NULL)
 741                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 742 }
 743 
 744 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 745 
 746 /*
 747  * Read data.
 748  * Returns some data read from the file at the given fhandle.
 749  */
 750 /* ARGSUSED */
 751 void
 752 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 753     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 754 {
 755         vnode_t *vp;
 756         int error;
 757         struct vattr va;
 758         struct iovec iov;
 759         struct uio uio;
 760         mblk_t *mp;
 761         int alloc_err = 0;
 762         int in_crit = 0;
 763         caller_context_t ct;
 764 
 765         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 766         if (vp == NULL) {
 767                 rr->rr_data = NULL;
 768                 rr->rr_status = NFSERR_STALE;
 769                 return;
 770         }
 771 
 772         if (vp->v_type != VREG) {
 773                 VN_RELE(vp);
 774                 rr->rr_data = NULL;
 775                 rr->rr_status = NFSERR_ISDIR;
 776                 return;
 777         }
 778 
 779         ct.cc_sysid = 0;
 780         ct.cc_pid = 0;
 781         ct.cc_caller_id = nfs2_srv_caller_id;
 782         ct.cc_flags = CC_DONTBLOCK;
 783 
 784         /*
 785          * Enter the critical region before calling VOP_RWLOCK
 786          * to avoid a deadlock with write requests.
 787          */
 788         if (nbl_need_check(vp)) {
 789                 nbl_start_crit(vp, RW_READER);
 790                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 791                     0, NULL)) {
 792                         nbl_end_crit(vp);
 793                         VN_RELE(vp);
 794                         rr->rr_data = NULL;
 795                         rr->rr_status = NFSERR_ACCES;
 796                         return;
 797                 }
 798                 in_crit = 1;
 799         }
 800 
 801         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 802 
 803         /* check if a monitor detected a delegation conflict */
 804         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 805                 if (in_crit)
 806                         nbl_end_crit(vp);
 807                 VN_RELE(vp);
 808                 /* mark as wouldblock so response is dropped */
 809                 curthread->t_flag |= T_WOULDBLOCK;
 810 
 811                 rr->rr_data = NULL;
 812                 return;
 813         }
 814 
 815         va.va_mask = AT_ALL;
 816 
 817         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 818 
 819         if (error) {
 820                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 821                 if (in_crit)
 822                         nbl_end_crit(vp);
 823 
 824                 VN_RELE(vp);
 825                 rr->rr_data = NULL;
 826                 rr->rr_status = puterrno(error);
 827 
 828                 return;
 829         }
 830 
 831         /*
 832          * This is a kludge to allow reading of files created
 833          * with no read permission.  The owner of the file
 834          * is always allowed to read it.
 835          */
 836         if (crgetuid(cr) != va.va_uid) {
 837                 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 838 
 839                 if (error) {
 840                         /*
 841                          * Exec is the same as read over the net because
 842                          * of demand loading.
 843                          */
 844                         error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 845                 }
 846                 if (error) {
 847                         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 848                         if (in_crit)
 849                                 nbl_end_crit(vp);
 850                         VN_RELE(vp);
 851                         rr->rr_data = NULL;
 852                         rr->rr_status = puterrno(error);
 853 
 854                         return;
 855                 }
 856         }
 857 
 858         if (MANDLOCK(vp, va.va_mode)) {
 859                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 860                 if (in_crit)
 861                         nbl_end_crit(vp);
 862 
 863                 VN_RELE(vp);
 864                 rr->rr_data = NULL;
 865                 rr->rr_status = NFSERR_ACCES;
 866 
 867                 return;
 868         }
 869 
 870         rr->rr_ok.rrok_wlist_len = 0;
 871         rr->rr_ok.rrok_wlist = NULL;
 872 
 873         if ((u_offset_t)ra->ra_offset >= va.va_size) {
 874                 rr->rr_count = 0;
 875                 rr->rr_data = NULL;
 876                 /*
 877                  * In this case, status is NFS_OK, but there is no data
 878                  * to encode. So set rr_mp to NULL.
 879                  */
 880                 rr->rr_mp = NULL;
 881                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 882                 if (rr->rr_ok.rrok_wlist)
 883                         clist_zero_len(rr->rr_ok.rrok_wlist);
 884                 goto done;
 885         }
 886 
 887         if (ra->ra_wlist) {
 888                 mp = NULL;
 889                 rr->rr_mp = NULL;
 890                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 891                 if (ra->ra_count > iov.iov_len) {
 892                         rr->rr_data = NULL;
 893                         rr->rr_status = NFSERR_INVAL;
 894                         goto done;
 895                 }
 896         } else {
 897                 /*
 898                  * mp will contain the data to be sent out in the read reply.
 899                  * This will be freed after the reply has been sent out (by the
 900                  * driver).
 901                  * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
 902                  * that the call to xdrmblk_putmblk() never fails.
 903                  */
 904                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 905                     &alloc_err);
 906                 ASSERT(mp != NULL);
 907                 ASSERT(alloc_err == 0);
 908 
 909                 rr->rr_mp = mp;
 910 
 911                 /*
 912                  * Set up io vector
 913                  */
 914                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 915                 iov.iov_len = ra->ra_count;
 916         }
 917 
 918         uio.uio_iov = &iov;
 919         uio.uio_iovcnt = 1;
 920         uio.uio_segflg = UIO_SYSSPACE;
 921         uio.uio_extflg = UIO_COPY_CACHED;
 922         uio.uio_loffset = (offset_t)ra->ra_offset;
 923         uio.uio_resid = ra->ra_count;
 924 
 925         error = VOP_READ(vp, &uio, 0, cr, &ct);
 926 
 927         if (error) {
 928                 if (mp)
 929                         freeb(mp);
 930 
 931                 /*
 932                  * check if a monitor detected a delegation conflict and
 933                  * mark as wouldblock so response is dropped
 934                  */
 935                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 936                         curthread->t_flag |= T_WOULDBLOCK;
 937                 else
 938                         rr->rr_status = puterrno(error);
 939 
 940                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 941                 if (in_crit)
 942                         nbl_end_crit(vp);
 943 
 944                 VN_RELE(vp);
 945                 rr->rr_data = NULL;
 946 
 947                 return;
 948         }
 949 
 950         /*
 951          * Get attributes again so we can send the latest access
 952          * time to the client side for its cache.
 953          */
 954         va.va_mask = AT_ALL;
 955 
 956         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 957 
 958         if (error) {
 959                 if (mp)
 960                         freeb(mp);
 961 
 962                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 963                 if (in_crit)
 964                         nbl_end_crit(vp);
 965 
 966                 VN_RELE(vp);
 967                 rr->rr_data = NULL;
 968                 rr->rr_status = puterrno(error);
 969 
 970                 return;
 971         }
 972 
 973         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 974 
 975         if (mp) {
 976                 rr->rr_data = (char *)mp->b_datap->db_base;
 977         } else {
 978                 if (ra->ra_wlist) {
 979                         rr->rr_data = (caddr_t)iov.iov_base;
 980                         if (!rdma_setup_read_data2(ra, rr)) {
 981                                 rr->rr_data = NULL;
 982                                 rr->rr_status = puterrno(NFSERR_INVAL);
 983                         }
 984                 }
 985         }
 986 done:
 987         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 988         if (in_crit)
 989                 nbl_end_crit(vp);
 990 
 991         acl_perm(vp, exi, &va, cr);
 992 
 993         /* check for overflows */
 994         error = vattr_to_nattr(&va, &rr->rr_attr);
 995 
 996         VN_RELE(vp);
 997 
 998         rr->rr_status = puterrno(error);
 999 }
1000 
1001 /*
1002  * Free data allocated by rfs_read
1003  */
1004 void
1005 rfs_rdfree(struct nfsrdresult *rr)
1006 {
1007         mblk_t *mp;
1008 
1009         if (rr->rr_status == NFS_OK) {
1010                 mp = rr->rr_mp;
1011                 if (mp != NULL)
1012                         freeb(mp);
1013         }
1014 }
1015 
1016 void *
1017 rfs_read_getfh(struct nfsreadargs *ra)
1018 {
1019         return (&ra->ra_fhandle);
1020 }
1021 
/*
 * Number of iovec entries kept on the stack in rfs_write_sync();
 * longer mblk chains fall back to a kmem_alloc()'d iovec array.
 */
#define MAX_IOVECS      12

#ifdef DEBUG
/* Counters: how often the stack iovec array sufficed (hits) or not. */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1028 
1029 /*
1030  * Write data to file.
1031  * Returns attributes of a file after writing some data to it.
1032  *
1033  * Any changes made here, especially in error handling might have
1034  * to also be done in rfs_write (which clusters write requests).
1035  */
1036 /* ARGSUSED */
1037 void
1038 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1039     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1040 {
1041         int error;
1042         vnode_t *vp;
1043         rlim64_t rlimit;
1044         struct vattr va;
1045         struct uio uio;
1046         struct iovec iov[MAX_IOVECS];
1047         mblk_t *m;
1048         struct iovec *iovp;
1049         int iovcnt;
1050         cred_t *savecred;
1051         int in_crit = 0;
1052         caller_context_t ct;
1053 
1054         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1055         if (vp == NULL) {
1056                 ns->ns_status = NFSERR_STALE;
1057                 return;
1058         }
1059 
1060         if (rdonly(ro, vp)) {
1061                 VN_RELE(vp);
1062                 ns->ns_status = NFSERR_ROFS;
1063                 return;
1064         }
1065 
1066         if (vp->v_type != VREG) {
1067                 VN_RELE(vp);
1068                 ns->ns_status = NFSERR_ISDIR;
1069                 return;
1070         }
1071 
1072         ct.cc_sysid = 0;
1073         ct.cc_pid = 0;
1074         ct.cc_caller_id = nfs2_srv_caller_id;
1075         ct.cc_flags = CC_DONTBLOCK;
1076 
1077         va.va_mask = AT_UID|AT_MODE;
1078 
1079         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1080 
1081         if (error) {
1082                 VN_RELE(vp);
1083                 ns->ns_status = puterrno(error);
1084 
1085                 return;
1086         }
1087 
1088         if (crgetuid(cr) != va.va_uid) {
1089                 /*
1090                  * This is a kludge to allow writes of files created
1091                  * with read only permission.  The owner of the file
1092                  * is always allowed to write it.
1093                  */
1094                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1095 
1096                 if (error) {
1097                         VN_RELE(vp);
1098                         ns->ns_status = puterrno(error);
1099                         return;
1100                 }
1101         }
1102 
1103         /*
1104          * Can't access a mandatory lock file.  This might cause
1105          * the NFS service thread to block forever waiting for a
1106          * lock to be released that will never be released.
1107          */
1108         if (MANDLOCK(vp, va.va_mode)) {
1109                 VN_RELE(vp);
1110                 ns->ns_status = NFSERR_ACCES;
1111                 return;
1112         }
1113 
1114         /*
1115          * We have to enter the critical region before calling VOP_RWLOCK
1116          * to avoid a deadlock with ufs.
1117          */
1118         if (nbl_need_check(vp)) {
1119                 nbl_start_crit(vp, RW_READER);
1120                 in_crit = 1;
1121                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1122                     wa->wa_count, 0, NULL)) {
1123                         error = EACCES;
1124                         goto out;
1125                 }
1126         }
1127 
1128         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1129 
1130         /* check if a monitor detected a delegation conflict */
1131         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1132                 goto out;
1133         }
1134 
1135         if (wa->wa_data || wa->wa_rlist) {
1136                 /* Do the RDMA thing if necessary */
1137                 if (wa->wa_rlist) {
1138                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1139                         iov[0].iov_len = wa->wa_count;
1140                 } else  {
1141                         iov[0].iov_base = wa->wa_data;
1142                         iov[0].iov_len = wa->wa_count;
1143                 }
1144                 uio.uio_iov = iov;
1145                 uio.uio_iovcnt = 1;
1146                 uio.uio_segflg = UIO_SYSSPACE;
1147                 uio.uio_extflg = UIO_COPY_DEFAULT;
1148                 uio.uio_loffset = (offset_t)wa->wa_offset;
1149                 uio.uio_resid = wa->wa_count;
1150                 /*
1151                  * The limit is checked on the client. We
1152                  * should allow any size writes here.
1153                  */
1154                 uio.uio_llimit = curproc->p_fsz_ctl;
1155                 rlimit = uio.uio_llimit - wa->wa_offset;
1156                 if (rlimit < (rlim64_t)uio.uio_resid)
1157                         uio.uio_resid = (uint_t)rlimit;
1158 
1159                 /*
1160                  * for now we assume no append mode
1161                  */
1162                 /*
1163                  * We're changing creds because VM may fault and we need
1164                  * the cred of the current thread to be used if quota
1165                  * checking is enabled.
1166                  */
1167                 savecred = curthread->t_cred;
1168                 curthread->t_cred = cr;
1169                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1170                 curthread->t_cred = savecred;
1171         } else {
1172 
1173                 iovcnt = 0;
1174                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1175                         iovcnt++;
1176                 if (iovcnt <= MAX_IOVECS) {
1177 #ifdef DEBUG
1178                         rfs_write_sync_hits++;
1179 #endif
1180                         iovp = iov;
1181                 } else {
1182 #ifdef DEBUG
1183                         rfs_write_sync_misses++;
1184 #endif
1185                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1186                 }
1187                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1188                 uio.uio_iov = iovp;
1189                 uio.uio_iovcnt = iovcnt;
1190                 uio.uio_segflg = UIO_SYSSPACE;
1191                 uio.uio_extflg = UIO_COPY_DEFAULT;
1192                 uio.uio_loffset = (offset_t)wa->wa_offset;
1193                 uio.uio_resid = wa->wa_count;
1194                 /*
1195                  * The limit is checked on the client. We
1196                  * should allow any size writes here.
1197                  */
1198                 uio.uio_llimit = curproc->p_fsz_ctl;
1199                 rlimit = uio.uio_llimit - wa->wa_offset;
1200                 if (rlimit < (rlim64_t)uio.uio_resid)
1201                         uio.uio_resid = (uint_t)rlimit;
1202 
1203                 /*
1204                  * For now we assume no append mode.
1205                  */
1206                 /*
1207                  * We're changing creds because VM may fault and we need
1208                  * the cred of the current thread to be used if quota
1209                  * checking is enabled.
1210                  */
1211                 savecred = curthread->t_cred;
1212                 curthread->t_cred = cr;
1213                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1214                 curthread->t_cred = savecred;
1215 
1216                 if (iovp != iov)
1217                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1218         }
1219 
1220         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1221 
1222         if (!error) {
1223                 /*
1224                  * Get attributes again so we send the latest mod
1225                  * time to the client side for its cache.
1226                  */
1227                 va.va_mask = AT_ALL;    /* now we want everything */
1228 
1229                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1230 
1231                 /* check for overflows */
1232                 if (!error) {
1233                         acl_perm(vp, exi, &va, cr);
1234                         error = vattr_to_nattr(&va, &ns->ns_attr);
1235                 }
1236         }
1237 
1238 out:
1239         if (in_crit)
1240                 nbl_end_crit(vp);
1241         VN_RELE(vp);
1242 
1243         /* check if a monitor detected a delegation conflict */
1244         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1245                 /* mark as wouldblock so response is dropped */
1246                 curthread->t_flag |= T_WOULDBLOCK;
1247         else
1248                 ns->ns_status = puterrno(error);
1249 
1250 }
1251 
/*
 * One pending NFSv2 WRITE request queued on a write cluster.
 * The issuing service thread sleeps on the cluster's cv until
 * ns->ns_status is changed from RFSWRITE_INITVAL by the thread
 * that processes the cluster (see rfs_write() below).
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* client's write arguments */
	struct nfsattrstat *ns;		/* reply; status doubles as done flag */
	struct svc_req *req;		/* originating RPC request */
	cred_t *cr;			/* credentials of the requester */
	bool_t ro;			/* read-only flag passed to rdonly() */
	kthread_t *thread;		/* thread waiting on this request */
	struct rfs_async_write *list;	/* next request in the cluster */
};
1261 
/*
 * A cluster of WRITE requests to the same file (matched by file handle).
 * Clusters live on the per-zone list headed by the nfs_srv_t's
 * async_write_head while they are still accepting new members.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by all members */
	kcondvar_t cv;			/* members wait here until processed */
	struct rfs_async_write *list;	/* members, kept sorted by wa_offset */
	struct rfs_async_write_list *next;	/* next cluster on the list */
};
1268 
/*
 * NOTE(review): these three file-scope variables appear to be superseded
 * by the per-zone state in nfs_srv_t -- rfs_write() below uses
 * nsrv->async_write_head, nsrv->async_write_lock and nsrv->write_async
 * instead.  Confirm they are unreferenced elsewhere before removing.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1; /* enables write clustering if == 1 */

/* Stack iovec capacity in rfs_write(); larger clusters kmem_alloc() one. */
#define MAXCLIOVECS     42
/* "Not yet processed" sentinel for ns_status; 0 would read as NFS_OK. */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* Counters: stack iovec array sufficed (hits) or had to allocate (misses). */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1280 
1281 /*
1282  * Write data to file.
1283  * Returns attributes of a file after writing some data to it.
1284  */
1285 void
1286 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1287     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1288 {
1289         int error;
1290         vnode_t *vp;
1291         rlim64_t rlimit;
1292         struct vattr va;
1293         struct uio uio;
1294         struct rfs_async_write_list *lp;
1295         struct rfs_async_write_list *nlp;
1296         struct rfs_async_write *rp;
1297         struct rfs_async_write *nrp;
1298         struct rfs_async_write *trp;
1299         struct rfs_async_write *lrp;
1300         int data_written;
1301         int iovcnt;
1302         mblk_t *m;
1303         struct iovec *iovp;
1304         struct iovec *niovp;
1305         struct iovec iov[MAXCLIOVECS];
1306         int count;
1307         int rcount;
1308         uint_t off;
1309         uint_t len;
1310         struct rfs_async_write nrpsp;
1311         struct rfs_async_write_list nlpsp;
1312         ushort_t t_flag;
1313         cred_t *savecred;
1314         int in_crit = 0;
1315         caller_context_t ct;
1316         nfs_srv_t *nsrv;
1317 
1318         ASSERT3P(curzone, ==, ((exi == NULL) ? curzone : exi->exi_zone));
1319         nsrv = zone_getspecific(rfs_zone_key, curzone);
1320         if (!nsrv->write_async) {
1321                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1322                 return;
1323         }
1324 
1325         /*
1326          * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1327          * is considered an OK.
1328          */
1329         ns->ns_status = RFSWRITE_INITVAL;
1330 
1331         nrp = &nrpsp;
1332         nrp->wa = wa;
1333         nrp->ns = ns;
1334         nrp->req = req;
1335         nrp->cr = cr;
1336         nrp->ro = ro;
1337         nrp->thread = curthread;
1338 
1339         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1340 
1341         /*
1342          * Look to see if there is already a cluster started
1343          * for this file.
1344          */
1345         mutex_enter(&nsrv->async_write_lock);
1346         for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1347                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1348                     sizeof (fhandle_t)) == 0)
1349                         break;
1350         }
1351 
1352         /*
1353          * If lp is non-NULL, then there is already a cluster
1354          * started.  We need to place ourselves in the cluster
1355          * list in the right place as determined by starting
1356          * offset.  Conflicts with non-blocking mandatory locked
1357          * regions will be checked when the cluster is processed.
1358          */
1359         if (lp != NULL) {
1360                 rp = lp->list;
1361                 trp = NULL;
1362                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1363                         trp = rp;
1364                         rp = rp->list;
1365                 }
1366                 nrp->list = rp;
1367                 if (trp == NULL)
1368                         lp->list = nrp;
1369                 else
1370                         trp->list = nrp;
1371                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1372                         cv_wait(&lp->cv, &nsrv->async_write_lock);
1373                 mutex_exit(&nsrv->async_write_lock);
1374 
1375                 return;
1376         }
1377 
1378         /*
1379          * No cluster started yet, start one and add ourselves
1380          * to the list of clusters.
1381          */
1382         nrp->list = NULL;
1383 
1384         nlp = &nlpsp;
1385         nlp->fhp = &wa->wa_fhandle;
1386         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1387         nlp->list = nrp;
1388         nlp->next = NULL;
1389 
1390         if (nsrv->async_write_head == NULL) {
1391                 nsrv->async_write_head = nlp;
1392         } else {
1393                 lp = nsrv->async_write_head;
1394                 while (lp->next != NULL)
1395                         lp = lp->next;
1396                 lp->next = nlp;
1397         }
1398         mutex_exit(&nsrv->async_write_lock);
1399 
1400         /*
1401          * Convert the file handle common to all of the requests
1402          * in this cluster to a vnode.
1403          */
1404         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1405         if (vp == NULL) {
1406                 mutex_enter(&nsrv->async_write_lock);
1407                 if (nsrv->async_write_head == nlp)
1408                         nsrv->async_write_head = nlp->next;
1409                 else {
1410                         lp = nsrv->async_write_head;
1411                         while (lp->next != nlp)
1412                                 lp = lp->next;
1413                         lp->next = nlp->next;
1414                 }
1415                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1416                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1417                         rp->ns->ns_status = NFSERR_STALE;
1418                         rp->thread->t_flag |= t_flag;
1419                 }
1420                 cv_broadcast(&nlp->cv);
1421                 mutex_exit(&nsrv->async_write_lock);
1422 
1423                 return;
1424         }
1425 
1426         /*
1427          * Can only write regular files.  Attempts to write any
1428          * other file types fail with EISDIR.
1429          */
1430         if (vp->v_type != VREG) {
1431                 VN_RELE(vp);
1432                 mutex_enter(&nsrv->async_write_lock);
1433                 if (nsrv->async_write_head == nlp)
1434                         nsrv->async_write_head = nlp->next;
1435                 else {
1436                         lp = nsrv->async_write_head;
1437                         while (lp->next != nlp)
1438                                 lp = lp->next;
1439                         lp->next = nlp->next;
1440                 }
1441                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1442                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1443                         rp->ns->ns_status = NFSERR_ISDIR;
1444                         rp->thread->t_flag |= t_flag;
1445                 }
1446                 cv_broadcast(&nlp->cv);
1447                 mutex_exit(&nsrv->async_write_lock);
1448 
1449                 return;
1450         }
1451 
1452         /*
1453          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1454          * deadlock with ufs.
1455          */
1456         if (nbl_need_check(vp)) {
1457                 nbl_start_crit(vp, RW_READER);
1458                 in_crit = 1;
1459         }
1460 
1461         ct.cc_sysid = 0;
1462         ct.cc_pid = 0;
1463         ct.cc_caller_id = nfs2_srv_caller_id;
1464         ct.cc_flags = CC_DONTBLOCK;
1465 
1466         /*
1467          * Lock the file for writing.  This operation provides
1468          * the delay which allows clusters to grow.
1469          */
1470         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1471 
1472         /* check if a monitor detected a delegation conflict */
1473         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1474                 if (in_crit)
1475                         nbl_end_crit(vp);
1476                 VN_RELE(vp);
1477                 /* mark as wouldblock so response is dropped */
1478                 curthread->t_flag |= T_WOULDBLOCK;
1479                 mutex_enter(&nsrv->async_write_lock);
1480                 if (nsrv->async_write_head == nlp)
1481                         nsrv->async_write_head = nlp->next;
1482                 else {
1483                         lp = nsrv->async_write_head;
1484                         while (lp->next != nlp)
1485                                 lp = lp->next;
1486                         lp->next = nlp->next;
1487                 }
1488                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1489                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1490                                 rp->ns->ns_status = puterrno(error);
1491                                 rp->thread->t_flag |= T_WOULDBLOCK;
1492                         }
1493                 }
1494                 cv_broadcast(&nlp->cv);
1495                 mutex_exit(&nsrv->async_write_lock);
1496 
1497                 return;
1498         }
1499 
1500         /*
1501          * Disconnect this cluster from the list of clusters.
1502          * The cluster that is being dealt with must be fixed
1503          * in size after this point, so there is no reason
1504          * to leave it on the list so that new requests can
1505          * find it.
1506          *
1507          * The algorithm is that the first write request will
1508          * create a cluster, convert the file handle to a
1509          * vnode pointer, and then lock the file for writing.
1510          * This request is not likely to be clustered with
1511          * any others.  However, the next request will create
1512          * a new cluster and be blocked in VOP_RWLOCK while
1513          * the first request is being processed.  This delay
1514          * will allow more requests to be clustered in this
1515          * second cluster.
1516          */
1517         mutex_enter(&nsrv->async_write_lock);
1518         if (nsrv->async_write_head == nlp)
1519                 nsrv->async_write_head = nlp->next;
1520         else {
1521                 lp = nsrv->async_write_head;
1522                 while (lp->next != nlp)
1523                         lp = lp->next;
1524                 lp->next = nlp->next;
1525         }
1526         mutex_exit(&nsrv->async_write_lock);
1527 
1528         /*
1529          * Step through the list of requests in this cluster.
1530          * We need to check permissions to make sure that all
1531          * of the requests have sufficient permission to write
1532          * the file.  A cluster can be composed of requests
1533          * from different clients and different users on each
1534          * client.
1535          *
1536          * As a side effect, we also calculate the size of the
1537          * byte range that this cluster encompasses.
1538          */
1539         rp = nlp->list;
1540         off = rp->wa->wa_offset;
1541         len = (uint_t)0;
1542         do {
1543                 if (rdonly(rp->ro, vp)) {
1544                         rp->ns->ns_status = NFSERR_ROFS;
1545                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1546                         rp->thread->t_flag |= t_flag;
1547                         continue;
1548                 }
1549 
1550                 va.va_mask = AT_UID|AT_MODE;
1551 
1552                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1553 
1554                 if (!error) {
1555                         if (crgetuid(rp->cr) != va.va_uid) {
1556                                 /*
1557                                  * This is a kludge to allow writes of files
1558                                  * created with read only permission.  The
1559                                  * owner of the file is always allowed to
1560                                  * write it.
1561                                  */
1562                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1563                         }
1564                         if (!error && MANDLOCK(vp, va.va_mode))
1565                                 error = EACCES;
1566                 }
1567 
1568                 /*
1569                  * Check for a conflict with a nbmand-locked region.
1570                  */
1571                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1572                     rp->wa->wa_count, 0, NULL)) {
1573                         error = EACCES;
1574                 }
1575 
1576                 if (error) {
1577                         rp->ns->ns_status = puterrno(error);
1578                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1579                         rp->thread->t_flag |= t_flag;
1580                         continue;
1581                 }
1582                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1583                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1584         } while ((rp = rp->list) != NULL);
1585 
1586         /*
1587          * Step through the cluster attempting to gather as many
1588          * requests which are contiguous as possible.  These
1589          * contiguous requests are handled via one call to VOP_WRITE
1590          * instead of different calls to VOP_WRITE.  We also keep
1591          * track of the fact that any data was written.
1592          */
1593         rp = nlp->list;
1594         data_written = 0;
1595         do {
1596                 /*
1597                  * Skip any requests which are already marked as having an
1598                  * error.
1599                  */
1600                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1601                         rp = rp->list;
1602                         continue;
1603                 }
1604 
1605                 /*
1606                  * Count the number of iovec's which are required
1607                  * to handle this set of requests.  One iovec is
1608                  * needed for each data buffer, whether addressed
1609                  * by wa_data or by the b_rptr pointers in the
1610                  * mblk chains.
1611                  */
1612                 iovcnt = 0;
1613                 lrp = rp;
1614                 for (;;) {
1615                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1616                                 iovcnt++;
1617                         else {
1618                                 m = lrp->wa->wa_mblk;
1619                                 while (m != NULL) {
1620                                         iovcnt++;
1621                                         m = m->b_cont;
1622                                 }
1623                         }
1624                         if (lrp->list == NULL ||
1625                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1626                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1627                             lrp->list->wa->wa_offset) {
1628                                 lrp = lrp->list;
1629                                 break;
1630                         }
1631                         lrp = lrp->list;
1632                 }
1633 
1634                 if (iovcnt <= MAXCLIOVECS) {
1635 #ifdef DEBUG
1636                         rfs_write_hits++;
1637 #endif
1638                         niovp = iov;
1639                 } else {
1640 #ifdef DEBUG
1641                         rfs_write_misses++;
1642 #endif
1643                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1644                 }
1645                 /*
1646                  * Put together the scatter/gather iovecs.
1647                  */
1648                 iovp = niovp;
1649                 trp = rp;
1650                 count = 0;
1651                 do {
1652                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1653                                 if (trp->wa->wa_rlist) {
1654                                         iovp->iov_base =
1655                                             (char *)((trp->wa->wa_rlist)->
1656                                             u.c_daddr3);
1657                                         iovp->iov_len = trp->wa->wa_count;
1658                                 } else  {
1659                                         iovp->iov_base = trp->wa->wa_data;
1660                                         iovp->iov_len = trp->wa->wa_count;
1661                                 }
1662                                 iovp++;
1663                         } else {
1664                                 m = trp->wa->wa_mblk;
1665                                 rcount = trp->wa->wa_count;
1666                                 while (m != NULL) {
1667                                         iovp->iov_base = (caddr_t)m->b_rptr;
1668                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1669                                         rcount -= iovp->iov_len;
1670                                         if (rcount < 0)
1671                                                 iovp->iov_len += rcount;
1672                                         iovp++;
1673                                         if (rcount <= 0)
1674                                                 break;
1675                                         m = m->b_cont;
1676                                 }
1677                         }
1678                         count += trp->wa->wa_count;
1679                         trp = trp->list;
1680                 } while (trp != lrp);
1681 
1682                 uio.uio_iov = niovp;
1683                 uio.uio_iovcnt = iovcnt;
1684                 uio.uio_segflg = UIO_SYSSPACE;
1685                 uio.uio_extflg = UIO_COPY_DEFAULT;
1686                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1687                 uio.uio_resid = count;
1688                 /*
1689                  * The limit is checked on the client. We
1690                  * should allow any size writes here.
1691                  */
1692                 uio.uio_llimit = curproc->p_fsz_ctl;
1693                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1694                 if (rlimit < (rlim64_t)uio.uio_resid)
1695                         uio.uio_resid = (uint_t)rlimit;
1696 
1697                 /*
1698                  * For now we assume no append mode.
1699                  */
1700 
1701                 /*
1702                  * We're changing creds because VM may fault
1703                  * and we need the cred of the current
1704                  * thread to be used if quota * checking is
1705                  * enabled.
1706                  */
1707                 savecred = curthread->t_cred;
1708                 curthread->t_cred = cr;
1709                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1710                 curthread->t_cred = savecred;
1711 
1712                 /* check if a monitor detected a delegation conflict */
1713                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1714                         /* mark as wouldblock so response is dropped */
1715                         curthread->t_flag |= T_WOULDBLOCK;
1716 
1717                 if (niovp != iov)
1718                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1719 
1720                 if (!error) {
1721                         data_written = 1;
1722                         /*
1723                          * Get attributes again so we send the latest mod
1724                          * time to the client side for its cache.
1725                          */
1726                         va.va_mask = AT_ALL;    /* now we want everything */
1727 
1728                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1729 
1730                         if (!error)
1731                                 acl_perm(vp, exi, &va, rp->cr);
1732                 }
1733 
1734                 /*
1735                  * Fill in the status responses for each request
1736                  * which was just handled.  Also, copy the latest
1737                  * attributes in to the attribute responses if
1738                  * appropriate.
1739                  */
1740                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1741                 do {
1742                         rp->thread->t_flag |= t_flag;
1743                         /* check for overflows */
1744                         if (!error) {
1745                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1746                         }
1747                         rp->ns->ns_status = puterrno(error);
1748                         rp = rp->list;
1749                 } while (rp != lrp);
1750         } while (rp != NULL);
1751 
1752         /*
1753          * If any data was written at all, then we need to flush
1754          * the data and metadata to stable storage.
1755          */
1756         if (data_written) {
1757                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1758 
1759                 if (!error) {
1760                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1761                 }
1762         }
1763 
1764         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1765 
1766         if (in_crit)
1767                 nbl_end_crit(vp);
1768         VN_RELE(vp);
1769 
1770         t_flag = curthread->t_flag & T_WOULDBLOCK;
1771         mutex_enter(&nsrv->async_write_lock);
1772         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1773                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1774                         rp->ns->ns_status = puterrno(error);
1775                         rp->thread->t_flag |= t_flag;
1776                 }
1777         }
1778         cv_broadcast(&nlp->cv);
1779         mutex_exit(&nsrv->async_write_lock);
1780 
1781 }
1782 
1783 void *
1784 rfs_write_getfh(struct nfswriteargs *wa)
1785 {
1786         return (&wa->wa_fhandle);
1787 }
1788 
1789 /*
1790  * Create a file.
1791  * Creates a file with given attributes and returns those attributes
1792  * and an fhandle for the new file.
1793  */
1794 void
1795 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1796     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1797 {
1798         int error;
1799         int lookuperr;
1800         int in_crit = 0;
1801         struct vattr va;
1802         vnode_t *vp;
1803         vnode_t *realvp;
1804         vnode_t *dvp;
1805         char *name = args->ca_da.da_name;
1806         vnode_t *tvp = NULL;
1807         int mode;
1808         int lookup_ok;
1809         bool_t trunc;
1810         struct sockaddr *ca;
1811 
1812         /*
1813          * Disallow NULL paths
1814          */
1815         if (name == NULL || *name == '\0') {
1816                 dr->dr_status = NFSERR_ACCES;
1817                 return;
1818         }
1819 
1820         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1821         if (dvp == NULL) {
1822                 dr->dr_status = NFSERR_STALE;
1823                 return;
1824         }
1825 
1826         error = sattr_to_vattr(args->ca_sa, &va);
1827         if (error) {
1828                 dr->dr_status = puterrno(error);
1829                 return;
1830         }
1831 
1832         /*
1833          * Must specify the mode.
1834          */
1835         if (!(va.va_mask & AT_MODE)) {
1836                 VN_RELE(dvp);
1837                 dr->dr_status = NFSERR_INVAL;
1838                 return;
1839         }
1840 
1841         /*
1842          * This is a completely gross hack to make mknod
1843          * work over the wire until we can wack the protocol
1844          */
1845         if ((va.va_mode & IFMT) == IFCHR) {
1846                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1847                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1848                 else {
1849                         va.va_type = VCHR;
1850                         /*
1851                          * uncompress the received dev_t
1852                          * if the top half is zero indicating a request
1853                          * from an `older style' OS.
1854                          */
1855                         if ((va.va_size & 0xffff0000) == 0)
1856                                 va.va_rdev = nfsv2_expdev(va.va_size);
1857                         else
1858                                 va.va_rdev = (dev_t)va.va_size;
1859                 }
1860                 va.va_mask &= ~AT_SIZE;
1861         } else if ((va.va_mode & IFMT) == IFBLK) {
1862                 va.va_type = VBLK;
1863                 /*
1864                  * uncompress the received dev_t
1865                  * if the top half is zero indicating a request
1866                  * from an `older style' OS.
1867                  */
1868                 if ((va.va_size & 0xffff0000) == 0)
1869                         va.va_rdev = nfsv2_expdev(va.va_size);
1870                 else
1871                         va.va_rdev = (dev_t)va.va_size;
1872                 va.va_mask &= ~AT_SIZE;
1873         } else if ((va.va_mode & IFMT) == IFSOCK) {
1874                 va.va_type = VSOCK;
1875         } else {
1876                 va.va_type = VREG;
1877         }
1878         va.va_mode &= ~IFMT;
1879         va.va_mask |= AT_TYPE;
1880 
1881         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1882         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1883             MAXPATHLEN);
1884         if (name == NULL) {
1885                 dr->dr_status = puterrno(EINVAL);
1886                 return;
1887         }
1888 
1889         /*
1890          * Why was the choice made to use VWRITE as the mode to the
1891          * call to VOP_CREATE ? This results in a bug.  When a client
1892          * opens a file that already exists and is RDONLY, the second
1893          * open fails with an EACESS because of the mode.
1894          * bug ID 1054648.
1895          */
1896         lookup_ok = 0;
1897         mode = VWRITE;
1898         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1899                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1900                     NULL, NULL, NULL);
1901                 if (!error) {
1902                         struct vattr at;
1903 
1904                         lookup_ok = 1;
1905                         at.va_mask = AT_MODE;
1906                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1907                         if (!error)
1908                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1909                         VN_RELE(tvp);
1910                         tvp = NULL;
1911                 }
1912         }
1913 
1914         if (!lookup_ok) {
1915                 if (rdonly(ro, dvp)) {
1916                         error = EROFS;
1917                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1918                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1919                         error = EPERM;
1920                 } else {
1921                         error = 0;
1922                 }
1923         }
1924 
1925         /*
1926          * If file size is being modified on an already existing file
1927          * make sure that there are no conflicting non-blocking mandatory
1928          * locks in the region being manipulated. Return EACCES if there
1929          * are conflicting locks.
1930          */
1931         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1932                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1933                     NULL, NULL, NULL);
1934 
1935                 if (!lookuperr &&
1936                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1937                         VN_RELE(tvp);
1938                         curthread->t_flag |= T_WOULDBLOCK;
1939                         goto out;
1940                 }
1941 
1942                 if (!lookuperr && nbl_need_check(tvp)) {
1943                         /*
1944                          * The file exists. Now check if it has any
1945                          * conflicting non-blocking mandatory locks
1946                          * in the region being changed.
1947                          */
1948                         struct vattr bva;
1949                         u_offset_t offset;
1950                         ssize_t length;
1951 
1952                         nbl_start_crit(tvp, RW_READER);
1953                         in_crit = 1;
1954 
1955                         bva.va_mask = AT_SIZE;
1956                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1957                         if (!error) {
1958                                 if (va.va_size < bva.va_size) {
1959                                         offset = va.va_size;
1960                                         length = bva.va_size - va.va_size;
1961                                 } else {
1962                                         offset = bva.va_size;
1963                                         length = va.va_size - bva.va_size;
1964                                 }
1965                                 if (length) {
1966                                         if (nbl_conflict(tvp, NBL_WRITE,
1967                                             offset, length, 0, NULL)) {
1968                                                 error = EACCES;
1969                                         }
1970                                 }
1971                         }
1972                         if (error) {
1973                                 nbl_end_crit(tvp);
1974                                 VN_RELE(tvp);
1975                                 in_crit = 0;
1976                         }
1977                 } else if (tvp != NULL) {
1978                         VN_RELE(tvp);
1979                 }
1980         }
1981 
1982         if (!error) {
1983                 /*
1984                  * If filesystem is shared with nosuid the remove any
1985                  * setuid/setgid bits on create.
1986                  */
1987                 if (va.va_type == VREG &&
1988                     exi->exi_export.ex_flags & EX_NOSUID)
1989                         va.va_mode &= ~(VSUID | VSGID);
1990 
1991                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1992                     NULL, NULL);
1993 
1994                 if (!error) {
1995 
1996                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1997                                 trunc = TRUE;
1998                         else
1999                                 trunc = FALSE;
2000 
2001                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2002                                 VN_RELE(vp);
2003                                 curthread->t_flag |= T_WOULDBLOCK;
2004                                 goto out;
2005                         }
2006                         va.va_mask = AT_ALL;
2007 
2008                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2009 
2010                         /* check for overflows */
2011                         if (!error) {
2012                                 acl_perm(vp, exi, &va, cr);
2013                                 error = vattr_to_nattr(&va, &dr->dr_attr);
2014                                 if (!error) {
2015                                         error = makefh(&dr->dr_fhandle, vp,
2016                                             exi);
2017                                 }
2018                         }
2019                         /*
2020                          * Force modified metadata out to stable storage.
2021                          *
2022                          * if a underlying vp exists, pass it to VOP_FSYNC
2023                          */
2024                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
2025                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2026                         else
2027                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2028                         VN_RELE(vp);
2029                 }
2030 
2031                 if (in_crit) {
2032                         nbl_end_crit(tvp);
2033                         VN_RELE(tvp);
2034                 }
2035         }
2036 
2037         /*
2038          * Force modified data and metadata out to stable storage.
2039          */
2040         (void) VOP_FSYNC(dvp, 0, cr, NULL);
2041 
2042 out:
2043 
2044         VN_RELE(dvp);
2045 
2046         dr->dr_status = puterrno(error);
2047 
2048         if (name != args->ca_da.da_name)
2049                 kmem_free(name, MAXPATHLEN);
2050 }
2051 void *
2052 rfs_create_getfh(struct nfscreatargs *args)
2053 {
2054         return (args->ca_da.da_fhandle);
2055 }
2056 
/*
 * Remove a file.
 * Remove named file from parent directory.  The parent is identified
 * by the file handle in `da', the entry by da_name; `*status' receives
 * the NFSv2 status for the reply.
 */
/* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* parent directory */
	vnode_t *targvp;	/* entry being removed */
	int in_crit = 0;	/* inside nbmand critical region on targvp? */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 * Look up the target first so it can be checked for delegations and
	 * mandatory locks before the actual remove.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Enter the nbmand critical region only if this vnode needs it. */
	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	/* Exit the critical region before dropping the holds. */
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2142 
2143 void *
2144 rfs_remove_getfh(struct nfsdiropargs *da)
2145 {
2146         return (da->da_fhandle);
2147 }
2148 
/*
 * rename a file
 * Give a file (from) a new name (to).  Both directories must belong
 * to the same export; `*status' receives the NFSv2 status.
 */
/* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* file being renamed */
	vnode_t *targvp;	/* existing file at the target name, if any */
	int in_crit = 0;	/* inside nbmand critical region on srcvp? */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target directory must belong to the same export as the
	 * source: a rename may not cross exported filesystems.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	/* pointer comparison only; to_exi is not dereferenced after rele */
	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		/* T_WOULDBLOCK causes the reply to be dropped, not sent */
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	/* Enter the nbmand critical region only if this vnode needs it. */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* on success, record the new name against srcvp */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2292 void *
2293 rfs_rename_getfh(struct nfsrnmargs *args)
2294 {
2295         return (args->rna_from.da_fhandle);
2296 }
2297 
/*
 * Link to a file.
 * Create a file (to) which is a hard link to the given file (from).
 * Both file handles must belong to the same export.
 */
/* ARGSUSED */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *fromvp;	/* existing file */
	vnode_t *tovp;		/* directory that will hold the new name */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target directory must belong to the same export as the
	 * source: a hard link may not cross exported filesystems.
	 */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	/* pointer comparison only; to_exi is not dereferenced after rele */
	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2378 void *
2379 rfs_link_getfh(struct nfslinkargs *args)
2380 {
2381         return (args->la_from);
2382 }
2383 
2384 /*
2385  * Symbolicly link to a file.
2386  * Create a file (to) with the given attributes which is a symbolic link
2387  * to the given path name (to).
2388  */
2389 void
2390 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2391     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2392 {
2393         int error;
2394         struct vattr va;
2395         vnode_t *vp;
2396         vnode_t *svp;
2397         int lerror;
2398         struct sockaddr *ca;
2399         char *name = NULL;
2400 
2401         /*
2402          * Disallow NULL paths
2403          */
2404         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2405                 *status = NFSERR_ACCES;
2406                 return;
2407         }
2408 
2409         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2410         if (vp == NULL) {
2411                 *status = NFSERR_STALE;
2412                 return;
2413         }
2414 
2415         if (rdonly(ro, vp)) {
2416                 VN_RELE(vp);
2417                 *status = NFSERR_ROFS;
2418                 return;
2419         }
2420 
2421         error = sattr_to_vattr(args->sla_sa, &va);
2422         if (error) {
2423                 VN_RELE(vp);
2424                 *status = puterrno(error);
2425                 return;
2426         }
2427 
2428         if (!(va.va_mask & AT_MODE)) {
2429                 VN_RELE(vp);
2430                 *status = NFSERR_INVAL;
2431                 return;
2432         }
2433 
2434         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2435         name = nfscmd_convname(ca, exi, args->sla_tnm,
2436             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2437 
2438         if (name == NULL) {
2439                 *status = NFSERR_ACCES;
2440                 return;
2441         }
2442 
2443         va.va_type = VLNK;
2444         va.va_mask |= AT_TYPE;
2445 
2446         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2447 
2448         /*
2449          * Force new data and metadata out to stable storage.
2450          */
2451         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2452             NULL, cr, NULL, NULL, NULL);
2453 
2454         if (!lerror) {
2455                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2456                 VN_RELE(svp);
2457         }
2458 
2459         /*
2460          * Force modified data and metadata out to stable storage.
2461          */
2462         (void) VOP_FSYNC(vp, 0, cr, NULL);
2463 
2464         VN_RELE(vp);
2465 
2466         *status = puterrno(error);
2467         if (name != args->sla_tnm)
2468                 kmem_free(name, MAXPATHLEN);
2469 
2470 }
2471 void *
2472 rfs_symlink_getfh(struct nfsslargs *args)
2473 {
2474         return (args->sla_from.da_fhandle);
2475 }
2476 
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
/* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* The client must supply a mode for the new directory. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is handed the parent
			 * vnode (vp) while va describes the new directory
			 * (dvp) -- confirm this is intended.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2564 void *
2565 rfs_mkdir_getfh(struct nfscreatargs *args)
2566 {
2567         return (args->ca_da.da_fhandle);
2568 }
2569 
2570 /*
2571  * Remove a directory.
2572  * Remove the given directory name from the given parent directory.
2573  */
2574 /* ARGSUSED */
2575 void
2576 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2577     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2578 {
2579         int error;
2580         vnode_t *vp;
2581 
2582         /*
2583          * Disallow NULL paths
2584          */
2585         if (da->da_name == NULL || *da->da_name == '\0') {
2586                 *status = NFSERR_ACCES;
2587                 return;
2588         }
2589 
2590         vp = nfs_fhtovp(da->da_fhandle, exi);
2591         if (vp == NULL) {
2592                 *status = NFSERR_STALE;
2593                 return;
2594         }
2595 
2596         if (rdonly(ro, vp)) {
2597                 VN_RELE(vp);
2598                 *status = NFSERR_ROFS;
2599                 return;
2600         }
2601 
2602         /*
2603          * VOP_RMDIR takes a third argument (the current
2604          * directory of the process).  That's because someone
2605          * wants to return EINVAL if one tries to remove ".".
2606          * Of course, NFS servers have no idea what their
2607          * clients' current directories are.  We fake it by
2608          * supplying a vnode known to exist and illegal to
2609          * remove.
2610          */
2611         error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2612 
2613         /*
2614          * Force modified data and metadata out to stable storage.
2615          */
2616         (void) VOP_FSYNC(vp, 0, cr, NULL);
2617 
2618         VN_RELE(vp);
2619 
2620         /*
2621          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2622          * if the directory is not empty.  A System V NFS server
2623          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2624          * over the wire.
2625          */
2626         if (error == EEXIST)
2627                 *status = NFSERR_NOTEMPTY;
2628         else
2629                 *status = puterrno(error);
2630 
2631 }
2632 void *
2633 rfs_rmdir_getfh(struct nfsdiropargs *da)
2634 {
2635         return (da->da_fhandle);
2636 }
2637 
/*
 * Directory read call (NFSv2 READDIR).
 * Reads entries from the directory identified by rda->rda_fh, starting
 * at cookie rda->rda_offset, into a buffer of at most rda->rda_count
 * bytes, then runs the entry names through any export-configured
 * character set conversion before replying.
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
        int error;
        int iseof;              /* set by VOP_READDIR at end of directory */
        struct iovec iov;
        struct uio uio;
        vnode_t *vp;
        char *ndata = NULL;     /* converted entry data, if any */
        struct sockaddr *ca;
        size_t nents;
        int ret;

        vp = nfs_fhtovp(&rda->rda_fh, exi);
        if (vp == NULL) {
                rd->rd_entries = NULL;
                rd->rd_status = NFSERR_STALE;
                return;
        }

        if (vp->v_type != VDIR) {
                VN_RELE(vp);
                rd->rd_entries = NULL;
                rd->rd_status = NFSERR_NOTDIR;
                return;
        }

        /* Hold the directory as a reader while we scan it. */
        (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

        error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

        if (error) {
                rd->rd_entries = NULL;
                goto bad;
        }

        /* Nothing requested: reply with an empty, non-EOF result. */
        if (rda->rda_count == 0) {
                rd->rd_entries = NULL;
                rd->rd_size = 0;
                rd->rd_eof = FALSE;
                goto bad;
        }

        /* Clamp the request to the NFSv2 maximum transfer size. */
        rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

        /*
         * Allocate data for entries.  This will be freed by rfs_rddirfree.
         */
        rd->rd_bufsize = (uint_t)rda->rda_count;
        rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

        /*
         * Set up io vector to read directory data
         */
        iov.iov_base = (caddr_t)rd->rd_entries;
        iov.iov_len = rda->rda_count;
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_extflg = UIO_COPY_CACHED;
        uio.uio_loffset = (offset_t)rda->rda_offset;
        uio.uio_resid = rda->rda_count;

        /*
         * read directory
         */
        error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

        /*
         * Clean up
         */
        if (!error) {
                /*
                 * If no bytes were consumed the directory is exhausted;
                 * otherwise report how many bytes were produced and
                 * whether the file system reported end-of-directory.
                 */
                if (uio.uio_resid == rda->rda_count) {
                        rd->rd_size = 0;
                        rd->rd_eof = TRUE;
                } else {
                        rd->rd_size = (uint32_t)(rda->rda_count -
                            uio.uio_resid);
                        rd->rd_eof = iseof ? TRUE : FALSE;
                }
        }

        /*
         * Convert entry names for the client's character set, if the
         * export has a conversion configured for this client address.
         */
        ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
        nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
        ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
            rda->rda_count, &ndata);

        if (ret != 0) {
                size_t dropbytes;
                /*
                 * We had to drop one or more entries in order to fit
                 * during the character conversion.  We need to patch
                 * up the size and eof info.
                 */
                if (rd->rd_eof)
                        rd->rd_eof = FALSE;
                dropbytes = nfscmd_dropped_entrysize(
                    (struct dirent64 *)rd->rd_entries, nents, ret);
                rd->rd_size -= dropbytes;
        }
        if (ndata == NULL) {
                /* No converted copy was produced; use the original. */
                ndata = (char *)rd->rd_entries;
        } else if (ndata != (char *)rd->rd_entries) {
                /* Conversion produced a new buffer; swap it in. */
                kmem_free(rd->rd_entries, rd->rd_bufsize);
                rd->rd_entries = (void *)ndata;
                rd->rd_bufsize = rda->rda_count;
        }

bad:
        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
        /*
         * Don't do this.  It causes local disk writes when just
         * reading the file and the overhead is deemed larger
         * than the benefit.
         */
        /*
         * Force modified metadata out to stable storage.
         */
        (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

        VN_RELE(vp);

        rd->rd_status = puterrno(error);

}
2771 void *
2772 rfs_readdir_getfh(struct nfsrddirargs *rda)
2773 {
2774         return (&rda->rda_fh);
2775 }
2776 void
2777 rfs_rddirfree(struct nfsrddirres *rd)
2778 {
2779         if (rd->rd_entries != NULL)
2780                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2781 }
2782 
2783 /* ARGSUSED */
2784 void
2785 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2786     struct svc_req *req, cred_t *cr, bool_t ro)
2787 {
2788         int error;
2789         struct statvfs64 sb;
2790         vnode_t *vp;
2791 
2792         vp = nfs_fhtovp(fh, exi);
2793         if (vp == NULL) {
2794                 fs->fs_status = NFSERR_STALE;
2795                 return;
2796         }
2797 
2798         error = VFS_STATVFS(vp->v_vfsp, &sb);
2799 
2800         if (!error) {
2801                 fs->fs_tsize = nfstsize();
2802                 fs->fs_bsize = sb.f_frsize;
2803                 fs->fs_blocks = sb.f_blocks;
2804                 fs->fs_bfree = sb.f_bfree;
2805                 fs->fs_bavail = sb.f_bavail;
2806         }
2807 
2808         VN_RELE(vp);
2809 
2810         fs->fs_status = puterrno(error);
2811 
2812 }
2813 void *
2814 rfs_statfs_getfh(fhandle_t *fh)
2815 {
2816         return (fh);
2817 }
2818 
/*
 * Convert the over-the-wire NFSv2 settable attributes (*sa) into a
 * vattr (*vap) for the VOP layer.  A wire field of all ones
 * ((uint32_t)-1, or the 16-bit variant for mode) means "don't set";
 * each field that is present turns on the corresponding AT_* bit in
 * va_mask.  Returns 0 on success, or EOVERFLOW on 32-bit kernels when
 * a client-supplied time cannot be represented in a time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
        vap->va_mask = 0;

        /*
         * There was a sign extension bug in some VFS based systems
         * which stored the mode as a short.  When it would get
         * assigned to a u_long, no sign extension would occur.
         * It needed to, but this wasn't noticed because sa_mode
         * would then get assigned back to the short, thus ignoring
         * the upper 16 bits of sa_mode.
         *
         * To make this implementation work for both broken
         * clients and good clients, we check for both versions
         * of the mode.
         */
        if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
            sa->sa_mode != (uint32_t)-1) {
                vap->va_mask |= AT_MODE;
                vap->va_mode = sa->sa_mode;
        }
        if (sa->sa_uid != (uint32_t)-1) {
                vap->va_mask |= AT_UID;
                vap->va_uid = sa->sa_uid;
        }
        if (sa->sa_gid != (uint32_t)-1) {
                vap->va_mask |= AT_GID;
                vap->va_gid = sa->sa_gid;
        }
        if (sa->sa_size != (uint32_t)-1) {
                vap->va_mask |= AT_SIZE;
                vap->va_size = sa->sa_size;
        }
        /* Both halves of the timestamp must be present to set it. */
        if (sa->sa_atime.tv_sec != (int32_t)-1 &&
            sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
                /* return error if time overflow */
                if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
                        return (EOVERFLOW);
#endif
                vap->va_mask |= AT_ATIME;
                /*
                 * nfs protocol defines times as unsigned so don't extend sign,
                 * unless sysadmin set nfs_allow_preepoch_time.
                 */
                NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
                vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
        }
        if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
            sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
                /* return error if time overflow */
                if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
                        return (EOVERFLOW);
#endif
                vap->va_mask |= AT_MTIME;
                /*
                 * nfs protocol defines times as unsigned so don't extend sign,
                 * unless sysadmin set nfs_allow_preepoch_time.
                 */
                NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
                vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
        }
        return (0);
}
2885 
/*
 * Map vnode types (vtype_t, used as the index) to over-the-wire NFSv2
 * file types.  Vnode types with no NFSv2 equivalent map to 0.
 */
static const enum nfsftype vt_to_nf[] = {
        0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2889 
2890 /*
2891  * check the following fields for overflow: nodeid, size, and time.
2892  * There could be a problem when converting 64-bit LP64 fields
2893  * into 32-bit ones.  Return an error if there is an overflow.
2894  */
2895 int
2896 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2897 {
2898         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2899         na->na_type = vt_to_nf[vap->va_type];
2900 
2901         if (vap->va_mode == (unsigned short) -1)
2902                 na->na_mode = (uint32_t)-1;
2903         else
2904                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2905 
2906         if (vap->va_uid == (unsigned short)(-1))
2907                 na->na_uid = (uint32_t)(-1);
2908         else if (vap->va_uid == UID_NOBODY)
2909                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2910         else
2911                 na->na_uid = vap->va_uid;
2912 
2913         if (vap->va_gid == (unsigned short)(-1))
2914                 na->na_gid = (uint32_t)-1;
2915         else if (vap->va_gid == GID_NOBODY)
2916                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2917         else
2918                 na->na_gid = vap->va_gid;
2919 
2920         /*
2921          * Do we need to check fsid for overflow?  It is 64-bit in the
2922          * vattr, but are bigger than 32 bit values supported?
2923          */
2924         na->na_fsid = vap->va_fsid;
2925 
2926         na->na_nodeid = vap->va_nodeid;
2927 
2928         /*
2929          * Check to make sure that the nodeid is representable over the
2930          * wire without losing bits.
2931          */
2932         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2933                 return (EFBIG);
2934         na->na_nlink = vap->va_nlink;
2935 
2936         /*
2937          * Check for big files here, instead of at the caller.  See
2938          * comments in cstat for large special file explanation.
2939          */
2940         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2941                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2942                         return (EFBIG);
2943                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2944                         /* UNKNOWN_SIZE | OVERFLOW */
2945                         na->na_size = MAXOFF32_T;
2946                 } else
2947                         na->na_size = vap->va_size;
2948         } else
2949                 na->na_size = vap->va_size;
2950 
2951         /*
2952          * If the vnode times overflow the 32-bit times that NFS2
2953          * uses on the wire then return an error.
2954          */
2955         if (!NFS_VAP_TIME_OK(vap)) {
2956                 return (EOVERFLOW);
2957         }
2958         na->na_atime.tv_sec = vap->va_atime.tv_sec;
2959         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2960 
2961         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2962         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2963 
2964         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2965         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2966 
2967         /*
2968          * If the dev_t will fit into 16 bits then compress
2969          * it, otherwise leave it alone. See comments in
2970          * nfs_client.c.
2971          */
2972         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2973             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2974                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2975         else
2976                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2977 
2978         na->na_blocks = vap->va_nblocks;
2979         na->na_blocksize = vap->va_blksize;
2980 
2981         /*
2982          * This bit of ugliness is a *TEMPORARY* hack to preserve the
2983          * over-the-wire protocols for named-pipe vnodes.  It remaps the
2984          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2985          *
2986          * BUYER BEWARE:
2987          *  If you are porting the NFS to a non-Sun server, you probably
2988          *  don't want to include the following block of code.  The
2989          *  over-the-wire special file types will be changing with the
2990          *  NFS Protocol Revision.
2991          */
2992         if (vap->va_type == VFIFO)
2993                 NA_SETFIFO(na);
2994         return (0);
2995 }
2996 
2997 /*
2998  * acl v2 support: returns approximate permission.
2999  *      default: returns minimal permission (more restrictive)
3000  *      aclok: returns maximal permission (less restrictive)
3001  *      This routine changes the permissions that are alaredy in *va.
3002  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3003  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
3004  */
3005 static void
3006 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3007 {
3008         vsecattr_t      vsa;
3009         int             aclcnt;
3010         aclent_t        *aclentp;
3011         mode_t          mask_perm;
3012         mode_t          grp_perm;
3013         mode_t          other_perm;
3014         mode_t          other_orig;
3015         int             error;
3016 
3017         /* dont care default acl */
3018         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3019         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3020 
3021         if (!error) {
3022                 aclcnt = vsa.vsa_aclcnt;
3023                 if (aclcnt > MIN_ACL_ENTRIES) {
3024                         /* non-trivial ACL */
3025                         aclentp = vsa.vsa_aclentp;
3026                         if (exi->exi_export.ex_flags & EX_ACLOK) {
3027                                 /* maximal permissions */
3028                                 grp_perm = 0;
3029                                 other_perm = 0;
3030                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3031                                         switch (aclentp->a_type) {
3032                                         case USER_OBJ:
3033                                                 break;
3034                                         case USER:
3035                                                 grp_perm |=
3036                                                     aclentp->a_perm << 3;
3037                                                 other_perm |= aclentp->a_perm;
3038                                                 break;
3039                                         case GROUP_OBJ:
3040                                                 grp_perm |=
3041                                                     aclentp->a_perm << 3;
3042                                                 break;
3043                                         case GROUP:
3044                                                 other_perm |= aclentp->a_perm;
3045                                                 break;
3046                                         case OTHER_OBJ:
3047                                                 other_orig = aclentp->a_perm;
3048                                                 break;
3049                                         case CLASS_OBJ:
3050                                                 mask_perm = aclentp->a_perm;
3051                                                 break;
3052                                         default:
3053                                                 break;
3054                                         }
3055                                 }
3056                                 grp_perm &= mask_perm << 3;
3057                                 other_perm &= mask_perm;
3058                                 other_perm |= other_orig;
3059 
3060                         } else {
3061                                 /* minimal permissions */
3062                                 grp_perm = 070;
3063                                 other_perm = 07;
3064                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3065                                         switch (aclentp->a_type) {
3066                                         case USER_OBJ:
3067                                                 break;
3068                                         case USER:
3069                                         case CLASS_OBJ:
3070                                                 grp_perm &=
3071                                                     aclentp->a_perm << 3;
3072                                                 other_perm &=
3073                                                     aclentp->a_perm;
3074                                                 break;
3075                                         case GROUP_OBJ:
3076                                                 grp_perm &=
3077                                                     aclentp->a_perm << 3;
3078                                                 break;
3079                                         case GROUP:
3080                                                 other_perm &=
3081                                                     aclentp->a_perm;
3082                                                 break;
3083                                         case OTHER_OBJ:
3084                                                 other_perm &=
3085                                                     aclentp->a_perm;
3086                                                 break;
3087                                         default:
3088                                                 break;
3089                                         }
3090                                 }
3091                         }
3092                         /* copy to va */
3093                         va->va_mode &= ~077;
3094                         va->va_mode |= grp_perm | other_perm;
3095                 }
3096                 if (vsa.vsa_aclcnt)
3097                         kmem_free(vsa.vsa_aclentp,
3098                             vsa.vsa_aclcnt * sizeof (aclent_t));
3099         }
3100 }
3101 
/*
 * One-time NFSv2 server initialization: obtain the caller id used for
 * this server's file-system operations (presumably for nbmand lock
 * checks -- see fs_new_caller_id usage elsewhere), and register the
 * per-zone state constructor/destructor pair.
 */
void
rfs_srvrinit(void)
{
        nfs2_srv_caller_id = fs_new_caller_id();
        zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
}
3108 
/*
 * NFSv2 server teardown.  Nothing to do here: per-zone state is
 * released by the zone key destructor (rfs_zone_fini) registered in
 * rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
}
3113 
3114 /* ARGSUSED */
3115 static void *
3116 rfs_zone_init(zoneid_t zoneid)
3117 {
3118         nfs_srv_t *ns;
3119 
3120         ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3121 
3122         mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3123         ns->write_async = 1;
3124 
3125         return (ns);
3126 }
3127 
3128 /* ARGSUSED */
3129 static void
3130 rfs_zone_fini(zoneid_t zoneid, void *data)
3131 {
3132         nfs_srv_t *ns;
3133 
3134         ns = (nfs_srv_t *)data;
3135         mutex_destroy(&ns->async_write_lock);
3136         kmem_free(ns, sizeof (*ns));
3137 }
3138 
3139 static int
3140 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3141 {
3142         struct clist    *wcl;
3143         int             wlist_len;
3144         uint32_t        count = rr->rr_count;
3145 
3146         wcl = ra->ra_wlist;
3147 
3148         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3149                 return (FALSE);
3150         }
3151 
3152         wcl = ra->ra_wlist;
3153         rr->rr_ok.rrok_wlist_len = wlist_len;
3154         rr->rr_ok.rrok_wlist = wcl;
3155 
3156         return (TRUE);
3157 }