1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*
  29  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  30  *      All rights reserved.
  31  */
  32 
  33 /*
  34  * Copyright 2018 Nexenta Systems, Inc.
  35  * Copyright (c) 2016 by Delphix. All rights reserved.
  36  */
  37 
  38 #include <sys/param.h>
  39 #include <sys/types.h>
  40 #include <sys/systm.h>
  41 #include <sys/cred.h>
  42 #include <sys/buf.h>
  43 #include <sys/vfs.h>
  44 #include <sys/vnode.h>
  45 #include <sys/uio.h>
  46 #include <sys/stat.h>
  47 #include <sys/errno.h>
  48 #include <sys/sysmacros.h>
  49 #include <sys/statvfs.h>
  50 #include <sys/kmem.h>
  51 #include <sys/kstat.h>
  52 #include <sys/dirent.h>
  53 #include <sys/cmn_err.h>
  54 #include <sys/debug.h>
  55 #include <sys/vtrace.h>
  56 #include <sys/mode.h>
  57 #include <sys/acl.h>
  58 #include <sys/nbmlock.h>
  59 #include <sys/policy.h>
  60 #include <sys/sdt.h>
  61 
  62 #include <rpc/types.h>
  63 #include <rpc/auth.h>
  64 #include <rpc/svc.h>
  65 
  66 #include <nfs/nfs.h>
  67 #include <nfs/export.h>
  68 #include <nfs/nfs_cmd.h>
  69 
  70 #include <vm/hat.h>
  71 #include <vm/as.h>
  72 #include <vm/seg.h>
  73 #include <vm/seg_map.h>
  74 #include <vm/seg_kmem.h>
  75 
  76 #include <sys/strsubr.h>
  77 
  78 struct rfs_async_write_list;
  79 
  80 /*
   81  * Per-zone globals of the NFSv2 server
  82  */
  83 typedef struct nfs_srv {
  84         kmutex_t                        async_write_lock;
  85         struct rfs_async_write_list     *async_write_head;
  86 
  87         /*
  88          * enables write clustering if == 1
  89          */
  90         int             write_async;
  91 } nfs_srv_t;
  92 
  93 /*
  94  * These are the interface routines for the server side of the
  95  * Network File System.  See the NFS version 2 protocol specification
  96  * for a description of this interface.
  97  */
  98 
  99 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
 100 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
 101                         cred_t *);
 102 
 103 
 104 /*
 105  * Some "over the wire" UNIX file types.  These are encoded
 106  * into the mode.  This needs to be fixed in the next rev.
 107  */
 108 #define IFMT            0170000         /* type of file */
 109 #define IFCHR           0020000         /* character special */
 110 #define IFBLK           0060000         /* block special */
 111 #define IFSOCK          0140000         /* socket */
 112 
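      /*
       * Unique caller id used in the caller_context_t passed to VOP calls
       * made by the NFSv2 server.
       */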
 113 u_longlong_t nfs2_srv_caller_id;
 114 
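      /*
       * Return the per-zone NFSv2 server state for the current zone.
       */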
 115 static nfs_srv_t *
 116 nfs_get_srv(void)
 117 {
 118         nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
 119         nfs_srv_t *srv = ng->nfs_srv;
 120         ASSERT(srv != NULL);
 121         return (srv);
 122 }
 123 
 124 /*
 125  * Get file attributes.
 126  * Returns the current attributes of the file with the given fhandle.
 127  */
 128 /* ARGSUSED */
 129 void
 130 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 131     struct svc_req *req, cred_t *cr, bool_t ro)
 132 {
 133         int error;
 134         vnode_t *vp;
 135         struct vattr va;
 136 
 137         vp = nfs_fhtovp(fhp, exi);
 138         if (vp == NULL) {
 139                 ns->ns_status = NFSERR_STALE;
 140                 return;
 141         }
 142 
 143         /*
 144          * Do the getattr.
 145          */
 146         va.va_mask = AT_ALL;    /* we want all the attributes */
 147 
 148         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 149 
 150         /* check for overflows */
 151         if (!error) {
 152                 /* Lie about the object type for a referral */
 153                 if (vn_is_nfs_reparse(vp, cr))
 154                         va.va_type = VLNK;
 155 
 156                 acl_perm(vp, exi, &va, cr);
 157                 error = vattr_to_nattr(&va, &ns->ns_attr);
 158         }
 159 
 160         VN_RELE(vp);
 161 
 162         ns->ns_status = puterrno(error);
 163 }
 164 void *
 165 rfs_getattr_getfh(fhandle_t *fhp)
 166 {
 167         return (fhp);
 168 }
 169 
 170 /*
 171  * Set file attributes.
 172  * Sets the attributes of the file with the given fhandle.  Returns
 173  * the new attributes.
 174  */
 175 /* ARGSUSED */
 176 void
 177 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
 178     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 179 {
 180         int error;
 181         int flag;
 182         int in_crit = 0;
 183         vnode_t *vp;
 184         struct vattr va;
 185         struct vattr bva;
 186         struct flock64 bf;
 187         caller_context_t ct;
 188 
 189 
 190         vp = nfs_fhtovp(&args->saa_fh, exi);
 191         if (vp == NULL) {
 192                 ns->ns_status = NFSERR_STALE;
 193                 return;
 194         }
 195 
 196         if (rdonly(ro, vp)) {
 197                 VN_RELE(vp);
 198                 ns->ns_status = NFSERR_ROFS;
 199                 return;
 200         }
 201 
 202         error = sattr_to_vattr(&args->saa_sa, &va);
 203         if (error) {
 204                 VN_RELE(vp);
 205                 ns->ns_status = puterrno(error);
 206                 return;
 207         }
 208 
 209         /*
 210          * If the client is requesting a change to the mtime,
 211          * but the nanosecond field is set to 1 billion, then
 212          * this is a flag to the server that it should set the
 213          * atime and mtime fields to the server's current time.
 214          * The 1 billion number actually came from the client
 215          * as 1 million, but the units in the over the wire
 216          * request are microseconds instead of nanoseconds.
 217          *
 218          * This is an overload of the protocol and should be
 219          * documented in the NFS Version 2 protocol specification.
 220          */
 221         if (va.va_mask & AT_MTIME) {
 222                 if (va.va_mtime.tv_nsec == 1000000000) {
 223                         gethrestime(&va.va_mtime);
 224                         va.va_atime = va.va_mtime;
 225                         va.va_mask |= AT_ATIME;
 226                         flag = 0;
 227                 } else
 228                         flag = ATTR_UTIME;
 229         } else
 230                 flag = 0;
 231 
 232         /*
 233          * If the filesystem is exported with nosuid, then mask off
 234          * the setuid and setgid bits.
 235          */
 236         if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
 237             (exi->exi_export.ex_flags & EX_NOSUID))
 238                 va.va_mode &= ~(VSUID | VSGID);
 239 
 240         ct.cc_sysid = 0;
 241         ct.cc_pid = 0;
 242         ct.cc_caller_id = nfs2_srv_caller_id;
 243         ct.cc_flags = CC_DONTBLOCK;
 244 
  245         /*
  246          * We need to handle size changes specially because it is
  247          * possible for the client to create a file with modes
  248          * which indicate read-only, but with the file opened for
  249          * writing.  If the client then tries to set the size of
  250          * the file, the normal access checking done in
  251          * VOP_SETATTR would prevent the client from doing so,
  252          * although it should be legal for it to do so.  To get
  253          * around this, we do the access checking ourselves
  254          * and then use VOP_SPACE, which doesn't do the access
  255          * checking that VOP_SETATTR does.  VOP_SPACE can only
  256          * operate on VREG files, so let VOP_SETATTR handle the
  257          * other extremely rare cases.
  258          * Also, the client should not be allowed to change the
  259          * size of the file if there is a conflicting non-blocking
  260          * mandatory lock in the region of change.
  261          */
 262         if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
 263                 if (nbl_need_check(vp)) {
 264                         nbl_start_crit(vp, RW_READER);
 265                         in_crit = 1;
 266                 }
 267 
 268                 bva.va_mask = AT_UID | AT_SIZE;
 269 
 270                 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
 271 
 272                 if (error) {
 273                         if (in_crit)
 274                                 nbl_end_crit(vp);
 275                         VN_RELE(vp);
 276                         ns->ns_status = puterrno(error);
 277                         return;
 278                 }
 279 
 280                 if (in_crit) {
 281                         u_offset_t offset;
 282                         ssize_t length;
 283 
 284                         if (va.va_size < bva.va_size) {
 285                                 offset = va.va_size;
 286                                 length = bva.va_size - va.va_size;
 287                         } else {
 288                                 offset = bva.va_size;
 289                                 length = va.va_size - bva.va_size;
 290                         }
 291                         if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
 292                             NULL)) {
 293                                 error = EACCES;
 294                         }
 295                 }
 296 
 297                 if (crgetuid(cr) == bva.va_uid && !error &&
 298                     va.va_size != bva.va_size) {
 299                         va.va_mask &= ~AT_SIZE;
 300                         bf.l_type = F_WRLCK;
 301                         bf.l_whence = 0;
 302                         bf.l_start = (off64_t)va.va_size;
 303                         bf.l_len = 0;
 304                         bf.l_sysid = 0;
 305                         bf.l_pid = 0;
 306 
 307                         error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
 308                             (offset_t)va.va_size, cr, &ct);
 309                 }
 310                 if (in_crit)
 311                         nbl_end_crit(vp);
 312         } else
 313                 error = 0;
 314 
 315         /*
 316          * Do the setattr.
 317          */
 318         if (!error && va.va_mask) {
 319                 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
 320         }
 321 
 322         /*
 323          * check if the monitor on either vop_space or vop_setattr detected
 324          * a delegation conflict and if so, mark the thread flag as
 325          * wouldblock so that the response is dropped and the client will
 326          * try again.
 327          */
 328         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 329                 VN_RELE(vp);
 330                 curthread->t_flag |= T_WOULDBLOCK;
 331                 return;
 332         }
 333 
 334         if (!error) {
 335                 va.va_mask = AT_ALL;    /* get everything */
 336 
 337                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 338 
 339                 /* check for overflows */
 340                 if (!error) {
 341                         acl_perm(vp, exi, &va, cr);
 342                         error = vattr_to_nattr(&va, &ns->ns_attr);
 343                 }
 344         }
 345 
 346         ct.cc_flags = 0;
 347 
 348         /*
 349          * Force modified metadata out to stable storage.
 350          */
 351         (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
 352 
 353         VN_RELE(vp);
 354 
 355         ns->ns_status = puterrno(error);
 356 }
 357 void *
 358 rfs_setattr_getfh(struct nfssaargs *args)
 359 {
 360         return (&args->saa_fh);
 361 }
 362 
  363 /* Change and release @exip and @vpp only on success */
 364 int
 365 rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
 366 {
 367         struct exportinfo *exi;
 368         vnode_t *vp = *vpp;
 369         fid_t fid;
 370         int error;
 371 
 372         VN_HOLD(vp);
 373 
 374         if ((error = traverse(&vp)) != 0) {
 375                 VN_RELE(vp);
 376                 return (error);
 377         }
 378 
 379         bzero(&fid, sizeof (fid));
 380         fid.fid_len = MAXFIDSZ;
 381         error = VOP_FID(vp, &fid, NULL);
 382         if (error) {
 383                 VN_RELE(vp);
 384                 return (error);
 385         }
 386 
 387         exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
 388         if (exi == NULL ||
 389             (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
  390                 /*
  391                  * It is not an error; the subdir is simply not exported
  392                  * or "nohide" is not set.
  393                  */
 394                 if (exi != NULL)
 395                         exi_rele(exi);
 396                 VN_RELE(vp);
 397         } else {
 398                 /* go to submount */
 399                 exi_rele(*exip);
 400                 *exip = exi;
 401 
 402                 VN_RELE(*vpp);
 403                 *vpp = vp;
 404         }
 405 
 406         return (0);
 407 }
 408 
  409 /*
  410  * Given the mounted "dvp" and "exi", climb up to the covered mountpoint,
  411  * correcting dvp/exi along the way.
  412  * Returns 0 on success.
  413  */
 414 int
 415 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
 416 {
 417         struct exportinfo *exi;
 418         vnode_t *dvp = *dvpp;
 419 
 420         ASSERT3P((*exip)->exi_zone, ==, curzone);
 421         ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));
 422 
 423         VN_HOLD(dvp);
 424         dvp = untraverse(dvp);
 425         exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
 426         if (exi == NULL) {
 427                 VN_RELE(dvp);
 428                 return (-1);
 429         }
 430 
 431         ASSERT3P(exi->exi_zone, ==, curzone);
 432         exi_rele(*exip);
 433         *exip = exi;
 434         VN_RELE(*dvpp);
 435         *dvpp = dvp;
 436 
 437         return (0);
 438 }
 439 /*
 440  * Directory lookup.
 441  * Returns an fhandle and file attributes for file name in a directory.
 442  */
 443 /* ARGSUSED */
 444 void
 445 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 446     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 447 {
 448         int error;
 449         vnode_t *dvp;
 450         vnode_t *vp;
 451         struct vattr va;
 452         fhandle_t *fhp = da->da_fhandle;
 453         struct sec_ol sec = {0, 0};
 454         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 455         char *name;
 456         struct sockaddr *ca;
 457 
 458         /*
  459          * Trusted Extensions doesn't support NFSv2. MOUNT
 460          * will reject v2 clients. Need to prevent v2 client
 461          * access via WebNFS here.
 462          */
 463         if (is_system_labeled() && req->rq_vers == 2) {
 464                 dr->dr_status = NFSERR_ACCES;
 465                 return;
 466         }
 467 
 468         /*
 469          * Disallow NULL paths
 470          */
 471         if (da->da_name == NULL || *da->da_name == '\0') {
 472                 dr->dr_status = NFSERR_ACCES;
 473                 return;
 474         }
 475 
 476         /*
 477          * Allow lookups from the root - the default
 478          * location of the public filehandle.
 479          */
 480         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 481                 dvp = ZONE_ROOTVP();
 482                 VN_HOLD(dvp);
 483         } else {
 484                 dvp = nfs_fhtovp(fhp, exi);
 485                 if (dvp == NULL) {
 486                         dr->dr_status = NFSERR_STALE;
 487                         return;
 488                 }
 489         }
 490 
 491         exi_hold(exi);
 492         ASSERT3P(exi->exi_zone, ==, curzone);
 493 
 494         /*
  495          * Do not allow lookups beyond the root.
 496          * If the filehandle matches a filehandle of the exi,
 497          * then the ".." refers beyond the root of an exported filesystem.
 498          */
 499         if (strcmp(da->da_name, "..") == 0 &&
 500             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 501                 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
 502                     ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
 503                         /*
  504                          * special case for ".." and a 'nohide' exported root
 505                          */
 506                         if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
 507                                 error = NFSERR_ACCES;
 508                                 goto out;
 509                         }
  510                 } else {
 511                         error = NFSERR_NOENT;
 512                         goto out;
 513                 }
 514         }
 515 
 516         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 517         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 518             MAXPATHLEN);
 519 
 520         if (name == NULL) {
 521                 error = NFSERR_ACCES;
 522                 goto out;
 523         }
 524 
 525         /*
 526          * If the public filehandle is used then allow
 527          * a multi-component lookup, i.e. evaluate
 528          * a pathname and follow symbolic links if
 529          * necessary.
 530          *
 531          * This may result in a vnode in another filesystem
 532          * which is OK as long as the filesystem is exported.
 533          */
 534         if (PUBLIC_FH2(fhp)) {
 535                 publicfh_flag = TRUE;
 536 
 537                 exi_rele(exi);
 538 
 539                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 540                     &sec);
 541         } else {
 542                 /*
 543                  * Do a normal single component lookup.
 544                  */
 545                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 546                     NULL, NULL, NULL);
 547         }
 548 
 549         if (name != da->da_name)
 550                 kmem_free(name, MAXPATHLEN);
 551 
 552         if (error == 0 && vn_ismntpt(vp)) {
 553                 error = rfs_cross_mnt(&vp, &exi);
 554                 if (error)
 555                         VN_RELE(vp);
 556         }
 557 
 558         if (!error) {
 559                 va.va_mask = AT_ALL;    /* we want everything */
 560 
 561                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 562 
 563                 /* check for overflows */
 564                 if (!error) {
 565                         acl_perm(vp, exi, &va, cr);
 566                         error = vattr_to_nattr(&va, &dr->dr_attr);
 567                         if (!error) {
 568                                 if (sec.sec_flags & SEC_QUERY)
 569                                         error = makefh_ol(&dr->dr_fhandle, exi,
 570                                             sec.sec_index);
 571                                 else {
 572                                         error = makefh(&dr->dr_fhandle, vp,
 573                                             exi);
 574                                         if (!error && publicfh_flag &&
 575                                             !chk_clnt_sec(exi, req))
 576                                                 auth_weak = TRUE;
 577                                 }
 578                         }
 579                 }
 580                 VN_RELE(vp);
 581         }
 582 
 583 out:
 584         VN_RELE(dvp);
 585 
 586         if (exi != NULL)
 587                 exi_rele(exi);
 588 
 589         /*
 590          * If it's public fh, no 0x81, and client's flavor is
 591          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 592          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 593          */
 594         if (auth_weak)
 595                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 596         else
 597                 dr->dr_status = puterrno(error);
 598 }
 599 void *
 600 rfs_lookup_getfh(struct nfsdiropargs *da)
 601 {
 602         return (da->da_fhandle);
 603 }
 604 
 605 /*
 606  * Read symbolic link.
 607  * Returns the string in the symbolic link at the given fhandle.
 608  */
 609 /* ARGSUSED */
 610 void
 611 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
 612     struct svc_req *req, cred_t *cr, bool_t ro)
 613 {
 614         int error;
 615         struct iovec iov;
 616         struct uio uio;
 617         vnode_t *vp;
 618         struct vattr va;
 619         struct sockaddr *ca;
 620         char *name = NULL;
 621         int is_referral = 0;
 622 
 623         vp = nfs_fhtovp(fhp, exi);
 624         if (vp == NULL) {
 625                 rl->rl_data = NULL;
 626                 rl->rl_status = NFSERR_STALE;
 627                 return;
 628         }
 629 
 630         va.va_mask = AT_MODE;
 631 
 632         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
 633 
 634         if (error) {
 635                 VN_RELE(vp);
 636                 rl->rl_data = NULL;
 637                 rl->rl_status = puterrno(error);
 638                 return;
 639         }
 640 
 641         if (MANDLOCK(vp, va.va_mode)) {
 642                 VN_RELE(vp);
 643                 rl->rl_data = NULL;
 644                 rl->rl_status = NFSERR_ACCES;
 645                 return;
 646         }
 647 
 648         /* We lied about the object type for a referral */
 649         if (vn_is_nfs_reparse(vp, cr))
 650                 is_referral = 1;
 651 
 652         /*
 653          * XNFS and RFC1094 require us to return ENXIO if argument
 654          * is not a link. BUGID 1138002.
 655          */
 656         if (vp->v_type != VLNK && !is_referral) {
 657                 VN_RELE(vp);
 658                 rl->rl_data = NULL;
 659                 rl->rl_status = NFSERR_NXIO;
 660                 return;
 661         }
 662 
 663         /*
 664          * Allocate data for pathname.  This will be freed by rfs_rlfree.
 665          */
 666         rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
 667 
 668         if (is_referral) {
 669                 char *s;
 670                 size_t strsz;
 671 
 672                 /* Get an artificial symlink based on a referral */
 673                 s = build_symlink(vp, cr, &strsz);
 674                 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
 675                 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
 676                     vnode_t *, vp, char *, s);
 677                 if (s == NULL)
 678                         error = EINVAL;
 679                 else {
 680                         error = 0;
 681                         (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
 682                         rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
 683                         kmem_free(s, strsz);
 684                 }
 685 
 686         } else {
 687 
 688                 /*
 689                  * Set up io vector to read sym link data
 690                  */
 691                 iov.iov_base = rl->rl_data;
 692                 iov.iov_len = NFS_MAXPATHLEN;
 693                 uio.uio_iov = &iov;
 694                 uio.uio_iovcnt = 1;
 695                 uio.uio_segflg = UIO_SYSSPACE;
 696                 uio.uio_extflg = UIO_COPY_CACHED;
 697                 uio.uio_loffset = (offset_t)0;
 698                 uio.uio_resid = NFS_MAXPATHLEN;
 699 
 700                 /*
 701                  * Do the readlink.
 702                  */
 703                 error = VOP_READLINK(vp, &uio, cr, NULL);
 704 
 705                 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
 706 
 707                 if (!error)
 708                         rl->rl_data[rl->rl_count] = '\0';
 709 
 710         }
 711 
 712 
 713         VN_RELE(vp);
 714 
 715         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 716         name = nfscmd_convname(ca, exi, rl->rl_data,
 717             NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
 718 
 719         if (name != NULL && name != rl->rl_data) {
 720                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 721                 rl->rl_data = name;
 722         }
 723 
 724         /*
 725          * XNFS and RFC1094 require us to return ENXIO if argument
 726          * is not a link. UFS returns EINVAL if this is the case,
 727          * so we do the mapping here. BUGID 1138002.
 728          */
 729         if (error == EINVAL)
 730                 rl->rl_status = NFSERR_NXIO;
 731         else
 732                 rl->rl_status = puterrno(error);
 733 
 734 }
 735 void *
 736 rfs_readlink_getfh(fhandle_t *fhp)
 737 {
 738         return (fhp);
 739 }
 740 /*
 741  * Free data allocated by rfs_readlink
 742  */
 743 void
 744 rfs_rlfree(struct nfsrdlnres *rl)
 745 {
 746         if (rl->rl_data != NULL)
 747                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 748 }
 749 
 750 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 751 
 752 /*
 753  * Read data.
 754  * Returns some data read from the file at the given fhandle.
 755  */
 756 /* ARGSUSED */
 757 void
 758 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 759     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 760 {
 761         vnode_t *vp;
 762         int error;
 763         struct vattr va;
 764         struct iovec iov;
 765         struct uio uio;
 766         mblk_t *mp;
 767         int alloc_err = 0;
 768         int in_crit = 0;
 769         caller_context_t ct;
 770 
 771         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 772         if (vp == NULL) {
 773                 rr->rr_data = NULL;
 774                 rr->rr_status = NFSERR_STALE;
 775                 return;
 776         }
 777 
 778         if (vp->v_type != VREG) {
 779                 VN_RELE(vp);
 780                 rr->rr_data = NULL;
 781                 rr->rr_status = NFSERR_ISDIR;
 782                 return;
 783         }
 784 
 785         ct.cc_sysid = 0;
 786         ct.cc_pid = 0;
 787         ct.cc_caller_id = nfs2_srv_caller_id;
 788         ct.cc_flags = CC_DONTBLOCK;
 789 
 790         /*
 791          * Enter the critical region before calling VOP_RWLOCK
 792          * to avoid a deadlock with write requests.
 793          */
 794         if (nbl_need_check(vp)) {
 795                 nbl_start_crit(vp, RW_READER);
 796                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 797                     0, NULL)) {
 798                         nbl_end_crit(vp);
 799                         VN_RELE(vp);
 800                         rr->rr_data = NULL;
 801                         rr->rr_status = NFSERR_ACCES;
 802                         return;
 803                 }
 804                 in_crit = 1;
 805         }
 806 
 807         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 808 
 809         /* check if a monitor detected a delegation conflict */
 810         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 811                 if (in_crit)
 812                         nbl_end_crit(vp);
 813                 VN_RELE(vp);
 814                 /* mark as wouldblock so response is dropped */
 815                 curthread->t_flag |= T_WOULDBLOCK;
 816 
 817                 rr->rr_data = NULL;
 818                 return;
 819         }
 820 
 821         va.va_mask = AT_ALL;
 822 
 823         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 824 
 825         if (error) {
 826                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 827                 if (in_crit)
 828                         nbl_end_crit(vp);
 829 
 830                 VN_RELE(vp);
 831                 rr->rr_data = NULL;
 832                 rr->rr_status = puterrno(error);
 833 
 834                 return;
 835         }
 836 
 837         /*
 838          * This is a kludge to allow reading of files created
 839          * with no read permission.  The owner of the file
 840          * is always allowed to read it.
 841          */
 842         if (crgetuid(cr) != va.va_uid) {
 843                 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 844 
 845                 if (error) {
 846                         /*
 847                          * Exec is the same as read over the net because
 848                          * of demand loading.
 849                          */
 850                         error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 851                 }
 852                 if (error) {
 853                         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 854                         if (in_crit)
 855                                 nbl_end_crit(vp);
 856                         VN_RELE(vp);
 857                         rr->rr_data = NULL;
 858                         rr->rr_status = puterrno(error);
 859 
 860                         return;
 861                 }
 862         }
 863 
 864         if (MANDLOCK(vp, va.va_mode)) {
 865                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 866                 if (in_crit)
 867                         nbl_end_crit(vp);
 868 
 869                 VN_RELE(vp);
 870                 rr->rr_data = NULL;
 871                 rr->rr_status = NFSERR_ACCES;
 872 
 873                 return;
 874         }
 875 
 876         rr->rr_ok.rrok_wlist_len = 0;
 877         rr->rr_ok.rrok_wlist = NULL;
 878 
 879         if ((u_offset_t)ra->ra_offset >= va.va_size) {
 880                 rr->rr_count = 0;
 881                 rr->rr_data = NULL;
 882                 /*
 883                  * In this case, status is NFS_OK, but there is no data
 884                  * to encode. So set rr_mp to NULL.
 885                  */
 886                 rr->rr_mp = NULL;
 887                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 888                 if (rr->rr_ok.rrok_wlist)
 889                         clist_zero_len(rr->rr_ok.rrok_wlist);
 890                 goto done;
 891         }
 892 
 893         if (ra->ra_wlist) {
 894                 mp = NULL;
 895                 rr->rr_mp = NULL;
 896                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 897                 if (ra->ra_count > iov.iov_len) {
 898                         rr->rr_data = NULL;
 899                         rr->rr_status = NFSERR_INVAL;
 900                         goto done;
 901                 }
 902         } else {
 903                 /*
 904                  * mp will contain the data to be sent out in the read reply.
 905                  * This will be freed after the reply has been sent out (by the
 906                  * driver).
  907                  * Let's round up the data to a BYTES_PER_XDR_UNIT multiple, so
 908                  * that the call to xdrmblk_putmblk() never fails.
 909                  */
 910                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 911                     &alloc_err);
 912                 ASSERT(mp != NULL);
 913                 ASSERT(alloc_err == 0);
 914 
 915                 rr->rr_mp = mp;
 916 
 917                 /*
 918                  * Set up io vector
 919                  */
 920                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 921                 iov.iov_len = ra->ra_count;
 922         }
 923 
 924         uio.uio_iov = &iov;
 925         uio.uio_iovcnt = 1;
 926         uio.uio_segflg = UIO_SYSSPACE;
 927         uio.uio_extflg = UIO_COPY_CACHED;
 928         uio.uio_loffset = (offset_t)ra->ra_offset;
 929         uio.uio_resid = ra->ra_count;
 930 
 931         error = VOP_READ(vp, &uio, 0, cr, &ct);
 932 
 933         if (error) {
 934                 if (mp)
 935                         freeb(mp);
 936 
 937                 /*
 938                  * check if a monitor detected a delegation conflict and
 939                  * mark as wouldblock so response is dropped
 940                  */
 941                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 942                         curthread->t_flag |= T_WOULDBLOCK;
 943                 else
 944                         rr->rr_status = puterrno(error);
 945 
 946                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 947                 if (in_crit)
 948                         nbl_end_crit(vp);
 949 
 950                 VN_RELE(vp);
 951                 rr->rr_data = NULL;
 952 
 953                 return;
 954         }
 955 
 956         /*
 957          * Get attributes again so we can send the latest access
 958          * time to the client side for its cache.
 959          */
 960         va.va_mask = AT_ALL;
 961 
 962         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 963 
 964         if (error) {
 965                 if (mp)
 966                         freeb(mp);
 967 
 968                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 969                 if (in_crit)
 970                         nbl_end_crit(vp);
 971 
 972                 VN_RELE(vp);
 973                 rr->rr_data = NULL;
 974                 rr->rr_status = puterrno(error);
 975 
 976                 return;
 977         }
 978 
 979         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 980 
 981         if (mp) {
 982                 rr->rr_data = (char *)mp->b_datap->db_base;
 983         } else {
 984                 if (ra->ra_wlist) {
 985                         rr->rr_data = (caddr_t)iov.iov_base;
 986                         if (!rdma_setup_read_data2(ra, rr)) {
 987                                 rr->rr_data = NULL;
 988                                 rr->rr_status = puterrno(NFSERR_INVAL);
 989                         }
 990                 }
 991         }
 992 done:
 993         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 994         if (in_crit)
 995                 nbl_end_crit(vp);
 996 
 997         acl_perm(vp, exi, &va, cr);
 998 
 999         /* check for overflows */
1000         error = vattr_to_nattr(&va, &rr->rr_attr);
1001 
1002         VN_RELE(vp);
1003 
1004         rr->rr_status = puterrno(error);
1005 }
1006 
1007 /*
1008  * Free data allocated by rfs_read
1009  */
1010 void
1011 rfs_rdfree(struct nfsrdresult *rr)
1012 {
1013         mblk_t *mp;
1014 
1015         if (rr->rr_status == NFS_OK) {
1016                 mp = rr->rr_mp;
1017                 if (mp != NULL)
1018                         freeb(mp);
1019         }
1020 }
1021 
1022 void *
1023 rfs_read_getfh(struct nfsreadargs *ra)
1024 {
1025         return (&ra->ra_fhandle);
1026 }
1027 
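      /*
       * Number of iovec entries kept on the stack in rfs_write_sync();
       * larger requests fall back to a kmem_alloc'd iovec array.
       */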
1028 #define MAX_IOVECS      12
1029 
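      /*
       * DEBUG counters: how often a synchronous write fit in the on-stack
       * iovec array (hits) versus needing a kmem_alloc'd array (misses).
       */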
1030 #ifdef DEBUG
1031 static int rfs_write_sync_hits = 0;
1032 static int rfs_write_sync_misses = 0;
1033 #endif
1034 
1035 /*
1036  * Write data to file.
1037  * Returns attributes of a file after writing some data to it.
1038  *
 1039  * Any changes made here, especially in error handling, might also
 1040  * have to be made in rfs_write (which clusters write requests).
1041  */
1042 /* ARGSUSED */
1043 void
1044 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1045     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1046 {
1047         int error;
1048         vnode_t *vp;
1049         rlim64_t rlimit;
1050         struct vattr va;
1051         struct uio uio;
1052         struct iovec iov[MAX_IOVECS];
1053         mblk_t *m;
1054         struct iovec *iovp;
1055         int iovcnt;
1056         cred_t *savecred;
1057         int in_crit = 0;
1058         caller_context_t ct;
1059 
1060         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1061         if (vp == NULL) {
1062                 ns->ns_status = NFSERR_STALE;
1063                 return;
1064         }
1065 
1066         if (rdonly(ro, vp)) {
1067                 VN_RELE(vp);
1068                 ns->ns_status = NFSERR_ROFS;
1069                 return;
1070         }
1071 
1072         if (vp->v_type != VREG) {
1073                 VN_RELE(vp);
1074                 ns->ns_status = NFSERR_ISDIR;
1075                 return;
1076         }
1077 
1078         ct.cc_sysid = 0;
1079         ct.cc_pid = 0;
1080         ct.cc_caller_id = nfs2_srv_caller_id;
1081         ct.cc_flags = CC_DONTBLOCK;
1082 
1083         va.va_mask = AT_UID|AT_MODE;
1084 
1085         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1086 
1087         if (error) {
1088                 VN_RELE(vp);
1089                 ns->ns_status = puterrno(error);
1090 
1091                 return;
1092         }
1093 
1094         if (crgetuid(cr) != va.va_uid) {
1095                 /*
1096                  * This is a kludge to allow writes of files created
1097                  * with read only permission.  The owner of the file
1098                  * is always allowed to write it.
1099                  */
1100                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1101 
1102                 if (error) {
1103                         VN_RELE(vp);
1104                         ns->ns_status = puterrno(error);
1105                         return;
1106                 }
1107         }
1108 
1109         /*
1110          * Can't access a mandatory lock file.  This might cause
1111          * the NFS service thread to block forever waiting for a
1112          * lock to be released that will never be released.
1113          */
1114         if (MANDLOCK(vp, va.va_mode)) {
1115                 VN_RELE(vp);
1116                 ns->ns_status = NFSERR_ACCES;
1117                 return;
1118         }
1119 
1120         /*
1121          * We have to enter the critical region before calling VOP_RWLOCK
1122          * to avoid a deadlock with ufs.
1123          */
1124         if (nbl_need_check(vp)) {
1125                 nbl_start_crit(vp, RW_READER);
1126                 in_crit = 1;
1127                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1128                     wa->wa_count, 0, NULL)) {
1129                         error = EACCES;
1130                         goto out;
1131                 }
1132         }
1133 
1134         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1135 
1136         /* check if a monitor detected a delegation conflict */
1137         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1138                 goto out;
1139         }
1140 
1141         if (wa->wa_data || wa->wa_rlist) {
1142                 /* Do the RDMA thing if necessary */
1143                 if (wa->wa_rlist) {
1144                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1145                         iov[0].iov_len = wa->wa_count;
1146                 } else  {
1147                         iov[0].iov_base = wa->wa_data;
1148                         iov[0].iov_len = wa->wa_count;
1149                 }
1150                 uio.uio_iov = iov;
1151                 uio.uio_iovcnt = 1;
1152                 uio.uio_segflg = UIO_SYSSPACE;
1153                 uio.uio_extflg = UIO_COPY_DEFAULT;
1154                 uio.uio_loffset = (offset_t)wa->wa_offset;
1155                 uio.uio_resid = wa->wa_count;
1156                 /*
1157                  * The limit is checked on the client. We
1158                  * should allow any size writes here.
1159                  */
1160                 uio.uio_llimit = curproc->p_fsz_ctl;
1161                 rlimit = uio.uio_llimit - wa->wa_offset;
1162                 if (rlimit < (rlim64_t)uio.uio_resid)
1163                         uio.uio_resid = (uint_t)rlimit;
1164 
1165                 /*
 1166                  * For now we assume no append mode.
1167                  */
1168                 /*
1169                  * We're changing creds because VM may fault and we need
1170                  * the cred of the current thread to be used if quota
1171                  * checking is enabled.
1172                  */
1173                 savecred = curthread->t_cred;
1174                 curthread->t_cred = cr;
1175                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1176                 curthread->t_cred = savecred;
1177         } else {
1178 
1179                 iovcnt = 0;
1180                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1181                         iovcnt++;
1182                 if (iovcnt <= MAX_IOVECS) {
1183 #ifdef DEBUG
1184                         rfs_write_sync_hits++;
1185 #endif
1186                         iovp = iov;
1187                 } else {
1188 #ifdef DEBUG
1189                         rfs_write_sync_misses++;
1190 #endif
1191                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1192                 }
1193                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1194                 uio.uio_iov = iovp;
1195                 uio.uio_iovcnt = iovcnt;
1196                 uio.uio_segflg = UIO_SYSSPACE;
1197                 uio.uio_extflg = UIO_COPY_DEFAULT;
1198                 uio.uio_loffset = (offset_t)wa->wa_offset;
1199                 uio.uio_resid = wa->wa_count;
1200                 /*
1201                  * The limit is checked on the client. We
1202                  * should allow any size writes here.
1203                  */
1204                 uio.uio_llimit = curproc->p_fsz_ctl;
1205                 rlimit = uio.uio_llimit - wa->wa_offset;
1206                 if (rlimit < (rlim64_t)uio.uio_resid)
1207                         uio.uio_resid = (uint_t)rlimit;
1208 
1209                 /*
1210                  * For now we assume no append mode.
1211                  */
1212                 /*
1213                  * We're changing creds because VM may fault and we need
1214                  * the cred of the current thread to be used if quota
1215                  * checking is enabled.
1216                  */
1217                 savecred = curthread->t_cred;
1218                 curthread->t_cred = cr;
1219                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1220                 curthread->t_cred = savecred;
1221 
1222                 if (iovp != iov)
1223                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1224         }
1225 
1226         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1227 
1228         if (!error) {
1229                 /*
1230                  * Get attributes again so we send the latest mod
1231                  * time to the client side for its cache.
1232                  */
1233                 va.va_mask = AT_ALL;    /* now we want everything */
1234 
1235                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1236 
1237                 /* check for overflows */
1238                 if (!error) {
1239                         acl_perm(vp, exi, &va, cr);
1240                         error = vattr_to_nattr(&va, &ns->ns_attr);
1241                 }
1242         }
1243 
1244 out:
1245         if (in_crit)
1246                 nbl_end_crit(vp);
1247         VN_RELE(vp);
1248 
1249         /* check if a monitor detected a delegation conflict */
1250         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1251                 /* mark as wouldblock so response is dropped */
1252                 curthread->t_flag |= T_WOULDBLOCK;
1253         else
1254                 ns->ns_status = puterrno(error);
1255 
1256 }
1257 
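      /*
       * A single pending write request within a write cluster.  Each request
       * carries its own arguments, result, credentials, and waiting thread.
       */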
1258 struct rfs_async_write {
1259         struct nfswriteargs *wa;
1260         struct nfsattrstat *ns;
1261         struct svc_req *req;
1262         cred_t *cr;
1263         bool_t ro;
1264         kthread_t *thread;
1265         struct rfs_async_write *list;
1266 };
1267 
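      /*
       * A cluster of pending write requests for one file, identified by fhp.
       * Clusters are linked via 'next' on the per-zone async write list, and
       * waiters are woken through 'cv' once the cluster has been processed.
       */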
1268 struct rfs_async_write_list {
1269         fhandle_t *fhp;
1270         kcondvar_t cv;
1271         struct rfs_async_write *list;
1272         struct rfs_async_write_list *next;
1273 };
1274 
1275 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1276 static kmutex_t rfs_async_write_lock;
1277 static int rfs_write_async = 1; /* enables write clustering if == 1 */
1278 
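      /*
       * MAXCLIOVECS is the number of on-stack iovec entries used when writing
       * out a cluster; RFSWRITE_INITVAL marks requests not yet processed.
       */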
1279 #define MAXCLIOVECS     42
1280 #define RFSWRITE_INITVAL (enum nfsstat) -1
1281 
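      /*
       * DEBUG counters: how often a clustered write fit in the on-stack
       * iovec array (hits) versus needing a kmem_alloc'd array (misses).
       */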
1282 #ifdef DEBUG
1283 static int rfs_write_hits = 0;
1284 static int rfs_write_misses = 0;
1285 #endif
1286 
1287 /*
1288  * Write data to file.
1289  * Returns attributes of a file after writing some data to it.
1290  */
1291 void
1292 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1293     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1294 {
1295         int error;
1296         vnode_t *vp;
1297         rlim64_t rlimit;
1298         struct vattr va;
1299         struct uio uio;
1300         struct rfs_async_write_list *lp;
1301         struct rfs_async_write_list *nlp;
1302         struct rfs_async_write *rp;
1303         struct rfs_async_write *nrp;
1304         struct rfs_async_write *trp;
1305         struct rfs_async_write *lrp;
1306         int data_written;
1307         int iovcnt;
1308         mblk_t *m;
1309         struct iovec *iovp;
1310         struct iovec *niovp;
1311         struct iovec iov[MAXCLIOVECS];
1312         int count;
1313         int rcount;
1314         uint_t off;
1315         uint_t len;
1316         struct rfs_async_write nrpsp;
1317         struct rfs_async_write_list nlpsp;
1318         ushort_t t_flag;
1319         cred_t *savecred;
1320         int in_crit = 0;
1321         caller_context_t ct;
1322         nfs_srv_t *nsrv;
1323 
1324         ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
1325         nsrv = nfs_get_srv();
1326         if (!nsrv->write_async) {
1327                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1328                 return;
1329         }
1330 
1331         /*
 1332          * Initialize status to RFSWRITE_INITVAL instead of 0, since a value
 1333          * of 0 is considered OK.
1334          */
1335         ns->ns_status = RFSWRITE_INITVAL;
1336 
1337         nrp = &nrpsp;
1338         nrp->wa = wa;
1339         nrp->ns = ns;
1340         nrp->req = req;
1341         nrp->cr = cr;
1342         nrp->ro = ro;
1343         nrp->thread = curthread;
1344 
1345         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1346 
1347         /*
1348          * Look to see if there is already a cluster started
1349          * for this file.
1350          */
1351         mutex_enter(&nsrv->async_write_lock);
1352         for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1353                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1354                     sizeof (fhandle_t)) == 0)
1355                         break;
1356         }
1357 
1358         /*
1359          * If lp is non-NULL, then there is already a cluster
1360          * started.  We need to place ourselves in the cluster
1361          * list in the right place as determined by starting
1362          * offset.  Conflicts with non-blocking mandatory locked
1363          * regions will be checked when the cluster is processed.
1364          */
1365         if (lp != NULL) {
1366                 rp = lp->list;
1367                 trp = NULL;
1368                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1369                         trp = rp;
1370                         rp = rp->list;
1371                 }
1372                 nrp->list = rp;
1373                 if (trp == NULL)
1374                         lp->list = nrp;
1375                 else
1376                         trp->list = nrp;
1377                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1378                         cv_wait(&lp->cv, &nsrv->async_write_lock);
1379                 mutex_exit(&nsrv->async_write_lock);
1380 
1381                 return;
1382         }
1383 
1384         /*
1385          * No cluster started yet, start one and add ourselves
1386          * to the list of clusters.
1387          */
1388         nrp->list = NULL;
1389 
1390         nlp = &nlpsp;
1391         nlp->fhp = &wa->wa_fhandle;
1392         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1393         nlp->list = nrp;
1394         nlp->next = NULL;
1395 
1396         if (nsrv->async_write_head == NULL) {
1397                 nsrv->async_write_head = nlp;
1398         } else {
1399                 lp = nsrv->async_write_head;
1400                 while (lp->next != NULL)
1401                         lp = lp->next;
1402                 lp->next = nlp;
1403         }
1404         mutex_exit(&nsrv->async_write_lock);
1405 
1406         /*
1407          * Convert the file handle common to all of the requests
1408          * in this cluster to a vnode.
1409          */
1410         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1411         if (vp == NULL) {
1412                 mutex_enter(&nsrv->async_write_lock);
1413                 if (nsrv->async_write_head == nlp)
1414                         nsrv->async_write_head = nlp->next;
1415                 else {
1416                         lp = nsrv->async_write_head;
1417                         while (lp->next != nlp)
1418                                 lp = lp->next;
1419                         lp->next = nlp->next;
1420                 }
1421                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1422                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1423                         rp->ns->ns_status = NFSERR_STALE;
1424                         rp->thread->t_flag |= t_flag;
1425                 }
1426                 cv_broadcast(&nlp->cv);
1427                 mutex_exit(&nsrv->async_write_lock);
1428 
1429                 return;
1430         }
1431 
1432         /*
1433          * Can only write regular files.  Attempts to write any
1434          * other file types fail with EISDIR.
1435          */
1436         if (vp->v_type != VREG) {
1437                 VN_RELE(vp);
1438                 mutex_enter(&nsrv->async_write_lock);
1439                 if (nsrv->async_write_head == nlp)
1440                         nsrv->async_write_head = nlp->next;
1441                 else {
1442                         lp = nsrv->async_write_head;
1443                         while (lp->next != nlp)
1444                                 lp = lp->next;
1445                         lp->next = nlp->next;
1446                 }
1447                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1448                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1449                         rp->ns->ns_status = NFSERR_ISDIR;
1450                         rp->thread->t_flag |= t_flag;
1451                 }
1452                 cv_broadcast(&nlp->cv);
1453                 mutex_exit(&nsrv->async_write_lock);
1454 
1455                 return;
1456         }
1457 
1458         /*
1459          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1460          * deadlock with ufs.
1461          */
1462         if (nbl_need_check(vp)) {
1463                 nbl_start_crit(vp, RW_READER);
1464                 in_crit = 1;
1465         }
1466 
1467         ct.cc_sysid = 0;
1468         ct.cc_pid = 0;
1469         ct.cc_caller_id = nfs2_srv_caller_id;
1470         ct.cc_flags = CC_DONTBLOCK;
1471 
1472         /*
1473          * Lock the file for writing.  This operation provides
1474          * the delay which allows clusters to grow.
1475          */
1476         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1477 
1478         /* check if a monitor detected a delegation conflict */
1479         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1480                 if (in_crit)
1481                         nbl_end_crit(vp);
1482                 VN_RELE(vp);
1483                 /* mark as wouldblock so response is dropped */
1484                 curthread->t_flag |= T_WOULDBLOCK;
1485                 mutex_enter(&nsrv->async_write_lock);
1486                 if (nsrv->async_write_head == nlp)
1487                         nsrv->async_write_head = nlp->next;
1488                 else {
1489                         lp = nsrv->async_write_head;
1490                         while (lp->next != nlp)
1491                                 lp = lp->next;
1492                         lp->next = nlp->next;
1493                 }
1494                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1495                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1496                                 rp->ns->ns_status = puterrno(error);
1497                                 rp->thread->t_flag |= T_WOULDBLOCK;
1498                         }
1499                 }
1500                 cv_broadcast(&nlp->cv);
1501                 mutex_exit(&nsrv->async_write_lock);
1502 
1503                 return;
1504         }
1505 
1506         /*
1507          * Disconnect this cluster from the list of clusters.
1508          * The cluster that is being dealt with must be fixed
1509          * in size after this point, so there is no reason
1510          * to leave it on the list so that new requests can
1511          * find it.
1512          *
1513          * The algorithm is that the first write request will
1514          * create a cluster, convert the file handle to a
1515          * vnode pointer, and then lock the file for writing.
1516          * This request is not likely to be clustered with
1517          * any others.  However, the next request will create
1518          * a new cluster and be blocked in VOP_RWLOCK while
1519          * the first request is being processed.  This delay
1520          * will allow more requests to be clustered in this
1521          * second cluster.
1522          */
1523         mutex_enter(&nsrv->async_write_lock);
1524         if (nsrv->async_write_head == nlp)
1525                 nsrv->async_write_head = nlp->next;
1526         else {
1527                 lp = nsrv->async_write_head;
1528                 while (lp->next != nlp)
1529                         lp = lp->next;
1530                 lp->next = nlp->next;
1531         }
1532         mutex_exit(&nsrv->async_write_lock);
1533 
1534         /*
1535          * Step through the list of requests in this cluster.
1536          * We need to check permissions to make sure that all
1537          * of the requests have sufficient permission to write
1538          * the file.  A cluster can be composed of requests
1539          * from different clients and different users on each
1540          * client.
1541          *
1542          * As a side effect, we also calculate the size of the
1543          * byte range that this cluster encompasses.
1544          */
1545         rp = nlp->list;
1546         off = rp->wa->wa_offset;
1547         len = (uint_t)0;
1548         do {
1549                 if (rdonly(rp->ro, vp)) {
1550                         rp->ns->ns_status = NFSERR_ROFS;
1551                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1552                         rp->thread->t_flag |= t_flag;
1553                         continue;
1554                 }
1555 
1556                 va.va_mask = AT_UID|AT_MODE;
1557 
1558                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1559 
1560                 if (!error) {
1561                         if (crgetuid(rp->cr) != va.va_uid) {
1562                                 /*
1563                                  * This is a kludge to allow writes of files
1564                                  * created with read only permission.  The
1565                                  * owner of the file is always allowed to
1566                                  * write it.
1567                                  */
1568                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1569                         }
1570                         if (!error && MANDLOCK(vp, va.va_mode))
1571                                 error = EACCES;
1572                 }
1573 
1574                 /*
1575                  * Check for a conflict with a nbmand-locked region.
1576                  */
1577                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1578                     rp->wa->wa_count, 0, NULL)) {
1579                         error = EACCES;
1580                 }
1581 
1582                 if (error) {
1583                         rp->ns->ns_status = puterrno(error);
1584                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1585                         rp->thread->t_flag |= t_flag;
1586                         continue;
1587                 }
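                     /*
                      * Track how far past the starting offset the cluster
                      * reaches; this extent is flushed to stable storage
                      * with VOP_PUTPAGE once all of the writes are done.
                      */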
1588                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1589                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1590         } while ((rp = rp->list) != NULL);
1591 
1592         /*
1593          * Step through the cluster attempting to gather as many
1594          * contiguous requests as possible.  These contiguous
1595          * requests are handled via one call to VOP_WRITE instead
1596          * of a separate call per request.  We also keep track of
1597          * whether any data was written.
1598          */
1599         rp = nlp->list;
1600         data_written = 0;
1601         do {
1602                 /*
1603                  * Skip any requests which are already marked as having an
1604                  * error.
1605                  */
1606                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1607                         rp = rp->list;
1608                         continue;
1609                 }
1610 
1611                 /*
1612                  * Count the number of iovec's which are required
1613                  * to handle this set of requests.  One iovec is
1614                  * needed for each data buffer, whether addressed
1615                  * by wa_data or by the b_rptr pointers in the
1616                  * mblk chains.
1617                  */
1618                 iovcnt = 0;
1619                 lrp = rp;
1620                 for (;;) {
1621                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1622                                 iovcnt++;
1623                         else {
1624                                 m = lrp->wa->wa_mblk;
1625                                 while (m != NULL) {
1626                                         iovcnt++;
1627                                         m = m->b_cont;
1628                                 }
1629                         }
1630                         if (lrp->list == NULL ||
1631                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1632                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1633                             lrp->list->wa->wa_offset) {
1634                                 lrp = lrp->list;
1635                                 break;
1636                         }
1637                         lrp = lrp->list;
1638                 }
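                     /*
                      * lrp now points just past the last request in the run
                      * of contiguous, still-unprocessed requests beginning
                      * at rp; everything in [rp, lrp) is written below with
                      * a single VOP_WRITE.
                      */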
1639 
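                     /*
                      * Use the preallocated iovec array (iov) when it is
                      * large enough; otherwise fall back to a temporary
                      * kmem allocation that is freed once the write is done.
                      */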
1640                 if (iovcnt <= MAXCLIOVECS) {
1641 #ifdef DEBUG
1642                         rfs_write_hits++;
1643 #endif
1644                         niovp = iov;
1645                 } else {
1646 #ifdef DEBUG
1647                         rfs_write_misses++;
1648 #endif
1649                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1650                 }
1651                 /*
1652                  * Put together the scatter/gather iovecs.
1653                  */
1654                 iovp = niovp;
1655                 trp = rp;
1656                 count = 0;
1657                 do {
1658                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1659                                 if (trp->wa->wa_rlist) {
1660                                         iovp->iov_base =
1661                                             (char *)((trp->wa->wa_rlist)->
1662                                             u.c_daddr3);
1663                                         iovp->iov_len = trp->wa->wa_count;
1664                                 } else  {
1665                                         iovp->iov_base = trp->wa->wa_data;
1666                                         iovp->iov_len = trp->wa->wa_count;
1667                                 }
1668                                 iovp++;
1669                         } else {
1670                                 m = trp->wa->wa_mblk;
1671                                 rcount = trp->wa->wa_count;
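                                     /*
                                      * One iovec per mblk in the chain; clip
                                      * the final iovec so the total gathered
                                      * never exceeds wa_count.
                                      */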
1672                                 while (m != NULL) {
1673                                         iovp->iov_base = (caddr_t)m->b_rptr;
1674                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1675                                         rcount -= iovp->iov_len;
1676                                         if (rcount < 0)
1677                                                 iovp->iov_len += rcount;
1678                                         iovp++;
1679                                         if (rcount <= 0)
1680                                                 break;
1681                                         m = m->b_cont;
1682                                 }
1683                         }
1684                         count += trp->wa->wa_count;
1685                         trp = trp->list;
1686                 } while (trp != lrp);
1687 
1688                 uio.uio_iov = niovp;
1689                 uio.uio_iovcnt = iovcnt;
1690                 uio.uio_segflg = UIO_SYSSPACE;
1691                 uio.uio_extflg = UIO_COPY_DEFAULT;
1692                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1693                 uio.uio_resid = count;
1694                 /*
1695                  * The limit is checked on the client. We
1696                  * should allow any size writes here.
1697                  */
1698                 uio.uio_llimit = curproc->p_fsz_ctl;
1699                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1700                 if (rlimit < (rlim64_t)uio.uio_resid)
1701                         uio.uio_resid = (uint_t)rlimit;
1702 
1703                 /*
1704                  * For now we assume no append mode.
1705                  */
1706 
1707                 /*
1708                  * We're changing creds because VM may fault
1709                  * and we need the cred of the current
1710                  * thread to be used if quota checking is
1711                  * enabled.
1712                  */
1713                 savecred = curthread->t_cred;
1714                 curthread->t_cred = cr;
1715                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1716                 curthread->t_cred = savecred;
1717 
1718                 /* check if a monitor detected a delegation conflict */
1719                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1720                         /* mark as wouldblock so response is dropped */
1721                         curthread->t_flag |= T_WOULDBLOCK;
1722 
1723                 if (niovp != iov)
1724                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1725 
1726                 if (!error) {
1727                         data_written = 1;
1728                         /*
1729                          * Get attributes again so we send the latest mod
1730                          * time to the client side for its cache.
1731                          */
1732                         va.va_mask = AT_ALL;    /* now we want everything */
1733 
1734                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1735 
1736                         if (!error)
1737                                 acl_perm(vp, exi, &va, rp->cr);
1738                 }
1739 
1740                 /*
1741                  * Fill in the status responses for each request
1742                  * which was just handled.  Also, copy the latest
1743                  * attributes into the attribute responses if
1744                  * appropriate.
1745                  */
1746                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1747                 do {
1748                         rp->thread->t_flag |= t_flag;
1749                         /* check for overflows */
1750                         if (!error) {
1751                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1752                         }
1753                         rp->ns->ns_status = puterrno(error);
1754                         rp = rp->list;
1755                 } while (rp != lrp);
1756         } while (rp != NULL);
1757 
1758         /*
1759          * If any data was written at all, then we need to flush
1760          * the data and metadata to stable storage.
1761          */
1762         if (data_written) {
1763                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1764 
1765                 if (!error) {
1766                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1767                 }
1768         }
1769 
1770         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1771 
1772         if (in_crit)
1773                 nbl_end_crit(vp);
1774         VN_RELE(vp);
1775 
1776         t_flag = curthread->t_flag & T_WOULDBLOCK;
1777         mutex_enter(&nsrv->async_write_lock);
1778         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1779                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1780                         rp->ns->ns_status = puterrno(error);
1781                         rp->thread->t_flag |= t_flag;
1782                 }
1783         }
1784         cv_broadcast(&nlp->cv);
1785         mutex_exit(&nsrv->async_write_lock);
1786 
1787 }
1788 
1789 void *
1790 rfs_write_getfh(struct nfswriteargs *wa)
1791 {
1792         return (&wa->wa_fhandle);
1793 }
1794 
1795 /*
1796  * Create a file.
1797  * Creates a file with given attributes and returns those attributes
1798  * and an fhandle for the new file.
1799  */
1800 void
1801 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1802     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1803 {
1804         int error;
1805         int lookuperr;
1806         int in_crit = 0;
1807         struct vattr va;
1808         vnode_t *vp;
1809         vnode_t *realvp;
1810         vnode_t *dvp;
1811         char *name = args->ca_da.da_name;
1812         vnode_t *tvp = NULL;
1813         int mode;
1814         int lookup_ok;
1815         bool_t trunc;
1816         struct sockaddr *ca;
1817 
1818         /*
1819          * Disallow NULL paths
1820          */
1821         if (name == NULL || *name == '\0') {
1822                 dr->dr_status = NFSERR_ACCES;
1823                 return;
1824         }
1825 
1826         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1827         if (dvp == NULL) {
1828                 dr->dr_status = NFSERR_STALE;
1829                 return;
1830         }
1831 
1832         error = sattr_to_vattr(args->ca_sa, &va);
1833         if (error) {
                     VN_RELE(dvp);
1834                 dr->dr_status = puterrno(error);
1835                 return;
1836         }
1837 
1838         /*
1839          * Must specify the mode.
1840          */
1841         if (!(va.va_mask & AT_MODE)) {
1842                 VN_RELE(dvp);
1843                 dr->dr_status = NFSERR_INVAL;
1844                 return;
1845         }
1846 
1847         /*
1848          * This is a completely gross hack to make mknod
1849          * work over the wire until we can whack the protocol.
1850          */
1851         if ((va.va_mode & IFMT) == IFCHR) {
1852                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1853                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1854                 else {
1855                         va.va_type = VCHR;
1856                         /*
1857                          * uncompress the received dev_t
1858                          * if the top half is zero indicating a request
1859                          * from an `older style' OS.
1860                          */
1861                         if ((va.va_size & 0xffff0000) == 0)
1862                                 va.va_rdev = nfsv2_expdev(va.va_size);
1863                         else
1864                                 va.va_rdev = (dev_t)va.va_size;
1865                 }
1866                 va.va_mask &= ~AT_SIZE;
1867         } else if ((va.va_mode & IFMT) == IFBLK) {
1868                 va.va_type = VBLK;
1869                 /*
1870                  * uncompress the received dev_t
1871                  * if the top half is zero indicating a request
1872                  * from an `older style' OS.
1873                  */
1874                 if ((va.va_size & 0xffff0000) == 0)
1875                         va.va_rdev = nfsv2_expdev(va.va_size);
1876                 else
1877                         va.va_rdev = (dev_t)va.va_size;
1878                 va.va_mask &= ~AT_SIZE;
1879         } else if ((va.va_mode & IFMT) == IFSOCK) {
1880                 va.va_type = VSOCK;
1881         } else {
1882                 va.va_type = VREG;
1883         }
1884         va.va_mode &= ~IFMT;
1885         va.va_mask |= AT_TYPE;
1886 
1887         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1888         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1889             MAXPATHLEN);
1890         if (name == NULL) {
                     VN_RELE(dvp);
1891                 dr->dr_status = puterrno(EINVAL);
1892                 return;
1893         }
1894 
1895         /*
1896          * Why was the choice made to use VWRITE as the mode to the
1897          * call to VOP_CREATE?  This results in a bug:  when a client
1898          * opens a file that already exists and is RDONLY, the second
1899          * open fails with EACCES because of the mode.
1900          * See bug ID 1054648.
1901          */
1902         lookup_ok = 0;
1903         mode = VWRITE;
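             /*
              * When this create will not be setting the size of a regular
              * file (i.e. no truncation), look the name up first and base
              * the access mode on the owner-write bit of any existing
              * file, so that re-creating an existing read-only file does
              * not fail with EACCES.
              */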
1904         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1905                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1906                     NULL, NULL, NULL);
1907                 if (!error) {
1908                         struct vattr at;
1909 
1910                         lookup_ok = 1;
1911                         at.va_mask = AT_MODE;
1912                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1913                         if (!error)
1914                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1915                         VN_RELE(tvp);
1916                         tvp = NULL;
1917                 }
1918         }
1919 
1920         if (!lookup_ok) {
1921                 if (rdonly(ro, dvp)) {
1922                         error = EROFS;
1923                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1924                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1925                         error = EPERM;
1926                 } else {
1927                         error = 0;
1928                 }
1929         }
1930 
1931         /*
1932          * If file size is being modified on an already existing file
1933          * make sure that there are no conflicting non-blocking mandatory
1934          * locks in the region being manipulated. Return EACCES if there
1935          * are conflicting locks.
1936          */
1937         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1938                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1939                     NULL, NULL, NULL);
1940 
1941                 if (!lookuperr &&
1942                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1943                         VN_RELE(tvp);
1944                         curthread->t_flag |= T_WOULDBLOCK;
1945                         goto out;
1946                 }
1947 
1948                 if (!lookuperr && nbl_need_check(tvp)) {
1949                         /*
1950                          * The file exists. Now check if it has any
1951                          * conflicting non-blocking mandatory locks
1952                          * in the region being changed.
1953                          */
1954                         struct vattr bva;
1955                         u_offset_t offset;
1956                         ssize_t length;
1957 
1958                         nbl_start_crit(tvp, RW_READER);
1959                         in_crit = 1;
1960 
1961                         bva.va_mask = AT_SIZE;
1962                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1963                         if (!error) {
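                                     /*
                                      * The affected region starts at the
                                      * smaller of the old and new sizes and
                                      * is as long as the difference between
                                      * them.
                                      */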
1964                                 if (va.va_size < bva.va_size) {
1965                                         offset = va.va_size;
1966                                         length = bva.va_size - va.va_size;
1967                                 } else {
1968                                         offset = bva.va_size;
1969                                         length = va.va_size - bva.va_size;
1970                                 }
1971                                 if (length) {
1972                                         if (nbl_conflict(tvp, NBL_WRITE,
1973                                             offset, length, 0, NULL)) {
1974                                                 error = EACCES;
1975                                         }
1976                                 }
1977                         }
1978                         if (error) {
1979                                 nbl_end_crit(tvp);
1980                                 VN_RELE(tvp);
1981                                 in_crit = 0;
1982                         }
1983                 } else if (tvp != NULL) {
1984                         VN_RELE(tvp);
1985                 }
1986         }
1987 
1988         if (!error) {
1989                 /*
1990                  * If the filesystem is shared with nosuid then remove any
1991                  * setuid/setgid bits on create.
1992                  */
1993                 if (va.va_type == VREG &&
1994                     exi->exi_export.ex_flags & EX_NOSUID)
1995                         va.va_mode &= ~(VSUID | VSGID);
1996 
1997                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1998                     NULL, NULL);
1999 
2000                 if (!error) {
2001 
2002                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2003                                 trunc = TRUE;
2004                         else
2005                                 trunc = FALSE;
2006 
2007                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2008                                 VN_RELE(vp);
2009                                 curthread->t_flag |= T_WOULDBLOCK;
2010                                 goto out;
2011                         }
2012                         va.va_mask = AT_ALL;
2013 
2014                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2015 
2016                         /* check for overflows */
2017                         if (!error) {
2018                                 acl_perm(vp, exi, &va, cr);
2019                                 error = vattr_to_nattr(&va, &dr->dr_attr);
2020                                 if (!error) {
2021                                         error = makefh(&dr->dr_fhandle, vp,
2022                                             exi);
2023                                 }
2024                         }
2025                         /*
2026                          * Force modified metadata out to stable storage.
2027                          *
2028                          * if an underlying vp exists, pass it to VOP_FSYNC
2029                          */
2030                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
2031                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2032                         else
2033                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2034                         VN_RELE(vp);
2035                 }
2036 
2037                 if (in_crit) {
2038                         nbl_end_crit(tvp);
2039                         VN_RELE(tvp);
2040                 }
2041         }
2042 
2043         /*
2044          * Force modified data and metadata out to stable storage.
2045          */
2046         (void) VOP_FSYNC(dvp, 0, cr, NULL);
2047 
2048 out:
2049 
2050         VN_RELE(dvp);
2051 
2052         dr->dr_status = puterrno(error);
2053 
2054         if (name != args->ca_da.da_name)
2055                 kmem_free(name, MAXPATHLEN);
2056 }
2057 void *
2058 rfs_create_getfh(struct nfscreatargs *args)
2059 {
2060         return (args->ca_da.da_fhandle);
2061 }
2062 
2063 /*
2064  * Remove a file.
2065  * Remove named file from parent directory.
2066  */
2067 /* ARGSUSED */
2068 void
2069 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2070     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2071 {
2072         int error = 0;
2073         vnode_t *vp;
2074         vnode_t *targvp;
2075         int in_crit = 0;
2076 
2077         /*
2078          * Disallow NULL paths
2079          */
2080         if (da->da_name == NULL || *da->da_name == '\0') {
2081                 *status = NFSERR_ACCES;
2082                 return;
2083         }
2084 
2085         vp = nfs_fhtovp(da->da_fhandle, exi);
2086         if (vp == NULL) {
2087                 *status = NFSERR_STALE;
2088                 return;
2089         }
2090 
2091         if (rdonly(ro, vp)) {
2092                 VN_RELE(vp);
2093                 *status = NFSERR_ROFS;
2094                 return;
2095         }
2096 
2097         /*
2098          * Check for a conflict with a non-blocking mandatory share reservation.
2099          */
2100         error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2101             NULL, cr, NULL, NULL, NULL);
2102         if (error != 0) {
2103                 VN_RELE(vp);
2104                 *status = puterrno(error);
2105                 return;
2106         }
2107 
2108         /*
2109          * If the file is delegated to a v4 client, then initiate
2110          * recall and drop this request (by setting T_WOULDBLOCK).
2111          * The client will eventually re-transmit the request and
2112          * (hopefully), by then, the v4 client will have returned
2113          * the delegation.
2114          */
2115 
2116         if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2117                 VN_RELE(vp);
2118                 VN_RELE(targvp);
2119                 curthread->t_flag |= T_WOULDBLOCK;
2120                 return;
2121         }
2122 
2123         if (nbl_need_check(targvp)) {
2124                 nbl_start_crit(targvp, RW_READER);
2125                 in_crit = 1;
2126                 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2127                         error = EACCES;
2128                         goto out;
2129                 }
2130         }
2131 
2132         error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2133 
2134         /*
2135          * Force modified data and metadata out to stable storage.
2136          */
2137         (void) VOP_FSYNC(vp, 0, cr, NULL);
2138 
2139 out:
2140         if (in_crit)
2141                 nbl_end_crit(targvp);
2142         VN_RELE(targvp);
2143         VN_RELE(vp);
2144 
2145         *status = puterrno(error);
2146 
2147 }
2148 
2149 void *
2150 rfs_remove_getfh(struct nfsdiropargs *da)
2151 {
2152         return (da->da_fhandle);
2153 }
2154 
2155 /*
2156  * Rename a file.
2157  * Give a file (from) a new name (to).
2158  */
2159 /* ARGSUSED */
2160 void
2161 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2162     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2163 {
2164         int error = 0;
2165         vnode_t *fromvp;
2166         vnode_t *tovp;
2167         struct exportinfo *to_exi;
2168         fhandle_t *fh;
2169         vnode_t *srcvp;
2170         vnode_t *targvp;
2171         int in_crit = 0;
2172 
2173         fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2174         if (fromvp == NULL) {
2175                 *status = NFSERR_STALE;
2176                 return;
2177         }
2178 
2179         fh = args->rna_to.da_fhandle;
2180         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2181         if (to_exi == NULL) {
2182                 VN_RELE(fromvp);
2183                 *status = NFSERR_ACCES;
2184                 return;
2185         }
2186         exi_rele(to_exi);
2187 
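             /*
              * The export lookup above is needed only to verify that the
              * target directory lives in the same export as the source;
              * the reference was dropped and just the pointer is compared.
              * Renames that cross exports are rejected with NFSERR_XDEV.
              */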
2188         if (to_exi != exi) {
2189                 VN_RELE(fromvp);
2190                 *status = NFSERR_XDEV;
2191                 return;
2192         }
2193 
2194         tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2195         if (tovp == NULL) {
2196                 VN_RELE(fromvp);
2197                 *status = NFSERR_STALE;
2198                 return;
2199         }
2200 
2201         if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2202                 VN_RELE(tovp);
2203                 VN_RELE(fromvp);
2204                 *status = NFSERR_NOTDIR;
2205                 return;
2206         }
2207 
2208         /*
2209          * Disallow NULL paths
2210          */
2211         if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2212             args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2213                 VN_RELE(tovp);
2214                 VN_RELE(fromvp);
2215                 *status = NFSERR_ACCES;
2216                 return;
2217         }
2218 
2219         if (rdonly(ro, tovp)) {
2220                 VN_RELE(tovp);
2221                 VN_RELE(fromvp);
2222                 *status = NFSERR_ROFS;
2223                 return;
2224         }
2225 
2226         /*
2227          * Check for a conflict with a non-blocking mandatory share reservation.
2228          */
2229         error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2230             NULL, cr, NULL, NULL, NULL);
2231         if (error != 0) {
2232                 VN_RELE(tovp);
2233                 VN_RELE(fromvp);
2234                 *status = puterrno(error);
2235                 return;
2236         }
2237 
2238         /* Check for delegations on the source file */
2239 
2240         if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2241                 VN_RELE(tovp);
2242                 VN_RELE(fromvp);
2243                 VN_RELE(srcvp);
2244                 curthread->t_flag |= T_WOULDBLOCK;
2245                 return;
2246         }
2247 
2248         /* Check for delegation on the file being renamed over, if it exists */
2249 
2250         if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2251             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2252             NULL, NULL, NULL) == 0) {
2253 
2254                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2255                         VN_RELE(tovp);
2256                         VN_RELE(fromvp);
2257                         VN_RELE(srcvp);
2258                         VN_RELE(targvp);
2259                         curthread->t_flag |= T_WOULDBLOCK;
2260                         return;
2261                 }
2262                 VN_RELE(targvp);
2263         }
2264 
2265 
2266         if (nbl_need_check(srcvp)) {
2267                 nbl_start_crit(srcvp, RW_READER);
2268                 in_crit = 1;
2269                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2270                         error = EACCES;
2271                         goto out;
2272                 }
2273         }
2274 
2275         error = VOP_RENAME(fromvp, args->rna_from.da_name,
2276             tovp, args->rna_to.da_name, cr, NULL, 0);
2277 
2278         if (error == 0)
2279                 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2280                     strlen(args->rna_to.da_name));
2281 
2282         /*
2283          * Force modified data and metadata out to stable storage.
2284          */
2285         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2286         (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2287 
2288 out:
2289         if (in_crit)
2290                 nbl_end_crit(srcvp);
2291         VN_RELE(srcvp);
2292         VN_RELE(tovp);
2293         VN_RELE(fromvp);
2294 
2295         *status = puterrno(error);
2296 
2297 }
2298 void *
2299 rfs_rename_getfh(struct nfsrnmargs *args)
2300 {
2301         return (args->rna_from.da_fhandle);
2302 }
2303 
2304 /*
2305  * Link to a file.
2306  * Create a file (to) which is a hard link to the given file (from).
2307  */
2308 /* ARGSUSED */
2309 void
2310 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2311     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2312 {
2313         int error;
2314         vnode_t *fromvp;
2315         vnode_t *tovp;
2316         struct exportinfo *to_exi;
2317         fhandle_t *fh;
2318 
2319         fromvp = nfs_fhtovp(args->la_from, exi);
2320         if (fromvp == NULL) {
2321                 *status = NFSERR_STALE;
2322                 return;
2323         }
2324 
2325         fh = args->la_to.da_fhandle;
2326         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2327         if (to_exi == NULL) {
2328                 VN_RELE(fromvp);
2329                 *status = NFSERR_ACCES;
2330                 return;
2331         }
2332         exi_rele(to_exi);
2333 
2334         if (to_exi != exi) {
2335                 VN_RELE(fromvp);
2336                 *status = NFSERR_XDEV;
2337                 return;
2338         }
2339 
2340         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2341         if (tovp == NULL) {
2342                 VN_RELE(fromvp);
2343                 *status = NFSERR_STALE;
2344                 return;
2345         }
2346 
2347         if (tovp->v_type != VDIR) {
2348                 VN_RELE(tovp);
2349                 VN_RELE(fromvp);
2350                 *status = NFSERR_NOTDIR;
2351                 return;
2352         }
2353         /*
2354          * Disallow NULL paths
2355          */
2356         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2357                 VN_RELE(tovp);
2358                 VN_RELE(fromvp);
2359                 *status = NFSERR_ACCES;
2360                 return;
2361         }
2362 
2363         if (rdonly(ro, tovp)) {
2364                 VN_RELE(tovp);
2365                 VN_RELE(fromvp);
2366                 *status = NFSERR_ROFS;
2367                 return;
2368         }
2369 
2370         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2371 
2372         /*
2373          * Force modified data and metadata out to stable storage.
2374          */
2375         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2376         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2377 
2378         VN_RELE(tovp);
2379         VN_RELE(fromvp);
2380 
2381         *status = puterrno(error);
2382 
2383 }
2384 void *
2385 rfs_link_getfh(struct nfslinkargs *args)
2386 {
2387         return (args->la_from);
2388 }
2389 
2390 /*
2391  * Symbolically link to a file.
2392  * Create a file (from) with the given attributes which is a symbolic link
2393  * to the given path name (tnm).
2394  */
2395 void
2396 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2397     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2398 {
2399         int error;
2400         struct vattr va;
2401         vnode_t *vp;
2402         vnode_t *svp;
2403         int lerror;
2404         struct sockaddr *ca;
2405         char *name = NULL;
2406 
2407         /*
2408          * Disallow NULL paths
2409          */
2410         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2411                 *status = NFSERR_ACCES;
2412                 return;
2413         }
2414 
2415         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2416         if (vp == NULL) {
2417                 *status = NFSERR_STALE;
2418                 return;
2419         }
2420 
2421         if (rdonly(ro, vp)) {
2422                 VN_RELE(vp);
2423                 *status = NFSERR_ROFS;
2424                 return;
2425         }
2426 
2427         error = sattr_to_vattr(args->sla_sa, &va);
2428         if (error) {
2429                 VN_RELE(vp);
2430                 *status = puterrno(error);
2431                 return;
2432         }
2433 
2434         if (!(va.va_mask & AT_MODE)) {
2435                 VN_RELE(vp);
2436                 *status = NFSERR_INVAL;
2437                 return;
2438         }
2439 
2440         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2441         name = nfscmd_convname(ca, exi, args->sla_tnm,
2442             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2443 
2444         if (name == NULL) {
                     VN_RELE(vp);
2445                 *status = NFSERR_ACCES;
2446                 return;
2447         }
2448 
2449         va.va_type = VLNK;
2450         va.va_mask |= AT_TYPE;
2451 
2452         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2453 
2454         /*
2455          * Force new data and metadata out to stable storage.
2456          */
2457         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2458             NULL, cr, NULL, NULL, NULL);
2459 
2460         if (!lerror) {
2461                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2462                 VN_RELE(svp);
2463         }
2464 
2465         /*
2466          * Force modified data and metadata out to stable storage.
2467          */
2468         (void) VOP_FSYNC(vp, 0, cr, NULL);
2469 
2470         VN_RELE(vp);
2471 
2472         *status = puterrno(error);
2473         if (name != args->sla_tnm)
2474                 kmem_free(name, MAXPATHLEN);
2475 
2476 }
2477 void *
2478 rfs_symlink_getfh(struct nfsslargs *args)
2479 {
2480         return (args->sla_from.da_fhandle);
2481 }
2482 
2483 /*
2484  * Make a directory.
2485  * Create a directory with the given name, parent directory, and attributes.
2486  * Returns a file handle and attributes for the new directory.
2487  */
2488 /* ARGSUSED */
2489 void
2490 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2491     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2492 {
2493         int error;
2494         struct vattr va;
2495         vnode_t *dvp = NULL;
2496         vnode_t *vp;
2497         char *name = args->ca_da.da_name;
2498 
2499         /*
2500          * Disallow NULL paths
2501          */
2502         if (name == NULL || *name == '\0') {
2503                 dr->dr_status = NFSERR_ACCES;
2504                 return;
2505         }
2506 
2507         vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2508         if (vp == NULL) {
2509                 dr->dr_status = NFSERR_STALE;
2510                 return;
2511         }
2512 
2513         if (rdonly(ro, vp)) {
2514                 VN_RELE(vp);
2515                 dr->dr_status = NFSERR_ROFS;
2516                 return;
2517         }
2518 
2519         error = sattr_to_vattr(args->ca_sa, &va);
2520         if (error) {
2521                 VN_RELE(vp);
2522                 dr->dr_status = puterrno(error);
2523                 return;
2524         }
2525 
2526         if (!(va.va_mask & AT_MODE)) {
2527                 VN_RELE(vp);
2528                 dr->dr_status = NFSERR_INVAL;
2529                 return;
2530         }
2531 
2532         va.va_type = VDIR;
2533         va.va_mask |= AT_TYPE;
2534 
2535         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2536 
2537         if (!error) {
2538                 /*
2539                  * Attributes of the newly created directory should
2540                  * be returned to the client.
2541                  */
2542                 va.va_mask = AT_ALL; /* We want everything */
2543                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2544 
2545                 /* check for overflows */
2546                 if (!error) {
2547                         acl_perm(vp, exi, &va, cr);
2548                         error = vattr_to_nattr(&va, &dr->dr_attr);
2549                         if (!error) {
2550                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2551                         }
2552                 }
2553                 /*
2554                  * Force new data and metadata out to stable storage.
2555                  */
2556                 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2557                 VN_RELE(dvp);
2558         }
2559 
2560         /*
2561          * Force modified data and metadata out to stable storage.
2562          */
2563         (void) VOP_FSYNC(vp, 0, cr, NULL);
2564 
2565         VN_RELE(vp);
2566 
2567         dr->dr_status = puterrno(error);
2568 
2569 }
2570 void *
2571 rfs_mkdir_getfh(struct nfscreatargs *args)
2572 {
2573         return (args->ca_da.da_fhandle);
2574 }
2575 
2576 /*
2577  * Remove a directory.
2578  * Remove the given directory name from the given parent directory.
2579  */
2580 /* ARGSUSED */
2581 void
2582 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2583     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2584 {
2585         int error;
2586         vnode_t *vp;
2587 
2588         /*
2589          * Disallow NULL paths
2590          */
2591         if (da->da_name == NULL || *da->da_name == '\0') {
2592                 *status = NFSERR_ACCES;
2593                 return;
2594         }
2595 
2596         vp = nfs_fhtovp(da->da_fhandle, exi);
2597         if (vp == NULL) {
2598                 *status = NFSERR_STALE;
2599                 return;
2600         }
2601 
2602         if (rdonly(ro, vp)) {
2603                 VN_RELE(vp);
2604                 *status = NFSERR_ROFS;
2605                 return;
2606         }
2607 
2608         /*
2609          * VOP_RMDIR takes a third argument (the current
2610          * directory of the process).  That's because someone
2611          * wants to return EINVAL if one tries to remove ".".
2612          * Of course, NFS servers have no idea what their
2613          * clients' current directories are.  We fake it by
2614          * supplying a vnode known to exist and illegal to
2615          * remove.
2616          */
2617         error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2618 
2619         /*
2620          * Force modified data and metadata out to stable storage.
2621          */
2622         (void) VOP_FSYNC(vp, 0, cr, NULL);
2623 
2624         VN_RELE(vp);
2625 
2626         /*
2627          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2628          * if the directory is not empty.  A System V NFS server
2629          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2630          * over the wire.
2631          */
2632         if (error == EEXIST)
2633                 *status = NFSERR_NOTEMPTY;
2634         else
2635                 *status = puterrno(error);
2636 
2637 }
2638 void *
2639 rfs_rmdir_getfh(struct nfsdiropargs *da)
2640 {
2641         return (da->da_fhandle);
2642 }
2643 
2644 /* ARGSUSED */
2645 void
2646 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2647     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2648 {
2649         int error;
2650         int iseof;
2651         struct iovec iov;
2652         struct uio uio;
2653         vnode_t *vp;
2654         char *ndata = NULL;
2655         struct sockaddr *ca;
2656         size_t nents;
2657         int ret;
2658 
2659         vp = nfs_fhtovp(&rda->rda_fh, exi);
2660         if (vp == NULL) {
2661                 rd->rd_entries = NULL;
2662                 rd->rd_status = NFSERR_STALE;
2663                 return;
2664         }
2665 
2666         if (vp->v_type != VDIR) {
2667                 VN_RELE(vp);
2668                 rd->rd_entries = NULL;
2669                 rd->rd_status = NFSERR_NOTDIR;
2670                 return;
2671         }
2672 
2673         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2674 
2675         error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2676 
2677         if (error) {
2678                 rd->rd_entries = NULL;
2679                 goto bad;
2680         }
2681 
2682         if (rda->rda_count == 0) {
2683                 rd->rd_entries = NULL;
2684                 rd->rd_size = 0;
2685                 rd->rd_eof = FALSE;
2686                 goto bad;
2687         }
2688 
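             /*
              * Clamp the requested count to NFS_MAXDATA, the largest
              * transfer the NFS version 2 protocol allows, before
              * allocating the reply buffer.
              */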
2689         rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2690 
2691         /*
2692          * Allocate data for entries.  This will be freed by rfs_rddirfree.
2693          */
2694         rd->rd_bufsize = (uint_t)rda->rda_count;
2695         rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2696 
2697         /*
2698          * Set up io vector to read directory data
2699          */
2700         iov.iov_base = (caddr_t)rd->rd_entries;
2701         iov.iov_len = rda->rda_count;
2702         uio.uio_iov = &iov;
2703         uio.uio_iovcnt = 1;
2704         uio.uio_segflg = UIO_SYSSPACE;
2705         uio.uio_extflg = UIO_COPY_CACHED;
2706         uio.uio_loffset = (offset_t)rda->rda_offset;
2707         uio.uio_resid = rda->rda_count;
2708 
2709         /*
2710          * read directory
2711          */
2712         error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2713 
2714         /*
2715          * Clean up
2716          */
2717         if (!error) {
2718                 /*
2719                  * set size and eof
2720                  */
2721                 if (uio.uio_resid == rda->rda_count) {
2722                         rd->rd_size = 0;
2723                         rd->rd_eof = TRUE;
2724                 } else {
2725                         rd->rd_size = (uint32_t)(rda->rda_count -
2726                             uio.uio_resid);
2727                         rd->rd_eof = iseof ? TRUE : FALSE;
2728                 }
2729         }
2730 
2731         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2732         nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2733         ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2734             rda->rda_count, &ndata);
2735 
2736         if (ret != 0) {
2737                 size_t dropbytes;
2738                 /*
2739                  * We had to drop one or more entries in order to fit
2740                  * during the character conversion.  We need to patch
2741                  * up the size and eof info.
2742                  */
2743                 if (rd->rd_eof)
2744                         rd->rd_eof = FALSE;
2745                 dropbytes = nfscmd_dropped_entrysize(
2746                     (struct dirent64 *)rd->rd_entries, nents, ret);
2747                 rd->rd_size -= dropbytes;
2748         }
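             /*
              * If the conversion produced a new buffer, free the original
              * entries and hand the converted data back to the caller.
              */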
2749         if (ndata == NULL) {
2750                 ndata = (char *)rd->rd_entries;
2751         } else if (ndata != (char *)rd->rd_entries) {
2752                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2753                 rd->rd_entries = (void *)ndata;
2754                 rd->rd_bufsize = rda->rda_count;
2755         }
2756 
2757 bad:
2758         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2759 
2760 #if 0 /* notyet */
2761         /*
2762          * Don't do this.  It causes local disk writes when just
2763          * reading the file and the overhead is deemed larger
2764          * than the benefit.
2765          */
2766         /*
2767          * Force modified metadata out to stable storage.
2768          */
2769         (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2770 #endif
2771 
2772         VN_RELE(vp);
2773 
2774         rd->rd_status = puterrno(error);
2775 
2776 }
2777 void *
2778 rfs_readdir_getfh(struct nfsrddirargs *rda)
2779 {
2780         return (&rda->rda_fh);
2781 }
2782 void
2783 rfs_rddirfree(struct nfsrddirres *rd)
2784 {
2785         if (rd->rd_entries != NULL)
2786                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2787 }
2788 
2789 /* ARGSUSED */
2790 void
2791 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2792     struct svc_req *req, cred_t *cr, bool_t ro)
2793 {
2794         int error;
2795         struct statvfs64 sb;
2796         vnode_t *vp;
2797 
2798         vp = nfs_fhtovp(fh, exi);
2799         if (vp == NULL) {
2800                 fs->fs_status = NFSERR_STALE;
2801                 return;
2802         }
2803 
2804         error = VFS_STATVFS(vp->v_vfsp, &sb);
2805 
2806         if (!error) {
2807                 fs->fs_tsize = nfstsize();
2808                 fs->fs_bsize = sb.f_frsize;
2809                 fs->fs_blocks = sb.f_blocks;
2810                 fs->fs_bfree = sb.f_bfree;
2811                 fs->fs_bavail = sb.f_bavail;
2812         }
2813 
2814         VN_RELE(vp);
2815 
2816         fs->fs_status = puterrno(error);
2817 
2818 }
2819 void *
2820 rfs_statfs_getfh(fhandle_t *fh)
2821 {
2822         return (fh);
2823 }
2824 
2825 static int
2826 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2827 {
2828         vap->va_mask = 0;
2829 
2830         /*
2831          * There was a sign extension bug in some VFS based systems
2832          * which stored the mode as a short.  When it would get
2833          * assigned to a u_long, no sign extension would occur.
2834          * It needed to, but this wasn't noticed because sa_mode
2835          * would then get assigned back to the short, thus ignoring
2836          * the upper 16 bits of sa_mode.
2837          *
2838          * To make this implementation work for both broken
2839          * clients and good clients, we check for both versions
2840          * of the mode.
2841          */
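             /*
              * For example, a broken client sends 0xffff (a short -1
              * widened without sign extension) while a correct client
              * sends 0xffffffff; either value means "mode not set".
              */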
2842         if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2843             sa->sa_mode != (uint32_t)-1) {
2844                 vap->va_mask |= AT_MODE;
2845                 vap->va_mode = sa->sa_mode;
2846         }
2847         if (sa->sa_uid != (uint32_t)-1) {
2848                 vap->va_mask |= AT_UID;
2849                 vap->va_uid = sa->sa_uid;
2850         }
2851         if (sa->sa_gid != (uint32_t)-1) {
2852                 vap->va_mask |= AT_GID;
2853                 vap->va_gid = sa->sa_gid;
2854         }
2855         if (sa->sa_size != (uint32_t)-1) {
2856                 vap->va_mask |= AT_SIZE;
2857                 vap->va_size = sa->sa_size;
2858         }
2859         if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2860             sa->sa_atime.tv_usec != (int32_t)-1) {
2861 #ifndef _LP64
2862                 /* return error if time overflow */
2863                 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2864                         return (EOVERFLOW);
2865 #endif
2866                 vap->va_mask |= AT_ATIME;
2867                 /*
2868                  * nfs protocol defines times as unsigned so don't extend sign,
2869                  * unless sysadmin set nfs_allow_preepoch_time.
2870                  */
2871                 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2872                 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2873         }
2874         if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2875             sa->sa_mtime.tv_usec != (int32_t)-1) {
2876 #ifndef _LP64
2877                 /* return error if time overflow */
2878                 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2879                         return (EOVERFLOW);
2880 #endif
2881                 vap->va_mask |= AT_MTIME;
2882                 /*
2883                  * nfs protocol defines times as unsigned so don't extend sign,
2884                  * unless sysadmin set nfs_allow_preepoch_time.
2885                  */
2886                 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2887                 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2888         }
2889         return (0);
2890 }
2891 
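     /*
      * Map vnode types (indexed by vtype_t) to over-the-wire NFS file types.
      * Vnode types with no NFS version 2 equivalent map to 0.
      */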
2892 static const enum nfsftype vt_to_nf[] = {
2893         0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2894 };
2895 
2896 /*
2897  * check the following fields for overflow: nodeid, size, and time.
2898  * There could be a problem when converting 64-bit LP64 fields
2899  * into 32-bit ones.  Return an error if there is an overflow.
2900  */
2901 int
2902 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2903 {
2904         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2905         na->na_type = vt_to_nf[vap->va_type];
2906 
2907         if (vap->va_mode == (unsigned short) -1)
2908                 na->na_mode = (uint32_t)-1;
2909         else
2910                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2911 
2912         if (vap->va_uid == (unsigned short)(-1))
2913                 na->na_uid = (uint32_t)(-1);
2914         else if (vap->va_uid == UID_NOBODY)
2915                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2916         else
2917                 na->na_uid = vap->va_uid;
2918 
2919         if (vap->va_gid == (unsigned short)(-1))
2920                 na->na_gid = (uint32_t)-1;
2921         else if (vap->va_gid == GID_NOBODY)
2922                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2923         else
2924                 na->na_gid = vap->va_gid;
2925 
2926         /*
2927          * Do we need to check fsid for overflow?  It is 64-bit in the
2928          * vattr, but are values bigger than 32 bits supported?
2929          */
2930         na->na_fsid = vap->va_fsid;
2931 
2932         na->na_nodeid = vap->va_nodeid;
2933 
2934         /*
2935          * Check to make sure that the nodeid is representable over the
2936          * wire without losing bits.
2937          */
2938         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2939                 return (EFBIG);
2940         na->na_nlink = vap->va_nlink;
2941 
2942         /*
2943          * Check for big files here, instead of at the caller.  See
2944          * comments in cstat for large special file explanation.
2945          */
2946         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2947                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2948                         return (EFBIG);
2949                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2950                         /* UNKNOWN_SIZE | OVERFLOW */
2951                         na->na_size = MAXOFF32_T;
2952                 } else
2953                         na->na_size = vap->va_size;
2954         } else
2955                 na->na_size = vap->va_size;
2956 
2957         /*
2958          * If the vnode times overflow the 32-bit times that NFS2
2959          * uses on the wire then return an error.
2960          */
2961         if (!NFS_VAP_TIME_OK(vap)) {
2962                 return (EOVERFLOW);
2963         }
2964         na->na_atime.tv_sec = vap->va_atime.tv_sec;
2965         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2966 
2967         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2968         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2969 
2970         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2971         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2972 
2973         /*
2974          * If the dev_t will fit into 16 bits then compress
2975          * it, otherwise leave it alone. See comments in
2976          * nfs_client.c.
2977          */
2978         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2979             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2980                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2981         else
2982                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2983 
2984         na->na_blocks = vap->va_nblocks;
2985         na->na_blocksize = vap->va_blksize;
2986 
2987         /*
2988          * This bit of ugliness is a *TEMPORARY* hack to preserve the
2989          * over-the-wire protocols for named-pipe vnodes.  It remaps the
2990          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2991          *
2992          * BUYER BEWARE:
2993          *  If you are porting the NFS to a non-Sun server, you probably
2994          *  don't want to include the following block of code.  The
2995          *  over-the-wire special file types will be changing with the
2996          *  NFS Protocol Revision.
2997          */
2998         if (vap->va_type == VFIFO)
2999                 NA_SETFIFO(na);
3000         return (0);
3001 }
3002 
3003 /*
3004  * acl v2 support: returns approximate permission.
3005  *      default: returns minimal permission (more restrictive)
3006  *      aclok: returns maximal permission (less restrictive)
3007  *      This routine changes the permissions that are already in *va.
3008  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3009  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
3010  */
3011 static void
3012 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3013 {
3014         vsecattr_t      vsa;
3015         int             aclcnt;
3016         aclent_t        *aclentp;
3017         mode_t          mask_perm;
3018         mode_t          grp_perm;
3019         mode_t          other_perm;
3020         mode_t          other_orig;
3021         int             error;
3022 
3023         /* we don't care about the default ACL */
3024         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3025         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3026 
3027         if (!error) {
3028                 aclcnt = vsa.vsa_aclcnt;
3029                 if (aclcnt > MIN_ACL_ENTRIES) {
3030                         /* non-trivial ACL */
3031                         aclentp = vsa.vsa_aclentp;
3032                         if (exi->exi_export.ex_flags & EX_ACLOK) {
3033                                 /* maximal permissions */
3034                                 grp_perm = 0;
3035                                 other_perm = 0;
3036                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3037                                         switch (aclentp->a_type) {
3038                                         case USER_OBJ:
3039                                                 break;
3040                                         case USER:
3041                                                 grp_perm |=
3042                                                     aclentp->a_perm << 3;
3043                                                 other_perm |= aclentp->a_perm;
3044                                                 break;
3045                                         case GROUP_OBJ:
3046                                                 grp_perm |=
3047                                                     aclentp->a_perm << 3;
3048                                                 break;
3049                                         case GROUP:
3050                                                 other_perm |= aclentp->a_perm;
3051                                                 break;
3052                                         case OTHER_OBJ:
3053                                                 other_orig = aclentp->a_perm;
3054                                                 break;
3055                                         case CLASS_OBJ:
3056                                                 mask_perm = aclentp->a_perm;
3057                                                 break;
3058                                         default:
3059                                                 break;
3060                                         }
3061                                 }
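                                     /*
                                      * Apply the ACL mask (CLASS_OBJ) to the
                                      * accumulated group and other bits, then
                                      * add back the explicit OTHER_OBJ bits,
                                      * which the mask does not apply to.
                                      */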
3062                                 grp_perm &= mask_perm << 3;
3063                                 other_perm &= mask_perm;
3064                                 other_perm |= other_orig;
3065 
3066                         } else {
3067                                 /* minimal permissions */
3068                                 grp_perm = 070;
3069                                 other_perm = 07;
3070                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
3071                                         switch (aclentp->a_type) {
3072                                         case USER_OBJ:
3073                                                 break;
3074                                         case USER:
3075                                         case CLASS_OBJ:
3076                                                 grp_perm &=
3077                                                     aclentp->a_perm << 3;
3078                                                 other_perm &=
3079                                                     aclentp->a_perm;
3080                                                 break;
3081                                         case GROUP_OBJ:
3082                                                 grp_perm &=
3083                                                     aclentp->a_perm << 3;
3084                                                 break;
3085                                         case GROUP:
3086                                                 other_perm &=
3087                                                     aclentp->a_perm;
3088                                                 break;
3089                                         case OTHER_OBJ:
3090                                                 other_perm &=
3091                                                     aclentp->a_perm;
3092                                                 break;
3093                                         default:
3094                                                 break;
3095                                         }
3096                                 }
3097                         }
3098                         /* copy to va */
3099                         va->va_mode &= ~077;
3100                         va->va_mode |= grp_perm | other_perm;
3101                 }
3102                 if (vsa.vsa_aclcnt)
3103                         kmem_free(vsa.vsa_aclentp,
3104                             vsa.vsa_aclcnt * sizeof (aclent_t));
3105         }
3106 }
3107 
3108 void
3109 rfs_srvrinit(void)
3110 {
3111         nfs2_srv_caller_id = fs_new_caller_id();
3112 }
3113 
3114 void
3115 rfs_srvrfini(void)
3116 {
3117 }
3118 
3119 /* ARGSUSED */
3120 void
3121 rfs_srv_zone_init(nfs_globals_t *ng)
3122 {
3123         nfs_srv_t *ns;
3124 
3125         ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3126 
3127         mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3128         ns->write_async = 1;
3129 
3130         ng->nfs_srv = ns;
3131 }
3132 
3133 /* ARGSUSED */
3134 void
3135 rfs_srv_zone_fini(nfs_globals_t *ng)
3136 {
3137         nfs_srv_t *ns = ng->nfs_srv;
3138 
3139         ng->nfs_srv = NULL;
3140 
3141         mutex_destroy(&ns->async_write_lock);
3142         kmem_free(ns, sizeof (*ns));
3143 }
3144 
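     /*
      * Set up the RDMA chunk list used to return READ data.  The
      * client-supplied chunk list (ra_wlist) is prepared for rr_count
      * bytes by rdma_setup_read_chunks(), and the resulting list and
      * its length are recorded in the read results.
      */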
3145 static int
3146 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3147 {
3148         struct clist    *wcl;
3149         int             wlist_len;
3150         uint32_t        count = rr->rr_count;
3151 
3152         wcl = ra->ra_wlist;
3153 
3154         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3155                 return (FALSE);
3156         }
3157 
3158         wcl = ra->ra_wlist;
3159         rr->rr_ok.rrok_wlist_len = wlist_len;
3160         rr->rr_ok.rrok_wlist = wcl;
3161 
3162         return (TRUE);
3163 }