1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*
  29  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  30  *      All rights reserved.
  31  */
  32 
  33 /*
  34  * Copyright 2018 Nexenta Systems, Inc.
  35  * Copyright (c) 2016 by Delphix. All rights reserved.
  36  */
  37 
  38 #include <sys/param.h>
  39 #include <sys/types.h>
  40 #include <sys/systm.h>
  41 #include <sys/cred.h>
  42 #include <sys/buf.h>
  43 #include <sys/vfs.h>
  44 #include <sys/vnode.h>
  45 #include <sys/uio.h>
  46 #include <sys/stat.h>
  47 #include <sys/errno.h>
  48 #include <sys/sysmacros.h>
  49 #include <sys/statvfs.h>
  50 #include <sys/kmem.h>
  51 #include <sys/kstat.h>
  52 #include <sys/dirent.h>
  53 #include <sys/cmn_err.h>
  54 #include <sys/debug.h>
  55 #include <sys/vtrace.h>
  56 #include <sys/mode.h>
  57 #include <sys/acl.h>
  58 #include <sys/nbmlock.h>
  59 #include <sys/policy.h>
  60 #include <sys/sdt.h>
  61 
  62 #include <rpc/types.h>
  63 #include <rpc/auth.h>
  64 #include <rpc/svc.h>
  65 
  66 #include <nfs/nfs.h>
  67 #include <nfs/export.h>
  68 #include <nfs/nfs_cmd.h>
  69 
  70 #include <vm/hat.h>
  71 #include <vm/as.h>
  72 #include <vm/seg.h>
  73 #include <vm/seg_map.h>
  74 #include <vm/seg_kmem.h>
  75 
  76 #include <sys/strsubr.h>
  77 
  78 struct rfs_async_write_list;
  79 
/*
 * Zone globals of NFSv2 server
 */
typedef struct nfs_srv {
	/* Protects async_write_head below. */
	kmutex_t			async_write_lock;
	/* List of pending clustered async write requests. */
	struct rfs_async_write_list	*async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int		write_async;
} nfs_srv_t;
  92 
  93 /*
  94  * These are the interface routines for the server side of the
  95  * Network File System.  See the NFS version 2 protocol specification
  96  * for a description of this interface.
  97  */
  98 
  99 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
 100 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
 101                         cred_t *);
 102 
 103 
/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Caller id used to tag caller_context_t for all NFSv2 server VOP
 * calls, so file system monitors can identify requests from this
 * service (e.g. for delegation conflict detection).
 */
u_longlong_t nfs2_srv_caller_id;
 114 
 115 static nfs_srv_t *
 116 nfs_get_srv(void)
 117 {
 118         nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
 119         nfs_srv_t *srv = ng->nfs_srv;
 120         ASSERT(srv != NULL);
 121         return (srv);
 122 }
 123 
/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	/* Translate the file handle into a held vnode. */
	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	/*
	 * Delegation-aware getattr: picks up attributes from an NFSv4
	 * delegation holder if one exists for this file.
	 */
	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		/* Lie about the object type for a referral */
		if (vn_is_nfs_reparse(vp, cr))
			va.va_type = VLNK;

		/* Adjust the reported permission bits; see acl_perm(). */
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
 164 void *
 165 rfs_getattr_getfh(fhandle_t *fhp)
 166 {
 167         return (fhp);
 168 }
 169 
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* ATTR_UTIME iff client supplied the times */
	int in_crit = 0;	/* non-zero while inside an nbmand crit region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;	/* describes the region freed by VOP_SPACE */
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* No attribute changes on read-only exports. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the over-the-wire sattr into a vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * Tag the caller context so delegation monitors can recognize
	 * this request; CC_DONTBLOCK asks them to fail with EAGAIN
	 * rather than block on a delegation conflict.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region affected by the size change is
			 * everything between the old and new EOF.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the file's owner gets the VOP_SPACE shortcut;
		 * everyone else goes through VOP_SETATTR's normal
		 * access checking (AT_SIZE is left set in that case).
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
 357 void *
 358 rfs_setattr_getfh(struct nfssaargs *args)
 359 {
 360         return (&args->saa_fh);
 361 }
 362 
/*
 * Cross a mount point during a lookup:  if the file system mounted
 * on *vpp is exported with "nohide", replace *vpp/*exip with the
 * submount's root vnode and exportinfo.
 * @exip and @vpp are changed (and their old references released)
 * only on success.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/*
	 * NOTE(review): traverse() appears to exchange the held vnode
	 * for the root of the mounted file system; the extra hold here
	 * keeps *vpp valid if we bail out — confirm against traverse().
	 */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	/* Identify the export of the submount by fsid + fid. */
	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * Not an error: the subdir is simply not exported,
		 * or "nohide" is not set.
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
 408 
/*
 * Given mounted "dvp" and "exi", go upper mountpoint
 * with dvp/exi correction
 * Return 0 in success
 */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;

	/* Caller must hand us a filesystem (or zone) root in our zone. */
	ASSERT3U((*exip)->exi_zoneid, ==, curzone->zone_id);
	ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));

	VN_HOLD(dvp);
	/* Step up to the covered vnode in the parent file system. */
	dvp = untraverse((*exip)->exi_ne, dvp);
	/* Find which export (if any) now governs dvp. */
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	/* Success: swap in the new export and vnode references. */
	ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);
	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;		/* directory being searched */
	vnode_t *vp;		/* result of the lookup */
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/*
	 * Take our own hold on exi: it may be swapped for a different
	 * export below (crossmnt / public fh), and "out:" releases
	 * whichever one we end up with.
	 */
	exi_hold(exi);
	ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);

	/*
	 * Not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
			/*
			 * special case for ".." and 'nohide'exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else  {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	/* Convert the name from the client's character set, if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		/* rfs_publicfh_mclookup() returns its own held exi. */
		exi_rele(exi);
		exi = NULL;

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	/* Free the converted name if nfscmd_convname() allocated one. */
	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	/* Follow "nohide" submounts transparently. */
	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	if (exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
 600 void *
 601 rfs_lookup_getfh(struct nfsdiropargs *da)
 602 {
 603         return (da->da_fhandle);
 604 }
 605 
/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;	/* non-zero for a reparse-point referral */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link text fills the entire buffer
		 * (uio_resid == 0), rl_count == NFS_MAXPATHLEN and this
		 * NUL store lands one byte past the allocation.  In
		 * practice link targets are shorter, but worth confirming
		 * against NFS_MAXPATHLEN vs. filesystem symlink limits.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Convert the link text to the client's character set, if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
 736 void *
 737 rfs_readlink_getfh(fhandle_t *fhp)
 738 {
 739         return (fhp);
 740 }
 741 /*
 742  * Free data allocated by rfs_readlink
 743  */
 744 void
 745 rfs_rlfree(struct nfsrdlnres *rl)
 746 {
 747         if (rl->rl_data != NULL)
 748                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 749 }
 750 
 751 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 752 
 753 /*
 754  * Read data.
 755  * Returns some data read from the file at the given fhandle.
 756  */
 757 /* ARGSUSED */
 758 void
 759 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 760     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 761 {
 762         vnode_t *vp;
 763         int error;
 764         struct vattr va;
 765         struct iovec iov;
 766         struct uio uio;
 767         mblk_t *mp;
 768         int alloc_err = 0;
 769         int in_crit = 0;
 770         caller_context_t ct;
 771 
 772         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 773         if (vp == NULL) {
 774                 rr->rr_data = NULL;
 775                 rr->rr_status = NFSERR_STALE;
 776                 return;
 777         }
 778 
 779         if (vp->v_type != VREG) {
 780                 VN_RELE(vp);
 781                 rr->rr_data = NULL;
 782                 rr->rr_status = NFSERR_ISDIR;
 783                 return;
 784         }
 785 
 786         ct.cc_sysid = 0;
 787         ct.cc_pid = 0;
 788         ct.cc_caller_id = nfs2_srv_caller_id;
 789         ct.cc_flags = CC_DONTBLOCK;
 790 
 791         /*
 792          * Enter the critical region before calling VOP_RWLOCK
 793          * to avoid a deadlock with write requests.
 794          */
 795         if (nbl_need_check(vp)) {
 796                 nbl_start_crit(vp, RW_READER);
 797                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 798                     0, NULL)) {
 799                         nbl_end_crit(vp);
 800                         VN_RELE(vp);
 801                         rr->rr_data = NULL;
 802                         rr->rr_status = NFSERR_ACCES;
 803                         return;
 804                 }
 805                 in_crit = 1;
 806         }
 807 
 808         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 809 
 810         /* check if a monitor detected a delegation conflict */
 811         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 812                 if (in_crit)
 813                         nbl_end_crit(vp);
 814                 VN_RELE(vp);
 815                 /* mark as wouldblock so response is dropped */
 816                 curthread->t_flag |= T_WOULDBLOCK;
 817 
 818                 rr->rr_data = NULL;
 819                 return;
 820         }
 821 
 822         va.va_mask = AT_ALL;
 823 
 824         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 825 
 826         if (error) {
 827                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 828                 if (in_crit)
 829                         nbl_end_crit(vp);
 830 
 831                 VN_RELE(vp);
 832                 rr->rr_data = NULL;
 833                 rr->rr_status = puterrno(error);
 834 
 835                 return;
 836         }
 837 
 838         /*
 839          * This is a kludge to allow reading of files created
 840          * with no read permission.  The owner of the file
 841          * is always allowed to read it.
 842          */
 843         if (crgetuid(cr) != va.va_uid) {
 844                 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 845 
 846                 if (error) {
 847                         /*
 848                          * Exec is the same as read over the net because
 849                          * of demand loading.
 850                          */
 851                         error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 852                 }
 853                 if (error) {
 854                         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 855                         if (in_crit)
 856                                 nbl_end_crit(vp);
 857                         VN_RELE(vp);
 858                         rr->rr_data = NULL;
 859                         rr->rr_status = puterrno(error);
 860 
 861                         return;
 862                 }
 863         }
 864 
 865         if (MANDLOCK(vp, va.va_mode)) {
 866                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 867                 if (in_crit)
 868                         nbl_end_crit(vp);
 869 
 870                 VN_RELE(vp);
 871                 rr->rr_data = NULL;
 872                 rr->rr_status = NFSERR_ACCES;
 873 
 874                 return;
 875         }
 876 
 877         rr->rr_ok.rrok_wlist_len = 0;
 878         rr->rr_ok.rrok_wlist = NULL;
 879 
 880         if ((u_offset_t)ra->ra_offset >= va.va_size) {
 881                 rr->rr_count = 0;
 882                 rr->rr_data = NULL;
 883                 /*
 884                  * In this case, status is NFS_OK, but there is no data
 885                  * to encode. So set rr_mp to NULL.
 886                  */
 887                 rr->rr_mp = NULL;
 888                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 889                 if (rr->rr_ok.rrok_wlist)
 890                         clist_zero_len(rr->rr_ok.rrok_wlist);
 891                 goto done;
 892         }
 893 
 894         if (ra->ra_wlist) {
 895                 mp = NULL;
 896                 rr->rr_mp = NULL;
 897                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 898                 if (ra->ra_count > iov.iov_len) {
 899                         rr->rr_data = NULL;
 900                         rr->rr_status = NFSERR_INVAL;
 901                         goto done;
 902                 }
 903         } else {
 904                 /*
 905                  * mp will contain the data to be sent out in the read reply.
 906                  * This will be freed after the reply has been sent out (by the
 907                  * driver).
 908                  * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
 909                  * that the call to xdrmblk_putmblk() never fails.
 910                  */
 911                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 912                     &alloc_err);
 913                 ASSERT(mp != NULL);
 914                 ASSERT(alloc_err == 0);
 915 
 916                 rr->rr_mp = mp;
 917 
 918                 /*
 919                  * Set up io vector
 920                  */
 921                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 922                 iov.iov_len = ra->ra_count;
 923         }
 924 
 925         uio.uio_iov = &iov;
 926         uio.uio_iovcnt = 1;
 927         uio.uio_segflg = UIO_SYSSPACE;
 928         uio.uio_extflg = UIO_COPY_CACHED;
 929         uio.uio_loffset = (offset_t)ra->ra_offset;
 930         uio.uio_resid = ra->ra_count;
 931 
 932         error = VOP_READ(vp, &uio, 0, cr, &ct);
 933 
 934         if (error) {
 935                 if (mp)
 936                         freeb(mp);
 937 
 938                 /*
 939                  * check if a monitor detected a delegation conflict and
 940                  * mark as wouldblock so response is dropped
 941                  */
 942                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 943                         curthread->t_flag |= T_WOULDBLOCK;
 944                 else
 945                         rr->rr_status = puterrno(error);
 946 
 947                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 948                 if (in_crit)
 949                         nbl_end_crit(vp);
 950 
 951                 VN_RELE(vp);
 952                 rr->rr_data = NULL;
 953 
 954                 return;
 955         }
 956 
 957         /*
 958          * Get attributes again so we can send the latest access
 959          * time to the client side for its cache.
 960          */
 961         va.va_mask = AT_ALL;
 962 
 963         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 964 
 965         if (error) {
 966                 if (mp)
 967                         freeb(mp);
 968 
 969                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 970                 if (in_crit)
 971                         nbl_end_crit(vp);
 972 
 973                 VN_RELE(vp);
 974                 rr->rr_data = NULL;
 975                 rr->rr_status = puterrno(error);
 976 
 977                 return;
 978         }
 979 
 980         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 981 
 982         if (mp) {
 983                 rr->rr_data = (char *)mp->b_datap->db_base;
 984         } else {
 985                 if (ra->ra_wlist) {
 986                         rr->rr_data = (caddr_t)iov.iov_base;
 987                         if (!rdma_setup_read_data2(ra, rr)) {
 988                                 rr->rr_data = NULL;
 989                                 rr->rr_status = puterrno(NFSERR_INVAL);
 990                         }
 991                 }
 992         }
 993 done:
 994         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 995         if (in_crit)
 996                 nbl_end_crit(vp);
 997 
 998         acl_perm(vp, exi, &va, cr);
 999 
1000         /* check for overflows */
1001         error = vattr_to_nattr(&va, &rr->rr_attr);
1002 
1003         VN_RELE(vp);
1004 
1005         rr->rr_status = puterrno(error);
1006 }
1007 
1008 /*
1009  * Free data allocated by rfs_read
1010  */
1011 void
1012 rfs_rdfree(struct nfsrdresult *rr)
1013 {
1014         mblk_t *mp;
1015 
1016         if (rr->rr_status == NFS_OK) {
1017                 mp = rr->rr_mp;
1018                 if (mp != NULL)
1019                         freeb(mp);
1020         }
1021 }
1022 
1023 void *
1024 rfs_read_getfh(struct nfsreadargs *ra)
1025 {
1026         return (&ra->ra_fhandle);
1027 }
1028 
/*
 * Number of iovec entries kept on the stack in rfs_write_sync; an mblk
 * chain with more fragments falls back to kmem_alloc.
 */
#define MAX_IOVECS	12

#ifdef DEBUG
/* Counters: how often the on-stack iovec array sufficed vs. not. */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1035 
1036 /*
1037  * Write data to file.
1038  * Returns attributes of a file after writing some data to it.
1039  *
1040  * Any changes made here, especially in error handling might have
1041  * to also be done in rfs_write (which clusters write requests).
1042  */
1043 /* ARGSUSED */
1044 void
1045 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1046     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1047 {
1048         int error;
1049         vnode_t *vp;
1050         rlim64_t rlimit;
1051         struct vattr va;
1052         struct uio uio;
1053         struct iovec iov[MAX_IOVECS];
1054         mblk_t *m;
1055         struct iovec *iovp;
1056         int iovcnt;
1057         cred_t *savecred;
1058         int in_crit = 0;
1059         caller_context_t ct;
1060 
1061         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1062         if (vp == NULL) {
1063                 ns->ns_status = NFSERR_STALE;
1064                 return;
1065         }
1066 
1067         if (rdonly(ro, vp)) {
1068                 VN_RELE(vp);
1069                 ns->ns_status = NFSERR_ROFS;
1070                 return;
1071         }
1072 
1073         if (vp->v_type != VREG) {
1074                 VN_RELE(vp);
1075                 ns->ns_status = NFSERR_ISDIR;
1076                 return;
1077         }
1078 
1079         ct.cc_sysid = 0;
1080         ct.cc_pid = 0;
1081         ct.cc_caller_id = nfs2_srv_caller_id;
1082         ct.cc_flags = CC_DONTBLOCK;
1083 
1084         va.va_mask = AT_UID|AT_MODE;
1085 
1086         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1087 
1088         if (error) {
1089                 VN_RELE(vp);
1090                 ns->ns_status = puterrno(error);
1091 
1092                 return;
1093         }
1094 
1095         if (crgetuid(cr) != va.va_uid) {
1096                 /*
1097                  * This is a kludge to allow writes of files created
1098                  * with read only permission.  The owner of the file
1099                  * is always allowed to write it.
1100                  */
1101                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1102 
1103                 if (error) {
1104                         VN_RELE(vp);
1105                         ns->ns_status = puterrno(error);
1106                         return;
1107                 }
1108         }
1109 
1110         /*
1111          * Can't access a mandatory lock file.  This might cause
1112          * the NFS service thread to block forever waiting for a
1113          * lock to be released that will never be released.
1114          */
1115         if (MANDLOCK(vp, va.va_mode)) {
1116                 VN_RELE(vp);
1117                 ns->ns_status = NFSERR_ACCES;
1118                 return;
1119         }
1120 
1121         /*
1122          * We have to enter the critical region before calling VOP_RWLOCK
1123          * to avoid a deadlock with ufs.
1124          */
1125         if (nbl_need_check(vp)) {
1126                 nbl_start_crit(vp, RW_READER);
1127                 in_crit = 1;
1128                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1129                     wa->wa_count, 0, NULL)) {
1130                         error = EACCES;
1131                         goto out;
1132                 }
1133         }
1134 
1135         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1136 
1137         /* check if a monitor detected a delegation conflict */
1138         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1139                 goto out;
1140         }
1141 
1142         if (wa->wa_data || wa->wa_rlist) {
1143                 /* Do the RDMA thing if necessary */
1144                 if (wa->wa_rlist) {
1145                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1146                         iov[0].iov_len = wa->wa_count;
1147                 } else  {
1148                         iov[0].iov_base = wa->wa_data;
1149                         iov[0].iov_len = wa->wa_count;
1150                 }
1151                 uio.uio_iov = iov;
1152                 uio.uio_iovcnt = 1;
1153                 uio.uio_segflg = UIO_SYSSPACE;
1154                 uio.uio_extflg = UIO_COPY_DEFAULT;
1155                 uio.uio_loffset = (offset_t)wa->wa_offset;
1156                 uio.uio_resid = wa->wa_count;
1157                 /*
1158                  * The limit is checked on the client. We
1159                  * should allow any size writes here.
1160                  */
1161                 uio.uio_llimit = curproc->p_fsz_ctl;
1162                 rlimit = uio.uio_llimit - wa->wa_offset;
1163                 if (rlimit < (rlim64_t)uio.uio_resid)
1164                         uio.uio_resid = (uint_t)rlimit;
1165 
1166                 /*
1167                  * for now we assume no append mode
1168                  */
1169                 /*
1170                  * We're changing creds because VM may fault and we need
1171                  * the cred of the current thread to be used if quota
1172                  * checking is enabled.
1173                  */
1174                 savecred = curthread->t_cred;
1175                 curthread->t_cred = cr;
1176                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1177                 curthread->t_cred = savecred;
1178         } else {
1179 
1180                 iovcnt = 0;
1181                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1182                         iovcnt++;
1183                 if (iovcnt <= MAX_IOVECS) {
1184 #ifdef DEBUG
1185                         rfs_write_sync_hits++;
1186 #endif
1187                         iovp = iov;
1188                 } else {
1189 #ifdef DEBUG
1190                         rfs_write_sync_misses++;
1191 #endif
1192                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1193                 }
1194                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1195                 uio.uio_iov = iovp;
1196                 uio.uio_iovcnt = iovcnt;
1197                 uio.uio_segflg = UIO_SYSSPACE;
1198                 uio.uio_extflg = UIO_COPY_DEFAULT;
1199                 uio.uio_loffset = (offset_t)wa->wa_offset;
1200                 uio.uio_resid = wa->wa_count;
1201                 /*
1202                  * The limit is checked on the client. We
1203                  * should allow any size writes here.
1204                  */
1205                 uio.uio_llimit = curproc->p_fsz_ctl;
1206                 rlimit = uio.uio_llimit - wa->wa_offset;
1207                 if (rlimit < (rlim64_t)uio.uio_resid)
1208                         uio.uio_resid = (uint_t)rlimit;
1209 
1210                 /*
1211                  * For now we assume no append mode.
1212                  */
1213                 /*
1214                  * We're changing creds because VM may fault and we need
1215                  * the cred of the current thread to be used if quota
1216                  * checking is enabled.
1217                  */
1218                 savecred = curthread->t_cred;
1219                 curthread->t_cred = cr;
1220                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1221                 curthread->t_cred = savecred;
1222 
1223                 if (iovp != iov)
1224                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1225         }
1226 
1227         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1228 
1229         if (!error) {
1230                 /*
1231                  * Get attributes again so we send the latest mod
1232                  * time to the client side for its cache.
1233                  */
1234                 va.va_mask = AT_ALL;    /* now we want everything */
1235 
1236                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1237 
1238                 /* check for overflows */
1239                 if (!error) {
1240                         acl_perm(vp, exi, &va, cr);
1241                         error = vattr_to_nattr(&va, &ns->ns_attr);
1242                 }
1243         }
1244 
1245 out:
1246         if (in_crit)
1247                 nbl_end_crit(vp);
1248         VN_RELE(vp);
1249 
1250         /* check if a monitor detected a delegation conflict */
1251         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1252                 /* mark as wouldblock so response is dropped */
1253                 curthread->t_flag |= T_WOULDBLOCK;
1254         else
1255                 ns->ns_status = puterrno(error);
1256 
1257 }
1258 
/*
 * One pending v2 write request queued on a write cluster.  Lives on the
 * stack of the service thread that issued it; the thread waits until the
 * cluster processor fills in ns->ns_status (initially RFSWRITE_INITVAL).
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* decoded write arguments */
	struct nfsattrstat *ns;		/* reply; ns_status signals completion */
	struct svc_req *req;		/* originating RPC request */
	cred_t *cr;			/* credentials of the requester */
	bool_t ro;			/* export mounted read-only */
	kthread_t *thread;		/* waiter, for T_WOULDBLOCK flagging */
	struct rfs_async_write *list;	/* next request, sorted by offset */
};
1268 
/*
 * A cluster of write requests to the same file, matched by file handle.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by all requests */
	kcondvar_t cv;			/* waiters block here until status set */
	struct rfs_async_write *list;	/* requests, sorted by offset */
	struct rfs_async_write_list *next;	/* next cluster */
};
1275 
/*
 * NOTE(review): these three file-scope globals appear superseded by the
 * per-zone state in nfs_srv_t (async_write_head, async_write_lock,
 * write_async) that rfs_write() actually uses below -- confirm nothing
 * else (e.g. debugger macros) still references them before removing.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max on-stack iovecs for a clustered write; larger counts kmem_alloc. */
#define MAXCLIOVECS	42
/* Sentinel "not yet processed" status; 0 would read as NFS_OK. */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* Counters: clusters served from the stack iovec array vs. allocated. */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1287 
1288 /*
1289  * Write data to file.
1290  * Returns attributes of a file after writing some data to it.
1291  */
1292 void
1293 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1294     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1295 {
1296         int error;
1297         vnode_t *vp;
1298         rlim64_t rlimit;
1299         struct vattr va;
1300         struct uio uio;
1301         struct rfs_async_write_list *lp;
1302         struct rfs_async_write_list *nlp;
1303         struct rfs_async_write *rp;
1304         struct rfs_async_write *nrp;
1305         struct rfs_async_write *trp;
1306         struct rfs_async_write *lrp;
1307         int data_written;
1308         int iovcnt;
1309         mblk_t *m;
1310         struct iovec *iovp;
1311         struct iovec *niovp;
1312         struct iovec iov[MAXCLIOVECS];
1313         int count;
1314         int rcount;
1315         uint_t off;
1316         uint_t len;
1317         struct rfs_async_write nrpsp;
1318         struct rfs_async_write_list nlpsp;
1319         ushort_t t_flag;
1320         cred_t *savecred;
1321         int in_crit = 0;
1322         caller_context_t ct;
1323         nfs_srv_t *nsrv;
1324 
1325         ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
1326         nsrv = nfs_get_srv();
1327         if (!nsrv->write_async) {
1328                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1329                 return;
1330         }
1331 
1332         /*
1333          * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1334          * is considered an OK.
1335          */
1336         ns->ns_status = RFSWRITE_INITVAL;
1337 
1338         nrp = &nrpsp;
1339         nrp->wa = wa;
1340         nrp->ns = ns;
1341         nrp->req = req;
1342         nrp->cr = cr;
1343         nrp->ro = ro;
1344         nrp->thread = curthread;
1345 
1346         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1347 
1348         /*
1349          * Look to see if there is already a cluster started
1350          * for this file.
1351          */
1352         mutex_enter(&nsrv->async_write_lock);
1353         for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1354                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1355                     sizeof (fhandle_t)) == 0)
1356                         break;
1357         }
1358 
1359         /*
1360          * If lp is non-NULL, then there is already a cluster
1361          * started.  We need to place ourselves in the cluster
1362          * list in the right place as determined by starting
1363          * offset.  Conflicts with non-blocking mandatory locked
1364          * regions will be checked when the cluster is processed.
1365          */
1366         if (lp != NULL) {
1367                 rp = lp->list;
1368                 trp = NULL;
1369                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1370                         trp = rp;
1371                         rp = rp->list;
1372                 }
1373                 nrp->list = rp;
1374                 if (trp == NULL)
1375                         lp->list = nrp;
1376                 else
1377                         trp->list = nrp;
1378                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1379                         cv_wait(&lp->cv, &nsrv->async_write_lock);
1380                 mutex_exit(&nsrv->async_write_lock);
1381 
1382                 return;
1383         }
1384 
1385         /*
1386          * No cluster started yet, start one and add ourselves
1387          * to the list of clusters.
1388          */
1389         nrp->list = NULL;
1390 
1391         nlp = &nlpsp;
1392         nlp->fhp = &wa->wa_fhandle;
1393         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1394         nlp->list = nrp;
1395         nlp->next = NULL;
1396 
1397         if (nsrv->async_write_head == NULL) {
1398                 nsrv->async_write_head = nlp;
1399         } else {
1400                 lp = nsrv->async_write_head;
1401                 while (lp->next != NULL)
1402                         lp = lp->next;
1403                 lp->next = nlp;
1404         }
1405         mutex_exit(&nsrv->async_write_lock);
1406 
1407         /*
1408          * Convert the file handle common to all of the requests
1409          * in this cluster to a vnode.
1410          */
1411         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1412         if (vp == NULL) {
1413                 mutex_enter(&nsrv->async_write_lock);
1414                 if (nsrv->async_write_head == nlp)
1415                         nsrv->async_write_head = nlp->next;
1416                 else {
1417                         lp = nsrv->async_write_head;
1418                         while (lp->next != nlp)
1419                                 lp = lp->next;
1420                         lp->next = nlp->next;
1421                 }
1422                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1423                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1424                         rp->ns->ns_status = NFSERR_STALE;
1425                         rp->thread->t_flag |= t_flag;
1426                 }
1427                 cv_broadcast(&nlp->cv);
1428                 mutex_exit(&nsrv->async_write_lock);
1429 
1430                 return;
1431         }
1432 
1433         /*
1434          * Can only write regular files.  Attempts to write any
1435          * other file types fail with EISDIR.
1436          */
1437         if (vp->v_type != VREG) {
1438                 VN_RELE(vp);
1439                 mutex_enter(&nsrv->async_write_lock);
1440                 if (nsrv->async_write_head == nlp)
1441                         nsrv->async_write_head = nlp->next;
1442                 else {
1443                         lp = nsrv->async_write_head;
1444                         while (lp->next != nlp)
1445                                 lp = lp->next;
1446                         lp->next = nlp->next;
1447                 }
1448                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1449                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1450                         rp->ns->ns_status = NFSERR_ISDIR;
1451                         rp->thread->t_flag |= t_flag;
1452                 }
1453                 cv_broadcast(&nlp->cv);
1454                 mutex_exit(&nsrv->async_write_lock);
1455 
1456                 return;
1457         }
1458 
1459         /*
1460          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1461          * deadlock with ufs.
1462          */
1463         if (nbl_need_check(vp)) {
1464                 nbl_start_crit(vp, RW_READER);
1465                 in_crit = 1;
1466         }
1467 
1468         ct.cc_sysid = 0;
1469         ct.cc_pid = 0;
1470         ct.cc_caller_id = nfs2_srv_caller_id;
1471         ct.cc_flags = CC_DONTBLOCK;
1472 
1473         /*
1474          * Lock the file for writing.  This operation provides
1475          * the delay which allows clusters to grow.
1476          */
1477         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1478 
1479         /* check if a monitor detected a delegation conflict */
1480         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1481                 if (in_crit)
1482                         nbl_end_crit(vp);
1483                 VN_RELE(vp);
1484                 /* mark as wouldblock so response is dropped */
1485                 curthread->t_flag |= T_WOULDBLOCK;
1486                 mutex_enter(&nsrv->async_write_lock);
1487                 if (nsrv->async_write_head == nlp)
1488                         nsrv->async_write_head = nlp->next;
1489                 else {
1490                         lp = nsrv->async_write_head;
1491                         while (lp->next != nlp)
1492                                 lp = lp->next;
1493                         lp->next = nlp->next;
1494                 }
1495                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1496                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1497                                 rp->ns->ns_status = puterrno(error);
1498                                 rp->thread->t_flag |= T_WOULDBLOCK;
1499                         }
1500                 }
1501                 cv_broadcast(&nlp->cv);
1502                 mutex_exit(&nsrv->async_write_lock);
1503 
1504                 return;
1505         }
1506 
1507         /*
1508          * Disconnect this cluster from the list of clusters.
1509          * The cluster that is being dealt with must be fixed
1510          * in size after this point, so there is no reason
1511          * to leave it on the list so that new requests can
1512          * find it.
1513          *
1514          * The algorithm is that the first write request will
1515          * create a cluster, convert the file handle to a
1516          * vnode pointer, and then lock the file for writing.
1517          * This request is not likely to be clustered with
1518          * any others.  However, the next request will create
1519          * a new cluster and be blocked in VOP_RWLOCK while
1520          * the first request is being processed.  This delay
1521          * will allow more requests to be clustered in this
1522          * second cluster.
1523          */
1524         mutex_enter(&nsrv->async_write_lock);
1525         if (nsrv->async_write_head == nlp)
1526                 nsrv->async_write_head = nlp->next;
1527         else {
1528                 lp = nsrv->async_write_head;
1529                 while (lp->next != nlp)
1530                         lp = lp->next;
1531                 lp->next = nlp->next;
1532         }
1533         mutex_exit(&nsrv->async_write_lock);
1534 
1535         /*
1536          * Step through the list of requests in this cluster.
1537          * We need to check permissions to make sure that all
1538          * of the requests have sufficient permission to write
1539          * the file.  A cluster can be composed of requests
1540          * from different clients and different users on each
1541          * client.
1542          *
1543          * As a side effect, we also calculate the size of the
1544          * byte range that this cluster encompasses.
1545          */
1546         rp = nlp->list;
1547         off = rp->wa->wa_offset;
1548         len = (uint_t)0;
1549         do {
1550                 if (rdonly(rp->ro, vp)) {
1551                         rp->ns->ns_status = NFSERR_ROFS;
1552                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1553                         rp->thread->t_flag |= t_flag;
1554                         continue;
1555                 }
1556 
1557                 va.va_mask = AT_UID|AT_MODE;
1558 
1559                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1560 
1561                 if (!error) {
1562                         if (crgetuid(rp->cr) != va.va_uid) {
1563                                 /*
1564                                  * This is a kludge to allow writes of files
1565                                  * created with read only permission.  The
1566                                  * owner of the file is always allowed to
1567                                  * write it.
1568                                  */
1569                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1570                         }
1571                         if (!error && MANDLOCK(vp, va.va_mode))
1572                                 error = EACCES;
1573                 }
1574 
1575                 /*
1576                  * Check for a conflict with a nbmand-locked region.
1577                  */
1578                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1579                     rp->wa->wa_count, 0, NULL)) {
1580                         error = EACCES;
1581                 }
1582 
1583                 if (error) {
1584                         rp->ns->ns_status = puterrno(error);
1585                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1586                         rp->thread->t_flag |= t_flag;
1587                         continue;
1588                 }
1589                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1590                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1591         } while ((rp = rp->list) != NULL);
1592 
1593         /*
1594          * Step through the cluster attempting to gather as many
1595          * requests which are contiguous as possible.  These
1596          * contiguous requests are handled via one call to VOP_WRITE
1597          * instead of different calls to VOP_WRITE.  We also keep
1598          * track of the fact that any data was written.
1599          */
1600         rp = nlp->list;
1601         data_written = 0;
1602         do {
1603                 /*
1604                  * Skip any requests which are already marked as having an
1605                  * error.
1606                  */
1607                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1608                         rp = rp->list;
1609                         continue;
1610                 }
1611 
1612                 /*
1613                  * Count the number of iovec's which are required
1614                  * to handle this set of requests.  One iovec is
1615                  * needed for each data buffer, whether addressed
1616                  * by wa_data or by the b_rptr pointers in the
1617                  * mblk chains.
1618                  */
1619                 iovcnt = 0;
1620                 lrp = rp;
1621                 for (;;) {
1622                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1623                                 iovcnt++;
1624                         else {
1625                                 m = lrp->wa->wa_mblk;
1626                                 while (m != NULL) {
1627                                         iovcnt++;
1628                                         m = m->b_cont;
1629                                 }
1630                         }
1631                         if (lrp->list == NULL ||
1632                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1633                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1634                             lrp->list->wa->wa_offset) {
1635                                 lrp = lrp->list;
1636                                 break;
1637                         }
1638                         lrp = lrp->list;
1639                 }
1640 
1641                 if (iovcnt <= MAXCLIOVECS) {
1642 #ifdef DEBUG
1643                         rfs_write_hits++;
1644 #endif
1645                         niovp = iov;
1646                 } else {
1647 #ifdef DEBUG
1648                         rfs_write_misses++;
1649 #endif
1650                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1651                 }
1652                 /*
1653                  * Put together the scatter/gather iovecs.
1654                  */
1655                 iovp = niovp;
1656                 trp = rp;
1657                 count = 0;
1658                 do {
1659                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1660                                 if (trp->wa->wa_rlist) {
1661                                         iovp->iov_base =
1662                                             (char *)((trp->wa->wa_rlist)->
1663                                             u.c_daddr3);
1664                                         iovp->iov_len = trp->wa->wa_count;
1665                                 } else  {
1666                                         iovp->iov_base = trp->wa->wa_data;
1667                                         iovp->iov_len = trp->wa->wa_count;
1668                                 }
1669                                 iovp++;
1670                         } else {
1671                                 m = trp->wa->wa_mblk;
1672                                 rcount = trp->wa->wa_count;
1673                                 while (m != NULL) {
1674                                         iovp->iov_base = (caddr_t)m->b_rptr;
1675                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1676                                         rcount -= iovp->iov_len;
1677                                         if (rcount < 0)
1678                                                 iovp->iov_len += rcount;
1679                                         iovp++;
1680                                         if (rcount <= 0)
1681                                                 break;
1682                                         m = m->b_cont;
1683                                 }
1684                         }
1685                         count += trp->wa->wa_count;
1686                         trp = trp->list;
1687                 } while (trp != lrp);
1688 
1689                 uio.uio_iov = niovp;
1690                 uio.uio_iovcnt = iovcnt;
1691                 uio.uio_segflg = UIO_SYSSPACE;
1692                 uio.uio_extflg = UIO_COPY_DEFAULT;
1693                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1694                 uio.uio_resid = count;
1695                 /*
1696                  * The limit is checked on the client. We
1697                  * should allow any size writes here.
1698                  */
1699                 uio.uio_llimit = curproc->p_fsz_ctl;
1700                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1701                 if (rlimit < (rlim64_t)uio.uio_resid)
1702                         uio.uio_resid = (uint_t)rlimit;
1703 
1704                 /*
1705                  * For now we assume no append mode.
1706                  */
1707 
1708                 /*
1709                  * We're changing creds because VM may fault
1710                  * and we need the cred of the current
1711                  * thread to be used if quota * checking is
1712                  * enabled.
1713                  */
1714                 savecred = curthread->t_cred;
1715                 curthread->t_cred = cr;
1716                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1717                 curthread->t_cred = savecred;
1718 
1719                 /* check if a monitor detected a delegation conflict */
1720                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1721                         /* mark as wouldblock so response is dropped */
1722                         curthread->t_flag |= T_WOULDBLOCK;
1723 
1724                 if (niovp != iov)
1725                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1726 
1727                 if (!error) {
1728                         data_written = 1;
1729                         /*
1730                          * Get attributes again so we send the latest mod
1731                          * time to the client side for its cache.
1732                          */
1733                         va.va_mask = AT_ALL;    /* now we want everything */
1734 
1735                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1736 
1737                         if (!error)
1738                                 acl_perm(vp, exi, &va, rp->cr);
1739                 }
1740 
1741                 /*
1742                  * Fill in the status responses for each request
1743                  * which was just handled.  Also, copy the latest
1744                  * attributes in to the attribute responses if
1745                  * appropriate.
1746                  */
1747                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1748                 do {
1749                         rp->thread->t_flag |= t_flag;
1750                         /* check for overflows */
1751                         if (!error) {
1752                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1753                         }
1754                         rp->ns->ns_status = puterrno(error);
1755                         rp = rp->list;
1756                 } while (rp != lrp);
1757         } while (rp != NULL);
1758 
1759         /*
1760          * If any data was written at all, then we need to flush
1761          * the data and metadata to stable storage.
1762          */
1763         if (data_written) {
1764                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1765 
1766                 if (!error) {
1767                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1768                 }
1769         }
1770 
1771         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1772 
1773         if (in_crit)
1774                 nbl_end_crit(vp);
1775         VN_RELE(vp);
1776 
1777         t_flag = curthread->t_flag & T_WOULDBLOCK;
1778         mutex_enter(&nsrv->async_write_lock);
1779         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1780                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1781                         rp->ns->ns_status = puterrno(error);
1782                         rp->thread->t_flag |= t_flag;
1783                 }
1784         }
1785         cv_broadcast(&nlp->cv);
1786         mutex_exit(&nsrv->async_write_lock);
1787 
1788 }
1789 
1790 void *
1791 rfs_write_getfh(struct nfswriteargs *wa)
1792 {
1793         return (&wa->wa_fhandle);
1794 }
1795 
1796 /*
1797  * Create a file.
1798  * Creates a file with given attributes and returns those attributes
1799  * and an fhandle for the new file.
1800  */
1801 void
1802 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1803     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1804 {
1805         int error;
1806         int lookuperr;
1807         int in_crit = 0;
1808         struct vattr va;
1809         vnode_t *vp;
1810         vnode_t *realvp;
1811         vnode_t *dvp;
1812         char *name = args->ca_da.da_name;
1813         vnode_t *tvp = NULL;
1814         int mode;
1815         int lookup_ok;
1816         bool_t trunc;
1817         struct sockaddr *ca;
1818 
1819         /*
1820          * Disallow NULL paths
1821          */
1822         if (name == NULL || *name == '\0') {
1823                 dr->dr_status = NFSERR_ACCES;
1824                 return;
1825         }
1826 
1827         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1828         if (dvp == NULL) {
1829                 dr->dr_status = NFSERR_STALE;
1830                 return;
1831         }
1832 
1833         error = sattr_to_vattr(args->ca_sa, &va);
1834         if (error) {
1835                 dr->dr_status = puterrno(error);
1836                 return;
1837         }
1838 
1839         /*
1840          * Must specify the mode.
1841          */
1842         if (!(va.va_mask & AT_MODE)) {
1843                 VN_RELE(dvp);
1844                 dr->dr_status = NFSERR_INVAL;
1845                 return;
1846         }
1847 
1848         /*
1849          * This is a completely gross hack to make mknod
1850          * work over the wire until we can wack the protocol
1851          */
1852         if ((va.va_mode & IFMT) == IFCHR) {
1853                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1854                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1855                 else {
1856                         va.va_type = VCHR;
1857                         /*
1858                          * uncompress the received dev_t
1859                          * if the top half is zero indicating a request
1860                          * from an `older style' OS.
1861                          */
1862                         if ((va.va_size & 0xffff0000) == 0)
1863                                 va.va_rdev = nfsv2_expdev(va.va_size);
1864                         else
1865                                 va.va_rdev = (dev_t)va.va_size;
1866                 }
1867                 va.va_mask &= ~AT_SIZE;
1868         } else if ((va.va_mode & IFMT) == IFBLK) {
1869                 va.va_type = VBLK;
1870                 /*
1871                  * uncompress the received dev_t
1872                  * if the top half is zero indicating a request
1873                  * from an `older style' OS.
1874                  */
1875                 if ((va.va_size & 0xffff0000) == 0)
1876                         va.va_rdev = nfsv2_expdev(va.va_size);
1877                 else
1878                         va.va_rdev = (dev_t)va.va_size;
1879                 va.va_mask &= ~AT_SIZE;
1880         } else if ((va.va_mode & IFMT) == IFSOCK) {
1881                 va.va_type = VSOCK;
1882         } else {
1883                 va.va_type = VREG;
1884         }
1885         va.va_mode &= ~IFMT;
1886         va.va_mask |= AT_TYPE;
1887 
1888         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1889         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1890             MAXPATHLEN);
1891         if (name == NULL) {
1892                 dr->dr_status = puterrno(EINVAL);
1893                 return;
1894         }
1895 
1896         /*
1897          * Why was the choice made to use VWRITE as the mode to the
1898          * call to VOP_CREATE ? This results in a bug.  When a client
1899          * opens a file that already exists and is RDONLY, the second
1900          * open fails with an EACESS because of the mode.
1901          * bug ID 1054648.
1902          */
1903         lookup_ok = 0;
1904         mode = VWRITE;
1905         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1906                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1907                     NULL, NULL, NULL);
1908                 if (!error) {
1909                         struct vattr at;
1910 
1911                         lookup_ok = 1;
1912                         at.va_mask = AT_MODE;
1913                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1914                         if (!error)
1915                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1916                         VN_RELE(tvp);
1917                         tvp = NULL;
1918                 }
1919         }
1920 
1921         if (!lookup_ok) {
1922                 if (rdonly(ro, dvp)) {
1923                         error = EROFS;
1924                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1925                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1926                         error = EPERM;
1927                 } else {
1928                         error = 0;
1929                 }
1930         }
1931 
1932         /*
1933          * If file size is being modified on an already existing file
1934          * make sure that there are no conflicting non-blocking mandatory
1935          * locks in the region being manipulated. Return EACCES if there
1936          * are conflicting locks.
1937          */
1938         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1939                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1940                     NULL, NULL, NULL);
1941 
1942                 if (!lookuperr &&
1943                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1944                         VN_RELE(tvp);
1945                         curthread->t_flag |= T_WOULDBLOCK;
1946                         goto out;
1947                 }
1948 
1949                 if (!lookuperr && nbl_need_check(tvp)) {
1950                         /*
1951                          * The file exists. Now check if it has any
1952                          * conflicting non-blocking mandatory locks
1953                          * in the region being changed.
1954                          */
1955                         struct vattr bva;
1956                         u_offset_t offset;
1957                         ssize_t length;
1958 
1959                         nbl_start_crit(tvp, RW_READER);
1960                         in_crit = 1;
1961 
1962                         bva.va_mask = AT_SIZE;
1963                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1964                         if (!error) {
1965                                 if (va.va_size < bva.va_size) {
1966                                         offset = va.va_size;
1967                                         length = bva.va_size - va.va_size;
1968                                 } else {
1969                                         offset = bva.va_size;
1970                                         length = va.va_size - bva.va_size;
1971                                 }
1972                                 if (length) {
1973                                         if (nbl_conflict(tvp, NBL_WRITE,
1974                                             offset, length, 0, NULL)) {
1975                                                 error = EACCES;
1976                                         }
1977                                 }
1978                         }
1979                         if (error) {
1980                                 nbl_end_crit(tvp);
1981                                 VN_RELE(tvp);
1982                                 in_crit = 0;
1983                         }
1984                 } else if (tvp != NULL) {
1985                         VN_RELE(tvp);
1986                 }
1987         }
1988 
1989         if (!error) {
1990                 /*
1991                  * If filesystem is shared with nosuid the remove any
1992                  * setuid/setgid bits on create.
1993                  */
1994                 if (va.va_type == VREG &&
1995                     exi->exi_export.ex_flags & EX_NOSUID)
1996                         va.va_mode &= ~(VSUID | VSGID);
1997 
1998                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1999                     NULL, NULL);
2000 
2001                 if (!error) {
2002 
2003                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2004                                 trunc = TRUE;
2005                         else
2006                                 trunc = FALSE;
2007 
2008                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2009                                 VN_RELE(vp);
2010                                 curthread->t_flag |= T_WOULDBLOCK;
2011                                 goto out;
2012                         }
2013                         va.va_mask = AT_ALL;
2014 
2015                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2016 
2017                         /* check for overflows */
2018                         if (!error) {
2019                                 acl_perm(vp, exi, &va, cr);
2020                                 error = vattr_to_nattr(&va, &dr->dr_attr);
2021                                 if (!error) {
2022                                         error = makefh(&dr->dr_fhandle, vp,
2023                                             exi);
2024                                 }
2025                         }
2026                         /*
2027                          * Force modified metadata out to stable storage.
2028                          *
2029                          * if a underlying vp exists, pass it to VOP_FSYNC
2030                          */
2031                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
2032                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2033                         else
2034                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2035                         VN_RELE(vp);
2036                 }
2037 
2038                 if (in_crit) {
2039                         nbl_end_crit(tvp);
2040                         VN_RELE(tvp);
2041                 }
2042         }
2043 
2044         /*
2045          * Force modified data and metadata out to stable storage.
2046          */
2047         (void) VOP_FSYNC(dvp, 0, cr, NULL);
2048 
2049 out:
2050 
2051         VN_RELE(dvp);
2052 
2053         dr->dr_status = puterrno(error);
2054 
2055         if (name != args->ca_da.da_name)
2056                 kmem_free(name, MAXPATHLEN);
2057 }
2058 void *
2059 rfs_create_getfh(struct nfscreatargs *args)
2060 {
2061         return (args->ca_da.da_fhandle);
2062 }
2063 
2064 /*
2065  * Remove a file.
2066  * Remove named file from parent directory.
2067  */
2068 /* ARGSUSED */
2069 void
2070 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2071     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2072 {
2073         int error = 0;
2074         vnode_t *vp;
2075         vnode_t *targvp;
2076         int in_crit = 0;
2077 
2078         /*
2079          * Disallow NULL paths
2080          */
2081         if (da->da_name == NULL || *da->da_name == '\0') {
2082                 *status = NFSERR_ACCES;
2083                 return;
2084         }
2085 
2086         vp = nfs_fhtovp(da->da_fhandle, exi);
2087         if (vp == NULL) {
2088                 *status = NFSERR_STALE;
2089                 return;
2090         }
2091 
2092         if (rdonly(ro, vp)) {
2093                 VN_RELE(vp);
2094                 *status = NFSERR_ROFS;
2095                 return;
2096         }
2097 
2098         /*
2099          * Check for a conflict with a non-blocking mandatory share reservation.
2100          */
2101         error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2102             NULL, cr, NULL, NULL, NULL);
2103         if (error != 0) {
2104                 VN_RELE(vp);
2105                 *status = puterrno(error);
2106                 return;
2107         }
2108 
2109         /*
2110          * If the file is delegated to an v4 client, then initiate
2111          * recall and drop this request (by setting T_WOULDBLOCK).
2112          * The client will eventually re-transmit the request and
2113          * (hopefully), by then, the v4 client will have returned
2114          * the delegation.
2115          */
2116 
2117         if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2118                 VN_RELE(vp);
2119                 VN_RELE(targvp);
2120                 curthread->t_flag |= T_WOULDBLOCK;
2121                 return;
2122         }
2123 
2124         if (nbl_need_check(targvp)) {
2125                 nbl_start_crit(targvp, RW_READER);
2126                 in_crit = 1;
2127                 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2128                         error = EACCES;
2129                         goto out;
2130                 }
2131         }
2132 
2133         error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2134 
2135         /*
2136          * Force modified data and metadata out to stable storage.
2137          */
2138         (void) VOP_FSYNC(vp, 0, cr, NULL);
2139 
2140 out:
2141         if (in_crit)
2142                 nbl_end_crit(targvp);
2143         VN_RELE(targvp);
2144         VN_RELE(vp);
2145 
2146         *status = puterrno(error);
2147 
2148 }
2149 
2150 void *
2151 rfs_remove_getfh(struct nfsdiropargs *da)
2152 {
2153         return (da->da_fhandle);
2154 }
2155 
2156 /*
2157  * rename a file
2158  * Give a file (from) a new name (to).
2159  */
2160 /* ARGSUSED */
2161 void
2162 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2163     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2164 {
2165         int error = 0;
2166         vnode_t *fromvp;
2167         vnode_t *tovp;
2168         struct exportinfo *to_exi;
2169         fhandle_t *fh;
2170         vnode_t *srcvp;
2171         vnode_t *targvp;
2172         int in_crit = 0;
2173 
2174         fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2175         if (fromvp == NULL) {
2176                 *status = NFSERR_STALE;
2177                 return;
2178         }
2179 
2180         fh = args->rna_to.da_fhandle;
2181         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2182         if (to_exi == NULL) {
2183                 VN_RELE(fromvp);
2184                 *status = NFSERR_ACCES;
2185                 return;
2186         }
2187         exi_rele(to_exi);
2188 
2189         if (to_exi != exi) {
2190                 VN_RELE(fromvp);
2191                 *status = NFSERR_XDEV;
2192                 return;
2193         }
2194 
2195         tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2196         if (tovp == NULL) {
2197                 VN_RELE(fromvp);
2198                 *status = NFSERR_STALE;
2199                 return;
2200         }
2201 
2202         if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2203                 VN_RELE(tovp);
2204                 VN_RELE(fromvp);
2205                 *status = NFSERR_NOTDIR;
2206                 return;
2207         }
2208 
2209         /*
2210          * Disallow NULL paths
2211          */
2212         if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2213             args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2214                 VN_RELE(tovp);
2215                 VN_RELE(fromvp);
2216                 *status = NFSERR_ACCES;
2217                 return;
2218         }
2219 
2220         if (rdonly(ro, tovp)) {
2221                 VN_RELE(tovp);
2222                 VN_RELE(fromvp);
2223                 *status = NFSERR_ROFS;
2224                 return;
2225         }
2226 
2227         /*
2228          * Check for a conflict with a non-blocking mandatory share reservation.
2229          */
2230         error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2231             NULL, cr, NULL, NULL, NULL);
2232         if (error != 0) {
2233                 VN_RELE(tovp);
2234                 VN_RELE(fromvp);
2235                 *status = puterrno(error);
2236                 return;
2237         }
2238 
2239         /* Check for delegations on the source file */
2240 
2241         if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2242                 VN_RELE(tovp);
2243                 VN_RELE(fromvp);
2244                 VN_RELE(srcvp);
2245                 curthread->t_flag |= T_WOULDBLOCK;
2246                 return;
2247         }
2248 
2249         /* Check for delegation on the file being renamed over, if it exists */
2250 
2251         if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2252             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2253             NULL, NULL, NULL) == 0) {
2254 
2255                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2256                         VN_RELE(tovp);
2257                         VN_RELE(fromvp);
2258                         VN_RELE(srcvp);
2259                         VN_RELE(targvp);
2260                         curthread->t_flag |= T_WOULDBLOCK;
2261                         return;
2262                 }
2263                 VN_RELE(targvp);
2264         }
2265 
2266 
2267         if (nbl_need_check(srcvp)) {
2268                 nbl_start_crit(srcvp, RW_READER);
2269                 in_crit = 1;
2270                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2271                         error = EACCES;
2272                         goto out;
2273                 }
2274         }
2275 
2276         error = VOP_RENAME(fromvp, args->rna_from.da_name,
2277             tovp, args->rna_to.da_name, cr, NULL, 0);
2278 
2279         if (error == 0)
2280                 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2281                     strlen(args->rna_to.da_name));
2282 
2283         /*
2284          * Force modified data and metadata out to stable storage.
2285          */
2286         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2287         (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2288 
2289 out:
2290         if (in_crit)
2291                 nbl_end_crit(srcvp);
2292         VN_RELE(srcvp);
2293         VN_RELE(tovp);
2294         VN_RELE(fromvp);
2295 
2296         *status = puterrno(error);
2297 
2298 }
2299 void *
2300 rfs_rename_getfh(struct nfsrnmargs *args)
2301 {
2302         return (args->rna_from.da_fhandle);
2303 }
2304 
2305 /*
2306  * Link to a file.
2307  * Create a file (to) which is a hard link to the given file (from).
2308  */
2309 /* ARGSUSED */
2310 void
2311 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2312     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2313 {
2314         int error;
2315         vnode_t *fromvp;
2316         vnode_t *tovp;
2317         struct exportinfo *to_exi;
2318         fhandle_t *fh;
2319 
2320         fromvp = nfs_fhtovp(args->la_from, exi);
2321         if (fromvp == NULL) {
2322                 *status = NFSERR_STALE;
2323                 return;
2324         }
2325 
2326         fh = args->la_to.da_fhandle;
2327         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2328         if (to_exi == NULL) {
2329                 VN_RELE(fromvp);
2330                 *status = NFSERR_ACCES;
2331                 return;
2332         }
2333         exi_rele(to_exi);
2334 
2335         if (to_exi != exi) {
2336                 VN_RELE(fromvp);
2337                 *status = NFSERR_XDEV;
2338                 return;
2339         }
2340 
2341         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2342         if (tovp == NULL) {
2343                 VN_RELE(fromvp);
2344                 *status = NFSERR_STALE;
2345                 return;
2346         }
2347 
2348         if (tovp->v_type != VDIR) {
2349                 VN_RELE(tovp);
2350                 VN_RELE(fromvp);
2351                 *status = NFSERR_NOTDIR;
2352                 return;
2353         }
2354         /*
2355          * Disallow NULL paths
2356          */
2357         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2358                 VN_RELE(tovp);
2359                 VN_RELE(fromvp);
2360                 *status = NFSERR_ACCES;
2361                 return;
2362         }
2363 
2364         if (rdonly(ro, tovp)) {
2365                 VN_RELE(tovp);
2366                 VN_RELE(fromvp);
2367                 *status = NFSERR_ROFS;
2368                 return;
2369         }
2370 
2371         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2372 
2373         /*
2374          * Force modified data and metadata out to stable storage.
2375          */
2376         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2377         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2378 
2379         VN_RELE(tovp);
2380         VN_RELE(fromvp);
2381 
2382         *status = puterrno(error);
2383 
2384 }
2385 void *
2386 rfs_link_getfh(struct nfslinkargs *args)
2387 {
2388         return (args->la_from);
2389 }
2390 
2391 /*
2392  * Symbolicly link to a file.
2393  * Create a file (to) with the given attributes which is a symbolic link
2394  * to the given path name (to).
2395  */
2396 void
2397 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2398     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2399 {
2400         int error;
2401         struct vattr va;
2402         vnode_t *vp;
2403         vnode_t *svp;
2404         int lerror;
2405         struct sockaddr *ca;
2406         char *name = NULL;
2407 
2408         /*
2409          * Disallow NULL paths
2410          */
2411         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2412                 *status = NFSERR_ACCES;
2413                 return;
2414         }
2415 
2416         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2417         if (vp == NULL) {
2418                 *status = NFSERR_STALE;
2419                 return;
2420         }
2421 
2422         if (rdonly(ro, vp)) {
2423                 VN_RELE(vp);
2424                 *status = NFSERR_ROFS;
2425                 return;
2426         }
2427 
2428         error = sattr_to_vattr(args->sla_sa, &va);
2429         if (error) {
2430                 VN_RELE(vp);
2431                 *status = puterrno(error);
2432                 return;
2433         }
2434 
2435         if (!(va.va_mask & AT_MODE)) {
2436                 VN_RELE(vp);
2437                 *status = NFSERR_INVAL;
2438                 return;
2439         }
2440 
2441         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2442         name = nfscmd_convname(ca, exi, args->sla_tnm,
2443             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2444 
2445         if (name == NULL) {
2446                 *status = NFSERR_ACCES;
2447                 return;
2448         }
2449 
2450         va.va_type = VLNK;
2451         va.va_mask |= AT_TYPE;
2452 
2453         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2454 
2455         /*
2456          * Force new data and metadata out to stable storage.
2457          */
2458         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2459             NULL, cr, NULL, NULL, NULL);
2460 
2461         if (!lerror) {
2462                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2463                 VN_RELE(svp);
2464         }
2465 
2466         /*
2467          * Force modified data and metadata out to stable storage.
2468          */
2469         (void) VOP_FSYNC(vp, 0, cr, NULL);
2470 
2471         VN_RELE(vp);
2472 
2473         *status = puterrno(error);
2474         if (name != args->sla_tnm)
2475                 kmem_free(name, MAXPATHLEN);
2476 
2477 }
2478 void *
2479 rfs_symlink_getfh(struct nfsslargs *args)
2480 {
2481         return (args->sla_from.da_fhandle);
2482 }
2483 
2484 /*
2485  * Make a directory.
2486  * Create a directory with the given name, parent directory, and attributes.
2487  * Returns a file handle and attributes for the new directory.
2488  */
2489 /* ARGSUSED */
2490 void
2491 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2492     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2493 {
2494         int error;
2495         struct vattr va;
2496         vnode_t *dvp = NULL;
2497         vnode_t *vp;
2498         char *name = args->ca_da.da_name;
2499 
2500         /*
2501          * Disallow NULL paths
2502          */
2503         if (name == NULL || *name == '\0') {
2504                 dr->dr_status = NFSERR_ACCES;
2505                 return;
2506         }
2507 
2508         vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2509         if (vp == NULL) {
2510                 dr->dr_status = NFSERR_STALE;
2511                 return;
2512         }
2513 
2514         if (rdonly(ro, vp)) {
2515                 VN_RELE(vp);
2516                 dr->dr_status = NFSERR_ROFS;
2517                 return;
2518         }
2519 
2520         error = sattr_to_vattr(args->ca_sa, &va);
2521         if (error) {
2522                 VN_RELE(vp);
2523                 dr->dr_status = puterrno(error);
2524                 return;
2525         }
2526 
2527         if (!(va.va_mask & AT_MODE)) {
2528                 VN_RELE(vp);
2529                 dr->dr_status = NFSERR_INVAL;
2530                 return;
2531         }
2532 
2533         va.va_type = VDIR;
2534         va.va_mask |= AT_TYPE;
2535 
2536         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2537 
2538         if (!error) {
2539                 /*
2540                  * Attribtutes of the newly created directory should
2541                  * be returned to the client.
2542                  */
2543                 va.va_mask = AT_ALL; /* We want everything */
2544                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2545 
2546                 /* check for overflows */
2547                 if (!error) {
2548                         acl_perm(vp, exi, &va, cr);
2549                         error = vattr_to_nattr(&va, &dr->dr_attr);
2550                         if (!error) {
2551                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2552                         }
2553                 }
2554                 /*
2555                  * Force new data and metadata out to stable storage.
2556                  */
2557                 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2558                 VN_RELE(dvp);
2559         }
2560 
2561         /*
2562          * Force modified data and metadata out to stable storage.
2563          */
2564         (void) VOP_FSYNC(vp, 0, cr, NULL);
2565 
2566         VN_RELE(vp);
2567 
2568         dr->dr_status = puterrno(error);
2569 
2570 }
2571 void *
2572 rfs_mkdir_getfh(struct nfscreatargs *args)
2573 {
2574         return (args->ca_da.da_fhandle);
2575 }
2576 
2577 /*
2578  * Remove a directory.
2579  * Remove the given directory name from the given parent directory.
2580  */
2581 /* ARGSUSED */
2582 void
2583 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2584     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2585 {
2586         int error;
2587         vnode_t *vp;
2588 
2589         /*
2590          * Disallow NULL paths
2591          */
2592         if (da->da_name == NULL || *da->da_name == '\0') {
2593                 *status = NFSERR_ACCES;
2594                 return;
2595         }
2596 
2597         vp = nfs_fhtovp(da->da_fhandle, exi);
2598         if (vp == NULL) {
2599                 *status = NFSERR_STALE;
2600                 return;
2601         }
2602 
2603         if (rdonly(ro, vp)) {
2604                 VN_RELE(vp);
2605                 *status = NFSERR_ROFS;
2606                 return;
2607         }
2608 
2609         /*
2610          * VOP_RMDIR takes a third argument (the current
2611          * directory of the process).  That's because someone
2612          * wants to return EINVAL if one tries to remove ".".
2613          * Of course, NFS servers have no idea what their
2614          * clients' current directories are.  We fake it by
2615          * supplying a vnode known to exist and illegal to
2616          * remove.
2617          */
2618         error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2619 
2620         /*
2621          * Force modified data and metadata out to stable storage.
2622          */
2623         (void) VOP_FSYNC(vp, 0, cr, NULL);
2624 
2625         VN_RELE(vp);
2626 
2627         /*
2628          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2629          * if the directory is not empty.  A System V NFS server
2630          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2631          * over the wire.
2632          */
2633         if (error == EEXIST)
2634                 *status = NFSERR_NOTEMPTY;
2635         else
2636                 *status = puterrno(error);
2637 
2638 }
2639 void *
2640 rfs_rmdir_getfh(struct nfsdiropargs *da)
2641 {
2642         return (da->da_fhandle);
2643 }
2644 
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;
	struct sockaddr *ca;
	size_t nents;
	int ret;

	/* Translate the wire file handle into the directory vnode. */
	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Shared (reader) lock is enough: we only read the directory. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request gets an empty, non-EOF reply. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the client's count to the server maximum transfer size. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 *
		 * Nothing consumed from the uio means no entries at or
		 * past rda_offset: report EOF with an empty reply.
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * NOTE(review): on a VOP_READDIR error, rd->rd_size is consumed
	 * below without being set in this function — presumably the
	 * dispatcher zeroes *rd before calling us; verify against caller.
	 *
	 * Character-set conversion of entry names for this client
	 * (per-export charset mapping keyed on the caller's address).
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	/*
	 * If conversion produced a new buffer, hand it to the reply and
	 * free the original.  rd_bufsize is set to rda_count — assumes
	 * the converted buffer was allocated with that size (matches
	 * nfscmd_convdirplus contract — TODO confirm).
	 */
	if (ndata == NULL) {
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2778 void *
2779 rfs_readdir_getfh(struct nfsrddirargs *rda)
2780 {
2781         return (&rda->rda_fh);
2782 }
2783 void
2784 rfs_rddirfree(struct nfsrddirres *rd)
2785 {
2786         if (rd->rd_entries != NULL)
2787                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2788 }
2789 
/* ARGSUSED */
void
rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct statvfs64 sb;
	vnode_t *vp;

	/* Translate the wire file handle into a vnode on the export. */
	vp = nfs_fhtovp(fh, exi);
	if (vp == NULL) {
		fs->fs_status = NFSERR_STALE;
		return;
	}

	/* Query the underlying filesystem for space statistics. */
	error = VFS_STATVFS(vp->v_vfsp, &sb);

	if (!error) {
		/* Server's preferred transfer size, then block counts. */
		fs->fs_tsize = nfstsize();
		fs->fs_bsize = sb.f_frsize;
		fs->fs_blocks = sb.f_blocks;
		fs->fs_bfree = sb.f_bfree;
		fs->fs_bavail = sb.f_bavail;
	}

	VN_RELE(vp);

	fs->fs_status = puterrno(error);

}
2820 void *
2821 rfs_statfs_getfh(fhandle_t *fh)
2822 {
2823         return (fh);
2824 }
2825 
/*
 * Convert an NFSv2 settable-attributes structure into a vattr,
 * setting va_mask bits only for the fields the client actually
 * supplied (the wire sentinel for "don't set" is all-ones).
 * Returns 0, or EOVERFLOW if a supplied time cannot be represented.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both seconds and microseconds must be -1 to mean "don't set". */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2892 
/*
 * Map vnode type (vtype_t, used as the index) to the NFSv2
 * over-the-wire file type.  Types with no NFSv2 equivalent map
 * to 0; VFIFO (index 6, mapped to 0 here) is special-cased in
 * vattr_to_nattr() via NA_SETFIFO.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2896 
/*
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* (ushort_t)-1 in va_mode means "unknown"; pass -1 on the wire. */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* Map "unknown" and nobody ids to their wire representations. */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are values bigger than 32 bits supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* NFSv2 wire times are seconds + microseconds. */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone. See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
3003 
3004 /*
3005  * acl v2 support: returns approximate permission.
3006  *      default: returns minimal permission (more restrictive)
3007  *      aclok: returns maximal permission (less restrictive)
 *      This routine changes the permissions that are already in *va.
3009  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3010  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
3011  */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t	vsa;
	int		aclcnt;
	aclent_t	*aclentp;
	mode_t		mask_perm;
	mode_t		grp_perm;
	mode_t		other_perm;
	mode_t		other_orig;
	int		error;

	/* dont care default acl */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/* non-trivial ACL */
			aclentp = vsa.vsa_aclentp;
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/*
				 * maximal permissions: union of all entries,
				 * then apply the CLASS_OBJ mask.
				 *
				 * NOTE(review): mask_perm and other_orig are
				 * only assigned inside the loop — presumably
				 * every non-trivial ACL contains CLASS_OBJ
				 * and OTHER_OBJ entries; verify that
				 * invariant holds for VOP_GETSECATTR output.
				 */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				/* Mask group/other, then restore OTHER_OBJ. */
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/*
				 * minimal permissions: intersection of all
				 * entries, starting from full group/other
				 * permission bits.
				 */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy to va */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		/* Free the entry list VOP_GETSECATTR allocated for us. */
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}
3108 
/*
 * One-time NFSv2 server module initialization: obtain the caller id
 * used for this server's VOP calls (presumably for non-blocking
 * mandatory lock checks — verify against nfs2_srv_caller_id users).
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
}
3114 
/*
 * Counterpart of rfs_srvrinit(); nothing to tear down.
 */
void
rfs_srvrfini(void)
{
}
3119 
3120 /* ARGSUSED */
3121 void
3122 rfs_srv_zone_init(nfs_globals_t *ng)
3123 {
3124         nfs_srv_t *ns;
3125 
3126         ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3127 
3128         mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3129         ns->write_async = 1;
3130 
3131         ng->nfs_srv = ns;
3132 }
3133 
3134 /* ARGSUSED */
3135 void
3136 rfs_srv_zone_fini(nfs_globals_t *ng)
3137 {
3138         nfs_srv_t *ns = ng->nfs_srv;
3139 
3140         ng->nfs_srv = NULL;
3141 
3142         mutex_destroy(&ns->async_write_lock);
3143         kmem_free(ns, sizeof (*ns));
3144 }
3145 
3146 static int
3147 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3148 {
3149         struct clist    *wcl;
3150         int             wlist_len;
3151         uint32_t        count = rr->rr_count;
3152 
3153         wcl = ra->ra_wlist;
3154 
3155         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3156                 return (FALSE);
3157         }
3158 
3159         wcl = ra->ra_wlist;
3160         rr->rr_ok.rrok_wlist_len = wlist_len;
3161         rr->rr_ok.rrok_wlist = wcl;
3162 
3163         return (TRUE);
3164 }