NEX-17125 NFS: nbmand lock entered but not exited on error path
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-15279 support NFS server in zone
NEX-15520 online NFS shares cause zoneadm halt to hang in nfs_export_zone_fini
Portions contributed by: Dan Kruchinin dan.kruchinin@nexenta.com
Portions contributed by: Stepan Zastupov stepan.zastupov@gmail.com
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-9275 Got "bad mutex" panic when run IO to nfs share from clients
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-3524 CLONE - Port NEX-3505 "wrong authentication" messages with root=@0.0.0.0/0 set, result in loss of client access
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
NEX-3533 CLONE - Port NEX-3019 NFSv3 writes underneath mounted filesystem to directory
Reviewed by: Dan Fields <dan.fields@nexenta.com>
NEX-3095 Issues related to NFS nohide
Reviewed by: Dan Fields <dan.fields@nexenta.com>
NEX-1128 NFS server: Generic uid and gid remapping for AUTH_SYS
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
OS-20 share_nfs(1m) charset handling is unreliable
OS-22 Page fault at nfscmd_dropped_entrysize+0x1e()
OS-23 NFSv2/3/4: READDIR responses are inconsistent when charset conversion fails
OS-24 rfs3_readdir(): Issues related to nfscmd_convdirent()
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
re #13613 rb4516 Tunables needs volatile keyword
closes #12112 rb3823 - nfs-nohide: lookup("..") for submount should be correct
re #3541 rb11254 - nfs nohide - "nfssrv: need ability to go to submounts for v3 and v2 protocols"
   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */

  21 /*
  22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2016 by Delphix. All rights reserved.
  25  */
  26 
  27 /*
  28  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  29  *      All rights reserved.
  30  */
  31 
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/systm.h>
  35 #include <sys/cred.h>
  36 #include <sys/buf.h>
  37 #include <sys/vfs.h>
  38 #include <sys/vnode.h>
  39 #include <sys/uio.h>
  40 #include <sys/stat.h>
  41 #include <sys/errno.h>
  42 #include <sys/sysmacros.h>
  43 #include <sys/statvfs.h>
  44 #include <sys/kmem.h>
  45 #include <sys/kstat.h>
  46 #include <sys/dirent.h>
  47 #include <sys/cmn_err.h>
  48 #include <sys/debug.h>
  49 #include <sys/vtrace.h>
  50 #include <sys/mode.h>
  51 #include <sys/acl.h>
  52 #include <sys/nbmlock.h>
  53 #include <sys/policy.h>
  54 #include <sys/sdt.h>
  55 
  56 #include <rpc/types.h>
  57 #include <rpc/auth.h>
  58 #include <rpc/svc.h>
  59 
  60 #include <nfs/nfs.h>
  61 #include <nfs/export.h>
  62 #include <nfs/nfs_cmd.h>
  63 
  64 #include <vm/hat.h>
  65 #include <vm/as.h>
  66 #include <vm/seg.h>
  67 #include <vm/seg_map.h>
  68 #include <vm/seg_kmem.h>
  69 
  70 #include <sys/strsubr.h>
  71 
  72 /*
  73  * These are the interface routines for the server side of the
  74  * Network File System.  See the NFS version 2 protocol specification
  75  * for a description of this interface.
  76  */
  77 
  78 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  79 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
  80                         cred_t *);
  81 
  82 /*
  83  * Some "over the wire" UNIX file types.  These are encoded
  84  * into the mode.  This needs to be fixed in the next rev.
  85  */
  86 #define IFMT            0170000         /* type of file */
  87 #define IFCHR           0020000         /* character special */
  88 #define IFBLK           0060000         /* block special */
  89 #define IFSOCK          0140000         /* socket */
  90 
  91 u_longlong_t nfs2_srv_caller_id;
  92 
  93 /*
  94  * Get file attributes.
  95  * Returns the current attributes of the file with the given fhandle.
  96  */
  97 /* ARGSUSED */
  98 void
  99 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 100     struct svc_req *req, cred_t *cr, bool_t ro)
 101 {
 102         int error;
 103         vnode_t *vp;
 104         struct vattr va;
 105 
 106         vp = nfs_fhtovp(fhp, exi);
 107         if (vp == NULL) {
 108                 ns->ns_status = NFSERR_STALE;
 109                 return;
 110         }
 111 


 312                 }
 313         }
 314 
 315         ct.cc_flags = 0;
 316 
 317         /*
 318          * Force modified metadata out to stable storage.
 319          */
 320         (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
 321 
 322         VN_RELE(vp);
 323 
 324         ns->ns_status = puterrno(error);
 325 }
 326 void *
 327 rfs_setattr_getfh(struct nfssaargs *args)
 328 {
 329         return (&args->saa_fh);
 330 }
 331 
 332 /*
 333  * Directory lookup.
 334  * Returns an fhandle and file attributes for file name in a directory.
 335  */
 336 /* ARGSUSED */
 337 void
 338 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 339     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 340 {
 341         int error;
 342         vnode_t *dvp;
 343         vnode_t *vp;
 344         struct vattr va;
 345         fhandle_t *fhp = da->da_fhandle;
 346         struct sec_ol sec = {0, 0};
 347         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 348         char *name;
 349         struct sockaddr *ca;
 350 
 351         /*
 352          * Trusted Extension doesn't support NFSv2. MOUNT


 354          * access via WebNFS here.
 355          */
 356         if (is_system_labeled() && req->rq_vers == 2) {
 357                 dr->dr_status = NFSERR_ACCES;
 358                 return;
 359         }
 360 
 361         /*
 362          * Disallow NULL paths
 363          */
 364         if (da->da_name == NULL || *da->da_name == '\0') {
 365                 dr->dr_status = NFSERR_ACCES;
 366                 return;
 367         }
 368 
 369         /*
 370          * Allow lookups from the root - the default
 371          * location of the public filehandle.
 372          */
 373         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 374                 dvp = rootdir;
 375                 VN_HOLD(dvp);
 376         } else {
 377                 dvp = nfs_fhtovp(fhp, exi);
 378                 if (dvp == NULL) {
 379                         dr->dr_status = NFSERR_STALE;
 380                         return;
 381                 }
 382         }
 383 
 384         /*
  385          * Do not allow lookup beyond root.
 386          * If the filehandle matches a filehandle of the exi,
 387          * then the ".." refers beyond the root of an exported filesystem.
 388          */
 389         if (strcmp(da->da_name, "..") == 0 &&
 390             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 391                 VN_RELE(dvp);
 392                 dr->dr_status = NFSERR_NOENT;
 393                 return;
 394         }
 395 
 396         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 397         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 398             MAXPATHLEN);
 399 
 400         if (name == NULL) {
 401                 dr->dr_status = NFSERR_ACCES;
 402                 return;
 403         }
 404 
 405         /*
 406          * If the public filehandle is used then allow
 407          * a multi-component lookup, i.e. evaluate
 408          * a pathname and follow symbolic links if
 409          * necessary.
 410          *
 411          * This may result in a vnode in another filesystem
 412          * which is OK as long as the filesystem is exported.
 413          */
 414         if (PUBLIC_FH2(fhp)) {
 415                 publicfh_flag = TRUE;
 416                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 417                     &sec);
 418         } else {
 419                 /*
 420                  * Do a normal single component lookup.
 421                  */
 422                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 423                     NULL, NULL, NULL);
 424         }
 425 
 426         if (name != da->da_name)
 427                 kmem_free(name, MAXPATHLEN);
 428 
 429 
 430         if (!error) {
 431                 va.va_mask = AT_ALL;    /* we want everything */
 432 
 433                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 434 
 435                 /* check for overflows */
 436                 if (!error) {
 437                         acl_perm(vp, exi, &va, cr);
 438                         error = vattr_to_nattr(&va, &dr->dr_attr);
 439                         if (!error) {
 440                                 if (sec.sec_flags & SEC_QUERY)
 441                                         error = makefh_ol(&dr->dr_fhandle, exi,
 442                                             sec.sec_index);
 443                                 else {
 444                                         error = makefh(&dr->dr_fhandle, vp,
 445                                             exi);
 446                                         if (!error && publicfh_flag &&
 447                                             !chk_clnt_sec(exi, req))
 448                                                 auth_weak = TRUE;
 449                                 }
 450                         }
 451                 }
 452                 VN_RELE(vp);
 453         }
 454 
 455         VN_RELE(dvp);
 456 
 457         /*
 458          * If publicfh_flag is true then we have called rfs_publicfh_mclookup
 459          * and have obtained a new exportinfo in exi which needs to be
  460          * released. Note that the original exportinfo pointed to by exi
  461          * will be released by the caller, common_dispatch.
 462          */
 463         if (publicfh_flag && exi != NULL)
 464                 exi_rele(exi);
 465 
 466         /*
 467          * If it's public fh, no 0x81, and client's flavor is
 468          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 469          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 470          */
 471         if (auth_weak)
 472                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 473         else
 474                 dr->dr_status = puterrno(error);
 475 }
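/*
 * Illustrative note (the client-side usage below is an assumption, not
 * part of this file): a WebNFS client that mounts via the public
 * filehandle, e.g.
 *
 *	mount -F nfs -o public server:/export/docs /mnt
 *
 * sends the remaining path as a single multi-component LOOKUP, which is
 * the rfs_publicfh_mclookup() case handled in rfs_lookup() above.
 */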
 476 void *
 477 rfs_lookup_getfh(struct nfsdiropargs *da)
 478 {
 479         return (da->da_fhandle);
 480 }
 481 
 482 /*
 483  * Read symbolic link.
 484  * Returns the string in the symbolic link at the given fhandle.


 668          * Enter the critical region before calling VOP_RWLOCK
 669          * to avoid a deadlock with write requests.
 670          */
 671         if (nbl_need_check(vp)) {
 672                 nbl_start_crit(vp, RW_READER);
 673                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 674                     0, NULL)) {
 675                         nbl_end_crit(vp);
 676                         VN_RELE(vp);
 677                         rr->rr_data = NULL;
 678                         rr->rr_status = NFSERR_ACCES;
 679                         return;
 680                 }
 681                 in_crit = 1;
 682         }
 683 
 684         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 685 
 686         /* check if a monitor detected a delegation conflict */
 687         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 688                 VN_RELE(vp);
 689                 /* mark as wouldblock so response is dropped */
 690                 curthread->t_flag |= T_WOULDBLOCK;
 691 
 692                 rr->rr_data = NULL;
 693                 return;
 694         }
 695 
 696         va.va_mask = AT_ALL;
 697 
 698         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 699 
 700         if (error) {
 701                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 702                 if (in_crit)
 703                         nbl_end_crit(vp);
 704 
 705                 VN_RELE(vp);
 706                 rr->rr_data = NULL;
 707                 rr->rr_status = puterrno(error);


 993         }
 994 
 995         /*
 996          * We have to enter the critical region before calling VOP_RWLOCK
 997          * to avoid a deadlock with ufs.
 998          */
 999         if (nbl_need_check(vp)) {
1000                 nbl_start_crit(vp, RW_READER);
1001                 in_crit = 1;
1002                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1003                     wa->wa_count, 0, NULL)) {
1004                         error = EACCES;
1005                         goto out;
1006                 }
1007         }
1008 
1009         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1010 
1011         /* check if a monitor detected a delegation conflict */
1012         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1013                 VN_RELE(vp);
1014                 /* mark as wouldblock so response is dropped */
1015                 curthread->t_flag |= T_WOULDBLOCK;
1016                 return;
1017         }
1018 
1019         if (wa->wa_data || wa->wa_rlist) {
1020                 /* Do the RDMA thing if necessary */
1021                 if (wa->wa_rlist) {
1022                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1023                         iov[0].iov_len = wa->wa_count;
1024                 } else  {
1025                         iov[0].iov_base = wa->wa_data;
1026                         iov[0].iov_len = wa->wa_count;
1027                 }
1028                 uio.uio_iov = iov;
1029                 uio.uio_iovcnt = 1;
1030                 uio.uio_segflg = UIO_SYSSPACE;
1031                 uio.uio_extflg = UIO_COPY_DEFAULT;
1032                 uio.uio_loffset = (offset_t)wa->wa_offset;
1033                 uio.uio_resid = wa->wa_count;
1034                 /*
1035                  * The limit is checked on the client. We
1036                  * should allow any size writes here.
1037                  */
1038                 uio.uio_llimit = curproc->p_fsz_ctl;
1039                 rlimit = uio.uio_llimit - wa->wa_offset;
1040                 if (rlimit < (rlim64_t)uio.uio_resid)
1041                         uio.uio_resid = (uint_t)rlimit;
1042 
1043                 /*
1044                  * for now we assume no append mode
1045                  */
1046                 /*
1047                  * We're changing creds because VM may fault and we need
1048                  * the cred of the current thread to be used if quota
1049                  * checking is enabled.
1050                  */
1051                 savecred = curthread->t_cred;
1052                 curthread->t_cred = cr;
1053                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1054                 curthread->t_cred = savecred;
1055         } else {
1056                 iovcnt = 0;
1057                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1058                         iovcnt++;
1059                 if (iovcnt <= MAX_IOVECS) {
1060 #ifdef DEBUG
1061                         rfs_write_sync_hits++;
1062 #endif
1063                         iovp = iov;
1064                 } else {
1065 #ifdef DEBUG
1066                         rfs_write_sync_misses++;
1067 #endif
1068                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1069                 }
1070                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1071                 uio.uio_iov = iovp;
1072                 uio.uio_iovcnt = iovcnt;
1073                 uio.uio_segflg = UIO_SYSSPACE;
1074                 uio.uio_extflg = UIO_COPY_DEFAULT;
1075                 uio.uio_loffset = (offset_t)wa->wa_offset;


1134 
1135 struct rfs_async_write {
1136         struct nfswriteargs *wa;
1137         struct nfsattrstat *ns;
1138         struct svc_req *req;
1139         cred_t *cr;
1140         bool_t ro;
1141         kthread_t *thread;
1142         struct rfs_async_write *list;
1143 };
1144 
1145 struct rfs_async_write_list {
1146         fhandle_t *fhp;
1147         kcondvar_t cv;
1148         struct rfs_async_write *list;
1149         struct rfs_async_write_list *next;
1150 };
1151 
1152 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1153 static kmutex_t rfs_async_write_lock;
1154 static int rfs_write_async = 1; /* enables write clustering if == 1 */
1155 
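/*
 * Illustrative note (assumed administration, not part of this file):
 * because rfs_write_async is an ordinary module data symbol, write
 * clustering could be disabled by setting it to 0, e.g. from /etc/system,
 * assuming the symbol is resolvable in the nfssrv module:
 *
 *	set nfssrv:rfs_write_async = 0
 */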
1156 #define MAXCLIOVECS     42
1157 #define RFSWRITE_INITVAL (enum nfsstat) -1
1158 
1159 #ifdef DEBUG
1160 static int rfs_write_hits = 0;
1161 static int rfs_write_misses = 0;
1162 #endif
1163 
1164 /*
1165  * Write data to file.
1166  * Returns attributes of a file after writing some data to it.
1167  */
1168 void
1169 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1170     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1171 {
1172         int error;
1173         vnode_t *vp;
1174         rlim64_t rlimit;


1179         struct rfs_async_write *rp;
1180         struct rfs_async_write *nrp;
1181         struct rfs_async_write *trp;
1182         struct rfs_async_write *lrp;
1183         int data_written;
1184         int iovcnt;
1185         mblk_t *m;
1186         struct iovec *iovp;
1187         struct iovec *niovp;
1188         struct iovec iov[MAXCLIOVECS];
1189         int count;
1190         int rcount;
1191         uint_t off;
1192         uint_t len;
1193         struct rfs_async_write nrpsp;
1194         struct rfs_async_write_list nlpsp;
1195         ushort_t t_flag;
1196         cred_t *savecred;
1197         int in_crit = 0;
1198         caller_context_t ct;
1199 
1200         if (!rfs_write_async) {
1201                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1202                 return;
1203         }
1204 
1205         /*
 1206          * Initialize status to RFSWRITE_INITVAL instead of 0, since a value
 1207          * of 0 is considered OK.
1208          */
1209         ns->ns_status = RFSWRITE_INITVAL;
1210 
1211         nrp = &nrpsp;
1212         nrp->wa = wa;
1213         nrp->ns = ns;
1214         nrp->req = req;
1215         nrp->cr = cr;
1216         nrp->ro = ro;
1217         nrp->thread = curthread;
1218 
1219         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1220 
1221         /*
1222          * Look to see if there is already a cluster started
1223          * for this file.
1224          */
1225         mutex_enter(&rfs_async_write_lock);
1226         for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1227                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1228                     sizeof (fhandle_t)) == 0)
1229                         break;
1230         }
1231 
1232         /*
1233          * If lp is non-NULL, then there is already a cluster
1234          * started.  We need to place ourselves in the cluster
1235          * list in the right place as determined by starting
1236          * offset.  Conflicts with non-blocking mandatory locked
1237          * regions will be checked when the cluster is processed.
1238          */
1239         if (lp != NULL) {
1240                 rp = lp->list;
1241                 trp = NULL;
1242                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1243                         trp = rp;
1244                         rp = rp->list;
1245                 }
1246                 nrp->list = rp;
1247                 if (trp == NULL)
1248                         lp->list = nrp;
1249                 else
1250                         trp->list = nrp;
1251                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1252                         cv_wait(&lp->cv, &rfs_async_write_lock);
1253                 mutex_exit(&rfs_async_write_lock);
1254 
1255                 return;
1256         }
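	/*
	 * Worked example (illustrative): if later requests join an existing
	 * cluster with wa_offset values 8192, 0 and 4096 (in that arrival
	 * order), the insertion loop above leaves lp->list sorted 0, 4096,
	 * 8192, so the cluster owner processes the writes in ascending
	 * offset order.
	 */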
1257 
1258         /*
1259          * No cluster started yet, start one and add ourselves
1260          * to the list of clusters.
1261          */
1262         nrp->list = NULL;
1263 
1264         nlp = &nlpsp;
1265         nlp->fhp = &wa->wa_fhandle;
1266         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1267         nlp->list = nrp;
1268         nlp->next = NULL;
1269 
1270         if (rfs_async_write_head == NULL) {
1271                 rfs_async_write_head = nlp;
1272         } else {
1273                 lp = rfs_async_write_head;
1274                 while (lp->next != NULL)
1275                         lp = lp->next;
1276                 lp->next = nlp;
1277         }
1278         mutex_exit(&rfs_async_write_lock);
1279 
1280         /*
1281          * Convert the file handle common to all of the requests
1282          * in this cluster to a vnode.
1283          */
1284         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1285         if (vp == NULL) {
1286                 mutex_enter(&rfs_async_write_lock);
1287                 if (rfs_async_write_head == nlp)
1288                         rfs_async_write_head = nlp->next;
1289                 else {
1290                         lp = rfs_async_write_head;
1291                         while (lp->next != nlp)
1292                                 lp = lp->next;
1293                         lp->next = nlp->next;
1294                 }
1295                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1296                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1297                         rp->ns->ns_status = NFSERR_STALE;
1298                         rp->thread->t_flag |= t_flag;
1299                 }
1300                 cv_broadcast(&nlp->cv);
1301                 mutex_exit(&rfs_async_write_lock);
1302 
1303                 return;
1304         }
1305 
1306         /*
1307          * Can only write regular files.  Attempts to write any
1308          * other file types fail with EISDIR.
1309          */
1310         if (vp->v_type != VREG) {
1311                 VN_RELE(vp);
1312                 mutex_enter(&rfs_async_write_lock);
1313                 if (rfs_async_write_head == nlp)
1314                         rfs_async_write_head = nlp->next;
1315                 else {
1316                         lp = rfs_async_write_head;
1317                         while (lp->next != nlp)
1318                                 lp = lp->next;
1319                         lp->next = nlp->next;
1320                 }
1321                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1322                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1323                         rp->ns->ns_status = NFSERR_ISDIR;
1324                         rp->thread->t_flag |= t_flag;
1325                 }
1326                 cv_broadcast(&nlp->cv);
1327                 mutex_exit(&rfs_async_write_lock);
1328 
1329                 return;
1330         }
1331 
1332         /*
1333          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1334          * deadlock with ufs.
1335          */
1336         if (nbl_need_check(vp)) {
1337                 nbl_start_crit(vp, RW_READER);
1338                 in_crit = 1;
1339         }
1340 
1341         ct.cc_sysid = 0;
1342         ct.cc_pid = 0;
1343         ct.cc_caller_id = nfs2_srv_caller_id;
1344         ct.cc_flags = CC_DONTBLOCK;
1345 
1346         /*
1347          * Lock the file for writing.  This operation provides
1348          * the delay which allows clusters to grow.
1349          */
1350         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1351 
1352         /* check if a monitor detected a delegation conflict */
1353         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1354                 if (in_crit)
1355                         nbl_end_crit(vp);
1356                 VN_RELE(vp);
1357                 /* mark as wouldblock so response is dropped */
1358                 curthread->t_flag |= T_WOULDBLOCK;
1359                 mutex_enter(&rfs_async_write_lock);
1360                 if (rfs_async_write_head == nlp)
1361                         rfs_async_write_head = nlp->next;
1362                 else {
1363                         lp = rfs_async_write_head;
1364                         while (lp->next != nlp)
1365                                 lp = lp->next;
1366                         lp->next = nlp->next;
1367                 }
1368                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1369                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1370                                 rp->ns->ns_status = puterrno(error);
1371                                 rp->thread->t_flag |= T_WOULDBLOCK;
1372                         }
1373                 }
1374                 cv_broadcast(&nlp->cv);
1375                 mutex_exit(&rfs_async_write_lock);
1376 
1377                 return;
1378         }
1379 
1380         /*
1381          * Disconnect this cluster from the list of clusters.
1382          * The cluster that is being dealt with must be fixed
1383          * in size after this point, so there is no reason
1384          * to leave it on the list so that new requests can
1385          * find it.
1386          *
1387          * The algorithm is that the first write request will
1388          * create a cluster, convert the file handle to a
1389          * vnode pointer, and then lock the file for writing.
1390          * This request is not likely to be clustered with
1391          * any others.  However, the next request will create
1392          * a new cluster and be blocked in VOP_RWLOCK while
1393          * the first request is being processed.  This delay
1394          * will allow more requests to be clustered in this
1395          * second cluster.
1396          */
1397         mutex_enter(&rfs_async_write_lock);
1398         if (rfs_async_write_head == nlp)
1399                 rfs_async_write_head = nlp->next;
1400         else {
1401                 lp = rfs_async_write_head;
1402                 while (lp->next != nlp)
1403                         lp = lp->next;
1404                 lp->next = nlp->next;
1405         }
1406         mutex_exit(&rfs_async_write_lock);
1407 
1408         /*
1409          * Step through the list of requests in this cluster.
1410          * We need to check permissions to make sure that all
1411          * of the requests have sufficient permission to write
1412          * the file.  A cluster can be composed of requests
1413          * from different clients and different users on each
1414          * client.
1415          *
1416          * As a side effect, we also calculate the size of the
1417          * byte range that this cluster encompasses.
1418          */
1419         rp = nlp->list;
1420         off = rp->wa->wa_offset;
1421         len = (uint_t)0;
1422         do {
1423                 if (rdonly(rp->ro, vp)) {
1424                         rp->ns->ns_status = NFSERR_ROFS;
1425                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1426                         rp->thread->t_flag |= t_flag;


1631 
1632         /*
1633          * If any data was written at all, then we need to flush
1634          * the data and metadata to stable storage.
1635          */
1636         if (data_written) {
1637                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1638 
1639                 if (!error) {
1640                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1641                 }
1642         }
1643 
1644         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1645 
1646         if (in_crit)
1647                 nbl_end_crit(vp);
1648         VN_RELE(vp);
1649 
1650         t_flag = curthread->t_flag & T_WOULDBLOCK;
1651         mutex_enter(&rfs_async_write_lock);
1652         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1653                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1654                         rp->ns->ns_status = puterrno(error);
1655                         rp->thread->t_flag |= t_flag;
1656                 }
1657         }
1658         cv_broadcast(&nlp->cv);
1659         mutex_exit(&rfs_async_write_lock);
1660 
1661 }
1662 
1663 void *
1664 rfs_write_getfh(struct nfswriteargs *wa)
1665 {
1666         return (&wa->wa_fhandle);
1667 }
1668 
1669 /*
1670  * Create a file.
1671  * Creates a file with given attributes and returns those attributes
1672  * and an fhandle for the new file.
1673  */
1674 void
1675 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1676     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1677 {
1678         int error;
1679         int lookuperr;


1701         if (dvp == NULL) {
1702                 dr->dr_status = NFSERR_STALE;
1703                 return;
1704         }
1705 
1706         error = sattr_to_vattr(args->ca_sa, &va);
1707         if (error) {
1708                 dr->dr_status = puterrno(error);
1709                 return;
1710         }
1711 
1712         /*
1713          * Must specify the mode.
1714          */
1715         if (!(va.va_mask & AT_MODE)) {
1716                 VN_RELE(dvp);
1717                 dr->dr_status = NFSERR_INVAL;
1718                 return;
1719         }
1720 
1721         /*
1722          * This is a completely gross hack to make mknod
1723          * work over the wire until we can wack the protocol
1724          */
1725         if ((va.va_mode & IFMT) == IFCHR) {
1726                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1727                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1728                 else {
1729                         va.va_type = VCHR;
1730                         /*
1731                          * uncompress the received dev_t
1732                          * if the top half is zero indicating a request
1733                          * from an `older style' OS.
1734                          */
1735                         if ((va.va_size & 0xffff0000) == 0)
1736                                 va.va_rdev = nfsv2_expdev(va.va_size);
1737                         else
1738                                 va.va_rdev = (dev_t)va.va_size;
1739                 }
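		/*
		 * Illustrative example (the exact packing is an assumption):
		 * an `older style' client sends a 16-bit dev_t with the major
		 * number in the high byte and the minor in the low byte, so a
		 * received value of 0x0102 would expand to major 1, minor 2;
		 * nfsv2_expdev() is assumed to perform that expansion.
		 */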
1740                 va.va_mask &= ~AT_SIZE;


2040         vnode_t *tovp;
2041         struct exportinfo *to_exi;
2042         fhandle_t *fh;
2043         vnode_t *srcvp;
2044         vnode_t *targvp;
2045         int in_crit = 0;
2046 
2047         fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2048         if (fromvp == NULL) {
2049                 *status = NFSERR_STALE;
2050                 return;
2051         }
2052 
2053         fh = args->rna_to.da_fhandle;
2054         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2055         if (to_exi == NULL) {
2056                 VN_RELE(fromvp);
2057                 *status = NFSERR_ACCES;
2058                 return;
2059         }
2060         exi_rele(to_exi);
2061 
2062         if (to_exi != exi) {
2063                 VN_RELE(fromvp);
2064                 *status = NFSERR_XDEV;
2065                 return;
2066         }
2067 
2068         tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2069         if (tovp == NULL) {
2070                 VN_RELE(fromvp);
2071                 *status = NFSERR_STALE;
2072                 return;
2073         }
2074 
2075         if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2076                 VN_RELE(tovp);
2077                 VN_RELE(fromvp);
2078                 *status = NFSERR_NOTDIR;
2079                 return;
2080         }
2081 
2082         /*
2083          * Disallow NULL paths
2084          */
2085         if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2086             args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2087                 VN_RELE(tovp);
2088                 VN_RELE(fromvp);
2089                 *status = NFSERR_ACCES;
2090                 return;
2091         }
2092 
2093         if (rdonly(ro, tovp)) {
2094                 VN_RELE(tovp);
2095                 VN_RELE(fromvp);
2096                 *status = NFSERR_ROFS;
2097                 return;
2098         }
2099 
2100         /*
2101          * Check for a conflict with a non-blocking mandatory share reservation.
2102          */
2103         error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2104             NULL, cr, NULL, NULL, NULL);
2105         if (error != 0) {
2106                 VN_RELE(tovp);
2107                 VN_RELE(fromvp);
2108                 *status = puterrno(error);
2109                 return;
2110         }
2111 
2112         /* Check for delegations on the source file */
2113 
2114         if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2115                 VN_RELE(tovp);
2116                 VN_RELE(fromvp);
2117                 VN_RELE(srcvp);
2118                 curthread->t_flag |= T_WOULDBLOCK;
2119                 return;
2120         }
2121 
2122         /* Check for delegation on the file being renamed over, if it exists */
2123 
2124         if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2125             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2126             NULL, NULL, NULL) == 0) {
2127 
2128                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2129                         VN_RELE(tovp);
2130                         VN_RELE(fromvp);
2131                         VN_RELE(srcvp);
2132                         VN_RELE(targvp);
2133                         curthread->t_flag |= T_WOULDBLOCK;
2134                         return;
2135                 }
2136                 VN_RELE(targvp);
2137         }
2138 
2139 
2140         if (nbl_need_check(srcvp)) {
2141                 nbl_start_crit(srcvp, RW_READER);
2142                 in_crit = 1;
2143                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2144                         error = EACCES;


2186 {
2187         int error;
2188         vnode_t *fromvp;
2189         vnode_t *tovp;
2190         struct exportinfo *to_exi;
2191         fhandle_t *fh;
2192 
2193         fromvp = nfs_fhtovp(args->la_from, exi);
2194         if (fromvp == NULL) {
2195                 *status = NFSERR_STALE;
2196                 return;
2197         }
2198 
2199         fh = args->la_to.da_fhandle;
2200         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2201         if (to_exi == NULL) {
2202                 VN_RELE(fromvp);
2203                 *status = NFSERR_ACCES;
2204                 return;
2205         }
2206         exi_rele(to_exi);
2207 
2208         if (to_exi != exi) {
2209                 VN_RELE(fromvp);
2210                 *status = NFSERR_XDEV;
2211                 return;
2212         }
2213 
2214         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2215         if (tovp == NULL) {
2216                 VN_RELE(fromvp);
2217                 *status = NFSERR_STALE;
2218                 return;
2219         }
2220 
2221         if (tovp->v_type != VDIR) {
2222                 VN_RELE(tovp);
2223                 VN_RELE(fromvp);
2224                 *status = NFSERR_NOTDIR;
2225                 return;
2226         }
2227         /*
2228          * Disallow NULL paths
2229          */
2230         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2231                 VN_RELE(tovp);
2232                 VN_RELE(fromvp);
2233                 *status = NFSERR_ACCES;
2234                 return;
2235         }
2236 
2237         if (rdonly(ro, tovp)) {
2238                 VN_RELE(tovp);
2239                 VN_RELE(fromvp);
2240                 *status = NFSERR_ROFS;
2241                 return;
2242         }
2243 
2244         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2245 
2246         /*
2247          * Force modified data and metadata out to stable storage.
2248          */
2249         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2250         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2251 
2252         VN_RELE(tovp);
2253         VN_RELE(fromvp);
2254 
2255         *status = puterrno(error);
2256 
2257 }
2258 void *
2259 rfs_link_getfh(struct nfslinkargs *args)
2260 {
2261         return (args->la_from);
2262 }
2263 
2264 /*
 2265  * Symbolically link to a file.
2266  * Create a file (to) with the given attributes which is a symbolic link
2267  * to the given path name (to).
2268  */
2269 void
2270 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2271     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2272 {
2273         int error;
2274         struct vattr va;
2275         vnode_t *vp;
2276         vnode_t *svp;
2277         int lerror;
2278         struct sockaddr *ca;
2279         char *name = NULL;
2280 
2281         /*
2282          * Disallow NULL paths
2283          */
2284         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2285                 *status = NFSERR_ACCES;
2286                 return;


2294 
2295         if (rdonly(ro, vp)) {
2296                 VN_RELE(vp);
2297                 *status = NFSERR_ROFS;
2298                 return;
2299         }
2300 
2301         error = sattr_to_vattr(args->sla_sa, &va);
2302         if (error) {
2303                 VN_RELE(vp);
2304                 *status = puterrno(error);
2305                 return;
2306         }
2307 
2308         if (!(va.va_mask & AT_MODE)) {
2309                 VN_RELE(vp);
2310                 *status = NFSERR_INVAL;
2311                 return;
2312         }
2313 
2314         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2315         name = nfscmd_convname(ca, exi, args->sla_tnm,
2316             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2317 
2318         if (name == NULL) {
2319                 *status = NFSERR_ACCES;
2320                 return;
2321         }
2322 
2323         va.va_type = VLNK;
2324         va.va_mask |= AT_TYPE;
2325 
2326         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2327 
2328         /*
2329          * Force new data and metadata out to stable storage.
2330          */
2331         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2332             NULL, cr, NULL, NULL, NULL);
2333 


2386 
2387         if (rdonly(ro, vp)) {
2388                 VN_RELE(vp);
2389                 dr->dr_status = NFSERR_ROFS;
2390                 return;
2391         }
2392 
2393         error = sattr_to_vattr(args->ca_sa, &va);
2394         if (error) {
2395                 VN_RELE(vp);
2396                 dr->dr_status = puterrno(error);
2397                 return;
2398         }
2399 
2400         if (!(va.va_mask & AT_MODE)) {
2401                 VN_RELE(vp);
2402                 dr->dr_status = NFSERR_INVAL;
2403                 return;
2404         }
2405 
2406         va.va_type = VDIR;
2407         va.va_mask |= AT_TYPE;
2408 
2409         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2410 
2411         if (!error) {
2412                 /*
 2413                  * Attributes of the newly created directory should
2414                  * be returned to the client.
2415                  */
2416                 va.va_mask = AT_ALL; /* We want everything */
2417                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2418 
2419                 /* check for overflows */
2420                 if (!error) {
2421                         acl_perm(vp, exi, &va, cr);
2422                         error = vattr_to_nattr(&va, &dr->dr_attr);
2423                         if (!error) {
2424                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2425                         }


2471         if (vp == NULL) {
2472                 *status = NFSERR_STALE;
2473                 return;
2474         }
2475 
2476         if (rdonly(ro, vp)) {
2477                 VN_RELE(vp);
2478                 *status = NFSERR_ROFS;
2479                 return;
2480         }
2481 
2482         /*
2483          * VOP_RMDIR takes a third argument (the current
2484          * directory of the process).  That's because someone
2485          * wants to return EINVAL if one tries to remove ".".
2486          * Of course, NFS servers have no idea what their
2487          * clients' current directories are.  We fake it by
2488          * supplying a vnode known to exist and illegal to
2489          * remove.
2490          */
2491         error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2492 
2493         /*
2494          * Force modified data and metadata out to stable storage.
2495          */
2496         (void) VOP_FSYNC(vp, 0, cr, NULL);
2497 
2498         VN_RELE(vp);
2499 
2500         /*
2501          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2502          * if the directory is not empty.  A System V NFS server
2503          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2504          * over the wire.
2505          */
2506         if (error == EEXIST)
2507                 *status = NFSERR_NOTEMPTY;
2508         else
2509                 *status = puterrno(error);
2510 
2511 }
2512 void *
2513 rfs_rmdir_getfh(struct nfsdiropargs *da)
2514 {
2515         return (da->da_fhandle);
2516 }
2517 
2518 /* ARGSUSED */
2519 void
2520 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2521     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2522 {
2523         int error;
2524         int iseof;
2525         struct iovec iov;
2526         struct uio uio;
2527         vnode_t *vp;
2528         char *ndata = NULL;
2529         struct sockaddr *ca;
2530         size_t nents;
2531         int ret;
2532 
2533         vp = nfs_fhtovp(&rda->rda_fh, exi);
2534         if (vp == NULL) {
2535                 rd->rd_entries = NULL;
2536                 rd->rd_status = NFSERR_STALE;
2537                 return;
2538         }
2539 
2540         if (vp->v_type != VDIR) {
2541                 VN_RELE(vp);
2542                 rd->rd_entries = NULL;
2543                 rd->rd_status = NFSERR_NOTDIR;
2544                 return;
2545         }
2546 
2547         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2548 
2549         error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2550 
2551         if (error) {
2552                 rd->rd_entries = NULL;
2553                 goto bad;
2554         }
2555 
2556         if (rda->rda_count == 0) {
2557                 rd->rd_entries = NULL;
2558                 rd->rd_size = 0;
2559                 rd->rd_eof = FALSE;
2560                 goto bad;
2561         }
2562 
2563         rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2564 
2565         /*
2566          * Allocate data for entries.  This will be freed by rfs_rddirfree.
2567          */
2568         rd->rd_bufsize = (uint_t)rda->rda_count;
2569         rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2570 
2571         /*
2572          * Set up io vector to read directory data
2573          */
2574         iov.iov_base = (caddr_t)rd->rd_entries;
2575         iov.iov_len = rda->rda_count;
2576         uio.uio_iov = &iov;
2577         uio.uio_iovcnt = 1;
2578         uio.uio_segflg = UIO_SYSSPACE;
2579         uio.uio_extflg = UIO_COPY_CACHED;
2580         uio.uio_loffset = (offset_t)rda->rda_offset;
2581         uio.uio_resid = rda->rda_count;
2582 
2583         /*
2584          * read directory
2585          */
2586         error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2587 
2588         /*
2589          * Clean up
2590          */
2591         if (!error) {
2592                 /*
2593                  * set size and eof
2594                  */
2595                 if (uio.uio_resid == rda->rda_count) {
2596                         rd->rd_size = 0;
2597                         rd->rd_eof = TRUE;
2598                 } else {
2599                         rd->rd_size = (uint32_t)(rda->rda_count -
2600                             uio.uio_resid);
2601                         rd->rd_eof = iseof ? TRUE : FALSE;
2602                 }
2603         }
2604 
2605         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2606         nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2607         ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2608             rda->rda_count, &ndata);
2609 
2610         if (ret != 0) {
2611                 size_t dropbytes;
2612                 /*
2613                  * We had to drop one or more entries in order to fit
2614                  * during the character conversion.  We need to patch
2615                  * up the size and eof info.
2616                  */
2617                 if (rd->rd_eof)
2618                         rd->rd_eof = FALSE;
2619                 dropbytes = nfscmd_dropped_entrysize(
2620                     (struct dirent64 *)rd->rd_entries, nents, ret);
2621                 rd->rd_size -= dropbytes;
2622         }
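	/*
	 * Illustrative note (the client behaviour is assumed here): with
	 * rd_eof cleared and rd_size reduced above, the client simply issues
	 * another READDIR starting at the cookie of the last entry it did
	 * receive, and picks up the dropped entries in that later reply.
	 */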
2623         if (ndata == NULL) {
2624                 ndata = (char *)rd->rd_entries;
2625         } else if (ndata != (char *)rd->rd_entries) {
2626                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2627                 rd->rd_entries = (void *)ndata;
2628                 rd->rd_bufsize = rda->rda_count;
2629         }
2630 
2631 bad:
2632         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2633 
2634 #if 0 /* notyet */
2635         /*
2636          * Don't do this.  It causes local disk writes when just
2637          * reading the file and the overhead is deemed larger
2638          * than the benefit.
2639          */
2640         /*
2641          * Force modified metadata out to stable storage.
2642          */
2643         (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2644 #endif
2645 
2646         VN_RELE(vp);
2647 
2648         rd->rd_status = puterrno(error);
2649 
2650 }
2651 void *
2652 rfs_readdir_getfh(struct nfsrddirargs *rda)
2653 {
2654         return (&rda->rda_fh);
2655 }
2656 void
2657 rfs_rddirfree(struct nfsrddirres *rd)
2658 {
2659         if (rd->rd_entries != NULL)
2660                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2661 }
2662 
2663 /* ARGSUSED */
2664 void
2665 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2666     struct svc_req *req, cred_t *cr, bool_t ro)
2667 {
2668         int error;
2669         struct statvfs64 sb;
2670         vnode_t *vp;
2671 
2672         vp = nfs_fhtovp(fh, exi);
2673         if (vp == NULL) {
2674                 fs->fs_status = NFSERR_STALE;
2675                 return;
2676         }
2677 
2678         error = VFS_STATVFS(vp->v_vfsp, &sb);
2679 
2680         if (!error) {


2746                 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2747         }
2748         if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2749             sa->sa_mtime.tv_usec != (int32_t)-1) {
2750 #ifndef _LP64
2751                 /* return error if time overflow */
2752                 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2753                         return (EOVERFLOW);
2754 #endif
2755                 vap->va_mask |= AT_MTIME;
2756                 /*
2757                  * nfs protocol defines times as unsigned so don't extend sign,
2758                  * unless sysadmin set nfs_allow_preepoch_time.
2759                  */
2760                 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2761                 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2762         }
2763         return (0);
2764 }
2765 
2766 static enum nfsftype vt_to_nf[] = {
2767         0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2768 };
2769 
2770 /*
2771  * check the following fields for overflow: nodeid, size, and time.
2772  * There could be a problem when converting 64-bit LP64 fields
2773  * into 32-bit ones.  Return an error if there is an overflow.
2774  */
2775 int
2776 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2777 {
2778         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2779         na->na_type = vt_to_nf[vap->va_type];
2780 
2781         if (vap->va_mode == (unsigned short) -1)
2782                 na->na_mode = (uint32_t)-1;
2783         else
2784                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2785 
2786         if (vap->va_uid == (unsigned short)(-1))


2965                                                     aclentp->a_perm;
2966                                                 break;
2967                                         default:
2968                                                 break;
2969                                         }
2970                                 }
2971                         }
2972                         /* copy to va */
2973                         va->va_mode &= ~077;
2974                         va->va_mode |= grp_perm | other_perm;
2975                 }
2976                 if (vsa.vsa_aclcnt)
2977                         kmem_free(vsa.vsa_aclentp,
2978                             vsa.vsa_aclcnt * sizeof (aclent_t));
2979         }
2980 }
2981 
2982 void
2983 rfs_srvrinit(void)
2984 {
2985         mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2986         nfs2_srv_caller_id = fs_new_caller_id();
2987 }
2988 
2989 void
2990 rfs_srvrfini(void)
2991 {
2992         mutex_destroy(&rfs_async_write_lock);
2993 }
2994 
2995 static int
2996 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2997 {
2998         struct clist    *wcl;
2999         int             wlist_len;
3000         uint32_t        count = rr->rr_count;
3001 
3002         wcl = ra->ra_wlist;
3003 
3004         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3005                 return (FALSE);
3006         }
3007 
3008         wcl = ra->ra_wlist;
3009         rr->rr_ok.rrok_wlist_len = wlist_len;
3010         rr->rr_ok.rrok_wlist = wcl;
3011 
3012         return (TRUE);
3013 }
   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  28  *      All rights reserved.
  29  */
  30 
  31 /*
  32  * Copyright 2018 Nexenta Systems, Inc.
  33  * Copyright (c) 2016 by Delphix. All rights reserved.
  34  */
  35 
  36 #include <sys/param.h>
  37 #include <sys/types.h>
  38 #include <sys/systm.h>
  39 #include <sys/cred.h>
  40 #include <sys/buf.h>
  41 #include <sys/vfs.h>
  42 #include <sys/vnode.h>
  43 #include <sys/uio.h>
  44 #include <sys/stat.h>
  45 #include <sys/errno.h>
  46 #include <sys/sysmacros.h>
  47 #include <sys/statvfs.h>
  48 #include <sys/kmem.h>
  49 #include <sys/kstat.h>
  50 #include <sys/dirent.h>
  51 #include <sys/cmn_err.h>
  52 #include <sys/debug.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/mode.h>
  55 #include <sys/acl.h>
  56 #include <sys/nbmlock.h>
  57 #include <sys/policy.h>
  58 #include <sys/sdt.h>
  59 
  60 #include <rpc/types.h>
  61 #include <rpc/auth.h>
  62 #include <rpc/svc.h>
  63 
  64 #include <nfs/nfs.h>
  65 #include <nfs/export.h>
  66 #include <nfs/nfs_cmd.h>
  67 
  68 #include <vm/hat.h>
  69 #include <vm/as.h>
  70 #include <vm/seg.h>
  71 #include <vm/seg_map.h>
  72 #include <vm/seg_kmem.h>
  73 
  74 #include <sys/strsubr.h>
  75 
  76 struct rfs_async_write_list;
  77 
  78 /*
  79  * Zone globals of NFSv2 server
  80  */
  81 typedef struct nfs_srv {
  82         kmutex_t                        async_write_lock;
  83         struct rfs_async_write_list     *async_write_head;
  84 
  85         /*
  86          * enables write clustering if == 1
  87          */
  88         int             write_async;
  89 } nfs_srv_t;
  90 
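/*
 * Illustrative sketch (not necessarily the exact code elided from this
 * excerpt): the per-zone nfs_srv_t would typically be managed with the
 * zone-specific-data (ZSD) framework, e.g.
 *
 *	zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
 *	nfs_srv_t *nsrv = zone_getspecific(rfs_zone_key, curzone);
 *
 * so each zone gets its own async-write cluster list, lock and
 * write_async tunable.
 */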
  91 /*
  92  * These are the interface routines for the server side of the
  93  * Network File System.  See the NFS version 2 protocol specification
  94  * for a description of this interface.
  95  */
  96 
  97 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  98 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
  99                         cred_t *);
 100 static void     *rfs_zone_init(zoneid_t zoneid);
 101 static void     rfs_zone_fini(zoneid_t zoneid, void *data);
 102 
 103 
 104 /*
 105  * Some "over the wire" UNIX file types.  These are encoded
 106  * into the mode.  This needs to be fixed in the next rev.
 107  */
 108 #define IFMT            0170000         /* type of file */
 109 #define IFCHR           0020000         /* character special */
 110 #define IFBLK           0060000         /* block special */
 111 #define IFSOCK          0140000         /* socket */
 112 
 113 u_longlong_t nfs2_srv_caller_id;
 114 static zone_key_t rfs_zone_key;
 115 
 116 /*
 117  * Get file attributes.
 118  * Returns the current attributes of the file with the given fhandle.
 119  */
 120 /* ARGSUSED */
 121 void
 122 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 123     struct svc_req *req, cred_t *cr, bool_t ro)
 124 {
 125         int error;
 126         vnode_t *vp;
 127         struct vattr va;
 128 
 129         vp = nfs_fhtovp(fhp, exi);
 130         if (vp == NULL) {
 131                 ns->ns_status = NFSERR_STALE;
 132                 return;
 133         }
 134 


 335                 }
 336         }
 337 
 338         ct.cc_flags = 0;
 339 
 340         /*
 341          * Force modified metadata out to stable storage.
 342          */
 343         (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
 344 
 345         VN_RELE(vp);
 346 
 347         ns->ns_status = puterrno(error);
 348 }
 349 void *
 350 rfs_setattr_getfh(struct nfssaargs *args)
 351 {
 352         return (&args->saa_fh);
 353 }
 354 
  355 /* Change and release @exip and @vpp only on success */
 356 int
 357 rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
 358 {
 359         struct exportinfo *exi;
 360         vnode_t *vp = *vpp;
 361         fid_t fid;
 362         int error;
 363 
 364         VN_HOLD(vp);
 365 
 366         if ((error = traverse(&vp)) != 0) {
 367                 VN_RELE(vp);
 368                 return (error);
 369         }
 370 
 371         bzero(&fid, sizeof (fid));
 372         fid.fid_len = MAXFIDSZ;
 373         error = VOP_FID(vp, &fid, NULL);
 374         if (error) {
 375                 VN_RELE(vp);
 376                 return (error);
 377         }
 378 
 379         exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
 380         if (exi == NULL ||
 381             (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
 382                 /*
 383                  * Not an error: the subdirectory is simply not exported,
 384                  * or "nohide" is not set.
 385                  */
 386                 if (exi != NULL)
 387                         exi_rele(&exi);
 388                 VN_RELE(vp);
 389         } else {
 390                 /* go to submount */
 391                 exi_rele(exip);
 392                 *exip = exi;
 393 
 394                 VN_RELE(*vpp);
 395                 *vpp = vp;
 396         }
 397 
 398         return (0);
 399 }
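     /*
      * Typical use (see rfs_lookup() below): when a lookup lands on a
      * mount point, the caller passes its vp/exi pair here; on failure
      * the caller still owns, and must release, its original vp.
      */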
 400 
 401 /*
 402  * Given a mounted "dvp" and "exi", climb to the covering (upper)
 403  * mount point, updating dvp/exi accordingly.
 404  * Returns 0 on success.
 405  */
 406 int
 407 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
 408 {
 409         struct exportinfo *exi;
 410         vnode_t *dvp = *dvpp;
 411 
 412         ASSERT(dvp->v_flag & VROOT);
 413 
 414         VN_HOLD(dvp);
 415         dvp = untraverse(dvp);
 416         exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
 417         if (exi == NULL) {
 418                 VN_RELE(dvp);
 419                 return (-1);
 420         }
 421 
 422         exi_rele(exip);
 423         *exip = exi;
 424         VN_RELE(*dvpp);
 425         *dvpp = dvp;
 426 
 427         return (0);
 428 }
 429 /*
 430  * Directory lookup.
 431  * Returns an fhandle and file attributes for file name in a directory.
 432  */
 433 /* ARGSUSED */
 434 void
 435 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 436     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 437 {
 438         int error;
 439         vnode_t *dvp;
 440         vnode_t *vp;
 441         struct vattr va;
 442         fhandle_t *fhp = da->da_fhandle;
 443         struct sec_ol sec = {0, 0};
 444         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 445         char *name;
 446         struct sockaddr *ca;
 447 
 448         /*
 449          * Trusted Extension doesn't support NFSv2. MOUNT


 451          * access via WebNFS here.
 452          */
 453         if (is_system_labeled() && req->rq_vers == 2) {
 454                 dr->dr_status = NFSERR_ACCES;
 455                 return;
 456         }
 457 
 458         /*
 459          * Disallow NULL paths
 460          */
 461         if (da->da_name == NULL || *da->da_name == '\0') {
 462                 dr->dr_status = NFSERR_ACCES;
 463                 return;
 464         }
 465 
 466         /*
 467          * Allow lookups from the root - the default
 468          * location of the public filehandle.
 469          */
 470         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 471                 dvp = ZONE_ROOTVP();
 472                 VN_HOLD(dvp);
 473         } else {
 474                 dvp = nfs_fhtovp(fhp, exi);
 475                 if (dvp == NULL) {
 476                         dr->dr_status = NFSERR_STALE;
 477                         return;
 478                 }
 479         }
 480 
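             /*
              * Take our own hold on exi: the code below may release or swap
              * it for the submount's or public-filehandle exportinfo, and
              * whichever reference we end up with is dropped at "out:".
              */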
 481         exi_hold(exi);
 482 
 483         /*
 484          * Do not allow lookups beyond the root.  If the filehandle
 485          * matches the filehandle of the exi, then ".." refers beyond
 486          * the root of the exported filesystem.
 487          */
 488         if (strcmp(da->da_name, "..") == 0 &&
 489             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 490                 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
 491                     (dvp->v_flag & VROOT)) {
 492                         /*
 493                          * special case for ".." on a 'nohide'-exported root
 494                          */
 495                         if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
 496                                 error = NFSERR_ACCES;
 497                                 goto out;
 498                         }
 499                 } else  {
 500                         error = NFSERR_NOENT;
 501                         goto out;
 502                 }
 503         }
 504 
 505         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 506         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 507             MAXPATHLEN);
 508 
 509         if (name == NULL) {
 510                 error = NFSERR_ACCES;
 511                 goto out;
 512         }
 513 
 514         /*
 515          * If the public filehandle is used then allow
 516          * a multi-component lookup, i.e. evaluate
 517          * a pathname and follow symbolic links if
 518          * necessary.
 519          *
 520          * This may result in a vnode in another filesystem
 521          * which is OK as long as the filesystem is exported.
 522          */
 523         if (PUBLIC_FH2(fhp)) {
 524                 publicfh_flag = TRUE;
 525 
 526                 exi_rele(&exi);
 527 
 528                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 529                     &sec);
 530         } else {
 531                 /*
 532                  * Do a normal single component lookup.
 533                  */
 534                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 535                     NULL, NULL, NULL);
 536         }
 537 
 538         if (name != da->da_name)
 539                 kmem_free(name, MAXPATHLEN);
 540 
 541         if (error == 0 && vn_ismntpt(vp)) {
 542                 error = rfs_cross_mnt(&vp, &exi);
 543                 if (error)
 544                         VN_RELE(vp);
 545         }
 546 
 547         if (!error) {
 548                 va.va_mask = AT_ALL;    /* we want everything */
 549 
 550                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 551 
 552                 /* check for overflows */
 553                 if (!error) {
 554                         acl_perm(vp, exi, &va, cr);
 555                         error = vattr_to_nattr(&va, &dr->dr_attr);
 556                         if (!error) {
 557                                 if (sec.sec_flags & SEC_QUERY)
 558                                         error = makefh_ol(&dr->dr_fhandle, exi,
 559                                             sec.sec_index);
 560                                 else {
 561                                         error = makefh(&dr->dr_fhandle, vp,
 562                                             exi);
 563                                         if (!error && publicfh_flag &&
 564                                             !chk_clnt_sec(exi, req))
 565                                                 auth_weak = TRUE;
 566                                 }
 567                         }
 568                 }
 569                 VN_RELE(vp);
 570         }
 571 
 572 out:
 573         VN_RELE(dvp);
 574 
 575         if (exi != NULL)
 576                 exi_rele(&exi);






 577 
 578         /*
 579          * If it's public fh, no 0x81, and client's flavor is
 580          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 581          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 582          */
 583         if (auth_weak)
 584                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 585         else
 586                 dr->dr_status = puterrno(error);
 587 }
 588 void *
 589 rfs_lookup_getfh(struct nfsdiropargs *da)
 590 {
 591         return (da->da_fhandle);
 592 }
 593 
 594 /*
 595  * Read symbolic link.
 596  * Returns the string in the symbolic link at the given fhandle.


 780          * Enter the critical region before calling VOP_RWLOCK
 781          * to avoid a deadlock with write requests.
 782          */
 783         if (nbl_need_check(vp)) {
 784                 nbl_start_crit(vp, RW_READER);
 785                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 786                     0, NULL)) {
 787                         nbl_end_crit(vp);
 788                         VN_RELE(vp);
 789                         rr->rr_data = NULL;
 790                         rr->rr_status = NFSERR_ACCES;
 791                         return;
 792                 }
 793                 in_crit = 1;
 794         }
 795 
 796         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 797 
 798         /* check if a monitor detected a delegation conflict */
 799         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 800                 if (in_crit)
 801                         nbl_end_crit(vp);
 802                 VN_RELE(vp);
 803                 /* mark as wouldblock so response is dropped */
 804                 curthread->t_flag |= T_WOULDBLOCK;
 805 
 806                 rr->rr_data = NULL;
 807                 return;
 808         }
 809 
 810         va.va_mask = AT_ALL;
 811 
 812         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 813 
 814         if (error) {
 815                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 816                 if (in_crit)
 817                         nbl_end_crit(vp);
 818 
 819                 VN_RELE(vp);
 820                 rr->rr_data = NULL;
 821                 rr->rr_status = puterrno(error);


1107         }
1108 
1109         /*
1110          * We have to enter the critical region before calling VOP_RWLOCK
1111          * to avoid a deadlock with ufs.
1112          */
1113         if (nbl_need_check(vp)) {
1114                 nbl_start_crit(vp, RW_READER);
1115                 in_crit = 1;
1116                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1117                     wa->wa_count, 0, NULL)) {
1118                         error = EACCES;
1119                         goto out;
1120                 }
1121         }
1122 
1123         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1124 
1125         /* check if a monitor detected a delegation conflict */
1126         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1127                 goto out;



1128         }
1129 
1130         if (wa->wa_data || wa->wa_rlist) {
1131                 /* Do the RDMA thing if necessary */
1132                 if (wa->wa_rlist) {
1133                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1134                         iov[0].iov_len = wa->wa_count;
1135                 } else  {
1136                         iov[0].iov_base = wa->wa_data;
1137                         iov[0].iov_len = wa->wa_count;
1138                 }
1139                 uio.uio_iov = iov;
1140                 uio.uio_iovcnt = 1;
1141                 uio.uio_segflg = UIO_SYSSPACE;
1142                 uio.uio_extflg = UIO_COPY_DEFAULT;
1143                 uio.uio_loffset = (offset_t)wa->wa_offset;
1144                 uio.uio_resid = wa->wa_count;
1145                 /*
1146                  * The limit is checked on the client.  We should
1147                  * allow writes of any size here.
1148                  */
1149                 uio.uio_llimit = curproc->p_fsz_ctl;
1150                 rlimit = uio.uio_llimit - wa->wa_offset;
1151                 if (rlimit < (rlim64_t)uio.uio_resid)
1152                         uio.uio_resid = (uint_t)rlimit;
1153 
1154                 /*
1155                  * for now we assume no append mode
1156                  */
1157                 /*
1158                  * We're changing creds because VM may fault and we need
1159                  * the cred of the current thread to be used if quota
1160                  * checking is enabled.
1161                  */
1162                 savecred = curthread->t_cred;
1163                 curthread->t_cred = cr;
1164                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1165                 curthread->t_cred = savecred;
1166         } else {
1167 
1168                 iovcnt = 0;
1169                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1170                         iovcnt++;
1171                 if (iovcnt <= MAX_IOVECS) {
1172 #ifdef DEBUG
1173                         rfs_write_sync_hits++;
1174 #endif
1175                         iovp = iov;
1176                 } else {
1177 #ifdef DEBUG
1178                         rfs_write_sync_misses++;
1179 #endif
1180                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1181                 }
1182                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1183                 uio.uio_iov = iovp;
1184                 uio.uio_iovcnt = iovcnt;
1185                 uio.uio_segflg = UIO_SYSSPACE;
1186                 uio.uio_extflg = UIO_COPY_DEFAULT;
1187                 uio.uio_loffset = (offset_t)wa->wa_offset;


1246 
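     /*
      * Write clustering: each pending NFSv2 WRITE is recorded as an
      * rfs_async_write, and requests against the same file handle are
      * queued on a single rfs_async_write_list ("cluster") so that they
      * can be handled together once the write lock is obtained.
      */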
1247 struct rfs_async_write {
1248         struct nfswriteargs *wa;
1249         struct nfsattrstat *ns;
1250         struct svc_req *req;
1251         cred_t *cr;
1252         bool_t ro;
1253         kthread_t *thread;
1254         struct rfs_async_write *list;
1255 };
1256 
1257 struct rfs_async_write_list {
1258         fhandle_t *fhp;
1259         kcondvar_t cv;
1260         struct rfs_async_write *list;
1261         struct rfs_async_write_list *next;
1262 };
1263 
1264 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1265 static kmutex_t rfs_async_write_lock;
1266 volatile int rfs_write_async = 1;       /* enables write clustering if == 1 */
1267 
1268 #define MAXCLIOVECS     42
1269 #define RFSWRITE_INITVAL (enum nfsstat) -1
1270 
1271 #ifdef DEBUG
1272 static int rfs_write_hits = 0;
1273 static int rfs_write_misses = 0;
1274 #endif
1275 
1276 /*
1277  * Write data to file.
1278  * Returns attributes of a file after writing some data to it.
1279  */
1280 void
1281 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1282     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1283 {
1284         int error;
1285         vnode_t *vp;
1286         rlim64_t rlimit;


1291         struct rfs_async_write *rp;
1292         struct rfs_async_write *nrp;
1293         struct rfs_async_write *trp;
1294         struct rfs_async_write *lrp;
1295         int data_written;
1296         int iovcnt;
1297         mblk_t *m;
1298         struct iovec *iovp;
1299         struct iovec *niovp;
1300         struct iovec iov[MAXCLIOVECS];
1301         int count;
1302         int rcount;
1303         uint_t off;
1304         uint_t len;
1305         struct rfs_async_write nrpsp;
1306         struct rfs_async_write_list nlpsp;
1307         ushort_t t_flag;
1308         cred_t *savecred;
1309         int in_crit = 0;
1310         caller_context_t ct;
1311         nfs_srv_t *nsrv;
1312 
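             /* Look up this zone's NFSv2 server state (see rfs_zone_init()). */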
1313         nsrv = zone_getspecific(rfs_zone_key, curzone);
1314         if (!nsrv->write_async) {
1315                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1316                 return;
1317         }
1318 
1319         /*
1320          * Initialize the status to RFSWRITE_INITVAL instead of 0, since a
1321          * value of 0 is considered OK (success).
1322          */
1323         ns->ns_status = RFSWRITE_INITVAL;
1324 
1325         nrp = &nrpsp;
1326         nrp->wa = wa;
1327         nrp->ns = ns;
1328         nrp->req = req;
1329         nrp->cr = cr;
1330         nrp->ro = ro;
1331         nrp->thread = curthread;
1332 
1333         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1334 
1335         /*
1336          * Look to see if there is already a cluster started
1337          * for this file.
1338          */
1339         mutex_enter(&nsrv->async_write_lock);
1340         for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1341                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1342                     sizeof (fhandle_t)) == 0)
1343                         break;
1344         }
1345 
1346         /*
1347          * If lp is non-NULL, then there is already a cluster
1348          * started.  We need to place ourselves in the cluster
1349          * list in the right place as determined by starting
1350          * offset.  Conflicts with non-blocking mandatory locked
1351          * regions will be checked when the cluster is processed.
1352          */
1353         if (lp != NULL) {
1354                 rp = lp->list;
1355                 trp = NULL;
1356                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1357                         trp = rp;
1358                         rp = rp->list;
1359                 }
1360                 nrp->list = rp;
1361                 if (trp == NULL)
1362                         lp->list = nrp;
1363                 else
1364                         trp->list = nrp;
1365                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1366                         cv_wait(&lp->cv, &nsrv->async_write_lock);
1367                 mutex_exit(&nsrv->async_write_lock);
1368 
1369                 return;
1370         }
1371 
1372         /*
1373          * No cluster started yet, start one and add ourselves
1374          * to the list of clusters.
1375          */
1376         nrp->list = NULL;
1377 
1378         nlp = &nlpsp;
1379         nlp->fhp = &wa->wa_fhandle;
1380         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1381         nlp->list = nrp;
1382         nlp->next = NULL;
1383 
1384         if (nsrv->async_write_head == NULL) {
1385                 nsrv->async_write_head = nlp;
1386         } else {
1387                 lp = nsrv->async_write_head;
1388                 while (lp->next != NULL)
1389                         lp = lp->next;
1390                 lp->next = nlp;
1391         }
1392         mutex_exit(&nsrv->async_write_lock);
1393 
1394         /*
1395          * Convert the file handle common to all of the requests
1396          * in this cluster to a vnode.
1397          */
1398         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1399         if (vp == NULL) {
1400                 mutex_enter(&nsrv->async_write_lock);
1401                 if (nsrv->async_write_head == nlp)
1402                         nsrv->async_write_head = nlp->next;
1403                 else {
1404                         lp = nsrv->async_write_head;
1405                         while (lp->next != nlp)
1406                                 lp = lp->next;
1407                         lp->next = nlp->next;
1408                 }
1409                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1410                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1411                         rp->ns->ns_status = NFSERR_STALE;
1412                         rp->thread->t_flag |= t_flag;
1413                 }
1414                 cv_broadcast(&nlp->cv);
1415                 mutex_exit(&nsrv->async_write_lock);
1416 
1417                 return;
1418         }
1419 
1420         /*
1421          * Can only write regular files.  Attempts to write any
1422          * other file types fail with EISDIR.
1423          */
1424         if (vp->v_type != VREG) {
1425                 VN_RELE(vp);
1426                 mutex_enter(&nsrv->async_write_lock);
1427                 if (nsrv->async_write_head == nlp)
1428                         nsrv->async_write_head = nlp->next;
1429                 else {
1430                         lp = nsrv->async_write_head;
1431                         while (lp->next != nlp)
1432                                 lp = lp->next;
1433                         lp->next = nlp->next;
1434                 }
1435                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1436                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1437                         rp->ns->ns_status = NFSERR_ISDIR;
1438                         rp->thread->t_flag |= t_flag;
1439                 }
1440                 cv_broadcast(&nlp->cv);
1441                 mutex_exit(&nsrv->async_write_lock);
1442 
1443                 return;
1444         }
1445 
1446         /*
1447          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1448          * deadlock with ufs.
1449          */
1450         if (nbl_need_check(vp)) {
1451                 nbl_start_crit(vp, RW_READER);
1452                 in_crit = 1;
1453         }
1454 
1455         ct.cc_sysid = 0;
1456         ct.cc_pid = 0;
1457         ct.cc_caller_id = nfs2_srv_caller_id;
1458         ct.cc_flags = CC_DONTBLOCK;
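             /*
              * CC_DONTBLOCK asks a delegation monitor not to block; instead
              * it fails with EAGAIN and sets CC_WOULDBLOCK, which is checked
              * after VOP_RWLOCK() below so the response can be dropped.
              */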
1459 
1460         /*
1461          * Lock the file for writing.  This operation provides
1462          * the delay which allows clusters to grow.
1463          */
1464         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1465 
1466         /* check if a monitor detected a delegation conflict */
1467         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1468                 if (in_crit)
1469                         nbl_end_crit(vp);
1470                 VN_RELE(vp);
1471                 /* mark as wouldblock so response is dropped */
1472                 curthread->t_flag |= T_WOULDBLOCK;
1473                 mutex_enter(&nsrv->async_write_lock);
1474                 if (nsrv->async_write_head == nlp)
1475                         nsrv->async_write_head = nlp->next;
1476                 else {
1477                         lp = nsrv->async_write_head;
1478                         while (lp->next != nlp)
1479                                 lp = lp->next;
1480                         lp->next = nlp->next;
1481                 }
1482                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1483                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1484                                 rp->ns->ns_status = puterrno(error);
1485                                 rp->thread->t_flag |= T_WOULDBLOCK;
1486                         }
1487                 }
1488                 cv_broadcast(&nlp->cv);
1489                 mutex_exit(&nsrv->async_write_lock);
1490 
1491                 return;
1492         }
1493 
1494         /*
1495          * Disconnect this cluster from the list of clusters.
1496          * The cluster that is being dealt with must be fixed
1497          * in size after this point, so there is no reason
1498          * to leave it on the list so that new requests can
1499          * find it.
1500          *
1501          * The algorithm is that the first write request will
1502          * create a cluster, convert the file handle to a
1503          * vnode pointer, and then lock the file for writing.
1504          * This request is not likely to be clustered with
1505          * any others.  However, the next request will create
1506          * a new cluster and be blocked in VOP_RWLOCK while
1507          * the first request is being processed.  This delay
1508          * will allow more requests to be clustered in this
1509          * second cluster.
1510          */
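             /*
              * Concretely: request A creates a cluster and takes the write
              * lock; request B then starts a new cluster and blocks in
              * VOP_RWLOCK(); requests that arrive while B waits join B's
              * cluster and are processed together with it.
              */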
1511         mutex_enter(&nsrv->async_write_lock);
1512         if (nsrv->async_write_head == nlp)
1513                 nsrv->async_write_head = nlp->next;
1514         else {
1515                 lp = nsrv->async_write_head;
1516                 while (lp->next != nlp)
1517                         lp = lp->next;
1518                 lp->next = nlp->next;
1519         }
1520         mutex_exit(&nsrv->async_write_lock);
1521 
1522         /*
1523          * Step through the list of requests in this cluster.
1524          * We need to check permissions to make sure that all
1525          * of the requests have sufficient permission to write
1526          * the file.  A cluster can be composed of requests
1527          * from different clients and different users on each
1528          * client.
1529          *
1530          * As a side effect, we also calculate the size of the
1531          * byte range that this cluster encompasses.
1532          */
1533         rp = nlp->list;
1534         off = rp->wa->wa_offset;
1535         len = (uint_t)0;
1536         do {
1537                 if (rdonly(rp->ro, vp)) {
1538                         rp->ns->ns_status = NFSERR_ROFS;
1539                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1540                         rp->thread->t_flag |= t_flag;


1745 
1746         /*
1747          * If any data was written at all, then we need to flush
1748          * the data and metadata to stable storage.
1749          */
1750         if (data_written) {
1751                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1752 
1753                 if (!error) {
1754                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1755                 }
1756         }
1757 
1758         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1759 
1760         if (in_crit)
1761                 nbl_end_crit(vp);
1762         VN_RELE(vp);
1763 
1764         t_flag = curthread->t_flag & T_WOULDBLOCK;
1765         mutex_enter(&nsrv->async_write_lock);
1766         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1767                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1768                         rp->ns->ns_status = puterrno(error);
1769                         rp->thread->t_flag |= t_flag;
1770                 }
1771         }
1772         cv_broadcast(&nlp->cv);
1773         mutex_exit(&nsrv->async_write_lock);
1774 
1775 }
1776 
1777 void *
1778 rfs_write_getfh(struct nfswriteargs *wa)
1779 {
1780         return (&wa->wa_fhandle);
1781 }
1782 
1783 /*
1784  * Create a file.
1785  * Creates a file with given attributes and returns those attributes
1786  * and an fhandle for the new file.
1787  */
1788 void
1789 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1790     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1791 {
1792         int error;
1793         int lookuperr;


1815         if (dvp == NULL) {
1816                 dr->dr_status = NFSERR_STALE;
1817                 return;
1818         }
1819 
1820         error = sattr_to_vattr(args->ca_sa, &va);
 1821         if (error) {
                     VN_RELE(dvp);   /* don't leak the directory hold */
 1822                 dr->dr_status = puterrno(error);
 1823                 return;
 1824         }
1825 
1826         /*
1827          * Must specify the mode.
1828          */
1829         if (!(va.va_mask & AT_MODE)) {
1830                 VN_RELE(dvp);
1831                 dr->dr_status = NFSERR_INVAL;
1832                 return;
1833         }
1834 
1835         if (protect_zfs_mntpt(dvp) != 0) {
1836                 VN_RELE(dvp);
1837                 dr->dr_status = NFSERR_ACCES;
1838                 return;
1839         }
1840 
1841         /*
1842          * This is a completely gross hack to make mknod
 1843          * work over the wire until we can whack the protocol
1844          */
1845         if ((va.va_mode & IFMT) == IFCHR) {
1846                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1847                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1848                 else {
1849                         va.va_type = VCHR;
1850                         /*
 1851                          * uncompress the received dev_t if the top
 1852                          * half is zero, indicating a request from an
 1853                          * `older style' OS.
1854                          */
1855                         if ((va.va_size & 0xffff0000) == 0)
1856                                 va.va_rdev = nfsv2_expdev(va.va_size);
1857                         else
1858                                 va.va_rdev = (dev_t)va.va_size;
1859                 }
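                     /*
                      * The branch above assumes the old encoding fits the
                      * whole device number in the low 16 bits; nfsv2_expdev()
                      * expands such a compressed value to the native dev_t.
                      */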
1860                 va.va_mask &= ~AT_SIZE;


2160         vnode_t *tovp;
2161         struct exportinfo *to_exi;
2162         fhandle_t *fh;
2163         vnode_t *srcvp;
2164         vnode_t *targvp;
2165         int in_crit = 0;
2166 
2167         fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2168         if (fromvp == NULL) {
2169                 *status = NFSERR_STALE;
2170                 return;
2171         }
2172 
2173         fh = args->rna_to.da_fhandle;
2174         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2175         if (to_exi == NULL) {
2176                 VN_RELE(fromvp);
2177                 *status = NFSERR_ACCES;
2178                 return;
2179         }
2180         exi_rele(&to_exi);
2181 
2182         if (to_exi != exi) {
2183                 VN_RELE(fromvp);
2184                 *status = NFSERR_XDEV;
2185                 return;
2186         }
2187 
2188         tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2189         if (tovp == NULL) {
2190                 VN_RELE(fromvp);
2191                 *status = NFSERR_STALE;
2192                 return;
2193         }
2194 
2195         if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2196                 VN_RELE(tovp);
2197                 VN_RELE(fromvp);
2198                 *status = NFSERR_NOTDIR;
2199                 return;
2200         }
2201 
2202         /*
2203          * Disallow NULL paths
2204          */
2205         if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2206             args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2207                 VN_RELE(tovp);
2208                 VN_RELE(fromvp);
2209                 *status = NFSERR_ACCES;
2210                 return;
2211         }
2212 
2213         if (rdonly(ro, tovp)) {
2214                 VN_RELE(tovp);
2215                 VN_RELE(fromvp);
2216                 *status = NFSERR_ROFS;
2217                 return;
2218         }
2219 
2220         if (protect_zfs_mntpt(tovp) != 0) {
2221                 VN_RELE(tovp);
2222                 VN_RELE(fromvp);
2223                 *status = NFSERR_ACCES;
2224                 return;
2225         }
2226 
2227         /*
2228          * Check for a conflict with a non-blocking mandatory share reservation.
2229          */
2230         error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2231             NULL, cr, NULL, NULL, NULL);
2232         if (error != 0) {
2233                 VN_RELE(tovp);
2234                 VN_RELE(fromvp);
2235                 *status = puterrno(error);
2236                 return;
2237         }
2238 
2239         /* Check for delegations on the source file */
2240 
2241         if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2242                 VN_RELE(tovp);
2243                 VN_RELE(fromvp);
2244                 VN_RELE(srcvp);
2245                 curthread->t_flag |= T_WOULDBLOCK;
2246                 return;
2247         }
2248 
2249         /* Check for delegation on the file being renamed over, if it exists */
2250 
2251         if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2252             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2253             NULL, NULL, NULL) == 0) {
2254 
2255                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2256                         VN_RELE(tovp);
2257                         VN_RELE(fromvp);
2258                         VN_RELE(srcvp);
2259                         VN_RELE(targvp);
2260                         curthread->t_flag |= T_WOULDBLOCK;
2261                         return;
2262                 }
2263                 VN_RELE(targvp);
2264         }
2265 
2266 
2267         if (nbl_need_check(srcvp)) {
2268                 nbl_start_crit(srcvp, RW_READER);
2269                 in_crit = 1;
2270                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2271                         error = EACCES;


2313 {
2314         int error;
2315         vnode_t *fromvp;
2316         vnode_t *tovp;
2317         struct exportinfo *to_exi;
2318         fhandle_t *fh;
2319 
2320         fromvp = nfs_fhtovp(args->la_from, exi);
2321         if (fromvp == NULL) {
2322                 *status = NFSERR_STALE;
2323                 return;
2324         }
2325 
2326         fh = args->la_to.da_fhandle;
2327         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2328         if (to_exi == NULL) {
2329                 VN_RELE(fromvp);
2330                 *status = NFSERR_ACCES;
2331                 return;
2332         }
2333         exi_rele(&to_exi);
2334 
2335         if (to_exi != exi) {
2336                 VN_RELE(fromvp);
2337                 *status = NFSERR_XDEV;
2338                 return;
2339         }
2340 
2341         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2342         if (tovp == NULL) {
2343                 VN_RELE(fromvp);
2344                 *status = NFSERR_STALE;
2345                 return;
2346         }
2347 
2348         if (tovp->v_type != VDIR) {
2349                 VN_RELE(tovp);
2350                 VN_RELE(fromvp);
2351                 *status = NFSERR_NOTDIR;
2352                 return;
2353         }
2354         /*
2355          * Disallow NULL paths
2356          */
2357         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2358                 VN_RELE(tovp);
2359                 VN_RELE(fromvp);
2360                 *status = NFSERR_ACCES;
2361                 return;
2362         }
2363 
2364         if (rdonly(ro, tovp)) {
2365                 VN_RELE(tovp);
2366                 VN_RELE(fromvp);
2367                 *status = NFSERR_ROFS;
2368                 return;
2369         }
2370 
2371         if (protect_zfs_mntpt(tovp) != 0) {
2372                 VN_RELE(tovp);
2373                 VN_RELE(fromvp);
2374                 *status = NFSERR_ACCES;
2375                 return;
2376         }
2377 
2378         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2379 
2380         /*
2381          * Force modified data and metadata out to stable storage.
2382          */
2383         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2384         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2385 
2386         VN_RELE(tovp);
2387         VN_RELE(fromvp);
2388 
2389         *status = puterrno(error);
2390 
2391 }
2392 void *
2393 rfs_link_getfh(struct nfslinkargs *args)
2394 {
2395         return (args->la_from);
2396 }
2397 
2398 /*
 2399  * Symbolically link to a file.
2400  * Create a file (from) with the given attributes which is a symbolic link
2401  * to the given path name (to).
2402  */
2403 void
2404 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2405     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2406 {
2407         int error;
2408         struct vattr va;
2409         vnode_t *vp;
2410         vnode_t *svp;
2411         int lerror;
2412         struct sockaddr *ca;
2413         char *name = NULL;
2414 
2415         /*
2416          * Disallow NULL paths
2417          */
2418         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2419                 *status = NFSERR_ACCES;
2420                 return;


2428 
2429         if (rdonly(ro, vp)) {
2430                 VN_RELE(vp);
2431                 *status = NFSERR_ROFS;
2432                 return;
2433         }
2434 
2435         error = sattr_to_vattr(args->sla_sa, &va);
2436         if (error) {
2437                 VN_RELE(vp);
2438                 *status = puterrno(error);
2439                 return;
2440         }
2441 
2442         if (!(va.va_mask & AT_MODE)) {
2443                 VN_RELE(vp);
2444                 *status = NFSERR_INVAL;
2445                 return;
2446         }
2447 
2448         if (protect_zfs_mntpt(vp) != 0) {
2449                 VN_RELE(vp);
2450                 *status = NFSERR_ACCES;
2451                 return;
2452         }
2453 
2454         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2455         name = nfscmd_convname(ca, exi, args->sla_tnm,
2456             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2457 
 2458         if (name == NULL) {
                     VN_RELE(vp);    /* don't leak the vnode hold */
 2459                 *status = NFSERR_ACCES;
 2460                 return;
 2461         }
2462 
2463         va.va_type = VLNK;
2464         va.va_mask |= AT_TYPE;
2465 
2466         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2467 
2468         /*
2469          * Force new data and metadata out to stable storage.
2470          */
2471         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2472             NULL, cr, NULL, NULL, NULL);
2473 


2526 
2527         if (rdonly(ro, vp)) {
2528                 VN_RELE(vp);
2529                 dr->dr_status = NFSERR_ROFS;
2530                 return;
2531         }
2532 
2533         error = sattr_to_vattr(args->ca_sa, &va);
2534         if (error) {
2535                 VN_RELE(vp);
2536                 dr->dr_status = puterrno(error);
2537                 return;
2538         }
2539 
2540         if (!(va.va_mask & AT_MODE)) {
2541                 VN_RELE(vp);
2542                 dr->dr_status = NFSERR_INVAL;
2543                 return;
2544         }
2545 
2546         if (protect_zfs_mntpt(vp) != 0) {
2547                 VN_RELE(vp);
2548                 dr->dr_status = NFSERR_ACCES;
2549                 return;
2550         }
2551 
2552         va.va_type = VDIR;
2553         va.va_mask |= AT_TYPE;
2554 
2555         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2556 
2557         if (!error) {
2558                 /*
 2559                  * Attributes of the newly created directory should
2560                  * be returned to the client.
2561                  */
2562                 va.va_mask = AT_ALL; /* We want everything */
2563                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2564 
2565                 /* check for overflows */
2566                 if (!error) {
2567                         acl_perm(vp, exi, &va, cr);
2568                         error = vattr_to_nattr(&va, &dr->dr_attr);
2569                         if (!error) {
2570                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2571                         }


2617         if (vp == NULL) {
2618                 *status = NFSERR_STALE;
2619                 return;
2620         }
2621 
2622         if (rdonly(ro, vp)) {
2623                 VN_RELE(vp);
2624                 *status = NFSERR_ROFS;
2625                 return;
2626         }
2627 
2628         /*
2629          * VOP_RMDIR takes a third argument (the current
2630          * directory of the process).  That's because someone
2631          * wants to return EINVAL if one tries to remove ".".
2632          * Of course, NFS servers have no idea what their
2633          * clients' current directories are.  We fake it by
2634          * supplying a vnode known to exist and illegal to
2635          * remove.
2636          */
2637         error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2638 
2639         /*
2640          * Force modified data and metadata out to stable storage.
2641          */
2642         (void) VOP_FSYNC(vp, 0, cr, NULL);
2643 
2644         VN_RELE(vp);
2645 
2646         /*
2647          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2648          * if the directory is not empty.  A System V NFS server
2649          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2650          * over the wire.
2651          */
2652         if (error == EEXIST)
2653                 *status = NFSERR_NOTEMPTY;
2654         else
2655                 *status = puterrno(error);
2656 
2657 }
2658 void *
2659 rfs_rmdir_getfh(struct nfsdiropargs *da)
2660 {
2661         return (da->da_fhandle);
2662 }
2663 
2664 #ifdef nextdp
2665 #undef nextdp
2666 #endif
2667 #define nextdp(dp)      ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
2668 
2669 /* ARGSUSED */
2670 void
2671 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2672     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2673 {
2674         int error;
2675         vnode_t *vp;
2676         struct iovec iov;
2677         struct uio uio;
2678         int iseof;
2679 
2680         uint32_t count = rda->rda_count;
2681         uint32_t size;          /* size of the readdirres structure */
2682         int overflow = 0;
2683 
2684         size_t datasz;
2685         char *data = NULL;
2686         dirent64_t *dp;
2687 
2688         struct sockaddr *ca;
2689         struct nfsentry **eptr;
2690         struct nfsentry *entry;
2691 
2692         vp = nfs_fhtovp(&rda->rda_fh, exi);
2693         if (vp == NULL) {

2694                 rd->rd_status = NFSERR_STALE;
2695                 return;
2696         }
2697 
2698         if (vp->v_type != VDIR) {
2699                 VN_RELE(vp);

2700                 rd->rd_status = NFSERR_NOTDIR;
2701                 return;
2702         }
2703 
2704         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2705 
2706         error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2707         if (error)


2708                 goto bad;

2709 
2710         /*
2711          * Don't allow arbitrary counts for allocation
2712          */
2713         if (count > NFS_MAXDATA)
2714                 count = NFS_MAXDATA;

2715 
2716         /*
2717          * struct readdirres:
2718          *   status:            1
2719          *   entries (bool):    1
2720          *   eof:               1
2721          */
2722         size = (1 + 1 + 1) * BYTES_PER_XDR_UNIT;
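             /* With 4-byte XDR units this fixed overhead is 12 bytes. */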
2723 
2724         if (size > count) {
2725                 eptr = &rd->rd_entries;
2726                 iseof = 0;
2727                 size = 0;
2728 
2729                 goto done;
2730         }
2731 
2732         /*
 2733          * This is a simplification.  The dirent64_t size is not the same as
 2734          * the size of the XDR representation of an entry, but the sizes are
 2735          * similar, so we'll assume they are the same.  This should not cause
 2736          * any harm; in the worst case we will issue VOP_READDIR() once more.
2737          */
2738         datasz = count;

2739 
2740         /*
2741          * Make sure that there is room to read at least one entry
2742          * if any are available.
2743          */
2744         if (datasz < DIRENT64_RECLEN(MAXNAMELEN))
2745                 datasz = DIRENT64_RECLEN(MAXNAMELEN);
2746 
2747         data = kmem_alloc(datasz, KM_NOSLEEP);
2748         if (data == NULL) {
2749                 /* The allocation failed; downsize and wait for it this time */
2750                 if (datasz > MAXBSIZE)
2751                         datasz = MAXBSIZE;
2752                 data = kmem_alloc(datasz, KM_SLEEP);
2753         }
2754 
2755         uio.uio_iov = &iov;
2756         uio.uio_iovcnt = 1;
2757         uio.uio_segflg = UIO_SYSSPACE;
2758         uio.uio_extflg = UIO_COPY_CACHED;
2759         uio.uio_loffset = (offset_t)rda->rda_offset;
2760         uio.uio_resid = datasz;
2761 
2762         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2763         eptr = &rd->rd_entries;
2764         entry = NULL;
2765 
2766 getmoredents:
2767         iov.iov_base = data;
2768         iov.iov_len = datasz;
2769 
2770         error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2771         if (error) {
2772                 iseof = 0;
2773                 goto done;
2774         }
2775 
2776         if (iov.iov_len == datasz)
2777                 goto done;
2778 
2779         for (dp = (dirent64_t *)data;
2780             (char *)dp - data < datasz - iov.iov_len && !overflow;
2781             dp = nextdp(dp)) {
2782                 char *name;
2783                 uint32_t esize;
2784                 uint32_t cookie;
2785 
2786                 overflow = (uint64_t)dp->d_off > UINT32_MAX;
2787                 if (overflow) {
2788                         cookie = 0;
2789                         iseof = 1;
2790                 } else
2791                         cookie = (uint32_t)dp->d_off;
2792 
2793                 if (dp->d_ino == 0 || (uint64_t)dp->d_ino > UINT32_MAX) {
2794                         if (entry != NULL)
2795                                 entry->cookie = cookie;
2796                         continue;
2797                 }
2798 
2799                 name = nfscmd_convname(ca, exi, dp->d_name,
2800                     NFSCMD_CONV_OUTBOUND, NFS_MAXPATHLEN + 1);
2801                 if (name == NULL) {
2802                         if (entry != NULL)
2803                                 entry->cookie = cookie;
2804                         continue;
2805                 }
2806 







2807                 /*
2808                  * struct entry:
2809                  *   fileid:            1
2810                  *   name (length):     1
2811                  *   name (data):       length (rounded up)
2812                  *   cookie:            1
2813                  *   nextentry (bool):  1
2814                  */
2815                 esize = (1 + 1 + 1 + 1) * BYTES_PER_XDR_UNIT +
2816                     RNDUP(strlen(name));
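                     /*
                      * For example, a 3-character name costs
                      * 4 * BYTES_PER_XDR_UNIT + RNDUP(3) = 16 + 4 = 20 bytes,
                      * assuming 4-byte XDR units.
                      */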
2817 
2818                 /* If the new entry does not fit, discard it */
2819                 if (esize > count - size) {
2820                         if (name != dp->d_name)
2821                                 kmem_free(name, NFS_MAXPATHLEN + 1);
2822                         iseof = 0;
2823                         goto done;
2824                 }
2825 
2826                 entry = kmem_alloc(sizeof (struct nfsentry), KM_SLEEP);
2827 
2828                 entry->fileid = (uint32_t)dp->d_ino;
2829                 entry->name = strdup(name);
2830                 if (name != dp->d_name)
2831                         kmem_free(name, NFS_MAXPATHLEN + 1);
2832                 entry->cookie = cookie;
2833 
2834                 size += esize;
2835 
2836                 /* Add the entry to the linked list */
2837                 *eptr = entry;
2838                 eptr = &entry->nextentry;
2839         }
2840 
2841         if (!iseof && size < count) {
2842                 uio.uio_resid = MIN(datasz, MAXBSIZE);
2843                 goto getmoredents;
2844         }
2845 
2846 done:
2847         *eptr = NULL;
2848 
2849         if (iseof || rd->rd_entries != NULL || !error) {
2850                 error = 0;
2851                 rd->rd_eof = iseof ? TRUE : FALSE;
2852 
2853                 /* This is for nfslog only */
2854                 rd->rd_offset = rda->rda_offset;
2855                 rd->rd_size = size;
2856         }
2857 
2858 bad:
2859         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2860 
2861 #if 0 /* notyet */
2862         /*
2863          * Don't do this.  It causes local disk writes when just
2864          * reading the file and the overhead is deemed larger
2865          * than the benefit.
2866          */
2867         /*
2868          * Force modified metadata out to stable storage.
2869          */
2870         (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2871 #endif
2872 
2873         VN_RELE(vp);
2874 
2875         rd->rd_status = puterrno(error);
2876 
2877         if (data != NULL)
2878                 kmem_free(data, datasz);
2879 }
2880 void *
2881 rfs_readdir_getfh(struct nfsrddirargs *rda)
2882 {
2883         return (&rda->rda_fh);
2884 }
2885 void
2886 rfs_rddirfree(struct nfsrddirres *rd)
2887 {
2888         if (rd->rd_status == NFS_OK) {
2889                 struct nfsentry *entry, *nentry;
2890 
2891                 for (entry = rd->rd_entries; entry != NULL; entry = nentry) {
2892                         nentry = entry->nextentry;
2893                         strfree(entry->name);
2894                         kmem_free(entry, sizeof (struct nfsentry));
2895                 }
2896         }
2897 }
2898 
2899 /* ARGSUSED */
2900 void
2901 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2902     struct svc_req *req, cred_t *cr, bool_t ro)
2903 {
2904         int error;
2905         struct statvfs64 sb;
2906         vnode_t *vp;
2907 
2908         vp = nfs_fhtovp(fh, exi);
2909         if (vp == NULL) {
2910                 fs->fs_status = NFSERR_STALE;
2911                 return;
2912         }
2913 
2914         error = VFS_STATVFS(vp->v_vfsp, &sb);
2915 
2916         if (!error) {


2982                 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2983         }
2984         if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2985             sa->sa_mtime.tv_usec != (int32_t)-1) {
2986 #ifndef _LP64
2987                 /* return error if time overflow */
2988                 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2989                         return (EOVERFLOW);
2990 #endif
2991                 vap->va_mask |= AT_MTIME;
2992                 /*
 2993                  * The NFS protocol defines times as unsigned, so don't
 2994                  * sign-extend unless the sysadmin set nfs_allow_preepoch_time.
2995                  */
2996                 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2997                 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2998         }
2999         return (0);
3000 }
3001 
3002 static const enum nfsftype vt_to_nf[] = {
3003         0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
3004 };
3005 
3006 /*
3007  * check the following fields for overflow: nodeid, size, and time.
3008  * There could be a problem when converting 64-bit LP64 fields
3009  * into 32-bit ones.  Return an error if there is an overflow.
3010  */
3011 int
3012 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
3013 {
3014         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
3015         na->na_type = vt_to_nf[vap->va_type];
3016 
3017         if (vap->va_mode == (unsigned short) -1)
3018                 na->na_mode = (uint32_t)-1;
3019         else
3020                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
3021 
3022         if (vap->va_uid == (unsigned short)(-1))


3201                                                     aclentp->a_perm;
3202                                                 break;
3203                                         default:
3204                                                 break;
3205                                         }
3206                                 }
3207                         }
3208                         /* copy to va */
3209                         va->va_mode &= ~077;
3210                         va->va_mode |= grp_perm | other_perm;
3211                 }
3212                 if (vsa.vsa_aclcnt)
3213                         kmem_free(vsa.vsa_aclentp,
3214                             vsa.vsa_aclcnt * sizeof (aclent_t));
3215         }
3216 }
3217 
3218 void
3219 rfs_srvrinit(void)
3220 {

3221         nfs2_srv_caller_id = fs_new_caller_id();
3222         zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
3223 }
3224 
3225 void
3226 rfs_srvrfini(void)
3227 {

3228 }
3229 
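     /*
      * Zone-specific-data callbacks registered in rfs_srvrinit() above:
      * rfs_zone_init() allocates the per-zone nfs_srv_t when a zone is
      * created and rfs_zone_fini() tears it down when the zone goes away.
      */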
3230 /* ARGSUSED */
3231 static void *
3232 rfs_zone_init(zoneid_t zoneid)
3233 {
3234         nfs_srv_t *ns;
3235 
3236         ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3237 
3238         mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3239         ns->write_async = 1;
3240 
3241         return (ns);
3242 }
3243 
3244 /* ARGSUSED */
3245 static void
3246 rfs_zone_fini(zoneid_t zoneid, void *data)
3247 {
3248         nfs_srv_t *ns;
3249 
3250         ns = (nfs_srv_t *)data;
3251         mutex_destroy(&ns->async_write_lock);
3252         kmem_free(ns, sizeof (*ns));
3253 }
3254 
3255 static int
3256 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3257 {
3258         struct clist    *wcl;
3259         int             wlist_len;
3260         uint32_t        count = rr->rr_count;
3261 
3262         wcl = ra->ra_wlist;
3263 
3264         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3265                 return (FALSE);
3266         }
3267 
3268         wcl = ra->ra_wlist;
3269         rr->rr_ok.rrok_wlist_len = wlist_len;
3270         rr->rr_ok.rrok_wlist = wcl;
3271 
3272         return (TRUE);
3273 }