 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All rights reserved.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>

#include <nfs/nfs.h>
#include <nfs/export.h>
#include <nfs/nfs_cmd.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>

#include <sys/strsubr.h>

struct rfs_async_write_list;

/*
 * Per-zone globals for the NFSv2 server.
 */
typedef struct nfs_srv {
	kmutex_t		async_write_lock;
	struct rfs_async_write_list *async_write_head;

	/*
	 * Write clustering is enabled when this is set to 1.
	 */
	int			write_async;
} nfs_srv_t;

/*
 * These are the interface routines for the server side of the
 * Network File System. See the NFS version 2 protocol specification
 * for a description of this interface.
 */

static int sattr_to_vattr(struct nfssattr *, struct vattr *);
static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
    cred_t *);

/*
 * Some "over the wire" UNIX file types. These are encoded
 * into the mode. This needs to be fixed in the next rev.
 */
#define	IFMT	0170000		/* type of file */
#define	IFCHR	0020000		/* character special */
#define	IFBLK	0060000		/* block special */
#define	IFSOCK	0140000		/* socket */

u_longlong_t nfs2_srv_caller_id;

static nfs_srv_t *
nfs_get_srv(void)
{
	nfs_globals_t *ng = nfs_srv_getzg();
	nfs_srv_t *srv = ng->nfs_srv;
	ASSERT(srv != NULL);
	return (srv);
}

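/*
 * Illustrative sketch (not part of the original file): because each zone
 * owns its own nfs_srv_t, per-zone state such as write_async is reached
 * through nfs_get_srv() rather than a module-global variable.  The helper
 * below is hypothetical and compiled out; it only makes the access
 * pattern concrete.
 */
#if 0
static boolean_t
rfs_write_async_enabled(void)
{
	/* Resolves to the nfs_srv_t of the zone servicing this request. */
	return (nfs_get_srv()->write_async != 0);
}
#endif
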
/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}

/*
 * Given a mounted "dvp" and its "exi", climb to the covering mountpoint,
 * correcting dvp/exi to match.
 * Returns 0 on success.
 */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;
	vnode_t *zone_rootvp;

	zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
	ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));

	VN_HOLD(dvp);
	dvp = untraverse(dvp, zone_rootvp);
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
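
/*
 * Illustrative sketch (not part of the original file): the calling
 * convention rfs_climb_crossmnt() expects.  Both *dvpp and *exip must be
 * held; on success they are released and replaced with held references
 * into the covering file system, and on failure they are left untouched.
 * The variable names follow rfs_lookup() below.
 */
#if 0
	if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
		/* dvp and exi still refer to the original file system */
		error = NFSERR_ACCES;
		goto out;
	}
#endif
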
/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	exi_hold(exi);
	ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);

	/*
	 * Do not allow lookups beyond the root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
			/*
			 * special case for ".." and a 'nohide' exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else {
			error = NFSERR_NOENT;
			goto out;
		}
	}
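
	/*
	 * Illustrative example (not from the original file): if /export/a
	 * is a file system exported with nohide and mounted under an
	 * exported /export, a client lookup of ".." at the root of
	 * /export/a climbs into /export; without nohide the lookup fails
	 * with NFSERR_NOENT.
	 */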

	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		exi_rele(exi);
		exi = NULL;

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}


	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname. This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;
		kstat_named_t *stat =
		    exi->exi_ne->ne_globals->svstat[NFS_VERSION];

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		stat[NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

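	/*
	 * Illustrative note (not part of the original file): the
	 * CC_DONTBLOCK handshake used above.  The caller_context is primed
	 * before VOP_RWLOCK (the same setup rfs_write() performs later in
	 * this file); a delegation-monitored file system that would block
	 * returns EAGAIN with CC_WOULDBLOCK set, and the reply is dropped
	 * so the client retransmits after the delegation is recalled.
	 */
#if 0
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;	/* ask VOP_RWLOCK not to block */
#endif
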
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		goto out;
	}

	if (wa->wa_data || wa->wa_rlist) {
		/* Do the RDMA thing if necessary */
		if (wa->wa_rlist) {
			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
			iov[0].iov_len = wa->wa_count;
		} else {
			iov[0].iov_base = wa->wa_data;
			iov[0].iov_len = wa->wa_count;
		}
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

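		/*
		 * Worked example (illustrative, not from the original
		 * file): with p_fsz_ctl == 1 GiB and wa_offset ==
		 * 1 GiB - 4 KiB, rlimit is 4 KiB, so an 8 KiB write has
		 * uio_resid clamped to the 4 KiB that still fits under
		 * the file-size limit.
		 */
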
		/*
		 * for now we assume no append mode
		 */
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;
	} else {

		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;
	nfs_srv_t *nsrv;

	ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
	nsrv = nfs_get_srv();
	if (!nsrv->write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize the status to RFSWRITE_INITVAL instead of 0, since a
	 * value of 0 is considered OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started. We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset. Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files. Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing. This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster being dealt with must be fixed in size
	 * after this point, so there is no reason to leave it
	 * on the list where new requests could find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others. However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed. This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file. A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}

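/*
 * Illustrative sketch (not part of the original file): the write-
 * clustering life cycle from the comments above, reduced to a skeleton.
 * find_cluster(), insert_by_offset(), append_cluster() and
 * unlink_cluster() are hypothetical helpers named only for exposition;
 * the real code open-codes them against nsrv->async_write_head.
 */
#if 0
	mutex_enter(&nsrv->async_write_lock);
	if ((lp = find_cluster(nsrv, &wa->wa_fhandle)) != NULL) {
		/* Join: queue by offset, then sleep until processed. */
		insert_by_offset(lp, nrp);
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);
		return;
	}
	append_cluster(nsrv, nlp);		/* start a new cluster */
	mutex_exit(&nsrv->async_write_lock);

	/* Blocking here is what gives later requests time to join. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	unlink_cluster(nsrv, nlp);	/* cluster is now fixed in size */
	/* ... coalesce the queued requests, issue the write(s) ... */
	cv_broadcast(&nlp->cv);		/* wake every queued requester */
#endif
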
void *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}

/*
 * Create a file.
 * Creates a file with given attributes and returns those attributes
 * and an fhandle for the new file.
 */
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int lookuperr;
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR takes a third argument (the current
	 * directory of the process). That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are. We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty. A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * The NFS protocol defines times as unsigned, so don't
		 * sign-extend unless the sysadmin has set
		 * nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}

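/*
 * Worked example (illustrative, not from the original file): the NFSv2
 * wire format carries microseconds, so a client sa_mtime of
 * { tv_sec = 1, tv_usec = 500 } lands in the vattr as tv_sec = 1,
 * tv_nsec = 500 * 1000 = 500000.
 */
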
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};

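/*
 * Illustrative example (not from the original file): how a vnode type is
 * folded into the wire mode below.  A character device with permission
 * bits 0644 goes out as na_mode == IFCHR | 0644 == 0020644, while
 * vt_to_nf[VCHR] supplies the matching na_type (NFCHR).
 */
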
/*
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones. Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
				    aclentp->a_perm;
					break;
				default:
					break;
				}
			}
		}
		/* copy to va */
		va->va_mode &= ~077;
		va->va_mode |= grp_perm | other_perm;
	}
	if (vsa.vsa_aclcnt)
		kmem_free(vsa.vsa_aclentp,
		    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}

void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
}

void
rfs_srvrfini(void)
{
}

/* ARGSUSED */
void
rfs_srv_zone_init(nfs_globals_t *ng)
{
	nfs_srv_t *ns;

	ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);

	mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
	ns->write_async = 1;

	ng->nfs_srv = ns;
}

/* ARGSUSED */
void
rfs_srv_zone_fini(nfs_globals_t *ng)
{
	nfs_srv_t *ns = ng->nfs_srv;

	ng->nfs_srv = NULL;

	mutex_destroy(&ns->async_write_lock);
	kmem_free(ns, sizeof (*ns));
}

3148
3149 static int
3150 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3151 {
3152 struct clist *wcl;
3153 int wlist_len;
3154 uint32_t count = rr->rr_count;
3155
3156 wcl = ra->ra_wlist;
3157
3158 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3159 return (FALSE);
3160 }
3161
3162 wcl = ra->ra_wlist;
3163 rr->rr_ok.rrok_wlist_len = wlist_len;
3164 rr->rr_ok.rrok_wlist = wcl;
3165
3166 return (TRUE);
3167 }
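
/*
 * Illustrative sketch (not part of the original file): how a read reply
 * might be routed through rdma_setup_read_data2() when the client
 * supplied an RDMA write list.  The surrounding names (ra, rr) follow
 * the read path above; the exact call site and error handling are
 * assumptions, shown only to make the helper's role concrete.
 */
#if 0
	if (ra->ra_wlist != NULL) {
		/* Describe the reply data as RDMA write chunks. */
		if (!rdma_setup_read_data2(ra, rr))
			rr->rr_status = NFSERR_IO;	/* hypothetical */
	}
#endif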