 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All rights reserved.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>

#include <nfs/nfs.h>
#include <nfs/export.h>
#include <nfs/nfs_cmd.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>

#include <sys/strsubr.h>

struct rfs_async_write_list;

/*
 * Per-zone globals for the NFSv2 server.
 */
typedef struct nfs_srv {
	kmutex_t		async_write_lock;
	struct rfs_async_write_list *async_write_head;

	/*
	 * Write clustering is enabled when this is set to 1.
	 */
	int			write_async;
} nfs_srv_t;

/*
 * These are the interface routines for the server side of the
 * Network File System. See the NFS version 2 protocol specification
 * for a description of this interface.
 */

static int sattr_to_vattr(struct nfssattr *, struct vattr *);
static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
    cred_t *);

/*
 * Some "over the wire" UNIX file types. These are encoded
 * into the mode. This needs to be fixed in the next rev.
 */
#define	IFMT	0170000		/* type of file */
#define	IFCHR	0020000		/* character special */
#define	IFBLK	0060000		/* block special */
#define	IFSOCK	0140000		/* socket */

u_longlong_t nfs2_srv_caller_id;

static nfs_srv_t *
nfs_get_srv(void)
{
	nfs_globals_t *ng = nfs_srv_getzg();
	nfs_srv_t *srv = ng->nfs_srv;
	ASSERT(srv != NULL);
	return (srv);
}

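/*
 * Illustrative sketch (not part of the original file): because each zone
 * owns its own nfs_srv_t, per-zone state such as write_async is reached
 * through nfs_get_srv() rather than a module-global variable.  The helper
 * below is hypothetical and compiled out; it only makes the access
 * pattern concrete.
 */
#if 0
static boolean_t
rfs_write_async_enabled(void)
{
	/* Resolves to the nfs_srv_t of the zone servicing this request. */
	return (nfs_get_srv()->write_async != 0);
}
#endif
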
/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}

/*
 * Given a mounted "dvp" and its "exi", climb to the covering mountpoint,
 * correcting dvp/exi to match.
 * Returns 0 on success.
 */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;
	vnode_t *zone_rootvp;

	zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
	ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));

	VN_HOLD(dvp);
	dvp = untraverse(dvp, zone_rootvp);
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
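
/*
 * Illustrative sketch (not part of the original file): the calling
 * convention rfs_climb_crossmnt() expects.  Both *dvpp and *exip must be
 * held; on success they are released and replaced with held references
 * into the covering file system, and on failure they are left untouched.
 * The variable names follow rfs_lookup() below.
 */
#if 0
	if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
		/* dvp and exi still refer to the original file system */
		error = NFSERR_ACCES;
		goto out;
	}
#endif
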
/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	exi_hold(exi);
	ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);

	/*
	 * Do not allow lookups beyond the root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
			/*
			 * special case for ".." and a 'nohide' exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else {
			error = NFSERR_NOENT;
			goto out;
		}
	}
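
	/*
	 * Illustrative example (not from the original file): if /export/a
	 * is a file system exported with nohide and mounted under an
	 * exported /export, a client lookup of ".." at the root of
	 * /export/a climbs into /export; without nohide the lookup fails
	 * with NFSERR_NOENT.
	 */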

	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		exi_rele(exi);
		exi = NULL;

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}


	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname. This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;
		kstat_named_t *stat =
		    exi->exi_ne->ne_globals->svstat[NFS_VERSION];

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		stat[NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

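	/*
	 * Illustrative note (not part of the original file): the
	 * CC_DONTBLOCK handshake used above.  The caller_context is primed
	 * before VOP_RWLOCK (the same setup rfs_write() performs later in
	 * this file); a delegation-monitored file system that would block
	 * returns EAGAIN with CC_WOULDBLOCK set, and the reply is dropped
	 * so the client retransmits after the delegation is recalled.
	 */
#if 0
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;	/* ask VOP_RWLOCK not to block */
#endif
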
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		goto out;
	}

	if (wa->wa_data || wa->wa_rlist) {
		/* Do the RDMA thing if necessary */
		if (wa->wa_rlist) {
			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
			iov[0].iov_len = wa->wa_count;
		} else {
			iov[0].iov_base = wa->wa_data;
			iov[0].iov_len = wa->wa_count;
		}
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

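		/*
		 * Worked example (illustrative, not from the original
		 * file): with p_fsz_ctl == 1 GiB and wa_offset ==
		 * 1 GiB - 4 KiB, rlimit is 4 KiB, so an 8 KiB write has
		 * uio_resid clamped to the 4 KiB that still fits under
		 * the file-size limit.
		 */
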
		/*
		 * for now we assume no append mode
		 */
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;
	} else {

		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;
	nfs_srv_t *nsrv;

	ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
	nsrv = nfs_get_srv();
	if (!nsrv->write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize the status to RFSWRITE_INITVAL instead of 0, since a
	 * value of 0 is considered OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started. We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset. Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files. Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing. This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster being dealt with must be fixed in size
	 * after this point, so there is no reason to leave it
	 * on the list where new requests could find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others. However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed. This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file. A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}

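/*
 * Illustrative sketch (not part of the original file): the write-
 * clustering life cycle from the comments above, reduced to a skeleton.
 * find_cluster(), insert_by_offset(), append_cluster() and
 * unlink_cluster() are hypothetical helpers named only for exposition;
 * the real code open-codes them against nsrv->async_write_head.
 */
#if 0
	mutex_enter(&nsrv->async_write_lock);
	if ((lp = find_cluster(nsrv, &wa->wa_fhandle)) != NULL) {
		/* Join: queue by offset, then sleep until processed. */
		insert_by_offset(lp, nrp);
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);
		return;
	}
	append_cluster(nsrv, nlp);		/* start a new cluster */
	mutex_exit(&nsrv->async_write_lock);

	/* Blocking here is what gives later requests time to join. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	unlink_cluster(nsrv, nlp);	/* cluster is now fixed in size */
	/* ... coalesce the queued requests, issue the write(s) ... */
	cv_broadcast(&nlp->cv);		/* wake every queued requester */
#endif
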
void *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}

/*
 * Create a file.
 * Creates a file with given attributes and returns those attributes
 * and an fhandle for the new file.
 */
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int lookuperr;
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR takes a third argument (the current
	 * directory of the process). That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are. We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty. A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * The NFS protocol defines times as unsigned, so don't
		 * sign-extend unless the sysadmin has set
		 * nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}

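/*
 * Worked example (illustrative, not from the original file): the NFSv2
 * wire format carries microseconds, so a client sa_mtime of
 * { tv_sec = 1, tv_usec = 500 } lands in the vattr as tv_sec = 1,
 * tv_nsec = 500 * 1000 = 500000.
 */
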
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};

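/*
 * Illustrative example (not from the original file): how a vnode type is
 * folded into the wire mode below.  A character device with permission
 * bits 0644 goes out as na_mode == IFCHR | 0644 == 0020644, while
 * vt_to_nf[VCHR] supplies the matching na_type (NFCHR).
 */
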
/*
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones. Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
				    aclentp->a_perm;
					break;
				default:
					break;
				}
			}
		}
		/* copy to va */
		va->va_mode &= ~077;
		va->va_mode |= grp_perm | other_perm;
	}
	if (vsa.vsa_aclcnt)
		kmem_free(vsa.vsa_aclentp,
		    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}

void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
}

void
rfs_srvrfini(void)
{
}

/* ARGSUSED */
void
rfs_srv_zone_init(nfs_globals_t *ng)
{
	nfs_srv_t *ns;

	ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);

	mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
	ns->write_async = 1;

	ng->nfs_srv = ns;
}

/* ARGSUSED */
void
rfs_srv_zone_fini(nfs_globals_t *ng)
{
	nfs_srv_t *ns = ng->nfs_srv;

	ng->nfs_srv = NULL;

	mutex_destroy(&ns->async_write_lock);
	kmem_free(ns, sizeof (*ns));
}

3148
3149 static int
3150 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3151 {
3152 struct clist *wcl;
3153 int wlist_len;
3154 uint32_t count = rr->rr_count;
3155
3156 wcl = ra->ra_wlist;
3157
3158 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3159 return (FALSE);
3160 }
3161
3162 wcl = ra->ra_wlist;
3163 rr->rr_ok.rrok_wlist_len = wlist_len;
3164 rr->rr_ok.rrok_wlist = wcl;
3165
3166 return (TRUE);
3167 }
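
/*
 * Illustrative sketch (not part of the original file): how a read reply
 * might be routed through rdma_setup_read_data2() when the client
 * supplied an RDMA write list.  The surrounding names (ra, rr) follow
 * the read path above; the exact call site and error handling are
 * assumptions, shown only to make the helper's role concrete.
 */
#if 0
	if (ra->ra_wlist != NULL) {
		/* Describe the reply data as RDMA write chunks. */
		if (!rdma_setup_read_data2(ra, rr))
			rr->rr_status = NFSERR_IO;	/* hypothetical */
	}
#endif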