1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /*
29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 * All rights reserved.
31 */
32
33 /*
34 * Copyright 2018 Nexenta Systems, Inc.
35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 */
37
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/stat.h>
47 #include <sys/errno.h>
48 #include <sys/sysmacros.h>
49 #include <sys/statvfs.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/dirent.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/vtrace.h>
56 #include <sys/mode.h>
57 #include <sys/acl.h>
58 #include <sys/nbmlock.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/svc.h>
65
66 #include <nfs/nfs.h>
67 #include <nfs/export.h>
68 #include <nfs/nfs_cmd.h>
69
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_kmem.h>
75
76 #include <sys/strsubr.h>
77
struct rfs_async_write_list;

/*
 * Zone globals of NFSv2 server
 *
 * Each zone running an NFS server has its own instance, fetched via
 * nfs_get_srv() (zone-specific data keyed by nfssrv_zone_key).
 */
typedef struct nfs_srv {
	/* protects async_write_head and the per-file write cluster lists */
	kmutex_t	async_write_lock;
	/* in-progress clustered writes, one list entry per file handle */
	struct rfs_async_write_list *async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int		write_async;
} nfs_srv_t;
92
93 /*
94 * These are the interface routines for the server side of the
95 * Network File System. See the NFS version 2 protocol specification
96 * for a description of this interface.
97 */
98
99 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
100 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
101 cred_t *);
102
103
/*
 * Some "over the wire" UNIX file types. These are encoded
 * into the mode. This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/* Unique caller id used in caller_context_t for NFSv2 server VOP calls. */
u_longlong_t nfs2_srv_caller_id;
114
115 static nfs_srv_t *
116 nfs_get_srv(void)
117 {
118 nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
119 nfs_srv_t *srv = ng->nfs_srv;
120 ASSERT(srv != NULL);
121 return (srv);
122 }
123
/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 *
 * NFSv2 GETATTR: translate the file handle to a vnode, fetch the full
 * attribute set (via rfs4_delegated_getattr(), which honors any NFSv4
 * delegation on the file), and encode the result into the reply.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		/* File handle no longer maps to a file on this export. */
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		/* Lie about the object type for a referral */
		if (vn_is_nfs_reparse(vp, cr))
			va.va_type = VLNK;

		/* Adjust the reported mode bits (see acl_perm() below). */
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
164 void *
165 rfs_getattr_getfh(fhandle_t *fhp)
166 {
167 return (fhp);
168 }
169
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 *
 * NFSv2 SETATTR.  Size changes get special treatment (see the long
 * comment below); all other attribute changes go through VOP_SETATTR.
 * A delegation conflict detected during the operation causes the
 * response to be dropped (T_WOULDBLOCK) so the client retries.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* flags passed to VOP_SETATTR */
	int in_crit = 0;	/* non-zero while inside nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing. If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		/* Enter the critical region before checking for conflicts. */
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The affected region is between the old and new
			 * EOF, whichever order they come in.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Owner gets to resize regardless of mode bits; do it
		 * with VOP_SPACE (F_FREESP to the new EOF) and drop
		 * AT_SIZE so VOP_SETATTR below won't redo (and
		 * access-check) the size change.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
357 void *
358 rfs_setattr_getfh(struct nfssaargs *args)
359 {
360 return (&args->saa_fh);
361 }
362
/*
 * Cross a mount point encountered during a lookup: if the filesystem
 * mounted on *vpp is itself exported with "nohide", swap *vpp/*exip to
 * the mounted filesystem's root vnode and export.
 * Change and release @exip and @vpp only on success.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	VN_HOLD(vp);

	/* Follow the mount stack; on return vp is the covering fs root. */
	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	/* Is the filesystem we crossed into exported (with nohide)? */
	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
408
/*
 * Given mounted "dvp" and "exi", go to the upper mountpoint
 * with dvp/exi correction (the inverse of rfs_cross_mnt(), used
 * for ".." lookups out of a nohide-exported filesystem root).
 * Return 0 on success.
 */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;

	ASSERT3U((*exip)->exi_zoneid, ==, curzone->zone_id);
	ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));

	VN_HOLD(dvp);
	/* Step from the filesystem root back to the covered vnode. */
	dvp = untraverse((*exip)->exi_ne, dvp);
	/* Find the export that contains the underlying directory. */
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);
	/* Success: swap in the new export and vnode, dropping the old. */
	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 *
 * Handles the WebNFS public file handle (multi-component lookup),
 * ".." escapes out of nohide exports, and crossing into nohide
 * submounts.  The exportinfo reference may be swapped along the way;
 * whatever "exi" ends up pointing at is released at "out".
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;		/* directory being searched */
	vnode_t *vp;		/* result of the lookup */
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/* Hold the export; released at "out" (or swapped by crossmnt code). */
	exi_hold(exi);
	ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);

	/*
	 * Not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
			/*
			 * special case for ".." and 'nohide' exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	/* Convert the client-supplied name to the server's charset. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		/* mclookup locates and holds the proper export itself */
		exi_rele(exi);
		exi = NULL;

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	/* If we landed on a mount point, try to cross into the submount. */
	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	if (exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
600 void *
601 rfs_lookup_getfh(struct nfsdiropargs *da)
602 {
603 return (da->da_fhandle);
604 }
605
/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 *
 * NFS referrals (reparse points) are presented to v2/v3 clients as
 * symlinks, so for a referral an artificial link target is built
 * instead of calling VOP_READLINK().
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NUL-terminate what VOP_READLINK() produced.
		 * NOTE(review): if the link filled the whole buffer
		 * (rl_count == NFS_MAXPATHLEN) this writes one byte past
		 * rl_data; appears unreachable only because symlink
		 * targets are shorter than MAXPATHLEN — confirm.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Convert the link text to the client's charset if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
736 void *
737 rfs_readlink_getfh(fhandle_t *fhp)
738 {
739 return (fhp);
740 }
741 /*
742 * Free data allocated by rfs_readlink
743 */
744 void
745 rfs_rlfree(struct nfsrdlnres *rl)
746 {
747 if (rl->rl_data != NULL)
748 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
749 }
750
751 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
752
/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 *
 * NFSv2 READ.  Takes the vnode's rwlock as reader, honors non-blocking
 * mandatory locks, and supports both mblk-based replies (normal RPC)
 * and RDMA write chunks (ra_wlist).  A delegation conflict causes the
 * response to be dropped (T_WOULDBLOCK) so the client retries.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;		/* reply data buffer (non-RDMA case) */
	int alloc_err = 0;
	int in_crit = 0;	/* non-zero while inside nbmand critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	/* Refuse objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* Read at or past EOF: success with zero bytes. */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA: read directly into the client-provided chunk. */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			/*
			 * NOTE(review): rr_status set here is overwritten
			 * at "done" by puterrno(error), and rr_count is
			 * never set on this path — confirm whether this
			 * early exit should bypass the "done" label.
			 */
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		/* allocb_wait() with STR_NOSIG sleeps rather than failing */
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
1007
1008 /*
1009 * Free data allocated by rfs_read
1010 */
1011 void
1012 rfs_rdfree(struct nfsrdresult *rr)
1013 {
1014 mblk_t *mp;
1015
1016 if (rr->rr_status == NFS_OK) {
1017 mp = rr->rr_mp;
1018 if (mp != NULL)
1019 freeb(mp);
1020 }
1021 }
1022
1023 void *
1024 rfs_read_getfh(struct nfsreadargs *ra)
1025 {
1026 return (&ra->ra_fhandle);
1027 }
1028
/* Size of the on-stack iovec array used by rfs_write_sync(). */
#define	MAX_IOVECS	12

#ifdef DEBUG
/* Counters: did the request's mblk chain fit in the on-stack iovecs? */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1035
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling might have
 * to also be done in rfs_write (which clusters write requests).
 *
 * The data may arrive as a flat buffer (wa_data), an RDMA read chunk
 * (wa_rlist), or an mblk chain (wa_mblk); all three paths end in a
 * synchronous (FSYNC) VOP_WRITE.
 */
/* ARGSUSED */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct iovec iov[MAX_IOVECS];	/* on-stack iovecs for mblk chains */
	mblk_t *m;
	struct iovec *iovp;
	int iovcnt;
	cred_t *savecred;
	int in_crit = 0;	/* non-zero while inside nbmand critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ISDIR;
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	va.va_mask = AT_UID|AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);

		return;
	}

	if (crgetuid(cr) != va.va_uid) {
		/*
		 * This is a kludge to allow writes of files created
		 * with read only permission.  The owner of the file
		 * is always allowed to write it.
		 */
		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);

		if (error) {
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}
	}

	/*
	 * Can't access a mandatory lock file.  This might cause
	 * the NFS service thread to block forever waiting for a
	 * lock to be released that will never be released.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ACCES;
		return;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		goto out;
	}

	if (wa->wa_data || wa->wa_rlist) {
		/* Do the RDMA thing if necessary */
		if (wa->wa_rlist) {
			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
			iov[0].iov_len = wa->wa_count;
		} else {
			iov[0].iov_base = wa->wa_data;
			iov[0].iov_len = wa->wa_count;
		}
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * for now we assume no append mode
		 */
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;
	} else {
		/* Data arrived as an mblk chain: one iovec per mblk. */
		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
			/* Chain too long for the stack array; allocate. */
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;

		if (iovp != iov)
			kmem_free(iovp, sizeof (*iovp) * iovcnt);
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (!error) {
		/*
		 * Get attributes again so we send the latest mod
		 * time to the client side for its cache.
		 */
		va.va_mask = AT_ALL;	/* now we want everything */

		error = VOP_GETATTR(vp, &va, 0, cr, &ct);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

out:
	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
	else
		ns->ns_status = puterrno(error);

}
1258
/*
 * One queued NFSv2 WRITE request, linked (sorted by starting offset)
 * into a per-file cluster while rfs_write() coalesces neighboring
 * writes.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* request arguments */
	struct nfsattrstat *ns;		/* reply; ns_status also tracks state */
	struct svc_req *req;
	cred_t *cr;
	bool_t ro;			/* export mounted read-only */
	kthread_t *thread;		/* service thread owning the request */
	struct rfs_async_write *list;	/* next request, offset order */
};

/*
 * Cluster of pending writes to one file (matched by fhandle); see
 * rfs_write().
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file these writes apply to */
	kcondvar_t cv;			/* waited on until a request's status changes */
	struct rfs_async_write *list;	/* offset-sorted request list */
	struct rfs_async_write_list *next;	/* cluster for the next file */
};
1275
/*
 * NOTE(review): these file-scope globals appear to be superseded by the
 * per-zone nfs_srv_t fields of the same names (rfs_write() below uses
 * the zone copy via nfs_get_srv()).  Confirm they are unreferenced
 * before removing.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs a clustered write may use; larger clusters fall back. */
#define	MAXCLIOVECS	42
/* Sentinel status meaning "request not yet processed" (0 == NFS_OK). */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
/* Counters: how often clustering coalesced (hit) vs. did not (miss). */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1287
1288 /*
1289 * Write data to file.
1290 * Returns attributes of a file after writing some data to it.
1291 */
1292 void
1293 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1294 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1295 {
1296 int error;
1297 vnode_t *vp;
1298 rlim64_t rlimit;
1299 struct vattr va;
1300 struct uio uio;
1301 struct rfs_async_write_list *lp;
1302 struct rfs_async_write_list *nlp;
1303 struct rfs_async_write *rp;
1304 struct rfs_async_write *nrp;
1305 struct rfs_async_write *trp;
1306 struct rfs_async_write *lrp;
1307 int data_written;
1308 int iovcnt;
1309 mblk_t *m;
1310 struct iovec *iovp;
1311 struct iovec *niovp;
1312 struct iovec iov[MAXCLIOVECS];
1313 int count;
1314 int rcount;
1315 uint_t off;
1316 uint_t len;
1317 struct rfs_async_write nrpsp;
1318 struct rfs_async_write_list nlpsp;
1319 ushort_t t_flag;
1320 cred_t *savecred;
1321 int in_crit = 0;
1322 caller_context_t ct;
1323 nfs_srv_t *nsrv;
1324
1325 ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
1326 nsrv = nfs_get_srv();
1327 if (!nsrv->write_async) {
1328 rfs_write_sync(wa, ns, exi, req, cr, ro);
1329 return;
1330 }
1331
1332 /*
1333 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1334 * is considered an OK.
1335 */
1336 ns->ns_status = RFSWRITE_INITVAL;
1337
1338 nrp = &nrpsp;
1339 nrp->wa = wa;
1340 nrp->ns = ns;
1341 nrp->req = req;
1342 nrp->cr = cr;
1343 nrp->ro = ro;
1344 nrp->thread = curthread;
1345
1346 ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1347
1348 /*
1349 * Look to see if there is already a cluster started
1350 * for this file.
1351 */
1352 mutex_enter(&nsrv->async_write_lock);
1353 for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1354 if (bcmp(&wa->wa_fhandle, lp->fhp,
1355 sizeof (fhandle_t)) == 0)
1356 break;
1357 }
1358
1359 /*
1360 * If lp is non-NULL, then there is already a cluster
1361 * started. We need to place ourselves in the cluster
1362 * list in the right place as determined by starting
1363 * offset. Conflicts with non-blocking mandatory locked
1364 * regions will be checked when the cluster is processed.
1365 */
1366 if (lp != NULL) {
1367 rp = lp->list;
1368 trp = NULL;
1369 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1370 trp = rp;
1371 rp = rp->list;
1372 }
1373 nrp->list = rp;
1374 if (trp == NULL)
1375 lp->list = nrp;
1376 else
1377 trp->list = nrp;
1378 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1379 cv_wait(&lp->cv, &nsrv->async_write_lock);
1380 mutex_exit(&nsrv->async_write_lock);
1381
1382 return;
1383 }
1384
1385 /*
1386 * No cluster started yet, start one and add ourselves
1387 * to the list of clusters.
1388 */
1389 nrp->list = NULL;
1390
1391 nlp = &nlpsp;
1392 nlp->fhp = &wa->wa_fhandle;
1393 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1394 nlp->list = nrp;
1395 nlp->next = NULL;
1396
1397 if (nsrv->async_write_head == NULL) {
1398 nsrv->async_write_head = nlp;
1399 } else {
1400 lp = nsrv->async_write_head;
1401 while (lp->next != NULL)
1402 lp = lp->next;
1403 lp->next = nlp;
1404 }
1405 mutex_exit(&nsrv->async_write_lock);
1406
1407 /*
1408 * Convert the file handle common to all of the requests
1409 * in this cluster to a vnode.
1410 */
1411 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1412 if (vp == NULL) {
1413 mutex_enter(&nsrv->async_write_lock);
1414 if (nsrv->async_write_head == nlp)
1415 nsrv->async_write_head = nlp->next;
1416 else {
1417 lp = nsrv->async_write_head;
1418 while (lp->next != nlp)
1419 lp = lp->next;
1420 lp->next = nlp->next;
1421 }
1422 t_flag = curthread->t_flag & T_WOULDBLOCK;
1423 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1424 rp->ns->ns_status = NFSERR_STALE;
1425 rp->thread->t_flag |= t_flag;
1426 }
1427 cv_broadcast(&nlp->cv);
1428 mutex_exit(&nsrv->async_write_lock);
1429
1430 return;
1431 }
1432
1433 /*
1434 * Can only write regular files. Attempts to write any
1435 * other file types fail with EISDIR.
1436 */
1437 if (vp->v_type != VREG) {
1438 VN_RELE(vp);
1439 mutex_enter(&nsrv->async_write_lock);
1440 if (nsrv->async_write_head == nlp)
1441 nsrv->async_write_head = nlp->next;
1442 else {
1443 lp = nsrv->async_write_head;
1444 while (lp->next != nlp)
1445 lp = lp->next;
1446 lp->next = nlp->next;
1447 }
1448 t_flag = curthread->t_flag & T_WOULDBLOCK;
1449 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1450 rp->ns->ns_status = NFSERR_ISDIR;
1451 rp->thread->t_flag |= t_flag;
1452 }
1453 cv_broadcast(&nlp->cv);
1454 mutex_exit(&nsrv->async_write_lock);
1455
1456 return;
1457 }
1458
1459 /*
1460 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1461 * deadlock with ufs.
1462 */
1463 if (nbl_need_check(vp)) {
1464 nbl_start_crit(vp, RW_READER);
1465 in_crit = 1;
1466 }
1467
1468 ct.cc_sysid = 0;
1469 ct.cc_pid = 0;
1470 ct.cc_caller_id = nfs2_srv_caller_id;
1471 ct.cc_flags = CC_DONTBLOCK;
1472
1473 /*
1474 * Lock the file for writing. This operation provides
1475 * the delay which allows clusters to grow.
1476 */
1477 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1478
1479 /* check if a monitor detected a delegation conflict */
1480 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1481 if (in_crit)
1482 nbl_end_crit(vp);
1483 VN_RELE(vp);
1484 /* mark as wouldblock so response is dropped */
1485 curthread->t_flag |= T_WOULDBLOCK;
1486 mutex_enter(&nsrv->async_write_lock);
1487 if (nsrv->async_write_head == nlp)
1488 nsrv->async_write_head = nlp->next;
1489 else {
1490 lp = nsrv->async_write_head;
1491 while (lp->next != nlp)
1492 lp = lp->next;
1493 lp->next = nlp->next;
1494 }
1495 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1496 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1497 rp->ns->ns_status = puterrno(error);
1498 rp->thread->t_flag |= T_WOULDBLOCK;
1499 }
1500 }
1501 cv_broadcast(&nlp->cv);
1502 mutex_exit(&nsrv->async_write_lock);
1503
1504 return;
1505 }
1506
1507 /*
1508 * Disconnect this cluster from the list of clusters.
1509 * The cluster that is being dealt with must be fixed
1510 * in size after this point, so there is no reason
1511 * to leave it on the list so that new requests can
1512 * find it.
1513 *
1514 * The algorithm is that the first write request will
1515 * create a cluster, convert the file handle to a
1516 * vnode pointer, and then lock the file for writing.
1517 * This request is not likely to be clustered with
1518 * any others. However, the next request will create
1519 * a new cluster and be blocked in VOP_RWLOCK while
1520 * the first request is being processed. This delay
1521 * will allow more requests to be clustered in this
1522 * second cluster.
1523 */
1524 mutex_enter(&nsrv->async_write_lock);
1525 if (nsrv->async_write_head == nlp)
1526 nsrv->async_write_head = nlp->next;
1527 else {
1528 lp = nsrv->async_write_head;
1529 while (lp->next != nlp)
1530 lp = lp->next;
1531 lp->next = nlp->next;
1532 }
1533 mutex_exit(&nsrv->async_write_lock);
1534
1535 /*
1536 * Step through the list of requests in this cluster.
1537 * We need to check permissions to make sure that all
1538 * of the requests have sufficient permission to write
1539 * the file. A cluster can be composed of requests
1540 * from different clients and different users on each
1541 * client.
1542 *
1543 * As a side effect, we also calculate the size of the
1544 * byte range that this cluster encompasses.
1545 */
1546 rp = nlp->list;
1547 off = rp->wa->wa_offset;
1548 len = (uint_t)0;
1549 do {
1550 if (rdonly(rp->ro, vp)) {
1551 rp->ns->ns_status = NFSERR_ROFS;
1552 t_flag = curthread->t_flag & T_WOULDBLOCK;
1553 rp->thread->t_flag |= t_flag;
1554 continue;
1555 }
1556
1557 va.va_mask = AT_UID|AT_MODE;
1558
1559 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1560
1561 if (!error) {
1562 if (crgetuid(rp->cr) != va.va_uid) {
1563 /*
1564 * This is a kludge to allow writes of files
1565 * created with read only permission. The
1566 * owner of the file is always allowed to
1567 * write it.
1568 */
1569 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1570 }
1571 if (!error && MANDLOCK(vp, va.va_mode))
1572 error = EACCES;
1573 }
1574
1575 /*
1576 * Check for a conflict with a nbmand-locked region.
1577 */
1578 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1579 rp->wa->wa_count, 0, NULL)) {
1580 error = EACCES;
1581 }
1582
1583 if (error) {
1584 rp->ns->ns_status = puterrno(error);
1585 t_flag = curthread->t_flag & T_WOULDBLOCK;
1586 rp->thread->t_flag |= t_flag;
1587 continue;
1588 }
1589 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1590 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1591 } while ((rp = rp->list) != NULL);
1592
1593 /*
1594 * Step through the cluster attempting to gather as many
1595 * requests which are contiguous as possible. These
1596 * contiguous requests are handled via one call to VOP_WRITE
1597 * instead of different calls to VOP_WRITE. We also keep
1598 * track of the fact that any data was written.
1599 */
1600 rp = nlp->list;
1601 data_written = 0;
1602 do {
1603 /*
1604 * Skip any requests which are already marked as having an
1605 * error.
1606 */
1607 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1608 rp = rp->list;
1609 continue;
1610 }
1611
1612 /*
1613 * Count the number of iovec's which are required
1614 * to handle this set of requests. One iovec is
1615 * needed for each data buffer, whether addressed
1616 * by wa_data or by the b_rptr pointers in the
1617 * mblk chains.
1618 */
1619 iovcnt = 0;
1620 lrp = rp;
1621 for (;;) {
1622 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1623 iovcnt++;
1624 else {
1625 m = lrp->wa->wa_mblk;
1626 while (m != NULL) {
1627 iovcnt++;
1628 m = m->b_cont;
1629 }
1630 }
1631 if (lrp->list == NULL ||
1632 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1633 lrp->wa->wa_offset + lrp->wa->wa_count !=
1634 lrp->list->wa->wa_offset) {
1635 lrp = lrp->list;
1636 break;
1637 }
1638 lrp = lrp->list;
1639 }
1640
1641 if (iovcnt <= MAXCLIOVECS) {
1642 #ifdef DEBUG
1643 rfs_write_hits++;
1644 #endif
1645 niovp = iov;
1646 } else {
1647 #ifdef DEBUG
1648 rfs_write_misses++;
1649 #endif
1650 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1651 }
1652 /*
1653 * Put together the scatter/gather iovecs.
1654 */
1655 iovp = niovp;
1656 trp = rp;
1657 count = 0;
1658 do {
1659 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1660 if (trp->wa->wa_rlist) {
1661 iovp->iov_base =
1662 (char *)((trp->wa->wa_rlist)->
1663 u.c_daddr3);
1664 iovp->iov_len = trp->wa->wa_count;
1665 } else {
1666 iovp->iov_base = trp->wa->wa_data;
1667 iovp->iov_len = trp->wa->wa_count;
1668 }
1669 iovp++;
1670 } else {
1671 m = trp->wa->wa_mblk;
1672 rcount = trp->wa->wa_count;
1673 while (m != NULL) {
1674 iovp->iov_base = (caddr_t)m->b_rptr;
1675 iovp->iov_len = (m->b_wptr - m->b_rptr);
1676 rcount -= iovp->iov_len;
1677 if (rcount < 0)
1678 iovp->iov_len += rcount;
1679 iovp++;
1680 if (rcount <= 0)
1681 break;
1682 m = m->b_cont;
1683 }
1684 }
1685 count += trp->wa->wa_count;
1686 trp = trp->list;
1687 } while (trp != lrp);
1688
1689 uio.uio_iov = niovp;
1690 uio.uio_iovcnt = iovcnt;
1691 uio.uio_segflg = UIO_SYSSPACE;
1692 uio.uio_extflg = UIO_COPY_DEFAULT;
1693 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1694 uio.uio_resid = count;
1695 /*
1696 * The limit is checked on the client. We
1697 * should allow any size writes here.
1698 */
1699 uio.uio_llimit = curproc->p_fsz_ctl;
1700 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1701 if (rlimit < (rlim64_t)uio.uio_resid)
1702 uio.uio_resid = (uint_t)rlimit;
1703
1704 /*
1705 * For now we assume no append mode.
1706 */
1707
1708 /*
1709 * We're changing creds because VM may fault
1710 * and we need the cred of the current
1711 * thread to be used if quota * checking is
1712 * enabled.
1713 */
1714 savecred = curthread->t_cred;
1715 curthread->t_cred = cr;
1716 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1717 curthread->t_cred = savecred;
1718
1719 /* check if a monitor detected a delegation conflict */
1720 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1721 /* mark as wouldblock so response is dropped */
1722 curthread->t_flag |= T_WOULDBLOCK;
1723
1724 if (niovp != iov)
1725 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1726
1727 if (!error) {
1728 data_written = 1;
1729 /*
1730 * Get attributes again so we send the latest mod
1731 * time to the client side for its cache.
1732 */
1733 va.va_mask = AT_ALL; /* now we want everything */
1734
1735 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1736
1737 if (!error)
1738 acl_perm(vp, exi, &va, rp->cr);
1739 }
1740
1741 /*
1742 * Fill in the status responses for each request
1743 * which was just handled. Also, copy the latest
1744 * attributes in to the attribute responses if
1745 * appropriate.
1746 */
1747 t_flag = curthread->t_flag & T_WOULDBLOCK;
1748 do {
1749 rp->thread->t_flag |= t_flag;
1750 /* check for overflows */
1751 if (!error) {
1752 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1753 }
1754 rp->ns->ns_status = puterrno(error);
1755 rp = rp->list;
1756 } while (rp != lrp);
1757 } while (rp != NULL);
1758
1759 /*
1760 * If any data was written at all, then we need to flush
1761 * the data and metadata to stable storage.
1762 */
1763 if (data_written) {
1764 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1765
1766 if (!error) {
1767 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1768 }
1769 }
1770
1771 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1772
1773 if (in_crit)
1774 nbl_end_crit(vp);
1775 VN_RELE(vp);
1776
1777 t_flag = curthread->t_flag & T_WOULDBLOCK;
1778 mutex_enter(&nsrv->async_write_lock);
1779 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1780 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1781 rp->ns->ns_status = puterrno(error);
1782 rp->thread->t_flag |= t_flag;
1783 }
1784 }
1785 cv_broadcast(&nlp->cv);
1786 mutex_exit(&nsrv->async_write_lock);
1787
1788 }
1789
1790 void *
1791 rfs_write_getfh(struct nfswriteargs *wa)
1792 {
1793 return (&wa->wa_fhandle);
1794 }
1795
1796 /*
1797 * Create a file.
1798 * Creates a file with given attributes and returns those attributes
1799 * and an fhandle for the new file.
1800 */
1801 void
1802 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1803 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1804 {
1805 int error;
1806 int lookuperr;
1807 int in_crit = 0;
1808 struct vattr va;
1809 vnode_t *vp;
1810 vnode_t *realvp;
1811 vnode_t *dvp;
1812 char *name = args->ca_da.da_name;
1813 vnode_t *tvp = NULL;
1814 int mode;
1815 int lookup_ok;
1816 bool_t trunc;
1817 struct sockaddr *ca;
1818
1819 /*
1820 * Disallow NULL paths
1821 */
1822 if (name == NULL || *name == '\0') {
1823 dr->dr_status = NFSERR_ACCES;
1824 return;
1825 }
1826
1827 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1828 if (dvp == NULL) {
1829 dr->dr_status = NFSERR_STALE;
1830 return;
1831 }
1832
1833 error = sattr_to_vattr(args->ca_sa, &va);
1834 if (error) {
1835 dr->dr_status = puterrno(error);
1836 return;
1837 }
1838
1839 /*
1840 * Must specify the mode.
1841 */
1842 if (!(va.va_mask & AT_MODE)) {
1843 VN_RELE(dvp);
1844 dr->dr_status = NFSERR_INVAL;
1845 return;
1846 }
1847
1848 /*
1849 * This is a completely gross hack to make mknod
1850 * work over the wire until we can wack the protocol
1851 */
1852 if ((va.va_mode & IFMT) == IFCHR) {
1853 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1854 va.va_type = VFIFO; /* xtra kludge for named pipe */
1855 else {
1856 va.va_type = VCHR;
1857 /*
1858 * uncompress the received dev_t
1859 * if the top half is zero indicating a request
1860 * from an `older style' OS.
1861 */
1862 if ((va.va_size & 0xffff0000) == 0)
1863 va.va_rdev = nfsv2_expdev(va.va_size);
1864 else
1865 va.va_rdev = (dev_t)va.va_size;
1866 }
1867 va.va_mask &= ~AT_SIZE;
1868 } else if ((va.va_mode & IFMT) == IFBLK) {
1869 va.va_type = VBLK;
1870 /*
1871 * uncompress the received dev_t
1872 * if the top half is zero indicating a request
1873 * from an `older style' OS.
1874 */
1875 if ((va.va_size & 0xffff0000) == 0)
1876 va.va_rdev = nfsv2_expdev(va.va_size);
1877 else
1878 va.va_rdev = (dev_t)va.va_size;
1879 va.va_mask &= ~AT_SIZE;
1880 } else if ((va.va_mode & IFMT) == IFSOCK) {
1881 va.va_type = VSOCK;
1882 } else {
1883 va.va_type = VREG;
1884 }
1885 va.va_mode &= ~IFMT;
1886 va.va_mask |= AT_TYPE;
1887
1888 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1889 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1890 MAXPATHLEN);
1891 if (name == NULL) {
1892 dr->dr_status = puterrno(EINVAL);
1893 return;
1894 }
1895
1896 /*
1897 * Why was the choice made to use VWRITE as the mode to the
1898 * call to VOP_CREATE ? This results in a bug. When a client
1899 * opens a file that already exists and is RDONLY, the second
1900 * open fails with an EACESS because of the mode.
1901 * bug ID 1054648.
1902 */
1903 lookup_ok = 0;
1904 mode = VWRITE;
1905 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1906 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1907 NULL, NULL, NULL);
1908 if (!error) {
1909 struct vattr at;
1910
1911 lookup_ok = 1;
1912 at.va_mask = AT_MODE;
1913 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1914 if (!error)
1915 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1916 VN_RELE(tvp);
1917 tvp = NULL;
1918 }
1919 }
1920
1921 if (!lookup_ok) {
1922 if (rdonly(ro, dvp)) {
1923 error = EROFS;
1924 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1925 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1926 error = EPERM;
1927 } else {
1928 error = 0;
1929 }
1930 }
1931
1932 /*
1933 * If file size is being modified on an already existing file
1934 * make sure that there are no conflicting non-blocking mandatory
1935 * locks in the region being manipulated. Return EACCES if there
1936 * are conflicting locks.
1937 */
1938 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1939 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1940 NULL, NULL, NULL);
1941
1942 if (!lookuperr &&
1943 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1944 VN_RELE(tvp);
1945 curthread->t_flag |= T_WOULDBLOCK;
1946 goto out;
1947 }
1948
1949 if (!lookuperr && nbl_need_check(tvp)) {
1950 /*
1951 * The file exists. Now check if it has any
1952 * conflicting non-blocking mandatory locks
1953 * in the region being changed.
1954 */
1955 struct vattr bva;
1956 u_offset_t offset;
1957 ssize_t length;
1958
1959 nbl_start_crit(tvp, RW_READER);
1960 in_crit = 1;
1961
1962 bva.va_mask = AT_SIZE;
1963 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1964 if (!error) {
1965 if (va.va_size < bva.va_size) {
1966 offset = va.va_size;
1967 length = bva.va_size - va.va_size;
1968 } else {
1969 offset = bva.va_size;
1970 length = va.va_size - bva.va_size;
1971 }
1972 if (length) {
1973 if (nbl_conflict(tvp, NBL_WRITE,
1974 offset, length, 0, NULL)) {
1975 error = EACCES;
1976 }
1977 }
1978 }
1979 if (error) {
1980 nbl_end_crit(tvp);
1981 VN_RELE(tvp);
1982 in_crit = 0;
1983 }
1984 } else if (tvp != NULL) {
1985 VN_RELE(tvp);
1986 }
1987 }
1988
1989 if (!error) {
1990 /*
1991 * If filesystem is shared with nosuid the remove any
1992 * setuid/setgid bits on create.
1993 */
1994 if (va.va_type == VREG &&
1995 exi->exi_export.ex_flags & EX_NOSUID)
1996 va.va_mode &= ~(VSUID | VSGID);
1997
1998 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1999 NULL, NULL);
2000
2001 if (!error) {
2002
2003 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2004 trunc = TRUE;
2005 else
2006 trunc = FALSE;
2007
2008 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2009 VN_RELE(vp);
2010 curthread->t_flag |= T_WOULDBLOCK;
2011 goto out;
2012 }
2013 va.va_mask = AT_ALL;
2014
2015 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2016
2017 /* check for overflows */
2018 if (!error) {
2019 acl_perm(vp, exi, &va, cr);
2020 error = vattr_to_nattr(&va, &dr->dr_attr);
2021 if (!error) {
2022 error = makefh(&dr->dr_fhandle, vp,
2023 exi);
2024 }
2025 }
2026 /*
2027 * Force modified metadata out to stable storage.
2028 *
2029 * if a underlying vp exists, pass it to VOP_FSYNC
2030 */
2031 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2032 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2033 else
2034 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2035 VN_RELE(vp);
2036 }
2037
2038 if (in_crit) {
2039 nbl_end_crit(tvp);
2040 VN_RELE(tvp);
2041 }
2042 }
2043
2044 /*
2045 * Force modified data and metadata out to stable storage.
2046 */
2047 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2048
2049 out:
2050
2051 VN_RELE(dvp);
2052
2053 dr->dr_status = puterrno(error);
2054
2055 if (name != args->ca_da.da_name)
2056 kmem_free(name, MAXPATHLEN);
2057 }
2058 void *
2059 rfs_create_getfh(struct nfscreatargs *args)
2060 {
2061 return (args->ca_da.da_fhandle);
2062 }
2063
2064 /*
2065 * Remove a file.
2066 * Remove named file from parent directory.
2067 */
2068 /* ARGSUSED */
2069 void
2070 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2071 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2072 {
2073 int error = 0;
2074 vnode_t *vp;
2075 vnode_t *targvp;
2076 int in_crit = 0;
2077
2078 /*
2079 * Disallow NULL paths
2080 */
2081 if (da->da_name == NULL || *da->da_name == '\0') {
2082 *status = NFSERR_ACCES;
2083 return;
2084 }
2085
2086 vp = nfs_fhtovp(da->da_fhandle, exi);
2087 if (vp == NULL) {
2088 *status = NFSERR_STALE;
2089 return;
2090 }
2091
2092 if (rdonly(ro, vp)) {
2093 VN_RELE(vp);
2094 *status = NFSERR_ROFS;
2095 return;
2096 }
2097
2098 /*
2099 * Check for a conflict with a non-blocking mandatory share reservation.
2100 */
2101 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2102 NULL, cr, NULL, NULL, NULL);
2103 if (error != 0) {
2104 VN_RELE(vp);
2105 *status = puterrno(error);
2106 return;
2107 }
2108
2109 /*
2110 * If the file is delegated to an v4 client, then initiate
2111 * recall and drop this request (by setting T_WOULDBLOCK).
2112 * The client will eventually re-transmit the request and
2113 * (hopefully), by then, the v4 client will have returned
2114 * the delegation.
2115 */
2116
2117 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2118 VN_RELE(vp);
2119 VN_RELE(targvp);
2120 curthread->t_flag |= T_WOULDBLOCK;
2121 return;
2122 }
2123
2124 if (nbl_need_check(targvp)) {
2125 nbl_start_crit(targvp, RW_READER);
2126 in_crit = 1;
2127 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2128 error = EACCES;
2129 goto out;
2130 }
2131 }
2132
2133 error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2134
2135 /*
2136 * Force modified data and metadata out to stable storage.
2137 */
2138 (void) VOP_FSYNC(vp, 0, cr, NULL);
2139
2140 out:
2141 if (in_crit)
2142 nbl_end_crit(targvp);
2143 VN_RELE(targvp);
2144 VN_RELE(vp);
2145
2146 *status = puterrno(error);
2147
2148 }
2149
2150 void *
2151 rfs_remove_getfh(struct nfsdiropargs *da)
2152 {
2153 return (da->da_fhandle);
2154 }
2155
2156 /*
2157 * rename a file
2158 * Give a file (from) a new name (to).
2159 */
2160 /* ARGSUSED */
2161 void
2162 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2163 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2164 {
2165 int error = 0;
2166 vnode_t *fromvp;
2167 vnode_t *tovp;
2168 struct exportinfo *to_exi;
2169 fhandle_t *fh;
2170 vnode_t *srcvp;
2171 vnode_t *targvp;
2172 int in_crit = 0;
2173
2174 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2175 if (fromvp == NULL) {
2176 *status = NFSERR_STALE;
2177 return;
2178 }
2179
2180 fh = args->rna_to.da_fhandle;
2181 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2182 if (to_exi == NULL) {
2183 VN_RELE(fromvp);
2184 *status = NFSERR_ACCES;
2185 return;
2186 }
2187 exi_rele(to_exi);
2188
2189 if (to_exi != exi) {
2190 VN_RELE(fromvp);
2191 *status = NFSERR_XDEV;
2192 return;
2193 }
2194
2195 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2196 if (tovp == NULL) {
2197 VN_RELE(fromvp);
2198 *status = NFSERR_STALE;
2199 return;
2200 }
2201
2202 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2203 VN_RELE(tovp);
2204 VN_RELE(fromvp);
2205 *status = NFSERR_NOTDIR;
2206 return;
2207 }
2208
2209 /*
2210 * Disallow NULL paths
2211 */
2212 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2213 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2214 VN_RELE(tovp);
2215 VN_RELE(fromvp);
2216 *status = NFSERR_ACCES;
2217 return;
2218 }
2219
2220 if (rdonly(ro, tovp)) {
2221 VN_RELE(tovp);
2222 VN_RELE(fromvp);
2223 *status = NFSERR_ROFS;
2224 return;
2225 }
2226
2227 /*
2228 * Check for a conflict with a non-blocking mandatory share reservation.
2229 */
2230 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2231 NULL, cr, NULL, NULL, NULL);
2232 if (error != 0) {
2233 VN_RELE(tovp);
2234 VN_RELE(fromvp);
2235 *status = puterrno(error);
2236 return;
2237 }
2238
2239 /* Check for delegations on the source file */
2240
2241 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2242 VN_RELE(tovp);
2243 VN_RELE(fromvp);
2244 VN_RELE(srcvp);
2245 curthread->t_flag |= T_WOULDBLOCK;
2246 return;
2247 }
2248
2249 /* Check for delegation on the file being renamed over, if it exists */
2250
2251 if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2252 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2253 NULL, NULL, NULL) == 0) {
2254
2255 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2256 VN_RELE(tovp);
2257 VN_RELE(fromvp);
2258 VN_RELE(srcvp);
2259 VN_RELE(targvp);
2260 curthread->t_flag |= T_WOULDBLOCK;
2261 return;
2262 }
2263 VN_RELE(targvp);
2264 }
2265
2266
2267 if (nbl_need_check(srcvp)) {
2268 nbl_start_crit(srcvp, RW_READER);
2269 in_crit = 1;
2270 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2271 error = EACCES;
2272 goto out;
2273 }
2274 }
2275
2276 error = VOP_RENAME(fromvp, args->rna_from.da_name,
2277 tovp, args->rna_to.da_name, cr, NULL, 0);
2278
2279 if (error == 0)
2280 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2281 strlen(args->rna_to.da_name));
2282
2283 /*
2284 * Force modified data and metadata out to stable storage.
2285 */
2286 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2287 (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2288
2289 out:
2290 if (in_crit)
2291 nbl_end_crit(srcvp);
2292 VN_RELE(srcvp);
2293 VN_RELE(tovp);
2294 VN_RELE(fromvp);
2295
2296 *status = puterrno(error);
2297
2298 }
2299 void *
2300 rfs_rename_getfh(struct nfsrnmargs *args)
2301 {
2302 return (args->rna_from.da_fhandle);
2303 }
2304
2305 /*
2306 * Link to a file.
2307 * Create a file (to) which is a hard link to the given file (from).
2308 */
2309 /* ARGSUSED */
2310 void
2311 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2312 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2313 {
2314 int error;
2315 vnode_t *fromvp;
2316 vnode_t *tovp;
2317 struct exportinfo *to_exi;
2318 fhandle_t *fh;
2319
2320 fromvp = nfs_fhtovp(args->la_from, exi);
2321 if (fromvp == NULL) {
2322 *status = NFSERR_STALE;
2323 return;
2324 }
2325
2326 fh = args->la_to.da_fhandle;
2327 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2328 if (to_exi == NULL) {
2329 VN_RELE(fromvp);
2330 *status = NFSERR_ACCES;
2331 return;
2332 }
2333 exi_rele(to_exi);
2334
2335 if (to_exi != exi) {
2336 VN_RELE(fromvp);
2337 *status = NFSERR_XDEV;
2338 return;
2339 }
2340
2341 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2342 if (tovp == NULL) {
2343 VN_RELE(fromvp);
2344 *status = NFSERR_STALE;
2345 return;
2346 }
2347
2348 if (tovp->v_type != VDIR) {
2349 VN_RELE(tovp);
2350 VN_RELE(fromvp);
2351 *status = NFSERR_NOTDIR;
2352 return;
2353 }
2354 /*
2355 * Disallow NULL paths
2356 */
2357 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2358 VN_RELE(tovp);
2359 VN_RELE(fromvp);
2360 *status = NFSERR_ACCES;
2361 return;
2362 }
2363
2364 if (rdonly(ro, tovp)) {
2365 VN_RELE(tovp);
2366 VN_RELE(fromvp);
2367 *status = NFSERR_ROFS;
2368 return;
2369 }
2370
2371 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2372
2373 /*
2374 * Force modified data and metadata out to stable storage.
2375 */
2376 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2377 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2378
2379 VN_RELE(tovp);
2380 VN_RELE(fromvp);
2381
2382 *status = puterrno(error);
2383
2384 }
2385 void *
2386 rfs_link_getfh(struct nfslinkargs *args)
2387 {
2388 return (args->la_from);
2389 }
2390
2391 /*
2392 * Symbolicly link to a file.
2393 * Create a file (to) with the given attributes which is a symbolic link
2394 * to the given path name (to).
2395 */
2396 void
2397 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2398 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2399 {
2400 int error;
2401 struct vattr va;
2402 vnode_t *vp;
2403 vnode_t *svp;
2404 int lerror;
2405 struct sockaddr *ca;
2406 char *name = NULL;
2407
2408 /*
2409 * Disallow NULL paths
2410 */
2411 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2412 *status = NFSERR_ACCES;
2413 return;
2414 }
2415
2416 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2417 if (vp == NULL) {
2418 *status = NFSERR_STALE;
2419 return;
2420 }
2421
2422 if (rdonly(ro, vp)) {
2423 VN_RELE(vp);
2424 *status = NFSERR_ROFS;
2425 return;
2426 }
2427
2428 error = sattr_to_vattr(args->sla_sa, &va);
2429 if (error) {
2430 VN_RELE(vp);
2431 *status = puterrno(error);
2432 return;
2433 }
2434
2435 if (!(va.va_mask & AT_MODE)) {
2436 VN_RELE(vp);
2437 *status = NFSERR_INVAL;
2438 return;
2439 }
2440
2441 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2442 name = nfscmd_convname(ca, exi, args->sla_tnm,
2443 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2444
2445 if (name == NULL) {
2446 *status = NFSERR_ACCES;
2447 return;
2448 }
2449
2450 va.va_type = VLNK;
2451 va.va_mask |= AT_TYPE;
2452
2453 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2454
2455 /*
2456 * Force new data and metadata out to stable storage.
2457 */
2458 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2459 NULL, cr, NULL, NULL, NULL);
2460
2461 if (!lerror) {
2462 (void) VOP_FSYNC(svp, 0, cr, NULL);
2463 VN_RELE(svp);
2464 }
2465
2466 /*
2467 * Force modified data and metadata out to stable storage.
2468 */
2469 (void) VOP_FSYNC(vp, 0, cr, NULL);
2470
2471 VN_RELE(vp);
2472
2473 *status = puterrno(error);
2474 if (name != args->sla_tnm)
2475 kmem_free(name, MAXPATHLEN);
2476
2477 }
2478 void *
2479 rfs_symlink_getfh(struct nfsslargs *args)
2480 {
2481 return (args->sla_from.da_fhandle);
2482 }
2483
2484 /*
2485 * Make a directory.
2486 * Create a directory with the given name, parent directory, and attributes.
2487 * Returns a file handle and attributes for the new directory.
2488 */
2489 /* ARGSUSED */
2490 void
2491 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2492 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2493 {
2494 int error;
2495 struct vattr va;
2496 vnode_t *dvp = NULL;
2497 vnode_t *vp;
2498 char *name = args->ca_da.da_name;
2499
2500 /*
2501 * Disallow NULL paths
2502 */
2503 if (name == NULL || *name == '\0') {
2504 dr->dr_status = NFSERR_ACCES;
2505 return;
2506 }
2507
2508 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2509 if (vp == NULL) {
2510 dr->dr_status = NFSERR_STALE;
2511 return;
2512 }
2513
2514 if (rdonly(ro, vp)) {
2515 VN_RELE(vp);
2516 dr->dr_status = NFSERR_ROFS;
2517 return;
2518 }
2519
2520 error = sattr_to_vattr(args->ca_sa, &va);
2521 if (error) {
2522 VN_RELE(vp);
2523 dr->dr_status = puterrno(error);
2524 return;
2525 }
2526
2527 if (!(va.va_mask & AT_MODE)) {
2528 VN_RELE(vp);
2529 dr->dr_status = NFSERR_INVAL;
2530 return;
2531 }
2532
2533 va.va_type = VDIR;
2534 va.va_mask |= AT_TYPE;
2535
2536 error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2537
2538 if (!error) {
2539 /*
2540 * Attribtutes of the newly created directory should
2541 * be returned to the client.
2542 */
2543 va.va_mask = AT_ALL; /* We want everything */
2544 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2545
2546 /* check for overflows */
2547 if (!error) {
2548 acl_perm(vp, exi, &va, cr);
2549 error = vattr_to_nattr(&va, &dr->dr_attr);
2550 if (!error) {
2551 error = makefh(&dr->dr_fhandle, dvp, exi);
2552 }
2553 }
2554 /*
2555 * Force new data and metadata out to stable storage.
2556 */
2557 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2558 VN_RELE(dvp);
2559 }
2560
2561 /*
2562 * Force modified data and metadata out to stable storage.
2563 */
2564 (void) VOP_FSYNC(vp, 0, cr, NULL);
2565
2566 VN_RELE(vp);
2567
2568 dr->dr_status = puterrno(error);
2569
2570 }
2571 void *
2572 rfs_mkdir_getfh(struct nfscreatargs *args)
2573 {
2574 return (args->ca_da.da_fhandle);
2575 }
2576
2577 /*
2578 * Remove a directory.
2579 * Remove the given directory name from the given parent directory.
2580 */
2581 /* ARGSUSED */
2582 void
2583 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2584 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2585 {
2586 int error;
2587 vnode_t *vp;
2588
2589 /*
2590 * Disallow NULL paths
2591 */
2592 if (da->da_name == NULL || *da->da_name == '\0') {
2593 *status = NFSERR_ACCES;
2594 return;
2595 }
2596
2597 vp = nfs_fhtovp(da->da_fhandle, exi);
2598 if (vp == NULL) {
2599 *status = NFSERR_STALE;
2600 return;
2601 }
2602
2603 if (rdonly(ro, vp)) {
2604 VN_RELE(vp);
2605 *status = NFSERR_ROFS;
2606 return;
2607 }
2608
2609 /*
2610 * VOP_RMDIR takes a third argument (the current
2611 * directory of the process). That's because someone
2612 * wants to return EINVAL if one tries to remove ".".
2613 * Of course, NFS servers have no idea what their
2614 * clients' current directories are. We fake it by
2615 * supplying a vnode known to exist and illegal to
2616 * remove.
2617 */
2618 error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2619
2620 /*
2621 * Force modified data and metadata out to stable storage.
2622 */
2623 (void) VOP_FSYNC(vp, 0, cr, NULL);
2624
2625 VN_RELE(vp);
2626
2627 /*
2628 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2629 * if the directory is not empty. A System V NFS server
2630 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2631 * over the wire.
2632 */
2633 if (error == EEXIST)
2634 *status = NFSERR_NOTEMPTY;
2635 else
2636 *status = puterrno(error);
2637
2638 }
2639 void *
2640 rfs_rmdir_getfh(struct nfsdiropargs *da)
2641 {
2642 return (da->da_fhandle);
2643 }
2644
/*
 * Read entries from a directory into a client-supplied sized buffer,
 * applying character-set conversion for the client where configured.
 * The entry buffer is allocated here and later released by
 * rfs_rddirfree().
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;		/* set by VOP_READDIR at end of directory */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* possibly-converted entry data */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	/* READDIR is only meaningful on directories. */
	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Hold the vnode's rwlock (shared) across the read. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request returns an empty, non-EOF reply. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the transfer to the protocol maximum. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries. This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* Nothing was read: report an empty, EOF reply. */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * Translate entry names for the client's character set.
	 * NOTE(review): the helper used here is nfscmd_convdirplus —
	 * confirm this is the intended converter for the plain (v2)
	 * READDIR entry format.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion. We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	/*
	 * Adopt the converted buffer if one was allocated; otherwise
	 * keep serving out of the original allocation.  The converted
	 * buffer is assumed to be rda_count bytes so rfs_rddirfree can
	 * release it — TODO confirm against nfscmd_convdirplus.
	 */
	if (ndata == NULL) {
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this. It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2778 void *
2779 rfs_readdir_getfh(struct nfsrddirargs *rda)
2780 {
2781 return (&rda->rda_fh);
2782 }
2783 void
2784 rfs_rddirfree(struct nfsrddirres *rd)
2785 {
2786 if (rd->rd_entries != NULL)
2787 kmem_free(rd->rd_entries, rd->rd_bufsize);
2788 }
2789
2790 /* ARGSUSED */
2791 void
2792 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2793 struct svc_req *req, cred_t *cr, bool_t ro)
2794 {
2795 int error;
2796 struct statvfs64 sb;
2797 vnode_t *vp;
2798
2799 vp = nfs_fhtovp(fh, exi);
2800 if (vp == NULL) {
2801 fs->fs_status = NFSERR_STALE;
2802 return;
2803 }
2804
2805 error = VFS_STATVFS(vp->v_vfsp, &sb);
2806
2807 if (!error) {
2808 fs->fs_tsize = nfstsize();
2809 fs->fs_bsize = sb.f_frsize;
2810 fs->fs_blocks = sb.f_blocks;
2811 fs->fs_bfree = sb.f_bfree;
2812 fs->fs_bavail = sb.f_bavail;
2813 }
2814
2815 VN_RELE(vp);
2816
2817 fs->fs_status = puterrno(error);
2818
2819 }
2820 void *
2821 rfs_statfs_getfh(fhandle_t *fh)
2822 {
2823 return (fh);
2824 }
2825
/*
 * Convert over-the-wire NFSv2 settable attributes (nfssattr) into a
 * vattr, setting va_mask bits only for fields the client actually
 * supplied.  A field value of (uint32_t)-1 (or (int32_t)-1 for the
 * time pieces) means "not set".  Returns 0, or EOVERFLOW on 32-bit
 * kernels when a supplied time does not fit in time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short. When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both halves of the timestamp must be set for it to count. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		/* Wire time is in microseconds; vattr uses nanoseconds. */
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		/* Wire time is in microseconds; vattr uses nanoseconds. */
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2892
/*
 * Map vnode types (indexed by vtype_t) to NFSv2 over-the-wire file
 * types.  Types with no v2 representation map to 0 (NFNON); the entry
 * holding NFSOC presumably corresponds to VSOCK — the VFIFO case is
 * remapped separately in vattr_to_nattr() via NA_SETFIFO.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2896
2897 /*
2898 * check the following fields for overflow: nodeid, size, and time.
2899 * There could be a problem when converting 64-bit LP64 fields
2900 * into 32-bit ones. Return an error if there is an overflow.
2901 */
2902 int
2903 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2904 {
2905 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2906 na->na_type = vt_to_nf[vap->va_type];
2907
2908 if (vap->va_mode == (unsigned short) -1)
2909 na->na_mode = (uint32_t)-1;
2910 else
2911 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2912
2913 if (vap->va_uid == (unsigned short)(-1))
2914 na->na_uid = (uint32_t)(-1);
2915 else if (vap->va_uid == UID_NOBODY)
2916 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2917 else
2918 na->na_uid = vap->va_uid;
2919
2920 if (vap->va_gid == (unsigned short)(-1))
2921 na->na_gid = (uint32_t)-1;
2922 else if (vap->va_gid == GID_NOBODY)
2923 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2924 else
2925 na->na_gid = vap->va_gid;
2926
2927 /*
2928 * Do we need to check fsid for overflow? It is 64-bit in the
2929 * vattr, but are bigger than 32 bit values supported?
2930 */
2931 na->na_fsid = vap->va_fsid;
2932
2933 na->na_nodeid = vap->va_nodeid;
2934
2935 /*
2936 * Check to make sure that the nodeid is representable over the
2937 * wire without losing bits.
2938 */
2939 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2940 return (EFBIG);
2941 na->na_nlink = vap->va_nlink;
2942
2943 /*
2944 * Check for big files here, instead of at the caller. See
2945 * comments in cstat for large special file explanation.
2946 */
2947 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2948 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2949 return (EFBIG);
2950 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2951 /* UNKNOWN_SIZE | OVERFLOW */
2952 na->na_size = MAXOFF32_T;
2953 } else
2954 na->na_size = vap->va_size;
2955 } else
2956 na->na_size = vap->va_size;
2957
2958 /*
2959 * If the vnode times overflow the 32-bit times that NFS2
2960 * uses on the wire then return an error.
2961 */
2962 if (!NFS_VAP_TIME_OK(vap)) {
2963 return (EOVERFLOW);
2964 }
2965 na->na_atime.tv_sec = vap->va_atime.tv_sec;
2966 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2967
2968 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2969 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2970
2971 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2972 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2973
2974 /*
2975 * If the dev_t will fit into 16 bits then compress
2976 * it, otherwise leave it alone. See comments in
2977 * nfs_client.c.
2978 */
2979 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2980 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2981 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2982 else
2983 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2984
2985 na->na_blocks = vap->va_nblocks;
2986 na->na_blocksize = vap->va_blksize;
2987
2988 /*
2989 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2990 * over-the-wire protocols for named-pipe vnodes. It remaps the
2991 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2992 *
2993 * BUYER BEWARE:
2994 * If you are porting the NFS to a non-Sun server, you probably
2995 * don't want to include the following block of code. The
2996 * over-the-wire special file types will be changing with the
2997 * NFS Protocol Revision.
2998 */
2999 if (vap->va_type == VFIFO)
3000 NA_SETFIFO(na);
3001 return (0);
3002 }
3003
3004 /*
3005 * acl v2 support: returns approximate permission.
3006 * default: returns minimal permission (more restrictive)
3007 * aclok: returns maximal permission (less restrictive)
3008 * This routine changes the permissions that are alaredy in *va.
3009 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3010 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3011 */
3012 static void
3013 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3014 {
3015 vsecattr_t vsa;
3016 int aclcnt;
3017 aclent_t *aclentp;
3018 mode_t mask_perm;
3019 mode_t grp_perm;
3020 mode_t other_perm;
3021 mode_t other_orig;
3022 int error;
3023
3024 /* dont care default acl */
3025 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3026 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3027
3028 if (!error) {
3029 aclcnt = vsa.vsa_aclcnt;
3030 if (aclcnt > MIN_ACL_ENTRIES) {
3031 /* non-trivial ACL */
3032 aclentp = vsa.vsa_aclentp;
3033 if (exi->exi_export.ex_flags & EX_ACLOK) {
3034 /* maximal permissions */
3035 grp_perm = 0;
3036 other_perm = 0;
3037 for (; aclcnt > 0; aclcnt--, aclentp++) {
3038 switch (aclentp->a_type) {
3039 case USER_OBJ:
3040 break;
3041 case USER:
3042 grp_perm |=
3043 aclentp->a_perm << 3;
3044 other_perm |= aclentp->a_perm;
3045 break;
3046 case GROUP_OBJ:
3047 grp_perm |=
3048 aclentp->a_perm << 3;
3049 break;
3050 case GROUP:
3051 other_perm |= aclentp->a_perm;
3052 break;
3053 case OTHER_OBJ:
3054 other_orig = aclentp->a_perm;
3055 break;
3056 case CLASS_OBJ:
3057 mask_perm = aclentp->a_perm;
3058 break;
3059 default:
3060 break;
3061 }
3062 }
3063 grp_perm &= mask_perm << 3;
3064 other_perm &= mask_perm;
3065 other_perm |= other_orig;
3066
3067 } else {
3068 /* minimal permissions */
3069 grp_perm = 070;
3070 other_perm = 07;
3071 for (; aclcnt > 0; aclcnt--, aclentp++) {
3072 switch (aclentp->a_type) {
3073 case USER_OBJ:
3074 break;
3075 case USER:
3076 case CLASS_OBJ:
3077 grp_perm &=
3078 aclentp->a_perm << 3;
3079 other_perm &=
3080 aclentp->a_perm;
3081 break;
3082 case GROUP_OBJ:
3083 grp_perm &=
3084 aclentp->a_perm << 3;
3085 break;
3086 case GROUP:
3087 other_perm &=
3088 aclentp->a_perm;
3089 break;
3090 case OTHER_OBJ:
3091 other_perm &=
3092 aclentp->a_perm;
3093 break;
3094 default:
3095 break;
3096 }
3097 }
3098 }
3099 /* copy to va */
3100 va->va_mode &= ~077;
3101 va->va_mode |= grp_perm | other_perm;
3102 }
3103 if (vsa.vsa_aclcnt)
3104 kmem_free(vsa.vsa_aclentp,
3105 vsa.vsa_aclcnt * sizeof (aclent_t));
3106 }
3107 }
3108
/*
 * One-time NFSv2 server module initialization: allocate the caller id
 * used to identify this server to the filesystem layer.
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
}
3114
/*
 * Module teardown counterpart of rfs_srvrinit(); nothing to release.
 */
void
rfs_srvrfini(void)
{
}
3119
3120 /* ARGSUSED */
3121 void
3122 rfs_srv_zone_init(nfs_globals_t *ng)
3123 {
3124 nfs_srv_t *ns;
3125
3126 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3127
3128 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3129 ns->write_async = 1;
3130
3131 ng->nfs_srv = ns;
3132 }
3133
3134 /* ARGSUSED */
3135 void
3136 rfs_srv_zone_fini(nfs_globals_t *ng)
3137 {
3138 nfs_srv_t *ns = ng->nfs_srv;
3139
3140 ng->nfs_srv = NULL;
3141
3142 mutex_destroy(&ns->async_write_lock);
3143 kmem_free(ns, sizeof (*ns));
3144 }
3145
3146 static int
3147 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3148 {
3149 struct clist *wcl;
3150 int wlist_len;
3151 uint32_t count = rr->rr_count;
3152
3153 wcl = ra->ra_wlist;
3154
3155 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3156 return (FALSE);
3157 }
3158
3159 wcl = ra->ra_wlist;
3160 rr->rr_ok.rrok_wlist_len = wlist_len;
3161 rr->rr_ok.rrok_wlist = wcl;
3162
3163 return (TRUE);
3164 }