1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /*
29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 * All rights reserved.
31 */
32
33 /*
34 * Copyright 2018 Nexenta Systems, Inc.
35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 */
37
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/stat.h>
47 #include <sys/errno.h>
48 #include <sys/sysmacros.h>
49 #include <sys/statvfs.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/dirent.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/vtrace.h>
56 #include <sys/mode.h>
57 #include <sys/acl.h>
58 #include <sys/nbmlock.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/svc.h>
65
66 #include <nfs/nfs.h>
67 #include <nfs/export.h>
68 #include <nfs/nfs_cmd.h>
69
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_kmem.h>
75
76 #include <sys/strsubr.h>
77
78 struct rfs_async_write_list;
79
80 /*
81 * Zone globals of NFSv2 server
82 */
83 typedef struct nfs_srv {
84 kmutex_t async_write_lock;
85 struct rfs_async_write_list *async_write_head;
86
87 /*
88 * enables write clustering if == 1
89 */
90 int write_async;
91 } nfs_srv_t;
92
93 /*
94 * These are the interface routines for the server side of the
95 * Network File System. See the NFS version 2 protocol specification
96 * for a description of this interface.
97 */
98
99 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
100 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
101 cred_t *);
102
103
104 /*
105 * Some "over the wire" UNIX file types. These are encoded
106 * into the mode. This needs to be fixed in the next rev.
107 */
108 #define IFMT 0170000 /* type of file */
109 #define IFCHR 0020000 /* character special */
110 #define IFBLK 0060000 /* block special */
111 #define IFSOCK 0140000 /* socket */
112
113 u_longlong_t nfs2_srv_caller_id;
114
115 static nfs_srv_t *
116 nfs_get_srv(void)
117 {
118 nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
119 nfs_srv_t *srv = ng->nfs_srv;
120 ASSERT(srv != NULL);
121 return (srv);
122 }
123
124 /*
125 * Get file attributes.
126 * Returns the current attributes of the file with the given fhandle.
127 */
128 /* ARGSUSED */
129 void
130 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
131 struct svc_req *req, cred_t *cr, bool_t ro)
132 {
133 int error;
134 vnode_t *vp;
135 struct vattr va;
136
137 vp = nfs_fhtovp(fhp, exi);
138 if (vp == NULL) {
139 ns->ns_status = NFSERR_STALE;
140 return;
141 }
142
143 /*
144 * Do the getattr.
145 */
146 va.va_mask = AT_ALL; /* we want all the attributes */
147
148 error = rfs4_delegated_getattr(vp, &va, 0, cr);
149
150 /* check for overflows */
151 if (!error) {
152 /* Lie about the object type for a referral */
153 if (vn_is_nfs_reparse(vp, cr))
154 va.va_type = VLNK;
155
156 acl_perm(vp, exi, &va, cr);
157 error = vattr_to_nattr(&va, &ns->ns_attr);
158 }
159
160 VN_RELE(vp);
161
162 ns->ns_status = puterrno(error);
163 }
/*
 * Extract the file handle from getattr arguments (used by the common
 * dispatch code to locate the export before calling rfs_getattr()).
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
169
170 /*
171 * Set file attributes.
172 * Sets the attributes of the file with the given fhandle. Returns
173 * the new attributes.
174 */
175 /* ARGSUSED */
176 void
177 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
178 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
179 {
180 int error;
181 int flag;
182 int in_crit = 0;
183 vnode_t *vp;
184 struct vattr va;
185 struct vattr bva;
186 struct flock64 bf;
187 caller_context_t ct;
188
189
190 vp = nfs_fhtovp(&args->saa_fh, exi);
191 if (vp == NULL) {
192 ns->ns_status = NFSERR_STALE;
193 return;
194 }
195
196 if (rdonly(ro, vp)) {
197 VN_RELE(vp);
198 ns->ns_status = NFSERR_ROFS;
199 return;
200 }
201
202 error = sattr_to_vattr(&args->saa_sa, &va);
203 if (error) {
204 VN_RELE(vp);
205 ns->ns_status = puterrno(error);
206 return;
207 }
208
209 /*
210 * If the client is requesting a change to the mtime,
211 * but the nanosecond field is set to 1 billion, then
212 * this is a flag to the server that it should set the
213 * atime and mtime fields to the server's current time.
214 * The 1 billion number actually came from the client
215 * as 1 million, but the units in the over the wire
216 * request are microseconds instead of nanoseconds.
217 *
218 * This is an overload of the protocol and should be
219 * documented in the NFS Version 2 protocol specification.
220 */
221 if (va.va_mask & AT_MTIME) {
222 if (va.va_mtime.tv_nsec == 1000000000) {
223 gethrestime(&va.va_mtime);
224 va.va_atime = va.va_mtime;
225 va.va_mask |= AT_ATIME;
226 flag = 0;
227 } else
228 flag = ATTR_UTIME;
229 } else
230 flag = 0;
231
232 /*
233 * If the filesystem is exported with nosuid, then mask off
234 * the setuid and setgid bits.
235 */
236 if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
237 (exi->exi_export.ex_flags & EX_NOSUID))
238 va.va_mode &= ~(VSUID | VSGID);
239
240 ct.cc_sysid = 0;
241 ct.cc_pid = 0;
242 ct.cc_caller_id = nfs2_srv_caller_id;
243 ct.cc_flags = CC_DONTBLOCK;
244
245 /*
246 * We need to specially handle size changes because it is
247 * possible for the client to create a file with modes
248 * which indicate read-only, but with the file opened for
249 * writing. If the client then tries to set the size of
250 * the file, then the normal access checking done in
251 * VOP_SETATTR would prevent the client from doing so,
252 * although it should be legal for it to do so. To get
253 * around this, we do the access checking for ourselves
254 * and then use VOP_SPACE which doesn't do the access
255 * checking which VOP_SETATTR does. VOP_SPACE can only
256 * operate on VREG files, let VOP_SETATTR handle the other
257 * extremely rare cases.
258 * Also the client should not be allowed to change the
259 * size of the file if there is a conflicting non-blocking
260 * mandatory lock in the region of change.
261 */
262 if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
263 if (nbl_need_check(vp)) {
264 nbl_start_crit(vp, RW_READER);
265 in_crit = 1;
266 }
267
268 bva.va_mask = AT_UID | AT_SIZE;
269
270 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
271
272 if (error) {
273 if (in_crit)
274 nbl_end_crit(vp);
275 VN_RELE(vp);
276 ns->ns_status = puterrno(error);
277 return;
278 }
279
280 if (in_crit) {
281 u_offset_t offset;
282 ssize_t length;
283
284 if (va.va_size < bva.va_size) {
285 offset = va.va_size;
286 length = bva.va_size - va.va_size;
287 } else {
288 offset = bva.va_size;
289 length = va.va_size - bva.va_size;
290 }
291 if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
292 NULL)) {
293 error = EACCES;
294 }
295 }
296
297 if (crgetuid(cr) == bva.va_uid && !error &&
298 va.va_size != bva.va_size) {
299 va.va_mask &= ~AT_SIZE;
300 bf.l_type = F_WRLCK;
301 bf.l_whence = 0;
302 bf.l_start = (off64_t)va.va_size;
303 bf.l_len = 0;
304 bf.l_sysid = 0;
305 bf.l_pid = 0;
306
307 error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
308 (offset_t)va.va_size, cr, &ct);
309 }
310 if (in_crit)
311 nbl_end_crit(vp);
312 } else
313 error = 0;
314
315 /*
316 * Do the setattr.
317 */
318 if (!error && va.va_mask) {
319 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
320 }
321
322 /*
323 * check if the monitor on either vop_space or vop_setattr detected
324 * a delegation conflict and if so, mark the thread flag as
325 * wouldblock so that the response is dropped and the client will
326 * try again.
327 */
328 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
329 VN_RELE(vp);
330 curthread->t_flag |= T_WOULDBLOCK;
331 return;
332 }
333
334 if (!error) {
335 va.va_mask = AT_ALL; /* get everything */
336
337 error = rfs4_delegated_getattr(vp, &va, 0, cr);
338
339 /* check for overflows */
340 if (!error) {
341 acl_perm(vp, exi, &va, cr);
342 error = vattr_to_nattr(&va, &ns->ns_attr);
343 }
344 }
345
346 ct.cc_flags = 0;
347
348 /*
349 * Force modified metadata out to stable storage.
350 */
351 (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
352
353 VN_RELE(vp);
354
355 ns->ns_status = puterrno(error);
356 }
/*
 * Extract the file handle from setattr arguments (used by the common
 * dispatch code to locate the export before calling rfs_setattr()).
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
362
/*
 * Cross a mount point during a lookup.
 *
 * On success (return 0 with the submount exported "nohide"), @vpp and
 * @exip are replaced with the covered filesystem's root vnode and its
 * exportinfo; the previous holds are released.  In every other case
 * (including the not-exported / not-nohide cases, which also return 0)
 * @vpp and @exip are left untouched and all internal holds are dropped.
 * A non-zero return is an errno from traverse() or VOP_FID().
 *
 * Change and release @exip and @vpp only in success
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Extra hold so traverse() can swap vp without touching *vpp. */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
408
409 /*
410 * Given mounted "dvp" and "exi", go upper mountpoint
411 * with dvp/exi correction
412 * Return 0 in success
413 */
414 int
415 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
416 {
417 struct exportinfo *exi;
418 vnode_t *dvp = *dvpp;
419
420 ASSERT(dvp->v_flag & VROOT);
421
422 VN_HOLD(dvp);
423 dvp = untraverse(dvp);
424 exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
425 if (exi == NULL) {
426 VN_RELE(dvp);
427 return (-1);
428 }
429
430 exi_rele(*exip);
431 *exip = exi;
432 VN_RELE(*dvpp);
433 *dvpp = dvp;
434
435 return (0);
436 }
437 /*
438 * Directory lookup.
439 * Returns an fhandle and file attributes for file name in a directory.
440 */
441 /* ARGSUSED */
442 void
443 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
444 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
445 {
446 int error;
447 vnode_t *dvp;
448 vnode_t *vp;
449 struct vattr va;
450 fhandle_t *fhp = da->da_fhandle;
451 struct sec_ol sec = {0, 0};
452 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
453 char *name;
454 struct sockaddr *ca;
455
456 /*
457 * Trusted Extension doesn't support NFSv2. MOUNT
458 * will reject v2 clients. Need to prevent v2 client
459 * access via WebNFS here.
460 */
461 if (is_system_labeled() && req->rq_vers == 2) {
462 dr->dr_status = NFSERR_ACCES;
463 return;
464 }
465
466 /*
467 * Disallow NULL paths
468 */
469 if (da->da_name == NULL || *da->da_name == '\0') {
470 dr->dr_status = NFSERR_ACCES;
471 return;
472 }
473
474 /*
475 * Allow lookups from the root - the default
476 * location of the public filehandle.
477 */
478 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
479 dvp = ZONE_ROOTVP();
480 VN_HOLD(dvp);
481 } else {
482 dvp = nfs_fhtovp(fhp, exi);
483 if (dvp == NULL) {
484 dr->dr_status = NFSERR_STALE;
485 return;
486 }
487 }
488
489 exi_hold(exi);
490
491 /*
492 * Not allow lookup beyond root.
493 * If the filehandle matches a filehandle of the exi,
494 * then the ".." refers beyond the root of an exported filesystem.
495 */
496 if (strcmp(da->da_name, "..") == 0 &&
497 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
498 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
499 (dvp->v_flag & VROOT)) {
500 /*
501 * special case for ".." and 'nohide'exported root
502 */
503 if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
504 error = NFSERR_ACCES;
505 goto out;
506 }
507 } else {
508 error = NFSERR_NOENT;
509 goto out;
510 }
511 }
512
513 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
514 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
515 MAXPATHLEN);
516
517 if (name == NULL) {
518 error = NFSERR_ACCES;
519 goto out;
520 }
521
522 /*
523 * If the public filehandle is used then allow
524 * a multi-component lookup, i.e. evaluate
525 * a pathname and follow symbolic links if
526 * necessary.
527 *
528 * This may result in a vnode in another filesystem
529 * which is OK as long as the filesystem is exported.
530 */
531 if (PUBLIC_FH2(fhp)) {
532 publicfh_flag = TRUE;
533
534 exi_rele(exi);
535
536 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
537 &sec);
538 } else {
539 /*
540 * Do a normal single component lookup.
541 */
542 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
543 NULL, NULL, NULL);
544 }
545
546 if (name != da->da_name)
547 kmem_free(name, MAXPATHLEN);
548
549 if (error == 0 && vn_ismntpt(vp)) {
550 error = rfs_cross_mnt(&vp, &exi);
551 if (error)
552 VN_RELE(vp);
553 }
554
555 if (!error) {
556 va.va_mask = AT_ALL; /* we want everything */
557
558 error = rfs4_delegated_getattr(vp, &va, 0, cr);
559
560 /* check for overflows */
561 if (!error) {
562 acl_perm(vp, exi, &va, cr);
563 error = vattr_to_nattr(&va, &dr->dr_attr);
564 if (!error) {
565 if (sec.sec_flags & SEC_QUERY)
566 error = makefh_ol(&dr->dr_fhandle, exi,
567 sec.sec_index);
568 else {
569 error = makefh(&dr->dr_fhandle, vp,
570 exi);
571 if (!error && publicfh_flag &&
572 !chk_clnt_sec(exi, req))
573 auth_weak = TRUE;
574 }
575 }
576 }
577 VN_RELE(vp);
578 }
579
580 out:
581 VN_RELE(dvp);
582
583 if (exi != NULL)
584 exi_rele(exi);
585
586 /*
587 * If it's public fh, no 0x81, and client's flavor is
588 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
589 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
590 */
591 if (auth_weak)
592 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
593 else
594 dr->dr_status = puterrno(error);
595 }
/*
 * Extract the directory file handle from lookup arguments (used by the
 * common dispatch code to locate the export before calling rfs_lookup()).
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
601
602 /*
603 * Read symbolic link.
604 * Returns the string in the symbolic link at the given fhandle.
605 */
606 /* ARGSUSED */
607 void
608 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
609 struct svc_req *req, cred_t *cr, bool_t ro)
610 {
611 int error;
612 struct iovec iov;
613 struct uio uio;
614 vnode_t *vp;
615 struct vattr va;
616 struct sockaddr *ca;
617 char *name = NULL;
618 int is_referral = 0;
619
620 vp = nfs_fhtovp(fhp, exi);
621 if (vp == NULL) {
622 rl->rl_data = NULL;
623 rl->rl_status = NFSERR_STALE;
624 return;
625 }
626
627 va.va_mask = AT_MODE;
628
629 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
630
631 if (error) {
632 VN_RELE(vp);
633 rl->rl_data = NULL;
634 rl->rl_status = puterrno(error);
635 return;
636 }
637
638 if (MANDLOCK(vp, va.va_mode)) {
639 VN_RELE(vp);
640 rl->rl_data = NULL;
641 rl->rl_status = NFSERR_ACCES;
642 return;
643 }
644
645 /* We lied about the object type for a referral */
646 if (vn_is_nfs_reparse(vp, cr))
647 is_referral = 1;
648
649 /*
650 * XNFS and RFC1094 require us to return ENXIO if argument
651 * is not a link. BUGID 1138002.
652 */
653 if (vp->v_type != VLNK && !is_referral) {
654 VN_RELE(vp);
655 rl->rl_data = NULL;
656 rl->rl_status = NFSERR_NXIO;
657 return;
658 }
659
660 /*
661 * Allocate data for pathname. This will be freed by rfs_rlfree.
662 */
663 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
664
665 if (is_referral) {
666 char *s;
667 size_t strsz;
668
669 /* Get an artificial symlink based on a referral */
670 s = build_symlink(vp, cr, &strsz);
671 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
672 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
673 vnode_t *, vp, char *, s);
674 if (s == NULL)
675 error = EINVAL;
676 else {
677 error = 0;
678 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
679 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
680 kmem_free(s, strsz);
681 }
682
683 } else {
684
685 /*
686 * Set up io vector to read sym link data
687 */
688 iov.iov_base = rl->rl_data;
689 iov.iov_len = NFS_MAXPATHLEN;
690 uio.uio_iov = &iov;
691 uio.uio_iovcnt = 1;
692 uio.uio_segflg = UIO_SYSSPACE;
693 uio.uio_extflg = UIO_COPY_CACHED;
694 uio.uio_loffset = (offset_t)0;
695 uio.uio_resid = NFS_MAXPATHLEN;
696
697 /*
698 * Do the readlink.
699 */
700 error = VOP_READLINK(vp, &uio, cr, NULL);
701
702 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
703
704 if (!error)
705 rl->rl_data[rl->rl_count] = '\0';
706
707 }
708
709
710 VN_RELE(vp);
711
712 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
713 name = nfscmd_convname(ca, exi, rl->rl_data,
714 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
715
716 if (name != NULL && name != rl->rl_data) {
717 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
718 rl->rl_data = name;
719 }
720
721 /*
722 * XNFS and RFC1094 require us to return ENXIO if argument
723 * is not a link. UFS returns EINVAL if this is the case,
724 * so we do the mapping here. BUGID 1138002.
725 */
726 if (error == EINVAL)
727 rl->rl_status = NFSERR_NXIO;
728 else
729 rl->rl_status = puterrno(error);
730
731 }
/*
 * Extract the file handle from readlink arguments (used by the common
 * dispatch code to locate the export before calling rfs_readlink()).
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
737 /*
738 * Free data allocated by rfs_readlink
739 */
740 void
741 rfs_rlfree(struct nfsrdlnres *rl)
742 {
743 if (rl->rl_data != NULL)
744 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
745 }
746
747 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
748
749 /*
750 * Read data.
751 * Returns some data read from the file at the given fhandle.
752 */
753 /* ARGSUSED */
754 void
755 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
756 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
757 {
758 vnode_t *vp;
759 int error;
760 struct vattr va;
761 struct iovec iov;
762 struct uio uio;
763 mblk_t *mp;
764 int alloc_err = 0;
765 int in_crit = 0;
766 caller_context_t ct;
767
768 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
769 if (vp == NULL) {
770 rr->rr_data = NULL;
771 rr->rr_status = NFSERR_STALE;
772 return;
773 }
774
775 if (vp->v_type != VREG) {
776 VN_RELE(vp);
777 rr->rr_data = NULL;
778 rr->rr_status = NFSERR_ISDIR;
779 return;
780 }
781
782 ct.cc_sysid = 0;
783 ct.cc_pid = 0;
784 ct.cc_caller_id = nfs2_srv_caller_id;
785 ct.cc_flags = CC_DONTBLOCK;
786
787 /*
788 * Enter the critical region before calling VOP_RWLOCK
789 * to avoid a deadlock with write requests.
790 */
791 if (nbl_need_check(vp)) {
792 nbl_start_crit(vp, RW_READER);
793 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
794 0, NULL)) {
795 nbl_end_crit(vp);
796 VN_RELE(vp);
797 rr->rr_data = NULL;
798 rr->rr_status = NFSERR_ACCES;
799 return;
800 }
801 in_crit = 1;
802 }
803
804 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
805
806 /* check if a monitor detected a delegation conflict */
807 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
808 if (in_crit)
809 nbl_end_crit(vp);
810 VN_RELE(vp);
811 /* mark as wouldblock so response is dropped */
812 curthread->t_flag |= T_WOULDBLOCK;
813
814 rr->rr_data = NULL;
815 return;
816 }
817
818 va.va_mask = AT_ALL;
819
820 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
821
822 if (error) {
823 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
824 if (in_crit)
825 nbl_end_crit(vp);
826
827 VN_RELE(vp);
828 rr->rr_data = NULL;
829 rr->rr_status = puterrno(error);
830
831 return;
832 }
833
834 /*
835 * This is a kludge to allow reading of files created
836 * with no read permission. The owner of the file
837 * is always allowed to read it.
838 */
839 if (crgetuid(cr) != va.va_uid) {
840 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
841
842 if (error) {
843 /*
844 * Exec is the same as read over the net because
845 * of demand loading.
846 */
847 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
848 }
849 if (error) {
850 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
851 if (in_crit)
852 nbl_end_crit(vp);
853 VN_RELE(vp);
854 rr->rr_data = NULL;
855 rr->rr_status = puterrno(error);
856
857 return;
858 }
859 }
860
861 if (MANDLOCK(vp, va.va_mode)) {
862 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
863 if (in_crit)
864 nbl_end_crit(vp);
865
866 VN_RELE(vp);
867 rr->rr_data = NULL;
868 rr->rr_status = NFSERR_ACCES;
869
870 return;
871 }
872
873 rr->rr_ok.rrok_wlist_len = 0;
874 rr->rr_ok.rrok_wlist = NULL;
875
876 if ((u_offset_t)ra->ra_offset >= va.va_size) {
877 rr->rr_count = 0;
878 rr->rr_data = NULL;
879 /*
880 * In this case, status is NFS_OK, but there is no data
881 * to encode. So set rr_mp to NULL.
882 */
883 rr->rr_mp = NULL;
884 rr->rr_ok.rrok_wlist = ra->ra_wlist;
885 if (rr->rr_ok.rrok_wlist)
886 clist_zero_len(rr->rr_ok.rrok_wlist);
887 goto done;
888 }
889
890 if (ra->ra_wlist) {
891 mp = NULL;
892 rr->rr_mp = NULL;
893 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
894 if (ra->ra_count > iov.iov_len) {
895 rr->rr_data = NULL;
896 rr->rr_status = NFSERR_INVAL;
897 goto done;
898 }
899 } else {
900 /*
901 * mp will contain the data to be sent out in the read reply.
902 * This will be freed after the reply has been sent out (by the
903 * driver).
904 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
905 * that the call to xdrmblk_putmblk() never fails.
906 */
907 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
908 &alloc_err);
909 ASSERT(mp != NULL);
910 ASSERT(alloc_err == 0);
911
912 rr->rr_mp = mp;
913
914 /*
915 * Set up io vector
916 */
917 iov.iov_base = (caddr_t)mp->b_datap->db_base;
918 iov.iov_len = ra->ra_count;
919 }
920
921 uio.uio_iov = &iov;
922 uio.uio_iovcnt = 1;
923 uio.uio_segflg = UIO_SYSSPACE;
924 uio.uio_extflg = UIO_COPY_CACHED;
925 uio.uio_loffset = (offset_t)ra->ra_offset;
926 uio.uio_resid = ra->ra_count;
927
928 error = VOP_READ(vp, &uio, 0, cr, &ct);
929
930 if (error) {
931 if (mp)
932 freeb(mp);
933
934 /*
935 * check if a monitor detected a delegation conflict and
936 * mark as wouldblock so response is dropped
937 */
938 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
939 curthread->t_flag |= T_WOULDBLOCK;
940 else
941 rr->rr_status = puterrno(error);
942
943 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
944 if (in_crit)
945 nbl_end_crit(vp);
946
947 VN_RELE(vp);
948 rr->rr_data = NULL;
949
950 return;
951 }
952
953 /*
954 * Get attributes again so we can send the latest access
955 * time to the client side for its cache.
956 */
957 va.va_mask = AT_ALL;
958
959 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
960
961 if (error) {
962 if (mp)
963 freeb(mp);
964
965 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
966 if (in_crit)
967 nbl_end_crit(vp);
968
969 VN_RELE(vp);
970 rr->rr_data = NULL;
971 rr->rr_status = puterrno(error);
972
973 return;
974 }
975
976 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
977
978 if (mp) {
979 rr->rr_data = (char *)mp->b_datap->db_base;
980 } else {
981 if (ra->ra_wlist) {
982 rr->rr_data = (caddr_t)iov.iov_base;
983 if (!rdma_setup_read_data2(ra, rr)) {
984 rr->rr_data = NULL;
985 rr->rr_status = puterrno(NFSERR_INVAL);
986 }
987 }
988 }
989 done:
990 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
991 if (in_crit)
992 nbl_end_crit(vp);
993
994 acl_perm(vp, exi, &va, cr);
995
996 /* check for overflows */
997 error = vattr_to_nattr(&va, &rr->rr_attr);
998
999 VN_RELE(vp);
1000
1001 rr->rr_status = puterrno(error);
1002 }
1003
1004 /*
1005 * Free data allocated by rfs_read
1006 */
1007 void
1008 rfs_rdfree(struct nfsrdresult *rr)
1009 {
1010 mblk_t *mp;
1011
1012 if (rr->rr_status == NFS_OK) {
1013 mp = rr->rr_mp;
1014 if (mp != NULL)
1015 freeb(mp);
1016 }
1017 }
1018
/*
 * Extract the file handle from read arguments (used by the common
 * dispatch code to locate the export before calling rfs_read()).
 */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
1024
/* On-stack iovec limit when mapping an mblk chain in rfs_write_sync(). */
#define MAX_IOVECS 12

#ifdef DEBUG
/* Counters: writes that fit in the on-stack iovec array vs. kmem_alloc. */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1031
1032 /*
1033 * Write data to file.
1034 * Returns attributes of a file after writing some data to it.
1035 *
1036 * Any changes made here, especially in error handling might have
1037 * to also be done in rfs_write (which clusters write requests).
1038 */
1039 /* ARGSUSED */
1040 void
1041 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1042 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1043 {
1044 int error;
1045 vnode_t *vp;
1046 rlim64_t rlimit;
1047 struct vattr va;
1048 struct uio uio;
1049 struct iovec iov[MAX_IOVECS];
1050 mblk_t *m;
1051 struct iovec *iovp;
1052 int iovcnt;
1053 cred_t *savecred;
1054 int in_crit = 0;
1055 caller_context_t ct;
1056
1057 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1058 if (vp == NULL) {
1059 ns->ns_status = NFSERR_STALE;
1060 return;
1061 }
1062
1063 if (rdonly(ro, vp)) {
1064 VN_RELE(vp);
1065 ns->ns_status = NFSERR_ROFS;
1066 return;
1067 }
1068
1069 if (vp->v_type != VREG) {
1070 VN_RELE(vp);
1071 ns->ns_status = NFSERR_ISDIR;
1072 return;
1073 }
1074
1075 ct.cc_sysid = 0;
1076 ct.cc_pid = 0;
1077 ct.cc_caller_id = nfs2_srv_caller_id;
1078 ct.cc_flags = CC_DONTBLOCK;
1079
1080 va.va_mask = AT_UID|AT_MODE;
1081
1082 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1083
1084 if (error) {
1085 VN_RELE(vp);
1086 ns->ns_status = puterrno(error);
1087
1088 return;
1089 }
1090
1091 if (crgetuid(cr) != va.va_uid) {
1092 /*
1093 * This is a kludge to allow writes of files created
1094 * with read only permission. The owner of the file
1095 * is always allowed to write it.
1096 */
1097 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1098
1099 if (error) {
1100 VN_RELE(vp);
1101 ns->ns_status = puterrno(error);
1102 return;
1103 }
1104 }
1105
1106 /*
1107 * Can't access a mandatory lock file. This might cause
1108 * the NFS service thread to block forever waiting for a
1109 * lock to be released that will never be released.
1110 */
1111 if (MANDLOCK(vp, va.va_mode)) {
1112 VN_RELE(vp);
1113 ns->ns_status = NFSERR_ACCES;
1114 return;
1115 }
1116
1117 /*
1118 * We have to enter the critical region before calling VOP_RWLOCK
1119 * to avoid a deadlock with ufs.
1120 */
1121 if (nbl_need_check(vp)) {
1122 nbl_start_crit(vp, RW_READER);
1123 in_crit = 1;
1124 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1125 wa->wa_count, 0, NULL)) {
1126 error = EACCES;
1127 goto out;
1128 }
1129 }
1130
1131 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1132
1133 /* check if a monitor detected a delegation conflict */
1134 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1135 goto out;
1136 }
1137
1138 if (wa->wa_data || wa->wa_rlist) {
1139 /* Do the RDMA thing if necessary */
1140 if (wa->wa_rlist) {
1141 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1142 iov[0].iov_len = wa->wa_count;
1143 } else {
1144 iov[0].iov_base = wa->wa_data;
1145 iov[0].iov_len = wa->wa_count;
1146 }
1147 uio.uio_iov = iov;
1148 uio.uio_iovcnt = 1;
1149 uio.uio_segflg = UIO_SYSSPACE;
1150 uio.uio_extflg = UIO_COPY_DEFAULT;
1151 uio.uio_loffset = (offset_t)wa->wa_offset;
1152 uio.uio_resid = wa->wa_count;
1153 /*
1154 * The limit is checked on the client. We
1155 * should allow any size writes here.
1156 */
1157 uio.uio_llimit = curproc->p_fsz_ctl;
1158 rlimit = uio.uio_llimit - wa->wa_offset;
1159 if (rlimit < (rlim64_t)uio.uio_resid)
1160 uio.uio_resid = (uint_t)rlimit;
1161
1162 /*
1163 * for now we assume no append mode
1164 */
1165 /*
1166 * We're changing creds because VM may fault and we need
1167 * the cred of the current thread to be used if quota
1168 * checking is enabled.
1169 */
1170 savecred = curthread->t_cred;
1171 curthread->t_cred = cr;
1172 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1173 curthread->t_cred = savecred;
1174 } else {
1175
1176 iovcnt = 0;
1177 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1178 iovcnt++;
1179 if (iovcnt <= MAX_IOVECS) {
1180 #ifdef DEBUG
1181 rfs_write_sync_hits++;
1182 #endif
1183 iovp = iov;
1184 } else {
1185 #ifdef DEBUG
1186 rfs_write_sync_misses++;
1187 #endif
1188 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1189 }
1190 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1191 uio.uio_iov = iovp;
1192 uio.uio_iovcnt = iovcnt;
1193 uio.uio_segflg = UIO_SYSSPACE;
1194 uio.uio_extflg = UIO_COPY_DEFAULT;
1195 uio.uio_loffset = (offset_t)wa->wa_offset;
1196 uio.uio_resid = wa->wa_count;
1197 /*
1198 * The limit is checked on the client. We
1199 * should allow any size writes here.
1200 */
1201 uio.uio_llimit = curproc->p_fsz_ctl;
1202 rlimit = uio.uio_llimit - wa->wa_offset;
1203 if (rlimit < (rlim64_t)uio.uio_resid)
1204 uio.uio_resid = (uint_t)rlimit;
1205
1206 /*
1207 * For now we assume no append mode.
1208 */
1209 /*
1210 * We're changing creds because VM may fault and we need
1211 * the cred of the current thread to be used if quota
1212 * checking is enabled.
1213 */
1214 savecred = curthread->t_cred;
1215 curthread->t_cred = cr;
1216 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1217 curthread->t_cred = savecred;
1218
1219 if (iovp != iov)
1220 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1221 }
1222
1223 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1224
1225 if (!error) {
1226 /*
1227 * Get attributes again so we send the latest mod
1228 * time to the client side for its cache.
1229 */
1230 va.va_mask = AT_ALL; /* now we want everything */
1231
1232 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1233
1234 /* check for overflows */
1235 if (!error) {
1236 acl_perm(vp, exi, &va, cr);
1237 error = vattr_to_nattr(&va, &ns->ns_attr);
1238 }
1239 }
1240
1241 out:
1242 if (in_crit)
1243 nbl_end_crit(vp);
1244 VN_RELE(vp);
1245
1246 /* check if a monitor detected a delegation conflict */
1247 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1248 /* mark as wouldblock so response is dropped */
1249 curthread->t_flag |= T_WOULDBLOCK;
1250 else
1251 ns->ns_status = puterrno(error);
1252
1253 }
1254
/*
 * One pending write request parked in a write cluster; the clustering
 * code in rfs_write() links these (ordered by offset) and wakes the
 * owning threads when the cluster has been processed.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* the original write arguments */
	struct nfsattrstat *ns;		/* where this request's reply goes */
	struct svc_req *req;		/* RPC request of the parked thread */
	cred_t *cr;			/* credentials of the parked thread */
	bool_t ro;			/* read-only mount flag of request */
	kthread_t *thread;		/* service thread parked on this entry */
	struct rfs_async_write *list;	/* next request in the same cluster */
};

/*
 * One write cluster: all pending writes to the same file handle.
 * Clusters are linked off the per-zone async_write_head.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* parked threads wait on this */
	struct rfs_async_write *list;	/* requests, ordered by offset */
	struct rfs_async_write_list *next;	/* next cluster (other file) */
};
1271
/*
 * NOTE(review): these file-scope globals look superseded by the
 * per-zone copies in nfs_srv_t (rfs_write() uses nsrv->async_write_head
 * and nsrv->write_async) — confirm nothing else in this file still
 * references them before removing.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1; /* enables write clustering if == 1 */

/* Max iovecs a clustered write may need (spans several requests). */
#define MAXCLIOVECS 42
/* Sentinel status meaning "this clustered request not yet processed". */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* Counters: requests that joined an existing cluster vs. started one. */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1283
1284 /*
1285 * Write data to file.
1286 * Returns attributes of a file after writing some data to it.
1287 */
1288 void
1289 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1290 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1291 {
1292 int error;
1293 vnode_t *vp;
1294 rlim64_t rlimit;
1295 struct vattr va;
1296 struct uio uio;
1297 struct rfs_async_write_list *lp;
1298 struct rfs_async_write_list *nlp;
1299 struct rfs_async_write *rp;
1300 struct rfs_async_write *nrp;
1301 struct rfs_async_write *trp;
1302 struct rfs_async_write *lrp;
1303 int data_written;
1304 int iovcnt;
1305 mblk_t *m;
1306 struct iovec *iovp;
1307 struct iovec *niovp;
1308 struct iovec iov[MAXCLIOVECS];
1309 int count;
1310 int rcount;
1311 uint_t off;
1312 uint_t len;
1313 struct rfs_async_write nrpsp;
1314 struct rfs_async_write_list nlpsp;
1315 ushort_t t_flag;
1316 cred_t *savecred;
1317 int in_crit = 0;
1318 caller_context_t ct;
1319 nfs_srv_t *nsrv;
1320
1321 nsrv = nfs_get_srv();
1322 if (!nsrv->write_async) {
1323 rfs_write_sync(wa, ns, exi, req, cr, ro);
1324 return;
1325 }
1326
1327 /*
1328 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1329 * is considered an OK.
1330 */
1331 ns->ns_status = RFSWRITE_INITVAL;
1332
1333 nrp = &nrpsp;
1334 nrp->wa = wa;
1335 nrp->ns = ns;
1336 nrp->req = req;
1337 nrp->cr = cr;
1338 nrp->ro = ro;
1339 nrp->thread = curthread;
1340
1341 ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1342
1343 /*
1344 * Look to see if there is already a cluster started
1345 * for this file.
1346 */
1347 mutex_enter(&nsrv->async_write_lock);
1348 for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1349 if (bcmp(&wa->wa_fhandle, lp->fhp,
1350 sizeof (fhandle_t)) == 0)
1351 break;
1352 }
1353
1354 /*
1355 * If lp is non-NULL, then there is already a cluster
1356 * started. We need to place ourselves in the cluster
1357 * list in the right place as determined by starting
1358 * offset. Conflicts with non-blocking mandatory locked
1359 * regions will be checked when the cluster is processed.
1360 */
1361 if (lp != NULL) {
1362 rp = lp->list;
1363 trp = NULL;
1364 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1365 trp = rp;
1366 rp = rp->list;
1367 }
1368 nrp->list = rp;
1369 if (trp == NULL)
1370 lp->list = nrp;
1371 else
1372 trp->list = nrp;
1373 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1374 cv_wait(&lp->cv, &nsrv->async_write_lock);
1375 mutex_exit(&nsrv->async_write_lock);
1376
1377 return;
1378 }
1379
1380 /*
1381 * No cluster started yet, start one and add ourselves
1382 * to the list of clusters.
1383 */
1384 nrp->list = NULL;
1385
1386 nlp = &nlpsp;
1387 nlp->fhp = &wa->wa_fhandle;
1388 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1389 nlp->list = nrp;
1390 nlp->next = NULL;
1391
1392 if (nsrv->async_write_head == NULL) {
1393 nsrv->async_write_head = nlp;
1394 } else {
1395 lp = nsrv->async_write_head;
1396 while (lp->next != NULL)
1397 lp = lp->next;
1398 lp->next = nlp;
1399 }
1400 mutex_exit(&nsrv->async_write_lock);
1401
1402 /*
1403 * Convert the file handle common to all of the requests
1404 * in this cluster to a vnode.
1405 */
1406 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1407 if (vp == NULL) {
1408 mutex_enter(&nsrv->async_write_lock);
1409 if (nsrv->async_write_head == nlp)
1410 nsrv->async_write_head = nlp->next;
1411 else {
1412 lp = nsrv->async_write_head;
1413 while (lp->next != nlp)
1414 lp = lp->next;
1415 lp->next = nlp->next;
1416 }
1417 t_flag = curthread->t_flag & T_WOULDBLOCK;
1418 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1419 rp->ns->ns_status = NFSERR_STALE;
1420 rp->thread->t_flag |= t_flag;
1421 }
1422 cv_broadcast(&nlp->cv);
1423 mutex_exit(&nsrv->async_write_lock);
1424
1425 return;
1426 }
1427
1428 /*
1429 * Can only write regular files. Attempts to write any
1430 * other file types fail with EISDIR.
1431 */
1432 if (vp->v_type != VREG) {
1433 VN_RELE(vp);
1434 mutex_enter(&nsrv->async_write_lock);
1435 if (nsrv->async_write_head == nlp)
1436 nsrv->async_write_head = nlp->next;
1437 else {
1438 lp = nsrv->async_write_head;
1439 while (lp->next != nlp)
1440 lp = lp->next;
1441 lp->next = nlp->next;
1442 }
1443 t_flag = curthread->t_flag & T_WOULDBLOCK;
1444 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1445 rp->ns->ns_status = NFSERR_ISDIR;
1446 rp->thread->t_flag |= t_flag;
1447 }
1448 cv_broadcast(&nlp->cv);
1449 mutex_exit(&nsrv->async_write_lock);
1450
1451 return;
1452 }
1453
1454 /*
1455 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1456 * deadlock with ufs.
1457 */
1458 if (nbl_need_check(vp)) {
1459 nbl_start_crit(vp, RW_READER);
1460 in_crit = 1;
1461 }
1462
1463 ct.cc_sysid = 0;
1464 ct.cc_pid = 0;
1465 ct.cc_caller_id = nfs2_srv_caller_id;
1466 ct.cc_flags = CC_DONTBLOCK;
1467
1468 /*
1469 * Lock the file for writing. This operation provides
1470 * the delay which allows clusters to grow.
1471 */
1472 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1473
1474 /* check if a monitor detected a delegation conflict */
1475 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1476 if (in_crit)
1477 nbl_end_crit(vp);
1478 VN_RELE(vp);
1479 /* mark as wouldblock so response is dropped */
1480 curthread->t_flag |= T_WOULDBLOCK;
1481 mutex_enter(&nsrv->async_write_lock);
1482 if (nsrv->async_write_head == nlp)
1483 nsrv->async_write_head = nlp->next;
1484 else {
1485 lp = nsrv->async_write_head;
1486 while (lp->next != nlp)
1487 lp = lp->next;
1488 lp->next = nlp->next;
1489 }
1490 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1491 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1492 rp->ns->ns_status = puterrno(error);
1493 rp->thread->t_flag |= T_WOULDBLOCK;
1494 }
1495 }
1496 cv_broadcast(&nlp->cv);
1497 mutex_exit(&nsrv->async_write_lock);
1498
1499 return;
1500 }
1501
1502 /*
1503 * Disconnect this cluster from the list of clusters.
1504 * The cluster that is being dealt with must be fixed
1505 * in size after this point, so there is no reason
1506 * to leave it on the list so that new requests can
1507 * find it.
1508 *
1509 * The algorithm is that the first write request will
1510 * create a cluster, convert the file handle to a
1511 * vnode pointer, and then lock the file for writing.
1512 * This request is not likely to be clustered with
1513 * any others. However, the next request will create
1514 * a new cluster and be blocked in VOP_RWLOCK while
1515 * the first request is being processed. This delay
1516 * will allow more requests to be clustered in this
1517 * second cluster.
1518 */
1519 mutex_enter(&nsrv->async_write_lock);
1520 if (nsrv->async_write_head == nlp)
1521 nsrv->async_write_head = nlp->next;
1522 else {
1523 lp = nsrv->async_write_head;
1524 while (lp->next != nlp)
1525 lp = lp->next;
1526 lp->next = nlp->next;
1527 }
1528 mutex_exit(&nsrv->async_write_lock);
1529
1530 /*
1531 * Step through the list of requests in this cluster.
1532 * We need to check permissions to make sure that all
1533 * of the requests have sufficient permission to write
1534 * the file. A cluster can be composed of requests
1535 * from different clients and different users on each
1536 * client.
1537 *
1538 * As a side effect, we also calculate the size of the
1539 * byte range that this cluster encompasses.
1540 */
1541 rp = nlp->list;
1542 off = rp->wa->wa_offset;
1543 len = (uint_t)0;
1544 do {
1545 if (rdonly(rp->ro, vp)) {
1546 rp->ns->ns_status = NFSERR_ROFS;
1547 t_flag = curthread->t_flag & T_WOULDBLOCK;
1548 rp->thread->t_flag |= t_flag;
1549 continue;
1550 }
1551
1552 va.va_mask = AT_UID|AT_MODE;
1553
1554 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1555
1556 if (!error) {
1557 if (crgetuid(rp->cr) != va.va_uid) {
1558 /*
1559 * This is a kludge to allow writes of files
1560 * created with read only permission. The
1561 * owner of the file is always allowed to
1562 * write it.
1563 */
1564 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1565 }
1566 if (!error && MANDLOCK(vp, va.va_mode))
1567 error = EACCES;
1568 }
1569
1570 /*
1571 * Check for a conflict with a nbmand-locked region.
1572 */
1573 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1574 rp->wa->wa_count, 0, NULL)) {
1575 error = EACCES;
1576 }
1577
1578 if (error) {
1579 rp->ns->ns_status = puterrno(error);
1580 t_flag = curthread->t_flag & T_WOULDBLOCK;
1581 rp->thread->t_flag |= t_flag;
1582 continue;
1583 }
1584 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1585 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1586 } while ((rp = rp->list) != NULL);
1587
1588 /*
1589 * Step through the cluster attempting to gather as many
1590 * requests which are contiguous as possible. These
1591 * contiguous requests are handled via one call to VOP_WRITE
1592 * instead of different calls to VOP_WRITE. We also keep
1593 * track of the fact that any data was written.
1594 */
1595 rp = nlp->list;
1596 data_written = 0;
1597 do {
1598 /*
1599 * Skip any requests which are already marked as having an
1600 * error.
1601 */
1602 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1603 rp = rp->list;
1604 continue;
1605 }
1606
1607 /*
1608 * Count the number of iovec's which are required
1609 * to handle this set of requests. One iovec is
1610 * needed for each data buffer, whether addressed
1611 * by wa_data or by the b_rptr pointers in the
1612 * mblk chains.
1613 */
1614 iovcnt = 0;
1615 lrp = rp;
1616 for (;;) {
1617 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1618 iovcnt++;
1619 else {
1620 m = lrp->wa->wa_mblk;
1621 while (m != NULL) {
1622 iovcnt++;
1623 m = m->b_cont;
1624 }
1625 }
1626 if (lrp->list == NULL ||
1627 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1628 lrp->wa->wa_offset + lrp->wa->wa_count !=
1629 lrp->list->wa->wa_offset) {
1630 lrp = lrp->list;
1631 break;
1632 }
1633 lrp = lrp->list;
1634 }
1635
1636 if (iovcnt <= MAXCLIOVECS) {
1637 #ifdef DEBUG
1638 rfs_write_hits++;
1639 #endif
1640 niovp = iov;
1641 } else {
1642 #ifdef DEBUG
1643 rfs_write_misses++;
1644 #endif
1645 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1646 }
1647 /*
1648 * Put together the scatter/gather iovecs.
1649 */
1650 iovp = niovp;
1651 trp = rp;
1652 count = 0;
1653 do {
1654 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1655 if (trp->wa->wa_rlist) {
1656 iovp->iov_base =
1657 (char *)((trp->wa->wa_rlist)->
1658 u.c_daddr3);
1659 iovp->iov_len = trp->wa->wa_count;
1660 } else {
1661 iovp->iov_base = trp->wa->wa_data;
1662 iovp->iov_len = trp->wa->wa_count;
1663 }
1664 iovp++;
1665 } else {
1666 m = trp->wa->wa_mblk;
1667 rcount = trp->wa->wa_count;
1668 while (m != NULL) {
1669 iovp->iov_base = (caddr_t)m->b_rptr;
1670 iovp->iov_len = (m->b_wptr - m->b_rptr);
1671 rcount -= iovp->iov_len;
1672 if (rcount < 0)
1673 iovp->iov_len += rcount;
1674 iovp++;
1675 if (rcount <= 0)
1676 break;
1677 m = m->b_cont;
1678 }
1679 }
1680 count += trp->wa->wa_count;
1681 trp = trp->list;
1682 } while (trp != lrp);
1683
1684 uio.uio_iov = niovp;
1685 uio.uio_iovcnt = iovcnt;
1686 uio.uio_segflg = UIO_SYSSPACE;
1687 uio.uio_extflg = UIO_COPY_DEFAULT;
1688 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1689 uio.uio_resid = count;
1690 /*
1691 * The limit is checked on the client. We
1692 * should allow any size writes here.
1693 */
1694 uio.uio_llimit = curproc->p_fsz_ctl;
1695 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1696 if (rlimit < (rlim64_t)uio.uio_resid)
1697 uio.uio_resid = (uint_t)rlimit;
1698
1699 /*
1700 * For now we assume no append mode.
1701 */
1702
1703 /*
1704 * We're changing creds because VM may fault
1705 * and we need the cred of the current
1706 * thread to be used if quota * checking is
1707 * enabled.
1708 */
1709 savecred = curthread->t_cred;
1710 curthread->t_cred = cr;
1711 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1712 curthread->t_cred = savecred;
1713
1714 /* check if a monitor detected a delegation conflict */
1715 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1716 /* mark as wouldblock so response is dropped */
1717 curthread->t_flag |= T_WOULDBLOCK;
1718
1719 if (niovp != iov)
1720 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1721
1722 if (!error) {
1723 data_written = 1;
1724 /*
1725 * Get attributes again so we send the latest mod
1726 * time to the client side for its cache.
1727 */
1728 va.va_mask = AT_ALL; /* now we want everything */
1729
1730 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1731
1732 if (!error)
1733 acl_perm(vp, exi, &va, rp->cr);
1734 }
1735
1736 /*
1737 * Fill in the status responses for each request
1738 * which was just handled. Also, copy the latest
1739 * attributes in to the attribute responses if
1740 * appropriate.
1741 */
1742 t_flag = curthread->t_flag & T_WOULDBLOCK;
1743 do {
1744 rp->thread->t_flag |= t_flag;
1745 /* check for overflows */
1746 if (!error) {
1747 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1748 }
1749 rp->ns->ns_status = puterrno(error);
1750 rp = rp->list;
1751 } while (rp != lrp);
1752 } while (rp != NULL);
1753
1754 /*
1755 * If any data was written at all, then we need to flush
1756 * the data and metadata to stable storage.
1757 */
1758 if (data_written) {
1759 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1760
1761 if (!error) {
1762 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1763 }
1764 }
1765
1766 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1767
1768 if (in_crit)
1769 nbl_end_crit(vp);
1770 VN_RELE(vp);
1771
1772 t_flag = curthread->t_flag & T_WOULDBLOCK;
1773 mutex_enter(&nsrv->async_write_lock);
1774 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1775 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1776 rp->ns->ns_status = puterrno(error);
1777 rp->thread->t_flag |= t_flag;
1778 }
1779 }
1780 cv_broadcast(&nlp->cv);
1781 mutex_exit(&nsrv->async_write_lock);
1782
1783 }
1784
1785 void *
1786 rfs_write_getfh(struct nfswriteargs *wa)
1787 {
1788 return (&wa->wa_fhandle);
1789 }
1790
1791 /*
1792 * Create a file.
1793 * Creates a file with given attributes and returns those attributes
1794 * and an fhandle for the new file.
1795 */
1796 void
1797 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1798 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1799 {
1800 int error;
1801 int lookuperr;
1802 int in_crit = 0;
1803 struct vattr va;
1804 vnode_t *vp;
1805 vnode_t *realvp;
1806 vnode_t *dvp;
1807 char *name = args->ca_da.da_name;
1808 vnode_t *tvp = NULL;
1809 int mode;
1810 int lookup_ok;
1811 bool_t trunc;
1812 struct sockaddr *ca;
1813
1814 /*
1815 * Disallow NULL paths
1816 */
1817 if (name == NULL || *name == '\0') {
1818 dr->dr_status = NFSERR_ACCES;
1819 return;
1820 }
1821
1822 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1823 if (dvp == NULL) {
1824 dr->dr_status = NFSERR_STALE;
1825 return;
1826 }
1827
1828 error = sattr_to_vattr(args->ca_sa, &va);
1829 if (error) {
1830 dr->dr_status = puterrno(error);
1831 return;
1832 }
1833
1834 /*
1835 * Must specify the mode.
1836 */
1837 if (!(va.va_mask & AT_MODE)) {
1838 VN_RELE(dvp);
1839 dr->dr_status = NFSERR_INVAL;
1840 return;
1841 }
1842
1843 /*
1844 * This is a completely gross hack to make mknod
1845 * work over the wire until we can wack the protocol
1846 */
1847 if ((va.va_mode & IFMT) == IFCHR) {
1848 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1849 va.va_type = VFIFO; /* xtra kludge for named pipe */
1850 else {
1851 va.va_type = VCHR;
1852 /*
1853 * uncompress the received dev_t
1854 * if the top half is zero indicating a request
1855 * from an `older style' OS.
1856 */
1857 if ((va.va_size & 0xffff0000) == 0)
1858 va.va_rdev = nfsv2_expdev(va.va_size);
1859 else
1860 va.va_rdev = (dev_t)va.va_size;
1861 }
1862 va.va_mask &= ~AT_SIZE;
1863 } else if ((va.va_mode & IFMT) == IFBLK) {
1864 va.va_type = VBLK;
1865 /*
1866 * uncompress the received dev_t
1867 * if the top half is zero indicating a request
1868 * from an `older style' OS.
1869 */
1870 if ((va.va_size & 0xffff0000) == 0)
1871 va.va_rdev = nfsv2_expdev(va.va_size);
1872 else
1873 va.va_rdev = (dev_t)va.va_size;
1874 va.va_mask &= ~AT_SIZE;
1875 } else if ((va.va_mode & IFMT) == IFSOCK) {
1876 va.va_type = VSOCK;
1877 } else {
1878 va.va_type = VREG;
1879 }
1880 va.va_mode &= ~IFMT;
1881 va.va_mask |= AT_TYPE;
1882
1883 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1884 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1885 MAXPATHLEN);
1886 if (name == NULL) {
1887 dr->dr_status = puterrno(EINVAL);
1888 return;
1889 }
1890
1891 /*
1892 * Why was the choice made to use VWRITE as the mode to the
1893 * call to VOP_CREATE ? This results in a bug. When a client
1894 * opens a file that already exists and is RDONLY, the second
1895 * open fails with an EACESS because of the mode.
1896 * bug ID 1054648.
1897 */
1898 lookup_ok = 0;
1899 mode = VWRITE;
1900 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1901 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1902 NULL, NULL, NULL);
1903 if (!error) {
1904 struct vattr at;
1905
1906 lookup_ok = 1;
1907 at.va_mask = AT_MODE;
1908 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1909 if (!error)
1910 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1911 VN_RELE(tvp);
1912 tvp = NULL;
1913 }
1914 }
1915
1916 if (!lookup_ok) {
1917 if (rdonly(ro, dvp)) {
1918 error = EROFS;
1919 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1920 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1921 error = EPERM;
1922 } else {
1923 error = 0;
1924 }
1925 }
1926
1927 /*
1928 * If file size is being modified on an already existing file
1929 * make sure that there are no conflicting non-blocking mandatory
1930 * locks in the region being manipulated. Return EACCES if there
1931 * are conflicting locks.
1932 */
1933 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1934 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1935 NULL, NULL, NULL);
1936
1937 if (!lookuperr &&
1938 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1939 VN_RELE(tvp);
1940 curthread->t_flag |= T_WOULDBLOCK;
1941 goto out;
1942 }
1943
1944 if (!lookuperr && nbl_need_check(tvp)) {
1945 /*
1946 * The file exists. Now check if it has any
1947 * conflicting non-blocking mandatory locks
1948 * in the region being changed.
1949 */
1950 struct vattr bva;
1951 u_offset_t offset;
1952 ssize_t length;
1953
1954 nbl_start_crit(tvp, RW_READER);
1955 in_crit = 1;
1956
1957 bva.va_mask = AT_SIZE;
1958 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1959 if (!error) {
1960 if (va.va_size < bva.va_size) {
1961 offset = va.va_size;
1962 length = bva.va_size - va.va_size;
1963 } else {
1964 offset = bva.va_size;
1965 length = va.va_size - bva.va_size;
1966 }
1967 if (length) {
1968 if (nbl_conflict(tvp, NBL_WRITE,
1969 offset, length, 0, NULL)) {
1970 error = EACCES;
1971 }
1972 }
1973 }
1974 if (error) {
1975 nbl_end_crit(tvp);
1976 VN_RELE(tvp);
1977 in_crit = 0;
1978 }
1979 } else if (tvp != NULL) {
1980 VN_RELE(tvp);
1981 }
1982 }
1983
1984 if (!error) {
1985 /*
1986 * If filesystem is shared with nosuid the remove any
1987 * setuid/setgid bits on create.
1988 */
1989 if (va.va_type == VREG &&
1990 exi->exi_export.ex_flags & EX_NOSUID)
1991 va.va_mode &= ~(VSUID | VSGID);
1992
1993 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1994 NULL, NULL);
1995
1996 if (!error) {
1997
1998 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1999 trunc = TRUE;
2000 else
2001 trunc = FALSE;
2002
2003 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2004 VN_RELE(vp);
2005 curthread->t_flag |= T_WOULDBLOCK;
2006 goto out;
2007 }
2008 va.va_mask = AT_ALL;
2009
2010 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2011
2012 /* check for overflows */
2013 if (!error) {
2014 acl_perm(vp, exi, &va, cr);
2015 error = vattr_to_nattr(&va, &dr->dr_attr);
2016 if (!error) {
2017 error = makefh(&dr->dr_fhandle, vp,
2018 exi);
2019 }
2020 }
2021 /*
2022 * Force modified metadata out to stable storage.
2023 *
2024 * if a underlying vp exists, pass it to VOP_FSYNC
2025 */
2026 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2027 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2028 else
2029 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2030 VN_RELE(vp);
2031 }
2032
2033 if (in_crit) {
2034 nbl_end_crit(tvp);
2035 VN_RELE(tvp);
2036 }
2037 }
2038
2039 /*
2040 * Force modified data and metadata out to stable storage.
2041 */
2042 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2043
2044 out:
2045
2046 VN_RELE(dvp);
2047
2048 dr->dr_status = puterrno(error);
2049
2050 if (name != args->ca_da.da_name)
2051 kmem_free(name, MAXPATHLEN);
2052 }
2053 void *
2054 rfs_create_getfh(struct nfscreatargs *args)
2055 {
2056 return (args->ca_da.da_fhandle);
2057 }
2058
2059 /*
2060 * Remove a file.
2061 * Remove named file from parent directory.
2062 */
2063 /* ARGSUSED */
2064 void
2065 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2066 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2067 {
2068 int error = 0;
2069 vnode_t *vp;
2070 vnode_t *targvp;
2071 int in_crit = 0;
2072
2073 /*
2074 * Disallow NULL paths
2075 */
2076 if (da->da_name == NULL || *da->da_name == '\0') {
2077 *status = NFSERR_ACCES;
2078 return;
2079 }
2080
2081 vp = nfs_fhtovp(da->da_fhandle, exi);
2082 if (vp == NULL) {
2083 *status = NFSERR_STALE;
2084 return;
2085 }
2086
2087 if (rdonly(ro, vp)) {
2088 VN_RELE(vp);
2089 *status = NFSERR_ROFS;
2090 return;
2091 }
2092
2093 /*
2094 * Check for a conflict with a non-blocking mandatory share reservation.
2095 */
2096 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2097 NULL, cr, NULL, NULL, NULL);
2098 if (error != 0) {
2099 VN_RELE(vp);
2100 *status = puterrno(error);
2101 return;
2102 }
2103
2104 /*
2105 * If the file is delegated to an v4 client, then initiate
2106 * recall and drop this request (by setting T_WOULDBLOCK).
2107 * The client will eventually re-transmit the request and
2108 * (hopefully), by then, the v4 client will have returned
2109 * the delegation.
2110 */
2111
2112 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2113 VN_RELE(vp);
2114 VN_RELE(targvp);
2115 curthread->t_flag |= T_WOULDBLOCK;
2116 return;
2117 }
2118
2119 if (nbl_need_check(targvp)) {
2120 nbl_start_crit(targvp, RW_READER);
2121 in_crit = 1;
2122 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2123 error = EACCES;
2124 goto out;
2125 }
2126 }
2127
2128 error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2129
2130 /*
2131 * Force modified data and metadata out to stable storage.
2132 */
2133 (void) VOP_FSYNC(vp, 0, cr, NULL);
2134
2135 out:
2136 if (in_crit)
2137 nbl_end_crit(targvp);
2138 VN_RELE(targvp);
2139 VN_RELE(vp);
2140
2141 *status = puterrno(error);
2142
2143 }
2144
2145 void *
2146 rfs_remove_getfh(struct nfsdiropargs *da)
2147 {
2148 return (da->da_fhandle);
2149 }
2150
2151 /*
2152 * rename a file
2153 * Give a file (from) a new name (to).
2154 */
2155 /* ARGSUSED */
2156 void
2157 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2158 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2159 {
2160 int error = 0;
2161 vnode_t *fromvp;
2162 vnode_t *tovp;
2163 struct exportinfo *to_exi;
2164 fhandle_t *fh;
2165 vnode_t *srcvp;
2166 vnode_t *targvp;
2167 int in_crit = 0;
2168
2169 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2170 if (fromvp == NULL) {
2171 *status = NFSERR_STALE;
2172 return;
2173 }
2174
2175 fh = args->rna_to.da_fhandle;
2176 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2177 if (to_exi == NULL) {
2178 VN_RELE(fromvp);
2179 *status = NFSERR_ACCES;
2180 return;
2181 }
2182 exi_rele(to_exi);
2183
2184 if (to_exi != exi) {
2185 VN_RELE(fromvp);
2186 *status = NFSERR_XDEV;
2187 return;
2188 }
2189
2190 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2191 if (tovp == NULL) {
2192 VN_RELE(fromvp);
2193 *status = NFSERR_STALE;
2194 return;
2195 }
2196
2197 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2198 VN_RELE(tovp);
2199 VN_RELE(fromvp);
2200 *status = NFSERR_NOTDIR;
2201 return;
2202 }
2203
2204 /*
2205 * Disallow NULL paths
2206 */
2207 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2208 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2209 VN_RELE(tovp);
2210 VN_RELE(fromvp);
2211 *status = NFSERR_ACCES;
2212 return;
2213 }
2214
2215 if (rdonly(ro, tovp)) {
2216 VN_RELE(tovp);
2217 VN_RELE(fromvp);
2218 *status = NFSERR_ROFS;
2219 return;
2220 }
2221
2222 /*
2223 * Check for a conflict with a non-blocking mandatory share reservation.
2224 */
2225 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2226 NULL, cr, NULL, NULL, NULL);
2227 if (error != 0) {
2228 VN_RELE(tovp);
2229 VN_RELE(fromvp);
2230 *status = puterrno(error);
2231 return;
2232 }
2233
2234 /* Check for delegations on the source file */
2235
2236 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2237 VN_RELE(tovp);
2238 VN_RELE(fromvp);
2239 VN_RELE(srcvp);
2240 curthread->t_flag |= T_WOULDBLOCK;
2241 return;
2242 }
2243
2244 /* Check for delegation on the file being renamed over, if it exists */
2245
2246 if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2247 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2248 NULL, NULL, NULL) == 0) {
2249
2250 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2251 VN_RELE(tovp);
2252 VN_RELE(fromvp);
2253 VN_RELE(srcvp);
2254 VN_RELE(targvp);
2255 curthread->t_flag |= T_WOULDBLOCK;
2256 return;
2257 }
2258 VN_RELE(targvp);
2259 }
2260
2261
2262 if (nbl_need_check(srcvp)) {
2263 nbl_start_crit(srcvp, RW_READER);
2264 in_crit = 1;
2265 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2266 error = EACCES;
2267 goto out;
2268 }
2269 }
2270
2271 error = VOP_RENAME(fromvp, args->rna_from.da_name,
2272 tovp, args->rna_to.da_name, cr, NULL, 0);
2273
2274 if (error == 0)
2275 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2276 strlen(args->rna_to.da_name));
2277
2278 /*
2279 * Force modified data and metadata out to stable storage.
2280 */
2281 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2282 (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2283
2284 out:
2285 if (in_crit)
2286 nbl_end_crit(srcvp);
2287 VN_RELE(srcvp);
2288 VN_RELE(tovp);
2289 VN_RELE(fromvp);
2290
2291 *status = puterrno(error);
2292
2293 }
2294 void *
2295 rfs_rename_getfh(struct nfsrnmargs *args)
2296 {
2297 return (args->rna_from.da_fhandle);
2298 }
2299
2300 /*
2301 * Link to a file.
2302 * Create a file (to) which is a hard link to the given file (from).
2303 */
2304 /* ARGSUSED */
2305 void
2306 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2307 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2308 {
2309 int error;
2310 vnode_t *fromvp;
2311 vnode_t *tovp;
2312 struct exportinfo *to_exi;
2313 fhandle_t *fh;
2314
2315 fromvp = nfs_fhtovp(args->la_from, exi);
2316 if (fromvp == NULL) {
2317 *status = NFSERR_STALE;
2318 return;
2319 }
2320
2321 fh = args->la_to.da_fhandle;
2322 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2323 if (to_exi == NULL) {
2324 VN_RELE(fromvp);
2325 *status = NFSERR_ACCES;
2326 return;
2327 }
2328 exi_rele(to_exi);
2329
2330 if (to_exi != exi) {
2331 VN_RELE(fromvp);
2332 *status = NFSERR_XDEV;
2333 return;
2334 }
2335
2336 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2337 if (tovp == NULL) {
2338 VN_RELE(fromvp);
2339 *status = NFSERR_STALE;
2340 return;
2341 }
2342
2343 if (tovp->v_type != VDIR) {
2344 VN_RELE(tovp);
2345 VN_RELE(fromvp);
2346 *status = NFSERR_NOTDIR;
2347 return;
2348 }
2349 /*
2350 * Disallow NULL paths
2351 */
2352 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2353 VN_RELE(tovp);
2354 VN_RELE(fromvp);
2355 *status = NFSERR_ACCES;
2356 return;
2357 }
2358
2359 if (rdonly(ro, tovp)) {
2360 VN_RELE(tovp);
2361 VN_RELE(fromvp);
2362 *status = NFSERR_ROFS;
2363 return;
2364 }
2365
2366 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2367
2368 /*
2369 * Force modified data and metadata out to stable storage.
2370 */
2371 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2372 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2373
2374 VN_RELE(tovp);
2375 VN_RELE(fromvp);
2376
2377 *status = puterrno(error);
2378
2379 }
2380 void *
2381 rfs_link_getfh(struct nfslinkargs *args)
2382 {
2383 return (args->la_from);
2384 }
2385
2386 /*
2387 * Symbolicly link to a file.
2388 * Create a file (to) with the given attributes which is a symbolic link
2389 * to the given path name (to).
2390 */
2391 void
2392 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2393 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2394 {
2395 int error;
2396 struct vattr va;
2397 vnode_t *vp;
2398 vnode_t *svp;
2399 int lerror;
2400 struct sockaddr *ca;
2401 char *name = NULL;
2402
2403 /*
2404 * Disallow NULL paths
2405 */
2406 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2407 *status = NFSERR_ACCES;
2408 return;
2409 }
2410
2411 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2412 if (vp == NULL) {
2413 *status = NFSERR_STALE;
2414 return;
2415 }
2416
2417 if (rdonly(ro, vp)) {
2418 VN_RELE(vp);
2419 *status = NFSERR_ROFS;
2420 return;
2421 }
2422
2423 error = sattr_to_vattr(args->sla_sa, &va);
2424 if (error) {
2425 VN_RELE(vp);
2426 *status = puterrno(error);
2427 return;
2428 }
2429
2430 if (!(va.va_mask & AT_MODE)) {
2431 VN_RELE(vp);
2432 *status = NFSERR_INVAL;
2433 return;
2434 }
2435
2436 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2437 name = nfscmd_convname(ca, exi, args->sla_tnm,
2438 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2439
2440 if (name == NULL) {
2441 *status = NFSERR_ACCES;
2442 return;
2443 }
2444
2445 va.va_type = VLNK;
2446 va.va_mask |= AT_TYPE;
2447
2448 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2449
2450 /*
2451 * Force new data and metadata out to stable storage.
2452 */
2453 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2454 NULL, cr, NULL, NULL, NULL);
2455
2456 if (!lerror) {
2457 (void) VOP_FSYNC(svp, 0, cr, NULL);
2458 VN_RELE(svp);
2459 }
2460
2461 /*
2462 * Force modified data and metadata out to stable storage.
2463 */
2464 (void) VOP_FSYNC(vp, 0, cr, NULL);
2465
2466 VN_RELE(vp);
2467
2468 *status = puterrno(error);
2469 if (name != args->sla_tnm)
2470 kmem_free(name, MAXPATHLEN);
2471
2472 }
2473 void *
2474 rfs_symlink_getfh(struct nfsslargs *args)
2475 {
2476 return (args->sla_from.da_fhandle);
2477 }
2478
2479 /*
2480 * Make a directory.
2481 * Create a directory with the given name, parent directory, and attributes.
2482 * Returns a file handle and attributes for the new directory.
2483 */
2484 /* ARGSUSED */
2485 void
2486 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2487 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2488 {
2489 int error;
2490 struct vattr va;
2491 vnode_t *dvp = NULL;
2492 vnode_t *vp;
2493 char *name = args->ca_da.da_name;
2494
2495 /*
2496 * Disallow NULL paths
2497 */
2498 if (name == NULL || *name == '\0') {
2499 dr->dr_status = NFSERR_ACCES;
2500 return;
2501 }
2502
2503 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2504 if (vp == NULL) {
2505 dr->dr_status = NFSERR_STALE;
2506 return;
2507 }
2508
2509 if (rdonly(ro, vp)) {
2510 VN_RELE(vp);
2511 dr->dr_status = NFSERR_ROFS;
2512 return;
2513 }
2514
2515 error = sattr_to_vattr(args->ca_sa, &va);
2516 if (error) {
2517 VN_RELE(vp);
2518 dr->dr_status = puterrno(error);
2519 return;
2520 }
2521
2522 if (!(va.va_mask & AT_MODE)) {
2523 VN_RELE(vp);
2524 dr->dr_status = NFSERR_INVAL;
2525 return;
2526 }
2527
2528 va.va_type = VDIR;
2529 va.va_mask |= AT_TYPE;
2530
2531 error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2532
2533 if (!error) {
2534 /*
2535 * Attribtutes of the newly created directory should
2536 * be returned to the client.
2537 */
2538 va.va_mask = AT_ALL; /* We want everything */
2539 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2540
2541 /* check for overflows */
2542 if (!error) {
2543 acl_perm(vp, exi, &va, cr);
2544 error = vattr_to_nattr(&va, &dr->dr_attr);
2545 if (!error) {
2546 error = makefh(&dr->dr_fhandle, dvp, exi);
2547 }
2548 }
2549 /*
2550 * Force new data and metadata out to stable storage.
2551 */
2552 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2553 VN_RELE(dvp);
2554 }
2555
2556 /*
2557 * Force modified data and metadata out to stable storage.
2558 */
2559 (void) VOP_FSYNC(vp, 0, cr, NULL);
2560
2561 VN_RELE(vp);
2562
2563 dr->dr_status = puterrno(error);
2564
2565 }
2566 void *
2567 rfs_mkdir_getfh(struct nfscreatargs *args)
2568 {
2569 return (args->ca_da.da_fhandle);
2570 }
2571
2572 /*
2573 * Remove a directory.
2574 * Remove the given directory name from the given parent directory.
2575 */
2576 /* ARGSUSED */
2577 void
2578 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2579 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2580 {
2581 int error;
2582 vnode_t *vp;
2583
2584 /*
2585 * Disallow NULL paths
2586 */
2587 if (da->da_name == NULL || *da->da_name == '\0') {
2588 *status = NFSERR_ACCES;
2589 return;
2590 }
2591
2592 vp = nfs_fhtovp(da->da_fhandle, exi);
2593 if (vp == NULL) {
2594 *status = NFSERR_STALE;
2595 return;
2596 }
2597
2598 if (rdonly(ro, vp)) {
2599 VN_RELE(vp);
2600 *status = NFSERR_ROFS;
2601 return;
2602 }
2603
2604 /*
2605 * VOP_RMDIR takes a third argument (the current
2606 * directory of the process). That's because someone
2607 * wants to return EINVAL if one tries to remove ".".
2608 * Of course, NFS servers have no idea what their
2609 * clients' current directories are. We fake it by
2610 * supplying a vnode known to exist and illegal to
2611 * remove.
2612 */
2613 error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2614
2615 /*
2616 * Force modified data and metadata out to stable storage.
2617 */
2618 (void) VOP_FSYNC(vp, 0, cr, NULL);
2619
2620 VN_RELE(vp);
2621
2622 /*
2623 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2624 * if the directory is not empty. A System V NFS server
2625 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2626 * over the wire.
2627 */
2628 if (error == EEXIST)
2629 *status = NFSERR_NOTEMPTY;
2630 else
2631 *status = puterrno(error);
2632
2633 }
2634 void *
2635 rfs_rmdir_getfh(struct nfsdiropargs *da)
2636 {
2637 return (da->da_fhandle);
2638 }
2639
2640 /* ARGSUSED */
2641 void
2642 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2643 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2644 {
2645 int error;
2646 int iseof;
2647 struct iovec iov;
2648 struct uio uio;
2649 vnode_t *vp;
2650 char *ndata = NULL;
2651 struct sockaddr *ca;
2652 size_t nents;
2653 int ret;
2654
2655 vp = nfs_fhtovp(&rda->rda_fh, exi);
2656 if (vp == NULL) {
2657 rd->rd_entries = NULL;
2658 rd->rd_status = NFSERR_STALE;
2659 return;
2660 }
2661
2662 if (vp->v_type != VDIR) {
2663 VN_RELE(vp);
2664 rd->rd_entries = NULL;
2665 rd->rd_status = NFSERR_NOTDIR;
2666 return;
2667 }
2668
2669 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2670
2671 error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2672
2673 if (error) {
2674 rd->rd_entries = NULL;
2675 goto bad;
2676 }
2677
2678 if (rda->rda_count == 0) {
2679 rd->rd_entries = NULL;
2680 rd->rd_size = 0;
2681 rd->rd_eof = FALSE;
2682 goto bad;
2683 }
2684
2685 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2686
2687 /*
2688 * Allocate data for entries. This will be freed by rfs_rddirfree.
2689 */
2690 rd->rd_bufsize = (uint_t)rda->rda_count;
2691 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2692
2693 /*
2694 * Set up io vector to read directory data
2695 */
2696 iov.iov_base = (caddr_t)rd->rd_entries;
2697 iov.iov_len = rda->rda_count;
2698 uio.uio_iov = &iov;
2699 uio.uio_iovcnt = 1;
2700 uio.uio_segflg = UIO_SYSSPACE;
2701 uio.uio_extflg = UIO_COPY_CACHED;
2702 uio.uio_loffset = (offset_t)rda->rda_offset;
2703 uio.uio_resid = rda->rda_count;
2704
2705 /*
2706 * read directory
2707 */
2708 error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2709
2710 /*
2711 * Clean up
2712 */
2713 if (!error) {
2714 /*
2715 * set size and eof
2716 */
2717 if (uio.uio_resid == rda->rda_count) {
2718 rd->rd_size = 0;
2719 rd->rd_eof = TRUE;
2720 } else {
2721 rd->rd_size = (uint32_t)(rda->rda_count -
2722 uio.uio_resid);
2723 rd->rd_eof = iseof ? TRUE : FALSE;
2724 }
2725 }
2726
2727 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2728 nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2729 ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2730 rda->rda_count, &ndata);
2731
2732 if (ret != 0) {
2733 size_t dropbytes;
2734 /*
2735 * We had to drop one or more entries in order to fit
2736 * during the character conversion. We need to patch
2737 * up the size and eof info.
2738 */
2739 if (rd->rd_eof)
2740 rd->rd_eof = FALSE;
2741 dropbytes = nfscmd_dropped_entrysize(
2742 (struct dirent64 *)rd->rd_entries, nents, ret);
2743 rd->rd_size -= dropbytes;
2744 }
2745 if (ndata == NULL) {
2746 ndata = (char *)rd->rd_entries;
2747 } else if (ndata != (char *)rd->rd_entries) {
2748 kmem_free(rd->rd_entries, rd->rd_bufsize);
2749 rd->rd_entries = (void *)ndata;
2750 rd->rd_bufsize = rda->rda_count;
2751 }
2752
2753 bad:
2754 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2755
2756 #if 0 /* notyet */
2757 /*
2758 * Don't do this. It causes local disk writes when just
2759 * reading the file and the overhead is deemed larger
2760 * than the benefit.
2761 */
2762 /*
2763 * Force modified metadata out to stable storage.
2764 */
2765 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2766 #endif
2767
2768 VN_RELE(vp);
2769
2770 rd->rd_status = puterrno(error);
2771
2772 }
2773 void *
2774 rfs_readdir_getfh(struct nfsrddirargs *rda)
2775 {
2776 return (&rda->rda_fh);
2777 }
2778 void
2779 rfs_rddirfree(struct nfsrddirres *rd)
2780 {
2781 if (rd->rd_entries != NULL)
2782 kmem_free(rd->rd_entries, rd->rd_bufsize);
2783 }
2784
2785 /* ARGSUSED */
2786 void
2787 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2788 struct svc_req *req, cred_t *cr, bool_t ro)
2789 {
2790 int error;
2791 struct statvfs64 sb;
2792 vnode_t *vp;
2793
2794 vp = nfs_fhtovp(fh, exi);
2795 if (vp == NULL) {
2796 fs->fs_status = NFSERR_STALE;
2797 return;
2798 }
2799
2800 error = VFS_STATVFS(vp->v_vfsp, &sb);
2801
2802 if (!error) {
2803 fs->fs_tsize = nfstsize();
2804 fs->fs_bsize = sb.f_frsize;
2805 fs->fs_blocks = sb.f_blocks;
2806 fs->fs_bfree = sb.f_bfree;
2807 fs->fs_bavail = sb.f_bavail;
2808 }
2809
2810 VN_RELE(vp);
2811
2812 fs->fs_status = puterrno(error);
2813
2814 }
2815 void *
2816 rfs_statfs_getfh(fhandle_t *fh)
2817 {
2818 return (fh);
2819 }
2820
/*
 * Convert an NFSv2 sattr (the client's settable attributes) into a
 * vattr for the VOP layer.  A wire value of all-ones ((uint32_t)-1,
 * or for times both tv_sec and tv_usec of -1) means "do not change
 * this attribute" and is skipped; every attribute that is set gets
 * its bit raised in vap->va_mask.
 *
 * Returns 0 on success, or EOVERFLOW (32-bit kernels only) when a
 * wire time cannot be represented in the kernel's time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		/* microseconds on the wire, nanoseconds in the vattr */
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		/* microseconds on the wire, nanoseconds in the vattr */
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2887
/*
 * Map vtype_t values (used as the array index) to the NFSv2
 * over-the-wire file types.  vnode types with no NFSv2 wire
 * representation map to 0 (NFNON); VFIFO is handled separately in
 * vattr_to_nattr() via NA_SETFIFO.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2891
/*
 * Convert a vattr into the NFSv2 over-the-wire fattr (*na).
 *
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow:
 * EFBIG when the nodeid or the size of a regular file/directory does
 * not fit in 32 bits, EOVERFLOW when a timestamp does not fit in the
 * 32-bit wire time.  Returns 0 on success.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* -1 means "unknown mode"; otherwise fold the type bits back in */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* map the local "nobody" IDs to the NFS wire "nobody" values */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			/* other types: truncation to 32 bits is tolerated */
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* nanoseconds in the vattr, microseconds on the wire */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2998
2999 /*
3000 * acl v2 support: returns approximate permission.
3001 * default: returns minimal permission (more restrictive)
3002 * aclok: returns maximal permission (less restrictive)
3003 * This routine changes the permissions that are alaredy in *va.
3004 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3005 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3006 */
3007 static void
3008 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3009 {
3010 vsecattr_t vsa;
3011 int aclcnt;
3012 aclent_t *aclentp;
3013 mode_t mask_perm;
3014 mode_t grp_perm;
3015 mode_t other_perm;
3016 mode_t other_orig;
3017 int error;
3018
3019 /* dont care default acl */
3020 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3021 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3022
3023 if (!error) {
3024 aclcnt = vsa.vsa_aclcnt;
3025 if (aclcnt > MIN_ACL_ENTRIES) {
3026 /* non-trivial ACL */
3027 aclentp = vsa.vsa_aclentp;
3028 if (exi->exi_export.ex_flags & EX_ACLOK) {
3029 /* maximal permissions */
3030 grp_perm = 0;
3031 other_perm = 0;
3032 for (; aclcnt > 0; aclcnt--, aclentp++) {
3033 switch (aclentp->a_type) {
3034 case USER_OBJ:
3035 break;
3036 case USER:
3037 grp_perm |=
3038 aclentp->a_perm << 3;
3039 other_perm |= aclentp->a_perm;
3040 break;
3041 case GROUP_OBJ:
3042 grp_perm |=
3043 aclentp->a_perm << 3;
3044 break;
3045 case GROUP:
3046 other_perm |= aclentp->a_perm;
3047 break;
3048 case OTHER_OBJ:
3049 other_orig = aclentp->a_perm;
3050 break;
3051 case CLASS_OBJ:
3052 mask_perm = aclentp->a_perm;
3053 break;
3054 default:
3055 break;
3056 }
3057 }
3058 grp_perm &= mask_perm << 3;
3059 other_perm &= mask_perm;
3060 other_perm |= other_orig;
3061
3062 } else {
3063 /* minimal permissions */
3064 grp_perm = 070;
3065 other_perm = 07;
3066 for (; aclcnt > 0; aclcnt--, aclentp++) {
3067 switch (aclentp->a_type) {
3068 case USER_OBJ:
3069 break;
3070 case USER:
3071 case CLASS_OBJ:
3072 grp_perm &=
3073 aclentp->a_perm << 3;
3074 other_perm &=
3075 aclentp->a_perm;
3076 break;
3077 case GROUP_OBJ:
3078 grp_perm &=
3079 aclentp->a_perm << 3;
3080 break;
3081 case GROUP:
3082 other_perm &=
3083 aclentp->a_perm;
3084 break;
3085 case OTHER_OBJ:
3086 other_perm &=
3087 aclentp->a_perm;
3088 break;
3089 default:
3090 break;
3091 }
3092 }
3093 }
3094 /* copy to va */
3095 va->va_mode &= ~077;
3096 va->va_mode |= grp_perm | other_perm;
3097 }
3098 if (vsa.vsa_aclcnt)
3099 kmem_free(vsa.vsa_aclentp,
3100 vsa.vsa_aclcnt * sizeof (aclent_t));
3101 }
3102 }
3103
/*
 * One-time, server-wide NFSv2 initialization: obtain a unique caller
 * ID used to tag VOP calls issued by this server.
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
}
3109
/*
 * Server-wide NFSv2 teardown; nothing to undo at present (caller IDs
 * are not released).
 */
void
rfs_srvrfini(void)
{
}
3114
3115 /* ARGSUSED */
3116 void
3117 rfs_srv_zone_init(nfs_globals_t *ng)
3118 {
3119 nfs_srv_t *ns;
3120
3121 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3122
3123 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3124 ns->write_async = 1;
3125
3126 ng->nfs_srv = ns;
3127 }
3128
3129 /* ARGSUSED */
3130 void
3131 rfs_srv_zone_fini(nfs_globals_t *ng)
3132 {
3133 nfs_srv_t *ns = ng->nfs_srv;
3134
3135 ng->nfs_srv = NULL;
3136
3137 mutex_destroy(&ns->async_write_lock);
3138 kmem_free(ns, sizeof (*ns));
3139 }
3140
3141 static int
3142 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3143 {
3144 struct clist *wcl;
3145 int wlist_len;
3146 uint32_t count = rr->rr_count;
3147
3148 wcl = ra->ra_wlist;
3149
3150 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3151 return (FALSE);
3152 }
3153
3154 wcl = ra->ra_wlist;
3155 rr->rr_ok.rrok_wlist_len = wlist_len;
3156 rr->rr_ok.rrok_wlist = wcl;
3157
3158 return (TRUE);
3159 }