1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /*
29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 * All rights reserved.
31 */
32
33 /*
34 * Copyright 2018 Nexenta Systems, Inc.
35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 */
37
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/stat.h>
47 #include <sys/errno.h>
48 #include <sys/sysmacros.h>
49 #include <sys/statvfs.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/dirent.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/vtrace.h>
56 #include <sys/mode.h>
57 #include <sys/acl.h>
58 #include <sys/nbmlock.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/svc.h>
65
66 #include <nfs/nfs.h>
67 #include <nfs/export.h>
68 #include <nfs/nfs_cmd.h>
69
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_kmem.h>
75
76 #include <sys/strsubr.h>
77
78 struct rfs_async_write_list;
79
80 /*
81 * Zone globals of NFSv2 server
82 */
83 typedef struct nfs_srv {
84 kmutex_t async_write_lock;
85 struct rfs_async_write_list *async_write_head;
86
87 /*
88 * enables write clustering if == 1
89 */
90 int write_async;
91 } nfs_srv_t;
92
93 /*
94 * These are the interface routines for the server side of the
95 * Network File System. See the NFS version 2 protocol specification
96 * for a description of this interface.
97 */
98
99 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
100 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
101 cred_t *);
102 static void *rfs_zone_init(zoneid_t zoneid);
103 static void rfs_zone_fini(zoneid_t zoneid, void *data);
104
105
106 /*
107 * Some "over the wire" UNIX file types. These are encoded
108 * into the mode. This needs to be fixed in the next rev.
109 */
110 #define IFMT 0170000 /* type of file */
111 #define IFCHR 0020000 /* character special */
112 #define IFBLK 0060000 /* block special */
113 #define IFSOCK 0140000 /* socket */
114
115 u_longlong_t nfs2_srv_caller_id;
116 static zone_key_t rfs_zone_key;
117
118 /*
119 * Get file attributes.
120 * Returns the current attributes of the file with the given fhandle.
121 */
122 /* ARGSUSED */
123 void
124 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
125 struct svc_req *req, cred_t *cr, bool_t ro)
126 {
127 int error;
128 vnode_t *vp;
129 struct vattr va;
130
131 vp = nfs_fhtovp(fhp, exi);
132 if (vp == NULL) {
133 ns->ns_status = NFSERR_STALE;
134 return;
135 }
136
137 /*
138 * Do the getattr.
139 */
140 va.va_mask = AT_ALL; /* we want all the attributes */
141
142 error = rfs4_delegated_getattr(vp, &va, 0, cr);
143
144 /* check for overflows */
145 if (!error) {
146 /* Lie about the object type for a referral */
147 if (vn_is_nfs_reparse(vp, cr))
148 va.va_type = VLNK;
149
150 acl_perm(vp, exi, &va, cr);
151 error = vattr_to_nattr(&va, &ns->ns_attr);
152 }
153
154 VN_RELE(vp);
155
156 ns->ns_status = puterrno(error);
157 }
/*
 * Dispatch helper: return the filehandle argument of a GETATTR call
 * so common dispatch code can locate the export for the request.
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
163
164 /*
165 * Set file attributes.
166 * Sets the attributes of the file with the given fhandle. Returns
167 * the new attributes.
168 */
169 /* ARGSUSED */
170 void
171 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
172 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
173 {
174 int error;
175 int flag;
176 int in_crit = 0;
177 vnode_t *vp;
178 struct vattr va;
179 struct vattr bva;
180 struct flock64 bf;
181 caller_context_t ct;
182
183
184 vp = nfs_fhtovp(&args->saa_fh, exi);
185 if (vp == NULL) {
186 ns->ns_status = NFSERR_STALE;
187 return;
188 }
189
190 if (rdonly(ro, vp)) {
191 VN_RELE(vp);
192 ns->ns_status = NFSERR_ROFS;
193 return;
194 }
195
196 error = sattr_to_vattr(&args->saa_sa, &va);
197 if (error) {
198 VN_RELE(vp);
199 ns->ns_status = puterrno(error);
200 return;
201 }
202
203 /*
204 * If the client is requesting a change to the mtime,
205 * but the nanosecond field is set to 1 billion, then
206 * this is a flag to the server that it should set the
207 * atime and mtime fields to the server's current time.
208 * The 1 billion number actually came from the client
209 * as 1 million, but the units in the over the wire
210 * request are microseconds instead of nanoseconds.
211 *
212 * This is an overload of the protocol and should be
213 * documented in the NFS Version 2 protocol specification.
214 */
215 if (va.va_mask & AT_MTIME) {
216 if (va.va_mtime.tv_nsec == 1000000000) {
217 gethrestime(&va.va_mtime);
218 va.va_atime = va.va_mtime;
219 va.va_mask |= AT_ATIME;
220 flag = 0;
221 } else
222 flag = ATTR_UTIME;
223 } else
224 flag = 0;
225
226 /*
227 * If the filesystem is exported with nosuid, then mask off
228 * the setuid and setgid bits.
229 */
230 if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
231 (exi->exi_export.ex_flags & EX_NOSUID))
232 va.va_mode &= ~(VSUID | VSGID);
233
234 ct.cc_sysid = 0;
235 ct.cc_pid = 0;
236 ct.cc_caller_id = nfs2_srv_caller_id;
237 ct.cc_flags = CC_DONTBLOCK;
238
239 /*
240 * We need to specially handle size changes because it is
241 * possible for the client to create a file with modes
242 * which indicate read-only, but with the file opened for
243 * writing. If the client then tries to set the size of
244 * the file, then the normal access checking done in
245 * VOP_SETATTR would prevent the client from doing so,
246 * although it should be legal for it to do so. To get
247 * around this, we do the access checking for ourselves
248 * and then use VOP_SPACE which doesn't do the access
249 * checking which VOP_SETATTR does. VOP_SPACE can only
250 * operate on VREG files, let VOP_SETATTR handle the other
251 * extremely rare cases.
252 * Also the client should not be allowed to change the
253 * size of the file if there is a conflicting non-blocking
254 * mandatory lock in the region of change.
255 */
256 if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
257 if (nbl_need_check(vp)) {
258 nbl_start_crit(vp, RW_READER);
259 in_crit = 1;
260 }
261
262 bva.va_mask = AT_UID | AT_SIZE;
263
264 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
265
266 if (error) {
267 if (in_crit)
268 nbl_end_crit(vp);
269 VN_RELE(vp);
270 ns->ns_status = puterrno(error);
271 return;
272 }
273
274 if (in_crit) {
275 u_offset_t offset;
276 ssize_t length;
277
278 if (va.va_size < bva.va_size) {
279 offset = va.va_size;
280 length = bva.va_size - va.va_size;
281 } else {
282 offset = bva.va_size;
283 length = va.va_size - bva.va_size;
284 }
285 if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
286 NULL)) {
287 error = EACCES;
288 }
289 }
290
291 if (crgetuid(cr) == bva.va_uid && !error &&
292 va.va_size != bva.va_size) {
293 va.va_mask &= ~AT_SIZE;
294 bf.l_type = F_WRLCK;
295 bf.l_whence = 0;
296 bf.l_start = (off64_t)va.va_size;
297 bf.l_len = 0;
298 bf.l_sysid = 0;
299 bf.l_pid = 0;
300
301 error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
302 (offset_t)va.va_size, cr, &ct);
303 }
304 if (in_crit)
305 nbl_end_crit(vp);
306 } else
307 error = 0;
308
309 /*
310 * Do the setattr.
311 */
312 if (!error && va.va_mask) {
313 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
314 }
315
316 /*
317 * check if the monitor on either vop_space or vop_setattr detected
318 * a delegation conflict and if so, mark the thread flag as
319 * wouldblock so that the response is dropped and the client will
320 * try again.
321 */
322 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
323 VN_RELE(vp);
324 curthread->t_flag |= T_WOULDBLOCK;
325 return;
326 }
327
328 if (!error) {
329 va.va_mask = AT_ALL; /* get everything */
330
331 error = rfs4_delegated_getattr(vp, &va, 0, cr);
332
333 /* check for overflows */
334 if (!error) {
335 acl_perm(vp, exi, &va, cr);
336 error = vattr_to_nattr(&va, &ns->ns_attr);
337 }
338 }
339
340 ct.cc_flags = 0;
341
342 /*
343 * Force modified metadata out to stable storage.
344 */
345 (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
346
347 VN_RELE(vp);
348
349 ns->ns_status = puterrno(error);
350 }
/*
 * Dispatch helper: return the filehandle embedded in SETATTR arguments.
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
356
/*
 * Cross a mount point during a lookup.
 *
 * *vpp is a mount point vnode; traverse into the filesystem mounted
 * on it.  If the submount is exported with "nohide", swap *vpp/*exip
 * to the submount root and its exportinfo, releasing the old
 * references.  If the submount is not exported or lacks "nohide",
 * *vpp and *exip are left untouched (and that is not an error).
 *
 * Returns 0 on success or an errno from traverse()/VOP_FID().
 * @exip and @vpp are changed and released only in success.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Extra hold: traverse() takes over this reference. */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
402
403 /*
404 * Given mounted "dvp" and "exi", go upper mountpoint
405 * with dvp/exi correction
406 * Return 0 in success
407 */
408 int
409 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
410 {
411 struct exportinfo *exi;
412 vnode_t *dvp = *dvpp;
413
414 ASSERT(dvp->v_flag & VROOT);
415
416 VN_HOLD(dvp);
417 dvp = untraverse(dvp);
418 exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
419 if (exi == NULL) {
420 VN_RELE(dvp);
421 return (-1);
422 }
423
424 exi_rele(*exip);
425 *exip = exi;
426 VN_RELE(*dvpp);
427 *dvpp = dvp;
428
429 return (0);
430 }
431 /*
432 * Directory lookup.
433 * Returns an fhandle and file attributes for file name in a directory.
434 */
435 /* ARGSUSED */
436 void
437 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
438 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
439 {
440 int error;
441 vnode_t *dvp;
442 vnode_t *vp;
443 struct vattr va;
444 fhandle_t *fhp = da->da_fhandle;
445 struct sec_ol sec = {0, 0};
446 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
447 char *name;
448 struct sockaddr *ca;
449
450 /*
451 * Trusted Extension doesn't support NFSv2. MOUNT
452 * will reject v2 clients. Need to prevent v2 client
453 * access via WebNFS here.
454 */
455 if (is_system_labeled() && req->rq_vers == 2) {
456 dr->dr_status = NFSERR_ACCES;
457 return;
458 }
459
460 /*
461 * Disallow NULL paths
462 */
463 if (da->da_name == NULL || *da->da_name == '\0') {
464 dr->dr_status = NFSERR_ACCES;
465 return;
466 }
467
468 /*
469 * Allow lookups from the root - the default
470 * location of the public filehandle.
471 */
472 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
473 dvp = ZONE_ROOTVP();
474 VN_HOLD(dvp);
475 } else {
476 dvp = nfs_fhtovp(fhp, exi);
477 if (dvp == NULL) {
478 dr->dr_status = NFSERR_STALE;
479 return;
480 }
481 }
482
483 exi_hold(exi);
484
485 /*
486 * Not allow lookup beyond root.
487 * If the filehandle matches a filehandle of the exi,
488 * then the ".." refers beyond the root of an exported filesystem.
489 */
490 if (strcmp(da->da_name, "..") == 0 &&
491 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
492 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
493 (dvp->v_flag & VROOT)) {
494 /*
495 * special case for ".." and 'nohide'exported root
496 */
497 if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
498 error = NFSERR_ACCES;
499 goto out;
500 }
501 } else {
502 error = NFSERR_NOENT;
503 goto out;
504 }
505 }
506
507 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
508 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
509 MAXPATHLEN);
510
511 if (name == NULL) {
512 error = NFSERR_ACCES;
513 goto out;
514 }
515
516 /*
517 * If the public filehandle is used then allow
518 * a multi-component lookup, i.e. evaluate
519 * a pathname and follow symbolic links if
520 * necessary.
521 *
522 * This may result in a vnode in another filesystem
523 * which is OK as long as the filesystem is exported.
524 */
525 if (PUBLIC_FH2(fhp)) {
526 publicfh_flag = TRUE;
527
528 exi_rele(exi);
529
530 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
531 &sec);
532 } else {
533 /*
534 * Do a normal single component lookup.
535 */
536 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
537 NULL, NULL, NULL);
538 }
539
540 if (name != da->da_name)
541 kmem_free(name, MAXPATHLEN);
542
543 if (error == 0 && vn_ismntpt(vp)) {
544 error = rfs_cross_mnt(&vp, &exi);
545 if (error)
546 VN_RELE(vp);
547 }
548
549 if (!error) {
550 va.va_mask = AT_ALL; /* we want everything */
551
552 error = rfs4_delegated_getattr(vp, &va, 0, cr);
553
554 /* check for overflows */
555 if (!error) {
556 acl_perm(vp, exi, &va, cr);
557 error = vattr_to_nattr(&va, &dr->dr_attr);
558 if (!error) {
559 if (sec.sec_flags & SEC_QUERY)
560 error = makefh_ol(&dr->dr_fhandle, exi,
561 sec.sec_index);
562 else {
563 error = makefh(&dr->dr_fhandle, vp,
564 exi);
565 if (!error && publicfh_flag &&
566 !chk_clnt_sec(exi, req))
567 auth_weak = TRUE;
568 }
569 }
570 }
571 VN_RELE(vp);
572 }
573
574 out:
575 VN_RELE(dvp);
576
577 if (exi != NULL)
578 exi_rele(exi);
579
580 /*
581 * If it's public fh, no 0x81, and client's flavor is
582 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
583 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
584 */
585 if (auth_weak)
586 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
587 else
588 dr->dr_status = puterrno(error);
589 }
/*
 * Dispatch helper: return the directory filehandle of a LOOKUP call.
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
595
596 /*
597 * Read symbolic link.
598 * Returns the string in the symbolic link at the given fhandle.
599 */
600 /* ARGSUSED */
601 void
602 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
603 struct svc_req *req, cred_t *cr, bool_t ro)
604 {
605 int error;
606 struct iovec iov;
607 struct uio uio;
608 vnode_t *vp;
609 struct vattr va;
610 struct sockaddr *ca;
611 char *name = NULL;
612 int is_referral = 0;
613
614 vp = nfs_fhtovp(fhp, exi);
615 if (vp == NULL) {
616 rl->rl_data = NULL;
617 rl->rl_status = NFSERR_STALE;
618 return;
619 }
620
621 va.va_mask = AT_MODE;
622
623 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
624
625 if (error) {
626 VN_RELE(vp);
627 rl->rl_data = NULL;
628 rl->rl_status = puterrno(error);
629 return;
630 }
631
632 if (MANDLOCK(vp, va.va_mode)) {
633 VN_RELE(vp);
634 rl->rl_data = NULL;
635 rl->rl_status = NFSERR_ACCES;
636 return;
637 }
638
639 /* We lied about the object type for a referral */
640 if (vn_is_nfs_reparse(vp, cr))
641 is_referral = 1;
642
643 /*
644 * XNFS and RFC1094 require us to return ENXIO if argument
645 * is not a link. BUGID 1138002.
646 */
647 if (vp->v_type != VLNK && !is_referral) {
648 VN_RELE(vp);
649 rl->rl_data = NULL;
650 rl->rl_status = NFSERR_NXIO;
651 return;
652 }
653
654 /*
655 * Allocate data for pathname. This will be freed by rfs_rlfree.
656 */
657 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
658
659 if (is_referral) {
660 char *s;
661 size_t strsz;
662
663 /* Get an artificial symlink based on a referral */
664 s = build_symlink(vp, cr, &strsz);
665 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
666 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
667 vnode_t *, vp, char *, s);
668 if (s == NULL)
669 error = EINVAL;
670 else {
671 error = 0;
672 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
673 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
674 kmem_free(s, strsz);
675 }
676
677 } else {
678
679 /*
680 * Set up io vector to read sym link data
681 */
682 iov.iov_base = rl->rl_data;
683 iov.iov_len = NFS_MAXPATHLEN;
684 uio.uio_iov = &iov;
685 uio.uio_iovcnt = 1;
686 uio.uio_segflg = UIO_SYSSPACE;
687 uio.uio_extflg = UIO_COPY_CACHED;
688 uio.uio_loffset = (offset_t)0;
689 uio.uio_resid = NFS_MAXPATHLEN;
690
691 /*
692 * Do the readlink.
693 */
694 error = VOP_READLINK(vp, &uio, cr, NULL);
695
696 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
697
698 if (!error)
699 rl->rl_data[rl->rl_count] = '\0';
700
701 }
702
703
704 VN_RELE(vp);
705
706 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
707 name = nfscmd_convname(ca, exi, rl->rl_data,
708 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
709
710 if (name != NULL && name != rl->rl_data) {
711 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
712 rl->rl_data = name;
713 }
714
715 /*
716 * XNFS and RFC1094 require us to return ENXIO if argument
717 * is not a link. UFS returns EINVAL if this is the case,
718 * so we do the mapping here. BUGID 1138002.
719 */
720 if (error == EINVAL)
721 rl->rl_status = NFSERR_NXIO;
722 else
723 rl->rl_status = puterrno(error);
724
725 }
/*
 * Dispatch helper: return the filehandle argument of a READLINK call.
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
731 /*
732 * Free data allocated by rfs_readlink
733 */
734 void
735 rfs_rlfree(struct nfsrdlnres *rl)
736 {
737 if (rl->rl_data != NULL)
738 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
739 }
740
741 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
742
743 /*
744 * Read data.
745 * Returns some data read from the file at the given fhandle.
746 */
747 /* ARGSUSED */
748 void
749 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
750 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
751 {
752 vnode_t *vp;
753 int error;
754 struct vattr va;
755 struct iovec iov;
756 struct uio uio;
757 mblk_t *mp;
758 int alloc_err = 0;
759 int in_crit = 0;
760 caller_context_t ct;
761
762 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
763 if (vp == NULL) {
764 rr->rr_data = NULL;
765 rr->rr_status = NFSERR_STALE;
766 return;
767 }
768
769 if (vp->v_type != VREG) {
770 VN_RELE(vp);
771 rr->rr_data = NULL;
772 rr->rr_status = NFSERR_ISDIR;
773 return;
774 }
775
776 ct.cc_sysid = 0;
777 ct.cc_pid = 0;
778 ct.cc_caller_id = nfs2_srv_caller_id;
779 ct.cc_flags = CC_DONTBLOCK;
780
781 /*
782 * Enter the critical region before calling VOP_RWLOCK
783 * to avoid a deadlock with write requests.
784 */
785 if (nbl_need_check(vp)) {
786 nbl_start_crit(vp, RW_READER);
787 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
788 0, NULL)) {
789 nbl_end_crit(vp);
790 VN_RELE(vp);
791 rr->rr_data = NULL;
792 rr->rr_status = NFSERR_ACCES;
793 return;
794 }
795 in_crit = 1;
796 }
797
798 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
799
800 /* check if a monitor detected a delegation conflict */
801 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
802 if (in_crit)
803 nbl_end_crit(vp);
804 VN_RELE(vp);
805 /* mark as wouldblock so response is dropped */
806 curthread->t_flag |= T_WOULDBLOCK;
807
808 rr->rr_data = NULL;
809 return;
810 }
811
812 va.va_mask = AT_ALL;
813
814 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
815
816 if (error) {
817 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
818 if (in_crit)
819 nbl_end_crit(vp);
820
821 VN_RELE(vp);
822 rr->rr_data = NULL;
823 rr->rr_status = puterrno(error);
824
825 return;
826 }
827
828 /*
829 * This is a kludge to allow reading of files created
830 * with no read permission. The owner of the file
831 * is always allowed to read it.
832 */
833 if (crgetuid(cr) != va.va_uid) {
834 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
835
836 if (error) {
837 /*
838 * Exec is the same as read over the net because
839 * of demand loading.
840 */
841 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
842 }
843 if (error) {
844 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
845 if (in_crit)
846 nbl_end_crit(vp);
847 VN_RELE(vp);
848 rr->rr_data = NULL;
849 rr->rr_status = puterrno(error);
850
851 return;
852 }
853 }
854
855 if (MANDLOCK(vp, va.va_mode)) {
856 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
857 if (in_crit)
858 nbl_end_crit(vp);
859
860 VN_RELE(vp);
861 rr->rr_data = NULL;
862 rr->rr_status = NFSERR_ACCES;
863
864 return;
865 }
866
867 rr->rr_ok.rrok_wlist_len = 0;
868 rr->rr_ok.rrok_wlist = NULL;
869
870 if ((u_offset_t)ra->ra_offset >= va.va_size) {
871 rr->rr_count = 0;
872 rr->rr_data = NULL;
873 /*
874 * In this case, status is NFS_OK, but there is no data
875 * to encode. So set rr_mp to NULL.
876 */
877 rr->rr_mp = NULL;
878 rr->rr_ok.rrok_wlist = ra->ra_wlist;
879 if (rr->rr_ok.rrok_wlist)
880 clist_zero_len(rr->rr_ok.rrok_wlist);
881 goto done;
882 }
883
884 if (ra->ra_wlist) {
885 mp = NULL;
886 rr->rr_mp = NULL;
887 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
888 if (ra->ra_count > iov.iov_len) {
889 rr->rr_data = NULL;
890 rr->rr_status = NFSERR_INVAL;
891 goto done;
892 }
893 } else {
894 /*
895 * mp will contain the data to be sent out in the read reply.
896 * This will be freed after the reply has been sent out (by the
897 * driver).
898 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
899 * that the call to xdrmblk_putmblk() never fails.
900 */
901 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
902 &alloc_err);
903 ASSERT(mp != NULL);
904 ASSERT(alloc_err == 0);
905
906 rr->rr_mp = mp;
907
908 /*
909 * Set up io vector
910 */
911 iov.iov_base = (caddr_t)mp->b_datap->db_base;
912 iov.iov_len = ra->ra_count;
913 }
914
915 uio.uio_iov = &iov;
916 uio.uio_iovcnt = 1;
917 uio.uio_segflg = UIO_SYSSPACE;
918 uio.uio_extflg = UIO_COPY_CACHED;
919 uio.uio_loffset = (offset_t)ra->ra_offset;
920 uio.uio_resid = ra->ra_count;
921
922 error = VOP_READ(vp, &uio, 0, cr, &ct);
923
924 if (error) {
925 if (mp)
926 freeb(mp);
927
928 /*
929 * check if a monitor detected a delegation conflict and
930 * mark as wouldblock so response is dropped
931 */
932 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
933 curthread->t_flag |= T_WOULDBLOCK;
934 else
935 rr->rr_status = puterrno(error);
936
937 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
938 if (in_crit)
939 nbl_end_crit(vp);
940
941 VN_RELE(vp);
942 rr->rr_data = NULL;
943
944 return;
945 }
946
947 /*
948 * Get attributes again so we can send the latest access
949 * time to the client side for its cache.
950 */
951 va.va_mask = AT_ALL;
952
953 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
954
955 if (error) {
956 if (mp)
957 freeb(mp);
958
959 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
960 if (in_crit)
961 nbl_end_crit(vp);
962
963 VN_RELE(vp);
964 rr->rr_data = NULL;
965 rr->rr_status = puterrno(error);
966
967 return;
968 }
969
970 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
971
972 if (mp) {
973 rr->rr_data = (char *)mp->b_datap->db_base;
974 } else {
975 if (ra->ra_wlist) {
976 rr->rr_data = (caddr_t)iov.iov_base;
977 if (!rdma_setup_read_data2(ra, rr)) {
978 rr->rr_data = NULL;
979 rr->rr_status = puterrno(NFSERR_INVAL);
980 }
981 }
982 }
983 done:
984 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
985 if (in_crit)
986 nbl_end_crit(vp);
987
988 acl_perm(vp, exi, &va, cr);
989
990 /* check for overflows */
991 error = vattr_to_nattr(&va, &rr->rr_attr);
992
993 VN_RELE(vp);
994
995 rr->rr_status = puterrno(error);
996 }
997
998 /*
999 * Free data allocated by rfs_read
1000 */
1001 void
1002 rfs_rdfree(struct nfsrdresult *rr)
1003 {
1004 mblk_t *mp;
1005
1006 if (rr->rr_status == NFS_OK) {
1007 mp = rr->rr_mp;
1008 if (mp != NULL)
1009 freeb(mp);
1010 }
1011 }
1012
/*
 * Dispatch helper: return the filehandle embedded in READ arguments.
 */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
1018
/* Iovec entries kept on the stack; longer mblk chains use kmem_alloc */
#define MAX_IOVECS 12

#ifdef DEBUG
/* Counters: write requests that fit the stack iovec array vs. not */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1025
1026 /*
1027 * Write data to file.
1028 * Returns attributes of a file after writing some data to it.
1029 *
1030 * Any changes made here, especially in error handling might have
1031 * to also be done in rfs_write (which clusters write requests).
1032 */
1033 /* ARGSUSED */
1034 void
1035 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1036 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1037 {
1038 int error;
1039 vnode_t *vp;
1040 rlim64_t rlimit;
1041 struct vattr va;
1042 struct uio uio;
1043 struct iovec iov[MAX_IOVECS];
1044 mblk_t *m;
1045 struct iovec *iovp;
1046 int iovcnt;
1047 cred_t *savecred;
1048 int in_crit = 0;
1049 caller_context_t ct;
1050
1051 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1052 if (vp == NULL) {
1053 ns->ns_status = NFSERR_STALE;
1054 return;
1055 }
1056
1057 if (rdonly(ro, vp)) {
1058 VN_RELE(vp);
1059 ns->ns_status = NFSERR_ROFS;
1060 return;
1061 }
1062
1063 if (vp->v_type != VREG) {
1064 VN_RELE(vp);
1065 ns->ns_status = NFSERR_ISDIR;
1066 return;
1067 }
1068
1069 ct.cc_sysid = 0;
1070 ct.cc_pid = 0;
1071 ct.cc_caller_id = nfs2_srv_caller_id;
1072 ct.cc_flags = CC_DONTBLOCK;
1073
1074 va.va_mask = AT_UID|AT_MODE;
1075
1076 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1077
1078 if (error) {
1079 VN_RELE(vp);
1080 ns->ns_status = puterrno(error);
1081
1082 return;
1083 }
1084
1085 if (crgetuid(cr) != va.va_uid) {
1086 /*
1087 * This is a kludge to allow writes of files created
1088 * with read only permission. The owner of the file
1089 * is always allowed to write it.
1090 */
1091 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1092
1093 if (error) {
1094 VN_RELE(vp);
1095 ns->ns_status = puterrno(error);
1096 return;
1097 }
1098 }
1099
1100 /*
1101 * Can't access a mandatory lock file. This might cause
1102 * the NFS service thread to block forever waiting for a
1103 * lock to be released that will never be released.
1104 */
1105 if (MANDLOCK(vp, va.va_mode)) {
1106 VN_RELE(vp);
1107 ns->ns_status = NFSERR_ACCES;
1108 return;
1109 }
1110
1111 /*
1112 * We have to enter the critical region before calling VOP_RWLOCK
1113 * to avoid a deadlock with ufs.
1114 */
1115 if (nbl_need_check(vp)) {
1116 nbl_start_crit(vp, RW_READER);
1117 in_crit = 1;
1118 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1119 wa->wa_count, 0, NULL)) {
1120 error = EACCES;
1121 goto out;
1122 }
1123 }
1124
1125 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1126
1127 /* check if a monitor detected a delegation conflict */
1128 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1129 goto out;
1130 }
1131
1132 if (wa->wa_data || wa->wa_rlist) {
1133 /* Do the RDMA thing if necessary */
1134 if (wa->wa_rlist) {
1135 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1136 iov[0].iov_len = wa->wa_count;
1137 } else {
1138 iov[0].iov_base = wa->wa_data;
1139 iov[0].iov_len = wa->wa_count;
1140 }
1141 uio.uio_iov = iov;
1142 uio.uio_iovcnt = 1;
1143 uio.uio_segflg = UIO_SYSSPACE;
1144 uio.uio_extflg = UIO_COPY_DEFAULT;
1145 uio.uio_loffset = (offset_t)wa->wa_offset;
1146 uio.uio_resid = wa->wa_count;
1147 /*
1148 * The limit is checked on the client. We
1149 * should allow any size writes here.
1150 */
1151 uio.uio_llimit = curproc->p_fsz_ctl;
1152 rlimit = uio.uio_llimit - wa->wa_offset;
1153 if (rlimit < (rlim64_t)uio.uio_resid)
1154 uio.uio_resid = (uint_t)rlimit;
1155
1156 /*
1157 * for now we assume no append mode
1158 */
1159 /*
1160 * We're changing creds because VM may fault and we need
1161 * the cred of the current thread to be used if quota
1162 * checking is enabled.
1163 */
1164 savecred = curthread->t_cred;
1165 curthread->t_cred = cr;
1166 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1167 curthread->t_cred = savecred;
1168 } else {
1169
1170 iovcnt = 0;
1171 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1172 iovcnt++;
1173 if (iovcnt <= MAX_IOVECS) {
1174 #ifdef DEBUG
1175 rfs_write_sync_hits++;
1176 #endif
1177 iovp = iov;
1178 } else {
1179 #ifdef DEBUG
1180 rfs_write_sync_misses++;
1181 #endif
1182 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1183 }
1184 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1185 uio.uio_iov = iovp;
1186 uio.uio_iovcnt = iovcnt;
1187 uio.uio_segflg = UIO_SYSSPACE;
1188 uio.uio_extflg = UIO_COPY_DEFAULT;
1189 uio.uio_loffset = (offset_t)wa->wa_offset;
1190 uio.uio_resid = wa->wa_count;
1191 /*
1192 * The limit is checked on the client. We
1193 * should allow any size writes here.
1194 */
1195 uio.uio_llimit = curproc->p_fsz_ctl;
1196 rlimit = uio.uio_llimit - wa->wa_offset;
1197 if (rlimit < (rlim64_t)uio.uio_resid)
1198 uio.uio_resid = (uint_t)rlimit;
1199
1200 /*
1201 * For now we assume no append mode.
1202 */
1203 /*
1204 * We're changing creds because VM may fault and we need
1205 * the cred of the current thread to be used if quota
1206 * checking is enabled.
1207 */
1208 savecred = curthread->t_cred;
1209 curthread->t_cred = cr;
1210 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1211 curthread->t_cred = savecred;
1212
1213 if (iovp != iov)
1214 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1215 }
1216
1217 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1218
1219 if (!error) {
1220 /*
1221 * Get attributes again so we send the latest mod
1222 * time to the client side for its cache.
1223 */
1224 va.va_mask = AT_ALL; /* now we want everything */
1225
1226 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1227
1228 /* check for overflows */
1229 if (!error) {
1230 acl_perm(vp, exi, &va, cr);
1231 error = vattr_to_nattr(&va, &ns->ns_attr);
1232 }
1233 }
1234
1235 out:
1236 if (in_crit)
1237 nbl_end_crit(vp);
1238 VN_RELE(vp);
1239
1240 /* check if a monitor detected a delegation conflict */
1241 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1242 /* mark as wouldblock so response is dropped */
1243 curthread->t_flag |= T_WOULDBLOCK;
1244 else
1245 ns->ns_status = puterrno(error);
1246
1247 }
1248
/*
 * One NFSv2 WRITE request parked on a write cluster; the thread that
 * owns the cluster performs the combined write and the waiters learn
 * completion when their ns_status leaves RFSWRITE_INITVAL.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* write arguments from the client */
	struct nfsattrstat *ns;		/* reply; also the completion flag */
	struct svc_req *req;
	cred_t *cr;
	bool_t ro;			/* export mounted read-only */
	kthread_t *thread;		/* requesting service thread */
	struct rfs_async_write *list;	/* next request in this cluster */
};
1258
/*
 * A cluster of pending WRITEs to a single file (identified by fhp),
 * linked on the zone's async_write_head list.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file all entries write to */
	kcondvar_t cv;			/* broadcast when the writes finish */
	struct rfs_async_write *list;	/* requests, sorted by offset */
	struct rfs_async_write_list *next;	/* cluster for another file */
};
1265
/*
 * NOTE(review): these file-scope globals appear superseded by the
 * per-zone state in nfs_srv_t (async_write_lock, async_write_head,
 * write_async), which is what rfs_write() actually consults; confirm
 * no remaining users before removing them.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1; /* enables write clustering if == 1 */

/* Iovec entries kept on the stack by rfs_write's clustered path */
#define MAXCLIOVECS 42
/* Sentinel: reply not yet filled in (0 would read as NFS_OK) */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* Counters: clustered writes that fit the stack iovec array vs. not */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1277
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * WRITE requests against the same file handle are gathered into a
 * "cluster" so that contiguous requests can be issued with a single
 * VOP_WRITE.  The first thread to start a cluster becomes responsible
 * for processing every request queued on it; the other threads simply
 * queue their request and sleep on the cluster's condition variable
 * until their response status has been filled in.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];	/* on-stack iovecs for common case */
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;	/* this thread's request entry */
	struct rfs_async_write_list nlpsp;	/* cluster head, if we start one */
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;
	nfs_srv_t *nsrv;

	/* Per-zone server state; clustering can be disabled per zone. */
	nsrv = zone_getspecific(rfs_zone_key, curzone);
	if (!nsrv->write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	/* Stack entries are handed to other threads; we must not be swapped. */
	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		/* Insertion sort by starting offset. */
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/* Sleep until the cluster owner fills in our status. */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/* Unlink the cluster and fail every queued request. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		/* Extend the cluster's byte range to cover this request. */
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			/*
			 * Stop growing the run at the first request that
			 * errored out or is not byte-contiguous with the
			 * previous one.  lrp ends up one past the run.
			 */
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				/*
				 * Walk the mblk chain, clamping the last
				 * fragment so no more than wa_count bytes
				 * are gathered.
				 */
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota * checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/* Fail any stragglers (e.g. flush errors), then wake all waiters. */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}
1778
1779 void *
1780 rfs_write_getfh(struct nfswriteargs *wa)
1781 {
1782 return (&wa->wa_fhandle);
1783 }
1784
1785 /*
1786 * Create a file.
1787 * Creates a file with given attributes and returns those attributes
1788 * and an fhandle for the new file.
1789 */
1790 void
1791 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1792 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1793 {
1794 int error;
1795 int lookuperr;
1796 int in_crit = 0;
1797 struct vattr va;
1798 vnode_t *vp;
1799 vnode_t *realvp;
1800 vnode_t *dvp;
1801 char *name = args->ca_da.da_name;
1802 vnode_t *tvp = NULL;
1803 int mode;
1804 int lookup_ok;
1805 bool_t trunc;
1806 struct sockaddr *ca;
1807
1808 /*
1809 * Disallow NULL paths
1810 */
1811 if (name == NULL || *name == '\0') {
1812 dr->dr_status = NFSERR_ACCES;
1813 return;
1814 }
1815
1816 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1817 if (dvp == NULL) {
1818 dr->dr_status = NFSERR_STALE;
1819 return;
1820 }
1821
1822 error = sattr_to_vattr(args->ca_sa, &va);
1823 if (error) {
1824 dr->dr_status = puterrno(error);
1825 return;
1826 }
1827
1828 /*
1829 * Must specify the mode.
1830 */
1831 if (!(va.va_mask & AT_MODE)) {
1832 VN_RELE(dvp);
1833 dr->dr_status = NFSERR_INVAL;
1834 return;
1835 }
1836
1837 /*
1838 * This is a completely gross hack to make mknod
1839 * work over the wire until we can wack the protocol
1840 */
1841 if ((va.va_mode & IFMT) == IFCHR) {
1842 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1843 va.va_type = VFIFO; /* xtra kludge for named pipe */
1844 else {
1845 va.va_type = VCHR;
1846 /*
1847 * uncompress the received dev_t
1848 * if the top half is zero indicating a request
1849 * from an `older style' OS.
1850 */
1851 if ((va.va_size & 0xffff0000) == 0)
1852 va.va_rdev = nfsv2_expdev(va.va_size);
1853 else
1854 va.va_rdev = (dev_t)va.va_size;
1855 }
1856 va.va_mask &= ~AT_SIZE;
1857 } else if ((va.va_mode & IFMT) == IFBLK) {
1858 va.va_type = VBLK;
1859 /*
1860 * uncompress the received dev_t
1861 * if the top half is zero indicating a request
1862 * from an `older style' OS.
1863 */
1864 if ((va.va_size & 0xffff0000) == 0)
1865 va.va_rdev = nfsv2_expdev(va.va_size);
1866 else
1867 va.va_rdev = (dev_t)va.va_size;
1868 va.va_mask &= ~AT_SIZE;
1869 } else if ((va.va_mode & IFMT) == IFSOCK) {
1870 va.va_type = VSOCK;
1871 } else {
1872 va.va_type = VREG;
1873 }
1874 va.va_mode &= ~IFMT;
1875 va.va_mask |= AT_TYPE;
1876
1877 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1878 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1879 MAXPATHLEN);
1880 if (name == NULL) {
1881 dr->dr_status = puterrno(EINVAL);
1882 return;
1883 }
1884
1885 /*
1886 * Why was the choice made to use VWRITE as the mode to the
1887 * call to VOP_CREATE ? This results in a bug. When a client
1888 * opens a file that already exists and is RDONLY, the second
1889 * open fails with an EACESS because of the mode.
1890 * bug ID 1054648.
1891 */
1892 lookup_ok = 0;
1893 mode = VWRITE;
1894 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1895 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1896 NULL, NULL, NULL);
1897 if (!error) {
1898 struct vattr at;
1899
1900 lookup_ok = 1;
1901 at.va_mask = AT_MODE;
1902 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1903 if (!error)
1904 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1905 VN_RELE(tvp);
1906 tvp = NULL;
1907 }
1908 }
1909
1910 if (!lookup_ok) {
1911 if (rdonly(ro, dvp)) {
1912 error = EROFS;
1913 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1914 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1915 error = EPERM;
1916 } else {
1917 error = 0;
1918 }
1919 }
1920
1921 /*
1922 * If file size is being modified on an already existing file
1923 * make sure that there are no conflicting non-blocking mandatory
1924 * locks in the region being manipulated. Return EACCES if there
1925 * are conflicting locks.
1926 */
1927 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1928 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1929 NULL, NULL, NULL);
1930
1931 if (!lookuperr &&
1932 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1933 VN_RELE(tvp);
1934 curthread->t_flag |= T_WOULDBLOCK;
1935 goto out;
1936 }
1937
1938 if (!lookuperr && nbl_need_check(tvp)) {
1939 /*
1940 * The file exists. Now check if it has any
1941 * conflicting non-blocking mandatory locks
1942 * in the region being changed.
1943 */
1944 struct vattr bva;
1945 u_offset_t offset;
1946 ssize_t length;
1947
1948 nbl_start_crit(tvp, RW_READER);
1949 in_crit = 1;
1950
1951 bva.va_mask = AT_SIZE;
1952 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1953 if (!error) {
1954 if (va.va_size < bva.va_size) {
1955 offset = va.va_size;
1956 length = bva.va_size - va.va_size;
1957 } else {
1958 offset = bva.va_size;
1959 length = va.va_size - bva.va_size;
1960 }
1961 if (length) {
1962 if (nbl_conflict(tvp, NBL_WRITE,
1963 offset, length, 0, NULL)) {
1964 error = EACCES;
1965 }
1966 }
1967 }
1968 if (error) {
1969 nbl_end_crit(tvp);
1970 VN_RELE(tvp);
1971 in_crit = 0;
1972 }
1973 } else if (tvp != NULL) {
1974 VN_RELE(tvp);
1975 }
1976 }
1977
1978 if (!error) {
1979 /*
1980 * If filesystem is shared with nosuid the remove any
1981 * setuid/setgid bits on create.
1982 */
1983 if (va.va_type == VREG &&
1984 exi->exi_export.ex_flags & EX_NOSUID)
1985 va.va_mode &= ~(VSUID | VSGID);
1986
1987 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1988 NULL, NULL);
1989
1990 if (!error) {
1991
1992 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1993 trunc = TRUE;
1994 else
1995 trunc = FALSE;
1996
1997 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1998 VN_RELE(vp);
1999 curthread->t_flag |= T_WOULDBLOCK;
2000 goto out;
2001 }
2002 va.va_mask = AT_ALL;
2003
2004 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2005
2006 /* check for overflows */
2007 if (!error) {
2008 acl_perm(vp, exi, &va, cr);
2009 error = vattr_to_nattr(&va, &dr->dr_attr);
2010 if (!error) {
2011 error = makefh(&dr->dr_fhandle, vp,
2012 exi);
2013 }
2014 }
2015 /*
2016 * Force modified metadata out to stable storage.
2017 *
2018 * if a underlying vp exists, pass it to VOP_FSYNC
2019 */
2020 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2021 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2022 else
2023 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2024 VN_RELE(vp);
2025 }
2026
2027 if (in_crit) {
2028 nbl_end_crit(tvp);
2029 VN_RELE(tvp);
2030 }
2031 }
2032
2033 /*
2034 * Force modified data and metadata out to stable storage.
2035 */
2036 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2037
2038 out:
2039
2040 VN_RELE(dvp);
2041
2042 dr->dr_status = puterrno(error);
2043
2044 if (name != args->ca_da.da_name)
2045 kmem_free(name, MAXPATHLEN);
2046 }
2047 void *
2048 rfs_create_getfh(struct nfscreatargs *args)
2049 {
2050 return (args->ca_da.da_fhandle);
2051 }
2052
/*
 * Remove a file.
 * Remove named file from parent directory.
 *
 * The target is looked up first so that NFSv4 delegations and
 * non-blocking mandatory (nbmand) share reservations can be checked
 * before the actual VOP_REMOVE.
 */
/* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* parent directory */
	vnode_t *targvp;	/* file being removed */
	int in_crit = 0;

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Hold the nbmand critical region across the remove. */
	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2138
2139 void *
2140 rfs_remove_getfh(struct nfsdiropargs *da)
2141 {
2142 return (da->da_fhandle);
2143 }
2144
/*
 * rename a file
 * Give a file (from) a new name (to).
 *
 * Both directories must belong to the same export (NFSv2 has no
 * cross-filesystem rename); delegation and nbmand conflicts on both
 * the source and any existing target are resolved before VOP_RENAME.
 */
/* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* file being renamed */
	vnode_t *targvp;	/* existing file at the target name, if any */
	int in_crit = 0;

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/* The target directory must be in the same export as the source. */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		/* drop the request; client retries after delegation recall */
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	/* Hold the nbmand critical region on the source across the rename. */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Keep the cached v_path of the renamed vnode up to date. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2288 void *
2289 rfs_rename_getfh(struct nfsrnmargs *args)
2290 {
2291 return (args->rna_from.da_fhandle);
2292 }
2293
2294 /*
2295 * Link to a file.
2296 * Create a file (to) which is a hard link to the given file (from).
2297 */
2298 /* ARGSUSED */
2299 void
2300 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2301 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2302 {
2303 int error;
2304 vnode_t *fromvp;
2305 vnode_t *tovp;
2306 struct exportinfo *to_exi;
2307 fhandle_t *fh;
2308
2309 fromvp = nfs_fhtovp(args->la_from, exi);
2310 if (fromvp == NULL) {
2311 *status = NFSERR_STALE;
2312 return;
2313 }
2314
2315 fh = args->la_to.da_fhandle;
2316 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2317 if (to_exi == NULL) {
2318 VN_RELE(fromvp);
2319 *status = NFSERR_ACCES;
2320 return;
2321 }
2322 exi_rele(to_exi);
2323
2324 if (to_exi != exi) {
2325 VN_RELE(fromvp);
2326 *status = NFSERR_XDEV;
2327 return;
2328 }
2329
2330 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2331 if (tovp == NULL) {
2332 VN_RELE(fromvp);
2333 *status = NFSERR_STALE;
2334 return;
2335 }
2336
2337 if (tovp->v_type != VDIR) {
2338 VN_RELE(tovp);
2339 VN_RELE(fromvp);
2340 *status = NFSERR_NOTDIR;
2341 return;
2342 }
2343 /*
2344 * Disallow NULL paths
2345 */
2346 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2347 VN_RELE(tovp);
2348 VN_RELE(fromvp);
2349 *status = NFSERR_ACCES;
2350 return;
2351 }
2352
2353 if (rdonly(ro, tovp)) {
2354 VN_RELE(tovp);
2355 VN_RELE(fromvp);
2356 *status = NFSERR_ROFS;
2357 return;
2358 }
2359
2360 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2361
2362 /*
2363 * Force modified data and metadata out to stable storage.
2364 */
2365 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2366 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2367
2368 VN_RELE(tovp);
2369 VN_RELE(fromvp);
2370
2371 *status = puterrno(error);
2372
2373 }
2374 void *
2375 rfs_link_getfh(struct nfslinkargs *args)
2376 {
2377 return (args->la_from);
2378 }
2379
2380 /*
2381 * Symbolicly link to a file.
2382 * Create a file (to) with the given attributes which is a symbolic link
2383 * to the given path name (to).
2384 */
2385 void
2386 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2387 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2388 {
2389 int error;
2390 struct vattr va;
2391 vnode_t *vp;
2392 vnode_t *svp;
2393 int lerror;
2394 struct sockaddr *ca;
2395 char *name = NULL;
2396
2397 /*
2398 * Disallow NULL paths
2399 */
2400 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2401 *status = NFSERR_ACCES;
2402 return;
2403 }
2404
2405 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2406 if (vp == NULL) {
2407 *status = NFSERR_STALE;
2408 return;
2409 }
2410
2411 if (rdonly(ro, vp)) {
2412 VN_RELE(vp);
2413 *status = NFSERR_ROFS;
2414 return;
2415 }
2416
2417 error = sattr_to_vattr(args->sla_sa, &va);
2418 if (error) {
2419 VN_RELE(vp);
2420 *status = puterrno(error);
2421 return;
2422 }
2423
2424 if (!(va.va_mask & AT_MODE)) {
2425 VN_RELE(vp);
2426 *status = NFSERR_INVAL;
2427 return;
2428 }
2429
2430 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2431 name = nfscmd_convname(ca, exi, args->sla_tnm,
2432 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2433
2434 if (name == NULL) {
2435 *status = NFSERR_ACCES;
2436 return;
2437 }
2438
2439 va.va_type = VLNK;
2440 va.va_mask |= AT_TYPE;
2441
2442 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2443
2444 /*
2445 * Force new data and metadata out to stable storage.
2446 */
2447 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2448 NULL, cr, NULL, NULL, NULL);
2449
2450 if (!lerror) {
2451 (void) VOP_FSYNC(svp, 0, cr, NULL);
2452 VN_RELE(svp);
2453 }
2454
2455 /*
2456 * Force modified data and metadata out to stable storage.
2457 */
2458 (void) VOP_FSYNC(vp, 0, cr, NULL);
2459
2460 VN_RELE(vp);
2461
2462 *status = puterrno(error);
2463 if (name != args->sla_tnm)
2464 kmem_free(name, MAXPATHLEN);
2465
2466 }
2467 void *
2468 rfs_symlink_getfh(struct nfsslargs *args)
2469 {
2470 return (args->sla_from.da_fhandle);
2471 }
2472
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
/* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* The mode is mandatory for directory creation. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attribtutes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL;	/* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is passed the parent
			 * vnode (vp) while va holds the new directory's
			 * (dvp's) attributes; rfs_create() passes the
			 * created vnode here — confirm whether dvp was
			 * intended.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2560 void *
2561 rfs_mkdir_getfh(struct nfscreatargs *args)
2562 {
2563 return (args->ca_da.da_fhandle);
2564 }
2565
2566 /*
2567 * Remove a directory.
2568 * Remove the given directory name from the given parent directory.
2569 */
2570 /* ARGSUSED */
2571 void
2572 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2573 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2574 {
2575 int error;
2576 vnode_t *vp;
2577
2578 /*
2579 * Disallow NULL paths
2580 */
2581 if (da->da_name == NULL || *da->da_name == '\0') {
2582 *status = NFSERR_ACCES;
2583 return;
2584 }
2585
2586 vp = nfs_fhtovp(da->da_fhandle, exi);
2587 if (vp == NULL) {
2588 *status = NFSERR_STALE;
2589 return;
2590 }
2591
2592 if (rdonly(ro, vp)) {
2593 VN_RELE(vp);
2594 *status = NFSERR_ROFS;
2595 return;
2596 }
2597
2598 /*
2599 * VOP_RMDIR takes a third argument (the current
2600 * directory of the process). That's because someone
2601 * wants to return EINVAL if one tries to remove ".".
2602 * Of course, NFS servers have no idea what their
2603 * clients' current directories are. We fake it by
2604 * supplying a vnode known to exist and illegal to
2605 * remove.
2606 */
2607 error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2608
2609 /*
2610 * Force modified data and metadata out to stable storage.
2611 */
2612 (void) VOP_FSYNC(vp, 0, cr, NULL);
2613
2614 VN_RELE(vp);
2615
2616 /*
2617 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2618 * if the directory is not empty. A System V NFS server
2619 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2620 * over the wire.
2621 */
2622 if (error == EEXIST)
2623 *status = NFSERR_NOTEMPTY;
2624 else
2625 *status = puterrno(error);
2626
2627 }
2628 void *
2629 rfs_rmdir_getfh(struct nfsdiropargs *da)
2630 {
2631 return (da->da_fhandle);
2632 }
2633
2634 /* ARGSUSED */
2635 void
2636 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2637 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2638 {
2639 int error;
2640 int iseof;
2641 struct iovec iov;
2642 struct uio uio;
2643 vnode_t *vp;
2644 char *ndata = NULL;
2645 struct sockaddr *ca;
2646 size_t nents;
2647 int ret;
2648
2649 vp = nfs_fhtovp(&rda->rda_fh, exi);
2650 if (vp == NULL) {
2651 rd->rd_entries = NULL;
2652 rd->rd_status = NFSERR_STALE;
2653 return;
2654 }
2655
2656 if (vp->v_type != VDIR) {
2657 VN_RELE(vp);
2658 rd->rd_entries = NULL;
2659 rd->rd_status = NFSERR_NOTDIR;
2660 return;
2661 }
2662
2663 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2664
2665 error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2666
2667 if (error) {
2668 rd->rd_entries = NULL;
2669 goto bad;
2670 }
2671
2672 if (rda->rda_count == 0) {
2673 rd->rd_entries = NULL;
2674 rd->rd_size = 0;
2675 rd->rd_eof = FALSE;
2676 goto bad;
2677 }
2678
2679 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2680
2681 /*
2682 * Allocate data for entries. This will be freed by rfs_rddirfree.
2683 */
2684 rd->rd_bufsize = (uint_t)rda->rda_count;
2685 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2686
2687 /*
2688 * Set up io vector to read directory data
2689 */
2690 iov.iov_base = (caddr_t)rd->rd_entries;
2691 iov.iov_len = rda->rda_count;
2692 uio.uio_iov = &iov;
2693 uio.uio_iovcnt = 1;
2694 uio.uio_segflg = UIO_SYSSPACE;
2695 uio.uio_extflg = UIO_COPY_CACHED;
2696 uio.uio_loffset = (offset_t)rda->rda_offset;
2697 uio.uio_resid = rda->rda_count;
2698
2699 /*
2700 * read directory
2701 */
2702 error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2703
2704 /*
2705 * Clean up
2706 */
2707 if (!error) {
2708 /*
2709 * set size and eof
2710 */
2711 if (uio.uio_resid == rda->rda_count) {
2712 rd->rd_size = 0;
2713 rd->rd_eof = TRUE;
2714 } else {
2715 rd->rd_size = (uint32_t)(rda->rda_count -
2716 uio.uio_resid);
2717 rd->rd_eof = iseof ? TRUE : FALSE;
2718 }
2719 }
2720
2721 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2722 nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2723 ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2724 rda->rda_count, &ndata);
2725
2726 if (ret != 0) {
2727 size_t dropbytes;
2728 /*
2729 * We had to drop one or more entries in order to fit
2730 * during the character conversion. We need to patch
2731 * up the size and eof info.
2732 */
2733 if (rd->rd_eof)
2734 rd->rd_eof = FALSE;
2735 dropbytes = nfscmd_dropped_entrysize(
2736 (struct dirent64 *)rd->rd_entries, nents, ret);
2737 rd->rd_size -= dropbytes;
2738 }
2739 if (ndata == NULL) {
2740 ndata = (char *)rd->rd_entries;
2741 } else if (ndata != (char *)rd->rd_entries) {
2742 kmem_free(rd->rd_entries, rd->rd_bufsize);
2743 rd->rd_entries = (void *)ndata;
2744 rd->rd_bufsize = rda->rda_count;
2745 }
2746
2747 bad:
2748 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2749
2750 #if 0 /* notyet */
2751 /*
2752 * Don't do this. It causes local disk writes when just
2753 * reading the file and the overhead is deemed larger
2754 * than the benefit.
2755 */
2756 /*
2757 * Force modified metadata out to stable storage.
2758 */
2759 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2760 #endif
2761
2762 VN_RELE(vp);
2763
2764 rd->rd_status = puterrno(error);
2765
2766 }
2767 void *
2768 rfs_readdir_getfh(struct nfsrddirargs *rda)
2769 {
2770 return (&rda->rda_fh);
2771 }
2772 void
2773 rfs_rddirfree(struct nfsrddirres *rd)
2774 {
2775 if (rd->rd_entries != NULL)
2776 kmem_free(rd->rd_entries, rd->rd_bufsize);
2777 }
2778
2779 /* ARGSUSED */
2780 void
2781 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2782 struct svc_req *req, cred_t *cr, bool_t ro)
2783 {
2784 int error;
2785 struct statvfs64 sb;
2786 vnode_t *vp;
2787
2788 vp = nfs_fhtovp(fh, exi);
2789 if (vp == NULL) {
2790 fs->fs_status = NFSERR_STALE;
2791 return;
2792 }
2793
2794 error = VFS_STATVFS(vp->v_vfsp, &sb);
2795
2796 if (!error) {
2797 fs->fs_tsize = nfstsize();
2798 fs->fs_bsize = sb.f_frsize;
2799 fs->fs_blocks = sb.f_blocks;
2800 fs->fs_bfree = sb.f_bfree;
2801 fs->fs_bavail = sb.f_bavail;
2802 }
2803
2804 VN_RELE(vp);
2805
2806 fs->fs_status = puterrno(error);
2807
2808 }
2809 void *
2810 rfs_statfs_getfh(fhandle_t *fh)
2811 {
2812 return (fh);
2813 }
2814
/*
 * Convert the over-the-wire NFSv2 settable attributes ("sa") into a
 * vattr ("vap") for the VFS layer.  A wire value of all-ones (-1)
 * means "do not change this attribute" and the corresponding bit is
 * left out of va_mask.  Returns 0 on success, or EOVERFLOW (32-bit
 * kernels only) when a wire time cannot fit in time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short. When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both tv_sec and tv_usec must be set for the time to count. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		/* wire times are microseconds; vattr wants nanoseconds */
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		/* wire times are microseconds; vattr wants nanoseconds */
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2881
/*
 * Map vnode types (vtype_t, used as the index) to over-the-wire
 * NFSv2 file types.  Types with no NFSv2 equivalent map to 0.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2885
2886 /*
2887 * check the following fields for overflow: nodeid, size, and time.
2888 * There could be a problem when converting 64-bit LP64 fields
2889 * into 32-bit ones. Return an error if there is an overflow.
2890 */
2891 int
2892 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2893 {
2894 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2895 na->na_type = vt_to_nf[vap->va_type];
2896
2897 if (vap->va_mode == (unsigned short) -1)
2898 na->na_mode = (uint32_t)-1;
2899 else
2900 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2901
2902 if (vap->va_uid == (unsigned short)(-1))
2903 na->na_uid = (uint32_t)(-1);
2904 else if (vap->va_uid == UID_NOBODY)
2905 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2906 else
2907 na->na_uid = vap->va_uid;
2908
2909 if (vap->va_gid == (unsigned short)(-1))
2910 na->na_gid = (uint32_t)-1;
2911 else if (vap->va_gid == GID_NOBODY)
2912 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2913 else
2914 na->na_gid = vap->va_gid;
2915
2916 /*
2917 * Do we need to check fsid for overflow? It is 64-bit in the
2918 * vattr, but are bigger than 32 bit values supported?
2919 */
2920 na->na_fsid = vap->va_fsid;
2921
2922 na->na_nodeid = vap->va_nodeid;
2923
2924 /*
2925 * Check to make sure that the nodeid is representable over the
2926 * wire without losing bits.
2927 */
2928 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2929 return (EFBIG);
2930 na->na_nlink = vap->va_nlink;
2931
2932 /*
2933 * Check for big files here, instead of at the caller. See
2934 * comments in cstat for large special file explanation.
2935 */
2936 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2937 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2938 return (EFBIG);
2939 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2940 /* UNKNOWN_SIZE | OVERFLOW */
2941 na->na_size = MAXOFF32_T;
2942 } else
2943 na->na_size = vap->va_size;
2944 } else
2945 na->na_size = vap->va_size;
2946
2947 /*
2948 * If the vnode times overflow the 32-bit times that NFS2
2949 * uses on the wire then return an error.
2950 */
2951 if (!NFS_VAP_TIME_OK(vap)) {
2952 return (EOVERFLOW);
2953 }
2954 na->na_atime.tv_sec = vap->va_atime.tv_sec;
2955 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2956
2957 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2958 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2959
2960 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2961 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2962
2963 /*
2964 * If the dev_t will fit into 16 bits then compress
2965 * it, otherwise leave it alone. See comments in
2966 * nfs_client.c.
2967 */
2968 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2969 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2970 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2971 else
2972 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2973
2974 na->na_blocks = vap->va_nblocks;
2975 na->na_blocksize = vap->va_blksize;
2976
2977 /*
2978 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2979 * over-the-wire protocols for named-pipe vnodes. It remaps the
2980 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2981 *
2982 * BUYER BEWARE:
2983 * If you are porting the NFS to a non-Sun server, you probably
2984 * don't want to include the following block of code. The
2985 * over-the-wire special file types will be changing with the
2986 * NFS Protocol Revision.
2987 */
2988 if (vap->va_type == VFIFO)
2989 NA_SETFIFO(na);
2990 return (0);
2991 }
2992
2993 /*
2994 * acl v2 support: returns approximate permission.
2995 * default: returns minimal permission (more restrictive)
2996 * aclok: returns maximal permission (less restrictive)
2997 * This routine changes the permissions that are alaredy in *va.
2998 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2999 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3000 */
3001 static void
3002 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3003 {
3004 vsecattr_t vsa;
3005 int aclcnt;
3006 aclent_t *aclentp;
3007 mode_t mask_perm;
3008 mode_t grp_perm;
3009 mode_t other_perm;
3010 mode_t other_orig;
3011 int error;
3012
3013 /* dont care default acl */
3014 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3015 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3016
3017 if (!error) {
3018 aclcnt = vsa.vsa_aclcnt;
3019 if (aclcnt > MIN_ACL_ENTRIES) {
3020 /* non-trivial ACL */
3021 aclentp = vsa.vsa_aclentp;
3022 if (exi->exi_export.ex_flags & EX_ACLOK) {
3023 /* maximal permissions */
3024 grp_perm = 0;
3025 other_perm = 0;
3026 for (; aclcnt > 0; aclcnt--, aclentp++) {
3027 switch (aclentp->a_type) {
3028 case USER_OBJ:
3029 break;
3030 case USER:
3031 grp_perm |=
3032 aclentp->a_perm << 3;
3033 other_perm |= aclentp->a_perm;
3034 break;
3035 case GROUP_OBJ:
3036 grp_perm |=
3037 aclentp->a_perm << 3;
3038 break;
3039 case GROUP:
3040 other_perm |= aclentp->a_perm;
3041 break;
3042 case OTHER_OBJ:
3043 other_orig = aclentp->a_perm;
3044 break;
3045 case CLASS_OBJ:
3046 mask_perm = aclentp->a_perm;
3047 break;
3048 default:
3049 break;
3050 }
3051 }
3052 grp_perm &= mask_perm << 3;
3053 other_perm &= mask_perm;
3054 other_perm |= other_orig;
3055
3056 } else {
3057 /* minimal permissions */
3058 grp_perm = 070;
3059 other_perm = 07;
3060 for (; aclcnt > 0; aclcnt--, aclentp++) {
3061 switch (aclentp->a_type) {
3062 case USER_OBJ:
3063 break;
3064 case USER:
3065 case CLASS_OBJ:
3066 grp_perm &=
3067 aclentp->a_perm << 3;
3068 other_perm &=
3069 aclentp->a_perm;
3070 break;
3071 case GROUP_OBJ:
3072 grp_perm &=
3073 aclentp->a_perm << 3;
3074 break;
3075 case GROUP:
3076 other_perm &=
3077 aclentp->a_perm;
3078 break;
3079 case OTHER_OBJ:
3080 other_perm &=
3081 aclentp->a_perm;
3082 break;
3083 default:
3084 break;
3085 }
3086 }
3087 }
3088 /* copy to va */
3089 va->va_mode &= ~077;
3090 va->va_mode |= grp_perm | other_perm;
3091 }
3092 if (vsa.vsa_aclcnt)
3093 kmem_free(vsa.vsa_aclentp,
3094 vsa.vsa_aclcnt * sizeof (aclent_t));
3095 }
3096 }
3097
/*
 * One-time NFSv2 server initialization: allocate the caller id used
 * for VOP calls and register the per-zone state constructor and
 * destructor with the zone framework.
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
	zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
}
3104
/*
 * Teardown counterpart of rfs_srvrinit().  Nothing to do here:
 * per-zone state is released by rfs_zone_fini() via the zone key
 * registered in rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
}
3109
3110 /* ARGSUSED */
3111 static void *
3112 rfs_zone_init(zoneid_t zoneid)
3113 {
3114 nfs_srv_t *ns;
3115
3116 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3117
3118 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3119 ns->write_async = 1;
3120
3121 return (ns);
3122 }
3123
3124 /* ARGSUSED */
3125 static void
3126 rfs_zone_fini(zoneid_t zoneid, void *data)
3127 {
3128 nfs_srv_t *ns;
3129
3130 ns = (nfs_srv_t *)data;
3131 mutex_destroy(&ns->async_write_lock);
3132 kmem_free(ns, sizeof (*ns));
3133 }
3134
3135 static int
3136 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3137 {
3138 struct clist *wcl;
3139 int wlist_len;
3140 uint32_t count = rr->rr_count;
3141
3142 wcl = ra->ra_wlist;
3143
3144 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3145 return (FALSE);
3146 }
3147
3148 wcl = ra->ra_wlist;
3149 rr->rr_ok.rrok_wlist_len = wlist_len;
3150 rr->rr_ok.rrok_wlist = wcl;
3151
3152 return (TRUE);
3153 }