1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 * All rights reserved.
29 */
30
31 /*
32 * Copyright 2018 Nexenta Systems, Inc.
33 * Copyright (c) 2016 by Delphix. All rights reserved.
34 */
35
36 #include <sys/param.h>
37 #include <sys/types.h>
38 #include <sys/systm.h>
39 #include <sys/cred.h>
40 #include <sys/buf.h>
41 #include <sys/vfs.h>
42 #include <sys/vnode.h>
43 #include <sys/uio.h>
44 #include <sys/stat.h>
45 #include <sys/errno.h>
46 #include <sys/sysmacros.h>
47 #include <sys/statvfs.h>
48 #include <sys/kmem.h>
49 #include <sys/kstat.h>
50 #include <sys/dirent.h>
51 #include <sys/cmn_err.h>
52 #include <sys/debug.h>
53 #include <sys/vtrace.h>
54 #include <sys/mode.h>
55 #include <sys/acl.h>
56 #include <sys/nbmlock.h>
57 #include <sys/policy.h>
58 #include <sys/sdt.h>
59
60 #include <rpc/types.h>
61 #include <rpc/auth.h>
62 #include <rpc/svc.h>
63
64 #include <nfs/nfs.h>
65 #include <nfs/export.h>
66 #include <nfs/nfs_cmd.h>
67
68 #include <vm/hat.h>
69 #include <vm/as.h>
70 #include <vm/seg.h>
71 #include <vm/seg_map.h>
72 #include <vm/seg_kmem.h>
73
74 #include <sys/strsubr.h>
75
76 struct rfs_async_write_list;
77
/*
 * Zone globals of NFSv2 server.
 * One nfs_srv_t is kept per zone (keyed by rfs_zone_key, see
 * rfs_zone_init()/rfs_zone_fini()) so that write clustering state
 * is isolated between zones.
 */
typedef struct nfs_srv {
	/* Protects async_write_head and the cluster lists hanging off it. */
	kmutex_t async_write_lock;
	/* Per-file clusters of pending WRITE requests (see rfs_write()). */
	struct rfs_async_write_list *async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int write_async;
} nfs_srv_t;
90
91 /*
92 * These are the interface routines for the server side of the
93 * Network File System. See the NFS version 2 protocol specification
94 * for a description of this interface.
95 */
96
97 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
98 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
99 cred_t *);
100 static void *rfs_zone_init(zoneid_t zoneid);
101 static void rfs_zone_fini(zoneid_t zoneid, void *data);
102
103
104 /*
105 * Some "over the wire" UNIX file types. These are encoded
106 * into the mode. This needs to be fixed in the next rev.
107 */
108 #define IFMT 0170000 /* type of file */
109 #define IFCHR 0020000 /* character special */
110 #define IFBLK 0060000 /* block special */
111 #define IFSOCK 0140000 /* socket */
112
/* Unique caller ID stored in caller_context_t for NFSv2 server VOP calls. */
u_longlong_t nfs2_srv_caller_id;
/* Zone key for the per-zone nfs_srv_t state (see rfs_zone_init/_fini). */
static zone_key_t rfs_zone_key;
115
116 /*
117 * Get file attributes.
118 * Returns the current attributes of the file with the given fhandle.
119 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	/* Translate the filehandle into a held vnode. */
	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	/* Delegation-aware getattr: reconciles attrs with v4 delegations. */
	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		/* Lie about the object type for a referral */
		if (vn_is_nfs_reparse(vp, cr))
			va.va_type = VLNK;

		/* Adjust the reported mode bits for ACLs (see acl_perm()). */
		acl_perm(vp, exi, &va, cr);
		/* Convert to over-the-wire form; fails on overflow. */
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
156 void *
157 rfs_getattr_getfh(fhandle_t *fhp)
158 {
159 return (fhp);
160 }
161
162 /*
163 * Set file attributes.
164 * Sets the attributes of the file with the given fhandle. Returns
165 * the new attributes.
166 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;	/* ATTR_UTIME when the client supplied explicit times */
	int in_crit = 0;	/* true while inside an nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;	/* region description for VOP_SPACE */
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* Reject attribute changes on read-only exports. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the over-the-wire sattr into a vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;	/* fail with EAGAIN rather than block */

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing. If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so. To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The affected region is [min(old,new) size,
			 * |old - new|): the bytes being truncated away
			 * or newly exposed by the size change.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Owner-initiated size change: strip AT_SIZE and do it
		 * via VOP_SPACE(F_FREESP) to bypass VOP_SETATTR's
		 * access check (see block comment above).
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* On success, fetch the post-change attributes for the reply. */
	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
349 void *
350 rfs_setattr_getfh(struct nfssaargs *args)
351 {
352 return (&args->saa_fh);
353 }
354
/*
 * Change and release @exip and @vpp only in success.
 * Cross from a mount point vnode into the filesystem mounted on it,
 * provided that filesystem is exported with "nohide".  On success the
 * caller's vnode and export references are swapped for the new ones;
 * on failure (and on the not-exported/not-nohide case, which returns 0)
 * the caller's references are left untouched.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Work on a private hold so *vpp survives any failure below. */
	VN_HOLD(vp);

	/* traverse() replaces vp with the root of the mounted fs. */
	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	/* Look up the export covering the mounted filesystem's root. */
	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(&exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
400
/*
 * Given mounted "dvp" and "exi", go upper mountpoint
 * with dvp/exi correction
 * Return 0 in success
 */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;

	/* Caller must hand us the root of a mounted filesystem. */
	ASSERT(dvp->v_flag & VROOT);

	VN_HOLD(dvp);
	/* Step down to the covered vnode in the underlying filesystem. */
	dvp = untraverse(dvp);
	/* Find the export that covers the underlying directory. */
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	/* Success: swap the caller's export and directory references. */
	exi_rele(exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
429 /*
430 * Directory lookup.
431 * Returns an fhandle and file attributes for file name in a directory.
432 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;	/* directory being searched */
	vnode_t *vp;	/* result of the lookup */
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/*
	 * Take our own hold on exi; the crossmnt/public-fh paths below
	 * may release and replace it, and "out:" releases it.
	 */
	exi_hold(exi);

	/*
	 * Not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    (dvp->v_flag & VROOT)) {
			/*
			 * special case for ".." and 'nohide'exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	/* Convert the component name per the export's charset mapping. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		/* mclookup returns its own export reference in exi. */
		exi_rele(&exi);

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	/* Free the converted name if nfscmd_convname allocated a copy. */
	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	/* If we landed on a mount point, try to cross into the submount. */
	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				/* Overloaded (0x81) fh carries sec index. */
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	/* exi may be NULL if rfs_publicfh_mclookup failed. */
	if (exi != NULL)
		exi_rele(&exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
588 void *
589 rfs_lookup_getfh(struct nfsdiropargs *da)
590 {
591 return (da->da_fhandle);
592 }
593
594 /*
595 * Read symbolic link.
596 * Returns the string in the symbolic link at the given fhandle.
597 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse objects with mandatory locking enabled. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname. This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link text fills the buffer exactly
		 * (uio_resid == 0, rl_count == NFS_MAXPATHLEN), this NUL
		 * store lands one byte past the NFS_MAXPATHLEN allocation.
		 * Presumably symlink targets are bounded below MAXPATHLEN,
		 * but confirm that this case cannot occur.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Convert the link text per the export's charset mapping. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	/* If a converted copy was allocated, swap it into the reply. */
	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
724 void *
725 rfs_readlink_getfh(fhandle_t *fhp)
726 {
727 return (fhp);
728 }
729 /*
730 * Free data allocated by rfs_readlink
731 */
732 void
733 rfs_rlfree(struct nfsrdlnres *rl)
734 {
735 if (rl->rl_data != NULL)
736 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
737 }
738
739 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
740
741 /*
742 * Read data.
743 * Returns some data read from the file at the given fhandle.
744 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;	/* reply data buffer for the non-RDMA case */
	int alloc_err = 0;
	int in_crit = 0;	/* true while inside an nbmand critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	/* NFSv2 READ is only defined on regular files. */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;	/* fail with EAGAIN rather than block */

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission. The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	/* Refuse objects with mandatory locking enabled. */
	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* A read at or beyond EOF succeeds with zero bytes of data. */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA: read directly into the client-provided chunk. */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/* Number of bytes actually read. */
	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
995
996 /*
997 * Free data allocated by rfs_read
998 */
999 void
1000 rfs_rdfree(struct nfsrdresult *rr)
1001 {
1002 mblk_t *mp;
1003
1004 if (rr->rr_status == NFS_OK) {
1005 mp = rr->rr_mp;
1006 if (mp != NULL)
1007 freeb(mp);
1008 }
1009 }
1010
1011 void *
1012 rfs_read_getfh(struct nfsreadargs *ra)
1013 {
1014 return (&ra->ra_fhandle);
1015 }
1016
1017 #define MAX_IOVECS 12
1018
1019 #ifdef DEBUG
1020 static int rfs_write_sync_hits = 0;
1021 static int rfs_write_sync_misses = 0;
1022 #endif
1023
1024 /*
1025 * Write data to file.
1026 * Returns attributes of a file after writing some data to it.
1027 *
1028 * Any changes made here, especially in error handling might have
1029 * to also be done in rfs_write (which clusters write requests).
1030 */
/* ARGSUSED */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct iovec iov[MAX_IOVECS];	/* on-stack iovecs, common case */
	mblk_t *m;
	struct iovec *iovp;
	int iovcnt;
	cred_t *savecred;
	int in_crit = 0;	/* true while inside an nbmand critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* Reject writes to read-only exports. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* NFSv2 WRITE is only defined on regular files. */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ISDIR;
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;	/* fail with EAGAIN rather than block */

	va.va_mask = AT_UID|AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);

		return;
	}

	if (crgetuid(cr) != va.va_uid) {
		/*
		 * This is a kludge to allow writes of files created
		 * with read only permission. The owner of the file
		 * is always allowed to write it.
		 */
		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);

		if (error) {
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}
	}

	/*
	 * Can't access a mandatory lock file. This might cause
	 * the NFS service thread to block forever waiting for a
	 * lock to be released that will never be released.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ACCES;
		return;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		goto out;
	}

	if (wa->wa_data || wa->wa_rlist) {
		/* Do the RDMA thing if necessary */
		if (wa->wa_rlist) {
			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
			iov[0].iov_len = wa->wa_count;
		} else {
			iov[0].iov_base = wa->wa_data;
			iov[0].iov_len = wa->wa_count;
		}
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * for now we assume no append mode
		 */
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;
	} else {

		/* Data arrived as an mblk chain; build one iovec per mblk. */
		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;

		/* Free the iovec array if it was heap-allocated above. */
		if (iovp != iov)
			kmem_free(iovp, sizeof (*iovp) * iovcnt);
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (!error) {
		/*
		 * Get attributes again so we send the latest mod
		 * time to the client side for its cache.
		 */
		va.va_mask = AT_ALL;	/* now we want everything */

		error = VOP_GETATTR(vp, &va, 0, cr, &ct);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

out:
	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
	else
		ns->ns_status = puterrno(error);

}
1246
/*
 * One queued NFSv2 WRITE request, linked into a per-file cluster
 * (see rfs_write()).  ns->ns_status is initialized to RFSWRITE_INITVAL
 * and doubles as the "request completed" indicator.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* request arguments */
	struct nfsattrstat *ns;		/* reply; status marks completion */
	struct svc_req *req;		/* originating RPC request */
	cred_t *cr;			/* caller's credentials */
	bool_t ro;			/* export is read-only */
	kthread_t *thread;		/* service thread for this request */
	struct rfs_async_write *list;	/* next request, sorted by offset */
};

/*
 * Per-file cluster of pending WRITE requests, keyed by filehandle and
 * linked off nfs_srv_t.async_write_head.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file these writes apply to */
	kcondvar_t cv;			/* broadcast when requests complete */
	struct rfs_async_write *list;	/* requests, ordered by wa_offset */
	struct rfs_async_write_list *next;	/* cluster for the next file */
};
1263
/*
 * NOTE(review): these file-scope variables appear superseded by the
 * per-zone copies in nfs_srv_t (async_write_head, async_write_lock,
 * write_async) -- rfs_write() uses the zone-specific state obtained via
 * rfs_zone_key.  Confirm nothing else references these before removing.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
volatile int rfs_write_async = 1;	/* enables write clustering if == 1 */
1267
1268 #define MAXCLIOVECS 42
1269 #define RFSWRITE_INITVAL (enum nfsstat) -1
1270
1271 #ifdef DEBUG
1272 static int rfs_write_hits = 0;
1273 static int rfs_write_misses = 0;
1274 #endif
1275
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * NFSv2 WRITE with write clustering: concurrent WRITE requests for the
 * same file handle are gathered into a cluster while the first arrival
 * blocks in VOP_RWLOCK; runs of contiguous requests in the cluster are
 * then pushed with a single VOP_WRITE.  If clustering is disabled for
 * this zone (nsrv->write_async == 0), the request is handed to
 * rfs_write_sync() instead.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;		/* our cluster entry, on stack */
	struct rfs_async_write_list nlpsp;	/* our cluster head, on stack */
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;	/* nonzero while inside nbmand critical region */
	caller_context_t ct;
	nfs_srv_t *nsrv;

	/* Per-zone NFS server state holds the cluster list and its lock. */
	nsrv = zone_getspecific(rfs_zone_key, curzone);
	if (!nsrv->write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		/* Insert sorted by starting offset. */
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/* Sleep until the cluster owner fills in our status. */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/* Unlink the cluster and fail every queued request. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		/* Unlink the cluster and fail every queued request. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		/* Unlink the cluster and fail every still-pending request. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		/* Grow the cluster's byte range to cover this request. */
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			/*
			 * Stop extending the run when the next request
			 * is missing, already errored, or not byte-
			 * contiguous with this one.
			 */
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					/* clamp to the request's wa_count */
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota * checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/* Wake all waiters; any still-pending statuses get this error. */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}
1776
1777 void *
1778 rfs_write_getfh(struct nfswriteargs *wa)
1779 {
1780 return (&wa->wa_fhandle);
1781 }
1782
1783 /*
1784 * Create a file.
1785 * Creates a file with given attributes and returns those attributes
1786 * and an fhandle for the new file.
1787 */
1788 void
1789 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1790 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1791 {
1792 int error;
1793 int lookuperr;
1794 int in_crit = 0;
1795 struct vattr va;
1796 vnode_t *vp;
1797 vnode_t *realvp;
1798 vnode_t *dvp;
1799 char *name = args->ca_da.da_name;
1800 vnode_t *tvp = NULL;
1801 int mode;
1802 int lookup_ok;
1803 bool_t trunc;
1804 struct sockaddr *ca;
1805
1806 /*
1807 * Disallow NULL paths
1808 */
1809 if (name == NULL || *name == '\0') {
1810 dr->dr_status = NFSERR_ACCES;
1811 return;
1812 }
1813
1814 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1815 if (dvp == NULL) {
1816 dr->dr_status = NFSERR_STALE;
1817 return;
1818 }
1819
1820 error = sattr_to_vattr(args->ca_sa, &va);
1821 if (error) {
1822 dr->dr_status = puterrno(error);
1823 return;
1824 }
1825
1826 /*
1827 * Must specify the mode.
1828 */
1829 if (!(va.va_mask & AT_MODE)) {
1830 VN_RELE(dvp);
1831 dr->dr_status = NFSERR_INVAL;
1832 return;
1833 }
1834
1835 if (protect_zfs_mntpt(dvp) != 0) {
1836 VN_RELE(dvp);
1837 dr->dr_status = NFSERR_ACCES;
1838 return;
1839 }
1840
1841 /*
1842 * This is a completely gross hack to make mknod
1843 * work over the wire until we can wack the protocol
1844 */
1845 if ((va.va_mode & IFMT) == IFCHR) {
1846 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1847 va.va_type = VFIFO; /* xtra kludge for named pipe */
1848 else {
1849 va.va_type = VCHR;
1850 /*
1851 * uncompress the received dev_t
1852 * if the top half is zero indicating a request
1853 * from an `older style' OS.
1854 */
1855 if ((va.va_size & 0xffff0000) == 0)
1856 va.va_rdev = nfsv2_expdev(va.va_size);
1857 else
1858 va.va_rdev = (dev_t)va.va_size;
1859 }
1860 va.va_mask &= ~AT_SIZE;
1861 } else if ((va.va_mode & IFMT) == IFBLK) {
1862 va.va_type = VBLK;
1863 /*
1864 * uncompress the received dev_t
1865 * if the top half is zero indicating a request
1866 * from an `older style' OS.
1867 */
1868 if ((va.va_size & 0xffff0000) == 0)
1869 va.va_rdev = nfsv2_expdev(va.va_size);
1870 else
1871 va.va_rdev = (dev_t)va.va_size;
1872 va.va_mask &= ~AT_SIZE;
1873 } else if ((va.va_mode & IFMT) == IFSOCK) {
1874 va.va_type = VSOCK;
1875 } else {
1876 va.va_type = VREG;
1877 }
1878 va.va_mode &= ~IFMT;
1879 va.va_mask |= AT_TYPE;
1880
1881 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1882 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1883 MAXPATHLEN);
1884 if (name == NULL) {
1885 dr->dr_status = puterrno(EINVAL);
1886 return;
1887 }
1888
1889 /*
1890 * Why was the choice made to use VWRITE as the mode to the
1891 * call to VOP_CREATE ? This results in a bug. When a client
1892 * opens a file that already exists and is RDONLY, the second
1893 * open fails with an EACESS because of the mode.
1894 * bug ID 1054648.
1895 */
1896 lookup_ok = 0;
1897 mode = VWRITE;
1898 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1899 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1900 NULL, NULL, NULL);
1901 if (!error) {
1902 struct vattr at;
1903
1904 lookup_ok = 1;
1905 at.va_mask = AT_MODE;
1906 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1907 if (!error)
1908 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1909 VN_RELE(tvp);
1910 tvp = NULL;
1911 }
1912 }
1913
1914 if (!lookup_ok) {
1915 if (rdonly(ro, dvp)) {
1916 error = EROFS;
1917 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1918 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1919 error = EPERM;
1920 } else {
1921 error = 0;
1922 }
1923 }
1924
1925 /*
1926 * If file size is being modified on an already existing file
1927 * make sure that there are no conflicting non-blocking mandatory
1928 * locks in the region being manipulated. Return EACCES if there
1929 * are conflicting locks.
1930 */
1931 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1932 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1933 NULL, NULL, NULL);
1934
1935 if (!lookuperr &&
1936 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1937 VN_RELE(tvp);
1938 curthread->t_flag |= T_WOULDBLOCK;
1939 goto out;
1940 }
1941
1942 if (!lookuperr && nbl_need_check(tvp)) {
1943 /*
1944 * The file exists. Now check if it has any
1945 * conflicting non-blocking mandatory locks
1946 * in the region being changed.
1947 */
1948 struct vattr bva;
1949 u_offset_t offset;
1950 ssize_t length;
1951
1952 nbl_start_crit(tvp, RW_READER);
1953 in_crit = 1;
1954
1955 bva.va_mask = AT_SIZE;
1956 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1957 if (!error) {
1958 if (va.va_size < bva.va_size) {
1959 offset = va.va_size;
1960 length = bva.va_size - va.va_size;
1961 } else {
1962 offset = bva.va_size;
1963 length = va.va_size - bva.va_size;
1964 }
1965 if (length) {
1966 if (nbl_conflict(tvp, NBL_WRITE,
1967 offset, length, 0, NULL)) {
1968 error = EACCES;
1969 }
1970 }
1971 }
1972 if (error) {
1973 nbl_end_crit(tvp);
1974 VN_RELE(tvp);
1975 in_crit = 0;
1976 }
1977 } else if (tvp != NULL) {
1978 VN_RELE(tvp);
1979 }
1980 }
1981
1982 if (!error) {
1983 /*
1984 * If filesystem is shared with nosuid the remove any
1985 * setuid/setgid bits on create.
1986 */
1987 if (va.va_type == VREG &&
1988 exi->exi_export.ex_flags & EX_NOSUID)
1989 va.va_mode &= ~(VSUID | VSGID);
1990
1991 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1992 NULL, NULL);
1993
1994 if (!error) {
1995
1996 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1997 trunc = TRUE;
1998 else
1999 trunc = FALSE;
2000
2001 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2002 VN_RELE(vp);
2003 curthread->t_flag |= T_WOULDBLOCK;
2004 goto out;
2005 }
2006 va.va_mask = AT_ALL;
2007
2008 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2009
2010 /* check for overflows */
2011 if (!error) {
2012 acl_perm(vp, exi, &va, cr);
2013 error = vattr_to_nattr(&va, &dr->dr_attr);
2014 if (!error) {
2015 error = makefh(&dr->dr_fhandle, vp,
2016 exi);
2017 }
2018 }
2019 /*
2020 * Force modified metadata out to stable storage.
2021 *
2022 * if a underlying vp exists, pass it to VOP_FSYNC
2023 */
2024 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2025 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2026 else
2027 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2028 VN_RELE(vp);
2029 }
2030
2031 if (in_crit) {
2032 nbl_end_crit(tvp);
2033 VN_RELE(tvp);
2034 }
2035 }
2036
2037 /*
2038 * Force modified data and metadata out to stable storage.
2039 */
2040 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2041
2042 out:
2043
2044 VN_RELE(dvp);
2045
2046 dr->dr_status = puterrno(error);
2047
2048 if (name != args->ca_da.da_name)
2049 kmem_free(name, MAXPATHLEN);
2050 }
2051 void *
2052 rfs_create_getfh(struct nfscreatargs *args)
2053 {
2054 return (args->ca_da.da_fhandle);
2055 }
2056
/*
 * Remove a file.
 * Remove named file from parent directory.
 */
/* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* parent directory */
	vnode_t *targvp;	/* the file being removed */
	int in_crit = 0;	/* nonzero while inside nbmand critical region */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.  The target must be looked up first so the
	 * delegation and nbmand checks below can be applied to it.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2142
2143 void *
2144 rfs_remove_getfh(struct nfsdiropargs *da)
2145 {
2146 return (da->da_fhandle);
2147 }
2148
2149 /*
2150 * rename a file
2151 * Give a file (from) a new name (to).
2152 */
2153 /* ARGSUSED */
2154 void
2155 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2156 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2157 {
2158 int error = 0;
2159 vnode_t *fromvp;
2160 vnode_t *tovp;
2161 struct exportinfo *to_exi;
2162 fhandle_t *fh;
2163 vnode_t *srcvp;
2164 vnode_t *targvp;
2165 int in_crit = 0;
2166
2167 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2168 if (fromvp == NULL) {
2169 *status = NFSERR_STALE;
2170 return;
2171 }
2172
2173 fh = args->rna_to.da_fhandle;
2174 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2175 if (to_exi == NULL) {
2176 VN_RELE(fromvp);
2177 *status = NFSERR_ACCES;
2178 return;
2179 }
2180 exi_rele(&to_exi);
2181
2182 if (to_exi != exi) {
2183 VN_RELE(fromvp);
2184 *status = NFSERR_XDEV;
2185 return;
2186 }
2187
2188 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2189 if (tovp == NULL) {
2190 VN_RELE(fromvp);
2191 *status = NFSERR_STALE;
2192 return;
2193 }
2194
2195 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2196 VN_RELE(tovp);
2197 VN_RELE(fromvp);
2198 *status = NFSERR_NOTDIR;
2199 return;
2200 }
2201
2202 /*
2203 * Disallow NULL paths
2204 */
2205 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2206 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2207 VN_RELE(tovp);
2208 VN_RELE(fromvp);
2209 *status = NFSERR_ACCES;
2210 return;
2211 }
2212
2213 if (rdonly(ro, tovp)) {
2214 VN_RELE(tovp);
2215 VN_RELE(fromvp);
2216 *status = NFSERR_ROFS;
2217 return;
2218 }
2219
2220 if (protect_zfs_mntpt(tovp) != 0) {
2221 VN_RELE(tovp);
2222 VN_RELE(fromvp);
2223 *status = NFSERR_ACCES;
2224 return;
2225 }
2226
2227 /*
2228 * Check for a conflict with a non-blocking mandatory share reservation.
2229 */
2230 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2231 NULL, cr, NULL, NULL, NULL);
2232 if (error != 0) {
2233 VN_RELE(tovp);
2234 VN_RELE(fromvp);
2235 *status = puterrno(error);
2236 return;
2237 }
2238
2239 /* Check for delegations on the source file */
2240
2241 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2242 VN_RELE(tovp);
2243 VN_RELE(fromvp);
2244 VN_RELE(srcvp);
2245 curthread->t_flag |= T_WOULDBLOCK;
2246 return;
2247 }
2248
2249 /* Check for delegation on the file being renamed over, if it exists */
2250
2251 if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2252 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2253 NULL, NULL, NULL) == 0) {
2254
2255 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2256 VN_RELE(tovp);
2257 VN_RELE(fromvp);
2258 VN_RELE(srcvp);
2259 VN_RELE(targvp);
2260 curthread->t_flag |= T_WOULDBLOCK;
2261 return;
2262 }
2263 VN_RELE(targvp);
2264 }
2265
2266
2267 if (nbl_need_check(srcvp)) {
2268 nbl_start_crit(srcvp, RW_READER);
2269 in_crit = 1;
2270 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2271 error = EACCES;
2272 goto out;
2273 }
2274 }
2275
2276 error = VOP_RENAME(fromvp, args->rna_from.da_name,
2277 tovp, args->rna_to.da_name, cr, NULL, 0);
2278
2279 if (error == 0)
2280 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2281 strlen(args->rna_to.da_name));
2282
2283 /*
2284 * Force modified data and metadata out to stable storage.
2285 */
2286 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2287 (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2288
2289 out:
2290 if (in_crit)
2291 nbl_end_crit(srcvp);
2292 VN_RELE(srcvp);
2293 VN_RELE(tovp);
2294 VN_RELE(fromvp);
2295
2296 *status = puterrno(error);
2297
2298 }
2299 void *
2300 rfs_rename_getfh(struct nfsrnmargs *args)
2301 {
2302 return (args->rna_from.da_fhandle);
2303 }
2304
2305 /*
2306 * Link to a file.
2307 * Create a file (to) which is a hard link to the given file (from).
2308 */
2309 /* ARGSUSED */
2310 void
2311 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2312 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2313 {
2314 int error;
2315 vnode_t *fromvp;
2316 vnode_t *tovp;
2317 struct exportinfo *to_exi;
2318 fhandle_t *fh;
2319
2320 fromvp = nfs_fhtovp(args->la_from, exi);
2321 if (fromvp == NULL) {
2322 *status = NFSERR_STALE;
2323 return;
2324 }
2325
2326 fh = args->la_to.da_fhandle;
2327 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2328 if (to_exi == NULL) {
2329 VN_RELE(fromvp);
2330 *status = NFSERR_ACCES;
2331 return;
2332 }
2333 exi_rele(&to_exi);
2334
2335 if (to_exi != exi) {
2336 VN_RELE(fromvp);
2337 *status = NFSERR_XDEV;
2338 return;
2339 }
2340
2341 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2342 if (tovp == NULL) {
2343 VN_RELE(fromvp);
2344 *status = NFSERR_STALE;
2345 return;
2346 }
2347
2348 if (tovp->v_type != VDIR) {
2349 VN_RELE(tovp);
2350 VN_RELE(fromvp);
2351 *status = NFSERR_NOTDIR;
2352 return;
2353 }
2354 /*
2355 * Disallow NULL paths
2356 */
2357 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2358 VN_RELE(tovp);
2359 VN_RELE(fromvp);
2360 *status = NFSERR_ACCES;
2361 return;
2362 }
2363
2364 if (rdonly(ro, tovp)) {
2365 VN_RELE(tovp);
2366 VN_RELE(fromvp);
2367 *status = NFSERR_ROFS;
2368 return;
2369 }
2370
2371 if (protect_zfs_mntpt(tovp) != 0) {
2372 VN_RELE(tovp);
2373 VN_RELE(fromvp);
2374 *status = NFSERR_ACCES;
2375 return;
2376 }
2377
2378 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2379
2380 /*
2381 * Force modified data and metadata out to stable storage.
2382 */
2383 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2384 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2385
2386 VN_RELE(tovp);
2387 VN_RELE(fromvp);
2388
2389 *status = puterrno(error);
2390
2391 }
2392 void *
2393 rfs_link_getfh(struct nfslinkargs *args)
2394 {
2395 return (args->la_from);
2396 }
2397
2398 /*
2399 * Symbolicly link to a file.
2400 * Create a file (from) with the given attributes which is a symbolic link
2401 * to the given path name (to).
2402 */
2403 void
2404 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2405 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2406 {
2407 int error;
2408 struct vattr va;
2409 vnode_t *vp;
2410 vnode_t *svp;
2411 int lerror;
2412 struct sockaddr *ca;
2413 char *name = NULL;
2414
2415 /*
2416 * Disallow NULL paths
2417 */
2418 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2419 *status = NFSERR_ACCES;
2420 return;
2421 }
2422
2423 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2424 if (vp == NULL) {
2425 *status = NFSERR_STALE;
2426 return;
2427 }
2428
2429 if (rdonly(ro, vp)) {
2430 VN_RELE(vp);
2431 *status = NFSERR_ROFS;
2432 return;
2433 }
2434
2435 error = sattr_to_vattr(args->sla_sa, &va);
2436 if (error) {
2437 VN_RELE(vp);
2438 *status = puterrno(error);
2439 return;
2440 }
2441
2442 if (!(va.va_mask & AT_MODE)) {
2443 VN_RELE(vp);
2444 *status = NFSERR_INVAL;
2445 return;
2446 }
2447
2448 if (protect_zfs_mntpt(vp) != 0) {
2449 VN_RELE(vp);
2450 *status = NFSERR_ACCES;
2451 return;
2452 }
2453
2454 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2455 name = nfscmd_convname(ca, exi, args->sla_tnm,
2456 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2457
2458 if (name == NULL) {
2459 *status = NFSERR_ACCES;
2460 return;
2461 }
2462
2463 va.va_type = VLNK;
2464 va.va_mask |= AT_TYPE;
2465
2466 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2467
2468 /*
2469 * Force new data and metadata out to stable storage.
2470 */
2471 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2472 NULL, cr, NULL, NULL, NULL);
2473
2474 if (!lerror) {
2475 (void) VOP_FSYNC(svp, 0, cr, NULL);
2476 VN_RELE(svp);
2477 }
2478
2479 /*
2480 * Force modified data and metadata out to stable storage.
2481 */
2482 (void) VOP_FSYNC(vp, 0, cr, NULL);
2483
2484 VN_RELE(vp);
2485
2486 *status = puterrno(error);
2487 if (name != args->sla_tnm)
2488 kmem_free(name, MAXPATHLEN);
2489
2490 }
2491 void *
2492 rfs_symlink_getfh(struct nfsslargs *args)
2493 {
2494 return (args->sla_from.da_fhandle);
2495 }
2496
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 *
 * Note the local naming is inverted relative to the other procedures
 * in this file: vp is the parent directory and dvp receives the newly
 * created directory from VOP_MKDIR.
 */
/* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the new directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	if (protect_zfs_mntpt(vp) != 0) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is passed the parent
			 * (vp) although the attributes were fetched from
			 * the new directory (dvp); rfs_create() passes
			 * the newly created vnode at the equivalent
			 * point.  Confirm whether this is intentional.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2590 void *
2591 rfs_mkdir_getfh(struct nfscreatargs *args)
2592 {
2593 return (args->ca_da.da_fhandle);
2594 }
2595
2596 /*
2597 * Remove a directory.
2598 * Remove the given directory name from the given parent directory.
2599 */
2600 /* ARGSUSED */
2601 void
2602 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2603 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2604 {
2605 int error;
2606 vnode_t *vp;
2607
2608 /*
2609 * Disallow NULL paths
2610 */
2611 if (da->da_name == NULL || *da->da_name == '\0') {
2612 *status = NFSERR_ACCES;
2613 return;
2614 }
2615
2616 vp = nfs_fhtovp(da->da_fhandle, exi);
2617 if (vp == NULL) {
2618 *status = NFSERR_STALE;
2619 return;
2620 }
2621
2622 if (rdonly(ro, vp)) {
2623 VN_RELE(vp);
2624 *status = NFSERR_ROFS;
2625 return;
2626 }
2627
2628 /*
2629 * VOP_RMDIR takes a third argument (the current
2630 * directory of the process). That's because someone
2631 * wants to return EINVAL if one tries to remove ".".
2632 * Of course, NFS servers have no idea what their
2633 * clients' current directories are. We fake it by
2634 * supplying a vnode known to exist and illegal to
2635 * remove.
2636 */
2637 error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2638
2639 /*
2640 * Force modified data and metadata out to stable storage.
2641 */
2642 (void) VOP_FSYNC(vp, 0, cr, NULL);
2643
2644 VN_RELE(vp);
2645
2646 /*
2647 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2648 * if the directory is not empty. A System V NFS server
2649 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2650 * over the wire.
2651 */
2652 if (error == EEXIST)
2653 *status = NFSERR_NOTEMPTY;
2654 else
2655 *status = puterrno(error);
2656
2657 }
2658 void *
2659 rfs_rmdir_getfh(struct nfsdiropargs *da)
2660 {
2661 return (da->da_fhandle);
2662 }
2663
#ifdef nextdp
#undef nextdp
#endif
/* Advance to the next dirent64 in a buffer using its record length. */
#define nextdp(dp) ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
2668
/*
 * NFSv2 READDIR: return a list of directory entries, bounded by the
 * client's requested reply size (rda_count), starting at the opaque
 * cookie rda_offset.
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct iovec iov;
	struct uio uio;
	int iseof;

	uint32_t count = rda->rda_count;	/* client's reply size limit */
	uint32_t size; /* size of the readdirres structure */
	int overflow = 0;	/* set when a cookie won't fit in 32 bits */

	size_t datasz;		/* size of the raw VOP_READDIR buffer */
	char *data = NULL;	/* raw dirent64 buffer from the filesystem */
	dirent64_t *dp;

	struct sockaddr *ca;
	struct nfsentry **eptr;	/* tail pointer of the reply entry list */
	struct nfsentry *entry;	/* most recently appended entry */

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Hold the directory read-locked across all VOP_READDIR calls. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
	if (error)
		goto bad;

	/*
	 * Don't allow arbitrary counts for allocation
	 */
	if (count > NFS_MAXDATA)
		count = NFS_MAXDATA;

	/*
	 * struct readdirres:
	 *   status: 1
	 *   entries (bool): 1
	 *   eof: 1
	 */
	size = (1 + 1 + 1) * BYTES_PER_XDR_UNIT;

	/*
	 * If even the fixed reply overhead exceeds the client's limit,
	 * return an empty, non-EOF reply.
	 */
	if (size > count) {
		eptr = &rd->rd_entries;
		iseof = 0;
		size = 0;

		goto done;
	}

	/*
	 * This is simplification. The dirent64_t size is not the same as the
	 * size of XDR representation of entry, but the sizes are similar so
	 * we'll assume they are same. This assumption should not cause any
	 * harm. In worst case we will need to issue VOP_READDIR() once more.
	 */
	datasz = count;

	/*
	 * Make sure that there is room to read at least one entry
	 * if any are available.
	 */
	if (datasz < DIRENT64_RECLEN(MAXNAMELEN))
		datasz = DIRENT64_RECLEN(MAXNAMELEN);

	data = kmem_alloc(datasz, KM_NOSLEEP);
	if (data == NULL) {
		/* The allocation failed; downsize and wait for it this time */
		if (datasz > MAXBSIZE)
			datasz = MAXBSIZE;
		data = kmem_alloc(datasz, KM_SLEEP);
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = datasz;

	/* Client address, for charset conversion of entry names below. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	eptr = &rd->rd_entries;
	entry = NULL;

getmoredents:
	iov.iov_base = data;
	iov.iov_len = datasz;

	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
	if (error) {
		iseof = 0;
		goto done;
	}

	/* No bytes consumed means no more entries were produced. */
	if (iov.iov_len == datasz)
		goto done;

	/* Walk the dirent64 records actually filled in by the filesystem. */
	for (dp = (dirent64_t *)data;
	    (char *)dp - data < datasz - iov.iov_len && !overflow;
	    dp = nextdp(dp)) {
		char *name;
		uint32_t esize;
		uint32_t cookie;

		/*
		 * NFSv2 cookies are 32 bits on the wire.  If this entry's
		 * offset doesn't fit, stop here and claim EOF so the
		 * client does not ask for more.
		 */
		overflow = (uint64_t)dp->d_off > UINT32_MAX;
		if (overflow) {
			cookie = 0;
			iseof = 1;
		} else
			cookie = (uint32_t)dp->d_off;

		/*
		 * Skip unrepresentable entries (deleted slots or inode
		 * numbers beyond 32 bits), but keep the previous entry's
		 * cookie advancing so the client resumes past them.
		 */
		if (dp->d_ino == 0 || (uint64_t)dp->d_ino > UINT32_MAX) {
			if (entry != NULL)
				entry->cookie = cookie;
			continue;
		}

		/* Convert the name to the client's character set. */
		name = nfscmd_convname(ca, exi, dp->d_name,
		    NFSCMD_CONV_OUTBOUND, NFS_MAXPATHLEN + 1);
		if (name == NULL) {
			if (entry != NULL)
				entry->cookie = cookie;
			continue;
		}

		/*
		 * struct entry:
		 *   fileid: 1
		 *   name (length): 1
		 *   name (data): length (rounded up)
		 *   cookie: 1
		 *   nextentry (bool): 1
		 */
		esize = (1 + 1 + 1 + 1) * BYTES_PER_XDR_UNIT +
		    RNDUP(strlen(name));

		/* If the new entry does not fit, discard it */
		if (esize > count - size) {
			if (name != dp->d_name)
				kmem_free(name, NFS_MAXPATHLEN + 1);
			iseof = 0;
			goto done;
		}

		entry = kmem_alloc(sizeof (struct nfsentry), KM_SLEEP);

		entry->fileid = (uint32_t)dp->d_ino;
		entry->name = strdup(name);
		/* nfscmd_convname allocated a copy only when it converted. */
		if (name != dp->d_name)
			kmem_free(name, NFS_MAXPATHLEN + 1);
		entry->cookie = cookie;

		size += esize;

		/* Add the entry to the linked list */
		*eptr = entry;
		eptr = &entry->nextentry;
	}

	/* Not at EOF and reply space remains: read another buffer-full. */
	if (!iseof && size < count) {
		uio.uio_resid = MIN(datasz, MAXBSIZE);
		goto getmoredents;
	}

done:
	/* Terminate the reply list. */
	*eptr = NULL;

	/*
	 * Report success if we reached EOF, or if at least one entry was
	 * gathered before a later VOP_READDIR failed (partial success),
	 * or if there simply was no error.
	 */
	if (iseof || rd->rd_entries != NULL || !error) {
		error = 0;
		rd->rd_eof = iseof ? TRUE : FALSE;

		/* This is for nfslog only */
		rd->rd_offset = rda->rda_offset;
		rd->rd_size = size;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this. It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

	if (data != NULL)
		kmem_free(data, datasz);
}
2880 void *
2881 rfs_readdir_getfh(struct nfsrddirargs *rda)
2882 {
2883 return (&rda->rda_fh);
2884 }
2885 void
2886 rfs_rddirfree(struct nfsrddirres *rd)
2887 {
2888 if (rd->rd_status == NFS_OK) {
2889 struct nfsentry *entry, *nentry;
2890
2891 for (entry = rd->rd_entries; entry != NULL; entry = nentry) {
2892 nentry = entry->nextentry;
2893 strfree(entry->name);
2894 kmem_free(entry, sizeof (struct nfsentry));
2895 }
2896 }
2897 }
2898
2899 /* ARGSUSED */
2900 void
2901 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2902 struct svc_req *req, cred_t *cr, bool_t ro)
2903 {
2904 int error;
2905 struct statvfs64 sb;
2906 vnode_t *vp;
2907
2908 vp = nfs_fhtovp(fh, exi);
2909 if (vp == NULL) {
2910 fs->fs_status = NFSERR_STALE;
2911 return;
2912 }
2913
2914 error = VFS_STATVFS(vp->v_vfsp, &sb);
2915
2916 if (!error) {
2917 fs->fs_tsize = nfstsize();
2918 fs->fs_bsize = sb.f_frsize;
2919 fs->fs_blocks = sb.f_blocks;
2920 fs->fs_bfree = sb.f_bfree;
2921 fs->fs_bavail = sb.f_bavail;
2922 }
2923
2924 VN_RELE(vp);
2925
2926 fs->fs_status = puterrno(error);
2927
2928 }
2929 void *
2930 rfs_statfs_getfh(fhandle_t *fh)
2931 {
2932 return (fh);
2933 }
2934
/*
 * Convert the NFSv2 over-the-wire settable attributes (sattr) into a
 * vattr, setting a va_mask bit only for each field the client actually
 * supplied.  An all-ones (-1) field means "not set".  Returns 0, or
 * EOVERFLOW when a client-supplied time cannot be represented in a
 * 32-bit time_t (non-LP64 kernels only).
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short. When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
3001
/*
 * Map vnode types (vtype_t, indexed VNON..VBAD) to the NFSv2
 * over-the-wire file types; entries with no NFSv2 equivalent are 0.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
3005
3006 /*
3007 * check the following fields for overflow: nodeid, size, and time.
3008 * There could be a problem when converting 64-bit LP64 fields
3009 * into 32-bit ones. Return an error if there is an overflow.
3010 */
3011 int
3012 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
3013 {
3014 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
3015 na->na_type = vt_to_nf[vap->va_type];
3016
3017 if (vap->va_mode == (unsigned short) -1)
3018 na->na_mode = (uint32_t)-1;
3019 else
3020 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
3021
3022 if (vap->va_uid == (unsigned short)(-1))
3023 na->na_uid = (uint32_t)(-1);
3024 else if (vap->va_uid == UID_NOBODY)
3025 na->na_uid = (uint32_t)NFS_UID_NOBODY;
3026 else
3027 na->na_uid = vap->va_uid;
3028
3029 if (vap->va_gid == (unsigned short)(-1))
3030 na->na_gid = (uint32_t)-1;
3031 else if (vap->va_gid == GID_NOBODY)
3032 na->na_gid = (uint32_t)NFS_GID_NOBODY;
3033 else
3034 na->na_gid = vap->va_gid;
3035
3036 /*
3037 * Do we need to check fsid for overflow? It is 64-bit in the
3038 * vattr, but are bigger than 32 bit values supported?
3039 */
3040 na->na_fsid = vap->va_fsid;
3041
3042 na->na_nodeid = vap->va_nodeid;
3043
3044 /*
3045 * Check to make sure that the nodeid is representable over the
3046 * wire without losing bits.
3047 */
3048 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
3049 return (EFBIG);
3050 na->na_nlink = vap->va_nlink;
3051
3052 /*
3053 * Check for big files here, instead of at the caller. See
3054 * comments in cstat for large special file explanation.
3055 */
3056 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
3057 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
3058 return (EFBIG);
3059 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
3060 /* UNKNOWN_SIZE | OVERFLOW */
3061 na->na_size = MAXOFF32_T;
3062 } else
3063 na->na_size = vap->va_size;
3064 } else
3065 na->na_size = vap->va_size;
3066
3067 /*
3068 * If the vnode times overflow the 32-bit times that NFS2
3069 * uses on the wire then return an error.
3070 */
3071 if (!NFS_VAP_TIME_OK(vap)) {
3072 return (EOVERFLOW);
3073 }
3074 na->na_atime.tv_sec = vap->va_atime.tv_sec;
3075 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
3076
3077 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
3078 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
3079
3080 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
3081 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
3082
3083 /*
3084 * If the dev_t will fit into 16 bits then compress
3085 * it, otherwise leave it alone. See comments in
3086 * nfs_client.c.
3087 */
3088 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
3089 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
3090 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
3091 else
3092 (void) cmpldev(&na->na_rdev, vap->va_rdev);
3093
3094 na->na_blocks = vap->va_nblocks;
3095 na->na_blocksize = vap->va_blksize;
3096
3097 /*
3098 * This bit of ugliness is a *TEMPORARY* hack to preserve the
3099 * over-the-wire protocols for named-pipe vnodes. It remaps the
3100 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
3101 *
3102 * BUYER BEWARE:
3103 * If you are porting the NFS to a non-Sun server, you probably
3104 * don't want to include the following block of code. The
3105 * over-the-wire special file types will be changing with the
3106 * NFS Protocol Revision.
3107 */
3108 if (vap->va_type == VFIFO)
3109 NA_SETFIFO(na);
3110 return (0);
3111 }
3112
3113 /*
3114 * acl v2 support: returns approximate permission.
3115 * default: returns minimal permission (more restrictive)
3116 * aclok: returns maximal permission (less restrictive)
 * This routine changes the permissions that are already in *va.
3118 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3119 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3120 */
3121 static void
3122 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3123 {
3124 vsecattr_t vsa;
3125 int aclcnt;
3126 aclent_t *aclentp;
3127 mode_t mask_perm;
3128 mode_t grp_perm;
3129 mode_t other_perm;
3130 mode_t other_orig;
3131 int error;
3132
3133 /* dont care default acl */
3134 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3135 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3136
3137 if (!error) {
3138 aclcnt = vsa.vsa_aclcnt;
3139 if (aclcnt > MIN_ACL_ENTRIES) {
3140 /* non-trivial ACL */
3141 aclentp = vsa.vsa_aclentp;
3142 if (exi->exi_export.ex_flags & EX_ACLOK) {
3143 /* maximal permissions */
3144 grp_perm = 0;
3145 other_perm = 0;
3146 for (; aclcnt > 0; aclcnt--, aclentp++) {
3147 switch (aclentp->a_type) {
3148 case USER_OBJ:
3149 break;
3150 case USER:
3151 grp_perm |=
3152 aclentp->a_perm << 3;
3153 other_perm |= aclentp->a_perm;
3154 break;
3155 case GROUP_OBJ:
3156 grp_perm |=
3157 aclentp->a_perm << 3;
3158 break;
3159 case GROUP:
3160 other_perm |= aclentp->a_perm;
3161 break;
3162 case OTHER_OBJ:
3163 other_orig = aclentp->a_perm;
3164 break;
3165 case CLASS_OBJ:
3166 mask_perm = aclentp->a_perm;
3167 break;
3168 default:
3169 break;
3170 }
3171 }
3172 grp_perm &= mask_perm << 3;
3173 other_perm &= mask_perm;
3174 other_perm |= other_orig;
3175
3176 } else {
3177 /* minimal permissions */
3178 grp_perm = 070;
3179 other_perm = 07;
3180 for (; aclcnt > 0; aclcnt--, aclentp++) {
3181 switch (aclentp->a_type) {
3182 case USER_OBJ:
3183 break;
3184 case USER:
3185 case CLASS_OBJ:
3186 grp_perm &=
3187 aclentp->a_perm << 3;
3188 other_perm &=
3189 aclentp->a_perm;
3190 break;
3191 case GROUP_OBJ:
3192 grp_perm &=
3193 aclentp->a_perm << 3;
3194 break;
3195 case GROUP:
3196 other_perm &=
3197 aclentp->a_perm;
3198 break;
3199 case OTHER_OBJ:
3200 other_perm &=
3201 aclentp->a_perm;
3202 break;
3203 default:
3204 break;
3205 }
3206 }
3207 }
3208 /* copy to va */
3209 va->va_mode &= ~077;
3210 va->va_mode |= grp_perm | other_perm;
3211 }
3212 if (vsa.vsa_aclcnt)
3213 kmem_free(vsa.vsa_aclentp,
3214 vsa.vsa_aclcnt * sizeof (aclent_t));
3215 }
3216 }
3217
/*
 * One-time NFSv2 server initialization: obtain a caller id for VOP
 * calls and register the per-zone constructor/destructor pair.
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
	zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
}
3224
/*
 * NFSv2 server teardown counterpart of rfs_srvrinit(); nothing to do —
 * per-zone state is released by rfs_zone_fini() via the zone key.
 */
void
rfs_srvrfini(void)
{
}
3229
3230 /* ARGSUSED */
3231 static void *
3232 rfs_zone_init(zoneid_t zoneid)
3233 {
3234 nfs_srv_t *ns;
3235
3236 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3237
3238 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3239 ns->write_async = 1;
3240
3241 return (ns);
3242 }
3243
3244 /* ARGSUSED */
3245 static void
3246 rfs_zone_fini(zoneid_t zoneid, void *data)
3247 {
3248 nfs_srv_t *ns;
3249
3250 ns = (nfs_srv_t *)data;
3251 mutex_destroy(&ns->async_write_lock);
3252 kmem_free(ns, sizeof (*ns));
3253 }
3254
3255 static int
3256 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3257 {
3258 struct clist *wcl;
3259 int wlist_len;
3260 uint32_t count = rr->rr_count;
3261
3262 wcl = ra->ra_wlist;
3263
3264 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3265 return (FALSE);
3266 }
3267
3268 wcl = ra->ra_wlist;
3269 rr->rr_ok.rrok_wlist_len = wlist_len;
3270 rr->rr_ok.rrok_wlist = wcl;
3271
3272 return (TRUE);
3273 }