1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /*
29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 * All rights reserved.
31 */
32
33 /*
34 * Copyright 2018 Nexenta Systems, Inc.
35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 */
37
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/stat.h>
47 #include <sys/errno.h>
48 #include <sys/sysmacros.h>
49 #include <sys/statvfs.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/dirent.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/vtrace.h>
56 #include <sys/mode.h>
57 #include <sys/acl.h>
58 #include <sys/nbmlock.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/svc.h>
65
66 #include <nfs/nfs.h>
67 #include <nfs/export.h>
68 #include <nfs/nfs_cmd.h>
69
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_kmem.h>
75
76 #include <sys/strsubr.h>
77
struct rfs_async_write_list;

/*
 * Zone globals of NFSv2 server
 */
typedef struct nfs_srv {
	/* protects async_write_head and the per-file cluster lists */
	kmutex_t async_write_lock;
	/* list of in-progress clustered write requests, one entry per file */
	struct rfs_async_write_list *async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int write_async;
} nfs_srv_t;
92
93 /*
94 * These are the interface routines for the server side of the
95 * Network File System. See the NFS version 2 protocol specification
96 * for a description of this interface.
97 */
98
static int sattr_to_vattr(struct nfssattr *, struct vattr *);
static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
    cred_t *);
static void *rfs_zone_init(zoneid_t zoneid);
static void rfs_zone_fini(zoneid_t zoneid, void *data);


/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/* caller_context_t id identifying NFSv2 server VOP calls (delegation checks) */
u_longlong_t nfs2_srv_caller_id;
/* key for the per-zone nfs_srv_t (see zone_getspecific() in rfs_write()) */
static zone_key_t rfs_zone_key;
117
118 /*
119 * Get file attributes.
120 * Returns the current attributes of the file with the given fhandle.
121 */
122 /* ARGSUSED */
123 void
124 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
125 struct svc_req *req, cred_t *cr, bool_t ro)
126 {
127 int error;
128 vnode_t *vp;
129 struct vattr va;
130
131 vp = nfs_fhtovp(fhp, exi);
132 if (vp == NULL) {
133 ns->ns_status = NFSERR_STALE;
134 return;
135 }
136
137 /*
138 * Do the getattr.
139 */
140 va.va_mask = AT_ALL; /* we want all the attributes */
141
142 error = rfs4_delegated_getattr(vp, &va, 0, cr);
143
144 /* check for overflows */
145 if (!error) {
146 /* Lie about the object type for a referral */
147 if (vn_is_nfs_reparse(vp, cr))
148 va.va_type = VLNK;
149
150 acl_perm(vp, exi, &va, cr);
151 error = vattr_to_nattr(&va, &ns->ns_attr);
152 }
153
154 VN_RELE(vp);
155
156 ns->ns_status = puterrno(error);
157 }
158 void *
159 rfs_getattr_getfh(fhandle_t *fhp)
160 {
161 return (fhp);
162 }
163
164 /*
165 * Set file attributes.
166 * Sets the attributes of the file with the given fhandle. Returns
167 * the new attributes.
168 */
169 /* ARGSUSED */
170 void
171 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
172 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
173 {
174 int error;
175 int flag;
176 int in_crit = 0;
177 vnode_t *vp;
178 struct vattr va;
179 struct vattr bva;
180 struct flock64 bf;
181 caller_context_t ct;
182
183
184 vp = nfs_fhtovp(&args->saa_fh, exi);
185 if (vp == NULL) {
186 ns->ns_status = NFSERR_STALE;
187 return;
188 }
189
190 if (rdonly(ro, vp)) {
191 VN_RELE(vp);
192 ns->ns_status = NFSERR_ROFS;
193 return;
194 }
195
196 error = sattr_to_vattr(&args->saa_sa, &va);
197 if (error) {
198 VN_RELE(vp);
199 ns->ns_status = puterrno(error);
200 return;
201 }
202
203 /*
204 * If the client is requesting a change to the mtime,
205 * but the nanosecond field is set to 1 billion, then
206 * this is a flag to the server that it should set the
207 * atime and mtime fields to the server's current time.
208 * The 1 billion number actually came from the client
209 * as 1 million, but the units in the over the wire
210 * request are microseconds instead of nanoseconds.
211 *
212 * This is an overload of the protocol and should be
213 * documented in the NFS Version 2 protocol specification.
214 */
215 if (va.va_mask & AT_MTIME) {
216 if (va.va_mtime.tv_nsec == 1000000000) {
217 gethrestime(&va.va_mtime);
218 va.va_atime = va.va_mtime;
219 va.va_mask |= AT_ATIME;
220 flag = 0;
221 } else
222 flag = ATTR_UTIME;
223 } else
224 flag = 0;
225
226 /*
227 * If the filesystem is exported with nosuid, then mask off
228 * the setuid and setgid bits.
229 */
230 if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
231 (exi->exi_export.ex_flags & EX_NOSUID))
232 va.va_mode &= ~(VSUID | VSGID);
233
234 ct.cc_sysid = 0;
235 ct.cc_pid = 0;
236 ct.cc_caller_id = nfs2_srv_caller_id;
237 ct.cc_flags = CC_DONTBLOCK;
238
239 /*
240 * We need to specially handle size changes because it is
241 * possible for the client to create a file with modes
242 * which indicate read-only, but with the file opened for
243 * writing. If the client then tries to set the size of
244 * the file, then the normal access checking done in
245 * VOP_SETATTR would prevent the client from doing so,
246 * although it should be legal for it to do so. To get
247 * around this, we do the access checking for ourselves
248 * and then use VOP_SPACE which doesn't do the access
249 * checking which VOP_SETATTR does. VOP_SPACE can only
250 * operate on VREG files, let VOP_SETATTR handle the other
251 * extremely rare cases.
252 * Also the client should not be allowed to change the
253 * size of the file if there is a conflicting non-blocking
254 * mandatory lock in the region of change.
255 */
256 if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
257 if (nbl_need_check(vp)) {
258 nbl_start_crit(vp, RW_READER);
259 in_crit = 1;
260 }
261
262 bva.va_mask = AT_UID | AT_SIZE;
263
264 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
265
266 if (error) {
267 if (in_crit)
268 nbl_end_crit(vp);
269 VN_RELE(vp);
270 ns->ns_status = puterrno(error);
271 return;
272 }
273
274 if (in_crit) {
275 u_offset_t offset;
276 ssize_t length;
277
278 if (va.va_size < bva.va_size) {
279 offset = va.va_size;
280 length = bva.va_size - va.va_size;
281 } else {
282 offset = bva.va_size;
283 length = va.va_size - bva.va_size;
284 }
285 if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
286 NULL)) {
287 error = EACCES;
288 }
289 }
290
291 if (crgetuid(cr) == bva.va_uid && !error &&
292 va.va_size != bva.va_size) {
293 va.va_mask &= ~AT_SIZE;
294 bf.l_type = F_WRLCK;
295 bf.l_whence = 0;
296 bf.l_start = (off64_t)va.va_size;
297 bf.l_len = 0;
298 bf.l_sysid = 0;
299 bf.l_pid = 0;
300
301 error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
302 (offset_t)va.va_size, cr, &ct);
303 }
304 if (in_crit)
305 nbl_end_crit(vp);
306 } else
307 error = 0;
308
309 /*
310 * Do the setattr.
311 */
312 if (!error && va.va_mask) {
313 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
314 }
315
316 /*
317 * check if the monitor on either vop_space or vop_setattr detected
318 * a delegation conflict and if so, mark the thread flag as
319 * wouldblock so that the response is dropped and the client will
320 * try again.
321 */
322 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
323 VN_RELE(vp);
324 curthread->t_flag |= T_WOULDBLOCK;
325 return;
326 }
327
328 if (!error) {
329 va.va_mask = AT_ALL; /* get everything */
330
331 error = rfs4_delegated_getattr(vp, &va, 0, cr);
332
333 /* check for overflows */
334 if (!error) {
335 acl_perm(vp, exi, &va, cr);
336 error = vattr_to_nattr(&va, &ns->ns_attr);
337 }
338 }
339
340 ct.cc_flags = 0;
341
342 /*
343 * Force modified metadata out to stable storage.
344 */
345 (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
346
347 VN_RELE(vp);
348
349 ns->ns_status = puterrno(error);
350 }
351 void *
352 rfs_setattr_getfh(struct nfssaargs *args)
353 {
354 return (&args->saa_fh);
355 }
356
/*
 * Cross a mount point during a lookup.
 *
 * On entry *vpp is a held mount point vnode and *exip a held export.
 * If the filesystem mounted on it is itself exported with EX_NOHIDE,
 * swap *vpp/*exip for the submount's root vnode and export (the old
 * holds are released).  An unexported or non-nohide submount is not
 * an error; the caller's vnode/export are simply left unchanged.
 *
 * Change and release @exip and @vpp only in success
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* take our own hold; traverse() consumes it and returns a new vp */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	/* get the fid of the mounted-over filesystem's root */
	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	/* is the traversed-to filesystem exported with nohide? */
	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
402
403 /*
404 * Given mounted "dvp" and "exi", go upper mountpoint
405 * with dvp/exi correction
406 * Return 0 in success
407 */
408 int
409 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
410 {
411 struct exportinfo *exi;
412 vnode_t *dvp = *dvpp;
413
414 ASSERT3P((*exip)->exi_zone, ==, curzone);
415 ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));
416
417 VN_HOLD(dvp);
418 dvp = untraverse(dvp);
419 exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
420 if (exi == NULL) {
421 VN_RELE(dvp);
422 return (-1);
423 }
424
425 ASSERT3P(exi->exi_zone, ==, curzone);
426 exi_rele(*exip);
427 *exip = exi;
428 VN_RELE(*dvpp);
429 *dvpp = dvp;
430
431 return (0);
432 }
433 /*
434 * Directory lookup.
435 * Returns an fhandle and file attributes for file name in a directory.
436 */
437 /* ARGSUSED */
438 void
439 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
440 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
441 {
442 int error;
443 vnode_t *dvp;
444 vnode_t *vp;
445 struct vattr va;
446 fhandle_t *fhp = da->da_fhandle;
447 struct sec_ol sec = {0, 0};
448 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
449 char *name;
450 struct sockaddr *ca;
451
452 /*
453 * Trusted Extension doesn't support NFSv2. MOUNT
454 * will reject v2 clients. Need to prevent v2 client
455 * access via WebNFS here.
456 */
457 if (is_system_labeled() && req->rq_vers == 2) {
458 dr->dr_status = NFSERR_ACCES;
459 return;
460 }
461
462 /*
463 * Disallow NULL paths
464 */
465 if (da->da_name == NULL || *da->da_name == '\0') {
466 dr->dr_status = NFSERR_ACCES;
467 return;
468 }
469
470 /*
471 * Allow lookups from the root - the default
472 * location of the public filehandle.
473 */
474 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
475 dvp = ZONE_ROOTVP();
476 VN_HOLD(dvp);
477 } else {
478 dvp = nfs_fhtovp(fhp, exi);
479 if (dvp == NULL) {
480 dr->dr_status = NFSERR_STALE;
481 return;
482 }
483 }
484
485 exi_hold(exi);
486 ASSERT3P(exi->exi_zone, ==, curzone);
487
488 /*
489 * Not allow lookup beyond root.
490 * If the filehandle matches a filehandle of the exi,
491 * then the ".." refers beyond the root of an exported filesystem.
492 */
493 if (strcmp(da->da_name, "..") == 0 &&
494 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
495 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
496 ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
497 /*
498 * special case for ".." and 'nohide'exported root
499 */
500 if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
501 error = NFSERR_ACCES;
502 goto out;
503 }
504 } else {
505 error = NFSERR_NOENT;
506 goto out;
507 }
508 }
509
510 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
511 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
512 MAXPATHLEN);
513
514 if (name == NULL) {
515 error = NFSERR_ACCES;
516 goto out;
517 }
518
519 /*
520 * If the public filehandle is used then allow
521 * a multi-component lookup, i.e. evaluate
522 * a pathname and follow symbolic links if
523 * necessary.
524 *
525 * This may result in a vnode in another filesystem
526 * which is OK as long as the filesystem is exported.
527 */
528 if (PUBLIC_FH2(fhp)) {
529 publicfh_flag = TRUE;
530
531 exi_rele(exi);
532
533 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
534 &sec);
535 } else {
536 /*
537 * Do a normal single component lookup.
538 */
539 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
540 NULL, NULL, NULL);
541 }
542
543 if (name != da->da_name)
544 kmem_free(name, MAXPATHLEN);
545
546 if (error == 0 && vn_ismntpt(vp)) {
547 error = rfs_cross_mnt(&vp, &exi);
548 if (error)
549 VN_RELE(vp);
550 }
551
552 if (!error) {
553 va.va_mask = AT_ALL; /* we want everything */
554
555 error = rfs4_delegated_getattr(vp, &va, 0, cr);
556
557 /* check for overflows */
558 if (!error) {
559 acl_perm(vp, exi, &va, cr);
560 error = vattr_to_nattr(&va, &dr->dr_attr);
561 if (!error) {
562 if (sec.sec_flags & SEC_QUERY)
563 error = makefh_ol(&dr->dr_fhandle, exi,
564 sec.sec_index);
565 else {
566 error = makefh(&dr->dr_fhandle, vp,
567 exi);
568 if (!error && publicfh_flag &&
569 !chk_clnt_sec(exi, req))
570 auth_weak = TRUE;
571 }
572 }
573 }
574 VN_RELE(vp);
575 }
576
577 out:
578 VN_RELE(dvp);
579
580 if (exi != NULL)
581 exi_rele(exi);
582
583 /*
584 * If it's public fh, no 0x81, and client's flavor is
585 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
586 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
587 */
588 if (auth_weak)
589 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
590 else
591 dr->dr_status = puterrno(error);
592 }
593 void *
594 rfs_lookup_getfh(struct nfsdiropargs *da)
595 {
596 return (da->da_fhandle);
597 }
598
599 /*
600 * Read symbolic link.
601 * Returns the string in the symbolic link at the given fhandle.
602 */
603 /* ARGSUSED */
604 void
605 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
606 struct svc_req *req, cred_t *cr, bool_t ro)
607 {
608 int error;
609 struct iovec iov;
610 struct uio uio;
611 vnode_t *vp;
612 struct vattr va;
613 struct sockaddr *ca;
614 char *name = NULL;
615 int is_referral = 0;
616
617 vp = nfs_fhtovp(fhp, exi);
618 if (vp == NULL) {
619 rl->rl_data = NULL;
620 rl->rl_status = NFSERR_STALE;
621 return;
622 }
623
624 va.va_mask = AT_MODE;
625
626 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
627
628 if (error) {
629 VN_RELE(vp);
630 rl->rl_data = NULL;
631 rl->rl_status = puterrno(error);
632 return;
633 }
634
635 if (MANDLOCK(vp, va.va_mode)) {
636 VN_RELE(vp);
637 rl->rl_data = NULL;
638 rl->rl_status = NFSERR_ACCES;
639 return;
640 }
641
642 /* We lied about the object type for a referral */
643 if (vn_is_nfs_reparse(vp, cr))
644 is_referral = 1;
645
646 /*
647 * XNFS and RFC1094 require us to return ENXIO if argument
648 * is not a link. BUGID 1138002.
649 */
650 if (vp->v_type != VLNK && !is_referral) {
651 VN_RELE(vp);
652 rl->rl_data = NULL;
653 rl->rl_status = NFSERR_NXIO;
654 return;
655 }
656
657 /*
658 * Allocate data for pathname. This will be freed by rfs_rlfree.
659 */
660 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
661
662 if (is_referral) {
663 char *s;
664 size_t strsz;
665
666 /* Get an artificial symlink based on a referral */
667 s = build_symlink(vp, cr, &strsz);
668 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
669 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
670 vnode_t *, vp, char *, s);
671 if (s == NULL)
672 error = EINVAL;
673 else {
674 error = 0;
675 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
676 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
677 kmem_free(s, strsz);
678 }
679
680 } else {
681
682 /*
683 * Set up io vector to read sym link data
684 */
685 iov.iov_base = rl->rl_data;
686 iov.iov_len = NFS_MAXPATHLEN;
687 uio.uio_iov = &iov;
688 uio.uio_iovcnt = 1;
689 uio.uio_segflg = UIO_SYSSPACE;
690 uio.uio_extflg = UIO_COPY_CACHED;
691 uio.uio_loffset = (offset_t)0;
692 uio.uio_resid = NFS_MAXPATHLEN;
693
694 /*
695 * Do the readlink.
696 */
697 error = VOP_READLINK(vp, &uio, cr, NULL);
698
699 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
700
701 if (!error)
702 rl->rl_data[rl->rl_count] = '\0';
703
704 }
705
706
707 VN_RELE(vp);
708
709 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
710 name = nfscmd_convname(ca, exi, rl->rl_data,
711 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
712
713 if (name != NULL && name != rl->rl_data) {
714 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
715 rl->rl_data = name;
716 }
717
718 /*
719 * XNFS and RFC1094 require us to return ENXIO if argument
720 * is not a link. UFS returns EINVAL if this is the case,
721 * so we do the mapping here. BUGID 1138002.
722 */
723 if (error == EINVAL)
724 rl->rl_status = NFSERR_NXIO;
725 else
726 rl->rl_status = puterrno(error);
727
728 }
729 void *
730 rfs_readlink_getfh(fhandle_t *fhp)
731 {
732 return (fhp);
733 }
734 /*
735 * Free data allocated by rfs_readlink
736 */
737 void
738 rfs_rlfree(struct nfsrdlnres *rl)
739 {
740 if (rl->rl_data != NULL)
741 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
742 }
743
744 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
745
746 /*
747 * Read data.
748 * Returns some data read from the file at the given fhandle.
749 */
750 /* ARGSUSED */
751 void
752 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
753 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
754 {
755 vnode_t *vp;
756 int error;
757 struct vattr va;
758 struct iovec iov;
759 struct uio uio;
760 mblk_t *mp;
761 int alloc_err = 0;
762 int in_crit = 0;
763 caller_context_t ct;
764
765 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
766 if (vp == NULL) {
767 rr->rr_data = NULL;
768 rr->rr_status = NFSERR_STALE;
769 return;
770 }
771
772 if (vp->v_type != VREG) {
773 VN_RELE(vp);
774 rr->rr_data = NULL;
775 rr->rr_status = NFSERR_ISDIR;
776 return;
777 }
778
779 ct.cc_sysid = 0;
780 ct.cc_pid = 0;
781 ct.cc_caller_id = nfs2_srv_caller_id;
782 ct.cc_flags = CC_DONTBLOCK;
783
784 /*
785 * Enter the critical region before calling VOP_RWLOCK
786 * to avoid a deadlock with write requests.
787 */
788 if (nbl_need_check(vp)) {
789 nbl_start_crit(vp, RW_READER);
790 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
791 0, NULL)) {
792 nbl_end_crit(vp);
793 VN_RELE(vp);
794 rr->rr_data = NULL;
795 rr->rr_status = NFSERR_ACCES;
796 return;
797 }
798 in_crit = 1;
799 }
800
801 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
802
803 /* check if a monitor detected a delegation conflict */
804 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
805 if (in_crit)
806 nbl_end_crit(vp);
807 VN_RELE(vp);
808 /* mark as wouldblock so response is dropped */
809 curthread->t_flag |= T_WOULDBLOCK;
810
811 rr->rr_data = NULL;
812 return;
813 }
814
815 va.va_mask = AT_ALL;
816
817 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
818
819 if (error) {
820 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
821 if (in_crit)
822 nbl_end_crit(vp);
823
824 VN_RELE(vp);
825 rr->rr_data = NULL;
826 rr->rr_status = puterrno(error);
827
828 return;
829 }
830
831 /*
832 * This is a kludge to allow reading of files created
833 * with no read permission. The owner of the file
834 * is always allowed to read it.
835 */
836 if (crgetuid(cr) != va.va_uid) {
837 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
838
839 if (error) {
840 /*
841 * Exec is the same as read over the net because
842 * of demand loading.
843 */
844 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
845 }
846 if (error) {
847 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
848 if (in_crit)
849 nbl_end_crit(vp);
850 VN_RELE(vp);
851 rr->rr_data = NULL;
852 rr->rr_status = puterrno(error);
853
854 return;
855 }
856 }
857
858 if (MANDLOCK(vp, va.va_mode)) {
859 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
860 if (in_crit)
861 nbl_end_crit(vp);
862
863 VN_RELE(vp);
864 rr->rr_data = NULL;
865 rr->rr_status = NFSERR_ACCES;
866
867 return;
868 }
869
870 rr->rr_ok.rrok_wlist_len = 0;
871 rr->rr_ok.rrok_wlist = NULL;
872
873 if ((u_offset_t)ra->ra_offset >= va.va_size) {
874 rr->rr_count = 0;
875 rr->rr_data = NULL;
876 /*
877 * In this case, status is NFS_OK, but there is no data
878 * to encode. So set rr_mp to NULL.
879 */
880 rr->rr_mp = NULL;
881 rr->rr_ok.rrok_wlist = ra->ra_wlist;
882 if (rr->rr_ok.rrok_wlist)
883 clist_zero_len(rr->rr_ok.rrok_wlist);
884 goto done;
885 }
886
887 if (ra->ra_wlist) {
888 mp = NULL;
889 rr->rr_mp = NULL;
890 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
891 if (ra->ra_count > iov.iov_len) {
892 rr->rr_data = NULL;
893 rr->rr_status = NFSERR_INVAL;
894 goto done;
895 }
896 } else {
897 /*
898 * mp will contain the data to be sent out in the read reply.
899 * This will be freed after the reply has been sent out (by the
900 * driver).
901 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
902 * that the call to xdrmblk_putmblk() never fails.
903 */
904 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
905 &alloc_err);
906 ASSERT(mp != NULL);
907 ASSERT(alloc_err == 0);
908
909 rr->rr_mp = mp;
910
911 /*
912 * Set up io vector
913 */
914 iov.iov_base = (caddr_t)mp->b_datap->db_base;
915 iov.iov_len = ra->ra_count;
916 }
917
918 uio.uio_iov = &iov;
919 uio.uio_iovcnt = 1;
920 uio.uio_segflg = UIO_SYSSPACE;
921 uio.uio_extflg = UIO_COPY_CACHED;
922 uio.uio_loffset = (offset_t)ra->ra_offset;
923 uio.uio_resid = ra->ra_count;
924
925 error = VOP_READ(vp, &uio, 0, cr, &ct);
926
927 if (error) {
928 if (mp)
929 freeb(mp);
930
931 /*
932 * check if a monitor detected a delegation conflict and
933 * mark as wouldblock so response is dropped
934 */
935 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
936 curthread->t_flag |= T_WOULDBLOCK;
937 else
938 rr->rr_status = puterrno(error);
939
940 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
941 if (in_crit)
942 nbl_end_crit(vp);
943
944 VN_RELE(vp);
945 rr->rr_data = NULL;
946
947 return;
948 }
949
950 /*
951 * Get attributes again so we can send the latest access
952 * time to the client side for its cache.
953 */
954 va.va_mask = AT_ALL;
955
956 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
957
958 if (error) {
959 if (mp)
960 freeb(mp);
961
962 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
963 if (in_crit)
964 nbl_end_crit(vp);
965
966 VN_RELE(vp);
967 rr->rr_data = NULL;
968 rr->rr_status = puterrno(error);
969
970 return;
971 }
972
973 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
974
975 if (mp) {
976 rr->rr_data = (char *)mp->b_datap->db_base;
977 } else {
978 if (ra->ra_wlist) {
979 rr->rr_data = (caddr_t)iov.iov_base;
980 if (!rdma_setup_read_data2(ra, rr)) {
981 rr->rr_data = NULL;
982 rr->rr_status = puterrno(NFSERR_INVAL);
983 }
984 }
985 }
986 done:
987 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
988 if (in_crit)
989 nbl_end_crit(vp);
990
991 acl_perm(vp, exi, &va, cr);
992
993 /* check for overflows */
994 error = vattr_to_nattr(&va, &rr->rr_attr);
995
996 VN_RELE(vp);
997
998 rr->rr_status = puterrno(error);
999 }
1000
1001 /*
1002 * Free data allocated by rfs_read
1003 */
1004 void
1005 rfs_rdfree(struct nfsrdresult *rr)
1006 {
1007 mblk_t *mp;
1008
1009 if (rr->rr_status == NFS_OK) {
1010 mp = rr->rr_mp;
1011 if (mp != NULL)
1012 freeb(mp);
1013 }
1014 }
1015
1016 void *
1017 rfs_read_getfh(struct nfsreadargs *ra)
1018 {
1019 return (&ra->ra_fhandle);
1020 }
1021
/* iovecs kept on the stack by rfs_write_sync(); larger needs kmem_alloc */
#define	MAX_IOVECS	12

#ifdef DEBUG
/* how often the on-stack iovec array was sufficient (hit) or not (miss) */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1028
1029 /*
1030 * Write data to file.
1031 * Returns attributes of a file after writing some data to it.
1032 *
1033 * Any changes made here, especially in error handling might have
1034 * to also be done in rfs_write (which clusters write requests).
1035 */
1036 /* ARGSUSED */
1037 void
1038 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1039 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1040 {
1041 int error;
1042 vnode_t *vp;
1043 rlim64_t rlimit;
1044 struct vattr va;
1045 struct uio uio;
1046 struct iovec iov[MAX_IOVECS];
1047 mblk_t *m;
1048 struct iovec *iovp;
1049 int iovcnt;
1050 cred_t *savecred;
1051 int in_crit = 0;
1052 caller_context_t ct;
1053
1054 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1055 if (vp == NULL) {
1056 ns->ns_status = NFSERR_STALE;
1057 return;
1058 }
1059
1060 if (rdonly(ro, vp)) {
1061 VN_RELE(vp);
1062 ns->ns_status = NFSERR_ROFS;
1063 return;
1064 }
1065
1066 if (vp->v_type != VREG) {
1067 VN_RELE(vp);
1068 ns->ns_status = NFSERR_ISDIR;
1069 return;
1070 }
1071
1072 ct.cc_sysid = 0;
1073 ct.cc_pid = 0;
1074 ct.cc_caller_id = nfs2_srv_caller_id;
1075 ct.cc_flags = CC_DONTBLOCK;
1076
1077 va.va_mask = AT_UID|AT_MODE;
1078
1079 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1080
1081 if (error) {
1082 VN_RELE(vp);
1083 ns->ns_status = puterrno(error);
1084
1085 return;
1086 }
1087
1088 if (crgetuid(cr) != va.va_uid) {
1089 /*
1090 * This is a kludge to allow writes of files created
1091 * with read only permission. The owner of the file
1092 * is always allowed to write it.
1093 */
1094 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1095
1096 if (error) {
1097 VN_RELE(vp);
1098 ns->ns_status = puterrno(error);
1099 return;
1100 }
1101 }
1102
1103 /*
1104 * Can't access a mandatory lock file. This might cause
1105 * the NFS service thread to block forever waiting for a
1106 * lock to be released that will never be released.
1107 */
1108 if (MANDLOCK(vp, va.va_mode)) {
1109 VN_RELE(vp);
1110 ns->ns_status = NFSERR_ACCES;
1111 return;
1112 }
1113
1114 /*
1115 * We have to enter the critical region before calling VOP_RWLOCK
1116 * to avoid a deadlock with ufs.
1117 */
1118 if (nbl_need_check(vp)) {
1119 nbl_start_crit(vp, RW_READER);
1120 in_crit = 1;
1121 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1122 wa->wa_count, 0, NULL)) {
1123 error = EACCES;
1124 goto out;
1125 }
1126 }
1127
1128 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1129
1130 /* check if a monitor detected a delegation conflict */
1131 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1132 goto out;
1133 }
1134
1135 if (wa->wa_data || wa->wa_rlist) {
1136 /* Do the RDMA thing if necessary */
1137 if (wa->wa_rlist) {
1138 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1139 iov[0].iov_len = wa->wa_count;
1140 } else {
1141 iov[0].iov_base = wa->wa_data;
1142 iov[0].iov_len = wa->wa_count;
1143 }
1144 uio.uio_iov = iov;
1145 uio.uio_iovcnt = 1;
1146 uio.uio_segflg = UIO_SYSSPACE;
1147 uio.uio_extflg = UIO_COPY_DEFAULT;
1148 uio.uio_loffset = (offset_t)wa->wa_offset;
1149 uio.uio_resid = wa->wa_count;
1150 /*
1151 * The limit is checked on the client. We
1152 * should allow any size writes here.
1153 */
1154 uio.uio_llimit = curproc->p_fsz_ctl;
1155 rlimit = uio.uio_llimit - wa->wa_offset;
1156 if (rlimit < (rlim64_t)uio.uio_resid)
1157 uio.uio_resid = (uint_t)rlimit;
1158
1159 /*
1160 * for now we assume no append mode
1161 */
1162 /*
1163 * We're changing creds because VM may fault and we need
1164 * the cred of the current thread to be used if quota
1165 * checking is enabled.
1166 */
1167 savecred = curthread->t_cred;
1168 curthread->t_cred = cr;
1169 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1170 curthread->t_cred = savecred;
1171 } else {
1172
1173 iovcnt = 0;
1174 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1175 iovcnt++;
1176 if (iovcnt <= MAX_IOVECS) {
1177 #ifdef DEBUG
1178 rfs_write_sync_hits++;
1179 #endif
1180 iovp = iov;
1181 } else {
1182 #ifdef DEBUG
1183 rfs_write_sync_misses++;
1184 #endif
1185 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1186 }
1187 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1188 uio.uio_iov = iovp;
1189 uio.uio_iovcnt = iovcnt;
1190 uio.uio_segflg = UIO_SYSSPACE;
1191 uio.uio_extflg = UIO_COPY_DEFAULT;
1192 uio.uio_loffset = (offset_t)wa->wa_offset;
1193 uio.uio_resid = wa->wa_count;
1194 /*
1195 * The limit is checked on the client. We
1196 * should allow any size writes here.
1197 */
1198 uio.uio_llimit = curproc->p_fsz_ctl;
1199 rlimit = uio.uio_llimit - wa->wa_offset;
1200 if (rlimit < (rlim64_t)uio.uio_resid)
1201 uio.uio_resid = (uint_t)rlimit;
1202
1203 /*
1204 * For now we assume no append mode.
1205 */
1206 /*
1207 * We're changing creds because VM may fault and we need
1208 * the cred of the current thread to be used if quota
1209 * checking is enabled.
1210 */
1211 savecred = curthread->t_cred;
1212 curthread->t_cred = cr;
1213 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1214 curthread->t_cred = savecred;
1215
1216 if (iovp != iov)
1217 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1218 }
1219
1220 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1221
1222 if (!error) {
1223 /*
1224 * Get attributes again so we send the latest mod
1225 * time to the client side for its cache.
1226 */
1227 va.va_mask = AT_ALL; /* now we want everything */
1228
1229 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1230
1231 /* check for overflows */
1232 if (!error) {
1233 acl_perm(vp, exi, &va, cr);
1234 error = vattr_to_nattr(&va, &ns->ns_attr);
1235 }
1236 }
1237
1238 out:
1239 if (in_crit)
1240 nbl_end_crit(vp);
1241 VN_RELE(vp);
1242
1243 /* check if a monitor detected a delegation conflict */
1244 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1245 /* mark as wouldblock so response is dropped */
1246 curthread->t_flag |= T_WOULDBLOCK;
1247 else
1248 ns->ns_status = puterrno(error);
1249
1250 }
1251
/* One pending clustered WRITE request; lists are kept in offset order. */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* decoded WRITE arguments */
	struct nfsattrstat *ns;		/* reply/status for this request */
	struct svc_req *req;		/* RPC request handle */
	cred_t *cr;			/* caller's credentials */
	bool_t ro;			/* export mounted read-only */
	kthread_t *thread;		/* service thread parked on this entry */
	struct rfs_async_write *list;	/* next request in the same cluster */
};

/* Per-file cluster of pending writes, keyed by file handle. */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by all entries */
	kcondvar_t cv;			/* waiters signalled once processed */
	struct rfs_async_write *list;	/* offset-sorted request list */
	struct rfs_async_write_list *next;	/* cluster for the next file */
};
1268
/*
 * NOTE(review): these file-global cluster variables appear superseded by
 * the per-zone copies in nfs_srv_t (rfs_write() uses
 * zone_getspecific(rfs_zone_key, ...) instead) -- confirm nothing else in
 * this file still references them before removing.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* max iovecs assembled when coalescing a cluster of writes */
#define	MAXCLIOVECS	42
/* "no status yet" sentinel; 0 would be mistaken for NFS_OK */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
/* counters for clustered (hit) vs. solo (miss) write handling */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1280
1281 /*
1282 * Write data to file.
1283 * Returns attributes of a file after writing some data to it.
1284 */
1285 void
1286 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1287 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1288 {
1289 int error;
1290 vnode_t *vp;
1291 rlim64_t rlimit;
1292 struct vattr va;
1293 struct uio uio;
1294 struct rfs_async_write_list *lp;
1295 struct rfs_async_write_list *nlp;
1296 struct rfs_async_write *rp;
1297 struct rfs_async_write *nrp;
1298 struct rfs_async_write *trp;
1299 struct rfs_async_write *lrp;
1300 int data_written;
1301 int iovcnt;
1302 mblk_t *m;
1303 struct iovec *iovp;
1304 struct iovec *niovp;
1305 struct iovec iov[MAXCLIOVECS];
1306 int count;
1307 int rcount;
1308 uint_t off;
1309 uint_t len;
1310 struct rfs_async_write nrpsp;
1311 struct rfs_async_write_list nlpsp;
1312 ushort_t t_flag;
1313 cred_t *savecred;
1314 int in_crit = 0;
1315 caller_context_t ct;
1316 nfs_srv_t *nsrv;
1317
1318 ASSERT3P(curzone, ==, ((exi == NULL) ? curzone : exi->exi_zone));
1319 nsrv = zone_getspecific(rfs_zone_key, curzone);
1320 if (!nsrv->write_async) {
1321 rfs_write_sync(wa, ns, exi, req, cr, ro);
1322 return;
1323 }
1324
1325 /*
1326 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1327 * is considered an OK.
1328 */
1329 ns->ns_status = RFSWRITE_INITVAL;
1330
1331 nrp = &nrpsp;
1332 nrp->wa = wa;
1333 nrp->ns = ns;
1334 nrp->req = req;
1335 nrp->cr = cr;
1336 nrp->ro = ro;
1337 nrp->thread = curthread;
1338
1339 ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1340
1341 /*
1342 * Look to see if there is already a cluster started
1343 * for this file.
1344 */
1345 mutex_enter(&nsrv->async_write_lock);
1346 for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1347 if (bcmp(&wa->wa_fhandle, lp->fhp,
1348 sizeof (fhandle_t)) == 0)
1349 break;
1350 }
1351
1352 /*
1353 * If lp is non-NULL, then there is already a cluster
1354 * started. We need to place ourselves in the cluster
1355 * list in the right place as determined by starting
1356 * offset. Conflicts with non-blocking mandatory locked
1357 * regions will be checked when the cluster is processed.
1358 */
1359 if (lp != NULL) {
1360 rp = lp->list;
1361 trp = NULL;
1362 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1363 trp = rp;
1364 rp = rp->list;
1365 }
1366 nrp->list = rp;
1367 if (trp == NULL)
1368 lp->list = nrp;
1369 else
1370 trp->list = nrp;
1371 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1372 cv_wait(&lp->cv, &nsrv->async_write_lock);
1373 mutex_exit(&nsrv->async_write_lock);
1374
1375 return;
1376 }
1377
1378 /*
1379 * No cluster started yet, start one and add ourselves
1380 * to the list of clusters.
1381 */
1382 nrp->list = NULL;
1383
1384 nlp = &nlpsp;
1385 nlp->fhp = &wa->wa_fhandle;
1386 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1387 nlp->list = nrp;
1388 nlp->next = NULL;
1389
1390 if (nsrv->async_write_head == NULL) {
1391 nsrv->async_write_head = nlp;
1392 } else {
1393 lp = nsrv->async_write_head;
1394 while (lp->next != NULL)
1395 lp = lp->next;
1396 lp->next = nlp;
1397 }
1398 mutex_exit(&nsrv->async_write_lock);
1399
1400 /*
1401 * Convert the file handle common to all of the requests
1402 * in this cluster to a vnode.
1403 */
1404 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1405 if (vp == NULL) {
1406 mutex_enter(&nsrv->async_write_lock);
1407 if (nsrv->async_write_head == nlp)
1408 nsrv->async_write_head = nlp->next;
1409 else {
1410 lp = nsrv->async_write_head;
1411 while (lp->next != nlp)
1412 lp = lp->next;
1413 lp->next = nlp->next;
1414 }
1415 t_flag = curthread->t_flag & T_WOULDBLOCK;
1416 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1417 rp->ns->ns_status = NFSERR_STALE;
1418 rp->thread->t_flag |= t_flag;
1419 }
1420 cv_broadcast(&nlp->cv);
1421 mutex_exit(&nsrv->async_write_lock);
1422
1423 return;
1424 }
1425
1426 /*
1427 * Can only write regular files. Attempts to write any
1428 * other file types fail with EISDIR.
1429 */
1430 if (vp->v_type != VREG) {
1431 VN_RELE(vp);
1432 mutex_enter(&nsrv->async_write_lock);
1433 if (nsrv->async_write_head == nlp)
1434 nsrv->async_write_head = nlp->next;
1435 else {
1436 lp = nsrv->async_write_head;
1437 while (lp->next != nlp)
1438 lp = lp->next;
1439 lp->next = nlp->next;
1440 }
1441 t_flag = curthread->t_flag & T_WOULDBLOCK;
1442 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1443 rp->ns->ns_status = NFSERR_ISDIR;
1444 rp->thread->t_flag |= t_flag;
1445 }
1446 cv_broadcast(&nlp->cv);
1447 mutex_exit(&nsrv->async_write_lock);
1448
1449 return;
1450 }
1451
1452 /*
1453 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1454 * deadlock with ufs.
1455 */
1456 if (nbl_need_check(vp)) {
1457 nbl_start_crit(vp, RW_READER);
1458 in_crit = 1;
1459 }
1460
1461 ct.cc_sysid = 0;
1462 ct.cc_pid = 0;
1463 ct.cc_caller_id = nfs2_srv_caller_id;
1464 ct.cc_flags = CC_DONTBLOCK;
1465
1466 /*
1467 * Lock the file for writing. This operation provides
1468 * the delay which allows clusters to grow.
1469 */
1470 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1471
1472 /* check if a monitor detected a delegation conflict */
1473 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1474 if (in_crit)
1475 nbl_end_crit(vp);
1476 VN_RELE(vp);
1477 /* mark as wouldblock so response is dropped */
1478 curthread->t_flag |= T_WOULDBLOCK;
1479 mutex_enter(&nsrv->async_write_lock);
1480 if (nsrv->async_write_head == nlp)
1481 nsrv->async_write_head = nlp->next;
1482 else {
1483 lp = nsrv->async_write_head;
1484 while (lp->next != nlp)
1485 lp = lp->next;
1486 lp->next = nlp->next;
1487 }
1488 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1489 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1490 rp->ns->ns_status = puterrno(error);
1491 rp->thread->t_flag |= T_WOULDBLOCK;
1492 }
1493 }
1494 cv_broadcast(&nlp->cv);
1495 mutex_exit(&nsrv->async_write_lock);
1496
1497 return;
1498 }
1499
1500 /*
1501 * Disconnect this cluster from the list of clusters.
1502 * The cluster that is being dealt with must be fixed
1503 * in size after this point, so there is no reason
1504 * to leave it on the list so that new requests can
1505 * find it.
1506 *
1507 * The algorithm is that the first write request will
1508 * create a cluster, convert the file handle to a
1509 * vnode pointer, and then lock the file for writing.
1510 * This request is not likely to be clustered with
1511 * any others. However, the next request will create
1512 * a new cluster and be blocked in VOP_RWLOCK while
1513 * the first request is being processed. This delay
1514 * will allow more requests to be clustered in this
1515 * second cluster.
1516 */
1517 mutex_enter(&nsrv->async_write_lock);
1518 if (nsrv->async_write_head == nlp)
1519 nsrv->async_write_head = nlp->next;
1520 else {
1521 lp = nsrv->async_write_head;
1522 while (lp->next != nlp)
1523 lp = lp->next;
1524 lp->next = nlp->next;
1525 }
1526 mutex_exit(&nsrv->async_write_lock);
1527
1528 /*
1529 * Step through the list of requests in this cluster.
1530 * We need to check permissions to make sure that all
1531 * of the requests have sufficient permission to write
1532 * the file. A cluster can be composed of requests
1533 * from different clients and different users on each
1534 * client.
1535 *
1536 * As a side effect, we also calculate the size of the
1537 * byte range that this cluster encompasses.
1538 */
1539 rp = nlp->list;
1540 off = rp->wa->wa_offset;
1541 len = (uint_t)0;
1542 do {
1543 if (rdonly(rp->ro, vp)) {
1544 rp->ns->ns_status = NFSERR_ROFS;
1545 t_flag = curthread->t_flag & T_WOULDBLOCK;
1546 rp->thread->t_flag |= t_flag;
1547 continue;
1548 }
1549
1550 va.va_mask = AT_UID|AT_MODE;
1551
1552 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1553
1554 if (!error) {
1555 if (crgetuid(rp->cr) != va.va_uid) {
1556 /*
1557 * This is a kludge to allow writes of files
1558 * created with read only permission. The
1559 * owner of the file is always allowed to
1560 * write it.
1561 */
1562 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1563 }
1564 if (!error && MANDLOCK(vp, va.va_mode))
1565 error = EACCES;
1566 }
1567
1568 /*
1569 * Check for a conflict with a nbmand-locked region.
1570 */
1571 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1572 rp->wa->wa_count, 0, NULL)) {
1573 error = EACCES;
1574 }
1575
1576 if (error) {
1577 rp->ns->ns_status = puterrno(error);
1578 t_flag = curthread->t_flag & T_WOULDBLOCK;
1579 rp->thread->t_flag |= t_flag;
1580 continue;
1581 }
1582 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1583 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1584 } while ((rp = rp->list) != NULL);
1585
1586 /*
1587 * Step through the cluster attempting to gather as many
1588 * requests which are contiguous as possible. These
1589 * contiguous requests are handled via one call to VOP_WRITE
1590 * instead of different calls to VOP_WRITE. We also keep
1591 * track of the fact that any data was written.
1592 */
1593 rp = nlp->list;
1594 data_written = 0;
1595 do {
1596 /*
1597 * Skip any requests which are already marked as having an
1598 * error.
1599 */
1600 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1601 rp = rp->list;
1602 continue;
1603 }
1604
1605 /*
1606 * Count the number of iovec's which are required
1607 * to handle this set of requests. One iovec is
1608 * needed for each data buffer, whether addressed
1609 * by wa_data or by the b_rptr pointers in the
1610 * mblk chains.
1611 */
1612 iovcnt = 0;
1613 lrp = rp;
1614 for (;;) {
1615 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1616 iovcnt++;
1617 else {
1618 m = lrp->wa->wa_mblk;
1619 while (m != NULL) {
1620 iovcnt++;
1621 m = m->b_cont;
1622 }
1623 }
1624 if (lrp->list == NULL ||
1625 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1626 lrp->wa->wa_offset + lrp->wa->wa_count !=
1627 lrp->list->wa->wa_offset) {
1628 lrp = lrp->list;
1629 break;
1630 }
1631 lrp = lrp->list;
1632 }
1633
1634 if (iovcnt <= MAXCLIOVECS) {
1635 #ifdef DEBUG
1636 rfs_write_hits++;
1637 #endif
1638 niovp = iov;
1639 } else {
1640 #ifdef DEBUG
1641 rfs_write_misses++;
1642 #endif
1643 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1644 }
1645 /*
1646 * Put together the scatter/gather iovecs.
1647 */
1648 iovp = niovp;
1649 trp = rp;
1650 count = 0;
1651 do {
1652 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1653 if (trp->wa->wa_rlist) {
1654 iovp->iov_base =
1655 (char *)((trp->wa->wa_rlist)->
1656 u.c_daddr3);
1657 iovp->iov_len = trp->wa->wa_count;
1658 } else {
1659 iovp->iov_base = trp->wa->wa_data;
1660 iovp->iov_len = trp->wa->wa_count;
1661 }
1662 iovp++;
1663 } else {
1664 m = trp->wa->wa_mblk;
1665 rcount = trp->wa->wa_count;
1666 while (m != NULL) {
1667 iovp->iov_base = (caddr_t)m->b_rptr;
1668 iovp->iov_len = (m->b_wptr - m->b_rptr);
1669 rcount -= iovp->iov_len;
1670 if (rcount < 0)
1671 iovp->iov_len += rcount;
1672 iovp++;
1673 if (rcount <= 0)
1674 break;
1675 m = m->b_cont;
1676 }
1677 }
1678 count += trp->wa->wa_count;
1679 trp = trp->list;
1680 } while (trp != lrp);
1681
1682 uio.uio_iov = niovp;
1683 uio.uio_iovcnt = iovcnt;
1684 uio.uio_segflg = UIO_SYSSPACE;
1685 uio.uio_extflg = UIO_COPY_DEFAULT;
1686 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1687 uio.uio_resid = count;
1688 /*
1689 * The limit is checked on the client. We
1690 * should allow any size writes here.
1691 */
1692 uio.uio_llimit = curproc->p_fsz_ctl;
1693 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1694 if (rlimit < (rlim64_t)uio.uio_resid)
1695 uio.uio_resid = (uint_t)rlimit;
1696
1697 /*
1698 * For now we assume no append mode.
1699 */
1700
1701 /*
1702 * We're changing creds because VM may fault
1703 * and we need the cred of the current
1704 * thread to be used if quota * checking is
1705 * enabled.
1706 */
1707 savecred = curthread->t_cred;
1708 curthread->t_cred = cr;
1709 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1710 curthread->t_cred = savecred;
1711
1712 /* check if a monitor detected a delegation conflict */
1713 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1714 /* mark as wouldblock so response is dropped */
1715 curthread->t_flag |= T_WOULDBLOCK;
1716
1717 if (niovp != iov)
1718 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1719
1720 if (!error) {
1721 data_written = 1;
1722 /*
1723 * Get attributes again so we send the latest mod
1724 * time to the client side for its cache.
1725 */
1726 va.va_mask = AT_ALL; /* now we want everything */
1727
1728 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1729
1730 if (!error)
1731 acl_perm(vp, exi, &va, rp->cr);
1732 }
1733
1734 /*
1735 * Fill in the status responses for each request
1736 * which was just handled. Also, copy the latest
1737 * attributes in to the attribute responses if
1738 * appropriate.
1739 */
1740 t_flag = curthread->t_flag & T_WOULDBLOCK;
1741 do {
1742 rp->thread->t_flag |= t_flag;
1743 /* check for overflows */
1744 if (!error) {
1745 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1746 }
1747 rp->ns->ns_status = puterrno(error);
1748 rp = rp->list;
1749 } while (rp != lrp);
1750 } while (rp != NULL);
1751
1752 /*
1753 * If any data was written at all, then we need to flush
1754 * the data and metadata to stable storage.
1755 */
1756 if (data_written) {
1757 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1758
1759 if (!error) {
1760 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1761 }
1762 }
1763
1764 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1765
1766 if (in_crit)
1767 nbl_end_crit(vp);
1768 VN_RELE(vp);
1769
1770 t_flag = curthread->t_flag & T_WOULDBLOCK;
1771 mutex_enter(&nsrv->async_write_lock);
1772 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1773 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1774 rp->ns->ns_status = puterrno(error);
1775 rp->thread->t_flag |= t_flag;
1776 }
1777 }
1778 cv_broadcast(&nlp->cv);
1779 mutex_exit(&nsrv->async_write_lock);
1780
1781 }
1782
1783 void *
1784 rfs_write_getfh(struct nfswriteargs *wa)
1785 {
1786 return (&wa->wa_fhandle);
1787 }
1788
1789 /*
1790 * Create a file.
1791 * Creates a file with given attributes and returns those attributes
1792 * and an fhandle for the new file.
1793 */
1794 void
1795 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1796 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1797 {
1798 int error;
1799 int lookuperr;
1800 int in_crit = 0;
1801 struct vattr va;
1802 vnode_t *vp;
1803 vnode_t *realvp;
1804 vnode_t *dvp;
1805 char *name = args->ca_da.da_name;
1806 vnode_t *tvp = NULL;
1807 int mode;
1808 int lookup_ok;
1809 bool_t trunc;
1810 struct sockaddr *ca;
1811
1812 /*
1813 * Disallow NULL paths
1814 */
1815 if (name == NULL || *name == '\0') {
1816 dr->dr_status = NFSERR_ACCES;
1817 return;
1818 }
1819
1820 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1821 if (dvp == NULL) {
1822 dr->dr_status = NFSERR_STALE;
1823 return;
1824 }
1825
1826 error = sattr_to_vattr(args->ca_sa, &va);
1827 if (error) {
1828 dr->dr_status = puterrno(error);
1829 return;
1830 }
1831
1832 /*
1833 * Must specify the mode.
1834 */
1835 if (!(va.va_mask & AT_MODE)) {
1836 VN_RELE(dvp);
1837 dr->dr_status = NFSERR_INVAL;
1838 return;
1839 }
1840
1841 /*
1842 * This is a completely gross hack to make mknod
1843 * work over the wire until we can wack the protocol
1844 */
1845 if ((va.va_mode & IFMT) == IFCHR) {
1846 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1847 va.va_type = VFIFO; /* xtra kludge for named pipe */
1848 else {
1849 va.va_type = VCHR;
1850 /*
1851 * uncompress the received dev_t
1852 * if the top half is zero indicating a request
1853 * from an `older style' OS.
1854 */
1855 if ((va.va_size & 0xffff0000) == 0)
1856 va.va_rdev = nfsv2_expdev(va.va_size);
1857 else
1858 va.va_rdev = (dev_t)va.va_size;
1859 }
1860 va.va_mask &= ~AT_SIZE;
1861 } else if ((va.va_mode & IFMT) == IFBLK) {
1862 va.va_type = VBLK;
1863 /*
1864 * uncompress the received dev_t
1865 * if the top half is zero indicating a request
1866 * from an `older style' OS.
1867 */
1868 if ((va.va_size & 0xffff0000) == 0)
1869 va.va_rdev = nfsv2_expdev(va.va_size);
1870 else
1871 va.va_rdev = (dev_t)va.va_size;
1872 va.va_mask &= ~AT_SIZE;
1873 } else if ((va.va_mode & IFMT) == IFSOCK) {
1874 va.va_type = VSOCK;
1875 } else {
1876 va.va_type = VREG;
1877 }
1878 va.va_mode &= ~IFMT;
1879 va.va_mask |= AT_TYPE;
1880
1881 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1882 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1883 MAXPATHLEN);
1884 if (name == NULL) {
1885 dr->dr_status = puterrno(EINVAL);
1886 return;
1887 }
1888
1889 /*
1890 * Why was the choice made to use VWRITE as the mode to the
1891 * call to VOP_CREATE ? This results in a bug. When a client
1892 * opens a file that already exists and is RDONLY, the second
1893 * open fails with an EACESS because of the mode.
1894 * bug ID 1054648.
1895 */
1896 lookup_ok = 0;
1897 mode = VWRITE;
1898 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1899 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1900 NULL, NULL, NULL);
1901 if (!error) {
1902 struct vattr at;
1903
1904 lookup_ok = 1;
1905 at.va_mask = AT_MODE;
1906 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1907 if (!error)
1908 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1909 VN_RELE(tvp);
1910 tvp = NULL;
1911 }
1912 }
1913
1914 if (!lookup_ok) {
1915 if (rdonly(ro, dvp)) {
1916 error = EROFS;
1917 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1918 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1919 error = EPERM;
1920 } else {
1921 error = 0;
1922 }
1923 }
1924
1925 /*
1926 * If file size is being modified on an already existing file
1927 * make sure that there are no conflicting non-blocking mandatory
1928 * locks in the region being manipulated. Return EACCES if there
1929 * are conflicting locks.
1930 */
1931 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1932 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1933 NULL, NULL, NULL);
1934
1935 if (!lookuperr &&
1936 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1937 VN_RELE(tvp);
1938 curthread->t_flag |= T_WOULDBLOCK;
1939 goto out;
1940 }
1941
1942 if (!lookuperr && nbl_need_check(tvp)) {
1943 /*
1944 * The file exists. Now check if it has any
1945 * conflicting non-blocking mandatory locks
1946 * in the region being changed.
1947 */
1948 struct vattr bva;
1949 u_offset_t offset;
1950 ssize_t length;
1951
1952 nbl_start_crit(tvp, RW_READER);
1953 in_crit = 1;
1954
1955 bva.va_mask = AT_SIZE;
1956 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1957 if (!error) {
1958 if (va.va_size < bva.va_size) {
1959 offset = va.va_size;
1960 length = bva.va_size - va.va_size;
1961 } else {
1962 offset = bva.va_size;
1963 length = va.va_size - bva.va_size;
1964 }
1965 if (length) {
1966 if (nbl_conflict(tvp, NBL_WRITE,
1967 offset, length, 0, NULL)) {
1968 error = EACCES;
1969 }
1970 }
1971 }
1972 if (error) {
1973 nbl_end_crit(tvp);
1974 VN_RELE(tvp);
1975 in_crit = 0;
1976 }
1977 } else if (tvp != NULL) {
1978 VN_RELE(tvp);
1979 }
1980 }
1981
1982 if (!error) {
1983 /*
1984 * If filesystem is shared with nosuid the remove any
1985 * setuid/setgid bits on create.
1986 */
1987 if (va.va_type == VREG &&
1988 exi->exi_export.ex_flags & EX_NOSUID)
1989 va.va_mode &= ~(VSUID | VSGID);
1990
1991 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1992 NULL, NULL);
1993
1994 if (!error) {
1995
1996 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1997 trunc = TRUE;
1998 else
1999 trunc = FALSE;
2000
2001 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2002 VN_RELE(vp);
2003 curthread->t_flag |= T_WOULDBLOCK;
2004 goto out;
2005 }
2006 va.va_mask = AT_ALL;
2007
2008 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2009
2010 /* check for overflows */
2011 if (!error) {
2012 acl_perm(vp, exi, &va, cr);
2013 error = vattr_to_nattr(&va, &dr->dr_attr);
2014 if (!error) {
2015 error = makefh(&dr->dr_fhandle, vp,
2016 exi);
2017 }
2018 }
2019 /*
2020 * Force modified metadata out to stable storage.
2021 *
2022 * if a underlying vp exists, pass it to VOP_FSYNC
2023 */
2024 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2025 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2026 else
2027 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2028 VN_RELE(vp);
2029 }
2030
2031 if (in_crit) {
2032 nbl_end_crit(tvp);
2033 VN_RELE(tvp);
2034 }
2035 }
2036
2037 /*
2038 * Force modified data and metadata out to stable storage.
2039 */
2040 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2041
2042 out:
2043
2044 VN_RELE(dvp);
2045
2046 dr->dr_status = puterrno(error);
2047
2048 if (name != args->ca_da.da_name)
2049 kmem_free(name, MAXPATHLEN);
2050 }
2051 void *
2052 rfs_create_getfh(struct nfscreatargs *args)
2053 {
2054 return (args->ca_da.da_fhandle);
2055 }
2056
2057 /*
2058 * Remove a file.
2059 * Remove named file from parent directory.
2060 */
2061 /* ARGSUSED */
2062 void
2063 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2064 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2065 {
2066 int error = 0;
2067 vnode_t *vp;
2068 vnode_t *targvp;
2069 int in_crit = 0;
2070
2071 /*
2072 * Disallow NULL paths
2073 */
2074 if (da->da_name == NULL || *da->da_name == '\0') {
2075 *status = NFSERR_ACCES;
2076 return;
2077 }
2078
2079 vp = nfs_fhtovp(da->da_fhandle, exi);
2080 if (vp == NULL) {
2081 *status = NFSERR_STALE;
2082 return;
2083 }
2084
2085 if (rdonly(ro, vp)) {
2086 VN_RELE(vp);
2087 *status = NFSERR_ROFS;
2088 return;
2089 }
2090
2091 /*
2092 * Check for a conflict with a non-blocking mandatory share reservation.
2093 */
2094 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2095 NULL, cr, NULL, NULL, NULL);
2096 if (error != 0) {
2097 VN_RELE(vp);
2098 *status = puterrno(error);
2099 return;
2100 }
2101
2102 /*
2103 * If the file is delegated to an v4 client, then initiate
2104 * recall and drop this request (by setting T_WOULDBLOCK).
2105 * The client will eventually re-transmit the request and
2106 * (hopefully), by then, the v4 client will have returned
2107 * the delegation.
2108 */
2109
2110 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2111 VN_RELE(vp);
2112 VN_RELE(targvp);
2113 curthread->t_flag |= T_WOULDBLOCK;
2114 return;
2115 }
2116
2117 if (nbl_need_check(targvp)) {
2118 nbl_start_crit(targvp, RW_READER);
2119 in_crit = 1;
2120 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2121 error = EACCES;
2122 goto out;
2123 }
2124 }
2125
2126 error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2127
2128 /*
2129 * Force modified data and metadata out to stable storage.
2130 */
2131 (void) VOP_FSYNC(vp, 0, cr, NULL);
2132
2133 out:
2134 if (in_crit)
2135 nbl_end_crit(targvp);
2136 VN_RELE(targvp);
2137 VN_RELE(vp);
2138
2139 *status = puterrno(error);
2140
2141 }
2142
2143 void *
2144 rfs_remove_getfh(struct nfsdiropargs *da)
2145 {
2146 return (da->da_fhandle);
2147 }
2148
2149 /*
2150 * rename a file
2151 * Give a file (from) a new name (to).
2152 */
2153 /* ARGSUSED */
2154 void
2155 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2156 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2157 {
2158 int error = 0;
2159 vnode_t *fromvp;
2160 vnode_t *tovp;
2161 struct exportinfo *to_exi;
2162 fhandle_t *fh;
2163 vnode_t *srcvp;
2164 vnode_t *targvp;
2165 int in_crit = 0;
2166
2167 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2168 if (fromvp == NULL) {
2169 *status = NFSERR_STALE;
2170 return;
2171 }
2172
2173 fh = args->rna_to.da_fhandle;
2174 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2175 if (to_exi == NULL) {
2176 VN_RELE(fromvp);
2177 *status = NFSERR_ACCES;
2178 return;
2179 }
2180 exi_rele(to_exi);
2181
2182 if (to_exi != exi) {
2183 VN_RELE(fromvp);
2184 *status = NFSERR_XDEV;
2185 return;
2186 }
2187
2188 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2189 if (tovp == NULL) {
2190 VN_RELE(fromvp);
2191 *status = NFSERR_STALE;
2192 return;
2193 }
2194
2195 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2196 VN_RELE(tovp);
2197 VN_RELE(fromvp);
2198 *status = NFSERR_NOTDIR;
2199 return;
2200 }
2201
2202 /*
2203 * Disallow NULL paths
2204 */
2205 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2206 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2207 VN_RELE(tovp);
2208 VN_RELE(fromvp);
2209 *status = NFSERR_ACCES;
2210 return;
2211 }
2212
2213 if (rdonly(ro, tovp)) {
2214 VN_RELE(tovp);
2215 VN_RELE(fromvp);
2216 *status = NFSERR_ROFS;
2217 return;
2218 }
2219
2220 /*
2221 * Check for a conflict with a non-blocking mandatory share reservation.
2222 */
2223 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2224 NULL, cr, NULL, NULL, NULL);
2225 if (error != 0) {
2226 VN_RELE(tovp);
2227 VN_RELE(fromvp);
2228 *status = puterrno(error);
2229 return;
2230 }
2231
2232 /* Check for delegations on the source file */
2233
2234 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2235 VN_RELE(tovp);
2236 VN_RELE(fromvp);
2237 VN_RELE(srcvp);
2238 curthread->t_flag |= T_WOULDBLOCK;
2239 return;
2240 }
2241
2242 /* Check for delegation on the file being renamed over, if it exists */
2243
2244 if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2245 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2246 NULL, NULL, NULL) == 0) {
2247
2248 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2249 VN_RELE(tovp);
2250 VN_RELE(fromvp);
2251 VN_RELE(srcvp);
2252 VN_RELE(targvp);
2253 curthread->t_flag |= T_WOULDBLOCK;
2254 return;
2255 }
2256 VN_RELE(targvp);
2257 }
2258
2259
2260 if (nbl_need_check(srcvp)) {
2261 nbl_start_crit(srcvp, RW_READER);
2262 in_crit = 1;
2263 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2264 error = EACCES;
2265 goto out;
2266 }
2267 }
2268
2269 error = VOP_RENAME(fromvp, args->rna_from.da_name,
2270 tovp, args->rna_to.da_name, cr, NULL, 0);
2271
2272 if (error == 0)
2273 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2274 strlen(args->rna_to.da_name));
2275
2276 /*
2277 * Force modified data and metadata out to stable storage.
2278 */
2279 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2280 (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2281
2282 out:
2283 if (in_crit)
2284 nbl_end_crit(srcvp);
2285 VN_RELE(srcvp);
2286 VN_RELE(tovp);
2287 VN_RELE(fromvp);
2288
2289 *status = puterrno(error);
2290
2291 }
2292 void *
2293 rfs_rename_getfh(struct nfsrnmargs *args)
2294 {
2295 return (args->rna_from.da_fhandle);
2296 }
2297
2298 /*
2299 * Link to a file.
2300 * Create a file (to) which is a hard link to the given file (from).
2301 */
2302 /* ARGSUSED */
2303 void
2304 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2305 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2306 {
2307 int error;
2308 vnode_t *fromvp;
2309 vnode_t *tovp;
2310 struct exportinfo *to_exi;
2311 fhandle_t *fh;
2312
2313 fromvp = nfs_fhtovp(args->la_from, exi);
2314 if (fromvp == NULL) {
2315 *status = NFSERR_STALE;
2316 return;
2317 }
2318
2319 fh = args->la_to.da_fhandle;
2320 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2321 if (to_exi == NULL) {
2322 VN_RELE(fromvp);
2323 *status = NFSERR_ACCES;
2324 return;
2325 }
2326 exi_rele(to_exi);
2327
2328 if (to_exi != exi) {
2329 VN_RELE(fromvp);
2330 *status = NFSERR_XDEV;
2331 return;
2332 }
2333
2334 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2335 if (tovp == NULL) {
2336 VN_RELE(fromvp);
2337 *status = NFSERR_STALE;
2338 return;
2339 }
2340
2341 if (tovp->v_type != VDIR) {
2342 VN_RELE(tovp);
2343 VN_RELE(fromvp);
2344 *status = NFSERR_NOTDIR;
2345 return;
2346 }
2347 /*
2348 * Disallow NULL paths
2349 */
2350 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2351 VN_RELE(tovp);
2352 VN_RELE(fromvp);
2353 *status = NFSERR_ACCES;
2354 return;
2355 }
2356
2357 if (rdonly(ro, tovp)) {
2358 VN_RELE(tovp);
2359 VN_RELE(fromvp);
2360 *status = NFSERR_ROFS;
2361 return;
2362 }
2363
2364 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2365
2366 /*
2367 * Force modified data and metadata out to stable storage.
2368 */
2369 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2370 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2371
2372 VN_RELE(tovp);
2373 VN_RELE(fromvp);
2374
2375 *status = puterrno(error);
2376
2377 }
2378 void *
2379 rfs_link_getfh(struct nfslinkargs *args)
2380 {
2381 return (args->la_from);
2382 }
2383
2384 /*
2385 * Symbolicly link to a file.
2386 * Create a file (to) with the given attributes which is a symbolic link
2387 * to the given path name (to).
2388 */
2389 void
2390 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2391 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2392 {
2393 int error;
2394 struct vattr va;
2395 vnode_t *vp;
2396 vnode_t *svp;
2397 int lerror;
2398 struct sockaddr *ca;
2399 char *name = NULL;
2400
2401 /*
2402 * Disallow NULL paths
2403 */
2404 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2405 *status = NFSERR_ACCES;
2406 return;
2407 }
2408
2409 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2410 if (vp == NULL) {
2411 *status = NFSERR_STALE;
2412 return;
2413 }
2414
2415 if (rdonly(ro, vp)) {
2416 VN_RELE(vp);
2417 *status = NFSERR_ROFS;
2418 return;
2419 }
2420
2421 error = sattr_to_vattr(args->sla_sa, &va);
2422 if (error) {
2423 VN_RELE(vp);
2424 *status = puterrno(error);
2425 return;
2426 }
2427
2428 if (!(va.va_mask & AT_MODE)) {
2429 VN_RELE(vp);
2430 *status = NFSERR_INVAL;
2431 return;
2432 }
2433
2434 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2435 name = nfscmd_convname(ca, exi, args->sla_tnm,
2436 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2437
2438 if (name == NULL) {
2439 *status = NFSERR_ACCES;
2440 return;
2441 }
2442
2443 va.va_type = VLNK;
2444 va.va_mask |= AT_TYPE;
2445
2446 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2447
2448 /*
2449 * Force new data and metadata out to stable storage.
2450 */
2451 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2452 NULL, cr, NULL, NULL, NULL);
2453
2454 if (!lerror) {
2455 (void) VOP_FSYNC(svp, 0, cr, NULL);
2456 VN_RELE(svp);
2457 }
2458
2459 /*
2460 * Force modified data and metadata out to stable storage.
2461 */
2462 (void) VOP_FSYNC(vp, 0, cr, NULL);
2463
2464 VN_RELE(vp);
2465
2466 *status = puterrno(error);
2467 if (name != args->sla_tnm)
2468 kmem_free(name, MAXPATHLEN);
2469
2470 }
2471 void *
2472 rfs_symlink_getfh(struct nfsslargs *args)
2473 {
2474 return (args->sla_from.da_fhandle);
2475 }
2476
2477 /*
2478 * Make a directory.
2479 * Create a directory with the given name, parent directory, and attributes.
2480 * Returns a file handle and attributes for the new directory.
2481 */
2482 /* ARGSUSED */
2483 void
2484 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2485 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2486 {
2487 int error;
2488 struct vattr va;
2489 vnode_t *dvp = NULL;
2490 vnode_t *vp;
2491 char *name = args->ca_da.da_name;
2492
2493 /*
2494 * Disallow NULL paths
2495 */
2496 if (name == NULL || *name == '\0') {
2497 dr->dr_status = NFSERR_ACCES;
2498 return;
2499 }
2500
2501 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2502 if (vp == NULL) {
2503 dr->dr_status = NFSERR_STALE;
2504 return;
2505 }
2506
2507 if (rdonly(ro, vp)) {
2508 VN_RELE(vp);
2509 dr->dr_status = NFSERR_ROFS;
2510 return;
2511 }
2512
2513 error = sattr_to_vattr(args->ca_sa, &va);
2514 if (error) {
2515 VN_RELE(vp);
2516 dr->dr_status = puterrno(error);
2517 return;
2518 }
2519
2520 if (!(va.va_mask & AT_MODE)) {
2521 VN_RELE(vp);
2522 dr->dr_status = NFSERR_INVAL;
2523 return;
2524 }
2525
2526 va.va_type = VDIR;
2527 va.va_mask |= AT_TYPE;
2528
2529 error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2530
2531 if (!error) {
2532 /*
2533 * Attribtutes of the newly created directory should
2534 * be returned to the client.
2535 */
2536 va.va_mask = AT_ALL; /* We want everything */
2537 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2538
2539 /* check for overflows */
2540 if (!error) {
2541 acl_perm(vp, exi, &va, cr);
2542 error = vattr_to_nattr(&va, &dr->dr_attr);
2543 if (!error) {
2544 error = makefh(&dr->dr_fhandle, dvp, exi);
2545 }
2546 }
2547 /*
2548 * Force new data and metadata out to stable storage.
2549 */
2550 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2551 VN_RELE(dvp);
2552 }
2553
2554 /*
2555 * Force modified data and metadata out to stable storage.
2556 */
2557 (void) VOP_FSYNC(vp, 0, cr, NULL);
2558
2559 VN_RELE(vp);
2560
2561 dr->dr_status = puterrno(error);
2562
2563 }
2564 void *
2565 rfs_mkdir_getfh(struct nfscreatargs *args)
2566 {
2567 return (args->ca_da.da_fhandle);
2568 }
2569
2570 /*
2571 * Remove a directory.
2572 * Remove the given directory name from the given parent directory.
2573 */
2574 /* ARGSUSED */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;	/* vnode of the parent directory */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/* Refuse to modify a read-only export/filesystem. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR takes a third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove (this zone's root vnode).
	 */
	error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
2632 void *
2633 rfs_rmdir_getfh(struct nfsdiropargs *da)
2634 {
2635 return (da->da_fhandle);
2636 }
2637
2638 /* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* charset-converted entries, if any */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Take the vnode's reader lock for the duration of the read. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-length request reads nothing; reply OK, not EOF. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the request to the NFSv2 maximum transfer size. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* Nothing was transferred: report end of directory. */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * Convert entry names to the client's character set if the
	 * export requires it.
	 *
	 * NOTE(review): when VOP_READDIR fails, rd->rd_size is read
	 * here without having been assigned in this function — confirm
	 * the caller zeroes *rd before dispatch.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	if (ndata == NULL) {
		/* No conversion needed; reply with the raw entries. */
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		/* Conversion produced a new buffer; swap it in. */
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2771 void *
2772 rfs_readdir_getfh(struct nfsrddirargs *rda)
2773 {
2774 return (&rda->rda_fh);
2775 }
2776 void
2777 rfs_rddirfree(struct nfsrddirres *rd)
2778 {
2779 if (rd->rd_entries != NULL)
2780 kmem_free(rd->rd_entries, rd->rd_bufsize);
2781 }
2782
2783 /* ARGSUSED */
2784 void
2785 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2786 struct svc_req *req, cred_t *cr, bool_t ro)
2787 {
2788 int error;
2789 struct statvfs64 sb;
2790 vnode_t *vp;
2791
2792 vp = nfs_fhtovp(fh, exi);
2793 if (vp == NULL) {
2794 fs->fs_status = NFSERR_STALE;
2795 return;
2796 }
2797
2798 error = VFS_STATVFS(vp->v_vfsp, &sb);
2799
2800 if (!error) {
2801 fs->fs_tsize = nfstsize();
2802 fs->fs_bsize = sb.f_frsize;
2803 fs->fs_blocks = sb.f_blocks;
2804 fs->fs_bfree = sb.f_bfree;
2805 fs->fs_bavail = sb.f_bavail;
2806 }
2807
2808 VN_RELE(vp);
2809
2810 fs->fs_status = puterrno(error);
2811
2812 }
2813 void *
2814 rfs_statfs_getfh(fhandle_t *fh)
2815 {
2816 return (fh);
2817 }
2818
/*
 * Convert the over-the-wire NFSv2 settable attributes (*sa) into a
 * vattr (*vap), setting va_mask bits only for the fields the client
 * actually supplied; fields left as all-ones are "not set".
 * Returns 0, or EOVERFLOW on 32-bit kernels when a supplied time
 * does not fit in a time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2885
/*
 * Map vnode types (vtype_t, VNON .. VBAD) to NFSv2 over-the-wire file
 * types; va_type indexes this table in vattr_to_nattr().  Types with
 * no NFSv2 representation map to 0.  VFIFO is remapped separately via
 * NA_SETFIFO() in vattr_to_nattr().
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2889
2890 /*
2891 * check the following fields for overflow: nodeid, size, and time.
2892 * There could be a problem when converting 64-bit LP64 fields
2893 * into 32-bit ones. Return an error if there is an overflow.
2894 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/*
	 * (unsigned short)-1 is presumably the legacy 16-bit "not
	 * known" sentinel (cf. the sign-extension note in
	 * sattr_to_vattr); widen it to the 32-bit wire sentinel.
	 */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2996
2997 /*
2998 * acl v2 support: returns approximate permission.
2999 * default: returns minimal permission (more restrictive)
3000 * aclok: returns maximal permission (less restrictive)
 * This routine changes the permissions that are already in *va.
3002 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3003 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3004 */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t vsa;
	int aclcnt;
	aclent_t *aclentp;
	mode_t mask_perm;	/* CLASS_OBJ (mask) bits */
	mode_t grp_perm;	/* folded group permission bits */
	mode_t other_perm;	/* folded "other" permission bits */
	mode_t other_orig;	/* original OTHER_OBJ bits */
	int error;

	/* We only need the access ACL, not the default (inheritable) one. */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/* non-trivial ACL */
			aclentp = vsa.vsa_aclentp;
			/*
			 * NOTE(review): mask_perm and other_orig are only
			 * assigned when CLASS_OBJ/OTHER_OBJ entries are
			 * present; a well-formed aclent ACL should always
			 * contain them, but confirm before relying on it.
			 */
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/* maximal permissions */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/* minimal permissions */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy to va: replace the group and other bits */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		/* Free the entry list VOP_GETSECATTR allocated, if any. */
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}
3101
/*
 * One-time NFSv2 server initialization: allocate a unique caller id
 * and register the per-zone constructor/destructor pair for the
 * zone-private server state (nfs_srv_t).
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
	zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
}
3108
/*
 * Server teardown hook.  Per-zone state is released by rfs_zone_fini()
 * through the zone key registered in rfs_srvrinit(), so there is
 * nothing to do here.
 */
void
rfs_srvrfini(void)
{
}
3113
3114 /* ARGSUSED */
3115 static void *
3116 rfs_zone_init(zoneid_t zoneid)
3117 {
3118 nfs_srv_t *ns;
3119
3120 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3121
3122 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3123 ns->write_async = 1;
3124
3125 return (ns);
3126 }
3127
3128 /* ARGSUSED */
3129 static void
3130 rfs_zone_fini(zoneid_t zoneid, void *data)
3131 {
3132 nfs_srv_t *ns;
3133
3134 ns = (nfs_srv_t *)data;
3135 mutex_destroy(&ns->async_write_lock);
3136 kmem_free(ns, sizeof (*ns));
3137 }
3138
3139 static int
3140 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3141 {
3142 struct clist *wcl;
3143 int wlist_len;
3144 uint32_t count = rr->rr_count;
3145
3146 wcl = ra->ra_wlist;
3147
3148 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3149 return (FALSE);
3150 }
3151
3152 wcl = ra->ra_wlist;
3153 rr->rr_ok.rrok_wlist_len = wlist_len;
3154 rr->rr_ok.rrok_wlist = wcl;
3155
3156 return (TRUE);
3157 }