1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /*
29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 * All rights reserved.
31 */
32
33 /*
34 * Copyright 2018 Nexenta Systems, Inc.
35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 */
37
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/stat.h>
47 #include <sys/errno.h>
48 #include <sys/sysmacros.h>
49 #include <sys/statvfs.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/dirent.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/vtrace.h>
56 #include <sys/mode.h>
57 #include <sys/acl.h>
58 #include <sys/nbmlock.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/svc.h>
65
66 #include <nfs/nfs.h>
67 #include <nfs/export.h>
68 #include <nfs/nfs_cmd.h>
69
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_kmem.h>
75
76 #include <sys/strsubr.h>
77
78 struct rfs_async_write_list;
79
/*
 * Zone globals of NFSv2 server
 *
 * The instance for the current zone is returned by nfs_get_srv().  The
 * fields below carry the state rfs_write() uses to cluster write requests.
 */
typedef struct nfs_srv {
	/* held by rfs_write() while it searches or edits async_write_head */
	kmutex_t			async_write_lock;
	/* first entry of the list of write clusters, one per file handle */
	struct rfs_async_write_list	*async_write_head;

	/*
	 * enables write clustering if == 1
	 * (rfs_write() hands the request to rfs_write_sync() when this is 0)
	 */
	int		write_async;
} nfs_srv_t;
92
93 /*
94 * These are the interface routines for the server side of the
95 * Network File System. See the NFS version 2 protocol specification
96 * for a description of this interface.
97 */
98
/* Local helpers used by the service routines below. */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);


/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Value the NFSv2 service routines store in caller_context_t.cc_caller_id
 * before issuing VOP calls.
 */
u_longlong_t nfs2_srv_caller_id;
114
115 static nfs_srv_t *
116 nfs_get_srv(void)
117 {
118 nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
119 nfs_srv_t *srv = ng->nfs_srv;
120 ASSERT(srv != NULL);
121 return (srv);
122 }
123
124 /*
125 * Get file attributes.
126 * Returns the current attributes of the file with the given fhandle.
127 */
128 /* ARGSUSED */
129 void
130 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
131 struct svc_req *req, cred_t *cr, bool_t ro)
132 {
133 int error;
134 vnode_t *vp;
135 struct vattr va;
136
137 vp = nfs_fhtovp(fhp, exi);
138 if (vp == NULL) {
139 ns->ns_status = NFSERR_STALE;
140 return;
141 }
142
143 /*
144 * Do the getattr.
145 */
146 va.va_mask = AT_ALL; /* we want all the attributes */
147
148 error = rfs4_delegated_getattr(vp, &va, 0, cr);
149
150 /* check for overflows */
151 if (!error) {
152 /* Lie about the object type for a referral */
153 if (vn_is_nfs_reparse(vp, cr))
154 va.va_type = VLNK;
155
156 acl_perm(vp, exi, &va, cr);
157 error = vattr_to_nattr(&va, &ns->ns_attr);
158 }
159
160 VN_RELE(vp);
161
162 ns->ns_status = puterrno(error);
163 }
164 void *
165 rfs_getattr_getfh(fhandle_t *fhp)
166 {
167 return (fhp);
168 }
169
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 *
 * args - request: file handle (saa_fh) and attributes to set (saa_sa)
 * ns   - reply: ns_status, plus ns_attr holding the attributes re-read
 *        after the change when everything succeeded
 * exi  - export the file handle belongs to
 * req  - not referenced here
 * cr   - credentials of the caller
 * ro   - passed to rdonly() to reject the request with NFSERR_ROFS
 *
 * When a delegation conflict is detected the thread is flagged
 * T_WOULDBLOCK and the routine returns without setting ns->ns_status.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* flags argument handed to VOP_SETATTR */
	int in_crit = 0;	/* set while inside the nbl critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* owner and size before the change */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/* caller context shared by every VOP call below */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region of change lies between the old and
			 * the new size, whichever of the two is smaller.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner takes the VOP_SPACE path.  AT_SIZE is
		 * cleared so that VOP_SETATTR below does not set the size
		 * a second time.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/* CC_DONTBLOCK is cleared for the fsync below */
	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
357 void *
358 rfs_setattr_getfh(struct nfssaargs *args)
359 {
360 return (&args->saa_fh);
361 }
362
363 /* Change and release @exip and @vpp only in success */
364 int
365 rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
366 {
367 struct exportinfo *exi;
368 vnode_t *vp = *vpp;
369 fid_t fid;
370 int error;
371
372 VN_HOLD(vp);
373
374 if ((error = traverse(&vp)) != 0) {
375 VN_RELE(vp);
376 return (error);
377 }
378
379 bzero(&fid, sizeof (fid));
380 fid.fid_len = MAXFIDSZ;
381 error = VOP_FID(vp, &fid, NULL);
382 if (error) {
383 VN_RELE(vp);
384 return (error);
385 }
386
387 exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
388 if (exi == NULL ||
389 (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
390 /*
391 * It is not error, just subdir is not exported
392 * or "nohide" is not set
393 */
394 if (exi != NULL)
395 exi_rele(exi);
396 VN_RELE(vp);
397 } else {
398 /* go to submount */
399 exi_rele(*exip);
400 *exip = exi;
401
402 VN_RELE(*vpp);
403 *vpp = vp;
404 }
405
406 return (0);
407 }
408
409 /*
410 * Given mounted "dvp" and "exi", go upper mountpoint
411 * with dvp/exi correction
412 * Return 0 in success
413 */
414 int
415 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
416 {
417 struct exportinfo *exi;
418 vnode_t *dvp = *dvpp;
419
420 ASSERT3P((*exip)->exi_zone, ==, curzone);
421 ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));
422
423 VN_HOLD(dvp);
424 dvp = untraverse(dvp);
425 exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
426 if (exi == NULL) {
427 VN_RELE(dvp);
428 return (-1);
429 }
430
431 ASSERT3P(exi->exi_zone, ==, curzone);
432 exi_rele(*exip);
433 *exip = exi;
434 VN_RELE(*dvpp);
435 *dvpp = dvp;
436
437 return (0);
438 }
439 /*
440 * Directory lookup.
441 * Returns an fhandle and file attributes for file name in a directory.
442 */
443 /* ARGSUSED */
444 void
445 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
446 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
447 {
448 int error;
449 vnode_t *dvp;
450 vnode_t *vp;
451 struct vattr va;
452 fhandle_t *fhp = da->da_fhandle;
453 struct sec_ol sec = {0, 0};
454 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
455 char *name;
456 struct sockaddr *ca;
457
458 /*
459 * Trusted Extension doesn't support NFSv2. MOUNT
460 * will reject v2 clients. Need to prevent v2 client
461 * access via WebNFS here.
462 */
463 if (is_system_labeled() && req->rq_vers == 2) {
464 dr->dr_status = NFSERR_ACCES;
465 return;
466 }
467
468 /*
469 * Disallow NULL paths
470 */
471 if (da->da_name == NULL || *da->da_name == '\0') {
472 dr->dr_status = NFSERR_ACCES;
473 return;
474 }
475
476 /*
477 * Allow lookups from the root - the default
478 * location of the public filehandle.
479 */
480 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
481 dvp = ZONE_ROOTVP();
482 VN_HOLD(dvp);
483 } else {
484 dvp = nfs_fhtovp(fhp, exi);
485 if (dvp == NULL) {
486 dr->dr_status = NFSERR_STALE;
487 return;
488 }
489 }
490
491 exi_hold(exi);
492 ASSERT3P(exi->exi_zone, ==, curzone);
493
494 /*
495 * Not allow lookup beyond root.
496 * If the filehandle matches a filehandle of the exi,
497 * then the ".." refers beyond the root of an exported filesystem.
498 */
499 if (strcmp(da->da_name, "..") == 0 &&
500 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
501 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
502 ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
503 /*
504 * special case for ".." and 'nohide'exported root
505 */
506 if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
507 error = NFSERR_ACCES;
508 goto out;
509 }
510 } else {
511 error = NFSERR_NOENT;
512 goto out;
513 }
514 }
515
516 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
517 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
518 MAXPATHLEN);
519
520 if (name == NULL) {
521 error = NFSERR_ACCES;
522 goto out;
523 }
524
525 /*
526 * If the public filehandle is used then allow
527 * a multi-component lookup, i.e. evaluate
528 * a pathname and follow symbolic links if
529 * necessary.
530 *
531 * This may result in a vnode in another filesystem
532 * which is OK as long as the filesystem is exported.
533 */
534 if (PUBLIC_FH2(fhp)) {
535 publicfh_flag = TRUE;
536
537 exi_rele(exi);
538
539 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
540 &sec);
541 } else {
542 /*
543 * Do a normal single component lookup.
544 */
545 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
546 NULL, NULL, NULL);
547 }
548
549 if (name != da->da_name)
550 kmem_free(name, MAXPATHLEN);
551
552 if (error == 0 && vn_ismntpt(vp)) {
553 error = rfs_cross_mnt(&vp, &exi);
554 if (error)
555 VN_RELE(vp);
556 }
557
558 if (!error) {
559 va.va_mask = AT_ALL; /* we want everything */
560
561 error = rfs4_delegated_getattr(vp, &va, 0, cr);
562
563 /* check for overflows */
564 if (!error) {
565 acl_perm(vp, exi, &va, cr);
566 error = vattr_to_nattr(&va, &dr->dr_attr);
567 if (!error) {
568 if (sec.sec_flags & SEC_QUERY)
569 error = makefh_ol(&dr->dr_fhandle, exi,
570 sec.sec_index);
571 else {
572 error = makefh(&dr->dr_fhandle, vp,
573 exi);
574 if (!error && publicfh_flag &&
575 !chk_clnt_sec(exi, req))
576 auth_weak = TRUE;
577 }
578 }
579 }
580 VN_RELE(vp);
581 }
582
583 out:
584 VN_RELE(dvp);
585
586 if (exi != NULL)
587 exi_rele(exi);
588
589 /*
590 * If it's public fh, no 0x81, and client's flavor is
591 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
592 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
593 */
594 if (auth_weak)
595 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
596 else
597 dr->dr_status = puterrno(error);
598 }
599 void *
600 rfs_lookup_getfh(struct nfsdiropargs *da)
601 {
602 return (da->da_fhandle);
603 }
604
605 /*
606 * Read symbolic link.
607 * Returns the string in the symbolic link at the given fhandle.
608 */
609 /* ARGSUSED */
610 void
611 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
612 struct svc_req *req, cred_t *cr, bool_t ro)
613 {
614 int error;
615 struct iovec iov;
616 struct uio uio;
617 vnode_t *vp;
618 struct vattr va;
619 struct sockaddr *ca;
620 char *name = NULL;
621 int is_referral = 0;
622
623 vp = nfs_fhtovp(fhp, exi);
624 if (vp == NULL) {
625 rl->rl_data = NULL;
626 rl->rl_status = NFSERR_STALE;
627 return;
628 }
629
630 va.va_mask = AT_MODE;
631
632 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
633
634 if (error) {
635 VN_RELE(vp);
636 rl->rl_data = NULL;
637 rl->rl_status = puterrno(error);
638 return;
639 }
640
641 if (MANDLOCK(vp, va.va_mode)) {
642 VN_RELE(vp);
643 rl->rl_data = NULL;
644 rl->rl_status = NFSERR_ACCES;
645 return;
646 }
647
648 /* We lied about the object type for a referral */
649 if (vn_is_nfs_reparse(vp, cr))
650 is_referral = 1;
651
652 /*
653 * XNFS and RFC1094 require us to return ENXIO if argument
654 * is not a link. BUGID 1138002.
655 */
656 if (vp->v_type != VLNK && !is_referral) {
657 VN_RELE(vp);
658 rl->rl_data = NULL;
659 rl->rl_status = NFSERR_NXIO;
660 return;
661 }
662
663 /*
664 * Allocate data for pathname. This will be freed by rfs_rlfree.
665 */
666 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
667
668 if (is_referral) {
669 char *s;
670 size_t strsz;
671
672 /* Get an artificial symlink based on a referral */
673 s = build_symlink(vp, cr, &strsz);
674 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
675 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
676 vnode_t *, vp, char *, s);
677 if (s == NULL)
678 error = EINVAL;
679 else {
680 error = 0;
681 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
682 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
683 kmem_free(s, strsz);
684 }
685
686 } else {
687
688 /*
689 * Set up io vector to read sym link data
690 */
691 iov.iov_base = rl->rl_data;
692 iov.iov_len = NFS_MAXPATHLEN;
693 uio.uio_iov = &iov;
694 uio.uio_iovcnt = 1;
695 uio.uio_segflg = UIO_SYSSPACE;
696 uio.uio_extflg = UIO_COPY_CACHED;
697 uio.uio_loffset = (offset_t)0;
698 uio.uio_resid = NFS_MAXPATHLEN;
699
700 /*
701 * Do the readlink.
702 */
703 error = VOP_READLINK(vp, &uio, cr, NULL);
704
705 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
706
707 if (!error)
708 rl->rl_data[rl->rl_count] = '\0';
709
710 }
711
712
713 VN_RELE(vp);
714
715 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
716 name = nfscmd_convname(ca, exi, rl->rl_data,
717 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
718
719 if (name != NULL && name != rl->rl_data) {
720 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
721 rl->rl_data = name;
722 }
723
724 /*
725 * XNFS and RFC1094 require us to return ENXIO if argument
726 * is not a link. UFS returns EINVAL if this is the case,
727 * so we do the mapping here. BUGID 1138002.
728 */
729 if (error == EINVAL)
730 rl->rl_status = NFSERR_NXIO;
731 else
732 rl->rl_status = puterrno(error);
733
734 }
735 void *
736 rfs_readlink_getfh(fhandle_t *fhp)
737 {
738 return (fhp);
739 }
740 /*
741 * Free data allocated by rfs_readlink
742 */
743 void
744 rfs_rlfree(struct nfsrdlnres *rl)
745 {
746 if (rl->rl_data != NULL)
747 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
748 }
749
750 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
751
752 /*
753 * Read data.
754 * Returns some data read from the file at the given fhandle.
755 */
756 /* ARGSUSED */
757 void
758 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
759 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
760 {
761 vnode_t *vp;
762 int error;
763 struct vattr va;
764 struct iovec iov;
765 struct uio uio;
766 mblk_t *mp;
767 int alloc_err = 0;
768 int in_crit = 0;
769 caller_context_t ct;
770
771 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
772 if (vp == NULL) {
773 rr->rr_data = NULL;
774 rr->rr_status = NFSERR_STALE;
775 return;
776 }
777
778 if (vp->v_type != VREG) {
779 VN_RELE(vp);
780 rr->rr_data = NULL;
781 rr->rr_status = NFSERR_ISDIR;
782 return;
783 }
784
785 ct.cc_sysid = 0;
786 ct.cc_pid = 0;
787 ct.cc_caller_id = nfs2_srv_caller_id;
788 ct.cc_flags = CC_DONTBLOCK;
789
790 /*
791 * Enter the critical region before calling VOP_RWLOCK
792 * to avoid a deadlock with write requests.
793 */
794 if (nbl_need_check(vp)) {
795 nbl_start_crit(vp, RW_READER);
796 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
797 0, NULL)) {
798 nbl_end_crit(vp);
799 VN_RELE(vp);
800 rr->rr_data = NULL;
801 rr->rr_status = NFSERR_ACCES;
802 return;
803 }
804 in_crit = 1;
805 }
806
807 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
808
809 /* check if a monitor detected a delegation conflict */
810 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
811 if (in_crit)
812 nbl_end_crit(vp);
813 VN_RELE(vp);
814 /* mark as wouldblock so response is dropped */
815 curthread->t_flag |= T_WOULDBLOCK;
816
817 rr->rr_data = NULL;
818 return;
819 }
820
821 va.va_mask = AT_ALL;
822
823 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
824
825 if (error) {
826 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
827 if (in_crit)
828 nbl_end_crit(vp);
829
830 VN_RELE(vp);
831 rr->rr_data = NULL;
832 rr->rr_status = puterrno(error);
833
834 return;
835 }
836
837 /*
838 * This is a kludge to allow reading of files created
839 * with no read permission. The owner of the file
840 * is always allowed to read it.
841 */
842 if (crgetuid(cr) != va.va_uid) {
843 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
844
845 if (error) {
846 /*
847 * Exec is the same as read over the net because
848 * of demand loading.
849 */
850 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
851 }
852 if (error) {
853 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
854 if (in_crit)
855 nbl_end_crit(vp);
856 VN_RELE(vp);
857 rr->rr_data = NULL;
858 rr->rr_status = puterrno(error);
859
860 return;
861 }
862 }
863
864 if (MANDLOCK(vp, va.va_mode)) {
865 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
866 if (in_crit)
867 nbl_end_crit(vp);
868
869 VN_RELE(vp);
870 rr->rr_data = NULL;
871 rr->rr_status = NFSERR_ACCES;
872
873 return;
874 }
875
876 rr->rr_ok.rrok_wlist_len = 0;
877 rr->rr_ok.rrok_wlist = NULL;
878
879 if ((u_offset_t)ra->ra_offset >= va.va_size) {
880 rr->rr_count = 0;
881 rr->rr_data = NULL;
882 /*
883 * In this case, status is NFS_OK, but there is no data
884 * to encode. So set rr_mp to NULL.
885 */
886 rr->rr_mp = NULL;
887 rr->rr_ok.rrok_wlist = ra->ra_wlist;
888 if (rr->rr_ok.rrok_wlist)
889 clist_zero_len(rr->rr_ok.rrok_wlist);
890 goto done;
891 }
892
893 if (ra->ra_wlist) {
894 mp = NULL;
895 rr->rr_mp = NULL;
896 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
897 if (ra->ra_count > iov.iov_len) {
898 rr->rr_data = NULL;
899 rr->rr_status = NFSERR_INVAL;
900 goto done;
901 }
902 } else {
903 /*
904 * mp will contain the data to be sent out in the read reply.
905 * This will be freed after the reply has been sent out (by the
906 * driver).
907 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
908 * that the call to xdrmblk_putmblk() never fails.
909 */
910 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
911 &alloc_err);
912 ASSERT(mp != NULL);
913 ASSERT(alloc_err == 0);
914
915 rr->rr_mp = mp;
916
917 /*
918 * Set up io vector
919 */
920 iov.iov_base = (caddr_t)mp->b_datap->db_base;
921 iov.iov_len = ra->ra_count;
922 }
923
924 uio.uio_iov = &iov;
925 uio.uio_iovcnt = 1;
926 uio.uio_segflg = UIO_SYSSPACE;
927 uio.uio_extflg = UIO_COPY_CACHED;
928 uio.uio_loffset = (offset_t)ra->ra_offset;
929 uio.uio_resid = ra->ra_count;
930
931 error = VOP_READ(vp, &uio, 0, cr, &ct);
932
933 if (error) {
934 if (mp)
935 freeb(mp);
936
937 /*
938 * check if a monitor detected a delegation conflict and
939 * mark as wouldblock so response is dropped
940 */
941 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
942 curthread->t_flag |= T_WOULDBLOCK;
943 else
944 rr->rr_status = puterrno(error);
945
946 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
947 if (in_crit)
948 nbl_end_crit(vp);
949
950 VN_RELE(vp);
951 rr->rr_data = NULL;
952
953 return;
954 }
955
956 /*
957 * Get attributes again so we can send the latest access
958 * time to the client side for its cache.
959 */
960 va.va_mask = AT_ALL;
961
962 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
963
964 if (error) {
965 if (mp)
966 freeb(mp);
967
968 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
969 if (in_crit)
970 nbl_end_crit(vp);
971
972 VN_RELE(vp);
973 rr->rr_data = NULL;
974 rr->rr_status = puterrno(error);
975
976 return;
977 }
978
979 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
980
981 if (mp) {
982 rr->rr_data = (char *)mp->b_datap->db_base;
983 } else {
984 if (ra->ra_wlist) {
985 rr->rr_data = (caddr_t)iov.iov_base;
986 if (!rdma_setup_read_data2(ra, rr)) {
987 rr->rr_data = NULL;
988 rr->rr_status = puterrno(NFSERR_INVAL);
989 }
990 }
991 }
992 done:
993 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
994 if (in_crit)
995 nbl_end_crit(vp);
996
997 acl_perm(vp, exi, &va, cr);
998
999 /* check for overflows */
1000 error = vattr_to_nattr(&va, &rr->rr_attr);
1001
1002 VN_RELE(vp);
1003
1004 rr->rr_status = puterrno(error);
1005 }
1006
1007 /*
1008 * Free data allocated by rfs_read
1009 */
1010 void
1011 rfs_rdfree(struct nfsrdresult *rr)
1012 {
1013 mblk_t *mp;
1014
1015 if (rr->rr_status == NFS_OK) {
1016 mp = rr->rr_mp;
1017 if (mp != NULL)
1018 freeb(mp);
1019 }
1020 }
1021
1022 void *
1023 rfs_read_getfh(struct nfsreadargs *ra)
1024 {
1025 return (&ra->ra_fhandle);
1026 }
1027
1028 #define MAX_IOVECS 12
1029
1030 #ifdef DEBUG
1031 static int rfs_write_sync_hits = 0;
1032 static int rfs_write_sync_misses = 0;
1033 #endif
1034
1035 /*
1036 * Write data to file.
1037 * Returns attributes of a file after writing some data to it.
1038 *
1039 * Any changes made here, especially in error handling might have
1040 * to also be done in rfs_write (which clusters write requests).
1041 */
1042 /* ARGSUSED */
1043 void
1044 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1045 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1046 {
1047 int error;
1048 vnode_t *vp;
1049 rlim64_t rlimit;
1050 struct vattr va;
1051 struct uio uio;
1052 struct iovec iov[MAX_IOVECS];
1053 mblk_t *m;
1054 struct iovec *iovp;
1055 int iovcnt;
1056 cred_t *savecred;
1057 int in_crit = 0;
1058 caller_context_t ct;
1059
1060 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1061 if (vp == NULL) {
1062 ns->ns_status = NFSERR_STALE;
1063 return;
1064 }
1065
1066 if (rdonly(ro, vp)) {
1067 VN_RELE(vp);
1068 ns->ns_status = NFSERR_ROFS;
1069 return;
1070 }
1071
1072 if (vp->v_type != VREG) {
1073 VN_RELE(vp);
1074 ns->ns_status = NFSERR_ISDIR;
1075 return;
1076 }
1077
1078 ct.cc_sysid = 0;
1079 ct.cc_pid = 0;
1080 ct.cc_caller_id = nfs2_srv_caller_id;
1081 ct.cc_flags = CC_DONTBLOCK;
1082
1083 va.va_mask = AT_UID|AT_MODE;
1084
1085 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1086
1087 if (error) {
1088 VN_RELE(vp);
1089 ns->ns_status = puterrno(error);
1090
1091 return;
1092 }
1093
1094 if (crgetuid(cr) != va.va_uid) {
1095 /*
1096 * This is a kludge to allow writes of files created
1097 * with read only permission. The owner of the file
1098 * is always allowed to write it.
1099 */
1100 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1101
1102 if (error) {
1103 VN_RELE(vp);
1104 ns->ns_status = puterrno(error);
1105 return;
1106 }
1107 }
1108
1109 /*
1110 * Can't access a mandatory lock file. This might cause
1111 * the NFS service thread to block forever waiting for a
1112 * lock to be released that will never be released.
1113 */
1114 if (MANDLOCK(vp, va.va_mode)) {
1115 VN_RELE(vp);
1116 ns->ns_status = NFSERR_ACCES;
1117 return;
1118 }
1119
1120 /*
1121 * We have to enter the critical region before calling VOP_RWLOCK
1122 * to avoid a deadlock with ufs.
1123 */
1124 if (nbl_need_check(vp)) {
1125 nbl_start_crit(vp, RW_READER);
1126 in_crit = 1;
1127 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1128 wa->wa_count, 0, NULL)) {
1129 error = EACCES;
1130 goto out;
1131 }
1132 }
1133
1134 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1135
1136 /* check if a monitor detected a delegation conflict */
1137 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1138 goto out;
1139 }
1140
1141 if (wa->wa_data || wa->wa_rlist) {
1142 /* Do the RDMA thing if necessary */
1143 if (wa->wa_rlist) {
1144 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1145 iov[0].iov_len = wa->wa_count;
1146 } else {
1147 iov[0].iov_base = wa->wa_data;
1148 iov[0].iov_len = wa->wa_count;
1149 }
1150 uio.uio_iov = iov;
1151 uio.uio_iovcnt = 1;
1152 uio.uio_segflg = UIO_SYSSPACE;
1153 uio.uio_extflg = UIO_COPY_DEFAULT;
1154 uio.uio_loffset = (offset_t)wa->wa_offset;
1155 uio.uio_resid = wa->wa_count;
1156 /*
1157 * The limit is checked on the client. We
1158 * should allow any size writes here.
1159 */
1160 uio.uio_llimit = curproc->p_fsz_ctl;
1161 rlimit = uio.uio_llimit - wa->wa_offset;
1162 if (rlimit < (rlim64_t)uio.uio_resid)
1163 uio.uio_resid = (uint_t)rlimit;
1164
1165 /*
1166 * for now we assume no append mode
1167 */
1168 /*
1169 * We're changing creds because VM may fault and we need
1170 * the cred of the current thread to be used if quota
1171 * checking is enabled.
1172 */
1173 savecred = curthread->t_cred;
1174 curthread->t_cred = cr;
1175 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1176 curthread->t_cred = savecred;
1177 } else {
1178
1179 iovcnt = 0;
1180 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1181 iovcnt++;
1182 if (iovcnt <= MAX_IOVECS) {
1183 #ifdef DEBUG
1184 rfs_write_sync_hits++;
1185 #endif
1186 iovp = iov;
1187 } else {
1188 #ifdef DEBUG
1189 rfs_write_sync_misses++;
1190 #endif
1191 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1192 }
1193 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1194 uio.uio_iov = iovp;
1195 uio.uio_iovcnt = iovcnt;
1196 uio.uio_segflg = UIO_SYSSPACE;
1197 uio.uio_extflg = UIO_COPY_DEFAULT;
1198 uio.uio_loffset = (offset_t)wa->wa_offset;
1199 uio.uio_resid = wa->wa_count;
1200 /*
1201 * The limit is checked on the client. We
1202 * should allow any size writes here.
1203 */
1204 uio.uio_llimit = curproc->p_fsz_ctl;
1205 rlimit = uio.uio_llimit - wa->wa_offset;
1206 if (rlimit < (rlim64_t)uio.uio_resid)
1207 uio.uio_resid = (uint_t)rlimit;
1208
1209 /*
1210 * For now we assume no append mode.
1211 */
1212 /*
1213 * We're changing creds because VM may fault and we need
1214 * the cred of the current thread to be used if quota
1215 * checking is enabled.
1216 */
1217 savecred = curthread->t_cred;
1218 curthread->t_cred = cr;
1219 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1220 curthread->t_cred = savecred;
1221
1222 if (iovp != iov)
1223 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1224 }
1225
1226 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1227
1228 if (!error) {
1229 /*
1230 * Get attributes again so we send the latest mod
1231 * time to the client side for its cache.
1232 */
1233 va.va_mask = AT_ALL; /* now we want everything */
1234
1235 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1236
1237 /* check for overflows */
1238 if (!error) {
1239 acl_perm(vp, exi, &va, cr);
1240 error = vattr_to_nattr(&va, &ns->ns_attr);
1241 }
1242 }
1243
1244 out:
1245 if (in_crit)
1246 nbl_end_crit(vp);
1247 VN_RELE(vp);
1248
1249 /* check if a monitor detected a delegation conflict */
1250 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1251 /* mark as wouldblock so response is dropped */
1252 curthread->t_flag |= T_WOULDBLOCK;
1253 else
1254 ns->ns_status = puterrno(error);
1255
1256 }
1257
/*
 * One write request that is part of a cluster.  rfs_write() fills one
 * in for the request it is handling and links it into the cluster of
 * its file.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of the request */
	struct nfsattrstat *ns;		/* where the result is stored */
	struct svc_req *req;		/* RPC request */
	cred_t *cr;			/* credentials of the caller */
	bool_t ro;			/* "ro" argument of the request */
	kthread_t *thread;		/* thread that queued the request */
	/* next request in the cluster; rfs_write() keeps these by offset */
	struct rfs_async_write *list;
};

/*
 * A cluster: the write requests queued for one file handle.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle the cluster is for */
	kcondvar_t cv;			/* queued requests wait on this */
	struct rfs_async_write *list;	/* requests in the cluster */
	struct rfs_async_write_list *next;	/* next cluster */
};
1274
1275 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1276 static kmutex_t rfs_async_write_lock;
1277 static int rfs_write_async = 1; /* enables write clustering if == 1 */
1278
1279 #define MAXCLIOVECS 42
1280 #define RFSWRITE_INITVAL (enum nfsstat) -1
1281
1282 #ifdef DEBUG
1283 static int rfs_write_hits = 0;
1284 static int rfs_write_misses = 0;
1285 #endif
1286
1287 /*
1288 * Write data to file.
1289 * Returns attributes of a file after writing some data to it.
1290 */
1291 void
1292 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1293 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1294 {
1295 int error;
1296 vnode_t *vp;
1297 rlim64_t rlimit;
1298 struct vattr va;
1299 struct uio uio;
1300 struct rfs_async_write_list *lp;
1301 struct rfs_async_write_list *nlp;
1302 struct rfs_async_write *rp;
1303 struct rfs_async_write *nrp;
1304 struct rfs_async_write *trp;
1305 struct rfs_async_write *lrp;
1306 int data_written;
1307 int iovcnt;
1308 mblk_t *m;
1309 struct iovec *iovp;
1310 struct iovec *niovp;
1311 struct iovec iov[MAXCLIOVECS];
1312 int count;
1313 int rcount;
1314 uint_t off;
1315 uint_t len;
1316 struct rfs_async_write nrpsp;
1317 struct rfs_async_write_list nlpsp;
1318 ushort_t t_flag;
1319 cred_t *savecred;
1320 int in_crit = 0;
1321 caller_context_t ct;
1322 nfs_srv_t *nsrv;
1323
1324 ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
1325 nsrv = nfs_get_srv();
1326 if (!nsrv->write_async) {
1327 rfs_write_sync(wa, ns, exi, req, cr, ro);
1328 return;
1329 }
1330
1331 /*
1332 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1333 * is considered an OK.
1334 */
1335 ns->ns_status = RFSWRITE_INITVAL;
1336
1337 nrp = &nrpsp;
1338 nrp->wa = wa;
1339 nrp->ns = ns;
1340 nrp->req = req;
1341 nrp->cr = cr;
1342 nrp->ro = ro;
1343 nrp->thread = curthread;
1344
1345 ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1346
1347 /*
1348 * Look to see if there is already a cluster started
1349 * for this file.
1350 */
1351 mutex_enter(&nsrv->async_write_lock);
1352 for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1353 if (bcmp(&wa->wa_fhandle, lp->fhp,
1354 sizeof (fhandle_t)) == 0)
1355 break;
1356 }
1357
1358 /*
1359 * If lp is non-NULL, then there is already a cluster
1360 * started. We need to place ourselves in the cluster
1361 * list in the right place as determined by starting
1362 * offset. Conflicts with non-blocking mandatory locked
1363 * regions will be checked when the cluster is processed.
1364 */
1365 if (lp != NULL) {
1366 rp = lp->list;
1367 trp = NULL;
1368 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1369 trp = rp;
1370 rp = rp->list;
1371 }
1372 nrp->list = rp;
1373 if (trp == NULL)
1374 lp->list = nrp;
1375 else
1376 trp->list = nrp;
1377 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1378 cv_wait(&lp->cv, &nsrv->async_write_lock);
1379 mutex_exit(&nsrv->async_write_lock);
1380
1381 return;
1382 }
1383
1384 /*
1385 * No cluster started yet, start one and add ourselves
1386 * to the list of clusters.
1387 */
1388 nrp->list = NULL;
1389
1390 nlp = &nlpsp;
1391 nlp->fhp = &wa->wa_fhandle;
1392 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1393 nlp->list = nrp;
1394 nlp->next = NULL;
1395
1396 if (nsrv->async_write_head == NULL) {
1397 nsrv->async_write_head = nlp;
1398 } else {
1399 lp = nsrv->async_write_head;
1400 while (lp->next != NULL)
1401 lp = lp->next;
1402 lp->next = nlp;
1403 }
1404 mutex_exit(&nsrv->async_write_lock);
1405
1406 /*
1407 * Convert the file handle common to all of the requests
1408 * in this cluster to a vnode.
1409 */
1410 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1411 if (vp == NULL) {
1412 mutex_enter(&nsrv->async_write_lock);
1413 if (nsrv->async_write_head == nlp)
1414 nsrv->async_write_head = nlp->next;
1415 else {
1416 lp = nsrv->async_write_head;
1417 while (lp->next != nlp)
1418 lp = lp->next;
1419 lp->next = nlp->next;
1420 }
1421 t_flag = curthread->t_flag & T_WOULDBLOCK;
1422 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1423 rp->ns->ns_status = NFSERR_STALE;
1424 rp->thread->t_flag |= t_flag;
1425 }
1426 cv_broadcast(&nlp->cv);
1427 mutex_exit(&nsrv->async_write_lock);
1428
1429 return;
1430 }
1431
1432 /*
1433 * Can only write regular files. Attempts to write any
1434 * other file types fail with EISDIR.
1435 */
1436 if (vp->v_type != VREG) {
1437 VN_RELE(vp);
1438 mutex_enter(&nsrv->async_write_lock);
1439 if (nsrv->async_write_head == nlp)
1440 nsrv->async_write_head = nlp->next;
1441 else {
1442 lp = nsrv->async_write_head;
1443 while (lp->next != nlp)
1444 lp = lp->next;
1445 lp->next = nlp->next;
1446 }
1447 t_flag = curthread->t_flag & T_WOULDBLOCK;
1448 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1449 rp->ns->ns_status = NFSERR_ISDIR;
1450 rp->thread->t_flag |= t_flag;
1451 }
1452 cv_broadcast(&nlp->cv);
1453 mutex_exit(&nsrv->async_write_lock);
1454
1455 return;
1456 }
1457
1458 /*
1459 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1460 * deadlock with ufs.
1461 */
1462 if (nbl_need_check(vp)) {
1463 nbl_start_crit(vp, RW_READER);
1464 in_crit = 1;
1465 }
1466
1467 ct.cc_sysid = 0;
1468 ct.cc_pid = 0;
1469 ct.cc_caller_id = nfs2_srv_caller_id;
1470 ct.cc_flags = CC_DONTBLOCK;
1471
1472 /*
1473 * Lock the file for writing. This operation provides
1474 * the delay which allows clusters to grow.
1475 */
1476 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1477
1478 /* check if a monitor detected a delegation conflict */
1479 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1480 if (in_crit)
1481 nbl_end_crit(vp);
1482 VN_RELE(vp);
1483 /* mark as wouldblock so response is dropped */
1484 curthread->t_flag |= T_WOULDBLOCK;
1485 mutex_enter(&nsrv->async_write_lock);
1486 if (nsrv->async_write_head == nlp)
1487 nsrv->async_write_head = nlp->next;
1488 else {
1489 lp = nsrv->async_write_head;
1490 while (lp->next != nlp)
1491 lp = lp->next;
1492 lp->next = nlp->next;
1493 }
1494 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1495 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1496 rp->ns->ns_status = puterrno(error);
1497 rp->thread->t_flag |= T_WOULDBLOCK;
1498 }
1499 }
1500 cv_broadcast(&nlp->cv);
1501 mutex_exit(&nsrv->async_write_lock);
1502
1503 return;
1504 }
1505
1506 /*
1507 * Disconnect this cluster from the list of clusters.
1508 * The cluster that is being dealt with must be fixed
1509 * in size after this point, so there is no reason
1510 * to leave it on the list so that new requests can
1511 * find it.
1512 *
1513 * The algorithm is that the first write request will
1514 * create a cluster, convert the file handle to a
1515 * vnode pointer, and then lock the file for writing.
1516 * This request is not likely to be clustered with
1517 * any others. However, the next request will create
1518 * a new cluster and be blocked in VOP_RWLOCK while
1519 * the first request is being processed. This delay
1520 * will allow more requests to be clustered in this
1521 * second cluster.
1522 */
1523 mutex_enter(&nsrv->async_write_lock);
1524 if (nsrv->async_write_head == nlp)
1525 nsrv->async_write_head = nlp->next;
1526 else {
1527 lp = nsrv->async_write_head;
1528 while (lp->next != nlp)
1529 lp = lp->next;
1530 lp->next = nlp->next;
1531 }
1532 mutex_exit(&nsrv->async_write_lock);
1533
1534 /*
1535 * Step through the list of requests in this cluster.
1536 * We need to check permissions to make sure that all
1537 * of the requests have sufficient permission to write
1538 * the file. A cluster can be composed of requests
1539 * from different clients and different users on each
1540 * client.
1541 *
1542 * As a side effect, we also calculate the size of the
1543 * byte range that this cluster encompasses.
1544 */
1545 rp = nlp->list;
1546 off = rp->wa->wa_offset;
1547 len = (uint_t)0;
1548 do {
1549 if (rdonly(rp->ro, vp)) {
1550 rp->ns->ns_status = NFSERR_ROFS;
1551 t_flag = curthread->t_flag & T_WOULDBLOCK;
1552 rp->thread->t_flag |= t_flag;
1553 continue;
1554 }
1555
1556 va.va_mask = AT_UID|AT_MODE;
1557
1558 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1559
1560 if (!error) {
1561 if (crgetuid(rp->cr) != va.va_uid) {
1562 /*
1563 * This is a kludge to allow writes of files
1564 * created with read only permission. The
1565 * owner of the file is always allowed to
1566 * write it.
1567 */
1568 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1569 }
1570 if (!error && MANDLOCK(vp, va.va_mode))
1571 error = EACCES;
1572 }
1573
1574 /*
1575 * Check for a conflict with a nbmand-locked region.
1576 */
1577 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1578 rp->wa->wa_count, 0, NULL)) {
1579 error = EACCES;
1580 }
1581
1582 if (error) {
1583 rp->ns->ns_status = puterrno(error);
1584 t_flag = curthread->t_flag & T_WOULDBLOCK;
1585 rp->thread->t_flag |= t_flag;
1586 continue;
1587 }
1588 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1589 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1590 } while ((rp = rp->list) != NULL);
1591
1592 /*
1593 * Step through the cluster attempting to gather as many
1594 * requests which are contiguous as possible. These
1595 * contiguous requests are handled via one call to VOP_WRITE
1596 * instead of different calls to VOP_WRITE. We also keep
1597 * track of the fact that any data was written.
1598 */
1599 rp = nlp->list;
1600 data_written = 0;
1601 do {
1602 /*
1603 * Skip any requests which are already marked as having an
1604 * error.
1605 */
1606 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1607 rp = rp->list;
1608 continue;
1609 }
1610
1611 /*
1612 * Count the number of iovec's which are required
1613 * to handle this set of requests. One iovec is
1614 * needed for each data buffer, whether addressed
1615 * by wa_data or by the b_rptr pointers in the
1616 * mblk chains.
1617 */
1618 iovcnt = 0;
1619 lrp = rp;
1620 for (;;) {
1621 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1622 iovcnt++;
1623 else {
1624 m = lrp->wa->wa_mblk;
1625 while (m != NULL) {
1626 iovcnt++;
1627 m = m->b_cont;
1628 }
1629 }
1630 if (lrp->list == NULL ||
1631 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1632 lrp->wa->wa_offset + lrp->wa->wa_count !=
1633 lrp->list->wa->wa_offset) {
1634 lrp = lrp->list;
1635 break;
1636 }
1637 lrp = lrp->list;
1638 }
1639
1640 if (iovcnt <= MAXCLIOVECS) {
1641 #ifdef DEBUG
1642 rfs_write_hits++;
1643 #endif
1644 niovp = iov;
1645 } else {
1646 #ifdef DEBUG
1647 rfs_write_misses++;
1648 #endif
1649 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1650 }
1651 /*
1652 * Put together the scatter/gather iovecs.
1653 */
1654 iovp = niovp;
1655 trp = rp;
1656 count = 0;
1657 do {
1658 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1659 if (trp->wa->wa_rlist) {
1660 iovp->iov_base =
1661 (char *)((trp->wa->wa_rlist)->
1662 u.c_daddr3);
1663 iovp->iov_len = trp->wa->wa_count;
1664 } else {
1665 iovp->iov_base = trp->wa->wa_data;
1666 iovp->iov_len = trp->wa->wa_count;
1667 }
1668 iovp++;
1669 } else {
1670 m = trp->wa->wa_mblk;
1671 rcount = trp->wa->wa_count;
1672 while (m != NULL) {
1673 iovp->iov_base = (caddr_t)m->b_rptr;
1674 iovp->iov_len = (m->b_wptr - m->b_rptr);
1675 rcount -= iovp->iov_len;
1676 if (rcount < 0)
1677 iovp->iov_len += rcount;
1678 iovp++;
1679 if (rcount <= 0)
1680 break;
1681 m = m->b_cont;
1682 }
1683 }
1684 count += trp->wa->wa_count;
1685 trp = trp->list;
1686 } while (trp != lrp);
1687
1688 uio.uio_iov = niovp;
1689 uio.uio_iovcnt = iovcnt;
1690 uio.uio_segflg = UIO_SYSSPACE;
1691 uio.uio_extflg = UIO_COPY_DEFAULT;
1692 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1693 uio.uio_resid = count;
1694 /*
1695 * The limit is checked on the client. We
1696 * should allow any size writes here.
1697 */
1698 uio.uio_llimit = curproc->p_fsz_ctl;
1699 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1700 if (rlimit < (rlim64_t)uio.uio_resid)
1701 uio.uio_resid = (uint_t)rlimit;
1702
1703 /*
1704 * For now we assume no append mode.
1705 */
1706
1707 /*
1708 * We're changing creds because VM may fault
1709 * and we need the cred of the current
1710 * thread to be used if quota * checking is
1711 * enabled.
1712 */
1713 savecred = curthread->t_cred;
1714 curthread->t_cred = cr;
1715 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1716 curthread->t_cred = savecred;
1717
1718 /* check if a monitor detected a delegation conflict */
1719 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1720 /* mark as wouldblock so response is dropped */
1721 curthread->t_flag |= T_WOULDBLOCK;
1722
1723 if (niovp != iov)
1724 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1725
1726 if (!error) {
1727 data_written = 1;
1728 /*
1729 * Get attributes again so we send the latest mod
1730 * time to the client side for its cache.
1731 */
1732 va.va_mask = AT_ALL; /* now we want everything */
1733
1734 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1735
1736 if (!error)
1737 acl_perm(vp, exi, &va, rp->cr);
1738 }
1739
1740 /*
1741 * Fill in the status responses for each request
1742 * which was just handled. Also, copy the latest
1743 * attributes in to the attribute responses if
1744 * appropriate.
1745 */
1746 t_flag = curthread->t_flag & T_WOULDBLOCK;
1747 do {
1748 rp->thread->t_flag |= t_flag;
1749 /* check for overflows */
1750 if (!error) {
1751 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1752 }
1753 rp->ns->ns_status = puterrno(error);
1754 rp = rp->list;
1755 } while (rp != lrp);
1756 } while (rp != NULL);
1757
1758 /*
1759 * If any data was written at all, then we need to flush
1760 * the data and metadata to stable storage.
1761 */
1762 if (data_written) {
1763 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1764
1765 if (!error) {
1766 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1767 }
1768 }
1769
1770 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1771
1772 if (in_crit)
1773 nbl_end_crit(vp);
1774 VN_RELE(vp);
1775
1776 t_flag = curthread->t_flag & T_WOULDBLOCK;
1777 mutex_enter(&nsrv->async_write_lock);
1778 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1779 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1780 rp->ns->ns_status = puterrno(error);
1781 rp->thread->t_flag |= t_flag;
1782 }
1783 }
1784 cv_broadcast(&nlp->cv);
1785 mutex_exit(&nsrv->async_write_lock);
1786
1787 }
1788
1789 void *
1790 rfs_write_getfh(struct nfswriteargs *wa)
1791 {
1792 return (&wa->wa_fhandle);
1793 }
1794
1795 /*
1796 * Create a file.
1797 * Creates a file with given attributes and returns those attributes
1798 * and an fhandle for the new file.
1799 */
1800 void
1801 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1802 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1803 {
1804 int error;
1805 int lookuperr;
1806 int in_crit = 0;
1807 struct vattr va;
1808 vnode_t *vp;
1809 vnode_t *realvp;
1810 vnode_t *dvp;
1811 char *name = args->ca_da.da_name;
1812 vnode_t *tvp = NULL;
1813 int mode;
1814 int lookup_ok;
1815 bool_t trunc;
1816 struct sockaddr *ca;
1817
1818 /*
1819 * Disallow NULL paths
1820 */
1821 if (name == NULL || *name == '\0') {
1822 dr->dr_status = NFSERR_ACCES;
1823 return;
1824 }
1825
1826 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1827 if (dvp == NULL) {
1828 dr->dr_status = NFSERR_STALE;
1829 return;
1830 }
1831
1832 error = sattr_to_vattr(args->ca_sa, &va);
1833 if (error) {
1834 dr->dr_status = puterrno(error);
1835 return;
1836 }
1837
1838 /*
1839 * Must specify the mode.
1840 */
1841 if (!(va.va_mask & AT_MODE)) {
1842 VN_RELE(dvp);
1843 dr->dr_status = NFSERR_INVAL;
1844 return;
1845 }
1846
1847 /*
1848 * This is a completely gross hack to make mknod
1849 * work over the wire until we can wack the protocol
1850 */
1851 if ((va.va_mode & IFMT) == IFCHR) {
1852 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1853 va.va_type = VFIFO; /* xtra kludge for named pipe */
1854 else {
1855 va.va_type = VCHR;
1856 /*
1857 * uncompress the received dev_t
1858 * if the top half is zero indicating a request
1859 * from an `older style' OS.
1860 */
1861 if ((va.va_size & 0xffff0000) == 0)
1862 va.va_rdev = nfsv2_expdev(va.va_size);
1863 else
1864 va.va_rdev = (dev_t)va.va_size;
1865 }
1866 va.va_mask &= ~AT_SIZE;
1867 } else if ((va.va_mode & IFMT) == IFBLK) {
1868 va.va_type = VBLK;
1869 /*
1870 * uncompress the received dev_t
1871 * if the top half is zero indicating a request
1872 * from an `older style' OS.
1873 */
1874 if ((va.va_size & 0xffff0000) == 0)
1875 va.va_rdev = nfsv2_expdev(va.va_size);
1876 else
1877 va.va_rdev = (dev_t)va.va_size;
1878 va.va_mask &= ~AT_SIZE;
1879 } else if ((va.va_mode & IFMT) == IFSOCK) {
1880 va.va_type = VSOCK;
1881 } else {
1882 va.va_type = VREG;
1883 }
1884 va.va_mode &= ~IFMT;
1885 va.va_mask |= AT_TYPE;
1886
1887 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1888 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1889 MAXPATHLEN);
1890 if (name == NULL) {
1891 dr->dr_status = puterrno(EINVAL);
1892 return;
1893 }
1894
1895 /*
1896 * Why was the choice made to use VWRITE as the mode to the
1897 * call to VOP_CREATE ? This results in a bug. When a client
1898 * opens a file that already exists and is RDONLY, the second
1899 * open fails with an EACESS because of the mode.
1900 * bug ID 1054648.
1901 */
1902 lookup_ok = 0;
1903 mode = VWRITE;
1904 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1905 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1906 NULL, NULL, NULL);
1907 if (!error) {
1908 struct vattr at;
1909
1910 lookup_ok = 1;
1911 at.va_mask = AT_MODE;
1912 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1913 if (!error)
1914 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1915 VN_RELE(tvp);
1916 tvp = NULL;
1917 }
1918 }
1919
1920 if (!lookup_ok) {
1921 if (rdonly(ro, dvp)) {
1922 error = EROFS;
1923 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1924 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1925 error = EPERM;
1926 } else {
1927 error = 0;
1928 }
1929 }
1930
1931 /*
1932 * If file size is being modified on an already existing file
1933 * make sure that there are no conflicting non-blocking mandatory
1934 * locks in the region being manipulated. Return EACCES if there
1935 * are conflicting locks.
1936 */
1937 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1938 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1939 NULL, NULL, NULL);
1940
1941 if (!lookuperr &&
1942 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1943 VN_RELE(tvp);
1944 curthread->t_flag |= T_WOULDBLOCK;
1945 goto out;
1946 }
1947
1948 if (!lookuperr && nbl_need_check(tvp)) {
1949 /*
1950 * The file exists. Now check if it has any
1951 * conflicting non-blocking mandatory locks
1952 * in the region being changed.
1953 */
1954 struct vattr bva;
1955 u_offset_t offset;
1956 ssize_t length;
1957
1958 nbl_start_crit(tvp, RW_READER);
1959 in_crit = 1;
1960
1961 bva.va_mask = AT_SIZE;
1962 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1963 if (!error) {
1964 if (va.va_size < bva.va_size) {
1965 offset = va.va_size;
1966 length = bva.va_size - va.va_size;
1967 } else {
1968 offset = bva.va_size;
1969 length = va.va_size - bva.va_size;
1970 }
1971 if (length) {
1972 if (nbl_conflict(tvp, NBL_WRITE,
1973 offset, length, 0, NULL)) {
1974 error = EACCES;
1975 }
1976 }
1977 }
1978 if (error) {
1979 nbl_end_crit(tvp);
1980 VN_RELE(tvp);
1981 in_crit = 0;
1982 }
1983 } else if (tvp != NULL) {
1984 VN_RELE(tvp);
1985 }
1986 }
1987
1988 if (!error) {
1989 /*
1990 * If filesystem is shared with nosuid the remove any
1991 * setuid/setgid bits on create.
1992 */
1993 if (va.va_type == VREG &&
1994 exi->exi_export.ex_flags & EX_NOSUID)
1995 va.va_mode &= ~(VSUID | VSGID);
1996
1997 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1998 NULL, NULL);
1999
2000 if (!error) {
2001
2002 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2003 trunc = TRUE;
2004 else
2005 trunc = FALSE;
2006
2007 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2008 VN_RELE(vp);
2009 curthread->t_flag |= T_WOULDBLOCK;
2010 goto out;
2011 }
2012 va.va_mask = AT_ALL;
2013
2014 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2015
2016 /* check for overflows */
2017 if (!error) {
2018 acl_perm(vp, exi, &va, cr);
2019 error = vattr_to_nattr(&va, &dr->dr_attr);
2020 if (!error) {
2021 error = makefh(&dr->dr_fhandle, vp,
2022 exi);
2023 }
2024 }
2025 /*
2026 * Force modified metadata out to stable storage.
2027 *
2028 * if a underlying vp exists, pass it to VOP_FSYNC
2029 */
2030 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2031 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2032 else
2033 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2034 VN_RELE(vp);
2035 }
2036
2037 if (in_crit) {
2038 nbl_end_crit(tvp);
2039 VN_RELE(tvp);
2040 }
2041 }
2042
2043 /*
2044 * Force modified data and metadata out to stable storage.
2045 */
2046 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2047
2048 out:
2049
2050 VN_RELE(dvp);
2051
2052 dr->dr_status = puterrno(error);
2053
2054 if (name != args->ca_da.da_name)
2055 kmem_free(name, MAXPATHLEN);
2056 }
2057 void *
2058 rfs_create_getfh(struct nfscreatargs *args)
2059 {
2060 return (args->ca_da.da_fhandle);
2061 }
2062
2063 /*
2064 * Remove a file.
2065 * Remove named file from parent directory.
2066 */
2067 /* ARGSUSED */
2068 void
2069 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2070 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2071 {
2072 int error = 0;
2073 vnode_t *vp;
2074 vnode_t *targvp;
2075 int in_crit = 0;
2076
2077 /*
2078 * Disallow NULL paths
2079 */
2080 if (da->da_name == NULL || *da->da_name == '\0') {
2081 *status = NFSERR_ACCES;
2082 return;
2083 }
2084
2085 vp = nfs_fhtovp(da->da_fhandle, exi);
2086 if (vp == NULL) {
2087 *status = NFSERR_STALE;
2088 return;
2089 }
2090
2091 if (rdonly(ro, vp)) {
2092 VN_RELE(vp);
2093 *status = NFSERR_ROFS;
2094 return;
2095 }
2096
2097 /*
2098 * Check for a conflict with a non-blocking mandatory share reservation.
2099 */
2100 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2101 NULL, cr, NULL, NULL, NULL);
2102 if (error != 0) {
2103 VN_RELE(vp);
2104 *status = puterrno(error);
2105 return;
2106 }
2107
2108 /*
2109 * If the file is delegated to an v4 client, then initiate
2110 * recall and drop this request (by setting T_WOULDBLOCK).
2111 * The client will eventually re-transmit the request and
2112 * (hopefully), by then, the v4 client will have returned
2113 * the delegation.
2114 */
2115
2116 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2117 VN_RELE(vp);
2118 VN_RELE(targvp);
2119 curthread->t_flag |= T_WOULDBLOCK;
2120 return;
2121 }
2122
2123 if (nbl_need_check(targvp)) {
2124 nbl_start_crit(targvp, RW_READER);
2125 in_crit = 1;
2126 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2127 error = EACCES;
2128 goto out;
2129 }
2130 }
2131
2132 error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2133
2134 /*
2135 * Force modified data and metadata out to stable storage.
2136 */
2137 (void) VOP_FSYNC(vp, 0, cr, NULL);
2138
2139 out:
2140 if (in_crit)
2141 nbl_end_crit(targvp);
2142 VN_RELE(targvp);
2143 VN_RELE(vp);
2144
2145 *status = puterrno(error);
2146
2147 }
2148
2149 void *
2150 rfs_remove_getfh(struct nfsdiropargs *da)
2151 {
2152 return (da->da_fhandle);
2153 }
2154
2155 /*
2156 * rename a file
2157 * Give a file (from) a new name (to).
2158 */
2159 /* ARGSUSED */
2160 void
2161 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2162 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2163 {
2164 int error = 0;
2165 vnode_t *fromvp;
2166 vnode_t *tovp;
2167 struct exportinfo *to_exi;
2168 fhandle_t *fh;
2169 vnode_t *srcvp;
2170 vnode_t *targvp;
2171 int in_crit = 0;
2172
2173 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2174 if (fromvp == NULL) {
2175 *status = NFSERR_STALE;
2176 return;
2177 }
2178
2179 fh = args->rna_to.da_fhandle;
2180 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2181 if (to_exi == NULL) {
2182 VN_RELE(fromvp);
2183 *status = NFSERR_ACCES;
2184 return;
2185 }
2186 exi_rele(to_exi);
2187
2188 if (to_exi != exi) {
2189 VN_RELE(fromvp);
2190 *status = NFSERR_XDEV;
2191 return;
2192 }
2193
2194 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2195 if (tovp == NULL) {
2196 VN_RELE(fromvp);
2197 *status = NFSERR_STALE;
2198 return;
2199 }
2200
2201 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2202 VN_RELE(tovp);
2203 VN_RELE(fromvp);
2204 *status = NFSERR_NOTDIR;
2205 return;
2206 }
2207
2208 /*
2209 * Disallow NULL paths
2210 */
2211 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2212 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2213 VN_RELE(tovp);
2214 VN_RELE(fromvp);
2215 *status = NFSERR_ACCES;
2216 return;
2217 }
2218
2219 if (rdonly(ro, tovp)) {
2220 VN_RELE(tovp);
2221 VN_RELE(fromvp);
2222 *status = NFSERR_ROFS;
2223 return;
2224 }
2225
2226 /*
2227 * Check for a conflict with a non-blocking mandatory share reservation.
2228 */
2229 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2230 NULL, cr, NULL, NULL, NULL);
2231 if (error != 0) {
2232 VN_RELE(tovp);
2233 VN_RELE(fromvp);
2234 *status = puterrno(error);
2235 return;
2236 }
2237
2238 /* Check for delegations on the source file */
2239
2240 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2241 VN_RELE(tovp);
2242 VN_RELE(fromvp);
2243 VN_RELE(srcvp);
2244 curthread->t_flag |= T_WOULDBLOCK;
2245 return;
2246 }
2247
2248 /* Check for delegation on the file being renamed over, if it exists */
2249
2250 if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2251 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2252 NULL, NULL, NULL) == 0) {
2253
2254 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2255 VN_RELE(tovp);
2256 VN_RELE(fromvp);
2257 VN_RELE(srcvp);
2258 VN_RELE(targvp);
2259 curthread->t_flag |= T_WOULDBLOCK;
2260 return;
2261 }
2262 VN_RELE(targvp);
2263 }
2264
2265
2266 if (nbl_need_check(srcvp)) {
2267 nbl_start_crit(srcvp, RW_READER);
2268 in_crit = 1;
2269 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2270 error = EACCES;
2271 goto out;
2272 }
2273 }
2274
2275 error = VOP_RENAME(fromvp, args->rna_from.da_name,
2276 tovp, args->rna_to.da_name, cr, NULL, 0);
2277
2278 if (error == 0)
2279 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2280 strlen(args->rna_to.da_name));
2281
2282 /*
2283 * Force modified data and metadata out to stable storage.
2284 */
2285 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2286 (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2287
2288 out:
2289 if (in_crit)
2290 nbl_end_crit(srcvp);
2291 VN_RELE(srcvp);
2292 VN_RELE(tovp);
2293 VN_RELE(fromvp);
2294
2295 *status = puterrno(error);
2296
2297 }
2298 void *
2299 rfs_rename_getfh(struct nfsrnmargs *args)
2300 {
2301 return (args->rna_from.da_fhandle);
2302 }
2303
2304 /*
2305 * Link to a file.
2306 * Create a file (to) which is a hard link to the given file (from).
2307 */
2308 /* ARGSUSED */
2309 void
2310 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2311 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2312 {
2313 int error;
2314 vnode_t *fromvp;
2315 vnode_t *tovp;
2316 struct exportinfo *to_exi;
2317 fhandle_t *fh;
2318
2319 fromvp = nfs_fhtovp(args->la_from, exi);
2320 if (fromvp == NULL) {
2321 *status = NFSERR_STALE;
2322 return;
2323 }
2324
2325 fh = args->la_to.da_fhandle;
2326 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2327 if (to_exi == NULL) {
2328 VN_RELE(fromvp);
2329 *status = NFSERR_ACCES;
2330 return;
2331 }
2332 exi_rele(to_exi);
2333
2334 if (to_exi != exi) {
2335 VN_RELE(fromvp);
2336 *status = NFSERR_XDEV;
2337 return;
2338 }
2339
2340 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2341 if (tovp == NULL) {
2342 VN_RELE(fromvp);
2343 *status = NFSERR_STALE;
2344 return;
2345 }
2346
2347 if (tovp->v_type != VDIR) {
2348 VN_RELE(tovp);
2349 VN_RELE(fromvp);
2350 *status = NFSERR_NOTDIR;
2351 return;
2352 }
2353 /*
2354 * Disallow NULL paths
2355 */
2356 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2357 VN_RELE(tovp);
2358 VN_RELE(fromvp);
2359 *status = NFSERR_ACCES;
2360 return;
2361 }
2362
2363 if (rdonly(ro, tovp)) {
2364 VN_RELE(tovp);
2365 VN_RELE(fromvp);
2366 *status = NFSERR_ROFS;
2367 return;
2368 }
2369
2370 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2371
2372 /*
2373 * Force modified data and metadata out to stable storage.
2374 */
2375 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2376 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2377
2378 VN_RELE(tovp);
2379 VN_RELE(fromvp);
2380
2381 *status = puterrno(error);
2382
2383 }
2384 void *
2385 rfs_link_getfh(struct nfslinkargs *args)
2386 {
2387 return (args->la_from);
2388 }
2389
2390 /*
2391 * Symbolicly link to a file.
2392 * Create a file (to) with the given attributes which is a symbolic link
2393 * to the given path name (to).
2394 */
2395 void
2396 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2397 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2398 {
2399 int error;
2400 struct vattr va;
2401 vnode_t *vp;
2402 vnode_t *svp;
2403 int lerror;
2404 struct sockaddr *ca;
2405 char *name = NULL;
2406
2407 /*
2408 * Disallow NULL paths
2409 */
2410 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2411 *status = NFSERR_ACCES;
2412 return;
2413 }
2414
2415 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2416 if (vp == NULL) {
2417 *status = NFSERR_STALE;
2418 return;
2419 }
2420
2421 if (rdonly(ro, vp)) {
2422 VN_RELE(vp);
2423 *status = NFSERR_ROFS;
2424 return;
2425 }
2426
2427 error = sattr_to_vattr(args->sla_sa, &va);
2428 if (error) {
2429 VN_RELE(vp);
2430 *status = puterrno(error);
2431 return;
2432 }
2433
2434 if (!(va.va_mask & AT_MODE)) {
2435 VN_RELE(vp);
2436 *status = NFSERR_INVAL;
2437 return;
2438 }
2439
2440 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2441 name = nfscmd_convname(ca, exi, args->sla_tnm,
2442 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2443
2444 if (name == NULL) {
2445 *status = NFSERR_ACCES;
2446 return;
2447 }
2448
2449 va.va_type = VLNK;
2450 va.va_mask |= AT_TYPE;
2451
2452 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2453
2454 /*
2455 * Force new data and metadata out to stable storage.
2456 */
2457 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2458 NULL, cr, NULL, NULL, NULL);
2459
2460 if (!lerror) {
2461 (void) VOP_FSYNC(svp, 0, cr, NULL);
2462 VN_RELE(svp);
2463 }
2464
2465 /*
2466 * Force modified data and metadata out to stable storage.
2467 */
2468 (void) VOP_FSYNC(vp, 0, cr, NULL);
2469
2470 VN_RELE(vp);
2471
2472 *status = puterrno(error);
2473 if (name != args->sla_tnm)
2474 kmem_free(name, MAXPATHLEN);
2475
2476 }
2477 void *
2478 rfs_symlink_getfh(struct nfsslargs *args)
2479 {
2480 return (args->sla_from.da_fhandle);
2481 }
2482
2483 /*
2484 * Make a directory.
2485 * Create a directory with the given name, parent directory, and attributes.
2486 * Returns a file handle and attributes for the new directory.
2487 */
2488 /* ARGSUSED */
2489 void
2490 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2491 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2492 {
2493 int error;
2494 struct vattr va;
2495 vnode_t *dvp = NULL;
2496 vnode_t *vp;
2497 char *name = args->ca_da.da_name;
2498
2499 /*
2500 * Disallow NULL paths
2501 */
2502 if (name == NULL || *name == '\0') {
2503 dr->dr_status = NFSERR_ACCES;
2504 return;
2505 }
2506
2507 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2508 if (vp == NULL) {
2509 dr->dr_status = NFSERR_STALE;
2510 return;
2511 }
2512
2513 if (rdonly(ro, vp)) {
2514 VN_RELE(vp);
2515 dr->dr_status = NFSERR_ROFS;
2516 return;
2517 }
2518
2519 error = sattr_to_vattr(args->ca_sa, &va);
2520 if (error) {
2521 VN_RELE(vp);
2522 dr->dr_status = puterrno(error);
2523 return;
2524 }
2525
2526 if (!(va.va_mask & AT_MODE)) {
2527 VN_RELE(vp);
2528 dr->dr_status = NFSERR_INVAL;
2529 return;
2530 }
2531
2532 va.va_type = VDIR;
2533 va.va_mask |= AT_TYPE;
2534
2535 error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2536
2537 if (!error) {
2538 /*
2539 * Attribtutes of the newly created directory should
2540 * be returned to the client.
2541 */
2542 va.va_mask = AT_ALL; /* We want everything */
2543 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2544
2545 /* check for overflows */
2546 if (!error) {
2547 acl_perm(vp, exi, &va, cr);
2548 error = vattr_to_nattr(&va, &dr->dr_attr);
2549 if (!error) {
2550 error = makefh(&dr->dr_fhandle, dvp, exi);
2551 }
2552 }
2553 /*
2554 * Force new data and metadata out to stable storage.
2555 */
2556 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2557 VN_RELE(dvp);
2558 }
2559
2560 /*
2561 * Force modified data and metadata out to stable storage.
2562 */
2563 (void) VOP_FSYNC(vp, 0, cr, NULL);
2564
2565 VN_RELE(vp);
2566
2567 dr->dr_status = puterrno(error);
2568
2569 }
2570 void *
2571 rfs_mkdir_getfh(struct nfscreatargs *args)
2572 {
2573 return (args->ca_da.da_fhandle);
2574 }
2575
2576 /*
2577 * Remove a directory.
2578 * Remove the given directory name from the given parent directory.
2579 */
2580 /* ARGSUSED */
2581 void
2582 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2583 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2584 {
2585 int error;
2586 vnode_t *vp;
2587
2588 /*
2589 * Disallow NULL paths
2590 */
2591 if (da->da_name == NULL || *da->da_name == '\0') {
2592 *status = NFSERR_ACCES;
2593 return;
2594 }
2595
2596 vp = nfs_fhtovp(da->da_fhandle, exi);
2597 if (vp == NULL) {
2598 *status = NFSERR_STALE;
2599 return;
2600 }
2601
2602 if (rdonly(ro, vp)) {
2603 VN_RELE(vp);
2604 *status = NFSERR_ROFS;
2605 return;
2606 }
2607
2608 /*
2609 * VOP_RMDIR takes a third argument (the current
2610 * directory of the process). That's because someone
2611 * wants to return EINVAL if one tries to remove ".".
2612 * Of course, NFS servers have no idea what their
2613 * clients' current directories are. We fake it by
2614 * supplying a vnode known to exist and illegal to
2615 * remove.
2616 */
2617 error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2618
2619 /*
2620 * Force modified data and metadata out to stable storage.
2621 */
2622 (void) VOP_FSYNC(vp, 0, cr, NULL);
2623
2624 VN_RELE(vp);
2625
2626 /*
2627 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2628 * if the directory is not empty. A System V NFS server
2629 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2630 * over the wire.
2631 */
2632 if (error == EEXIST)
2633 *status = NFSERR_NOTEMPTY;
2634 else
2635 *status = puterrno(error);
2636
2637 }
2638 void *
2639 rfs_rmdir_getfh(struct nfsdiropargs *da)
2640 {
2641 return (da->da_fhandle);
2642 }
2643
2644 /* ARGSUSED */
2645 void
2646 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2647 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2648 {
2649 int error;
2650 int iseof;
2651 struct iovec iov;
2652 struct uio uio;
2653 vnode_t *vp;
2654 char *ndata = NULL;
2655 struct sockaddr *ca;
2656 size_t nents;
2657 int ret;
2658
2659 vp = nfs_fhtovp(&rda->rda_fh, exi);
2660 if (vp == NULL) {
2661 rd->rd_entries = NULL;
2662 rd->rd_status = NFSERR_STALE;
2663 return;
2664 }
2665
2666 if (vp->v_type != VDIR) {
2667 VN_RELE(vp);
2668 rd->rd_entries = NULL;
2669 rd->rd_status = NFSERR_NOTDIR;
2670 return;
2671 }
2672
2673 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2674
2675 error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2676
2677 if (error) {
2678 rd->rd_entries = NULL;
2679 goto bad;
2680 }
2681
2682 if (rda->rda_count == 0) {
2683 rd->rd_entries = NULL;
2684 rd->rd_size = 0;
2685 rd->rd_eof = FALSE;
2686 goto bad;
2687 }
2688
2689 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2690
2691 /*
2692 * Allocate data for entries. This will be freed by rfs_rddirfree.
2693 */
2694 rd->rd_bufsize = (uint_t)rda->rda_count;
2695 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2696
2697 /*
2698 * Set up io vector to read directory data
2699 */
2700 iov.iov_base = (caddr_t)rd->rd_entries;
2701 iov.iov_len = rda->rda_count;
2702 uio.uio_iov = &iov;
2703 uio.uio_iovcnt = 1;
2704 uio.uio_segflg = UIO_SYSSPACE;
2705 uio.uio_extflg = UIO_COPY_CACHED;
2706 uio.uio_loffset = (offset_t)rda->rda_offset;
2707 uio.uio_resid = rda->rda_count;
2708
2709 /*
2710 * read directory
2711 */
2712 error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2713
2714 /*
2715 * Clean up
2716 */
2717 if (!error) {
2718 /*
2719 * set size and eof
2720 */
2721 if (uio.uio_resid == rda->rda_count) {
2722 rd->rd_size = 0;
2723 rd->rd_eof = TRUE;
2724 } else {
2725 rd->rd_size = (uint32_t)(rda->rda_count -
2726 uio.uio_resid);
2727 rd->rd_eof = iseof ? TRUE : FALSE;
2728 }
2729 }
2730
2731 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2732 nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2733 ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2734 rda->rda_count, &ndata);
2735
2736 if (ret != 0) {
2737 size_t dropbytes;
2738 /*
2739 * We had to drop one or more entries in order to fit
2740 * during the character conversion. We need to patch
2741 * up the size and eof info.
2742 */
2743 if (rd->rd_eof)
2744 rd->rd_eof = FALSE;
2745 dropbytes = nfscmd_dropped_entrysize(
2746 (struct dirent64 *)rd->rd_entries, nents, ret);
2747 rd->rd_size -= dropbytes;
2748 }
2749 if (ndata == NULL) {
2750 ndata = (char *)rd->rd_entries;
2751 } else if (ndata != (char *)rd->rd_entries) {
2752 kmem_free(rd->rd_entries, rd->rd_bufsize);
2753 rd->rd_entries = (void *)ndata;
2754 rd->rd_bufsize = rda->rda_count;
2755 }
2756
2757 bad:
2758 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2759
2760 #if 0 /* notyet */
2761 /*
2762 * Don't do this. It causes local disk writes when just
2763 * reading the file and the overhead is deemed larger
2764 * than the benefit.
2765 */
2766 /*
2767 * Force modified metadata out to stable storage.
2768 */
2769 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2770 #endif
2771
2772 VN_RELE(vp);
2773
2774 rd->rd_status = puterrno(error);
2775
2776 }
2777 void *
2778 rfs_readdir_getfh(struct nfsrddirargs *rda)
2779 {
2780 return (&rda->rda_fh);
2781 }
2782 void
2783 rfs_rddirfree(struct nfsrddirres *rd)
2784 {
2785 if (rd->rd_entries != NULL)
2786 kmem_free(rd->rd_entries, rd->rd_bufsize);
2787 }
2788
2789 /* ARGSUSED */
2790 void
2791 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2792 struct svc_req *req, cred_t *cr, bool_t ro)
2793 {
2794 int error;
2795 struct statvfs64 sb;
2796 vnode_t *vp;
2797
2798 vp = nfs_fhtovp(fh, exi);
2799 if (vp == NULL) {
2800 fs->fs_status = NFSERR_STALE;
2801 return;
2802 }
2803
2804 error = VFS_STATVFS(vp->v_vfsp, &sb);
2805
2806 if (!error) {
2807 fs->fs_tsize = nfstsize();
2808 fs->fs_bsize = sb.f_frsize;
2809 fs->fs_blocks = sb.f_blocks;
2810 fs->fs_bfree = sb.f_bfree;
2811 fs->fs_bavail = sb.f_bavail;
2812 }
2813
2814 VN_RELE(vp);
2815
2816 fs->fs_status = puterrno(error);
2817
2818 }
2819 void *
2820 rfs_statfs_getfh(fhandle_t *fh)
2821 {
2822 return (fh);
2823 }
2824
2825 static int
2826 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2827 {
2828 vap->va_mask = 0;
2829
2830 /*
2831 * There was a sign extension bug in some VFS based systems
2832 * which stored the mode as a short. When it would get
2833 * assigned to a u_long, no sign extension would occur.
2834 * It needed to, but this wasn't noticed because sa_mode
2835 * would then get assigned back to the short, thus ignoring
2836 * the upper 16 bits of sa_mode.
2837 *
2838 * To make this implementation work for both broken
2839 * clients and good clients, we check for both versions
2840 * of the mode.
2841 */
2842 if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2843 sa->sa_mode != (uint32_t)-1) {
2844 vap->va_mask |= AT_MODE;
2845 vap->va_mode = sa->sa_mode;
2846 }
2847 if (sa->sa_uid != (uint32_t)-1) {
2848 vap->va_mask |= AT_UID;
2849 vap->va_uid = sa->sa_uid;
2850 }
2851 if (sa->sa_gid != (uint32_t)-1) {
2852 vap->va_mask |= AT_GID;
2853 vap->va_gid = sa->sa_gid;
2854 }
2855 if (sa->sa_size != (uint32_t)-1) {
2856 vap->va_mask |= AT_SIZE;
2857 vap->va_size = sa->sa_size;
2858 }
2859 if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2860 sa->sa_atime.tv_usec != (int32_t)-1) {
2861 #ifndef _LP64
2862 /* return error if time overflow */
2863 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2864 return (EOVERFLOW);
2865 #endif
2866 vap->va_mask |= AT_ATIME;
2867 /*
2868 * nfs protocol defines times as unsigned so don't extend sign,
2869 * unless sysadmin set nfs_allow_preepoch_time.
2870 */
2871 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2872 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2873 }
2874 if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2875 sa->sa_mtime.tv_usec != (int32_t)-1) {
2876 #ifndef _LP64
2877 /* return error if time overflow */
2878 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2879 return (EOVERFLOW);
2880 #endif
2881 vap->va_mask |= AT_MTIME;
2882 /*
2883 * nfs protocol defines times as unsigned so don't extend sign,
2884 * unless sysadmin set nfs_allow_preepoch_time.
2885 */
2886 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2887 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2888 }
2889 return (0);
2890 }
2891
/*
 * Translation table from vnode type to over-the-wire file type.  It is
 * indexed by vap->va_type in vattr_to_nattr().  An entry of 0 marks a
 * vnode type that has no direct wire equivalent in this table.
 *
 * The table holds 11 entries (indices 0 through 10).
 * NOTE(review): the meaning of each index depends on the ordering of the
 * vnode type enumeration, which is declared elsewhere -- confirm that the
 * table covers every value that can be used as an index.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2895
2896 /*
2897 * check the following fields for overflow: nodeid, size, and time.
2898 * There could be a problem when converting 64-bit LP64 fields
2899 * into 32-bit ones. Return an error if there is an overflow.
2900 */
2901 int
2902 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2903 {
2904 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2905 na->na_type = vt_to_nf[vap->va_type];
2906
2907 if (vap->va_mode == (unsigned short) -1)
2908 na->na_mode = (uint32_t)-1;
2909 else
2910 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2911
2912 if (vap->va_uid == (unsigned short)(-1))
2913 na->na_uid = (uint32_t)(-1);
2914 else if (vap->va_uid == UID_NOBODY)
2915 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2916 else
2917 na->na_uid = vap->va_uid;
2918
2919 if (vap->va_gid == (unsigned short)(-1))
2920 na->na_gid = (uint32_t)-1;
2921 else if (vap->va_gid == GID_NOBODY)
2922 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2923 else
2924 na->na_gid = vap->va_gid;
2925
2926 /*
2927 * Do we need to check fsid for overflow? It is 64-bit in the
2928 * vattr, but are bigger than 32 bit values supported?
2929 */
2930 na->na_fsid = vap->va_fsid;
2931
2932 na->na_nodeid = vap->va_nodeid;
2933
2934 /*
2935 * Check to make sure that the nodeid is representable over the
2936 * wire without losing bits.
2937 */
2938 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2939 return (EFBIG);
2940 na->na_nlink = vap->va_nlink;
2941
2942 /*
2943 * Check for big files here, instead of at the caller. See
2944 * comments in cstat for large special file explanation.
2945 */
2946 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2947 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2948 return (EFBIG);
2949 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2950 /* UNKNOWN_SIZE | OVERFLOW */
2951 na->na_size = MAXOFF32_T;
2952 } else
2953 na->na_size = vap->va_size;
2954 } else
2955 na->na_size = vap->va_size;
2956
2957 /*
2958 * If the vnode times overflow the 32-bit times that NFS2
2959 * uses on the wire then return an error.
2960 */
2961 if (!NFS_VAP_TIME_OK(vap)) {
2962 return (EOVERFLOW);
2963 }
2964 na->na_atime.tv_sec = vap->va_atime.tv_sec;
2965 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2966
2967 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2968 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2969
2970 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2971 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2972
2973 /*
2974 * If the dev_t will fit into 16 bits then compress
2975 * it, otherwise leave it alone. See comments in
2976 * nfs_client.c.
2977 */
2978 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2979 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2980 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2981 else
2982 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2983
2984 na->na_blocks = vap->va_nblocks;
2985 na->na_blocksize = vap->va_blksize;
2986
2987 /*
2988 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2989 * over-the-wire protocols for named-pipe vnodes. It remaps the
2990 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2991 *
2992 * BUYER BEWARE:
2993 * If you are porting the NFS to a non-Sun server, you probably
2994 * don't want to include the following block of code. The
2995 * over-the-wire special file types will be changing with the
2996 * NFS Protocol Revision.
2997 */
2998 if (vap->va_type == VFIFO)
2999 NA_SETFIFO(na);
3000 return (0);
3001 }
3002
3003 /*
3004 * acl v2 support: returns approximate permission.
3005 * default: returns minimal permission (more restrictive)
3006 * aclok: returns maximal permission (less restrictive)
3007 * This routine changes the permissions that are alaredy in *va.
3008 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3009 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3010 */
3011 static void
3012 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3013 {
3014 vsecattr_t vsa;
3015 int aclcnt;
3016 aclent_t *aclentp;
3017 mode_t mask_perm;
3018 mode_t grp_perm;
3019 mode_t other_perm;
3020 mode_t other_orig;
3021 int error;
3022
3023 /* dont care default acl */
3024 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3025 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3026
3027 if (!error) {
3028 aclcnt = vsa.vsa_aclcnt;
3029 if (aclcnt > MIN_ACL_ENTRIES) {
3030 /* non-trivial ACL */
3031 aclentp = vsa.vsa_aclentp;
3032 if (exi->exi_export.ex_flags & EX_ACLOK) {
3033 /* maximal permissions */
3034 grp_perm = 0;
3035 other_perm = 0;
3036 for (; aclcnt > 0; aclcnt--, aclentp++) {
3037 switch (aclentp->a_type) {
3038 case USER_OBJ:
3039 break;
3040 case USER:
3041 grp_perm |=
3042 aclentp->a_perm << 3;
3043 other_perm |= aclentp->a_perm;
3044 break;
3045 case GROUP_OBJ:
3046 grp_perm |=
3047 aclentp->a_perm << 3;
3048 break;
3049 case GROUP:
3050 other_perm |= aclentp->a_perm;
3051 break;
3052 case OTHER_OBJ:
3053 other_orig = aclentp->a_perm;
3054 break;
3055 case CLASS_OBJ:
3056 mask_perm = aclentp->a_perm;
3057 break;
3058 default:
3059 break;
3060 }
3061 }
3062 grp_perm &= mask_perm << 3;
3063 other_perm &= mask_perm;
3064 other_perm |= other_orig;
3065
3066 } else {
3067 /* minimal permissions */
3068 grp_perm = 070;
3069 other_perm = 07;
3070 for (; aclcnt > 0; aclcnt--, aclentp++) {
3071 switch (aclentp->a_type) {
3072 case USER_OBJ:
3073 break;
3074 case USER:
3075 case CLASS_OBJ:
3076 grp_perm &=
3077 aclentp->a_perm << 3;
3078 other_perm &=
3079 aclentp->a_perm;
3080 break;
3081 case GROUP_OBJ:
3082 grp_perm &=
3083 aclentp->a_perm << 3;
3084 break;
3085 case GROUP:
3086 other_perm &=
3087 aclentp->a_perm;
3088 break;
3089 case OTHER_OBJ:
3090 other_perm &=
3091 aclentp->a_perm;
3092 break;
3093 default:
3094 break;
3095 }
3096 }
3097 }
3098 /* copy to va */
3099 va->va_mode &= ~077;
3100 va->va_mode |= grp_perm | other_perm;
3101 }
3102 if (vsa.vsa_aclcnt)
3103 kmem_free(vsa.vsa_aclentp,
3104 vsa.vsa_aclcnt * sizeof (aclent_t));
3105 }
3106 }
3107
/*
 * Server initialization: obtain a caller id from fs_new_caller_id() and
 * record it in nfs2_srv_caller_id.
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
}
3113
/*
 * Server teardown hook.  The body is intentionally empty: there is
 * currently nothing to undo here.
 */
void
rfs_srvrfini(void)
{
}
3118
3119 /* ARGSUSED */
3120 void
3121 rfs_srv_zone_init(nfs_globals_t *ng)
3122 {
3123 nfs_srv_t *ns;
3124
3125 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3126
3127 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3128 ns->write_async = 1;
3129
3130 ng->nfs_srv = ns;
3131 }
3132
3133 /* ARGSUSED */
3134 void
3135 rfs_srv_zone_fini(nfs_globals_t *ng)
3136 {
3137 nfs_srv_t *ns = ng->nfs_srv;
3138
3139 ng->nfs_srv = NULL;
3140
3141 mutex_destroy(&ns->async_write_lock);
3142 kmem_free(ns, sizeof (*ns));
3143 }
3144
3145 static int
3146 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3147 {
3148 struct clist *wcl;
3149 int wlist_len;
3150 uint32_t count = rr->rr_count;
3151
3152 wcl = ra->ra_wlist;
3153
3154 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3155 return (FALSE);
3156 }
3157
3158 wcl = ra->ra_wlist;
3159 rr->rr_ok.rrok_wlist_len = wlist_len;
3160 rr->rr_ok.rrok_wlist = wcl;
3161
3162 return (TRUE);
3163 }