1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /*
29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 * All rights reserved.
31 */
32
33 /*
34 * Copyright 2018 Nexenta Systems, Inc.
35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 */
37
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/stat.h>
47 #include <sys/errno.h>
48 #include <sys/sysmacros.h>
49 #include <sys/statvfs.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/dirent.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/vtrace.h>
56 #include <sys/mode.h>
57 #include <sys/acl.h>
58 #include <sys/nbmlock.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/svc.h>
65
66 #include <nfs/nfs.h>
67 #include <nfs/export.h>
68 #include <nfs/nfs_cmd.h>
69
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_kmem.h>
75
76 #include <sys/strsubr.h>
77
78 struct rfs_async_write_list;
79
80 /*
81 * Zone globals of NFSv2 server
82 */
83 typedef struct nfs_srv {
84 kmutex_t async_write_lock;
85 struct rfs_async_write_list *async_write_head;
86
87 /*
88 * enables write clustering if == 1
89 */
90 int write_async;
91 } nfs_srv_t;
92
93 /*
94 * These are the interface routines for the server side of the
95 * Network File System. See the NFS version 2 protocol specification
96 * for a description of this interface.
97 */
98
99 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
100 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
101 cred_t *);
102
103
104 /*
105 * Some "over the wire" UNIX file types. These are encoded
106 * into the mode. This needs to be fixed in the next rev.
107 */
108 #define IFMT 0170000 /* type of file */
109 #define IFCHR 0020000 /* character special */
110 #define IFBLK 0060000 /* block special */
111 #define IFSOCK 0140000 /* socket */
112
113 u_longlong_t nfs2_srv_caller_id;
114
115 static nfs_srv_t *
116 nfs_get_srv(void)
117 {
118 nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
119 nfs_srv_t *srv = ng->nfs_srv;
120 ASSERT(srv != NULL);
121 return (srv);
122 }
123
124 /*
125 * Get file attributes.
126 * Returns the current attributes of the file with the given fhandle.
127 */
128 /* ARGSUSED */
129 void
130 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
131 struct svc_req *req, cred_t *cr, bool_t ro)
132 {
133 int error;
134 vnode_t *vp;
135 struct vattr va;
136
137 vp = nfs_fhtovp(fhp, exi);
138 if (vp == NULL) {
139 ns->ns_status = NFSERR_STALE;
140 return;
141 }
142
143 /*
144 * Do the getattr.
145 */
146 va.va_mask = AT_ALL; /* we want all the attributes */
147
148 error = rfs4_delegated_getattr(vp, &va, 0, cr);
149
150 /* check for overflows */
151 if (!error) {
152 /* Lie about the object type for a referral */
153 if (vn_is_nfs_reparse(vp, cr))
154 va.va_type = VLNK;
155
156 acl_perm(vp, exi, &va, cr);
157 error = vattr_to_nattr(&va, &ns->ns_attr);
158 }
159
160 VN_RELE(vp);
161
162 ns->ns_status = puterrno(error);
163 }
/* Return the filehandle from the GETATTR arguments. */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
169
170 /*
171 * Set file attributes.
172 * Sets the attributes of the file with the given fhandle. Returns
173 * the new attributes.
174 */
175 /* ARGSUSED */
176 void
177 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
178 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
179 {
180 int error;
181 int flag;
182 int in_crit = 0;
183 vnode_t *vp;
184 struct vattr va;
185 struct vattr bva;
186 struct flock64 bf;
187 caller_context_t ct;
188
189
190 vp = nfs_fhtovp(&args->saa_fh, exi);
191 if (vp == NULL) {
192 ns->ns_status = NFSERR_STALE;
193 return;
194 }
195
196 if (rdonly(ro, vp)) {
197 VN_RELE(vp);
198 ns->ns_status = NFSERR_ROFS;
199 return;
200 }
201
202 error = sattr_to_vattr(&args->saa_sa, &va);
203 if (error) {
204 VN_RELE(vp);
205 ns->ns_status = puterrno(error);
206 return;
207 }
208
209 /*
210 * If the client is requesting a change to the mtime,
211 * but the nanosecond field is set to 1 billion, then
212 * this is a flag to the server that it should set the
213 * atime and mtime fields to the server's current time.
214 * The 1 billion number actually came from the client
215 * as 1 million, but the units in the over the wire
216 * request are microseconds instead of nanoseconds.
217 *
218 * This is an overload of the protocol and should be
219 * documented in the NFS Version 2 protocol specification.
220 */
221 if (va.va_mask & AT_MTIME) {
222 if (va.va_mtime.tv_nsec == 1000000000) {
223 gethrestime(&va.va_mtime);
224 va.va_atime = va.va_mtime;
225 va.va_mask |= AT_ATIME;
226 flag = 0;
227 } else
228 flag = ATTR_UTIME;
229 } else
230 flag = 0;
231
232 /*
233 * If the filesystem is exported with nosuid, then mask off
234 * the setuid and setgid bits.
235 */
236 if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
237 (exi->exi_export.ex_flags & EX_NOSUID))
238 va.va_mode &= ~(VSUID | VSGID);
239
240 ct.cc_sysid = 0;
241 ct.cc_pid = 0;
242 ct.cc_caller_id = nfs2_srv_caller_id;
243 ct.cc_flags = CC_DONTBLOCK;
244
245 /*
246 * We need to specially handle size changes because it is
247 * possible for the client to create a file with modes
248 * which indicate read-only, but with the file opened for
249 * writing. If the client then tries to set the size of
250 * the file, then the normal access checking done in
251 * VOP_SETATTR would prevent the client from doing so,
252 * although it should be legal for it to do so. To get
253 * around this, we do the access checking for ourselves
254 * and then use VOP_SPACE which doesn't do the access
255 * checking which VOP_SETATTR does. VOP_SPACE can only
256 * operate on VREG files, let VOP_SETATTR handle the other
257 * extremely rare cases.
258 * Also the client should not be allowed to change the
259 * size of the file if there is a conflicting non-blocking
260 * mandatory lock in the region of change.
261 */
262 if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
263 if (nbl_need_check(vp)) {
264 nbl_start_crit(vp, RW_READER);
265 in_crit = 1;
266 }
267
268 bva.va_mask = AT_UID | AT_SIZE;
269
270 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
271
272 if (error) {
273 if (in_crit)
274 nbl_end_crit(vp);
275 VN_RELE(vp);
276 ns->ns_status = puterrno(error);
277 return;
278 }
279
280 if (in_crit) {
281 u_offset_t offset;
282 ssize_t length;
283
284 if (va.va_size < bva.va_size) {
285 offset = va.va_size;
286 length = bva.va_size - va.va_size;
287 } else {
288 offset = bva.va_size;
289 length = va.va_size - bva.va_size;
290 }
291 if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
292 NULL)) {
293 error = EACCES;
294 }
295 }
296
297 if (crgetuid(cr) == bva.va_uid && !error &&
298 va.va_size != bva.va_size) {
299 va.va_mask &= ~AT_SIZE;
300 bf.l_type = F_WRLCK;
301 bf.l_whence = 0;
302 bf.l_start = (off64_t)va.va_size;
303 bf.l_len = 0;
304 bf.l_sysid = 0;
305 bf.l_pid = 0;
306
307 error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
308 (offset_t)va.va_size, cr, &ct);
309 }
310 if (in_crit)
311 nbl_end_crit(vp);
312 } else
313 error = 0;
314
315 /*
316 * Do the setattr.
317 */
318 if (!error && va.va_mask) {
319 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
320 }
321
322 /*
323 * check if the monitor on either vop_space or vop_setattr detected
324 * a delegation conflict and if so, mark the thread flag as
325 * wouldblock so that the response is dropped and the client will
326 * try again.
327 */
328 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
329 VN_RELE(vp);
330 curthread->t_flag |= T_WOULDBLOCK;
331 return;
332 }
333
334 if (!error) {
335 va.va_mask = AT_ALL; /* get everything */
336
337 error = rfs4_delegated_getattr(vp, &va, 0, cr);
338
339 /* check for overflows */
340 if (!error) {
341 acl_perm(vp, exi, &va, cr);
342 error = vattr_to_nattr(&va, &ns->ns_attr);
343 }
344 }
345
346 ct.cc_flags = 0;
347
348 /*
349 * Force modified metadata out to stable storage.
350 */
351 (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
352
353 VN_RELE(vp);
354
355 ns->ns_status = puterrno(error);
356 }
/* Return the filehandle from the SETATTR arguments. */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
362
/*
 * Cross a mount point downward: if the covered vnode *vpp has a
 * filesystem mounted on it, and the mounted filesystem is exported
 * with "nohide", replace *vpp/*exip with the root vnode and export
 * of the mounted filesystem.
 *
 * Change and release @exip and @vpp only in success.
 * Returns 0 when the mount was not crossed as well (not exported or
 * no "nohide") — that case is not an error.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Extra hold for traverse(); released on every exit path. */
	VN_HOLD(vp);

	/* traverse() swaps vp for the root of the mounted filesystem. */
	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	/* Is the submount's root itself exported, and with "nohide"? */
	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
408
409 /*
410 * Given mounted "dvp" and "exi", go upper mountpoint
411 * with dvp/exi correction
412 * Return 0 in success
413 */
414 int
415 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
416 {
417 struct exportinfo *exi;
418 vnode_t *dvp = *dvpp;
419 vnode_t *zone_rootvp;
420
421 zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
422 ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));
423
424 VN_HOLD(dvp);
425 dvp = untraverse(dvp, zone_rootvp);
426 exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
427 if (exi == NULL) {
428 VN_RELE(dvp);
429 return (-1);
430 }
431
432 ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
433 exi_rele(*exip);
434 *exip = exi;
435 VN_RELE(*dvpp);
436 *dvpp = dvp;
437
438 return (0);
439 }
440 /*
441 * Directory lookup.
442 * Returns an fhandle and file attributes for file name in a directory.
443 */
444 /* ARGSUSED */
445 void
446 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
447 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
448 {
449 int error;
450 vnode_t *dvp;
451 vnode_t *vp;
452 struct vattr va;
453 fhandle_t *fhp = da->da_fhandle;
454 struct sec_ol sec = {0, 0};
455 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
456 char *name;
457 struct sockaddr *ca;
458
459 /*
460 * Trusted Extension doesn't support NFSv2. MOUNT
461 * will reject v2 clients. Need to prevent v2 client
462 * access via WebNFS here.
463 */
464 if (is_system_labeled() && req->rq_vers == 2) {
465 dr->dr_status = NFSERR_ACCES;
466 return;
467 }
468
469 /*
470 * Disallow NULL paths
471 */
472 if (da->da_name == NULL || *da->da_name == '\0') {
473 dr->dr_status = NFSERR_ACCES;
474 return;
475 }
476
477 /*
478 * Allow lookups from the root - the default
479 * location of the public filehandle.
480 */
481 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
482 dvp = ZONE_ROOTVP();
483 VN_HOLD(dvp);
484 } else {
485 dvp = nfs_fhtovp(fhp, exi);
486 if (dvp == NULL) {
487 dr->dr_status = NFSERR_STALE;
488 return;
489 }
490 }
491
492 exi_hold(exi);
493 ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);
494
495 /*
496 * Not allow lookup beyond root.
497 * If the filehandle matches a filehandle of the exi,
498 * then the ".." refers beyond the root of an exported filesystem.
499 */
500 if (strcmp(da->da_name, "..") == 0 &&
501 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
502 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
503 ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
504 /*
505 * special case for ".." and 'nohide'exported root
506 */
507 if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
508 error = NFSERR_ACCES;
509 goto out;
510 }
511 } else {
512 error = NFSERR_NOENT;
513 goto out;
514 }
515 }
516
517 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
518 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
519 MAXPATHLEN);
520
521 if (name == NULL) {
522 error = NFSERR_ACCES;
523 goto out;
524 }
525
526 /*
527 * If the public filehandle is used then allow
528 * a multi-component lookup, i.e. evaluate
529 * a pathname and follow symbolic links if
530 * necessary.
531 *
532 * This may result in a vnode in another filesystem
533 * which is OK as long as the filesystem is exported.
534 */
535 if (PUBLIC_FH2(fhp)) {
536 publicfh_flag = TRUE;
537
538 exi_rele(exi);
539 exi = NULL;
540
541 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
542 &sec);
543 } else {
544 /*
545 * Do a normal single component lookup.
546 */
547 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
548 NULL, NULL, NULL);
549 }
550
551 if (name != da->da_name)
552 kmem_free(name, MAXPATHLEN);
553
554 if (error == 0 && vn_ismntpt(vp)) {
555 error = rfs_cross_mnt(&vp, &exi);
556 if (error)
557 VN_RELE(vp);
558 }
559
560 if (!error) {
561 va.va_mask = AT_ALL; /* we want everything */
562
563 error = rfs4_delegated_getattr(vp, &va, 0, cr);
564
565 /* check for overflows */
566 if (!error) {
567 acl_perm(vp, exi, &va, cr);
568 error = vattr_to_nattr(&va, &dr->dr_attr);
569 if (!error) {
570 if (sec.sec_flags & SEC_QUERY)
571 error = makefh_ol(&dr->dr_fhandle, exi,
572 sec.sec_index);
573 else {
574 error = makefh(&dr->dr_fhandle, vp,
575 exi);
576 if (!error && publicfh_flag &&
577 !chk_clnt_sec(exi, req))
578 auth_weak = TRUE;
579 }
580 }
581 }
582 VN_RELE(vp);
583 }
584
585 out:
586 VN_RELE(dvp);
587
588 if (exi != NULL)
589 exi_rele(exi);
590
591 /*
592 * If it's public fh, no 0x81, and client's flavor is
593 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
594 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
595 */
596 if (auth_weak)
597 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
598 else
599 dr->dr_status = puterrno(error);
600 }
/* Return the directory filehandle from the LOOKUP arguments. */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
606
607 /*
608 * Read symbolic link.
609 * Returns the string in the symbolic link at the given fhandle.
610 */
611 /* ARGSUSED */
612 void
613 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
614 struct svc_req *req, cred_t *cr, bool_t ro)
615 {
616 int error;
617 struct iovec iov;
618 struct uio uio;
619 vnode_t *vp;
620 struct vattr va;
621 struct sockaddr *ca;
622 char *name = NULL;
623 int is_referral = 0;
624
625 vp = nfs_fhtovp(fhp, exi);
626 if (vp == NULL) {
627 rl->rl_data = NULL;
628 rl->rl_status = NFSERR_STALE;
629 return;
630 }
631
632 va.va_mask = AT_MODE;
633
634 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
635
636 if (error) {
637 VN_RELE(vp);
638 rl->rl_data = NULL;
639 rl->rl_status = puterrno(error);
640 return;
641 }
642
643 if (MANDLOCK(vp, va.va_mode)) {
644 VN_RELE(vp);
645 rl->rl_data = NULL;
646 rl->rl_status = NFSERR_ACCES;
647 return;
648 }
649
650 /* We lied about the object type for a referral */
651 if (vn_is_nfs_reparse(vp, cr))
652 is_referral = 1;
653
654 /*
655 * XNFS and RFC1094 require us to return ENXIO if argument
656 * is not a link. BUGID 1138002.
657 */
658 if (vp->v_type != VLNK && !is_referral) {
659 VN_RELE(vp);
660 rl->rl_data = NULL;
661 rl->rl_status = NFSERR_NXIO;
662 return;
663 }
664
665 /*
666 * Allocate data for pathname. This will be freed by rfs_rlfree.
667 */
668 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
669
670 if (is_referral) {
671 char *s;
672 size_t strsz;
673
674 /* Get an artificial symlink based on a referral */
675 s = build_symlink(vp, cr, &strsz);
676 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
677 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
678 vnode_t *, vp, char *, s);
679 if (s == NULL)
680 error = EINVAL;
681 else {
682 error = 0;
683 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
684 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
685 kmem_free(s, strsz);
686 }
687
688 } else {
689
690 /*
691 * Set up io vector to read sym link data
692 */
693 iov.iov_base = rl->rl_data;
694 iov.iov_len = NFS_MAXPATHLEN;
695 uio.uio_iov = &iov;
696 uio.uio_iovcnt = 1;
697 uio.uio_segflg = UIO_SYSSPACE;
698 uio.uio_extflg = UIO_COPY_CACHED;
699 uio.uio_loffset = (offset_t)0;
700 uio.uio_resid = NFS_MAXPATHLEN;
701
702 /*
703 * Do the readlink.
704 */
705 error = VOP_READLINK(vp, &uio, cr, NULL);
706
707 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
708
709 if (!error)
710 rl->rl_data[rl->rl_count] = '\0';
711
712 }
713
714
715 VN_RELE(vp);
716
717 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
718 name = nfscmd_convname(ca, exi, rl->rl_data,
719 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
720
721 if (name != NULL && name != rl->rl_data) {
722 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
723 rl->rl_data = name;
724 }
725
726 /*
727 * XNFS and RFC1094 require us to return ENXIO if argument
728 * is not a link. UFS returns EINVAL if this is the case,
729 * so we do the mapping here. BUGID 1138002.
730 */
731 if (error == EINVAL)
732 rl->rl_status = NFSERR_NXIO;
733 else
734 rl->rl_status = puterrno(error);
735
736 }
/* Return the filehandle argument of the READLINK request. */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
742 /*
743 * Free data allocated by rfs_readlink
744 */
745 void
746 rfs_rlfree(struct nfsrdlnres *rl)
747 {
748 if (rl->rl_data != NULL)
749 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
750 }
751
752 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
753
754 /*
755 * Read data.
756 * Returns some data read from the file at the given fhandle.
757 */
758 /* ARGSUSED */
759 void
760 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
761 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
762 {
763 vnode_t *vp;
764 int error;
765 struct vattr va;
766 struct iovec iov;
767 struct uio uio;
768 mblk_t *mp;
769 int alloc_err = 0;
770 int in_crit = 0;
771 caller_context_t ct;
772
773 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
774 if (vp == NULL) {
775 rr->rr_data = NULL;
776 rr->rr_status = NFSERR_STALE;
777 return;
778 }
779
780 if (vp->v_type != VREG) {
781 VN_RELE(vp);
782 rr->rr_data = NULL;
783 rr->rr_status = NFSERR_ISDIR;
784 return;
785 }
786
787 ct.cc_sysid = 0;
788 ct.cc_pid = 0;
789 ct.cc_caller_id = nfs2_srv_caller_id;
790 ct.cc_flags = CC_DONTBLOCK;
791
792 /*
793 * Enter the critical region before calling VOP_RWLOCK
794 * to avoid a deadlock with write requests.
795 */
796 if (nbl_need_check(vp)) {
797 nbl_start_crit(vp, RW_READER);
798 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
799 0, NULL)) {
800 nbl_end_crit(vp);
801 VN_RELE(vp);
802 rr->rr_data = NULL;
803 rr->rr_status = NFSERR_ACCES;
804 return;
805 }
806 in_crit = 1;
807 }
808
809 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
810
811 /* check if a monitor detected a delegation conflict */
812 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
813 if (in_crit)
814 nbl_end_crit(vp);
815 VN_RELE(vp);
816 /* mark as wouldblock so response is dropped */
817 curthread->t_flag |= T_WOULDBLOCK;
818
819 rr->rr_data = NULL;
820 return;
821 }
822
823 va.va_mask = AT_ALL;
824
825 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
826
827 if (error) {
828 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
829 if (in_crit)
830 nbl_end_crit(vp);
831
832 VN_RELE(vp);
833 rr->rr_data = NULL;
834 rr->rr_status = puterrno(error);
835
836 return;
837 }
838
839 /*
840 * This is a kludge to allow reading of files created
841 * with no read permission. The owner of the file
842 * is always allowed to read it.
843 */
844 if (crgetuid(cr) != va.va_uid) {
845 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
846
847 if (error) {
848 /*
849 * Exec is the same as read over the net because
850 * of demand loading.
851 */
852 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
853 }
854 if (error) {
855 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
856 if (in_crit)
857 nbl_end_crit(vp);
858 VN_RELE(vp);
859 rr->rr_data = NULL;
860 rr->rr_status = puterrno(error);
861
862 return;
863 }
864 }
865
866 if (MANDLOCK(vp, va.va_mode)) {
867 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
868 if (in_crit)
869 nbl_end_crit(vp);
870
871 VN_RELE(vp);
872 rr->rr_data = NULL;
873 rr->rr_status = NFSERR_ACCES;
874
875 return;
876 }
877
878 rr->rr_ok.rrok_wlist_len = 0;
879 rr->rr_ok.rrok_wlist = NULL;
880
881 if ((u_offset_t)ra->ra_offset >= va.va_size) {
882 rr->rr_count = 0;
883 rr->rr_data = NULL;
884 /*
885 * In this case, status is NFS_OK, but there is no data
886 * to encode. So set rr_mp to NULL.
887 */
888 rr->rr_mp = NULL;
889 rr->rr_ok.rrok_wlist = ra->ra_wlist;
890 if (rr->rr_ok.rrok_wlist)
891 clist_zero_len(rr->rr_ok.rrok_wlist);
892 goto done;
893 }
894
895 if (ra->ra_wlist) {
896 mp = NULL;
897 rr->rr_mp = NULL;
898 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
899 if (ra->ra_count > iov.iov_len) {
900 rr->rr_data = NULL;
901 rr->rr_status = NFSERR_INVAL;
902 goto done;
903 }
904 } else {
905 /*
906 * mp will contain the data to be sent out in the read reply.
907 * This will be freed after the reply has been sent out (by the
908 * driver).
909 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
910 * that the call to xdrmblk_putmblk() never fails.
911 */
912 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
913 &alloc_err);
914 ASSERT(mp != NULL);
915 ASSERT(alloc_err == 0);
916
917 rr->rr_mp = mp;
918
919 /*
920 * Set up io vector
921 */
922 iov.iov_base = (caddr_t)mp->b_datap->db_base;
923 iov.iov_len = ra->ra_count;
924 }
925
926 uio.uio_iov = &iov;
927 uio.uio_iovcnt = 1;
928 uio.uio_segflg = UIO_SYSSPACE;
929 uio.uio_extflg = UIO_COPY_CACHED;
930 uio.uio_loffset = (offset_t)ra->ra_offset;
931 uio.uio_resid = ra->ra_count;
932
933 error = VOP_READ(vp, &uio, 0, cr, &ct);
934
935 if (error) {
936 if (mp)
937 freeb(mp);
938
939 /*
940 * check if a monitor detected a delegation conflict and
941 * mark as wouldblock so response is dropped
942 */
943 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
944 curthread->t_flag |= T_WOULDBLOCK;
945 else
946 rr->rr_status = puterrno(error);
947
948 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
949 if (in_crit)
950 nbl_end_crit(vp);
951
952 VN_RELE(vp);
953 rr->rr_data = NULL;
954
955 return;
956 }
957
958 /*
959 * Get attributes again so we can send the latest access
960 * time to the client side for its cache.
961 */
962 va.va_mask = AT_ALL;
963
964 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
965
966 if (error) {
967 if (mp)
968 freeb(mp);
969
970 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
971 if (in_crit)
972 nbl_end_crit(vp);
973
974 VN_RELE(vp);
975 rr->rr_data = NULL;
976 rr->rr_status = puterrno(error);
977
978 return;
979 }
980
981 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
982
983 if (mp) {
984 rr->rr_data = (char *)mp->b_datap->db_base;
985 } else {
986 if (ra->ra_wlist) {
987 rr->rr_data = (caddr_t)iov.iov_base;
988 if (!rdma_setup_read_data2(ra, rr)) {
989 rr->rr_data = NULL;
990 rr->rr_status = puterrno(NFSERR_INVAL);
991 }
992 }
993 }
994 done:
995 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
996 if (in_crit)
997 nbl_end_crit(vp);
998
999 acl_perm(vp, exi, &va, cr);
1000
1001 /* check for overflows */
1002 error = vattr_to_nattr(&va, &rr->rr_attr);
1003
1004 VN_RELE(vp);
1005
1006 rr->rr_status = puterrno(error);
1007 }
1008
1009 /*
1010 * Free data allocated by rfs_read
1011 */
1012 void
1013 rfs_rdfree(struct nfsrdresult *rr)
1014 {
1015 mblk_t *mp;
1016
1017 if (rr->rr_status == NFS_OK) {
1018 mp = rr->rr_mp;
1019 if (mp != NULL)
1020 freeb(mp);
1021 }
1022 }
1023
/* Return the filehandle from the READ arguments. */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
1029
/* Iovecs held on the stack in rfs_write_sync(); larger chains kmem_alloc */
#define	MAX_IOVECS	12

#ifdef DEBUG
/* How often the mblk chain fit in iov[] (hit) vs. needed kmem_alloc (miss) */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1036
1037 /*
1038 * Write data to file.
1039 * Returns attributes of a file after writing some data to it.
1040 *
1041 * Any changes made here, especially in error handling might have
1042 * to also be done in rfs_write (which clusters write requests).
1043 */
1044 /* ARGSUSED */
1045 void
1046 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1047 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1048 {
1049 int error;
1050 vnode_t *vp;
1051 rlim64_t rlimit;
1052 struct vattr va;
1053 struct uio uio;
1054 struct iovec iov[MAX_IOVECS];
1055 mblk_t *m;
1056 struct iovec *iovp;
1057 int iovcnt;
1058 cred_t *savecred;
1059 int in_crit = 0;
1060 caller_context_t ct;
1061
1062 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1063 if (vp == NULL) {
1064 ns->ns_status = NFSERR_STALE;
1065 return;
1066 }
1067
1068 if (rdonly(ro, vp)) {
1069 VN_RELE(vp);
1070 ns->ns_status = NFSERR_ROFS;
1071 return;
1072 }
1073
1074 if (vp->v_type != VREG) {
1075 VN_RELE(vp);
1076 ns->ns_status = NFSERR_ISDIR;
1077 return;
1078 }
1079
1080 ct.cc_sysid = 0;
1081 ct.cc_pid = 0;
1082 ct.cc_caller_id = nfs2_srv_caller_id;
1083 ct.cc_flags = CC_DONTBLOCK;
1084
1085 va.va_mask = AT_UID|AT_MODE;
1086
1087 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1088
1089 if (error) {
1090 VN_RELE(vp);
1091 ns->ns_status = puterrno(error);
1092
1093 return;
1094 }
1095
1096 if (crgetuid(cr) != va.va_uid) {
1097 /*
1098 * This is a kludge to allow writes of files created
1099 * with read only permission. The owner of the file
1100 * is always allowed to write it.
1101 */
1102 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1103
1104 if (error) {
1105 VN_RELE(vp);
1106 ns->ns_status = puterrno(error);
1107 return;
1108 }
1109 }
1110
1111 /*
1112 * Can't access a mandatory lock file. This might cause
1113 * the NFS service thread to block forever waiting for a
1114 * lock to be released that will never be released.
1115 */
1116 if (MANDLOCK(vp, va.va_mode)) {
1117 VN_RELE(vp);
1118 ns->ns_status = NFSERR_ACCES;
1119 return;
1120 }
1121
1122 /*
1123 * We have to enter the critical region before calling VOP_RWLOCK
1124 * to avoid a deadlock with ufs.
1125 */
1126 if (nbl_need_check(vp)) {
1127 nbl_start_crit(vp, RW_READER);
1128 in_crit = 1;
1129 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1130 wa->wa_count, 0, NULL)) {
1131 error = EACCES;
1132 goto out;
1133 }
1134 }
1135
1136 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1137
1138 /* check if a monitor detected a delegation conflict */
1139 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1140 goto out;
1141 }
1142
1143 if (wa->wa_data || wa->wa_rlist) {
1144 /* Do the RDMA thing if necessary */
1145 if (wa->wa_rlist) {
1146 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1147 iov[0].iov_len = wa->wa_count;
1148 } else {
1149 iov[0].iov_base = wa->wa_data;
1150 iov[0].iov_len = wa->wa_count;
1151 }
1152 uio.uio_iov = iov;
1153 uio.uio_iovcnt = 1;
1154 uio.uio_segflg = UIO_SYSSPACE;
1155 uio.uio_extflg = UIO_COPY_DEFAULT;
1156 uio.uio_loffset = (offset_t)wa->wa_offset;
1157 uio.uio_resid = wa->wa_count;
1158 /*
1159 * The limit is checked on the client. We
1160 * should allow any size writes here.
1161 */
1162 uio.uio_llimit = curproc->p_fsz_ctl;
1163 rlimit = uio.uio_llimit - wa->wa_offset;
1164 if (rlimit < (rlim64_t)uio.uio_resid)
1165 uio.uio_resid = (uint_t)rlimit;
1166
1167 /*
1168 * for now we assume no append mode
1169 */
1170 /*
1171 * We're changing creds because VM may fault and we need
1172 * the cred of the current thread to be used if quota
1173 * checking is enabled.
1174 */
1175 savecred = curthread->t_cred;
1176 curthread->t_cred = cr;
1177 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1178 curthread->t_cred = savecred;
1179 } else {
1180
1181 iovcnt = 0;
1182 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1183 iovcnt++;
1184 if (iovcnt <= MAX_IOVECS) {
1185 #ifdef DEBUG
1186 rfs_write_sync_hits++;
1187 #endif
1188 iovp = iov;
1189 } else {
1190 #ifdef DEBUG
1191 rfs_write_sync_misses++;
1192 #endif
1193 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1194 }
1195 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1196 uio.uio_iov = iovp;
1197 uio.uio_iovcnt = iovcnt;
1198 uio.uio_segflg = UIO_SYSSPACE;
1199 uio.uio_extflg = UIO_COPY_DEFAULT;
1200 uio.uio_loffset = (offset_t)wa->wa_offset;
1201 uio.uio_resid = wa->wa_count;
1202 /*
1203 * The limit is checked on the client. We
1204 * should allow any size writes here.
1205 */
1206 uio.uio_llimit = curproc->p_fsz_ctl;
1207 rlimit = uio.uio_llimit - wa->wa_offset;
1208 if (rlimit < (rlim64_t)uio.uio_resid)
1209 uio.uio_resid = (uint_t)rlimit;
1210
1211 /*
1212 * For now we assume no append mode.
1213 */
1214 /*
1215 * We're changing creds because VM may fault and we need
1216 * the cred of the current thread to be used if quota
1217 * checking is enabled.
1218 */
1219 savecred = curthread->t_cred;
1220 curthread->t_cred = cr;
1221 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1222 curthread->t_cred = savecred;
1223
1224 if (iovp != iov)
1225 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1226 }
1227
1228 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1229
1230 if (!error) {
1231 /*
1232 * Get attributes again so we send the latest mod
1233 * time to the client side for its cache.
1234 */
1235 va.va_mask = AT_ALL; /* now we want everything */
1236
1237 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1238
1239 /* check for overflows */
1240 if (!error) {
1241 acl_perm(vp, exi, &va, cr);
1242 error = vattr_to_nattr(&va, &ns->ns_attr);
1243 }
1244 }
1245
1246 out:
1247 if (in_crit)
1248 nbl_end_crit(vp);
1249 VN_RELE(vp);
1250
1251 /* check if a monitor detected a delegation conflict */
1252 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1253 /* mark as wouldblock so response is dropped */
1254 curthread->t_flag |= T_WOULDBLOCK;
1255 else
1256 ns->ns_status = puterrno(error);
1257
1258 }
1259
/* One queued NFSv2 WRITE awaiting clustered processing by rfs_write(). */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* request arguments */
	struct nfsattrstat *ns;		/* response to fill in */
	struct svc_req *req;
	cred_t *cr;
	bool_t ro;			/* export mounted read-only */
	kthread_t *thread;		/* thread that queued this request */
	struct rfs_async_write *list;	/* next entry, kept sorted by offset */
};

/* Per-file cluster of pending writes, keyed by filehandle. */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* filehandle shared by the cluster */
	kcondvar_t cv;
	struct rfs_async_write *list;	/* requests in this cluster */
	struct rfs_async_write_list *next;	/* next file's cluster */
};
1276
/*
 * NOTE(review): these file-scope globals parallel the per-zone fields in
 * nfs_srv_t above (async_write_head, async_write_lock, write_async), and
 * rfs_write() below uses the per-zone copies via nfs_get_srv() — confirm
 * whether these globals are still referenced anywhere, or are leftovers
 * from before the state was made per-zone.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs a write cluster may use; see iov[] in rfs_write(). */
#define	MAXCLIOVECS	42
/* "not yet processed" sentinel; 0 would read as NFS_OK. */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1288
1289 /*
1290 * Write data to file.
1291 * Returns attributes of a file after writing some data to it.
1292 */
1293 void
1294 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1295 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1296 {
1297 int error;
1298 vnode_t *vp;
1299 rlim64_t rlimit;
1300 struct vattr va;
1301 struct uio uio;
1302 struct rfs_async_write_list *lp;
1303 struct rfs_async_write_list *nlp;
1304 struct rfs_async_write *rp;
1305 struct rfs_async_write *nrp;
1306 struct rfs_async_write *trp;
1307 struct rfs_async_write *lrp;
1308 int data_written;
1309 int iovcnt;
1310 mblk_t *m;
1311 struct iovec *iovp;
1312 struct iovec *niovp;
1313 struct iovec iov[MAXCLIOVECS];
1314 int count;
1315 int rcount;
1316 uint_t off;
1317 uint_t len;
1318 struct rfs_async_write nrpsp;
1319 struct rfs_async_write_list nlpsp;
1320 ushort_t t_flag;
1321 cred_t *savecred;
1322 int in_crit = 0;
1323 caller_context_t ct;
1324 nfs_srv_t *nsrv;
1325
1326 ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
1327 nsrv = nfs_get_srv();
1328 if (!nsrv->write_async) {
1329 rfs_write_sync(wa, ns, exi, req, cr, ro);
1330 return;
1331 }
1332
1333 /*
1334 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1335 * is considered an OK.
1336 */
1337 ns->ns_status = RFSWRITE_INITVAL;
1338
1339 nrp = &nrpsp;
1340 nrp->wa = wa;
1341 nrp->ns = ns;
1342 nrp->req = req;
1343 nrp->cr = cr;
1344 nrp->ro = ro;
1345 nrp->thread = curthread;
1346
1347 ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1348
1349 /*
1350 * Look to see if there is already a cluster started
1351 * for this file.
1352 */
1353 mutex_enter(&nsrv->async_write_lock);
1354 for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1355 if (bcmp(&wa->wa_fhandle, lp->fhp,
1356 sizeof (fhandle_t)) == 0)
1357 break;
1358 }
1359
1360 /*
1361 * If lp is non-NULL, then there is already a cluster
1362 * started. We need to place ourselves in the cluster
1363 * list in the right place as determined by starting
1364 * offset. Conflicts with non-blocking mandatory locked
1365 * regions will be checked when the cluster is processed.
1366 */
1367 if (lp != NULL) {
1368 rp = lp->list;
1369 trp = NULL;
1370 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1371 trp = rp;
1372 rp = rp->list;
1373 }
1374 nrp->list = rp;
1375 if (trp == NULL)
1376 lp->list = nrp;
1377 else
1378 trp->list = nrp;
1379 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1380 cv_wait(&lp->cv, &nsrv->async_write_lock);
1381 mutex_exit(&nsrv->async_write_lock);
1382
1383 return;
1384 }
1385
1386 /*
1387 * No cluster started yet, start one and add ourselves
1388 * to the list of clusters.
1389 */
1390 nrp->list = NULL;
1391
1392 nlp = &nlpsp;
1393 nlp->fhp = &wa->wa_fhandle;
1394 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1395 nlp->list = nrp;
1396 nlp->next = NULL;
1397
1398 if (nsrv->async_write_head == NULL) {
1399 nsrv->async_write_head = nlp;
1400 } else {
1401 lp = nsrv->async_write_head;
1402 while (lp->next != NULL)
1403 lp = lp->next;
1404 lp->next = nlp;
1405 }
1406 mutex_exit(&nsrv->async_write_lock);
1407
1408 /*
1409 * Convert the file handle common to all of the requests
1410 * in this cluster to a vnode.
1411 */
1412 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1413 if (vp == NULL) {
1414 mutex_enter(&nsrv->async_write_lock);
1415 if (nsrv->async_write_head == nlp)
1416 nsrv->async_write_head = nlp->next;
1417 else {
1418 lp = nsrv->async_write_head;
1419 while (lp->next != nlp)
1420 lp = lp->next;
1421 lp->next = nlp->next;
1422 }
1423 t_flag = curthread->t_flag & T_WOULDBLOCK;
1424 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1425 rp->ns->ns_status = NFSERR_STALE;
1426 rp->thread->t_flag |= t_flag;
1427 }
1428 cv_broadcast(&nlp->cv);
1429 mutex_exit(&nsrv->async_write_lock);
1430
1431 return;
1432 }
1433
1434 /*
1435 * Can only write regular files. Attempts to write any
1436 * other file types fail with EISDIR.
1437 */
1438 if (vp->v_type != VREG) {
1439 VN_RELE(vp);
1440 mutex_enter(&nsrv->async_write_lock);
1441 if (nsrv->async_write_head == nlp)
1442 nsrv->async_write_head = nlp->next;
1443 else {
1444 lp = nsrv->async_write_head;
1445 while (lp->next != nlp)
1446 lp = lp->next;
1447 lp->next = nlp->next;
1448 }
1449 t_flag = curthread->t_flag & T_WOULDBLOCK;
1450 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1451 rp->ns->ns_status = NFSERR_ISDIR;
1452 rp->thread->t_flag |= t_flag;
1453 }
1454 cv_broadcast(&nlp->cv);
1455 mutex_exit(&nsrv->async_write_lock);
1456
1457 return;
1458 }
1459
1460 /*
1461 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1462 * deadlock with ufs.
1463 */
1464 if (nbl_need_check(vp)) {
1465 nbl_start_crit(vp, RW_READER);
1466 in_crit = 1;
1467 }
1468
1469 ct.cc_sysid = 0;
1470 ct.cc_pid = 0;
1471 ct.cc_caller_id = nfs2_srv_caller_id;
1472 ct.cc_flags = CC_DONTBLOCK;
1473
1474 /*
1475 * Lock the file for writing. This operation provides
1476 * the delay which allows clusters to grow.
1477 */
1478 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1479
1480 /* check if a monitor detected a delegation conflict */
1481 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1482 if (in_crit)
1483 nbl_end_crit(vp);
1484 VN_RELE(vp);
1485 /* mark as wouldblock so response is dropped */
1486 curthread->t_flag |= T_WOULDBLOCK;
1487 mutex_enter(&nsrv->async_write_lock);
1488 if (nsrv->async_write_head == nlp)
1489 nsrv->async_write_head = nlp->next;
1490 else {
1491 lp = nsrv->async_write_head;
1492 while (lp->next != nlp)
1493 lp = lp->next;
1494 lp->next = nlp->next;
1495 }
1496 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1497 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1498 rp->ns->ns_status = puterrno(error);
1499 rp->thread->t_flag |= T_WOULDBLOCK;
1500 }
1501 }
1502 cv_broadcast(&nlp->cv);
1503 mutex_exit(&nsrv->async_write_lock);
1504
1505 return;
1506 }
1507
1508 /*
1509 * Disconnect this cluster from the list of clusters.
1510 * The cluster that is being dealt with must be fixed
1511 * in size after this point, so there is no reason
1512 * to leave it on the list so that new requests can
1513 * find it.
1514 *
1515 * The algorithm is that the first write request will
1516 * create a cluster, convert the file handle to a
1517 * vnode pointer, and then lock the file for writing.
1518 * This request is not likely to be clustered with
1519 * any others. However, the next request will create
1520 * a new cluster and be blocked in VOP_RWLOCK while
1521 * the first request is being processed. This delay
1522 * will allow more requests to be clustered in this
1523 * second cluster.
1524 */
1525 mutex_enter(&nsrv->async_write_lock);
1526 if (nsrv->async_write_head == nlp)
1527 nsrv->async_write_head = nlp->next;
1528 else {
1529 lp = nsrv->async_write_head;
1530 while (lp->next != nlp)
1531 lp = lp->next;
1532 lp->next = nlp->next;
1533 }
1534 mutex_exit(&nsrv->async_write_lock);
1535
1536 /*
1537 * Step through the list of requests in this cluster.
1538 * We need to check permissions to make sure that all
1539 * of the requests have sufficient permission to write
1540 * the file. A cluster can be composed of requests
1541 * from different clients and different users on each
1542 * client.
1543 *
1544 * As a side effect, we also calculate the size of the
1545 * byte range that this cluster encompasses.
1546 */
1547 rp = nlp->list;
1548 off = rp->wa->wa_offset;
1549 len = (uint_t)0;
1550 do {
1551 if (rdonly(rp->ro, vp)) {
1552 rp->ns->ns_status = NFSERR_ROFS;
1553 t_flag = curthread->t_flag & T_WOULDBLOCK;
1554 rp->thread->t_flag |= t_flag;
1555 continue;
1556 }
1557
1558 va.va_mask = AT_UID|AT_MODE;
1559
1560 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1561
1562 if (!error) {
1563 if (crgetuid(rp->cr) != va.va_uid) {
1564 /*
1565 * This is a kludge to allow writes of files
1566 * created with read only permission. The
1567 * owner of the file is always allowed to
1568 * write it.
1569 */
1570 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1571 }
1572 if (!error && MANDLOCK(vp, va.va_mode))
1573 error = EACCES;
1574 }
1575
1576 /*
1577 * Check for a conflict with a nbmand-locked region.
1578 */
1579 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1580 rp->wa->wa_count, 0, NULL)) {
1581 error = EACCES;
1582 }
1583
1584 if (error) {
1585 rp->ns->ns_status = puterrno(error);
1586 t_flag = curthread->t_flag & T_WOULDBLOCK;
1587 rp->thread->t_flag |= t_flag;
1588 continue;
1589 }
1590 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1591 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1592 } while ((rp = rp->list) != NULL);
1593
1594 /*
1595 * Step through the cluster attempting to gather as many
1596 * requests which are contiguous as possible. These
1597 * contiguous requests are handled via one call to VOP_WRITE
1598 * instead of different calls to VOP_WRITE. We also keep
1599 * track of the fact that any data was written.
1600 */
1601 rp = nlp->list;
1602 data_written = 0;
1603 do {
1604 /*
1605 * Skip any requests which are already marked as having an
1606 * error.
1607 */
1608 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1609 rp = rp->list;
1610 continue;
1611 }
1612
1613 /*
1614 * Count the number of iovec's which are required
1615 * to handle this set of requests. One iovec is
1616 * needed for each data buffer, whether addressed
1617 * by wa_data or by the b_rptr pointers in the
1618 * mblk chains.
1619 */
1620 iovcnt = 0;
1621 lrp = rp;
1622 for (;;) {
1623 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1624 iovcnt++;
1625 else {
1626 m = lrp->wa->wa_mblk;
1627 while (m != NULL) {
1628 iovcnt++;
1629 m = m->b_cont;
1630 }
1631 }
1632 if (lrp->list == NULL ||
1633 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1634 lrp->wa->wa_offset + lrp->wa->wa_count !=
1635 lrp->list->wa->wa_offset) {
1636 lrp = lrp->list;
1637 break;
1638 }
1639 lrp = lrp->list;
1640 }
1641
1642 if (iovcnt <= MAXCLIOVECS) {
1643 #ifdef DEBUG
1644 rfs_write_hits++;
1645 #endif
1646 niovp = iov;
1647 } else {
1648 #ifdef DEBUG
1649 rfs_write_misses++;
1650 #endif
1651 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1652 }
1653 /*
1654 * Put together the scatter/gather iovecs.
1655 */
1656 iovp = niovp;
1657 trp = rp;
1658 count = 0;
1659 do {
1660 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1661 if (trp->wa->wa_rlist) {
1662 iovp->iov_base =
1663 (char *)((trp->wa->wa_rlist)->
1664 u.c_daddr3);
1665 iovp->iov_len = trp->wa->wa_count;
1666 } else {
1667 iovp->iov_base = trp->wa->wa_data;
1668 iovp->iov_len = trp->wa->wa_count;
1669 }
1670 iovp++;
1671 } else {
1672 m = trp->wa->wa_mblk;
1673 rcount = trp->wa->wa_count;
1674 while (m != NULL) {
1675 iovp->iov_base = (caddr_t)m->b_rptr;
1676 iovp->iov_len = (m->b_wptr - m->b_rptr);
1677 rcount -= iovp->iov_len;
1678 if (rcount < 0)
1679 iovp->iov_len += rcount;
1680 iovp++;
1681 if (rcount <= 0)
1682 break;
1683 m = m->b_cont;
1684 }
1685 }
1686 count += trp->wa->wa_count;
1687 trp = trp->list;
1688 } while (trp != lrp);
1689
1690 uio.uio_iov = niovp;
1691 uio.uio_iovcnt = iovcnt;
1692 uio.uio_segflg = UIO_SYSSPACE;
1693 uio.uio_extflg = UIO_COPY_DEFAULT;
1694 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1695 uio.uio_resid = count;
1696 /*
1697 * The limit is checked on the client. We
1698 * should allow any size writes here.
1699 */
1700 uio.uio_llimit = curproc->p_fsz_ctl;
1701 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1702 if (rlimit < (rlim64_t)uio.uio_resid)
1703 uio.uio_resid = (uint_t)rlimit;
1704
1705 /*
1706 * For now we assume no append mode.
1707 */
1708
1709 /*
1710 * We're changing creds because VM may fault
1711 * and we need the cred of the current
1712 * thread to be used if quota * checking is
1713 * enabled.
1714 */
1715 savecred = curthread->t_cred;
1716 curthread->t_cred = cr;
1717 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1718 curthread->t_cred = savecred;
1719
1720 /* check if a monitor detected a delegation conflict */
1721 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1722 /* mark as wouldblock so response is dropped */
1723 curthread->t_flag |= T_WOULDBLOCK;
1724
1725 if (niovp != iov)
1726 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1727
1728 if (!error) {
1729 data_written = 1;
1730 /*
1731 * Get attributes again so we send the latest mod
1732 * time to the client side for its cache.
1733 */
1734 va.va_mask = AT_ALL; /* now we want everything */
1735
1736 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1737
1738 if (!error)
1739 acl_perm(vp, exi, &va, rp->cr);
1740 }
1741
1742 /*
1743 * Fill in the status responses for each request
1744 * which was just handled. Also, copy the latest
1745 * attributes in to the attribute responses if
1746 * appropriate.
1747 */
1748 t_flag = curthread->t_flag & T_WOULDBLOCK;
1749 do {
1750 rp->thread->t_flag |= t_flag;
1751 /* check for overflows */
1752 if (!error) {
1753 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1754 }
1755 rp->ns->ns_status = puterrno(error);
1756 rp = rp->list;
1757 } while (rp != lrp);
1758 } while (rp != NULL);
1759
1760 /*
1761 * If any data was written at all, then we need to flush
1762 * the data and metadata to stable storage.
1763 */
1764 if (data_written) {
1765 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1766
1767 if (!error) {
1768 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1769 }
1770 }
1771
1772 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1773
1774 if (in_crit)
1775 nbl_end_crit(vp);
1776 VN_RELE(vp);
1777
1778 t_flag = curthread->t_flag & T_WOULDBLOCK;
1779 mutex_enter(&nsrv->async_write_lock);
1780 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1781 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1782 rp->ns->ns_status = puterrno(error);
1783 rp->thread->t_flag |= t_flag;
1784 }
1785 }
1786 cv_broadcast(&nlp->cv);
1787 mutex_exit(&nsrv->async_write_lock);
1788
1789 }
1790
1791 void *
1792 rfs_write_getfh(struct nfswriteargs *wa)
1793 {
1794 return (&wa->wa_fhandle);
1795 }
1796
1797 /*
1798 * Create a file.
1799 * Creates a file with given attributes and returns those attributes
1800 * and an fhandle for the new file.
1801 */
1802 void
1803 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1804 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1805 {
1806 int error;
1807 int lookuperr;
1808 int in_crit = 0;
1809 struct vattr va;
1810 vnode_t *vp;
1811 vnode_t *realvp;
1812 vnode_t *dvp;
1813 char *name = args->ca_da.da_name;
1814 vnode_t *tvp = NULL;
1815 int mode;
1816 int lookup_ok;
1817 bool_t trunc;
1818 struct sockaddr *ca;
1819
1820 /*
1821 * Disallow NULL paths
1822 */
1823 if (name == NULL || *name == '\0') {
1824 dr->dr_status = NFSERR_ACCES;
1825 return;
1826 }
1827
1828 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1829 if (dvp == NULL) {
1830 dr->dr_status = NFSERR_STALE;
1831 return;
1832 }
1833
1834 error = sattr_to_vattr(args->ca_sa, &va);
1835 if (error) {
1836 dr->dr_status = puterrno(error);
1837 return;
1838 }
1839
1840 /*
1841 * Must specify the mode.
1842 */
1843 if (!(va.va_mask & AT_MODE)) {
1844 VN_RELE(dvp);
1845 dr->dr_status = NFSERR_INVAL;
1846 return;
1847 }
1848
1849 /*
1850 * This is a completely gross hack to make mknod
1851 * work over the wire until we can wack the protocol
1852 */
1853 if ((va.va_mode & IFMT) == IFCHR) {
1854 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1855 va.va_type = VFIFO; /* xtra kludge for named pipe */
1856 else {
1857 va.va_type = VCHR;
1858 /*
1859 * uncompress the received dev_t
1860 * if the top half is zero indicating a request
1861 * from an `older style' OS.
1862 */
1863 if ((va.va_size & 0xffff0000) == 0)
1864 va.va_rdev = nfsv2_expdev(va.va_size);
1865 else
1866 va.va_rdev = (dev_t)va.va_size;
1867 }
1868 va.va_mask &= ~AT_SIZE;
1869 } else if ((va.va_mode & IFMT) == IFBLK) {
1870 va.va_type = VBLK;
1871 /*
1872 * uncompress the received dev_t
1873 * if the top half is zero indicating a request
1874 * from an `older style' OS.
1875 */
1876 if ((va.va_size & 0xffff0000) == 0)
1877 va.va_rdev = nfsv2_expdev(va.va_size);
1878 else
1879 va.va_rdev = (dev_t)va.va_size;
1880 va.va_mask &= ~AT_SIZE;
1881 } else if ((va.va_mode & IFMT) == IFSOCK) {
1882 va.va_type = VSOCK;
1883 } else {
1884 va.va_type = VREG;
1885 }
1886 va.va_mode &= ~IFMT;
1887 va.va_mask |= AT_TYPE;
1888
1889 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1890 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1891 MAXPATHLEN);
1892 if (name == NULL) {
1893 dr->dr_status = puterrno(EINVAL);
1894 return;
1895 }
1896
1897 /*
1898 * Why was the choice made to use VWRITE as the mode to the
1899 * call to VOP_CREATE ? This results in a bug. When a client
1900 * opens a file that already exists and is RDONLY, the second
1901 * open fails with an EACESS because of the mode.
1902 * bug ID 1054648.
1903 */
1904 lookup_ok = 0;
1905 mode = VWRITE;
1906 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1907 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1908 NULL, NULL, NULL);
1909 if (!error) {
1910 struct vattr at;
1911
1912 lookup_ok = 1;
1913 at.va_mask = AT_MODE;
1914 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1915 if (!error)
1916 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1917 VN_RELE(tvp);
1918 tvp = NULL;
1919 }
1920 }
1921
1922 if (!lookup_ok) {
1923 if (rdonly(ro, dvp)) {
1924 error = EROFS;
1925 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1926 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1927 error = EPERM;
1928 } else {
1929 error = 0;
1930 }
1931 }
1932
1933 /*
1934 * If file size is being modified on an already existing file
1935 * make sure that there are no conflicting non-blocking mandatory
1936 * locks in the region being manipulated. Return EACCES if there
1937 * are conflicting locks.
1938 */
1939 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1940 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1941 NULL, NULL, NULL);
1942
1943 if (!lookuperr &&
1944 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1945 VN_RELE(tvp);
1946 curthread->t_flag |= T_WOULDBLOCK;
1947 goto out;
1948 }
1949
1950 if (!lookuperr && nbl_need_check(tvp)) {
1951 /*
1952 * The file exists. Now check if it has any
1953 * conflicting non-blocking mandatory locks
1954 * in the region being changed.
1955 */
1956 struct vattr bva;
1957 u_offset_t offset;
1958 ssize_t length;
1959
1960 nbl_start_crit(tvp, RW_READER);
1961 in_crit = 1;
1962
1963 bva.va_mask = AT_SIZE;
1964 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1965 if (!error) {
1966 if (va.va_size < bva.va_size) {
1967 offset = va.va_size;
1968 length = bva.va_size - va.va_size;
1969 } else {
1970 offset = bva.va_size;
1971 length = va.va_size - bva.va_size;
1972 }
1973 if (length) {
1974 if (nbl_conflict(tvp, NBL_WRITE,
1975 offset, length, 0, NULL)) {
1976 error = EACCES;
1977 }
1978 }
1979 }
1980 if (error) {
1981 nbl_end_crit(tvp);
1982 VN_RELE(tvp);
1983 in_crit = 0;
1984 }
1985 } else if (tvp != NULL) {
1986 VN_RELE(tvp);
1987 }
1988 }
1989
1990 if (!error) {
1991 /*
1992 * If filesystem is shared with nosuid the remove any
1993 * setuid/setgid bits on create.
1994 */
1995 if (va.va_type == VREG &&
1996 exi->exi_export.ex_flags & EX_NOSUID)
1997 va.va_mode &= ~(VSUID | VSGID);
1998
1999 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
2000 NULL, NULL);
2001
2002 if (!error) {
2003
2004 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2005 trunc = TRUE;
2006 else
2007 trunc = FALSE;
2008
2009 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2010 VN_RELE(vp);
2011 curthread->t_flag |= T_WOULDBLOCK;
2012 goto out;
2013 }
2014 va.va_mask = AT_ALL;
2015
2016 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2017
2018 /* check for overflows */
2019 if (!error) {
2020 acl_perm(vp, exi, &va, cr);
2021 error = vattr_to_nattr(&va, &dr->dr_attr);
2022 if (!error) {
2023 error = makefh(&dr->dr_fhandle, vp,
2024 exi);
2025 }
2026 }
2027 /*
2028 * Force modified metadata out to stable storage.
2029 *
2030 * if a underlying vp exists, pass it to VOP_FSYNC
2031 */
2032 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2033 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2034 else
2035 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2036 VN_RELE(vp);
2037 }
2038
2039 if (in_crit) {
2040 nbl_end_crit(tvp);
2041 VN_RELE(tvp);
2042 }
2043 }
2044
2045 /*
2046 * Force modified data and metadata out to stable storage.
2047 */
2048 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2049
2050 out:
2051
2052 VN_RELE(dvp);
2053
2054 dr->dr_status = puterrno(error);
2055
2056 if (name != args->ca_da.da_name)
2057 kmem_free(name, MAXPATHLEN);
2058 }
2059 void *
2060 rfs_create_getfh(struct nfscreatargs *args)
2061 {
2062 return (args->ca_da.da_fhandle);
2063 }
2064
2065 /*
2066 * Remove a file.
2067 * Remove named file from parent directory.
2068 */
2069 /* ARGSUSED */
2070 void
2071 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2072 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2073 {
2074 int error = 0;
2075 vnode_t *vp;
2076 vnode_t *targvp;
2077 int in_crit = 0;
2078
2079 /*
2080 * Disallow NULL paths
2081 */
2082 if (da->da_name == NULL || *da->da_name == '\0') {
2083 *status = NFSERR_ACCES;
2084 return;
2085 }
2086
2087 vp = nfs_fhtovp(da->da_fhandle, exi);
2088 if (vp == NULL) {
2089 *status = NFSERR_STALE;
2090 return;
2091 }
2092
2093 if (rdonly(ro, vp)) {
2094 VN_RELE(vp);
2095 *status = NFSERR_ROFS;
2096 return;
2097 }
2098
2099 /*
2100 * Check for a conflict with a non-blocking mandatory share reservation.
2101 */
2102 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2103 NULL, cr, NULL, NULL, NULL);
2104 if (error != 0) {
2105 VN_RELE(vp);
2106 *status = puterrno(error);
2107 return;
2108 }
2109
2110 /*
2111 * If the file is delegated to an v4 client, then initiate
2112 * recall and drop this request (by setting T_WOULDBLOCK).
2113 * The client will eventually re-transmit the request and
2114 * (hopefully), by then, the v4 client will have returned
2115 * the delegation.
2116 */
2117
2118 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2119 VN_RELE(vp);
2120 VN_RELE(targvp);
2121 curthread->t_flag |= T_WOULDBLOCK;
2122 return;
2123 }
2124
2125 if (nbl_need_check(targvp)) {
2126 nbl_start_crit(targvp, RW_READER);
2127 in_crit = 1;
2128 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2129 error = EACCES;
2130 goto out;
2131 }
2132 }
2133
2134 error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2135
2136 /*
2137 * Force modified data and metadata out to stable storage.
2138 */
2139 (void) VOP_FSYNC(vp, 0, cr, NULL);
2140
2141 out:
2142 if (in_crit)
2143 nbl_end_crit(targvp);
2144 VN_RELE(targvp);
2145 VN_RELE(vp);
2146
2147 *status = puterrno(error);
2148
2149 }
2150
2151 void *
2152 rfs_remove_getfh(struct nfsdiropargs *da)
2153 {
2154 return (da->da_fhandle);
2155 }
2156
2157 /*
2158 * rename a file
2159 * Give a file (from) a new name (to).
2160 */
2161 /* ARGSUSED */
2162 void
2163 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2164 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2165 {
2166 int error = 0;
2167 vnode_t *fromvp;
2168 vnode_t *tovp;
2169 struct exportinfo *to_exi;
2170 fhandle_t *fh;
2171 vnode_t *srcvp;
2172 vnode_t *targvp;
2173 int in_crit = 0;
2174
2175 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2176 if (fromvp == NULL) {
2177 *status = NFSERR_STALE;
2178 return;
2179 }
2180
2181 fh = args->rna_to.da_fhandle;
2182 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2183 if (to_exi == NULL) {
2184 VN_RELE(fromvp);
2185 *status = NFSERR_ACCES;
2186 return;
2187 }
2188 exi_rele(to_exi);
2189
2190 if (to_exi != exi) {
2191 VN_RELE(fromvp);
2192 *status = NFSERR_XDEV;
2193 return;
2194 }
2195
2196 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2197 if (tovp == NULL) {
2198 VN_RELE(fromvp);
2199 *status = NFSERR_STALE;
2200 return;
2201 }
2202
2203 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2204 VN_RELE(tovp);
2205 VN_RELE(fromvp);
2206 *status = NFSERR_NOTDIR;
2207 return;
2208 }
2209
2210 /*
2211 * Disallow NULL paths
2212 */
2213 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2214 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2215 VN_RELE(tovp);
2216 VN_RELE(fromvp);
2217 *status = NFSERR_ACCES;
2218 return;
2219 }
2220
2221 if (rdonly(ro, tovp)) {
2222 VN_RELE(tovp);
2223 VN_RELE(fromvp);
2224 *status = NFSERR_ROFS;
2225 return;
2226 }
2227
2228 /*
2229 * Check for a conflict with a non-blocking mandatory share reservation.
2230 */
2231 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2232 NULL, cr, NULL, NULL, NULL);
2233 if (error != 0) {
2234 VN_RELE(tovp);
2235 VN_RELE(fromvp);
2236 *status = puterrno(error);
2237 return;
2238 }
2239
2240 /* Check for delegations on the source file */
2241
2242 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2243 VN_RELE(tovp);
2244 VN_RELE(fromvp);
2245 VN_RELE(srcvp);
2246 curthread->t_flag |= T_WOULDBLOCK;
2247 return;
2248 }
2249
2250 /* Check for delegation on the file being renamed over, if it exists */
2251
2252 if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2253 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2254 NULL, NULL, NULL) == 0) {
2255
2256 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2257 VN_RELE(tovp);
2258 VN_RELE(fromvp);
2259 VN_RELE(srcvp);
2260 VN_RELE(targvp);
2261 curthread->t_flag |= T_WOULDBLOCK;
2262 return;
2263 }
2264 VN_RELE(targvp);
2265 }
2266
2267
2268 if (nbl_need_check(srcvp)) {
2269 nbl_start_crit(srcvp, RW_READER);
2270 in_crit = 1;
2271 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2272 error = EACCES;
2273 goto out;
2274 }
2275 }
2276
2277 error = VOP_RENAME(fromvp, args->rna_from.da_name,
2278 tovp, args->rna_to.da_name, cr, NULL, 0);
2279
2280 if (error == 0)
2281 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2282 strlen(args->rna_to.da_name));
2283
2284 /*
2285 * Force modified data and metadata out to stable storage.
2286 */
2287 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2288 (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2289
2290 out:
2291 if (in_crit)
2292 nbl_end_crit(srcvp);
2293 VN_RELE(srcvp);
2294 VN_RELE(tovp);
2295 VN_RELE(fromvp);
2296
2297 *status = puterrno(error);
2298
2299 }
2300 void *
2301 rfs_rename_getfh(struct nfsrnmargs *args)
2302 {
2303 return (args->rna_from.da_fhandle);
2304 }
2305
2306 /*
2307 * Link to a file.
2308 * Create a file (to) which is a hard link to the given file (from).
2309 */
2310 /* ARGSUSED */
2311 void
2312 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2313 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2314 {
2315 int error;
2316 vnode_t *fromvp;
2317 vnode_t *tovp;
2318 struct exportinfo *to_exi;
2319 fhandle_t *fh;
2320
2321 fromvp = nfs_fhtovp(args->la_from, exi);
2322 if (fromvp == NULL) {
2323 *status = NFSERR_STALE;
2324 return;
2325 }
2326
2327 fh = args->la_to.da_fhandle;
2328 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2329 if (to_exi == NULL) {
2330 VN_RELE(fromvp);
2331 *status = NFSERR_ACCES;
2332 return;
2333 }
2334 exi_rele(to_exi);
2335
2336 if (to_exi != exi) {
2337 VN_RELE(fromvp);
2338 *status = NFSERR_XDEV;
2339 return;
2340 }
2341
2342 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2343 if (tovp == NULL) {
2344 VN_RELE(fromvp);
2345 *status = NFSERR_STALE;
2346 return;
2347 }
2348
2349 if (tovp->v_type != VDIR) {
2350 VN_RELE(tovp);
2351 VN_RELE(fromvp);
2352 *status = NFSERR_NOTDIR;
2353 return;
2354 }
2355 /*
2356 * Disallow NULL paths
2357 */
2358 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2359 VN_RELE(tovp);
2360 VN_RELE(fromvp);
2361 *status = NFSERR_ACCES;
2362 return;
2363 }
2364
2365 if (rdonly(ro, tovp)) {
2366 VN_RELE(tovp);
2367 VN_RELE(fromvp);
2368 *status = NFSERR_ROFS;
2369 return;
2370 }
2371
2372 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2373
2374 /*
2375 * Force modified data and metadata out to stable storage.
2376 */
2377 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2378 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2379
2380 VN_RELE(tovp);
2381 VN_RELE(fromvp);
2382
2383 *status = puterrno(error);
2384
2385 }
2386 void *
2387 rfs_link_getfh(struct nfslinkargs *args)
2388 {
2389 return (args->la_from);
2390 }
2391
2392 /*
2393 * Symbolicly link to a file.
2394 * Create a file (to) with the given attributes which is a symbolic link
2395 * to the given path name (to).
2396 */
2397 void
2398 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2399 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2400 {
2401 int error;
2402 struct vattr va;
2403 vnode_t *vp;
2404 vnode_t *svp;
2405 int lerror;
2406 struct sockaddr *ca;
2407 char *name = NULL;
2408
2409 /*
2410 * Disallow NULL paths
2411 */
2412 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2413 *status = NFSERR_ACCES;
2414 return;
2415 }
2416
2417 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2418 if (vp == NULL) {
2419 *status = NFSERR_STALE;
2420 return;
2421 }
2422
2423 if (rdonly(ro, vp)) {
2424 VN_RELE(vp);
2425 *status = NFSERR_ROFS;
2426 return;
2427 }
2428
2429 error = sattr_to_vattr(args->sla_sa, &va);
2430 if (error) {
2431 VN_RELE(vp);
2432 *status = puterrno(error);
2433 return;
2434 }
2435
2436 if (!(va.va_mask & AT_MODE)) {
2437 VN_RELE(vp);
2438 *status = NFSERR_INVAL;
2439 return;
2440 }
2441
2442 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2443 name = nfscmd_convname(ca, exi, args->sla_tnm,
2444 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2445
2446 if (name == NULL) {
2447 *status = NFSERR_ACCES;
2448 return;
2449 }
2450
2451 va.va_type = VLNK;
2452 va.va_mask |= AT_TYPE;
2453
2454 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2455
2456 /*
2457 * Force new data and metadata out to stable storage.
2458 */
2459 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2460 NULL, cr, NULL, NULL, NULL);
2461
2462 if (!lerror) {
2463 (void) VOP_FSYNC(svp, 0, cr, NULL);
2464 VN_RELE(svp);
2465 }
2466
2467 /*
2468 * Force modified data and metadata out to stable storage.
2469 */
2470 (void) VOP_FSYNC(vp, 0, cr, NULL);
2471
2472 VN_RELE(vp);
2473
2474 *status = puterrno(error);
2475 if (name != args->sla_tnm)
2476 kmem_free(name, MAXPATHLEN);
2477
2478 }
2479 void *
2480 rfs_symlink_getfh(struct nfsslargs *args)
2481 {
2482 return (args->sla_from.da_fhandle);
2483 }
2484
2485 /*
2486 * Make a directory.
2487 * Create a directory with the given name, parent directory, and attributes.
2488 * Returns a file handle and attributes for the new directory.
2489 */
2490 /* ARGSUSED */
2491 void
2492 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2493 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2494 {
2495 int error;
2496 struct vattr va;
2497 vnode_t *dvp = NULL;
2498 vnode_t *vp;
2499 char *name = args->ca_da.da_name;
2500
2501 /*
2502 * Disallow NULL paths
2503 */
2504 if (name == NULL || *name == '\0') {
2505 dr->dr_status = NFSERR_ACCES;
2506 return;
2507 }
2508
2509 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2510 if (vp == NULL) {
2511 dr->dr_status = NFSERR_STALE;
2512 return;
2513 }
2514
2515 if (rdonly(ro, vp)) {
2516 VN_RELE(vp);
2517 dr->dr_status = NFSERR_ROFS;
2518 return;
2519 }
2520
2521 error = sattr_to_vattr(args->ca_sa, &va);
2522 if (error) {
2523 VN_RELE(vp);
2524 dr->dr_status = puterrno(error);
2525 return;
2526 }
2527
2528 if (!(va.va_mask & AT_MODE)) {
2529 VN_RELE(vp);
2530 dr->dr_status = NFSERR_INVAL;
2531 return;
2532 }
2533
2534 va.va_type = VDIR;
2535 va.va_mask |= AT_TYPE;
2536
2537 error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2538
2539 if (!error) {
2540 /*
2541 * Attribtutes of the newly created directory should
2542 * be returned to the client.
2543 */
2544 va.va_mask = AT_ALL; /* We want everything */
2545 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2546
2547 /* check for overflows */
2548 if (!error) {
2549 acl_perm(vp, exi, &va, cr);
2550 error = vattr_to_nattr(&va, &dr->dr_attr);
2551 if (!error) {
2552 error = makefh(&dr->dr_fhandle, dvp, exi);
2553 }
2554 }
2555 /*
2556 * Force new data and metadata out to stable storage.
2557 */
2558 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2559 VN_RELE(dvp);
2560 }
2561
2562 /*
2563 * Force modified data and metadata out to stable storage.
2564 */
2565 (void) VOP_FSYNC(vp, 0, cr, NULL);
2566
2567 VN_RELE(vp);
2568
2569 dr->dr_status = puterrno(error);
2570
2571 }
2572 void *
2573 rfs_mkdir_getfh(struct nfscreatargs *args)
2574 {
2575 return (args->ca_da.da_fhandle);
2576 }
2577
2578 /*
2579 * Remove a directory.
2580 * Remove the given directory name from the given parent directory.
2581 */
/* ARGSUSED */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;	/* the parent directory */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/* RMDIR is a write operation; reject it on read-only exports. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR takes a third argument (the current
	 * directory of the process). That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are. We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty. A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
2640 void *
2641 rfs_rmdir_getfh(struct nfsdiropargs *da)
2642 {
2643 return (da->da_fhandle);
2644 }
2645
/*
 * Read directory entries.
 * Returns up to rda_count bytes of dirent data for the directory
 * identified by rda_fh, starting at cookie rda_offset.  Entry names
 * are converted to the client's character set via the nfscmd_*
 * helpers before the reply is sent.
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;		/* set by VOP_READDIR at end of directory */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* charset-converted copy of the entries */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request returns no entries and no EOF indication. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the request to the NFSv2 maximum transfer size. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries. This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* Nothing was read: report EOF with no entries. */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * Convert the entry names to the client's character set.
	 * NOTE(review): this conversion runs even when VOP_READDIR
	 * failed, in which case rd_size was not set by this function —
	 * confirm the result structure is zeroed by the dispatcher.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion. We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	if (ndata == NULL) {
		/* No conversion was needed; reply with the original data. */
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		/* Conversion produced a new buffer; swap it in. */
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this. It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2779 void *
2780 rfs_readdir_getfh(struct nfsrddirargs *rda)
2781 {
2782 return (&rda->rda_fh);
2783 }
2784 void
2785 rfs_rddirfree(struct nfsrddirres *rd)
2786 {
2787 if (rd->rd_entries != NULL)
2788 kmem_free(rd->rd_entries, rd->rd_bufsize);
2789 }
2790
2791 /* ARGSUSED */
2792 void
2793 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2794 struct svc_req *req, cred_t *cr, bool_t ro)
2795 {
2796 int error;
2797 struct statvfs64 sb;
2798 vnode_t *vp;
2799
2800 vp = nfs_fhtovp(fh, exi);
2801 if (vp == NULL) {
2802 fs->fs_status = NFSERR_STALE;
2803 return;
2804 }
2805
2806 error = VFS_STATVFS(vp->v_vfsp, &sb);
2807
2808 if (!error) {
2809 fs->fs_tsize = nfstsize();
2810 fs->fs_bsize = sb.f_frsize;
2811 fs->fs_blocks = sb.f_blocks;
2812 fs->fs_bfree = sb.f_bfree;
2813 fs->fs_bavail = sb.f_bavail;
2814 }
2815
2816 VN_RELE(vp);
2817
2818 fs->fs_status = puterrno(error);
2819
2820 }
2821 void *
2822 rfs_statfs_getfh(fhandle_t *fh)
2823 {
2824 return (fh);
2825 }
2826
/*
 * Convert the over-the-wire NFSv2 settable attributes (nfssattr)
 * into a vattr.  A va_mask bit is set only for each field the client
 * actually supplied; all-ones field values mean "not set".
 * Returns 0, or EOVERFLOW on 32-bit kernels when a time value cannot
 * be represented in a time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short. When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		/* wire format carries microseconds; vattr wants nanoseconds */
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		/* wire format carries microseconds; vattr wants nanoseconds */
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2893
/*
 * Mapping from vnode types to over-the-wire NFSv2 file types,
 * indexed by va_type.  Vnode types with no NFSv2 equivalent map
 * to 0; VFIFO is remapped specially in vattr_to_nattr() below.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2897
2898 /*
2899 * check the following fields for overflow: nodeid, size, and time.
2900 * There could be a problem when converting 64-bit LP64 fields
2901 * into 32-bit ones. Return an error if there is an overflow.
2902 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* All-ones values pass through unchanged as "unknown". */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow? It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller. See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* vattr carries nanoseconds; the NFSv2 wire wants microseconds. */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone. See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes. It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 * don't want to include the following block of code. The
	 * over-the-wire special file types will be changing with the
	 * NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
3004
3005 /*
3006 * acl v2 support: returns approximate permission.
3007 * default: returns minimal permission (more restrictive)
3008 * aclok: returns maximal permission (less restrictive)
3009 * This routine changes the permissions that are alaredy in *va.
3010 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3011 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3012 */
3013 static void
3014 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3015 {
3016 vsecattr_t vsa;
3017 int aclcnt;
3018 aclent_t *aclentp;
3019 mode_t mask_perm;
3020 mode_t grp_perm;
3021 mode_t other_perm;
3022 mode_t other_orig;
3023 int error;
3024
3025 /* dont care default acl */
3026 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3027 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3028
3029 if (!error) {
3030 aclcnt = vsa.vsa_aclcnt;
3031 if (aclcnt > MIN_ACL_ENTRIES) {
3032 /* non-trivial ACL */
3033 aclentp = vsa.vsa_aclentp;
3034 if (exi->exi_export.ex_flags & EX_ACLOK) {
3035 /* maximal permissions */
3036 grp_perm = 0;
3037 other_perm = 0;
3038 for (; aclcnt > 0; aclcnt--, aclentp++) {
3039 switch (aclentp->a_type) {
3040 case USER_OBJ:
3041 break;
3042 case USER:
3043 grp_perm |=
3044 aclentp->a_perm << 3;
3045 other_perm |= aclentp->a_perm;
3046 break;
3047 case GROUP_OBJ:
3048 grp_perm |=
3049 aclentp->a_perm << 3;
3050 break;
3051 case GROUP:
3052 other_perm |= aclentp->a_perm;
3053 break;
3054 case OTHER_OBJ:
3055 other_orig = aclentp->a_perm;
3056 break;
3057 case CLASS_OBJ:
3058 mask_perm = aclentp->a_perm;
3059 break;
3060 default:
3061 break;
3062 }
3063 }
3064 grp_perm &= mask_perm << 3;
3065 other_perm &= mask_perm;
3066 other_perm |= other_orig;
3067
3068 } else {
3069 /* minimal permissions */
3070 grp_perm = 070;
3071 other_perm = 07;
3072 for (; aclcnt > 0; aclcnt--, aclentp++) {
3073 switch (aclentp->a_type) {
3074 case USER_OBJ:
3075 break;
3076 case USER:
3077 case CLASS_OBJ:
3078 grp_perm &=
3079 aclentp->a_perm << 3;
3080 other_perm &=
3081 aclentp->a_perm;
3082 break;
3083 case GROUP_OBJ:
3084 grp_perm &=
3085 aclentp->a_perm << 3;
3086 break;
3087 case GROUP:
3088 other_perm &=
3089 aclentp->a_perm;
3090 break;
3091 case OTHER_OBJ:
3092 other_perm &=
3093 aclentp->a_perm;
3094 break;
3095 default:
3096 break;
3097 }
3098 }
3099 }
3100 /* copy to va */
3101 va->va_mode &= ~077;
3102 va->va_mode |= grp_perm | other_perm;
3103 }
3104 if (vsa.vsa_aclcnt)
3105 kmem_free(vsa.vsa_aclentp,
3106 vsa.vsa_aclcnt * sizeof (aclent_t));
3107 }
3108 }
3109
/*
 * One-time NFSv2 server module initialization: obtain the caller id
 * used to identify this server's file-system operations.
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
}
3115
/*
 * NFSv2 server module teardown.  Nothing to release here.
 */
void
rfs_srvrfini(void)
{
}
3120
3121 /* ARGSUSED */
3122 void
3123 rfs_srv_zone_init(nfs_globals_t *ng)
3124 {
3125 nfs_srv_t *ns;
3126
3127 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3128
3129 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3130 ns->write_async = 1;
3131
3132 ng->nfs_srv = ns;
3133 }
3134
3135 /* ARGSUSED */
3136 void
3137 rfs_srv_zone_fini(nfs_globals_t *ng)
3138 {
3139 nfs_srv_t *ns = ng->nfs_srv;
3140
3141 ng->nfs_srv = NULL;
3142
3143 mutex_destroy(&ns->async_write_lock);
3144 kmem_free(ns, sizeof (*ns));
3145 }
3146
3147 static int
3148 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3149 {
3150 struct clist *wcl;
3151 int wlist_len;
3152 uint32_t count = rr->rr_count;
3153
3154 wcl = ra->ra_wlist;
3155
3156 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3157 return (FALSE);
3158 }
3159
3160 wcl = ra->ra_wlist;
3161 rr->rr_ok.rrok_wlist_len = wlist_len;
3162 rr->rr_ok.rrok_wlist = wcl;
3163
3164 return (TRUE);
3165 }