1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 */
26
27 #include <sys/types.h>
28 #include <sys/t_lock.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/conf.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/sysmacros.h>
36 #include <sys/vfs.h>
37 #include <sys/vfs_opreg.h>
38 #include <sys/vnode.h>
39 #include <sys/debug.h>
40 #include <sys/errno.h>
41 #include <sys/time.h>
42 #include <sys/file.h>
43 #include <sys/open.h>
44 #include <sys/user.h>
45 #include <sys/termios.h>
46 #include <sys/stream.h>
47 #include <sys/strsubr.h>
48 #include <sys/strsun.h>
49 #include <sys/esunddi.h>
50 #include <sys/flock.h>
51 #include <sys/modctl.h>
52 #include <sys/cmn_err.h>
53 #include <sys/mkdev.h>
54 #include <sys/pathname.h>
55 #include <sys/ddi.h>
56 #include <sys/stat.h>
57 #include <sys/fs/snode.h>
58 #include <sys/fs/dv_node.h>
59 #include <sys/zone.h>
60
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <netinet/in.h>
64 #include <sys/un.h>
65 #include <sys/ucred.h>
66
67 #include <sys/tiuser.h>
68 #define _SUN_TPI_VERSION 2
69 #include <sys/tihdr.h>
70
71 #include <c2/audit.h>
72
73 #include <fs/sockfs/nl7c.h>
74 #include <fs/sockfs/sockcommon.h>
75 #include <fs/sockfs/sockfilter_impl.h>
76 #include <fs/sockfs/socktpi.h>
77 #include <fs/sockfs/socktpi_impl.h>
78 #include <fs/sockfs/sodirect.h>
79
/*
 * Macros that operate on struct cmsghdr.
 * The CMSG_VALID macro does not assume that the last option buffer is padded.
 *
 * CMSG_CONTENT(cmsg)
 *	Address immediately following the header, i.e. &cmsg[1].
 * CMSG_CONTENTLEN(cmsg)
 *	cmsg_len minus the size of the header.
 * CMSG_VALID(cmsg, start, end)
 *	True when cmsg passes ISALIGNED_cmsghdr(), lies in [start, end),
 *	has a cmsg_len of at least sizeof (struct cmsghdr), and
 *	cmsg + cmsg_len does not extend beyond end.
 */
#define	CMSG_CONTENT(cmsg)	(&((cmsg)[1]))
#define	CMSG_CONTENTLEN(cmsg)	((cmsg)->cmsg_len - sizeof (struct cmsghdr))
#define	CMSG_VALID(cmsg, start, end)					\
	(ISALIGNED_cmsghdr(cmsg) &&					\
	((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&			\
	((uintptr_t)(cmsg) < (uintptr_t)(end)) &&			\
	((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) &&	\
	((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)))
/* Passed to cv_wait_stop() by so_lock_single() and so_lock_read(). */
#define	SO_LOCK_WAKEUP_TIME	3000	/* Wakeup time in milliseconds */
93
/* Assigned once in sockinit() from getudev()/makedevice(). */
dev_t sockdev;	/* For fsid in getattr */
/* Set to 1 by sockinit() when nl7c_init() is skipped (!modrootloaded). */
int sockfs_defer_nl7c_init = 0;

/* Its sl_lock mutex is initialized in sockinit(). */
struct socklist socklist;

/* kmem cache of struct sonode objects; created in sockinit(). */
struct kmem_cache *socket_cache;

/*
 * sockconf_lock protects the socket configuration (socket types and
 * socket filters) which is changed via the sockconfig system call.
 * The lock is initialized in sockinit().
 */
krwlock_t sockconf_lock;

/* Forward declarations; the definitions are outside this part of the file. */
static int sockfs_update(kstat_t *, int);
static int sockfs_snapshot(kstat_t *, void *, int);
extern smod_info_t *sotpi_smod_create(void);

/* Called from sockinit(). */
extern void sendfile_init();

/* Called from sockinit() only when modrootloaded is non-zero. */
extern void nl7c_init(void);

extern int modrootloaded;
116
/*
 * Size of one ks_straddr entry: two characters per byte of a pointer plus
 * one more (presumably hex digits and a terminating NUL -- confirm against
 * the code that fills ks_straddr).
 */
#define	ADRSTRLEN (2 * sizeof (void *) + 1)
/*
 * kernel structure for passing the sockinfo data back up to the user.
 * the strings array allows us to convert AF_UNIX addresses into strings
 * with a common method regardless of which n-bit kernel we're running.
 */
struct k_sockinfo {
	struct sockinfo	ks_si;		/* the sockinfo payload itself */
	char		ks_straddr[3][ADRSTRLEN]; /* three address strings */
};
127
128 /*
129 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
130 * Returns with the vnode held.
131 */
132 int
133 sogetvp(char *devpath, vnode_t **vpp, int uioflag)
134 {
135 struct snode *csp;
136 vnode_t *vp, *dvp;
137 major_t maj;
138 int error;
139
140 ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE);
141
142 /*
143 * Lookup the underlying filesystem vnode.
144 */
145 error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp);
146 if (error)
147 return (error);
148
149 /* Check that it is the correct vnode */
150 if (vp->v_type != VCHR) {
151 VN_RELE(vp);
152 return (ENOTSOCK);
153 }
154
155 /*
156 * If devpath went through devfs, the device should already
157 * be configured. If devpath is a mknod file, however, we
158 * need to make sure the device is properly configured.
159 * To do this, we do something similar to spec_open()
160 * except that we resolve to the minor/leaf level since
161 * we need to return a vnode.
162 */
163 csp = VTOS(VTOS(vp)->s_commonvp);
164 if (!(csp->s_flag & SDIPSET)) {
165 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
166 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname);
167 if (error == 0)
168 error = devfs_lookupname(pathname, NULLVPP, &dvp);
169 VN_RELE(vp);
170 kmem_free(pathname, MAXPATHLEN);
171 if (error != 0)
172 return (ENXIO);
173 vp = dvp; /* use the devfs vp */
174 }
175
176 /* device is configured at this point */
177 maj = getmajor(vp->v_rdev);
178 if (!STREAMSTAB(maj)) {
179 VN_RELE(vp);
180 return (ENOSTR);
181 }
182
183 *vpp = vp;
184 return (0);
185 }
186
187 /*
188 * Update the accessed, updated, or changed times in an sonode
189 * with the current time.
190 *
191 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable
192 * attributes in a fstat call. (They return the current time and 0 for
193 * all timestamps, respectively.) We maintain the current timestamps
194 * here primarily so that should sockmod be popped the resulting
195 * file descriptor will behave like a stream w.r.t. the timestamps.
196 */
197 void
198 so_update_attrs(struct sonode *so, int flag)
199 {
200 time_t now = gethrestime_sec();
201
202 if (SOCK_IS_NONSTR(so))
203 return;
204
205 mutex_enter(&so->so_lock);
206 so->so_flag |= flag;
207 if (flag & SOACC)
208 SOTOTPI(so)->sti_atime = now;
209 if (flag & SOMOD)
210 SOTOTPI(so)->sti_mtime = now;
211 mutex_exit(&so->so_lock);
212 }
213
214 extern so_create_func_t sock_comm_create_function;
215 extern so_destroy_func_t sock_comm_destroy_function;
216 /*
217 * Init function called when sockfs is loaded.
218 */
219 int
220 sockinit(int fstype, char *name)
221 {
222 static const fs_operation_def_t sock_vfsops_template[] = {
223 NULL, NULL
224 };
225 int error;
226 major_t dev;
227 char *err_str;
228
229 error = vfs_setfsops(fstype, sock_vfsops_template, NULL);
230 if (error != 0) {
231 zcmn_err(GLOBAL_ZONEID, CE_WARN,
232 "sockinit: bad vfs ops template");
233 return (error);
234 }
235
236 error = vn_make_ops(name, socket_vnodeops_template,
237 &socket_vnodeops);
238 if (error != 0) {
239 err_str = "sockinit: bad socket vnode ops template";
240 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */
241 socket_vnodeops = NULL;
242 goto failure;
243 }
244
245 socket_cache = kmem_cache_create("socket_cache",
246 sizeof (struct sonode), 0, sonode_constructor,
247 sonode_destructor, NULL, NULL, NULL, 0);
248
249 rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL);
250
251 error = socktpi_init();
252 if (error != 0) {
253 err_str = NULL;
254 goto failure;
255 }
256
257 error = sod_init();
258 if (error != 0) {
259 err_str = NULL;
260 goto failure;
261 }
262
263 /*
264 * Set up the default create and destroy functions
265 */
266 sock_comm_create_function = socket_sonode_create;
267 sock_comm_destroy_function = socket_sonode_destroy;
268
269 /*
270 * Build initial list mapping socket parameters to vnode.
271 */
272 smod_init();
273 smod_add(sotpi_smod_create());
274
275 sockparams_init();
276
277 /*
278 * If sockets are needed before init runs /sbin/soconfig
279 * it is possible to preload the sockparams list here using
280 * calls like:
281 * sockconfig(1,2,3, "/dev/tcp", 0);
282 */
283
284 /*
285 * Create a unique dev_t for use in so_fsid.
286 */
287
288 if ((dev = getudev()) == (major_t)-1)
289 dev = 0;
290 sockdev = makedevice(dev, 0);
291
292 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL);
293 sendfile_init();
294 if (!modrootloaded) {
295 sockfs_defer_nl7c_init = 1;
296 } else {
297 nl7c_init();
298 }
299
300 /* Initialize socket filters */
301 sof_init();
302
303 return (0);
304
305 failure:
306 (void) vfs_freevfsops_by_type(fstype);
307 if (socket_vnodeops != NULL)
308 vn_freevnodeops(socket_vnodeops);
309 if (err_str != NULL)
310 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str);
311 return (error);
312 }
313
314 /*
315 * Caller must hold the mutex. Used to set SOLOCKED.
316 */
317 void
318 so_lock_single(struct sonode *so)
319 {
320 ASSERT(MUTEX_HELD(&so->so_lock));
321
322 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) {
323 cv_wait_stop(&so->so_single_cv, &so->so_lock,
324 SO_LOCK_WAKEUP_TIME);
325 }
326 so->so_flag |= SOLOCKED;
327 }
328
329 /*
330 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND.
331 * Used to clear SOLOCKED or SOASYNC_UNBIND.
332 */
333 void
334 so_unlock_single(struct sonode *so, int flag)
335 {
336 ASSERT(MUTEX_HELD(&so->so_lock));
337 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND));
338 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0);
339 ASSERT(so->so_flag & flag);
340 /*
341 * Process the T_DISCON_IND on sti_discon_ind_mp.
342 *
343 * Call to so_drain_discon_ind will result in so_lock
344 * being dropped and re-acquired later.
345 */
346 if (!SOCK_IS_NONSTR(so)) {
347 sotpi_info_t *sti = SOTOTPI(so);
348
349 if (sti->sti_discon_ind_mp != NULL)
350 so_drain_discon_ind(so);
351 }
352
353 cv_signal(&so->so_single_cv);
354 so->so_flag &= ~flag;
355 }
356
357 /*
358 * Caller must hold the mutex. Used to set SOREADLOCKED.
359 * If the caller wants nonblocking behavior it should set fmode.
360 */
361 int
362 so_lock_read(struct sonode *so, int fmode)
363 {
364 ASSERT(MUTEX_HELD(&so->so_lock));
365
366 while (so->so_flag & SOREADLOCKED) {
367 if (fmode & (FNDELAY|FNONBLOCK))
368 return (EWOULDBLOCK);
369 cv_wait_stop(&so->so_read_cv, &so->so_lock,
370 SO_LOCK_WAKEUP_TIME);
371 }
372 so->so_flag |= SOREADLOCKED;
373 return (0);
374 }
375
376 /*
377 * Like so_lock_read above but allows signals.
378 */
379 int
380 so_lock_read_intr(struct sonode *so, int fmode)
381 {
382 ASSERT(MUTEX_HELD(&so->so_lock));
383
384 while (so->so_flag & SOREADLOCKED) {
385 if (fmode & (FNDELAY|FNONBLOCK))
386 return (EWOULDBLOCK);
387 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock))
388 return (EINTR);
389 }
390 so->so_flag |= SOREADLOCKED;
391 return (0);
392 }
393
394 /*
395 * Caller must hold the mutex. Used to clear SOREADLOCKED,
396 * set in so_lock_read() or so_lock_read_intr().
397 */
398 void
399 so_unlock_read(struct sonode *so)
400 {
401 ASSERT(MUTEX_HELD(&so->so_lock));
402 ASSERT(so->so_flag & SOREADLOCKED);
403
404 cv_signal(&so->so_read_cv);
405 so->so_flag &= ~SOREADLOCKED;
406 }
407
408 /*
409 * Verify that the specified offset falls within the mblk and
410 * that the resulting pointer is aligned.
411 * Returns NULL if not.
412 */
413 void *
414 sogetoff(mblk_t *mp, t_uscalar_t offset,
415 t_uscalar_t length, uint_t align_size)
416 {
417 uintptr_t ptr1, ptr2;
418
419 ASSERT(mp && mp->b_wptr >= mp->b_rptr);
420 ptr1 = (uintptr_t)mp->b_rptr + offset;
421 ptr2 = (uintptr_t)ptr1 + length;
422 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
423 eprintline(0);
424 return (NULL);
425 }
426 if ((ptr1 & (align_size - 1)) != 0) {
427 eprintline(0);
428 return (NULL);
429 }
430 return ((void *)ptr1);
431 }
432
433 /*
434 * Return the AF_UNIX underlying filesystem vnode matching a given name.
435 * Makes sure the sending and the destination sonodes are compatible.
436 * The vnode is returned held.
437 *
438 * The underlying filesystem VSOCK vnode has a v_stream pointer that
439 * references the actual stream head (hence indirectly the actual sonode).
440 *
441 * This function is non-static so it can be used by brand emulation.
442 */
443 int
444 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess,
445 vnode_t **vpp)
446 {
447 vnode_t *vp; /* Underlying filesystem vnode */
448 vnode_t *rvp; /* real vnode */
449 vnode_t *svp; /* sockfs vnode */
450 struct sonode *so2;
451 int error;
452
453 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so,
454 soun->sun_path));
455
456 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
457 if (error) {
458 eprintsoline(so, error);
459 return (error);
460 }
461
462 /*
463 * Traverse lofs mounts get the real vnode
464 */
465 if (VOP_REALVP(vp, &rvp, NULL) == 0) {
466 VN_HOLD(rvp); /* hold the real vnode */
467 VN_RELE(vp); /* release hold from lookup */
468 vp = rvp;
469 }
470
471 if (vp->v_type != VSOCK) {
472 error = ENOTSOCK;
473 eprintsoline(so, error);
474 goto done2;
475 }
476
477 if (checkaccess) {
478 /*
479 * Check that we have permissions to access the destination
480 * vnode. This check is not done in BSD but it is required
481 * by X/Open.
482 */
483 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) {
484 eprintsoline(so, error);
485 goto done2;
486 }
487 }
488
489 /*
490 * Check if the remote socket has been closed.
491 *
492 * Synchronize with vn_rele_stream by holding v_lock while traversing
493 * v_stream->sd_vnode.
494 */
495 mutex_enter(&vp->v_lock);
496 if (vp->v_stream == NULL) {
497 mutex_exit(&vp->v_lock);
498 if (so->so_type == SOCK_DGRAM)
499 error = EDESTADDRREQ;
500 else
501 error = ECONNREFUSED;
502
503 eprintsoline(so, error);
504 goto done2;
505 }
506 ASSERT(vp->v_stream->sd_vnode);
507 svp = vp->v_stream->sd_vnode;
508 /*
509 * holding v_lock on underlying filesystem vnode and acquiring
510 * it on sockfs vnode. Assumes that no code ever attempts to
511 * acquire these locks in the reverse order.
512 */
513 VN_HOLD(svp);
514 mutex_exit(&vp->v_lock);
515
516 if (svp->v_type != VSOCK) {
517 error = ENOTSOCK;
518 eprintsoline(so, error);
519 goto done;
520 }
521
522 so2 = VTOSO(svp);
523
524 if (so->so_type != so2->so_type) {
525 error = EPROTOTYPE;
526 eprintsoline(so, error);
527 goto done;
528 }
529
530 VN_RELE(svp);
531 *vpp = vp;
532 return (0);
533
534 done:
535 VN_RELE(svp);
536 done2:
537 VN_RELE(vp);
538 return (error);
539 }
540
541 /*
542 * Verify peer address for connect and sendto/sendmsg.
543 * Since sendto/sendmsg would not get synchronous errors from the transport
544 * provider we have to do these ugly checks in the socket layer to
545 * preserve compatibility with SunOS 4.X.
546 */
547 int
548 so_addr_verify(struct sonode *so, const struct sockaddr *name,
549 socklen_t namelen)
550 {
551 int family;
552
553 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n",
554 (void *)so, (void *)name, namelen));
555
556 ASSERT(name != NULL);
557
558 family = so->so_family;
559 switch (family) {
560 case AF_INET:
561 if (name->sa_family != family) {
562 eprintsoline(so, EAFNOSUPPORT);
563 return (EAFNOSUPPORT);
564 }
565 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) {
566 eprintsoline(so, EINVAL);
567 return (EINVAL);
568 }
569 break;
570 case AF_INET6: {
571 #ifdef DEBUG
572 struct sockaddr_in6 *sin6;
573 #endif /* DEBUG */
574
575 if (name->sa_family != family) {
576 eprintsoline(so, EAFNOSUPPORT);
577 return (EAFNOSUPPORT);
578 }
579 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) {
580 eprintsoline(so, EINVAL);
581 return (EINVAL);
582 }
583 #ifdef DEBUG
584 /* Verify that apps don't forget to clear sin6_scope_id etc */
585 sin6 = (struct sockaddr_in6 *)name;
586 if (sin6->sin6_scope_id != 0 &&
587 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
588 zcmn_err(getzoneid(), CE_WARN,
589 "connect/send* with uninitialized sin6_scope_id "
590 "(%d) on socket. Pid = %d\n",
591 (int)sin6->sin6_scope_id, (int)curproc->p_pid);
592 }
593 #endif /* DEBUG */
594 break;
595 }
596 case AF_UNIX:
597 if (SOTOTPI(so)->sti_faddr_noxlate) {
598 return (0);
599 }
600 if (namelen < (socklen_t)sizeof (short)) {
601 eprintsoline(so, ENOENT);
602 return (ENOENT);
603 }
604 if (name->sa_family != family) {
605 eprintsoline(so, EAFNOSUPPORT);
606 return (EAFNOSUPPORT);
607 }
608 /* MAXPATHLEN + soun_family + nul termination */
609 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
610 eprintsoline(so, ENAMETOOLONG);
611 return (ENAMETOOLONG);
612 }
613
614 break;
615
616 default:
617 /*
618 * Default is don't do any length or sa_family check
619 * to allow non-sockaddr style addresses.
620 */
621 break;
622 }
623
624 return (0);
625 }
626
627
628 /*
629 * Translate an AF_UNIX sockaddr_un to the transport internal name.
630 * Assumes caller has called so_addr_verify first.
631 */
632 /*ARGSUSED*/
633 int
634 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name,
635 socklen_t namelen, int checkaccess,
636 void **addrp, socklen_t *addrlenp)
637 {
638 int error;
639 struct sockaddr_un *soun;
640 vnode_t *vp;
641 void *addr;
642 socklen_t addrlen;
643 sotpi_info_t *sti = SOTOTPI(so);
644
645 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n",
646 (void *)so, (void *)name, namelen, checkaccess));
647
648 ASSERT(name != NULL);
649 ASSERT(so->so_family == AF_UNIX);
650 ASSERT(!sti->sti_faddr_noxlate);
651 ASSERT(namelen >= (socklen_t)sizeof (short));
652 ASSERT(name->sa_family == AF_UNIX);
653 soun = (struct sockaddr_un *)name;
654 /*
655 * Lookup vnode for the specified path name and verify that
656 * it is a socket.
657 */
658 error = so_ux_lookup(so, soun, checkaccess, &vp);
659 if (error) {
660 eprintsoline(so, error);
661 return (error);
662 }
663 /*
664 * Use the address of the peer vnode as the address to send
665 * to. We release the peer vnode here. In case it has been
666 * closed by the time the T_CONN_REQ or T_UNITDATA_REQ reaches the
667 * transport the message will get an error or be dropped.
668 */
669 sti->sti_ux_faddr.soua_vp = vp;
670 sti->sti_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT;
671 addr = &sti->sti_ux_faddr;
672 addrlen = (socklen_t)sizeof (sti->sti_ux_faddr);
673 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n",
674 addrlen, (void *)vp));
675 VN_RELE(vp);
676 *addrp = addr;
677 *addrlenp = (socklen_t)addrlen;
678 return (0);
679 }
680
681 /*
682 * Esballoc free function for messages that contain SO_FILEP option.
683 * Decrement the reference count on the file pointers using closef.
684 */
685 void
686 fdbuf_free(struct fdbuf *fdbuf)
687 {
688 int i;
689 struct file *fp;
690
691 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd));
692 for (i = 0; i < fdbuf->fd_numfd; i++) {
693 /*
694 * We need pointer size alignment for fd_fds. On a LP64
695 * kernel, the required alignment is 8 bytes while
696 * the option headers and values are only 4 bytes
697 * aligned. So its safer to do a bcopy compared to
698 * assigning fdbuf->fd_fds[i] to fp.
699 */
700 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
701 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp));
702 (void) closef(fp);
703 }
704 if (fdbuf->fd_ebuf != NULL)
705 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen);
706 kmem_free(fdbuf, fdbuf->fd_size);
707 }
708
709 /*
710 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing.
711 * Waits if memory is not available.
712 */
713 mblk_t *
714 fdbuf_allocmsg(int size, struct fdbuf *fdbuf)
715 {
716 uchar_t *buf;
717 mblk_t *mp;
718
719 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd));
720 buf = kmem_alloc(size, KM_SLEEP);
721 fdbuf->fd_ebuf = (caddr_t)buf;
722 fdbuf->fd_ebuflen = size;
723 fdbuf->fd_frtn.free_func = fdbuf_free;
724 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf;
725
726 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn);
727 mp->b_datap->db_type = M_PROTO;
728 return (mp);
729 }
730
731 /*
732 * Extract file descriptors from a fdbuf.
733 * Return list in rights/rightslen.
734 */
735 /*ARGSUSED*/
736 static int
737 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen)
738 {
739 int i, fd;
740 int *rp;
741 struct file *fp;
742 int numfd;
743
744 dprint(1, ("fdbuf_extract: %d fds, len %d\n",
745 fdbuf->fd_numfd, rightslen));
746
747 numfd = fdbuf->fd_numfd;
748 ASSERT(rightslen == numfd * (int)sizeof (int));
749
750 /*
751 * Allocate a file descriptor and increment the f_count.
752 * The latter is needed since we always call fdbuf_free
753 * which performs a closef.
754 */
755 rp = (int *)rights;
756 for (i = 0; i < numfd; i++) {
757 if ((fd = ufalloc(0)) == -1)
758 goto cleanup;
759 /*
760 * We need pointer size alignment for fd_fds. On a LP64
761 * kernel, the required alignment is 8 bytes while
762 * the option headers and values are only 4 bytes
763 * aligned. So its safer to do a bcopy compared to
764 * assigning fdbuf->fd_fds[i] to fp.
765 */
766 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
767 mutex_enter(&fp->f_tlock);
768 fp->f_count++;
769 mutex_exit(&fp->f_tlock);
770 setf(fd, fp);
771 *rp++ = fd;
772 if (AU_AUDITING())
773 audit_fdrecv(fd, fp);
774 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n",
775 i, fd, (void *)fp, fp->f_count));
776 }
777 return (0);
778
779 cleanup:
780 /*
781 * Undo whatever partial work the loop above has done.
782 */
783 {
784 int j;
785
786 rp = (int *)rights;
787 for (j = 0; j < i; j++) {
788 dprint(0,
789 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp));
790 (void) closeandsetf(*rp++, NULL);
791 }
792 }
793
794 return (EMFILE);
795 }
796
797 /*
798 * Insert file descriptors into an fdbuf.
799 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed
800 * by calling fdbuf_free().
801 */
802 int
803 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp)
804 {
805 int numfd, i;
806 int *fds;
807 struct file *fp;
808 struct fdbuf *fdbuf;
809 int fdbufsize;
810
811 dprint(1, ("fdbuf_create: len %d\n", rightslen));
812
813 numfd = rightslen / (int)sizeof (int);
814
815 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *));
816 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP);
817 fdbuf->fd_size = fdbufsize;
818 fdbuf->fd_numfd = 0;
819 fdbuf->fd_ebuf = NULL;
820 fdbuf->fd_ebuflen = 0;
821 fds = (int *)rights;
822 for (i = 0; i < numfd; i++) {
823 if ((fp = getf(fds[i])) == NULL) {
824 fdbuf_free(fdbuf);
825 return (EBADF);
826 }
827 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n",
828 i, fds[i], (void *)fp, fp->f_count));
829 mutex_enter(&fp->f_tlock);
830 fp->f_count++;
831 mutex_exit(&fp->f_tlock);
832 /*
833 * The maximum alignment for fdbuf (or any option header
834 * and its value) it 4 bytes. On a LP64 kernel, the alignment
835 * is not sufficient for pointers (fd_fds in this case). Since
836 * we just did a kmem_alloc (we get a double word alignment),
837 * we don't need to do anything on the send side (we loose
838 * the double word alignment because fdbuf goes after an
839 * option header (eg T_unitdata_req) which is only 4 byte
840 * aligned). We take care of this when we extract the file
841 * descriptor in fdbuf_extract or fdbuf_free.
842 */
843 fdbuf->fd_fds[i] = fp;
844 fdbuf->fd_numfd++;
845 releasef(fds[i]);
846 if (AU_AUDITING())
847 audit_fdsend(fds[i], fp, 0);
848 }
849 *fdbufp = fdbuf;
850 return (0);
851 }
852
853 static int
854 fdbuf_optlen(int rightslen)
855 {
856 int numfd;
857
858 numfd = rightslen / (int)sizeof (int);
859
860 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)));
861 }
862
863 static t_uscalar_t
864 fdbuf_cmsglen(int fdbuflen)
865 {
866 return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) /
867 (int)sizeof (struct file *) * (int)sizeof (int));
868 }
869
870
871 /*
872 * Return non-zero if the mblk and fdbuf are consistent.
873 */
874 static int
875 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen)
876 {
877 if (fdbuflen >= FDBUF_HDRSIZE &&
878 fdbuflen == fdbuf->fd_size) {
879 frtn_t *frp = mp->b_datap->db_frtnp;
880 /*
881 * Check that the SO_FILEP portion of the
882 * message has not been modified by
883 * the loopback transport. The sending sockfs generates
884 * a message that is esballoc'ed with the free function
885 * being fdbuf_free() and where free_arg contains the
886 * identical information as the SO_FILEP content.
887 *
888 * If any of these constraints are not satisfied we
889 * silently ignore the option.
890 */
891 ASSERT(mp);
892 if (frp != NULL &&
893 frp->free_func == fdbuf_free &&
894 frp->free_arg != NULL &&
895 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) {
896 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n",
897 (void *)fdbuf, fdbuflen));
898 return (1);
899 } else {
900 zcmn_err(getzoneid(), CE_WARN,
901 "sockfs: mismatched fdbuf content (%p)",
902 (void *)mp);
903 return (0);
904 }
905 } else {
906 zcmn_err(getzoneid(), CE_WARN,
907 "sockfs: mismatched fdbuf len %d, %d\n",
908 fdbuflen, fdbuf->fd_size);
909 return (0);
910 }
911 }
912
913 /*
914 * When the file descriptors returned by sorecvmsg can not be passed
915 * to the application this routine will cleanup the references on
916 * the files. Start at startoff bytes into the buffer.
917 */
918 static void
919 close_fds(void *fdbuf, int fdbuflen, int startoff)
920 {
921 int *fds = (int *)fdbuf;
922 int numfd = fdbuflen / (int)sizeof (int);
923 int i;
924
925 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff));
926
927 for (i = 0; i < numfd; i++) {
928 if (startoff < 0)
929 startoff = 0;
930 if (startoff < (int)sizeof (int)) {
931 /*
932 * This file descriptor is partially or fully after
933 * the offset
934 */
935 dprint(0,
936 ("close_fds: cleanup[%d] = %d\n", i, fds[i]));
937 (void) closeandsetf(fds[i], NULL);
938 }
939 startoff -= (int)sizeof (int);
940 }
941 }
942
943 /*
944 * Close all file descriptors contained in the control part starting at
945 * the startoffset.
946 */
947 void
948 so_closefds(void *control, t_uscalar_t controllen, int oldflg,
949 int startoff)
950 {
951 struct cmsghdr *cmsg;
952
953 if (control == NULL)
954 return;
955
956 if (oldflg) {
957 close_fds(control, controllen, startoff);
958 return;
959 }
960 /* Scan control part for file descriptors. */
961 for (cmsg = (struct cmsghdr *)control;
962 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
963 cmsg = CMSG_NEXT(cmsg)) {
964 if (cmsg->cmsg_level == SOL_SOCKET &&
965 cmsg->cmsg_type == SCM_RIGHTS) {
966 close_fds(CMSG_CONTENT(cmsg),
967 (int)CMSG_CONTENTLEN(cmsg),
968 startoff - (int)sizeof (struct cmsghdr));
969 }
970 startoff -= cmsg->cmsg_len;
971 }
972 }
973
974 /*
975 * Returns a pointer/length for the file descriptors contained
976 * in the control buffer. Returns with *fdlenp == -1 if there are no
977 * file descriptor options present. This is different than there being
978 * a zero-length file descriptor option.
979 * Fail if there are multiple SCM_RIGHT cmsgs.
980 */
981 int
982 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg,
983 void **fdsp, int *fdlenp)
984 {
985 struct cmsghdr *cmsg;
986 void *fds;
987 int fdlen;
988
989 if (control == NULL) {
990 *fdsp = NULL;
991 *fdlenp = -1;
992 return (0);
993 }
994
995 if (oldflg) {
996 *fdsp = control;
997 if (controllen == 0)
998 *fdlenp = -1;
999 else
1000 *fdlenp = controllen;
1001 dprint(1, ("so_getfdopt: old %d\n", *fdlenp));
1002 return (0);
1003 }
1004
1005 fds = NULL;
1006 fdlen = 0;
1007
1008 for (cmsg = (struct cmsghdr *)control;
1009 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1010 cmsg = CMSG_NEXT(cmsg)) {
1011 if (cmsg->cmsg_level == SOL_SOCKET &&
1012 cmsg->cmsg_type == SCM_RIGHTS) {
1013 if (fds != NULL)
1014 return (EINVAL);
1015 fds = CMSG_CONTENT(cmsg);
1016 fdlen = (int)CMSG_CONTENTLEN(cmsg);
1017 dprint(1, ("so_getfdopt: new %lu\n",
1018 (size_t)CMSG_CONTENTLEN(cmsg)));
1019 }
1020 }
1021 if (fds == NULL) {
1022 dprint(1, ("so_getfdopt: NONE\n"));
1023 *fdlenp = -1;
1024 } else
1025 *fdlenp = fdlen;
1026 *fdsp = fds;
1027 return (0);
1028 }
1029
1030 /*
1031 * Return the length of the options including any file descriptor options.
1032 */
1033 t_uscalar_t
1034 so_optlen(void *control, t_uscalar_t controllen, int oldflg)
1035 {
1036 struct cmsghdr *cmsg;
1037 t_uscalar_t optlen = 0;
1038 t_uscalar_t len;
1039
1040 if (control == NULL)
1041 return (0);
1042
1043 if (oldflg)
1044 return ((t_uscalar_t)(sizeof (struct T_opthdr) +
1045 fdbuf_optlen(controllen)));
1046
1047 for (cmsg = (struct cmsghdr *)control;
1048 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1049 cmsg = CMSG_NEXT(cmsg)) {
1050 if (cmsg->cmsg_level == SOL_SOCKET &&
1051 cmsg->cmsg_type == SCM_RIGHTS) {
1052 len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg));
1053 } else {
1054 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1055 }
1056 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) +
1057 sizeof (struct T_opthdr));
1058 }
1059 dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n",
1060 controllen, oldflg, optlen));
1061 return (optlen);
1062 }
1063
1064 /*
1065 * Copy options from control to the mblk. Skip any file descriptor options.
1066 */
1067 void
1068 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp)
1069 {
1070 struct T_opthdr toh;
1071 struct cmsghdr *cmsg;
1072
1073 if (control == NULL)
1074 return;
1075
1076 if (oldflg) {
1077 /* No real options - caller has handled file descriptors */
1078 return;
1079 }
1080 for (cmsg = (struct cmsghdr *)control;
1081 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1082 cmsg = CMSG_NEXT(cmsg)) {
1083 /*
1084 * Note: The caller handles file descriptors prior
1085 * to calling this function.
1086 */
1087 t_uscalar_t len;
1088
1089 if (cmsg->cmsg_level == SOL_SOCKET &&
1090 cmsg->cmsg_type == SCM_RIGHTS)
1091 continue;
1092
1093 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1094 toh.level = cmsg->cmsg_level;
1095 toh.name = cmsg->cmsg_type;
1096 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr);
1097 toh.status = 0;
1098
1099 soappendmsg(mp, &toh, sizeof (toh));
1100 soappendmsg(mp, CMSG_CONTENT(cmsg), len);
1101 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len;
1102 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
1103 }
1104 }
1105
1106 /*
1107 * Return the length of the control message derived from the options.
1108 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
1109 * When oldflg is set only include SO_FILEP.
1110 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1111 * allocates the space that so_opt2cmsg fills. If one changes, the other should
1112 * also be checked for any possible impacts.
1113 */
1114 t_uscalar_t
1115 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg)
1116 {
1117 t_uscalar_t cmsglen = 0;
1118 struct T_opthdr *tohp;
1119 t_uscalar_t len;
1120 t_uscalar_t last_roundup = 0;
1121
1122 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1123
1124 for (tohp = (struct T_opthdr *)opt;
1125 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1126 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1127 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n",
1128 tohp->level, tohp->name, tohp->len));
1129 if (tohp->level == SOL_SOCKET &&
1130 (tohp->name == SO_SRCADDR ||
1131 tohp->name == SO_UNIX_CLOSE)) {
1132 continue;
1133 }
1134 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1135 struct fdbuf *fdbuf;
1136 int fdbuflen;
1137
1138 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1139 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1140
1141 if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1142 continue;
1143 if (oldflg) {
1144 cmsglen += fdbuf_cmsglen(fdbuflen);
1145 continue;
1146 }
1147 len = fdbuf_cmsglen(fdbuflen);
1148 } else if (tohp->level == SOL_SOCKET &&
1149 tohp->name == SCM_TIMESTAMP) {
1150 if (oldflg)
1151 continue;
1152
1153 if (get_udatamodel() == DATAMODEL_NATIVE) {
1154 len = sizeof (struct timeval);
1155 } else {
1156 len = sizeof (struct timeval32);
1157 }
1158 } else {
1159 if (oldflg)
1160 continue;
1161 len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1162 }
1163 /*
1164 * Exclude roundup for last option to not set
1165 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
1166 */
1167 last_roundup = (t_uscalar_t)
1168 (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) -
1169 (len + (int)sizeof (struct cmsghdr)));
1170 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) +
1171 last_roundup;
1172 }
1173 cmsglen -= last_roundup;
1174 dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n",
1175 optlen, oldflg, cmsglen));
1176 return (cmsglen);
1177 }
1178
1179 /*
1180 * Copy options from options to the control. Convert SO_FILEP to
1181 * file descriptors.
1182 * Returns errno or zero.
1183 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1184 * allocates the space that so_opt2cmsg fills. If one changes, the other should
1185 * also be checked for any possible impacts.
1186 */
1187 int
1188 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg,
1189 void *control, t_uscalar_t controllen)
1190 {
1191 struct T_opthdr *tohp;
1192 struct cmsghdr *cmsg;
1193 struct fdbuf *fdbuf;
1194 int fdbuflen;
1195 int error;
1196 #if defined(DEBUG) || defined(__lint)
1197 struct cmsghdr *cend = (struct cmsghdr *)
1198 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen));
1199 #endif
1200 cmsg = (struct cmsghdr *)control;
1201
1202 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1203
1204 for (tohp = (struct T_opthdr *)opt;
1205 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1206 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1207 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n",
1208 tohp->level, tohp->name, tohp->len));
1209
1210 if (tohp->level == SOL_SOCKET &&
1211 (tohp->name == SO_SRCADDR ||
1212 tohp->name == SO_UNIX_CLOSE)) {
1213 continue;
1214 }
1215 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen);
1216 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1217 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1218 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1219
1220 if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1221 return (EPROTO);
1222 if (oldflg) {
1223 error = fdbuf_extract(fdbuf, control,
1224 (int)controllen);
1225 if (error != 0)
1226 return (error);
1227 continue;
1228 } else {
1229 int fdlen;
1230
1231 fdlen = (int)fdbuf_cmsglen(
1232 (int)_TPI_TOPT_DATALEN(tohp));
1233
1234 cmsg->cmsg_level = tohp->level;
1235 cmsg->cmsg_type = SCM_RIGHTS;
1236 cmsg->cmsg_len = (socklen_t)(fdlen +
1237 sizeof (struct cmsghdr));
1238
1239 error = fdbuf_extract(fdbuf,
1240 CMSG_CONTENT(cmsg), fdlen);
1241 if (error != 0)
1242 return (error);
1243 }
1244 } else if (tohp->level == SOL_SOCKET &&
1245 tohp->name == SCM_TIMESTAMP) {
1246 timestruc_t *timestamp;
1247
1248 if (oldflg)
1249 continue;
1250
1251 cmsg->cmsg_level = tohp->level;
1252 cmsg->cmsg_type = tohp->name;
1253
1254 timestamp =
1255 (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1],
1256 sizeof (intptr_t));
1257
1258 if (get_udatamodel() == DATAMODEL_NATIVE) {
1259 struct timeval tv;
1260
1261 cmsg->cmsg_len = sizeof (struct timeval) +
1262 sizeof (struct cmsghdr);
1263 tv.tv_sec = timestamp->tv_sec;
1264 tv.tv_usec = timestamp->tv_nsec /
1265 (NANOSEC / MICROSEC);
1266 /*
1267 * on LP64 systems, the struct timeval in
1268 * the destination will not be 8-byte aligned,
1269 * so use bcopy to avoid alignment trouble
1270 */
1271 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv));
1272 } else {
1273 struct timeval32 *time32;
1274
1275 cmsg->cmsg_len = sizeof (struct timeval32) +
1276 sizeof (struct cmsghdr);
1277 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg);
1278 time32->tv_sec = (time32_t)timestamp->tv_sec;
1279 time32->tv_usec =
1280 (int32_t)(timestamp->tv_nsec /
1281 (NANOSEC / MICROSEC));
1282 }
1283
1284 } else {
1285 if (oldflg)
1286 continue;
1287
1288 cmsg->cmsg_level = tohp->level;
1289 cmsg->cmsg_type = tohp->name;
1290 cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) +
1291 sizeof (struct cmsghdr));
1292
1293 /* copy content to control data part */
1294 bcopy(&tohp[1], CMSG_CONTENT(cmsg),
1295 CMSG_CONTENTLEN(cmsg));
1296 }
1297 /* move to next CMSG structure! */
1298 cmsg = CMSG_NEXT(cmsg);
1299 }
1300 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n",
1301 control, controllen, (void *)cend, (void *)cmsg));
1302 ASSERT(cmsg <= cend);
1303 return (0);
1304 }
1305
1306 /*
1307 * Extract the SO_SRCADDR option value if present.
1308 */
1309 void
1310 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp,
1311 t_uscalar_t *srclenp)
1312 {
1313 struct T_opthdr *tohp;
1314
1315 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1316
1317 ASSERT(srcp != NULL && srclenp != NULL);
1318 *srcp = NULL;
1319 *srclenp = 0;
1320
1321 for (tohp = (struct T_opthdr *)opt;
1322 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1323 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1324 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n",
1325 tohp->level, tohp->name, tohp->len));
1326 if (tohp->level == SOL_SOCKET &&
1327 tohp->name == SO_SRCADDR) {
1328 *srcp = _TPI_TOPT_DATA(tohp);
1329 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1330 }
1331 }
1332 }
1333
1334 /*
1335 * Verify if the SO_UNIX_CLOSE option is present.
1336 */
1337 int
1338 so_getopt_unix_close(void *opt, t_uscalar_t optlen)
1339 {
1340 struct T_opthdr *tohp;
1341
1342 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1343
1344 for (tohp = (struct T_opthdr *)opt;
1345 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1346 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1347 dprint(1,
1348 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n",
1349 tohp->level, tohp->name, tohp->len));
1350 if (tohp->level == SOL_SOCKET &&
1351 tohp->name == SO_UNIX_CLOSE)
1352 return (1);
1353 }
1354 return (0);
1355 }
1356
1357 /*
1358 * Allocate an M_PROTO message.
1359 *
1360 * If allocation fails the behavior depends on sleepflg:
1361 * _ALLOC_NOSLEEP fail immediately
1362 * _ALLOC_INTR sleep for memory until a signal is caught
1363 * _ALLOC_SLEEP sleep forever. Don't return NULL.
1364 */
1365 mblk_t *
1366 soallocproto(size_t size, int sleepflg, cred_t *cr)
1367 {
1368 mblk_t *mp;
1369
1370 /* Round up size for reuse */
1371 size = MAX(size, 64);
1372 if (cr != NULL)
1373 mp = allocb_cred(size, cr, curproc->p_pid);
1374 else
1375 mp = allocb(size, BPRI_MED);
1376
1377 if (mp == NULL) {
1378 int error; /* Dummy - error not returned to caller */
1379
1380 switch (sleepflg) {
1381 case _ALLOC_SLEEP:
1382 if (cr != NULL) {
1383 mp = allocb_cred_wait(size, STR_NOSIG, &error,
1384 cr, curproc->p_pid);
1385 } else {
1386 mp = allocb_wait(size, BPRI_MED, STR_NOSIG,
1387 &error);
1388 }
1389 ASSERT(mp);
1390 break;
1391 case _ALLOC_INTR:
1392 if (cr != NULL) {
1393 mp = allocb_cred_wait(size, 0, &error, cr,
1394 curproc->p_pid);
1395 } else {
1396 mp = allocb_wait(size, BPRI_MED, 0, &error);
1397 }
1398 if (mp == NULL) {
1399 /* Caught signal while sleeping for memory */
1400 eprintline(ENOBUFS);
1401 return (NULL);
1402 }
1403 break;
1404 case _ALLOC_NOSLEEP:
1405 default:
1406 eprintline(ENOBUFS);
1407 return (NULL);
1408 }
1409 }
1410 DB_TYPE(mp) = M_PROTO;
1411 return (mp);
1412 }
1413
1414 /*
1415 * Allocate an M_PROTO message with a single component.
1416 * len is the length of buf. size is the amount to allocate.
1417 *
1418 * buf can be NULL with a non-zero len.
1419 * This results in a bzero'ed chunk being placed the message.
1420 */
1421 mblk_t *
1422 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg,
1423 cred_t *cr)
1424 {
1425 mblk_t *mp;
1426
1427 if (size == 0)
1428 size = len;
1429
1430 ASSERT(size >= len);
1431 /* Round up size for reuse */
1432 size = MAX(size, 64);
1433 mp = soallocproto(size, sleepflg, cr);
1434 if (mp == NULL)
1435 return (NULL);
1436 mp->b_datap->db_type = M_PROTO;
1437 if (len != 0) {
1438 if (buf != NULL)
1439 bcopy(buf, mp->b_wptr, len);
1440 else
1441 bzero(mp->b_wptr, len);
1442 mp->b_wptr += len;
1443 }
1444 return (mp);
1445 }
1446
1447 /*
1448 * Append buf/len to mp.
1449 * The caller has to ensure that there is enough room in the mblk.
1450 *
1451 * buf can be NULL with a non-zero len.
1452 * This results in a bzero'ed chunk being placed the message.
1453 */
1454 void
1455 soappendmsg(mblk_t *mp, const void *buf, ssize_t len)
1456 {
1457 ASSERT(mp);
1458
1459 if (len != 0) {
1460 /* Assert for room left */
1461 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len);
1462 if (buf != NULL)
1463 bcopy(buf, mp->b_wptr, len);
1464 else
1465 bzero(mp->b_wptr, len);
1466 }
1467 mp->b_wptr += len;
1468 }
1469
1470 /*
1471 * Create a message using two kernel buffers.
1472 * If size is set that will determine the allocation size (e.g. for future
1473 * soappendmsg calls). If size is zero it is derived from the buffer
1474 * lengths.
1475 */
1476 mblk_t *
1477 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1478 ssize_t size, int sleepflg, cred_t *cr)
1479 {
1480 mblk_t *mp;
1481
1482 if (size == 0)
1483 size = len1 + len2;
1484 ASSERT(size >= len1 + len2);
1485
1486 mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1487 if (mp)
1488 soappendmsg(mp, buf2, len2);
1489 return (mp);
1490 }
1491
1492 /*
1493 * Create a message using three kernel buffers.
1494 * If size is set that will determine the allocation size (for future
1495 * soappendmsg calls). If size is zero it is derived from the buffer
1496 * lengths.
1497 */
1498 mblk_t *
1499 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1500 const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr)
1501 {
1502 mblk_t *mp;
1503
1504 if (size == 0)
1505 size = len1 + len2 +len3;
1506 ASSERT(size >= len1 + len2 + len3);
1507
1508 mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1509 if (mp != NULL) {
1510 soappendmsg(mp, buf2, len2);
1511 soappendmsg(mp, buf3, len3);
1512 }
1513 return (mp);
1514 }
1515
1516 #ifdef DEBUG
1517 char *
1518 pr_state(uint_t state, uint_t mode)
1519 {
1520 static char buf[1024];
1521
1522 buf[0] = 0;
1523 if (state & SS_ISCONNECTED)
1524 (void) strcat(buf, "ISCONNECTED ");
1525 if (state & SS_ISCONNECTING)
1526 (void) strcat(buf, "ISCONNECTING ");
1527 if (state & SS_ISDISCONNECTING)
1528 (void) strcat(buf, "ISDISCONNECTING ");
1529 if (state & SS_CANTSENDMORE)
1530 (void) strcat(buf, "CANTSENDMORE ");
1531
1532 if (state & SS_CANTRCVMORE)
1533 (void) strcat(buf, "CANTRCVMORE ");
1534 if (state & SS_ISBOUND)
1535 (void) strcat(buf, "ISBOUND ");
1536 if (state & SS_NDELAY)
1537 (void) strcat(buf, "NDELAY ");
1538 if (state & SS_NONBLOCK)
1539 (void) strcat(buf, "NONBLOCK ");
1540
1541 if (state & SS_ASYNC)
1542 (void) strcat(buf, "ASYNC ");
1543 if (state & SS_ACCEPTCONN)
1544 (void) strcat(buf, "ACCEPTCONN ");
1545 if (state & SS_SAVEDEOR)
1546 (void) strcat(buf, "SAVEDEOR ");
1547
1548 if (state & SS_RCVATMARK)
1549 (void) strcat(buf, "RCVATMARK ");
1550 if (state & SS_OOBPEND)
1551 (void) strcat(buf, "OOBPEND ");
1552 if (state & SS_HAVEOOBDATA)
1553 (void) strcat(buf, "HAVEOOBDATA ");
1554 if (state & SS_HADOOBDATA)
1555 (void) strcat(buf, "HADOOBDATA ");
1556
1557 if (mode & SM_PRIV)
1558 (void) strcat(buf, "PRIV ");
1559 if (mode & SM_ATOMIC)
1560 (void) strcat(buf, "ATOMIC ");
1561 if (mode & SM_ADDR)
1562 (void) strcat(buf, "ADDR ");
1563 if (mode & SM_CONNREQUIRED)
1564 (void) strcat(buf, "CONNREQUIRED ");
1565
1566 if (mode & SM_FDPASSING)
1567 (void) strcat(buf, "FDPASSING ");
1568 if (mode & SM_EXDATA)
1569 (void) strcat(buf, "EXDATA ");
1570 if (mode & SM_OPTDATA)
1571 (void) strcat(buf, "OPTDATA ");
1572 if (mode & SM_BYTESTREAM)
1573 (void) strcat(buf, "BYTESTREAM ");
1574 return (buf);
1575 }
1576
1577 char *
1578 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen)
1579 {
1580 static char buf[1024];
1581
1582 if (addr == NULL || addrlen == 0) {
1583 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr);
1584 return (buf);
1585 }
1586 switch (family) {
1587 case AF_INET: {
1588 struct sockaddr_in sin;
1589
1590 bcopy(addr, &sin, sizeof (sin));
1591
1592 (void) sprintf(buf, "(len %d) %x/%d",
1593 addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1594 break;
1595 }
1596 case AF_INET6: {
1597 struct sockaddr_in6 sin6;
1598 uint16_t *piece = (uint16_t *)&sin6.sin6_addr;
1599
1600 bcopy((char *)addr, (char *)&sin6, sizeof (sin6));
1601 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d",
1602 addrlen,
1603 ntohs(piece[0]), ntohs(piece[1]),
1604 ntohs(piece[2]), ntohs(piece[3]),
1605 ntohs(piece[4]), ntohs(piece[5]),
1606 ntohs(piece[6]), ntohs(piece[7]),
1607 ntohs(sin6.sin6_port));
1608 break;
1609 }
1610 case AF_UNIX: {
1611 struct sockaddr_un *soun = (struct sockaddr_un *)addr;
1612
1613 (void) sprintf(buf, "(len %d) %s", addrlen,
1614 (soun == NULL) ? "(none)" : soun->sun_path);
1615 break;
1616 }
1617 default:
1618 (void) sprintf(buf, "(unknown af %d)", family);
1619 break;
1620 }
1621 return (buf);
1622 }
1623
1624 /* The logical equivalence operator (a if-and-only-if b) */
1625 #define EQUIVALENT(a, b) (((a) && (b)) || (!(a) && (!(b))))
1626
1627 /*
1628 * Verify limitations and invariants on oob state.
1629 * Return 1 if OK, otherwise 0 so that it can be used as
1630 * ASSERT(verify_oobstate(so));
1631 */
1632 int
1633 so_verify_oobstate(struct sonode *so)
1634 {
1635 boolean_t havemark;
1636
1637 ASSERT(MUTEX_HELD(&so->so_lock));
1638
1639 /*
1640 * The possible state combinations are:
1641 * 0
1642 * SS_OOBPEND
1643 * SS_OOBPEND|SS_HAVEOOBDATA
1644 * SS_OOBPEND|SS_HADOOBDATA
1645 * SS_HADOOBDATA
1646 */
1647 switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) {
1648 case 0:
1649 case SS_OOBPEND:
1650 case SS_OOBPEND|SS_HAVEOOBDATA:
1651 case SS_OOBPEND|SS_HADOOBDATA:
1652 case SS_HADOOBDATA:
1653 break;
1654 default:
1655 printf("Bad oob state 1 (%p): state %s\n",
1656 (void *)so, pr_state(so->so_state, so->so_mode));
1657 return (0);
1658 }
1659
1660 /* SS_RCVATMARK should only be set when SS_OOBPEND is set */
1661 if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) {
1662 printf("Bad oob state 2 (%p): state %s\n",
1663 (void *)so, pr_state(so->so_state, so->so_mode));
1664 return (0);
1665 }
1666
1667 /*
1668 * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND
1669 * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt.
1670 */
1671 havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 :
1672 SOTOTPI(so)->sti_oobsigcnt > 0;
1673
1674 if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK),
1675 so->so_state & SS_OOBPEND)) {
1676 printf("Bad oob state 3 (%p): state %s\n",
1677 (void *)so, pr_state(so->so_state, so->so_mode));
1678 return (0);
1679 }
1680
1681 /*
1682 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA
1683 */
1684 if (!(so->so_options & SO_OOBINLINE) &&
1685 !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) {
1686 printf("Bad oob state 4 (%p): state %s\n",
1687 (void *)so, pr_state(so->so_state, so->so_mode));
1688 return (0);
1689 }
1690
1691 if (!SOCK_IS_NONSTR(so) &&
1692 SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) {
1693 printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
1694 (void *)so, SOTOTPI(so)->sti_oobsigcnt,
1695 SOTOTPI(so)->sti_oobcnt,
1696 pr_state(so->so_state, so->so_mode));
1697 return (0);
1698 }
1699
1700 return (1);
1701 }
1702 #undef EQUIVALENT
1703 #endif /* DEBUG */
1704
1705 /* initialize sockfs zone specific kstat related items */
1706 void *
1707 sock_kstat_init(zoneid_t zoneid)
1708 {
1709 kstat_t *ksp;
1710
1711 ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc",
1712 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid);
1713
1714 if (ksp != NULL) {
1715 ksp->ks_update = sockfs_update;
1716 ksp->ks_snapshot = sockfs_snapshot;
1717 ksp->ks_lock = &socklist.sl_lock;
1718 ksp->ks_private = (void *)(uintptr_t)zoneid;
1719 kstat_install(ksp);
1720 }
1721
1722 return (ksp);
1723 }
1724
1725 /* tear down sockfs zone specific kstat related items */
1726 /*ARGSUSED*/
1727 void
1728 sock_kstat_fini(zoneid_t zoneid, void *arg)
1729 {
1730 kstat_t *ksp = (kstat_t *)arg;
1731
1732 if (ksp != NULL) {
1733 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private);
1734 kstat_delete(ksp);
1735 }
1736 }
1737
1738 /*
1739 * Zones:
1740 * Note that nactive is going to be different for each zone.
1741 * This means we require kstat to call sockfs_update and then sockfs_snapshot
1742 * for the same zone, or sockfs_snapshot will be taken into the wrong size
1743 * buffer. This is safe, but if the buffer is too small, user will not be
1744 * given details of all sockets. However, as this kstat has a ks_lock, kstat
1745 * driver will keep it locked between the update and the snapshot, so no
1746 * other process (zone) can currently get inbetween resulting in a wrong size
1747 * buffer allocation.
1748 */
1749 static int
1750 sockfs_update(kstat_t *ksp, int rw)
1751 {
1752 uint_t nactive = 0; /* # of active AF_UNIX sockets */
1753 struct sonode *so; /* current sonode on socklist */
1754 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1755
1756 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1757
1758 if (rw == KSTAT_WRITE) { /* bounce all writes */
1759 return (EACCES);
1760 }
1761
1762 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1763 if (so->so_count != 0 && so->so_zoneid == myzoneid) {
1764 nactive++;
1765 }
1766 }
1767 ksp->ks_ndata = nactive;
1768 ksp->ks_data_size = nactive * sizeof (struct k_sockinfo);
1769
1770 return (0);
1771 }
1772
1773 static int
1774 sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
1775 {
1776 int ns; /* # of sonodes we've copied */
1777 struct sonode *so; /* current sonode on socklist */
1778 struct k_sockinfo *pksi; /* where we put sockinfo data */
1779 t_uscalar_t sn_len; /* soa_len */
1780 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1781 sotpi_info_t *sti;
1782
1783 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1784
1785 ksp->ks_snaptime = gethrtime();
1786
1787 if (rw == KSTAT_WRITE) { /* bounce all writes */
1788 return (EACCES);
1789 }
1790
1791 /*
1792 * for each sonode on the socklist, we massage the important
1793 * info into buf, in k_sockinfo format.
1794 */
1795 pksi = (struct k_sockinfo *)buf;
1796 ns = 0;
1797 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1798 /* only stuff active sonodes and the same zone: */
1799 if (so->so_count == 0 || so->so_zoneid != myzoneid) {
1800 continue;
1801 }
1802
1803 /*
1804 * If the sonode was activated between the update and the
1805 * snapshot, we're done - as this is only a snapshot.
1806 */
1807 if ((caddr_t)(pksi) >= (caddr_t)buf + ksp->ks_data_size) {
1808 break;
1809 }
1810
1811 sti = SOTOTPI(so);
1812 /* copy important info into buf: */
1813 pksi->ks_si.si_size = sizeof (struct k_sockinfo);
1814 pksi->ks_si.si_family = so->so_family;
1815 pksi->ks_si.si_type = so->so_type;
1816 pksi->ks_si.si_flag = so->so_flag;
1817 pksi->ks_si.si_state = so->so_state;
1818 pksi->ks_si.si_serv_type = sti->sti_serv_type;
1819 pksi->ks_si.si_ux_laddr_sou_magic =
1820 sti->sti_ux_laddr.soua_magic;
1821 pksi->ks_si.si_ux_faddr_sou_magic =
1822 sti->sti_ux_faddr.soua_magic;
1823 pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len;
1824 pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len;
1825 pksi->ks_si.si_szoneid = so->so_zoneid;
1826 pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate;
1827
1828 mutex_enter(&so->so_lock);
1829
1830 if (sti->sti_laddr_sa != NULL) {
1831 ASSERT(sti->sti_laddr_sa->sa_data != NULL);
1832 sn_len = sti->sti_laddr_len;
1833 ASSERT(sn_len <= sizeof (short) +
1834 sizeof (pksi->ks_si.si_laddr_sun_path));
1835
1836 pksi->ks_si.si_laddr_family =
1837 sti->sti_laddr_sa->sa_family;
1838 if (sn_len != 0) {
1839 /* AF_UNIX socket names are NULL terminated */
1840 (void) strncpy(pksi->ks_si.si_laddr_sun_path,
1841 sti->sti_laddr_sa->sa_data,
1842 sizeof (pksi->ks_si.si_laddr_sun_path));
1843 sn_len = strlen(pksi->ks_si.si_laddr_sun_path);
1844 }
1845 pksi->ks_si.si_laddr_sun_path[sn_len] = 0;
1846 }
1847
1848 if (sti->sti_faddr_sa != NULL) {
1849 ASSERT(sti->sti_faddr_sa->sa_data != NULL);
1850 sn_len = sti->sti_faddr_len;
1851 ASSERT(sn_len <= sizeof (short) +
1852 sizeof (pksi->ks_si.si_faddr_sun_path));
1853
1854 pksi->ks_si.si_faddr_family =
1855 sti->sti_faddr_sa->sa_family;
1856 if (sn_len != 0) {
1857 (void) strncpy(pksi->ks_si.si_faddr_sun_path,
1858 sti->sti_faddr_sa->sa_data,
1859 sizeof (pksi->ks_si.si_faddr_sun_path));
1860 sn_len = strlen(pksi->ks_si.si_faddr_sun_path);
1861 }
1862 pksi->ks_si.si_faddr_sun_path[sn_len] = 0;
1863 }
1864
1865 mutex_exit(&so->so_lock);
1866
1867 (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so);
1868 (void) sprintf(pksi->ks_straddr[1], "%p",
1869 (void *)sti->sti_ux_laddr.soua_vp);
1870 (void) sprintf(pksi->ks_straddr[2], "%p",
1871 (void *)sti->sti_ux_faddr.soua_vp);
1872
1873 ns++;
1874 pksi++;
1875 }
1876
1877 ksp->ks_ndata = ns;
1878 return (0);
1879 }
1880
1881 ssize_t
1882 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
1883 {
1884 struct uio auio;
1885 struct iovec aiov[1];
1886 register vnode_t *vp;
1887 int ioflag, rwflag;
1888 ssize_t cnt;
1889 int error = 0;
1890 int iovcnt = 0;
1891 short fflag;
1892
1893 vp = fp->f_vnode;
1894 fflag = fp->f_flag;
1895
1896 rwflag = 0;
1897 aiov[0].iov_base = (caddr_t)buf;
1898 aiov[0].iov_len = size;
1899 iovcnt = 1;
1900 cnt = (ssize_t)size;
1901 (void) VOP_RWLOCK(vp, rwflag, NULL);
1902
1903 auio.uio_loffset = fileoff;
1904 auio.uio_iov = aiov;
1905 auio.uio_iovcnt = iovcnt;
1906 auio.uio_resid = cnt;
1907 auio.uio_segflg = UIO_SYSSPACE;
1908 auio.uio_llimit = MAXOFFSET_T;
1909 auio.uio_fmode = fflag;
1910 auio.uio_extflg = UIO_COPY_CACHED;
1911
1912 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1913
1914 /* If read sync is not asked for, filter sync flags */
1915 if ((ioflag & FRSYNC) == 0)
1916 ioflag &= ~(FSYNC|FDSYNC);
1917 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1918 cnt -= auio.uio_resid;
1919
1920 VOP_RWUNLOCK(vp, rwflag, NULL);
1921
1922 if (error == EINTR && cnt != 0)
1923 error = 0;
1924 out:
1925 if (error != 0) {
1926 *err = error;
1927 return (0);
1928 } else {
1929 *err = 0;
1930 return (cnt);
1931 }
1932 }
1933
1934 int
1935 so_copyin(const void *from, void *to, size_t size, int fromkernel)
1936 {
1937 if (fromkernel) {
1938 bcopy(from, to, size);
1939 return (0);
1940 }
1941 return (xcopyin(from, to, size));
1942 }
1943
1944 int
1945 so_copyout(const void *from, void *to, size_t size, int tokernel)
1946 {
1947 if (tokernel) {
1948 bcopy(from, to, size);
1949 return (0);
1950 }
1951 return (xcopyout(from, to, size));
1952 }