1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2022 MNX Cloud, Inc.
27 */
28
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/kmem_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/debug.h>
42 #include <sys/errno.h>
43 #include <sys/time.h>
44 #include <sys/file.h>
45 #include <sys/open.h>
46 #include <sys/user.h>
47 #include <sys/termios.h>
48 #include <sys/stream.h>
49 #include <sys/strsubr.h>
50 #include <sys/strsun.h>
51 #include <sys/suntpi.h>
52 #include <sys/ddi.h>
53 #include <sys/esunddi.h>
54 #include <sys/flock.h>
55 #include <sys/modctl.h>
56 #include <sys/vtrace.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathname.h>
59
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sockio.h>
63 #include <netinet/in.h>
64 #include <sys/un.h>
65 #include <sys/strsun.h>
66
67 #include <sys/tiuser.h>
68 #define _SUN_TPI_VERSION 2
69 #include <sys/tihdr.h>
70 #include <sys/timod.h> /* TI_GETMYNAME, TI_GETPEERNAME */
71
72 #include <c2/audit.h>
73
74 #include <inet/common.h>
75 #include <inet/ip.h>
76 #include <inet/ip6.h>
77 #include <inet/tcp.h>
78 #include <inet/udp_impl.h>
79
80 #include <sys/zone.h>
81
82 #include <fs/sockfs/nl7c.h>
83 #include <fs/sockfs/nl7curi.h>
84
85 #include <fs/sockfs/sockcommon.h>
86 #include <fs/sockfs/socktpi.h>
87 #include <fs/sockfs/socktpi_impl.h>
88
89 /*
90 * Possible failures when memory can't be allocated. The documented behavior:
91 *
 *		5.5:			4.X:		XNET:
 * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
 *							EINTR
 *	(4.X does not document EINTR but returns it)
 * bind:	ENOSR			-		ENOBUFS/ENOSR
 * connect:	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
 * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
 * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
 *	(4.X getpeername and getsockname do not fail in practice)
 * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
 * listen:	-			-		ENOBUFS
 * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
 *							EINTR
 * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
 *							EINTR
 * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
 * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
 * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
 * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
111 *
112 * Resolution. When allocation fails:
113 * recv: return EINTR
114 * send: return EINTR
115 * connect, accept: EINTR
116 * bind, listen, shutdown (unbind, unix_close, disconnect): sleep
117 * socket, socketpair: ENOBUFS
118 * getpeername, getsockname: sleep
119 * getsockopt, setsockopt: sleep
120 */
121
#ifdef SOCK_TEST
/*
 * Test-only knobs that make sockfs deviate from standard TPI behavior
 * for the AF_INET transports. In non-test builds they are compile-time
 * constants (see the #else branch).
 *
 * solisten_tpi_tcp:
 *	TCP can handle an O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by an
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.
 *	This is needed to make the sequence of connect(), getsockname()
 *	return the local IP address used to send packets to the connected to
 *	destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing an O_T_BIND_REQ.
 *	Set this to non-zero to send TPI conformant messages to TCP in this
 *	respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When inheriting SOL_SOCKET options from the listener to the accepting
 *	socket send them as a single message for AF_INET{,6}.
 */
int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;

/* Additional hooks that only exist in SOCK_TEST builds. */
extern int do_useracc;
extern clock_t sock_test_timelimit;

#else	/* SOCK_TEST */

/* Production builds: the knobs above collapse to constants. */
#define	solisten_tpi_tcp		0
#define	soconnect_tpi_udp		0
#define	soconnect_tpi_tcp		0
#define	soaccept_tpi_tcp		0
#define	soaccept_tpi_multioptions	1

#endif	/* SOCK_TEST */
169
170 extern uint32_t ucredsize;
171
172 /*
173 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
174 * applications working. Turn on this flag to disable these checks.
175 */
176 int xnet_skip_checks = 0;
177 int xnet_check_print = 0;
178 int xnet_truncate_print = 0;
179
180 static void sotpi_destroy(struct sonode *);
181 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
182 int, int *, cred_t *cr);
183
184 static boolean_t sotpi_info_create(struct sonode *, int);
185 static void sotpi_info_init(struct sonode *);
186 static void sotpi_info_fini(struct sonode *);
187 static void sotpi_info_destroy(struct sonode *);
188
189 /*
190 * Do direct function call to the transport layer below; this would
191 * also allow the transport to utilize read-side synchronous stream
192 * interface if necessary. This is a /etc/system tunable that must
193 * not be modified on a running system. By default this is enabled
194 * for performance reasons and may be disabled for debugging purposes.
195 */
196 boolean_t socktpi_direct = B_TRUE;
197
198 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
199
200 extern void sigintr(k_sigset_t *, int);
201 extern void sigunintr(k_sigset_t *);
202
203 static int sotpi_unbind(struct sonode *, int);
204
205 /* TPI sockfs sonode operations */
206 int sotpi_init(struct sonode *, struct sonode *, struct cred *,
207 int);
208 static int sotpi_accept(struct sonode *, int, struct cred *,
209 struct sonode **);
210 static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
211 int, struct cred *);
212 static int sotpi_listen(struct sonode *, int, struct cred *);
213 static int sotpi_connect(struct sonode *, struct sockaddr *,
214 socklen_t, int, int, struct cred *);
215 extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
216 struct uio *, struct cred *);
217 static int sotpi_sendmsg(struct sonode *, struct nmsghdr *,
218 struct uio *, struct cred *);
219 static int sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
220 struct cred *, mblk_t **);
221 static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
222 struct uio *, void *, t_uscalar_t, int);
223 static int sodgram_direct(struct sonode *, struct sockaddr *,
224 socklen_t, struct uio *, int);
225 extern int sotpi_getpeername(struct sonode *, struct sockaddr *,
226 socklen_t *, boolean_t, struct cred *);
227 static int sotpi_getsockname(struct sonode *, struct sockaddr *,
228 socklen_t *, struct cred *);
229 static int sotpi_shutdown(struct sonode *, int, struct cred *);
230 extern int sotpi_getsockopt(struct sonode *, int, int, void *,
231 socklen_t *, int, struct cred *);
232 extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
233 socklen_t, struct cred *);
234 static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
235 int32_t *);
236 static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
237 struct cred *, int32_t *);
238 static int sotpi_poll(struct sonode *, short, int, short *,
239 struct pollhead **);
240 static int sotpi_close(struct sonode *, int, struct cred *);
241
242 static int i_sotpi_info_constructor(sotpi_info_t *);
243 static void i_sotpi_info_destructor(sotpi_info_t *);
244
/*
 * Operations vector for TPI-backed sockets. The initializer is positional,
 * so the entries must stay in the member order of sonodeops_t; each entry
 * is annotated with the member it is meant to fill.
 */
sonodeops_t sotpi_sonodeops = {
	sotpi_init,		/* sop_init		*/
	sotpi_accept,		/* sop_accept		*/
	sotpi_bind,		/* sop_bind		*/
	sotpi_listen,		/* sop_listen		*/
	sotpi_connect,		/* sop_connect		*/
	sotpi_recvmsg,		/* sop_recvmsg		*/
	sotpi_sendmsg,		/* sop_sendmsg		*/
	sotpi_sendmblk,		/* sop_sendmblk		*/
	sotpi_getpeername,	/* sop_getpeername	*/
	sotpi_getsockname,	/* sop_getsockname	*/
	sotpi_shutdown,		/* sop_shutdown		*/
	sotpi_getsockopt,	/* sop_getsockopt	*/
	sotpi_setsockopt,	/* sop_setsockopt	*/
	sotpi_ioctl,		/* sop_ioctl		*/
	sotpi_poll,		/* sop_poll		*/
	sotpi_close,		/* sop_close		*/
};
263
264 /*
265 * Post-close reality check for NULL v_stream...
266 *
267 * Kernel callers (e.g. in procfs) may attempt socket operations, after
268 * holding the vnode, after it has been closed. For TPI sockets, post-close
269 * operations will have a NULL v_stream (which all functions here assume
270 * or even ASSERT() is non-NULL). See sotpi_close for where we wipe it out.
271 *
272 * If we are in a state where we lost a race to close(), we need to stop ASAP,
273 * and return the acceptable-as-an-errno EBADF. Because cleanup may be
274 * required, this macro only checks the v_stream.
275 *
276 * Checking should only be relevant for in-kernel other-thread inspectors.
277 * Userland ones (i.e. same process that opened the socktpi socket) SHOULD be
278 * protected by higher-level mechanisms. The only in-kernel inspector in the
279 * source base is procfs, which only accesses get{sockname,peername,sockopt}().
280 */
281 #define SOTPI_VN_NOSTREAM(vn) ((vn)->v_stream == NULL)
282
283 /*
284 * Return a TPI socket vnode.
285 *
286 * Note that sockets assume that the driver will clone (either itself
287 * or by using the clone driver) i.e. a socket() call will always
288 * result in a new vnode being created.
289 */
290
291 /*
292 * Common create code for socket and accept. If tso is set the values
293 * from that node is used instead of issuing a T_INFO_REQ.
294 */
295
296 /* ARGSUSED */
297 static struct sonode *
298 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
299 int version, int sflags, int *errorp, cred_t *cr)
300 {
301 struct sonode *so;
302 kmem_cache_t *cp;
303 int sfamily = family;
304
305 ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
306
307 if (family == AF_NCA) {
308 /*
309 * The request is for an NCA socket so for NL7C use the
310 * INET domain instead and mark NL7C_AF_NCA below.
311 */
312 family = AF_INET;
313 /*
314 * NL7C is not supported in the non-global zone,
315 * we enforce this restriction here.
316 */
317 if (getzoneid() != GLOBAL_ZONEID) {
318 *errorp = ENOTSUP;
319 return (NULL);
320 }
321 }
322
323 /*
324 * to be compatible with old tpi socket implementation ignore
325 * sleep flag (sflags) passed in
326 */
327 cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
328 so = kmem_cache_alloc(cp, KM_SLEEP);
329 if (so == NULL) {
330 *errorp = ENOMEM;
331 return (NULL);
332 }
333
334 sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
335 sotpi_info_init(so);
336
337 if (sfamily == AF_NCA) {
338 SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
339 }
340
341 if (version == SOV_DEFAULT)
342 version = so_default_version;
343
344 so->so_version = (short)version;
345 *errorp = 0;
346
347 return (so);
348 }
349
350 static void
351 sotpi_destroy(struct sonode *so)
352 {
353 kmem_cache_t *cp;
354 struct sockparams *origsp;
355
356 /*
357 * If there is a new dealloc function (ie. smod_destroy_func),
358 * then it should check the correctness of the ops.
359 */
360
361 ASSERT(so->so_ops == &sotpi_sonodeops);
362
363 origsp = SOTOTPI(so)->sti_orig_sp;
364
365 sotpi_info_fini(so);
366
367 if (so->so_state & SS_FALLBACK_COMP) {
368 /*
369 * A fallback happend, which means that a sotpi_info_t struct
370 * was allocated (as opposed to being allocated from the TPI
371 * sonode cache. Therefore we explicitly free the struct
372 * here.
373 */
374 sotpi_info_destroy(so);
375 ASSERT(origsp != NULL);
376
377 origsp->sp_smod_info->smod_sock_destroy_func(so);
378 SOCKPARAMS_DEC_REF(origsp);
379 } else {
380 sonode_fini(so);
381 cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
382 socktpi_cache;
383 kmem_cache_free(cp, so);
384 }
385 }
386
387 /* ARGSUSED1 */
388 int
389 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
390 {
391 major_t maj;
392 dev_t newdev;
393 struct vnode *vp;
394 int error = 0;
395 struct stdata *stp;
396
397 sotpi_info_t *sti = SOTOTPI(so);
398
399 dprint(1, ("sotpi_init()\n"));
400
401 /*
402 * over write the sleep flag passed in but that is ok
403 * as tpi socket does not honor sleep flag.
404 */
405 flags |= FREAD|FWRITE;
406
407 /*
408 * Record in so_flag that it is a clone.
409 */
410 if (getmajor(sti->sti_dev) == clone_major)
411 so->so_flag |= SOCLONE;
412
413 if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
414 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
415 (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
416 so->so_protocol == IPPROTO_IP)) {
417 /* Tell tcp or udp that it's talking to sockets */
418 flags |= SO_SOCKSTR;
419
420 /*
421 * Here we indicate to socktpi_open() our attempt to
422 * make direct calls between sockfs and transport.
423 * The final decision is left to socktpi_open().
424 */
425 sti->sti_direct = 1;
426
427 ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
428 if (so->so_type == SOCK_STREAM && tso != NULL) {
429 if (SOTOTPI(tso)->sti_direct) {
430 /*
431 * Inherit sti_direct from listener and pass
432 * SO_ACCEPTOR open flag to tcp, indicating
433 * that this is an accept fast-path instance.
434 */
435 flags |= SO_ACCEPTOR;
436 } else {
437 /*
438 * sti_direct is not set on listener, meaning
439 * that the listener has been converted from
440 * a socket to a stream. Ensure that the
441 * acceptor inherits these settings.
442 */
443 sti->sti_direct = 0;
444 flags &= ~SO_SOCKSTR;
445 }
446 }
447 }
448
449 /*
450 * Tell local transport that it is talking to sockets.
451 */
452 if (so->so_family == AF_UNIX) {
453 flags |= SO_SOCKSTR;
454 }
455
456 vp = SOTOV(so);
457 newdev = vp->v_rdev;
458 maj = getmajor(newdev);
459 ASSERT(STREAMSTAB(maj));
460
461 error = stropen(vp, &newdev, flags, cr);
462
463 stp = vp->v_stream;
464 if (error == 0) {
465 if (so->so_flag & SOCLONE)
466 ASSERT(newdev != vp->v_rdev);
467 mutex_enter(&so->so_lock);
468 sti->sti_dev = newdev;
469 vp->v_rdev = newdev;
470 mutex_exit(&so->so_lock);
471
472 if (stp->sd_flag & STRISTTY) {
473 /*
474 * this is a post SVR4 tty driver - a socket can not
475 * be a controlling terminal. Fail the open.
476 */
477 (void) sotpi_close(so, flags, cr);
478 return (ENOTTY); /* XXX */
479 }
480
481 ASSERT(stp->sd_wrq != NULL);
482 sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
483
484 /*
485 * If caller is interested in doing direct function call
486 * interface to/from transport module, probe the module
487 * directly beneath the streamhead to see if it qualifies.
488 *
489 * We turn off the direct interface when qualifications fail.
490 * In the acceptor case, we simply turn off the sti_direct
491 * flag on the socket. We do the fallback after the accept
492 * has completed, before the new socket is returned to the
493 * application.
494 */
495 if (sti->sti_direct) {
496 queue_t *tq = stp->sd_wrq->q_next;
497
498 /*
499 * sti_direct is currently supported and tested
500 * only for tcp/udp; this is the main reason to
501 * have the following assertions.
502 */
503 ASSERT(so->so_family == AF_INET ||
504 so->so_family == AF_INET6);
505 ASSERT(so->so_protocol == IPPROTO_UDP ||
506 so->so_protocol == IPPROTO_TCP ||
507 so->so_protocol == IPPROTO_IP);
508 ASSERT(so->so_type == SOCK_DGRAM ||
509 so->so_type == SOCK_STREAM);
510
511 /*
512 * Abort direct call interface if the module directly
513 * underneath the stream head is not defined with the
514 * _D_DIRECT flag. This could happen in the tcp or
515 * udp case, when some other module is autopushed
516 * above it, or for some reasons the expected module
517 * isn't purely D_MP (which is the main requirement).
518 */
519 if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
520 !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
521 int rval;
522
523 /* Continue on without direct calls */
524 sti->sti_direct = 0;
525
526 /*
527 * Cannot issue ioctl on fallback socket since
528 * there is no conn associated with the queue.
529 * The fallback downcall will notify the proto
530 * of the change.
531 */
532 if (!(flags & SO_ACCEPTOR) &&
533 !(flags & SO_FALLBACK)) {
534 if ((error = strioctl(vp,
535 _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
536 cr, &rval)) != 0) {
537 (void) sotpi_close(so, flags,
538 cr);
539 return (error);
540 }
541 }
542 }
543 }
544
545 if (flags & SO_FALLBACK) {
546 /*
547 * The stream created does not have a conn.
548 * do stream set up after conn has been assigned
549 */
550 return (error);
551 }
552 if (error = so_strinit(so, tso)) {
553 (void) sotpi_close(so, flags, cr);
554 return (error);
555 }
556
557 /* Enable sendfile() on AF_UNIX streams */
558 if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
559 mutex_enter(&so->so_lock);
560 so->so_mode |= SM_SENDFILESUPP;
561 mutex_exit(&so->so_lock);
562 }
563
564 /* Wildcard */
565 if (so->so_protocol != so->so_sockparams->sp_protocol) {
566 int protocol = so->so_protocol;
567 /*
568 * Issue SO_PROTOTYPE setsockopt.
569 */
570 error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
571 &protocol, (t_uscalar_t)sizeof (protocol), cr);
572 if (error != 0) {
573 (void) sotpi_close(so, flags, cr);
574 /*
575 * Setsockopt often fails with ENOPROTOOPT but
576 * socket() should fail with
577 * EPROTONOSUPPORT/EPROTOTYPE.
578 */
579 return (EPROTONOSUPPORT);
580 }
581 }
582
583 } else {
584 /*
585 * While the same socket can not be reopened (unlike specfs)
586 * the stream head sets STREOPENFAIL when the autopush fails.
587 */
588 if ((stp != NULL) &&
589 (stp->sd_flag & STREOPENFAIL)) {
590 /*
591 * Open failed part way through.
592 */
593 mutex_enter(&stp->sd_lock);
594 stp->sd_flag &= ~STREOPENFAIL;
595 mutex_exit(&stp->sd_lock);
596 (void) sotpi_close(so, flags, cr);
597 return (error);
598 /*NOTREACHED*/
599 }
600 ASSERT(stp == NULL);
601 }
602 TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
603 "sockfs open:maj %d vp %p so %p error %d",
604 maj, vp, so, error);
605 return (error);
606 }
607
608 /*
609 * Bind the socket to an unspecified address in sockfs only.
610 * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
611 * required in all cases.
612 */
613 static void
614 so_automatic_bind(struct sonode *so)
615 {
616 sotpi_info_t *sti = SOTOTPI(so);
617 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
618
619 ASSERT(MUTEX_HELD(&so->so_lock));
620 ASSERT(!(so->so_state & SS_ISBOUND));
621 ASSERT(sti->sti_unbind_mp);
622
623 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
624 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
625 sti->sti_laddr_sa->sa_family = so->so_family;
626 so->so_state |= SS_ISBOUND;
627 }
628
629
630 /*
631 * bind the socket.
632 *
633 * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
634 * are passed in we allow rebinding. Note that for backwards compatibility
635 * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
636 * Thus the rebinding code is currently not executed.
637 *
638 * The constraints for rebinding are:
639 * - it is a SOCK_DGRAM, or
640 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
641 * and no listen() has been done.
642 * This rebinding code was added based on some language in the XNET book
 * about not returning EINVAL if the protocol allows rebinding. However,
644 * this language is not present in the Posix socket draft. Thus maybe the
645 * rebinding logic should be deleted from the source.
646 *
647 * A null "name" can be used to unbind the socket if:
648 * - it is a SOCK_DGRAM, or
649 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
650 * and no listen() has been done.
651 */
652 /* ARGSUSED */
653 static int
654 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
655 socklen_t namelen, int backlog, int flags, struct cred *cr)
656 {
657 struct T_bind_req bind_req;
658 struct T_bind_ack *bind_ack;
659 int error = 0;
660 mblk_t *mp;
661 void *addr;
662 t_uscalar_t addrlen;
663 int unbind_on_err = 1;
664 boolean_t clear_acceptconn_on_err = B_FALSE;
665 boolean_t restore_backlog_on_err = B_FALSE;
666 int save_so_backlog;
667 t_scalar_t PRIM_type = O_T_BIND_REQ;
668 boolean_t tcp_udp_xport;
669 void *nl7c = NULL;
670 sotpi_info_t *sti = SOTOTPI(so);
671
672 dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
673 (void *)so, (void *)name, namelen, backlog, flags,
674 pr_state(so->so_state, so->so_mode)));
675
676 tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
677
678 if (!(flags & _SOBIND_LOCK_HELD)) {
679 mutex_enter(&so->so_lock);
680 so_lock_single(so); /* Set SOLOCKED */
681 } else {
682 ASSERT(MUTEX_HELD(&so->so_lock));
683 ASSERT(so->so_flag & SOLOCKED);
684 }
685
686 /*
687 * Make sure that there is a preallocated unbind_req message
688 * before binding. This message allocated when the socket is
689 * created but it might be have been consumed.
690 */
691 if (sti->sti_unbind_mp == NULL) {
692 dprintso(so, 1, ("sobind: allocating unbind_req\n"));
693 /* NOTE: holding so_lock while sleeping */
694 sti->sti_unbind_mp =
695 soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
696 cr);
697 }
698
699 if (flags & _SOBIND_REBIND) {
700 /*
701 * Called from solisten after doing an sotpi_unbind() or
702 * potentially without the unbind (latter for AF_INET{,6}).
703 */
704 ASSERT(name == NULL && namelen == 0);
705
706 if (so->so_family == AF_UNIX) {
707 ASSERT(sti->sti_ux_bound_vp);
708 addr = &sti->sti_ux_laddr;
709 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
710 dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
711 "addr 0x%p, vp %p\n",
712 addrlen,
713 (void *)((struct so_ux_addr *)addr)->soua_vp,
714 (void *)sti->sti_ux_bound_vp));
715 } else {
716 addr = sti->sti_laddr_sa;
717 addrlen = (t_uscalar_t)sti->sti_laddr_len;
718 }
719 } else if (flags & _SOBIND_UNSPEC) {
720 ASSERT(name == NULL && namelen == 0);
721
722 /*
723 * The caller checked SS_ISBOUND but not necessarily
724 * under so_lock
725 */
726 if (so->so_state & SS_ISBOUND) {
727 /* No error */
728 goto done;
729 }
730
731 /* Set an initial local address */
732 switch (so->so_family) {
733 case AF_UNIX:
734 /*
735 * Use an address with same size as struct sockaddr
736 * just like BSD.
737 */
738 sti->sti_laddr_len =
739 (socklen_t)sizeof (struct sockaddr);
740 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
741 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
742 sti->sti_laddr_sa->sa_family = so->so_family;
743
744 /*
745 * Pass down an address with the implicit bind
746 * magic number and the rest all zeros.
747 * The transport will return a unique address.
748 */
749 sti->sti_ux_laddr.soua_vp = NULL;
750 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
751 addr = &sti->sti_ux_laddr;
752 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
753 break;
754
755 case AF_INET:
756 case AF_INET6:
757 /*
758 * An unspecified bind in TPI has a NULL address.
759 * Set the address in sockfs to have the sa_family.
760 */
761 sti->sti_laddr_len = (so->so_family == AF_INET) ?
762 (socklen_t)sizeof (sin_t) :
763 (socklen_t)sizeof (sin6_t);
764 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
765 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
766 sti->sti_laddr_sa->sa_family = so->so_family;
767 addr = NULL;
768 addrlen = 0;
769 break;
770
771 default:
772 /*
773 * An unspecified bind in TPI has a NULL address.
774 * Set the address in sockfs to be zero length.
775 *
776 * Can not assume there is a sa_family for all
777 * protocol families. For example, AF_X25 does not
778 * have a family field.
779 */
780 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
781 sti->sti_laddr_len = 0; /* XXX correct? */
782 addr = NULL;
783 addrlen = 0;
784 break;
785 }
786
787 } else {
788 if (so->so_state & SS_ISBOUND) {
789 /*
790 * If it is ok to rebind the socket, first unbind
791 * with the transport. A rebind to the NULL address
792 * is interpreted as an unbind.
793 * Note that a bind to NULL in BSD does unbind the
794 * socket but it fails with EINVAL.
795 * Note that regular sockets set SOV_SOCKBSD i.e.
796 * _SOBIND_SOCKBSD gets set here hence no type of
797 * socket does currently allow rebinding.
798 *
799 * If the name is NULL just do an unbind.
800 */
801 if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
802 name != NULL) {
803 error = EINVAL;
804 unbind_on_err = 0;
805 eprintsoline(so, error);
806 goto done;
807 }
808 if ((so->so_mode & SM_CONNREQUIRED) &&
809 (so->so_state & SS_CANTREBIND)) {
810 error = EINVAL;
811 unbind_on_err = 0;
812 eprintsoline(so, error);
813 goto done;
814 }
815 error = sotpi_unbind(so, 0);
816 if (error) {
817 eprintsoline(so, error);
818 goto done;
819 }
820 ASSERT(!(so->so_state & SS_ISBOUND));
821 if (name == NULL) {
822 so->so_state &=
823 ~(SS_ISCONNECTED|SS_ISCONNECTING);
824 goto done;
825 }
826 }
827
828 /* X/Open requires this check */
829 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
830 if (xnet_check_print) {
831 printf("sockfs: X/Open bind state check "
832 "caused EINVAL\n");
833 }
834 error = EINVAL;
835 goto done;
836 }
837
838 switch (so->so_family) {
839 case AF_UNIX:
840 /*
841 * All AF_UNIX addresses are nul terminated
842 * when copied (copyin_name) in so the minimum
843 * length is 3 bytes.
844 */
845 if (name == NULL ||
846 (ssize_t)namelen <= sizeof (short) + 1) {
847 error = EISDIR;
848 eprintsoline(so, error);
849 goto done;
850 }
851 /*
852 * Verify so_family matches the bound family.
853 * BSD does not check this for AF_UNIX resulting
854 * in funny mknods.
855 */
856 if (name->sa_family != so->so_family) {
857 error = EAFNOSUPPORT;
858 goto done;
859 }
860 break;
861 case AF_INET:
862 if (name == NULL) {
863 error = EINVAL;
864 eprintsoline(so, error);
865 goto done;
866 }
867 if ((size_t)namelen != sizeof (sin_t)) {
868 error = name->sa_family != so->so_family ?
869 EAFNOSUPPORT : EINVAL;
870 eprintsoline(so, error);
871 goto done;
872 }
873 if ((flags & _SOBIND_XPG4_2) &&
874 (name->sa_family != so->so_family)) {
875 /*
876 * This check has to be made for X/Open
877 * sockets however application failures have
878 * been observed when it is applied to
879 * all sockets.
880 */
881 error = EAFNOSUPPORT;
882 eprintsoline(so, error);
883 goto done;
884 }
885 /*
886 * Force a zero sa_family to match so_family.
887 *
888 * Some programs like inetd(8) don't set the
889 * family field. Other programs leave
890 * sin_family set to garbage - SunOS 4.X does
891 * not check the family field on a bind.
892 * We use the family field that
893 * was passed in to the socket() call.
894 */
895 name->sa_family = so->so_family;
896 break;
897
898 case AF_INET6: {
899 #ifdef DEBUG
900 sin6_t *sin6 = (sin6_t *)name;
901 #endif /* DEBUG */
902
903 if (name == NULL) {
904 error = EINVAL;
905 eprintsoline(so, error);
906 goto done;
907 }
908 if ((size_t)namelen != sizeof (sin6_t)) {
909 error = name->sa_family != so->so_family ?
910 EAFNOSUPPORT : EINVAL;
911 eprintsoline(so, error);
912 goto done;
913 }
914 if (name->sa_family != so->so_family) {
915 /*
916 * With IPv6 we require the family to match
917 * unlike in IPv4.
918 */
919 error = EAFNOSUPPORT;
920 eprintsoline(so, error);
921 goto done;
922 }
923 #ifdef DEBUG
924 /*
925 * Verify that apps don't forget to clear
926 * sin6_scope_id etc
927 */
928 if (sin6->sin6_scope_id != 0 &&
929 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
930 zcmn_err(getzoneid(), CE_WARN,
931 "bind with uninitialized sin6_scope_id "
932 "(%d) on socket. Pid = %d\n",
933 (int)sin6->sin6_scope_id,
934 (int)curproc->p_pid);
935 }
936 if (sin6->__sin6_src_id != 0) {
937 zcmn_err(getzoneid(), CE_WARN,
938 "bind with uninitialized __sin6_src_id "
939 "(%d) on socket. Pid = %d\n",
940 (int)sin6->__sin6_src_id,
941 (int)curproc->p_pid);
942 }
943 #endif /* DEBUG */
944 break;
945 }
946 default:
947 /*
948 * Don't do any length or sa_family check to allow
949 * non-sockaddr style addresses.
950 */
951 if (name == NULL) {
952 error = EINVAL;
953 eprintsoline(so, error);
954 goto done;
955 }
956 break;
957 }
958
959 if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
960 error = ENAMETOOLONG;
961 eprintsoline(so, error);
962 goto done;
963 }
964 /*
965 * Save local address.
966 */
967 sti->sti_laddr_len = (socklen_t)namelen;
968 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
969 bcopy(name, sti->sti_laddr_sa, namelen);
970
971 addr = sti->sti_laddr_sa;
972 addrlen = (t_uscalar_t)sti->sti_laddr_len;
973 switch (so->so_family) {
974 case AF_INET6:
975 case AF_INET:
976 break;
977 case AF_UNIX: {
978 struct sockaddr_un *soun =
979 (struct sockaddr_un *)sti->sti_laddr_sa;
980 struct vnode *vp, *rvp;
981 struct vattr vattr;
982
983 ASSERT(sti->sti_ux_bound_vp == NULL);
984 /*
985 * Create vnode for the specified path name.
986 * Keep vnode held with a reference in sti_ux_bound_vp.
987 * Use the vnode pointer as the address used in the
988 * bind with the transport.
989 *
990 * Use the same mode as in BSD. In particular this does
991 * not observe the umask.
992 */
993 /* MAXPATHLEN + soun_family + nul termination */
994 if (sti->sti_laddr_len >
995 (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
996 error = ENAMETOOLONG;
997 eprintsoline(so, error);
998 goto done;
999 }
1000 vattr.va_type = VSOCK;
1001 vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
1002 vattr.va_mask = AT_TYPE|AT_MODE;
1003 /* NOTE: holding so_lock */
1004 error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
1005 EXCL, 0, &vp, CRMKNOD, 0, 0);
1006 if (error) {
1007 if (error == EEXIST)
1008 error = EADDRINUSE;
1009 eprintsoline(so, error);
1010 goto done;
1011 }
1012 /*
1013 * Establish pointer from the underlying filesystem
1014 * vnode to the socket node.
1015 * sti_ux_bound_vp and v_stream->sd_vnode form the
1016 * cross-linkage between the underlying filesystem
1017 * node and the socket node.
1018 */
1019
1020 if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
1021 VN_HOLD(rvp);
1022 VN_RELE(vp);
1023 vp = rvp;
1024 }
1025
1026 ASSERT(SOTOV(so)->v_stream != NULL);
1027 mutex_enter(&vp->v_lock);
1028 vp->v_stream = SOTOV(so)->v_stream;
1029 sti->sti_ux_bound_vp = vp;
1030 mutex_exit(&vp->v_lock);
1031
1032 /*
1033 * Use the vnode pointer value as a unique address
1034 * (together with the magic number to avoid conflicts
1035 * with implicit binds) in the transport provider.
1036 */
1037 sti->sti_ux_laddr.soua_vp =
1038 (void *)sti->sti_ux_bound_vp;
1039 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1040 addr = &sti->sti_ux_laddr;
1041 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1042 dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1043 addrlen,
1044 (void *)((struct so_ux_addr *)addr)->soua_vp));
1045 break;
1046 }
1047 } /* end switch (so->so_family) */
1048 }
1049
1050 /*
1051 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1052 * the transport can start passing up T_CONN_IND messages
1053 * as soon as it receives the bind req and strsock_proto()
1054 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1055 */
1056 if (flags & _SOBIND_LISTEN) {
1057 if ((so->so_state & SS_ACCEPTCONN) == 0)
1058 clear_acceptconn_on_err = B_TRUE;
1059 save_so_backlog = so->so_backlog;
1060 restore_backlog_on_err = B_TRUE;
1061 so->so_state |= SS_ACCEPTCONN;
1062 so->so_backlog = backlog;
1063 }
1064
1065 /*
1066 * If NL7C addr(s) have been configured check for addr/port match,
1067 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1068 *
1069 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1070 * family sockets only. If match mark as such.
1071 */
1072 if (nl7c_enabled && ((addr != NULL &&
1073 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1074 (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1075 sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1076 /*
1077 * NL7C is not supported in non-global zones,
1078 * we enforce this restriction here.
1079 */
1080 if (so->so_zoneid == GLOBAL_ZONEID) {
1081 /* An NL7C socket, mark it */
1082 sti->sti_nl7c_flags |= NL7C_ENABLED;
1083 if (nl7c == NULL) {
1084 /*
1085 * Was an AF_NCA bind() so add it to the
1086 * addr list for reporting purposes.
1087 */
1088 nl7c = nl7c_add_addr(addr, addrlen);
1089 }
1090 } else
1091 nl7c = NULL;
1092 }
1093
1094 /*
1095 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1096 * for other transports we will send in a O_T_BIND_REQ.
1097 */
1098 if (tcp_udp_xport &&
1099 (so->so_family == AF_INET || so->so_family == AF_INET6))
1100 PRIM_type = T_BIND_REQ;
1101
1102 bind_req.PRIM_type = PRIM_type;
1103 bind_req.ADDR_length = addrlen;
1104 bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1105 bind_req.CONIND_number = backlog;
1106 /* NOTE: holding so_lock while sleeping */
1107 mp = soallocproto2(&bind_req, sizeof (bind_req),
1108 addr, addrlen, 0, _ALLOC_SLEEP, cr);
1109 sti->sti_laddr_valid = 0;
1110
1111 /* Done using sti_laddr_sa - can drop the lock */
1112 mutex_exit(&so->so_lock);
1113
1114 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1115 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1116 if (error) {
1117 eprintsoline(so, error);
1118 mutex_enter(&so->so_lock);
1119 goto done;
1120 }
1121
1122 mutex_enter(&so->so_lock);
1123 error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1124 (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1125 if (error) {
1126 eprintsoline(so, error);
1127 goto done;
1128 }
1129 ASSERT(mp);
1130 /*
1131 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1132 * strsock_proto while the lock was dropped above, the bind
1133 * is allowed to complete.
1134 */
1135
1136 /* Mark as bound. This will be undone if we detect errors below. */
1137 if (flags & _SOBIND_NOXLATE) {
1138 ASSERT(so->so_family == AF_UNIX);
1139 sti->sti_faddr_noxlate = 1;
1140 }
1141 ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1142 so->so_state |= SS_ISBOUND;
1143 ASSERT(sti->sti_unbind_mp);
1144
1145 /* note that we've already set SS_ACCEPTCONN above */
1146
1147 /*
1148 * Recompute addrlen - an unspecified bind sent down an
1149 * address of length zero but we expect the appropriate length
1150 * in return.
1151 */
1152 addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1153 sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1154
1155 bind_ack = (struct T_bind_ack *)mp->b_rptr;
1156 /*
1157 * The alignment restriction is really too strict but
1158 * we want enough alignment to inspect the fields of
1159 * a sockaddr_in.
1160 */
1161 addr = sogetoff(mp, bind_ack->ADDR_offset,
1162 bind_ack->ADDR_length,
1163 __TPI_ALIGN_SIZE);
1164 if (addr == NULL) {
1165 freemsg(mp);
1166 error = EPROTO;
1167 eprintsoline(so, error);
1168 goto done;
1169 }
1170 if (!(flags & _SOBIND_UNSPEC)) {
1171 /*
1172 * Verify that the transport didn't return something we
1173 * did not want e.g. an address other than what we asked for.
1174 *
1175 * NOTE: These checks would go away if/when we switch to
1176 * using the new TPI (in which the transport would fail
1177 * the request instead of assigning a different address).
1178 *
1179 * NOTE2: For protocols that we don't know (i.e. any
1180 * other than AF_INET6, AF_INET and AF_UNIX), we
1181 * cannot know if the transport should be expected to
1182 * return the same address as that requested.
1183 *
1184 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1185 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1186 *
1187 * For example, in the case of netatalk it may be
1188 * inappropriate for the transport to return the
1189 * requested address (as it may have allocated a local
1190 * port number in behaviour similar to that of an
1191 * AF_INET bind request with a port number of zero).
1192 *
1193 * Given the definition of O_T_BIND_REQ, where the
1194 * transport may bind to an address other than the
1195 * requested address, it's not possible to determine
1196 * whether a returned address that differs from the
1197 * requested address is a reason to fail (because the
1198 * requested address was not available) or succeed
1199 * (because the transport allocated an appropriate
1200 * address and/or port).
1201 *
1202 * sockfs currently requires that the transport return
1203 * the requested address in the T_BIND_ACK, unless
1204 * there is code here to allow for any discrepancy.
1205 * Such code exists for AF_INET and AF_INET6.
1206 *
1207 * Netatalk chooses to return the requested address
1208 * rather than the (correct) allocated address. This
1209 * means that netatalk violates the TPI specification
1210 * (and would not function correctly if used from a
1211 * TLI application), but it does mean that it works
1212 * with sockfs.
1213 *
1214 * As noted above, using the newer XTI bind primitive
1215 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1216 * allow sockfs to be more sure about whether or not
1217 * the bind request had succeeded (as transports are
1218 * not permitted to bind to a different address than
1219 * that requested - they must return failure).
1220 * Unfortunately, support for T_BIND_REQ may not be
1221 * present in all transport implementations (netatalk,
1222 * for example, doesn't have it), making the
1223 * transition difficult.
1224 */
1225 if (bind_ack->ADDR_length != addrlen) {
1226 /* Assumes that the requested address was in use */
1227 freemsg(mp);
1228 error = EADDRINUSE;
1229 eprintsoline(so, error);
1230 goto done;
1231 }
1232
1233 switch (so->so_family) {
1234 case AF_INET6:
1235 case AF_INET: {
1236 sin_t *rname, *aname;
1237
1238 rname = (sin_t *)addr;
1239 aname = (sin_t *)sti->sti_laddr_sa;
1240
1241 /*
1242 * Take advantage of the alignment
1243 * of sin_port and sin6_port which fall
1244 * in the same place in their data structures.
1245 * Just use sin_port for either address family.
1246 *
1247 * This may become a problem if (heaven forbid)
1248 * there's a separate ipv6port_reserved... :-P
1249 *
1250 * Binding to port 0 has the semantics of letting
1251 * the transport bind to any port.
1252 *
1253 * If the transport is TCP or UDP since we had sent
1254 * a T_BIND_REQ we would not get a port other than
1255 * what we asked for.
1256 */
1257 if (tcp_udp_xport) {
1258 /*
1259 * Pick up the new port number if we bound to
1260 * port 0.
1261 */
1262 if (aname->sin_port == 0)
1263 aname->sin_port = rname->sin_port;
1264 sti->sti_laddr_valid = 1;
1265 break;
1266 }
1267 if (aname->sin_port != 0 &&
1268 aname->sin_port != rname->sin_port) {
1269 freemsg(mp);
1270 error = EADDRINUSE;
1271 eprintsoline(so, error);
1272 goto done;
1273 }
1274 /*
1275 * Pick up the new port number if we bound to port 0.
1276 */
1277 aname->sin_port = rname->sin_port;
1278
1279 /*
1280 * Unfortunately, addresses aren't _quite_ the same.
1281 */
1282 if (so->so_family == AF_INET) {
1283 if (aname->sin_addr.s_addr !=
1284 rname->sin_addr.s_addr) {
1285 freemsg(mp);
1286 error = EADDRNOTAVAIL;
1287 eprintsoline(so, error);
1288 goto done;
1289 }
1290 } else {
1291 sin6_t *rname6 = (sin6_t *)rname;
1292 sin6_t *aname6 = (sin6_t *)aname;
1293
1294 if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1295 &rname6->sin6_addr)) {
1296 freemsg(mp);
1297 error = EADDRNOTAVAIL;
1298 eprintsoline(so, error);
1299 goto done;
1300 }
1301 }
1302 break;
1303 }
1304 case AF_UNIX:
1305 if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1306 freemsg(mp);
1307 error = EADDRINUSE;
1308 eprintsoline(so, error);
1309 eprintso(so,
1310 ("addrlen %d, addr 0x%x, vp %p\n",
1311 addrlen, *((int *)addr),
1312 (void *)sti->sti_ux_bound_vp));
1313 goto done;
1314 }
1315 sti->sti_laddr_valid = 1;
1316 break;
1317 default:
1318 /*
1319 * NOTE: This assumes that addresses can be
1320 * byte-compared for equivalence.
1321 */
1322 if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1323 freemsg(mp);
1324 error = EADDRINUSE;
1325 eprintsoline(so, error);
1326 goto done;
1327 }
1328 /*
1329 * Don't mark sti_laddr_valid, as we cannot be
1330 * sure that the returned address is the real
1331 * bound address when talking to an unknown
1332 * transport.
1333 */
1334 break;
1335 }
1336 } else {
1337 /*
1338 * Save the returned address for getsockname.
1339 * Needed for unspecific bind unless transport supports
1340 * the TI_GETMYNAME ioctl.
1341 * Do this for AF_INET{,6} even though they do, as
1342 * caching info here is much better performance than
1343 * a TPI/STREAMS trip to the transport for getsockname.
1344 * Any which can't for some reason _must_ _not_ set
1345 * sti_laddr_valid here for the caching version of
1346 * getsockname to not break;
1347 */
1348 switch (so->so_family) {
1349 case AF_UNIX:
1350 /*
1351 * Record the address bound with the transport
1352 * for use by socketpair.
1353 */
1354 bcopy(addr, &sti->sti_ux_laddr, addrlen);
1355 sti->sti_laddr_valid = 1;
1356 break;
1357 case AF_INET:
1358 case AF_INET6:
1359 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1360 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1361 sti->sti_laddr_valid = 1;
1362 break;
1363 default:
1364 /*
1365 * Don't mark sti_laddr_valid, as we cannot be
1366 * sure that the returned address is the real
1367 * bound address when talking to an unknown
1368 * transport.
1369 */
1370 break;
1371 }
1372 }
1373
1374 if (nl7c != NULL) {
1375 /* Register listen()er sonode pointer with NL7C */
1376 nl7c_listener_addr(nl7c, so);
1377 }
1378
1379 freemsg(mp);
1380
1381 done:
1382 if (error) {
1383 /* reset state & backlog to values held on entry */
1384 if (clear_acceptconn_on_err == B_TRUE)
1385 so->so_state &= ~SS_ACCEPTCONN;
1386 if (restore_backlog_on_err == B_TRUE)
1387 so->so_backlog = save_so_backlog;
1388
1389 if (unbind_on_err && so->so_state & SS_ISBOUND) {
1390 int err;
1391
1392 err = sotpi_unbind(so, 0);
1393 /* LINTED - statement has no consequent: if */
1394 if (err) {
1395 eprintsoline(so, error);
1396 } else {
1397 ASSERT(!(so->so_state & SS_ISBOUND));
1398 }
1399 }
1400 }
1401 if (!(flags & _SOBIND_LOCK_HELD)) {
1402 so_unlock_single(so, SOLOCKED);
1403 mutex_exit(&so->so_lock);
1404 } else {
1405 ASSERT(MUTEX_HELD(&so->so_lock));
1406 ASSERT(so->so_flag & SOLOCKED);
1407 }
1408 return (error);
1409 }
1410
1411 /* bind the socket */
1412 static int
1413 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1414 int flags, struct cred *cr)
1415 {
1416 if ((flags & _SOBIND_SOCKETPAIR) == 0)
1417 return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1418
1419 flags &= ~_SOBIND_SOCKETPAIR;
1420 return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1421 }
1422
1423 /*
1424 * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1425 * address, or when listen needs to unbind and bind.
1426 * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1427 * so that a sobind can pick them up.
1428 */
1429 static int
1430 sotpi_unbind(struct sonode *so, int flags)
1431 {
1432 struct T_unbind_req unbind_req;
1433 int error = 0;
1434 mblk_t *mp;
1435 sotpi_info_t *sti = SOTOTPI(so);
1436
1437 dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1438 (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1439
1440 ASSERT(MUTEX_HELD(&so->so_lock));
1441 ASSERT(so->so_flag & SOLOCKED);
1442
1443 if (!(so->so_state & SS_ISBOUND)) {
1444 error = EINVAL;
1445 eprintsoline(so, error);
1446 goto done;
1447 }
1448
1449 mutex_exit(&so->so_lock);
1450
1451 /*
1452 * Flush the read and write side (except stream head read queue)
1453 * and send down T_UNBIND_REQ.
1454 */
1455 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1456
1457 unbind_req.PRIM_type = T_UNBIND_REQ;
1458 mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1459 0, _ALLOC_SLEEP, CRED());
1460 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1461 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1462 mutex_enter(&so->so_lock);
1463 if (error) {
1464 eprintsoline(so, error);
1465 goto done;
1466 }
1467
1468 error = sowaitokack(so, T_UNBIND_REQ);
1469 if (error) {
1470 eprintsoline(so, error);
1471 goto done;
1472 }
1473
1474 /*
1475 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1476 * strsock_proto while the lock was dropped above, the unbind
1477 * is allowed to complete.
1478 */
1479 if (!(flags & _SOUNBIND_REBIND)) {
1480 /*
1481 * Clear out bound address.
1482 */
1483 vnode_t *vp;
1484
1485 if ((vp = sti->sti_ux_bound_vp) != NULL) {
1486 sti->sti_ux_bound_vp = NULL;
1487 vn_rele_stream(vp);
1488 }
1489 /* Clear out address */
1490 sti->sti_laddr_len = 0;
1491 }
1492 so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1493 sti->sti_laddr_valid = 0;
1494
1495 done:
1496
1497 /* If the caller held the lock don't release it here */
1498 ASSERT(MUTEX_HELD(&so->so_lock));
1499 ASSERT(so->so_flag & SOLOCKED);
1500
1501 return (error);
1502 }
1503
1504 /*
1505 * listen on the socket.
1506 * For TPI conforming transports this has to first unbind with the transport
1507 * and then bind again using the new backlog.
1508 */
1509 /* ARGSUSED */
1510 int
1511 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1512 {
1513 int error = 0;
1514 sotpi_info_t *sti = SOTOTPI(so);
1515
1516 dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1517 (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1518
1519 if (sti->sti_serv_type == T_CLTS)
1520 return (EOPNOTSUPP);
1521
1522 /*
1523 * If the socket is ready to accept connections already, then
1524 * return without doing anything. This avoids a problem where
1525 * a second listen() call fails if a connection is pending and
1526 * leaves the socket unbound. Only when we are not unbinding
1527 * with the transport can we safely increase the backlog.
1528 */
1529 if (so->so_state & SS_ACCEPTCONN &&
1530 !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1531 /*CONSTCOND*/
1532 !solisten_tpi_tcp))
1533 return (0);
1534
1535 if (so->so_state & SS_ISCONNECTED)
1536 return (EINVAL);
1537
1538 mutex_enter(&so->so_lock);
1539 so_lock_single(so); /* Set SOLOCKED */
1540
1541 /*
1542 * If the listen doesn't change the backlog we do nothing.
1543 * This avoids an EPROTO error from the transport.
1544 */
1545 if ((so->so_state & SS_ACCEPTCONN) &&
1546 so->so_backlog == backlog)
1547 goto done;
1548
1549 if (!(so->so_state & SS_ISBOUND)) {
1550 /*
1551 * Must have been explicitly bound in the UNIX domain.
1552 */
1553 if (so->so_family == AF_UNIX) {
1554 error = EINVAL;
1555 goto done;
1556 }
1557 error = sotpi_bindlisten(so, NULL, 0, backlog,
1558 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1559 } else if (backlog > 0) {
1560 /*
1561 * AF_INET{,6} hack to avoid losing the port.
1562 * Assumes that all AF_INET{,6} transports can handle a
1563 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1564 * has already bound thus it is possible to avoid the unbind.
1565 */
1566 if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1567 /*CONSTCOND*/
1568 !solisten_tpi_tcp)) {
1569 error = sotpi_unbind(so, _SOUNBIND_REBIND);
1570 if (error)
1571 goto done;
1572 }
1573 error = sotpi_bindlisten(so, NULL, 0, backlog,
1574 _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1575 } else {
1576 so->so_state |= SS_ACCEPTCONN;
1577 so->so_backlog = backlog;
1578 }
1579 if (error)
1580 goto done;
1581 ASSERT(so->so_state & SS_ACCEPTCONN);
1582 done:
1583 so_unlock_single(so, SOLOCKED);
1584 mutex_exit(&so->so_lock);
1585 return (error);
1586 }
1587
1588 /*
1589 * Disconnect either a specified seqno or all (-1).
1590 * The former is used on listening sockets only.
1591 *
1592 * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1593 * the current use of sodisconnect(seqno == -1) is only for shutdown
1594 * so there is no point (and potentially incorrect) to unbind.
1595 */
1596 static int
1597 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1598 {
1599 struct T_discon_req discon_req;
1600 int error = 0;
1601 mblk_t *mp;
1602
1603 dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1604 (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1605
1606 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1607 mutex_enter(&so->so_lock);
1608 so_lock_single(so); /* Set SOLOCKED */
1609 } else {
1610 ASSERT(MUTEX_HELD(&so->so_lock));
1611 ASSERT(so->so_flag & SOLOCKED);
1612 }
1613
1614 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1615 error = EINVAL;
1616 eprintsoline(so, error);
1617 goto done;
1618 }
1619
1620 mutex_exit(&so->so_lock);
1621 /*
1622 * Flush the write side (unless this is a listener)
1623 * and then send down a T_DISCON_REQ.
1624 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1625 * and other messages.)
1626 */
1627 if (!(so->so_state & SS_ACCEPTCONN))
1628 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1629
1630 discon_req.PRIM_type = T_DISCON_REQ;
1631 discon_req.SEQ_number = seqno;
1632 mp = soallocproto1(&discon_req, sizeof (discon_req),
1633 0, _ALLOC_SLEEP, CRED());
1634 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1635 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1636 mutex_enter(&so->so_lock);
1637 if (error) {
1638 eprintsoline(so, error);
1639 goto done;
1640 }
1641
1642 error = sowaitokack(so, T_DISCON_REQ);
1643 if (error) {
1644 eprintsoline(so, error);
1645 goto done;
1646 }
1647 /*
1648 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1649 * strsock_proto while the lock was dropped above, the disconnect
1650 * is allowed to complete. However, it is not possible to
1651 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1652 */
1653 so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1654 SOTOTPI(so)->sti_laddr_valid = 0;
1655 SOTOTPI(so)->sti_faddr_valid = 0;
1656 done:
1657 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1658 so_unlock_single(so, SOLOCKED);
1659 mutex_exit(&so->so_lock);
1660 } else {
1661 /* If the caller held the lock don't release it here */
1662 ASSERT(MUTEX_HELD(&so->so_lock));
1663 ASSERT(so->so_flag & SOLOCKED);
1664 }
1665 return (error);
1666 }
1667
1668 /* ARGSUSED */
1669 int
1670 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1671 struct sonode **nsop)
1672 {
1673 struct T_conn_ind *conn_ind;
1674 struct T_conn_res *conn_res;
1675 int error = 0;
1676 mblk_t *mp, *ack_mp;
1677 struct sonode *nso;
1678 vnode_t *nvp;
1679 void *src;
1680 t_uscalar_t srclen;
1681 void *opt;
1682 t_uscalar_t optlen;
1683 t_scalar_t PRIM_type;
1684 t_scalar_t SEQ_number;
1685 size_t sinlen;
1686 sotpi_info_t *sti = SOTOTPI(so);
1687 sotpi_info_t *nsti;
1688
1689 dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1690 (void *)so, fflag, (void *)nsop,
1691 pr_state(so->so_state, so->so_mode)));
1692
1693 /*
1694 * Defer single-threading the accepting socket until
1695 * the T_CONN_IND has been received and parsed and the
1696 * new sonode has been opened.
1697 */
1698
1699 /* Check that we are not already connected */
1700 if ((so->so_state & SS_ACCEPTCONN) == 0)
1701 goto conn_bad;
1702 again:
1703 if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1704 goto e_bad;
1705
1706 ASSERT(mp != NULL);
1707 conn_ind = (struct T_conn_ind *)mp->b_rptr;
1708
1709 /*
1710 * Save SEQ_number for error paths.
1711 */
1712 SEQ_number = conn_ind->SEQ_number;
1713
1714 srclen = conn_ind->SRC_length;
1715 src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1716 if (src == NULL) {
1717 error = EPROTO;
1718 freemsg(mp);
1719 eprintsoline(so, error);
1720 goto disconnect_unlocked;
1721 }
1722 optlen = conn_ind->OPT_length;
1723 switch (so->so_family) {
1724 case AF_INET:
1725 case AF_INET6:
1726 if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1727 bcopy(mp->b_rptr + conn_ind->OPT_offset,
1728 &opt, conn_ind->OPT_length);
1729 } else {
1730 /*
1731 * The transport (in this case TCP) hasn't sent up
1732 * a pointer to an instance for the accept fast-path.
1733 * Disable fast-path completely because the call to
1734 * sotpi_create() below would otherwise create an
1735 * incomplete TCP instance, which would lead to
1736 * problems when sockfs sends a normal T_CONN_RES
1737 * message down the new stream.
1738 */
1739 if (sti->sti_direct) {
1740 int rval;
1741 /*
1742 * For consistency we inform tcp to disable
1743 * direct interface on the listener, though
1744 * we can certainly live without doing this
1745 * because no data will ever travel upstream
1746 * on the listening socket.
1747 */
1748 sti->sti_direct = 0;
1749 (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1750 0, 0, K_TO_K, cr, &rval);
1751 }
1752 opt = NULL;
1753 optlen = 0;
1754 }
1755 break;
1756 case AF_UNIX:
1757 default:
1758 if (optlen != 0) {
1759 opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1760 __TPI_ALIGN_SIZE);
1761 if (opt == NULL) {
1762 error = EPROTO;
1763 freemsg(mp);
1764 eprintsoline(so, error);
1765 goto disconnect_unlocked;
1766 }
1767 }
1768 if (so->so_family == AF_UNIX) {
1769 if (!sti->sti_faddr_noxlate) {
1770 src = NULL;
1771 srclen = 0;
1772 }
1773 /* Extract src address from options */
1774 if (optlen != 0)
1775 so_getopt_srcaddr(opt, optlen, &src, &srclen);
1776 }
1777 break;
1778 }
1779
1780 /*
1781 * Create the new socket.
1782 */
1783 nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1784 if (nso == NULL) {
1785 ASSERT(error != 0);
1786 /*
1787 * Accept can not fail with ENOBUFS. sotpi_create
1788 * sleeps waiting for memory until a signal is caught
1789 * so return EINTR.
1790 */
1791 freemsg(mp);
1792 if (error == ENOBUFS)
1793 error = EINTR;
1794 goto e_disc_unl;
1795 }
1796 nvp = SOTOV(nso);
1797 nsti = SOTOTPI(nso);
1798
1799 #ifdef DEBUG
1800 /*
1801 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1802 * it's inherited early to allow debugging of the accept code itself.
1803 */
1804 nso->so_options |= so->so_options & SO_DEBUG;
1805 #endif /* DEBUG */
1806
1807 /*
1808 * Save the SRC address from the T_CONN_IND
1809 * for getpeername to work on AF_UNIX and on transports that do not
1810 * support TI_GETPEERNAME.
1811 *
1812 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1813 * copyin_name().
1814 */
1815 if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1816 error = EINVAL;
1817 freemsg(mp);
1818 eprintsoline(so, error);
1819 goto disconnect_vp_unlocked;
1820 }
1821 nsti->sti_faddr_len = (socklen_t)srclen;
1822 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1823 bcopy(src, nsti->sti_faddr_sa, srclen);
1824 nsti->sti_faddr_valid = 1;
1825
1826 /*
1827 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1828 */
1829 if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1830 (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1831 cred_t *cr;
1832 pid_t cpid;
1833
1834 cr = msg_getcred(mp, &cpid);
1835 if (cr != NULL) {
1836 crhold(cr);
1837 nso->so_peercred = cr;
1838 nso->so_cpid = cpid;
1839 }
1840 freemsg(mp);
1841
1842 mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1843 sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1844 if (mp == NULL) {
1845 /*
1846 * Accept can not fail with ENOBUFS.
1847 * A signal was caught so return EINTR.
1848 */
1849 error = EINTR;
1850 eprintsoline(so, error);
1851 goto disconnect_vp_unlocked;
1852 }
1853 conn_res = (struct T_conn_res *)mp->b_rptr;
1854 } else {
1855 /*
1856 * For efficiency reasons we use msg_extractcred; no crhold
1857 * needed since db_credp is cleared (i.e., we move the cred
1858 * from the message to so_peercred).
1859 */
1860 nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1861
1862 mp->b_rptr = DB_BASE(mp);
1863 conn_res = (struct T_conn_res *)mp->b_rptr;
1864 mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1865
1866 mblk_setcred(mp, cr, curproc->p_pid);
1867 }
1868
1869 /*
1870 * New socket must be bound at least in sockfs and, except for AF_INET,
1871 * (or AF_INET6) it also has to be bound in the transport provider.
1872 * We set the local address in the sonode from the T_OK_ACK of the
1873 * T_CONN_RES. For this reason the address we bind to here isn't
1874 * important.
1875 */
1876 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1877 /*CONSTCOND*/
1878 nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1879 /*
1880 * Optimization for AF_INET{,6} transports
1881 * that can handle a T_CONN_RES without being bound.
1882 */
1883 mutex_enter(&nso->so_lock);
1884 so_automatic_bind(nso);
1885 mutex_exit(&nso->so_lock);
1886 } else {
1887 /* Perform NULL bind with the transport provider. */
1888 if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1889 cr)) != 0) {
1890 ASSERT(error != ENOBUFS);
1891 freemsg(mp);
1892 eprintsoline(nso, error);
1893 goto disconnect_vp_unlocked;
1894 }
1895 }
1896
1897 /*
1898 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1899 * so that any data arriving on the new socket will cause the
1900 * appropriate signals to be delivered for the new socket.
1901 *
1902 * No other thread (except strsock_proto and strsock_misc)
1903 * can access the new socket thus we relax the locking.
1904 */
1905 nso->so_pgrp = so->so_pgrp;
1906 nso->so_state |= so->so_state & SS_ASYNC;
1907 nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1908
1909 if (nso->so_pgrp != 0) {
1910 if ((error = so_set_events(nso, nvp, cr)) != 0) {
1911 eprintsoline(nso, error);
1912 error = 0;
1913 nso->so_pgrp = 0;
1914 }
1915 }
1916
1917 /*
1918 * Make note of the socket level options. TCP and IP level options
1919 * are already inherited. We could do all this after accept is
1920 * successful but doing it here simplifies code and no harm done
1921 * for error case.
1922 */
1923 nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1924 SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1925 SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1926 nso->so_sndbuf = so->so_sndbuf;
1927 nso->so_rcvbuf = so->so_rcvbuf;
1928 if (nso->so_options & SO_LINGER)
1929 nso->so_linger = so->so_linger;
1930
1931 /*
1932 * Note that the following sti_direct code path should be
1933 * removed once we are confident that the direct sockets
1934 * do not result in any degradation.
1935 */
1936 if (sti->sti_direct) {
1937
1938 ASSERT(opt != NULL);
1939
1940 conn_res->OPT_length = optlen;
1941 conn_res->OPT_offset = MBLKL(mp);
1942 bcopy(&opt, mp->b_wptr, optlen);
1943 mp->b_wptr += optlen;
1944 conn_res->PRIM_type = T_CONN_RES;
1945 conn_res->ACCEPTOR_id = 0;
1946 PRIM_type = T_CONN_RES;
1947
1948 /* Send down the T_CONN_RES on acceptor STREAM */
1949 error = kstrputmsg(SOTOV(nso), mp, NULL,
1950 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1951 if (error) {
1952 mutex_enter(&so->so_lock);
1953 so_lock_single(so);
1954 eprintsoline(so, error);
1955 goto disconnect_vp;
1956 }
1957 mutex_enter(&nso->so_lock);
1958 error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1959 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1960 if (error) {
1961 mutex_exit(&nso->so_lock);
1962 mutex_enter(&so->so_lock);
1963 so_lock_single(so);
1964 eprintsoline(so, error);
1965 goto disconnect_vp;
1966 }
1967 if (nso->so_family == AF_INET) {
1968 sin_t *sin;
1969
1970 sin = (sin_t *)(ack_mp->b_rptr +
1971 sizeof (struct T_ok_ack));
1972 bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1973 nsti->sti_laddr_len = sizeof (sin_t);
1974 } else {
1975 sin6_t *sin6;
1976
1977 sin6 = (sin6_t *)(ack_mp->b_rptr +
1978 sizeof (struct T_ok_ack));
1979 bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1980 nsti->sti_laddr_len = sizeof (sin6_t);
1981 }
1982 freemsg(ack_mp);
1983
1984 nso->so_state |= SS_ISCONNECTED;
1985 nso->so_proto_handle = (sock_lower_handle_t)opt;
1986 nsti->sti_laddr_valid = 1;
1987
1988 if (sti->sti_nl7c_flags & NL7C_ENABLED) {
1989 /*
1990 * A NL7C marked listen()er so the new socket
1991 * inherits the listen()er's NL7C state, except
1992 * for NL7C_POLLIN.
1993 *
1994 * Only call NL7C to process the new socket if
1995 * the listen socket allows blocking i/o.
1996 */
1997 nsti->sti_nl7c_flags =
1998 sti->sti_nl7c_flags & (~NL7C_POLLIN);
1999 if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
2000 /*
2001 * Nonblocking accept() just make it
2002 * persist to defer processing to the
2003 * read-side syscall (e.g. read).
2004 */
2005 nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
2006 } else if (nl7c_process(nso, B_FALSE)) {
2007 /*
2008 * NL7C has completed processing on the
2009 * socket, close the socket and back to
2010 * the top to await the next T_CONN_IND.
2011 */
2012 mutex_exit(&nso->so_lock);
2013 (void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
2014 cr, NULL);
2015 VN_RELE(nvp);
2016 goto again;
2017 }
2018 /* Pass the new socket out */
2019 }
2020
2021 mutex_exit(&nso->so_lock);
2022
2023 /*
2024 * It's possible, through the use of autopush for example,
2025 * that the acceptor stream may not support sti_direct
2026 * semantics. If the new socket does not support sti_direct
2027 * we issue a _SIOCSOCKFALLBACK to inform the transport
2028 * as we would in the I_PUSH case.
2029 */
2030 if (nsti->sti_direct == 0) {
2031 int rval;
2032
2033 if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2034 0, 0, K_TO_K, cr, &rval)) != 0) {
2035 mutex_enter(&so->so_lock);
2036 so_lock_single(so);
2037 eprintsoline(so, error);
2038 goto disconnect_vp;
2039 }
2040 }
2041
2042 /*
2043 * Pass out new socket.
2044 */
2045 if (nsop != NULL)
2046 *nsop = nso;
2047
2048 return (0);
2049 }
2050
2051 /*
2052 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2053 * which don't support the FireEngine accept fast-path. It is also
2054 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2055 * again. Neither sockfs nor TCP attempt to find out if some other
2056 * random module has been inserted in between (in which case we
2057 * should follow TLI accept behaviour). We blindly assume the worst
2058 * case and revert back to old behaviour i.e. TCP will not send us
2059 * any option (eager) and the accept should happen on the listener
2060 * queue. Any queued T_conn_ind have already got their options removed
2061 * by so_sock2_stream() when "sockmod" was I_POP'd.
2062 */
2063 /*
2064 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2065 */
2066 if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2067 #ifdef _ILP32
2068 queue_t *q;
2069
2070 /*
2071 * Find read queue in driver
2072 * Can safely do this since we "own" nso/nvp.
2073 */
2074 q = strvp2wq(nvp)->q_next;
2075 while (SAMESTR(q))
2076 q = q->q_next;
2077 q = RD(q);
2078 conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2079 #else
2080 conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2081 #endif /* _ILP32 */
2082 conn_res->PRIM_type = O_T_CONN_RES;
2083 PRIM_type = O_T_CONN_RES;
2084 } else {
2085 conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2086 conn_res->PRIM_type = T_CONN_RES;
2087 PRIM_type = T_CONN_RES;
2088 }
2089 conn_res->SEQ_number = SEQ_number;
2090 conn_res->OPT_length = 0;
2091 conn_res->OPT_offset = 0;
2092
2093 mutex_enter(&so->so_lock);
2094 so_lock_single(so); /* Set SOLOCKED */
2095 mutex_exit(&so->so_lock);
2096
2097 error = kstrputmsg(SOTOV(so), mp, NULL,
2098 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2099 mutex_enter(&so->so_lock);
2100 if (error) {
2101 eprintsoline(so, error);
2102 goto disconnect_vp;
2103 }
2104 error = sowaitprim(so, PRIM_type, T_OK_ACK,
2105 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2106 if (error) {
2107 eprintsoline(so, error);
2108 goto disconnect_vp;
2109 }
2110 mutex_exit(&so->so_lock);
2111 /*
2112 * If there is a sin/sin6 appended onto the T_OK_ACK use
2113 * that to set the local address. If this is not present
2114 * then we zero out the address and don't set the
2115 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2116 * the pathname from the listening socket.
2117 * In the case where this is TCP or an AF_UNIX socket the
2118 * client side may have queued data or a T_ORDREL in the
2119 * transport. Having now sent the T_CONN_RES we may receive
2120 * those queued messages at any time. Hold the acceptor
2121 * so_lock until its state and laddr are finalized.
2122 */
2123 mutex_enter(&nso->so_lock);
2124 sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2125 if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2126 MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2127 ack_mp->b_rptr += sizeof (struct T_ok_ack);
2128 bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2129 nsti->sti_laddr_len = sinlen;
2130 nsti->sti_laddr_valid = 1;
2131 } else if (nso->so_family == AF_UNIX) {
2132 ASSERT(so->so_family == AF_UNIX);
2133 nsti->sti_laddr_len = sti->sti_laddr_len;
2134 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2135 bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2136 nsti->sti_laddr_len);
2137 nsti->sti_laddr_valid = 1;
2138 } else {
2139 nsti->sti_laddr_len = sti->sti_laddr_len;
2140 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2141 bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2142 nsti->sti_laddr_sa->sa_family = nso->so_family;
2143 }
2144 nso->so_state |= SS_ISCONNECTED;
2145 mutex_exit(&nso->so_lock);
2146
2147 freemsg(ack_mp);
2148
2149 mutex_enter(&so->so_lock);
2150 so_unlock_single(so, SOLOCKED);
2151 mutex_exit(&so->so_lock);
2152
2153 /*
2154 * Pass out new socket.
2155 */
2156 if (nsop != NULL)
2157 *nsop = nso;
2158
2159 return (0);
2160
2161
2162 eproto_disc_unl:
2163 error = EPROTO;
2164 e_disc_unl:
2165 eprintsoline(so, error);
2166 goto disconnect_unlocked;
2167
2168 pr_disc_vp_unl:
2169 eprintsoline(so, error);
2170 disconnect_vp_unlocked:
2171 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2172 VN_RELE(nvp);
2173 disconnect_unlocked:
2174 (void) sodisconnect(so, SEQ_number, 0);
2175 return (error);
2176
2177 pr_disc_vp:
2178 eprintsoline(so, error);
2179 disconnect_vp:
2180 (void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2181 so_unlock_single(so, SOLOCKED);
2182 mutex_exit(&so->so_lock);
2183 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2184 VN_RELE(nvp);
2185 return (error);
2186
2187 conn_bad: /* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2188 error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2189 ? EOPNOTSUPP : EINVAL;
2190 e_bad:
2191 eprintsoline(so, error);
2192 return (error);
2193 }
2194
2195 /*
2196 * connect a socket.
2197 *
2198 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2199 * unconnect (by specifying a null address).
2200 */
2201 int
2202 sotpi_connect(struct sonode *so,
2203 struct sockaddr *name,
2204 socklen_t namelen,
2205 int fflag,
2206 int flags,
2207 struct cred *cr)
2208 {
2209 struct T_conn_req conn_req;
2210 int error = 0;
2211 mblk_t *mp;
2212 void *src;
2213 socklen_t srclen;
2214 void *addr;
2215 socklen_t addrlen;
2216 boolean_t need_unlock;
2217 sotpi_info_t *sti = SOTOTPI(so);
2218
2219 dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2220 (void *)so, (void *)name, namelen, fflag, flags,
2221 pr_state(so->so_state, so->so_mode)));
2222
2223 /*
2224 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2225 * avoid sleeping for memory with SOLOCKED held.
2226 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2227 * + sizeof (struct T_opthdr).
2228 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2229 * exceed sti_faddr_maxlen).
2230 */
2231 mp = soallocproto(sizeof (struct T_conn_req) +
2232 2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2233 cr);
2234 if (mp == NULL) {
2235 /*
2236 * Connect can not fail with ENOBUFS. A signal was
2237 * caught so return EINTR.
2238 */
2239 error = EINTR;
2240 eprintsoline(so, error);
2241 return (error);
2242 }
2243
2244 mutex_enter(&so->so_lock);
2245 /*
2246 * Make sure there is a preallocated T_unbind_req message
2247 * before any binding. This message is allocated when the
2248 * socket is created. Since another thread can consume
2249 * so_unbind_mp by the time we return from so_lock_single(),
2250 * we should check the availability of so_unbind_mp after
2251 * we return from so_lock_single().
2252 */
2253
2254 so_lock_single(so); /* Set SOLOCKED */
2255 need_unlock = B_TRUE;
2256
2257 if (sti->sti_unbind_mp == NULL) {
2258 dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2259 /* NOTE: holding so_lock while sleeping */
2260 sti->sti_unbind_mp =
2261 soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2262 if (sti->sti_unbind_mp == NULL) {
2263 error = EINTR;
2264 goto done;
2265 }
2266 }
2267
2268 /*
2269 * Can't have done a listen before connecting.
2270 */
2271 if (so->so_state & SS_ACCEPTCONN) {
2272 error = EOPNOTSUPP;
2273 goto done;
2274 }
2275
2276 /*
2277 * Must be bound with the transport
2278 */
2279 if (!(so->so_state & SS_ISBOUND)) {
2280 if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2281 /*CONSTCOND*/
2282 so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2283 /*
2284 * Optimization for AF_INET{,6} transports
2285 * that can handle a T_CONN_REQ without being bound.
2286 */
2287 so_automatic_bind(so);
2288 } else {
2289 error = sotpi_bind(so, NULL, 0,
2290 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2291 if (error)
2292 goto done;
2293 }
2294 ASSERT(so->so_state & SS_ISBOUND);
2295 flags |= _SOCONNECT_DID_BIND;
2296 }
2297
2298 /*
2299 * Handle a connect to a name parameter of type AF_UNSPEC like a
2300 * connect to a null address. This is the portable method to
2301 * unconnect a socket.
2302 */
2303 if ((namelen >= sizeof (sa_family_t)) &&
2304 (name->sa_family == AF_UNSPEC)) {
2305 name = NULL;
2306 namelen = 0;
2307 }
2308
2309 /*
2310 * Check that we are not already connected.
2311 * A connection-oriented socket cannot be reconnected.
2312 * A connected connection-less socket can be
2313 * - connected to a different address by a subsequent connect
2314 * - "unconnected" by a connect to the NULL address
2315 */
2316 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2317 ASSERT(!(flags & _SOCONNECT_DID_BIND));
2318 if (so->so_mode & SM_CONNREQUIRED) {
2319 /* Connection-oriented socket */
2320 error = so->so_state & SS_ISCONNECTED ?
2321 EISCONN : EALREADY;
2322 goto done;
2323 }
2324 /* Connection-less socket */
2325 if (name == NULL) {
2326 /*
2327 * Remove the connected state and clear SO_DGRAM_ERRIND
2328 * since it was set when the socket was connected.
2329 * If this is UDP also send down a T_DISCON_REQ.
2330 */
2331 int val;
2332
2333 if ((so->so_family == AF_INET ||
2334 so->so_family == AF_INET6) &&
2335 (so->so_type == SOCK_DGRAM ||
2336 so->so_type == SOCK_RAW) &&
2337 /*CONSTCOND*/
2338 !soconnect_tpi_udp) {
2339 /* XXX What about implicitly unbinding here? */
2340 error = sodisconnect(so, -1,
2341 _SODISCONNECT_LOCK_HELD);
2342 } else {
2343 so->so_state &=
2344 ~(SS_ISCONNECTED | SS_ISCONNECTING);
2345 sti->sti_faddr_valid = 0;
2346 sti->sti_faddr_len = 0;
2347 }
2348
2349 /* Remove SOLOCKED since setsockopt will grab it */
2350 so_unlock_single(so, SOLOCKED);
2351 mutex_exit(&so->so_lock);
2352
2353 val = 0;
2354 (void) sotpi_setsockopt(so, SOL_SOCKET,
2355 SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2356 cr);
2357
2358 mutex_enter(&so->so_lock);
2359 so_lock_single(so); /* Set SOLOCKED */
2360 goto done;
2361 }
2362 }
2363 ASSERT(so->so_state & SS_ISBOUND);
2364
2365 if (name == NULL || namelen == 0) {
2366 error = EINVAL;
2367 goto done;
2368 }
2369 /*
2370 * Mark the socket if sti_faddr_sa represents the transport level
2371 * address.
2372 */
2373 if (flags & _SOCONNECT_NOXLATE) {
2374 struct sockaddr_ux *soaddr_ux;
2375
2376 ASSERT(so->so_family == AF_UNIX);
2377 if (namelen != sizeof (struct sockaddr_ux)) {
2378 error = EINVAL;
2379 goto done;
2380 }
2381 soaddr_ux = (struct sockaddr_ux *)name;
2382 name = (struct sockaddr *)&soaddr_ux->sou_addr;
2383 namelen = sizeof (soaddr_ux->sou_addr);
2384 sti->sti_faddr_noxlate = 1;
2385 }
2386
2387 /*
2388 * Length and family checks.
2389 */
2390 error = so_addr_verify(so, name, namelen);
2391 if (error)
2392 goto bad;
2393
2394 /*
2395 * Save foreign address. Needed for AF_UNIX as well as
2396 * transport providers that do not support TI_GETPEERNAME.
2397 * Also used for cached foreign address for TCP and UDP.
2398 */
2399 if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2400 error = EINVAL;
2401 goto done;
2402 }
2403 sti->sti_faddr_len = (socklen_t)namelen;
2404 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2405 bcopy(name, sti->sti_faddr_sa, namelen);
2406 sti->sti_faddr_valid = 1;
2407
2408 if (so->so_family == AF_UNIX) {
2409 if (sti->sti_faddr_noxlate) {
2410 /*
2411 * sti_faddr is a transport-level address, so
2412 * don't pass it as an option. Do save it in
2413 * sti_ux_faddr, used for connected DG send.
2414 */
2415 src = NULL;
2416 srclen = 0;
2417 addr = sti->sti_faddr_sa;
2418 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2419 bcopy(addr, &sti->sti_ux_faddr,
2420 sizeof (sti->sti_ux_faddr));
2421 } else {
2422 /*
2423 * Pass the sockaddr_un source address as an option
2424 * and translate the remote address.
2425 * Holding so_lock thus sti_laddr_sa can not change.
2426 */
2427 src = sti->sti_laddr_sa;
2428 srclen = (t_uscalar_t)sti->sti_laddr_len;
2429 dprintso(so, 1,
2430 ("sotpi_connect UNIX: srclen %d, src %p\n",
2431 srclen, src));
2432 /*
2433 * Translate the destination address into our
2434 * internal form, and save it in sti_ux_faddr.
2435 * After this call, addr==&sti->sti_ux_taddr,
2436 * and we copy that to sti->sti_ux_faddr so
2437 * we save the connected peer address.
2438 */
2439 error = so_ux_addr_xlate(so,
2440 sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2441 (flags & _SOCONNECT_XPG4_2),
2442 &addr, &addrlen);
2443 if (error)
2444 goto bad;
2445 bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2446 sizeof (sti->sti_ux_faddr));
2447 }
2448 } else {
2449 addr = sti->sti_faddr_sa;
2450 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2451 src = NULL;
2452 srclen = 0;
2453 }
2454 /*
2455 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2456 * option which asks the transport provider to send T_UDERR_IND
2457 * messages. These T_UDERR_IND messages are used to return connected
2458 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2459 *
2460 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2461 * we send down a T_CONN_REQ. This is needed to let the
2462 * transport assign a local address that is consistent with
2463 * the remote address. Applications depend on a getsockname()
2464 * after a connect() to retrieve the "source" IP address for
2465 * the connected socket. Invalidate the cached local address
2466 * to force getsockname() to enquire of the transport.
2467 */
2468 if (!(so->so_mode & SM_CONNREQUIRED)) {
2469 /*
2470 * Datagram socket.
2471 */
2472 int32_t val;
2473
2474 so_unlock_single(so, SOLOCKED);
2475 mutex_exit(&so->so_lock);
2476
2477 val = 1;
2478 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2479 &val, (t_uscalar_t)sizeof (val), cr);
2480
2481 mutex_enter(&so->so_lock);
2482 so_lock_single(so); /* Set SOLOCKED */
2483 if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2484 (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2485 soconnect_tpi_udp) {
2486 soisconnected(so);
2487 goto done;
2488 }
2489 /*
2490 * Send down T_CONN_REQ etc.
2491 * Clear fflag to avoid returning EWOULDBLOCK.
2492 */
2493 fflag = 0;
2494 ASSERT(so->so_family != AF_UNIX);
2495 sti->sti_laddr_valid = 0;
2496 } else if (sti->sti_laddr_len != 0) {
2497 /*
2498 * If the local address or port was "any" then it may be
2499 * changed by the transport as a result of the
2500 * connect. Invalidate the cached version if we have one.
2501 */
2502 switch (so->so_family) {
2503 case AF_INET:
2504 ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2505 if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2506 INADDR_ANY ||
2507 ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2508 sti->sti_laddr_valid = 0;
2509 break;
2510
2511 case AF_INET6:
2512 ASSERT(sti->sti_laddr_len ==
2513 (socklen_t)sizeof (sin6_t));
2514 if (IN6_IS_ADDR_UNSPECIFIED(
2515 &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2516 IN6_IS_ADDR_V4MAPPED_ANY(
2517 &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2518 ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2519 sti->sti_laddr_valid = 0;
2520 break;
2521
2522 default:
2523 break;
2524 }
2525 }
2526
2527 /*
2528 * Check for failure of an earlier call
2529 */
2530 if (so->so_error != 0)
2531 goto so_bad;
2532
2533 /*
2534 * Send down T_CONN_REQ. Message was allocated above.
2535 */
2536 conn_req.PRIM_type = T_CONN_REQ;
2537 conn_req.DEST_length = addrlen;
2538 conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2539 if (srclen == 0) {
2540 conn_req.OPT_length = 0;
2541 conn_req.OPT_offset = 0;
2542 soappendmsg(mp, &conn_req, sizeof (conn_req));
2543 soappendmsg(mp, addr, addrlen);
2544 } else {
2545 /*
2546 * There is a AF_UNIX sockaddr_un to include as a source
2547 * address option.
2548 */
2549 struct T_opthdr toh;
2550
2551 toh.level = SOL_SOCKET;
2552 toh.name = SO_SRCADDR;
2553 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2554 toh.status = 0;
2555 conn_req.OPT_length =
2556 (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2557 conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2558 _TPI_ALIGN_TOPT(addrlen));
2559
2560 soappendmsg(mp, &conn_req, sizeof (conn_req));
2561 soappendmsg(mp, addr, addrlen);
2562 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2563 soappendmsg(mp, &toh, sizeof (toh));
2564 soappendmsg(mp, src, srclen);
2565 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2566 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2567 }
2568 /*
2569 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2570 * in order to have the right state when the T_CONN_CON shows up.
2571 */
2572 soisconnecting(so);
2573 mutex_exit(&so->so_lock);
2574
2575 if (AU_AUDITING())
2576 audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2577
2578 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2579 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2580 mp = NULL;
2581 mutex_enter(&so->so_lock);
2582 if (error != 0)
2583 goto bad;
2584
2585 if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2586 goto bad;
2587
2588 /* Allow other threads to access the socket */
2589 so_unlock_single(so, SOLOCKED);
2590 need_unlock = B_FALSE;
2591
2592 /*
2593 * Wait until we get a T_CONN_CON or an error
2594 */
2595 if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2596 so_lock_single(so); /* Set SOLOCKED */
2597 need_unlock = B_TRUE;
2598 }
2599
2600 done:
2601 freemsg(mp);
2602 switch (error) {
2603 case EINPROGRESS:
2604 case EALREADY:
2605 case EISCONN:
2606 case EINTR:
2607 /* Non-fatal errors */
2608 sti->sti_laddr_valid = 0;
2609 /* FALLTHRU */
2610 case 0:
2611 break;
2612 default:
2613 ASSERT(need_unlock);
2614 /*
2615 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2616 * and invalidate local-address cache
2617 */
2618 so->so_state &= ~SS_ISCONNECTING;
2619 sti->sti_laddr_valid = 0;
2620 /* A discon_ind might have already unbound us */
2621 if ((flags & _SOCONNECT_DID_BIND) &&
2622 (so->so_state & SS_ISBOUND)) {
2623 int err;
2624
2625 err = sotpi_unbind(so, 0);
2626 /* LINTED - statement has no conseq */
2627 if (err) {
2628 eprintsoline(so, err);
2629 }
2630 }
2631 break;
2632 }
2633 if (need_unlock)
2634 so_unlock_single(so, SOLOCKED);
2635 mutex_exit(&so->so_lock);
2636 return (error);
2637
2638 so_bad: error = sogeterr(so, B_TRUE);
2639 bad: eprintsoline(so, error);
2640 goto done;
2641 }
2642
2643 /* ARGSUSED */
2644 int
2645 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2646 {
2647 struct T_ordrel_req ordrel_req;
2648 mblk_t *mp;
2649 uint_t old_state, state_change;
2650 int error = 0;
2651 sotpi_info_t *sti = SOTOTPI(so);
2652
2653 dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2654 (void *)so, how, pr_state(so->so_state, so->so_mode)));
2655
2656 mutex_enter(&so->so_lock);
2657 so_lock_single(so); /* Set SOLOCKED */
2658
2659 /*
2660 * SunOS 4.X has no check for datagram sockets.
2661 * 5.X checks that it is connected (ENOTCONN)
2662 * X/Open requires that we check the connected state.
2663 */
2664 if (!(so->so_state & SS_ISCONNECTED)) {
2665 if (!xnet_skip_checks) {
2666 error = ENOTCONN;
2667 if (xnet_check_print) {
2668 printf("sockfs: X/Open shutdown check "
2669 "caused ENOTCONN\n");
2670 }
2671 }
2672 goto done;
2673 }
2674 /*
2675 * Record the current state and then perform any state changes.
2676 * Then use the difference between the old and new states to
2677 * determine which messages need to be sent.
2678 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2679 * duplicate calls to shutdown().
2680 */
2681 old_state = so->so_state;
2682
2683 switch (how) {
2684 case 0:
2685 socantrcvmore(so);
2686 break;
2687 case 1:
2688 socantsendmore(so);
2689 break;
2690 case 2:
2691 socantsendmore(so);
2692 socantrcvmore(so);
2693 break;
2694 default:
2695 error = EINVAL;
2696 goto done;
2697 }
2698
2699 /*
2700 * Assumes that the SS_CANT* flags are never cleared in the above code.
2701 */
2702 state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2703 (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2704 ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2705
2706 switch (state_change) {
2707 case 0:
2708 dprintso(so, 1,
2709 ("sotpi_shutdown: nothing to send in state 0x%x\n",
2710 so->so_state));
2711 goto done;
2712
2713 case SS_CANTRCVMORE:
2714 mutex_exit(&so->so_lock);
2715 strseteof(SOTOV(so), 1);
2716 /*
2717 * strseteof takes care of read side wakeups,
2718 * pollwakeups, and signals.
2719 */
2720 /*
2721 * Get the read lock before flushing data to avoid problems
2722 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2723 */
2724 mutex_enter(&so->so_lock);
2725 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2726 mutex_exit(&so->so_lock);
2727
2728 /* Flush read side queue */
2729 strflushrq(SOTOV(so), FLUSHALL);
2730
2731 mutex_enter(&so->so_lock);
2732 so_unlock_read(so); /* Clear SOREADLOCKED */
2733 break;
2734
2735 case SS_CANTSENDMORE:
2736 mutex_exit(&so->so_lock);
2737 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2738 mutex_enter(&so->so_lock);
2739 break;
2740
2741 case SS_CANTSENDMORE|SS_CANTRCVMORE:
2742 mutex_exit(&so->so_lock);
2743 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2744 strseteof(SOTOV(so), 1);
2745 /*
2746 * strseteof takes care of read side wakeups,
2747 * pollwakeups, and signals.
2748 */
2749 /*
2750 * Get the read lock before flushing data to avoid problems
2751 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2752 */
2753 mutex_enter(&so->so_lock);
2754 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2755 mutex_exit(&so->so_lock);
2756
2757 /* Flush read side queue */
2758 strflushrq(SOTOV(so), FLUSHALL);
2759
2760 mutex_enter(&so->so_lock);
2761 so_unlock_read(so); /* Clear SOREADLOCKED */
2762 break;
2763 }
2764
2765 ASSERT(MUTEX_HELD(&so->so_lock));
2766
2767 /*
2768 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2769 * was set due to this call and the new state has both of them set:
2770 * Send the AF_UNIX close indication
2771 * For T_COTS send a discon_ind
2772 *
2773 * If cantsend was set due to this call:
2774 * For T_COTSORD send an ordrel_ind
2775 *
2776 * Note that for T_CLTS there is no message sent here.
2777 */
2778 if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2779 (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2780 /*
2781 * For SunOS 4.X compatibility we tell the other end
2782 * that we are unable to receive at this point.
2783 */
2784 if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2785 so_unix_close(so);
2786
2787 if (sti->sti_serv_type == T_COTS)
2788 error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2789 }
2790 if ((state_change & SS_CANTSENDMORE) &&
2791 (sti->sti_serv_type == T_COTS_ORD)) {
2792 /* Send an orderly release */
2793 ordrel_req.PRIM_type = T_ORDREL_REQ;
2794
2795 mutex_exit(&so->so_lock);
2796 mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2797 0, _ALLOC_SLEEP, cr);
2798 /*
2799 * Send down the T_ORDREL_REQ even if there is flow control.
2800 * This prevents shutdown from blocking.
2801 * Note that there is no T_OK_ACK for ordrel_req.
2802 */
2803 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2804 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2805 mutex_enter(&so->so_lock);
2806 if (error) {
2807 eprintsoline(so, error);
2808 goto done;
2809 }
2810 }
2811
2812 done:
2813 so_unlock_single(so, SOLOCKED);
2814 mutex_exit(&so->so_lock);
2815 return (error);
2816 }
2817
2818 /*
2819 * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2820 * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2821 * that we have closed.
2822 * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2823 * T_UNITDATA_REQ containing the same option.
2824 *
2825 * For SOCK_DGRAM half-connections (somebody connected to this end
2826 * but this end is not connect) we don't know where to send any
2827 * SO_UNIX_CLOSE.
2828 *
2829 * We have to ignore stream head errors just in case there has been
2830 * a shutdown(output).
2831 * Ignore any flow control to try to get the message more quickly to the peer.
2832 * While locally ignoring flow control solves the problem when there
2833 * is only the loopback transport on the stream it would not provide
2834 * the correct AF_UNIX socket semantics when one or more modules have
2835 * been pushed.
2836 */
2837 void
2838 so_unix_close(struct sonode *so)
2839 {
2840 struct T_opthdr toh;
2841 mblk_t *mp;
2842 sotpi_info_t *sti = SOTOTPI(so);
2843
2844 ASSERT(MUTEX_HELD(&so->so_lock));
2845
2846 ASSERT(so->so_family == AF_UNIX);
2847
2848 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2849 (SS_ISCONNECTED|SS_ISBOUND))
2850 return;
2851
2852 dprintso(so, 1, ("so_unix_close(%p) %s\n",
2853 (void *)so, pr_state(so->so_state, so->so_mode)));
2854
2855 toh.level = SOL_SOCKET;
2856 toh.name = SO_UNIX_CLOSE;
2857
2858 /* zero length + header */
2859 toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2860 toh.status = 0;
2861
2862 if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2863 struct T_optdata_req tdr;
2864
2865 tdr.PRIM_type = T_OPTDATA_REQ;
2866 tdr.DATA_flag = 0;
2867
2868 tdr.OPT_length = (t_scalar_t)sizeof (toh);
2869 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2870
2871 /* NOTE: holding so_lock while sleeping */
2872 mp = soallocproto2(&tdr, sizeof (tdr),
2873 &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2874 } else {
2875 struct T_unitdata_req tudr;
2876 void *addr;
2877 socklen_t addrlen;
2878 void *src;
2879 socklen_t srclen;
2880 struct T_opthdr toh2;
2881 t_scalar_t size;
2882
2883 /*
2884 * We know this is an AF_UNIX connected DGRAM socket.
2885 * We therefore already have the destination address
2886 * in the internal form needed for this send. This is
2887 * similar to the sosend_dgram call later in this file
2888 * when there's no user-specified destination address.
2889 */
2890 if (sti->sti_faddr_noxlate) {
2891 /*
2892 * Already have a transport internal address. Do not
2893 * pass any (transport internal) source address.
2894 */
2895 addr = sti->sti_faddr_sa;
2896 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2897 src = NULL;
2898 srclen = 0;
2899 } else {
2900 /*
2901 * Pass the sockaddr_un source address as an option
2902 * and translate the remote address.
2903 * Holding so_lock thus sti_laddr_sa can not change.
2904 */
2905 src = sti->sti_laddr_sa;
2906 srclen = (socklen_t)sti->sti_laddr_len;
2907 dprintso(so, 1,
2908 ("so_ux_close: srclen %d, src %p\n",
2909 srclen, src));
2910 /*
2911 * Use the destination address saved in connect.
2912 */
2913 addr = &sti->sti_ux_faddr;
2914 addrlen = sizeof (sti->sti_ux_faddr);
2915 }
2916 tudr.PRIM_type = T_UNITDATA_REQ;
2917 tudr.DEST_length = addrlen;
2918 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2919 if (srclen == 0) {
2920 tudr.OPT_length = (t_scalar_t)sizeof (toh);
2921 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2922 _TPI_ALIGN_TOPT(addrlen));
2923
2924 size = tudr.OPT_offset + tudr.OPT_length;
2925 /* NOTE: holding so_lock while sleeping */
2926 mp = soallocproto2(&tudr, sizeof (tudr),
2927 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2928 mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2929 soappendmsg(mp, &toh, sizeof (toh));
2930 } else {
2931 /*
2932 * There is a AF_UNIX sockaddr_un to include as a
2933 * source address option.
2934 */
2935 tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2936 _TPI_ALIGN_TOPT(srclen));
2937 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2938 _TPI_ALIGN_TOPT(addrlen));
2939
2940 toh2.level = SOL_SOCKET;
2941 toh2.name = SO_SRCADDR;
2942 toh2.len = (t_uscalar_t)(srclen +
2943 sizeof (struct T_opthdr));
2944 toh2.status = 0;
2945
2946 size = tudr.OPT_offset + tudr.OPT_length;
2947
2948 /* NOTE: holding so_lock while sleeping */
2949 mp = soallocproto2(&tudr, sizeof (tudr),
2950 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2951 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2952 soappendmsg(mp, &toh, sizeof (toh));
2953 soappendmsg(mp, &toh2, sizeof (toh2));
2954 soappendmsg(mp, src, srclen);
2955 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2956 }
2957 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2958 }
2959 mutex_exit(&so->so_lock);
2960 (void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2961 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2962 mutex_enter(&so->so_lock);
2963 }
2964
2965 /*
2966 * Called by sotpi_recvmsg when reading a non-zero amount of data.
2967 * In addition, the caller typically verifies that there is some
2968 * potential state to clear by checking
2969 * if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2970 * before calling this routine.
2971 * Note that such a check can be made without holding so_lock since
2972 * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2973 * decrements sti_oobsigcnt.
2974 *
2975 * When data is read *after* the point that all pending
2976 * oob data has been consumed the oob indication is cleared.
2977 *
2978 * This logic keeps select/poll returning POLLRDBAND and
2979 * SIOCATMARK returning true until we have read past
2980 * the mark.
2981 */
2982 static void
2983 sorecv_update_oobstate(struct sonode *so)
2984 {
2985 sotpi_info_t *sti = SOTOTPI(so);
2986
2987 mutex_enter(&so->so_lock);
2988 ASSERT(so_verify_oobstate(so));
2989 dprintso(so, 1,
2990 ("sorecv_update_oobstate: counts %d/%d state %s\n",
2991 sti->sti_oobsigcnt,
2992 sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2993 if (sti->sti_oobsigcnt == 0) {
2994 /* No more pending oob indications */
2995 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2996 freemsg(so->so_oobmsg);
2997 so->so_oobmsg = NULL;
2998 }
2999 ASSERT(so_verify_oobstate(so));
3000 mutex_exit(&so->so_lock);
3001 }
3002
3003 /*
3004 * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
3005 */
3006 static int
3007 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
3008 {
3009 sotpi_info_t *sti = SOTOTPI(so);
3010 int error = 0;
3011 mblk_t *tmp = NULL;
3012 mblk_t *pmp = NULL;
3013 mblk_t *nmp = sti->sti_nl7c_rcv_mp;
3014
3015 ASSERT(nmp != NULL);
3016
3017 while (nmp != NULL && uiop->uio_resid > 0) {
3018 ssize_t n;
3019
3020 if (DB_TYPE(nmp) == M_DATA) {
3021 /*
3022 * We have some data, uiomove up to resid bytes.
3023 */
3024 n = MIN(MBLKL(nmp), uiop->uio_resid);
3025 if (n > 0)
3026 error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3027 nmp->b_rptr += n;
3028 if (nmp->b_rptr == nmp->b_wptr) {
3029 pmp = nmp;
3030 nmp = nmp->b_cont;
3031 }
3032 if (error)
3033 break;
3034 } else {
3035 /*
3036 * We only handle data, save for caller to handle.
3037 */
3038 if (pmp != NULL) {
3039 pmp->b_cont = nmp->b_cont;
3040 }
3041 nmp->b_cont = NULL;
3042 if (*rmp == NULL) {
3043 *rmp = nmp;
3044 } else {
3045 tmp->b_cont = nmp;
3046 }
3047 nmp = nmp->b_cont;
3048 tmp = nmp;
3049 }
3050 }
3051 if (pmp != NULL) {
3052 /* Free any mblk_t(s) which we have consumed */
3053 pmp->b_cont = NULL;
3054 freemsg(sti->sti_nl7c_rcv_mp);
3055 }
3056 if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3057 /* Last mblk_t so return the saved kstrgetmsg() rval/error */
3058 if (error == 0) {
3059 rval_t *p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3060
3061 error = p->r_v.r_v2;
3062 p->r_v.r_v2 = 0;
3063 }
3064 rp->r_vals = sti->sti_nl7c_rcv_rval;
3065 sti->sti_nl7c_rcv_rval = 0;
3066 } else {
3067 /* More mblk_t(s) to process so no rval to return */
3068 rp->r_vals = 0;
3069 }
3070 return (error);
3071 }
3072 /*
3073 * Receive the next message on the queue.
3074 * If msg_controllen is non-zero when called the caller is interested in
3075 * any received control info (options).
3076 * If msg_namelen is non-zero when called the caller is interested in
3077 * any received source address.
3078 * The routine returns with msg_control and msg_name pointing to
3079 * kmem_alloc'ed memory which the caller has to free.
3080 */
3081 /* ARGSUSED */
3082 int
3083 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3084 struct cred *cr)
3085 {
3086 union T_primitives *tpr;
3087 mblk_t *mp;
3088 uchar_t pri;
3089 int pflag, opflag;
3090 void *control;
3091 t_uscalar_t controllen;
3092 t_uscalar_t namelen;
3093 int so_state = so->so_state; /* Snapshot */
3094 ssize_t saved_resid;
3095 rval_t rval;
3096 int flags;
3097 clock_t timout;
3098 int error = 0;
3099 sotpi_info_t *sti = SOTOTPI(so);
3100
3101 flags = msg->msg_flags;
3102 msg->msg_flags = 0;
3103
3104 dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3105 (void *)so, (void *)msg, flags,
3106 pr_state(so->so_state, so->so_mode), so->so_error));
3107
3108 if (so->so_version == SOV_STREAM) {
3109 so_update_attrs(so, SOACC);
3110 /* The imaginary "sockmod" has been popped - act as a stream */
3111 return (strread(SOTOV(so), uiop, cr));
3112 }
3113
3114 /*
3115 * If we are not connected because we have never been connected
3116 * we return ENOTCONN. If we have been connected (but are no longer
3117 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3118 * the EOF.
3119 *
3120 * An alternative would be to post an ENOTCONN error in stream head
3121 * (read+write) and clear it when we're connected. However, that error
3122 * would cause incorrect poll/select behavior!
3123 */
3124 if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3125 (so->so_mode & SM_CONNREQUIRED)) {
3126 return (ENOTCONN);
3127 }
3128
3129 /*
3130 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3131 * after checking that the read queue is empty) and returns zero.
3132 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3133 * is zero.
3134 */
3135
3136 if (flags & MSG_OOB) {
3137 /* Check that the transport supports OOB */
3138 if (!(so->so_mode & SM_EXDATA))
3139 return (EOPNOTSUPP);
3140 so_update_attrs(so, SOACC);
3141 return (sorecvoob(so, msg, uiop, flags,
3142 (so->so_options & SO_OOBINLINE)));
3143 }
3144
3145 so_update_attrs(so, SOACC);
3146
3147 /*
3148 * Set msg_controllen and msg_namelen to zero here to make it
3149 * simpler in the cases that no control or name is returned.
3150 */
3151 controllen = msg->msg_controllen;
3152 namelen = msg->msg_namelen;
3153 msg->msg_controllen = 0;
3154 msg->msg_namelen = 0;
3155
3156 dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3157 namelen, controllen));
3158
3159 mutex_enter(&so->so_lock);
3160 /*
3161 * If an NL7C enabled socket and not waiting for write data.
3162 */
3163 if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3164 NL7C_ENABLED) {
3165 if (sti->sti_nl7c_uri) {
3166 /* Close uri processing for a previous request */
3167 nl7c_close(so);
3168 }
3169 if ((so_state & SS_CANTRCVMORE) &&
3170 sti->sti_nl7c_rcv_mp == NULL) {
3171 /* Nothing to process, EOF */
3172 mutex_exit(&so->so_lock);
3173 return (0);
3174 } else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3175 /* Persistent NL7C socket, try to process request */
3176 boolean_t ret;
3177
3178 ret = nl7c_process(so,
3179 (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3180 rval.r_vals = sti->sti_nl7c_rcv_rval;
3181 error = rval.r_v.r_v2;
3182 if (error) {
3183 /* Error of some sort, return it */
3184 mutex_exit(&so->so_lock);
3185 return (error);
3186 }
3187 if (sti->sti_nl7c_flags &&
3188 ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3189 /*
3190 * Still an NL7C socket and no data
3191 * to pass up to the caller.
3192 */
3193 mutex_exit(&so->so_lock);
3194 if (ret) {
3195 /* EOF */
3196 return (0);
3197 } else {
3198 /* Need more data */
3199 return (EAGAIN);
3200 }
3201 }
3202 } else {
3203 /*
3204 * Not persistent so no further NL7C processing.
3205 */
3206 sti->sti_nl7c_flags = 0;
3207 }
3208 }
3209 /*
3210 * Only one reader is allowed at any given time. This is needed
3211 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3212 *
3213 * This is slightly different than BSD behavior in that it fails with
3214 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3215 * is single-threaded using sblock(), which is dropped while waiting
3216 * for data to appear. The difference shows up e.g. if one
3217 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3218 * does use nonblocking io and different threads are reading each
3219 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3220 * in this case as long as the read queue doesn't get empty.
3221 * In this implementation the thread using nonblocking io can
3222 * get an EWOULDBLOCK error due to the blocking thread executing
3223 * e.g. in the uiomove in kstrgetmsg.
3224 * This difference is not believed to be significant.
3225 */
3226 /* Set SOREADLOCKED */
3227 error = so_lock_read_intr(so,
3228 uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3229 mutex_exit(&so->so_lock);
3230 if (error)
3231 return (error);
3232
3233 /*
3234 * Tell kstrgetmsg to not inspect the stream head errors until all
3235 * queued data has been consumed.
3236 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3237 * Also, if uio_fmode indicates nonblocking, kstrgetmsg will not block.
3238 *
3239 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3240 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3241 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3242 */
3243 pflag = MSG_ANY | MSG_DELAYERROR;
3244 if (flags & MSG_PEEK) {
3245 pflag |= MSG_IPEEK;
3246 flags &= ~MSG_WAITALL;
3247 }
3248 if (so->so_mode & SM_ATOMIC)
3249 pflag |= MSG_DISCARDTAIL;
3250
3251 if (flags & MSG_DONTWAIT)
3252 timout = 0;
3253 else if (so->so_rcvtimeo != 0)
3254 timout = TICK_TO_MSEC(so->so_rcvtimeo);
3255 else
3256 timout = -1;
3257 opflag = pflag;
3258 retry:
3259 saved_resid = uiop->uio_resid;
3260 pri = 0;
3261 mp = NULL;
3262 if (sti->sti_nl7c_rcv_mp != NULL) {
3263 /* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3264 error = nl7c_sorecv(so, &mp, uiop, &rval);
3265 } else {
3266 error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3267 timout, &rval);
3268 }
3269 if (error != 0) {
3270 /* kstrgetmsg returns ETIME when timeout expires */
3271 if (error == ETIME)
3272 error = EWOULDBLOCK;
3273 goto out;
3274 }
3275 /*
3276 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3277 * For non-datagrams MOREDATA is used to set MSG_EOR.
3278 */
3279 ASSERT(!(rval.r_val1 & MORECTL));
3280 if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3281 msg->msg_flags |= MSG_TRUNC;
3282
3283 if (mp == NULL) {
3284 dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3285 /*
3286 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3287 * The draft Posix socket spec states that the mark should
3288 * not be cleared when peeking. We follow the latter.
3289 */
3290 if ((so->so_state &
3291 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3292 (uiop->uio_resid != saved_resid) &&
3293 !(flags & MSG_PEEK)) {
3294 sorecv_update_oobstate(so);
3295 }
3296
3297 mutex_enter(&so->so_lock);
3298 /* Set MSG_EOR based on MOREDATA */
3299 if (!(rval.r_val1 & MOREDATA)) {
3300 if (so->so_state & SS_SAVEDEOR) {
3301 msg->msg_flags |= MSG_EOR;
3302 so->so_state &= ~SS_SAVEDEOR;
3303 }
3304 }
3305 /*
3306 * If some data was received (i.e. not EOF) and the
3307 * read/recv* has not been satisfied wait for some more.
3308 */
3309 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3310 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3311 mutex_exit(&so->so_lock);
3312 pflag = opflag | MSG_NOMARK;
3313 goto retry;
3314 }
3315 goto out_locked;
3316 }
3317
3318 /* strsock_proto has already verified length and alignment */
3319 tpr = (union T_primitives *)mp->b_rptr;
3320 dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3321
3322 switch (tpr->type) {
3323 case T_DATA_IND: {
3324 if ((so->so_state &
3325 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3326 (uiop->uio_resid != saved_resid) &&
3327 !(flags & MSG_PEEK)) {
3328 sorecv_update_oobstate(so);
3329 }
3330
3331 /*
3332 * Set msg_flags to MSG_EOR based on
3333 * MORE_flag and MOREDATA.
3334 */
3335 mutex_enter(&so->so_lock);
3336 so->so_state &= ~SS_SAVEDEOR;
3337 if (!(tpr->data_ind.MORE_flag & 1)) {
3338 if (!(rval.r_val1 & MOREDATA))
3339 msg->msg_flags |= MSG_EOR;
3340 else
3341 so->so_state |= SS_SAVEDEOR;
3342 }
3343 freemsg(mp);
3344 /*
3345 * If some data was received (i.e. not EOF) and the
3346 * read/recv* has not been satisfied wait for some more.
3347 */
3348 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3349 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3350 mutex_exit(&so->so_lock);
3351 pflag = opflag | MSG_NOMARK;
3352 goto retry;
3353 }
3354 goto out_locked;
3355 }
3356 case T_UNITDATA_IND: {
3357 void *addr;
3358 t_uscalar_t addrlen;
3359 void *abuf;
3360 t_uscalar_t optlen;
3361 void *opt;
3362
3363 if ((so->so_state &
3364 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3365 (uiop->uio_resid != saved_resid) &&
3366 !(flags & MSG_PEEK)) {
3367 sorecv_update_oobstate(so);
3368 }
3369
3370 if (namelen != 0) {
3371 /* Caller wants source address */
3372 addrlen = tpr->unitdata_ind.SRC_length;
3373 addr = sogetoff(mp,
3374 tpr->unitdata_ind.SRC_offset,
3375 addrlen, 1);
3376 if (addr == NULL) {
3377 freemsg(mp);
3378 error = EPROTO;
3379 eprintsoline(so, error);
3380 goto out;
3381 }
3382 if (so->so_family == AF_UNIX) {
3383 /*
3384 * Can not use the transport level address.
3385 * If there is a SO_SRCADDR option carrying
3386 * the socket level address it will be
3387 * extracted below.
3388 */
3389 addr = NULL;
3390 addrlen = 0;
3391 }
3392 }
3393 optlen = tpr->unitdata_ind.OPT_length;
3394 if (optlen != 0) {
3395 t_uscalar_t ncontrollen;
3396
3397 /*
3398 * Extract any source address option.
3399 * Determine how large cmsg buffer is needed.
3400 */
3401 opt = sogetoff(mp,
3402 tpr->unitdata_ind.OPT_offset,
3403 optlen, __TPI_ALIGN_SIZE);
3404
3405 if (opt == NULL) {
3406 freemsg(mp);
3407 error = EPROTO;
3408 eprintsoline(so, error);
3409 goto out;
3410 }
3411 if (so->so_family == AF_UNIX)
3412 so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3413 ncontrollen = so_cmsglen(mp, opt, optlen,
3414 !(flags & MSG_XPG4_2));
3415 if (controllen != 0)
3416 controllen = ncontrollen;
3417 else if (ncontrollen != 0)
3418 msg->msg_flags |= MSG_CTRUNC;
3419 } else {
3420 controllen = 0;
3421 }
3422
3423 if (namelen != 0) {
3424 /*
3425 * Return address to caller.
3426 * Caller handles truncation if length
3427 * exceeds msg_namelen.
3428 * NOTE: AF_UNIX NUL termination is ensured by
3429 * the sender's copyin_name().
3430 */
3431 abuf = kmem_alloc(addrlen, KM_SLEEP);
3432
3433 bcopy(addr, abuf, addrlen);
3434 msg->msg_name = abuf;
3435 msg->msg_namelen = addrlen;
3436 }
3437
3438 if (controllen != 0) {
3439 /*
3440 * Return control msg to caller.
3441 * Caller handles truncation if length
3442 * exceeds msg_controllen.
3443 */
3444 control = kmem_zalloc(controllen, KM_SLEEP);
3445
3446 error = so_opt2cmsg(mp, opt, optlen,
3447 !(flags & MSG_XPG4_2),
3448 control, controllen);
3449 if (error) {
3450 freemsg(mp);
3451 if (msg->msg_namelen != 0)
3452 kmem_free(msg->msg_name,
3453 msg->msg_namelen);
3454 kmem_free(control, controllen);
3455 eprintsoline(so, error);
3456 goto out;
3457 }
3458 msg->msg_control = control;
3459 msg->msg_controllen = controllen;
3460 }
3461
3462 freemsg(mp);
3463 goto out;
3464 }
3465 case T_OPTDATA_IND: {
3466 struct T_optdata_req *tdr;
3467 void *opt;
3468 t_uscalar_t optlen;
3469
3470 if ((so->so_state &
3471 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3472 (uiop->uio_resid != saved_resid) &&
3473 !(flags & MSG_PEEK)) {
3474 sorecv_update_oobstate(so);
3475 }
3476
3477 tdr = (struct T_optdata_req *)mp->b_rptr;
3478 optlen = tdr->OPT_length;
3479 if (optlen != 0) {
3480 t_uscalar_t ncontrollen;
3481 /*
3482 * Determine how large cmsg buffer is needed.
3483 */
3484 opt = sogetoff(mp,
3485 tpr->optdata_ind.OPT_offset,
3486 optlen, __TPI_ALIGN_SIZE);
3487
3488 if (opt == NULL) {
3489 freemsg(mp);
3490 error = EPROTO;
3491 eprintsoline(so, error);
3492 goto out;
3493 }
3494
3495 ncontrollen = so_cmsglen(mp, opt, optlen,
3496 !(flags & MSG_XPG4_2));
3497 if (controllen != 0)
3498 controllen = ncontrollen;
3499 else if (ncontrollen != 0)
3500 msg->msg_flags |= MSG_CTRUNC;
3501 } else {
3502 controllen = 0;
3503 }
3504
3505 if (controllen != 0) {
3506 /*
3507 * Return control msg to caller.
3508 * Caller handles truncation if length
3509 * exceeds msg_controllen.
3510 */
3511 control = kmem_zalloc(controllen, KM_SLEEP);
3512
3513 error = so_opt2cmsg(mp, opt, optlen,
3514 !(flags & MSG_XPG4_2),
3515 control, controllen);
3516 if (error) {
3517 freemsg(mp);
3518 kmem_free(control, controllen);
3519 eprintsoline(so, error);
3520 goto out;
3521 }
3522 msg->msg_control = control;
3523 msg->msg_controllen = controllen;
3524 }
3525
3526 /*
3527 * Set msg_flags to MSG_EOR based on
3528 * DATA_flag and MOREDATA.
3529 */
3530 mutex_enter(&so->so_lock);
3531 so->so_state &= ~SS_SAVEDEOR;
3532 if (!(tpr->data_ind.MORE_flag & 1)) {
3533 if (!(rval.r_val1 & MOREDATA))
3534 msg->msg_flags |= MSG_EOR;
3535 else
3536 so->so_state |= SS_SAVEDEOR;
3537 }
3538 freemsg(mp);
3539 /*
3540 * If some data was received (i.e. not EOF) and the
3541 * read/recv* has not been satisfied wait for some more.
3542 * Not possible to wait if control info was received.
3543 */
3544 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3545 controllen == 0 &&
3546 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3547 mutex_exit(&so->so_lock);
3548 pflag = opflag | MSG_NOMARK;
3549 goto retry;
3550 }
3551 goto out_locked;
3552 }
3553 case T_EXDATA_IND: {
3554 dprintso(so, 1,
3555 ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3556 "state %s\n",
3557 sti->sti_oobsigcnt, sti->sti_oobcnt,
3558 saved_resid - uiop->uio_resid,
3559 pr_state(so->so_state, so->so_mode)));
3560 /*
3561 * kstrgetmsg handles MSGMARK so there is nothing to
3562 * inspect in the T_EXDATA_IND.
3563 * strsock_proto makes the stream head queue the T_EXDATA_IND
3564 * as a separate message with no M_DATA component. Furthermore,
3565 * the stream head does not consolidate M_DATA messages onto
3566 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3567 * remains a message by itself. This is needed since MSGMARK
3568 * marks both the whole message as well as the last byte
3569 * of the message.
3570 */
3571 freemsg(mp);
3572 ASSERT(uiop->uio_resid == saved_resid); /* No data */
3573 if (flags & MSG_PEEK) {
3574 /*
3575 * Even though we are peeking we consume the
3576 * T_EXDATA_IND thereby moving the mark information
3577 * to SS_RCVATMARK. Then the oob code below will
3578 * retry the peeking kstrgetmsg.
3579 * Note that the stream head read queue is
3580 * never flushed without holding SOREADLOCKED
3581 * thus the T_EXDATA_IND can not disappear
3582 * underneath us.
3583 */
3584 dprintso(so, 1,
3585 ("sotpi_recvmsg: consume EXDATA_IND "
3586 "counts %d/%d state %s\n",
3587 sti->sti_oobsigcnt,
3588 sti->sti_oobcnt,
3589 pr_state(so->so_state, so->so_mode)));
3590
3591 pflag = MSG_ANY | MSG_DELAYERROR;
3592 if (so->so_mode & SM_ATOMIC)
3593 pflag |= MSG_DISCARDTAIL;
3594
3595 pri = 0;
3596 mp = NULL;
3597
3598 error = kstrgetmsg(SOTOV(so), &mp, uiop,
3599 &pri, &pflag, (clock_t)-1, &rval);
3600 ASSERT(uiop->uio_resid == saved_resid);
3601
3602 if (error) {
3603 #ifdef SOCK_DEBUG
3604 if (error != EWOULDBLOCK && error != EINTR) {
3605 eprintsoline(so, error);
3606 }
3607 #endif /* SOCK_DEBUG */
3608 goto out;
3609 }
3610 ASSERT(mp);
3611 tpr = (union T_primitives *)mp->b_rptr;
3612 ASSERT(tpr->type == T_EXDATA_IND);
3613 freemsg(mp);
3614 } /* end "if (flags & MSG_PEEK)" */
3615
3616 /*
3617 * Decrement the number of queued and pending oob.
3618 *
3619 * SS_RCVATMARK is cleared when we read past a mark.
3620 * SS_HAVEOOBDATA is cleared when we've read past the
3621 * last mark.
3622 * SS_OOBPEND is cleared if we've read past the last
3623 * mark and no (new) SIGURG has been posted.
3624 */
3625 mutex_enter(&so->so_lock);
3626 ASSERT(so_verify_oobstate(so));
3627 ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3628 ASSERT(sti->sti_oobsigcnt > 0);
3629 sti->sti_oobsigcnt--;
3630 ASSERT(sti->sti_oobcnt > 0);
3631 sti->sti_oobcnt--;
3632 /*
3633 * Since the T_EXDATA_IND has been removed from the stream
3634 * head, but we have not read data past the mark,
3635 * sockfs needs to track that the socket is still at the mark.
3636 *
3637 * Since no data was received call kstrgetmsg again to wait
3638 * for data.
3639 */
3640 so->so_state |= SS_RCVATMARK;
3641 mutex_exit(&so->so_lock);
3642 dprintso(so, 1,
3643 ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3644 sti->sti_oobsigcnt, sti->sti_oobcnt,
3645 pr_state(so->so_state, so->so_mode)));
3646 pflag = opflag;
3647 goto retry;
3648 }
3649 default:
3650 cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3651 (void *)so, tpr->type, (void *)mp);
3652 ASSERT(0);
3653 freemsg(mp);
3654 error = EPROTO;
3655 eprintsoline(so, error);
3656 goto out;
3657 }
3658 /* NOTREACHED */
3659 out:
3660 mutex_enter(&so->so_lock);
3661 out_locked:
3662 so_unlock_read(so); /* Clear SOREADLOCKED */
3663 mutex_exit(&so->so_lock);
3664 return (error);
3665 }
3666
3667 /*
3668 * Sending data with options on a datagram socket.
3669 * Assumes caller has verified that SS_ISBOUND etc. are set.
3670 *
3671 * For AF_UNIX the destination address may be already in
3672 * internal form, as indicated by sti->sti_faddr_noxlate
3673 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3674 * translate the destination address to internal form.
3675 *
3676 * The source address is passed as an option. If passing
3677 * file descriptors, those are passed as file pointers in
3678 * another option.
3679 */
3680 static int
3681 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3682 struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3683 {
3684 struct T_unitdata_req tudr;
3685 mblk_t *mp;
3686 int error;
3687 void *addr;
3688 socklen_t addrlen;
3689 void *src;
3690 socklen_t srclen;
3691 ssize_t len;
3692 int size;
3693 struct T_opthdr toh;
3694 struct fdbuf *fdbuf;
3695 t_uscalar_t optlen;
3696 void *fds;
3697 int fdlen;
3698 sotpi_info_t *sti = SOTOTPI(so);
3699
3700 ASSERT(name && namelen);
3701 ASSERT(control && controllen);
3702
3703 len = uiop->uio_resid;
3704 if (len > (ssize_t)sti->sti_tidu_size) {
3705 return (EMSGSIZE);
3706 }
3707
3708 if (sti->sti_faddr_noxlate == 0 &&
3709 (flags & MSG_SENDTO_NOXLATE) == 0) {
3710 /*
3711 * Length and family checks.
3712 * Don't verify internal form.
3713 */
3714 error = so_addr_verify(so, name, namelen);
3715 if (error) {
3716 eprintsoline(so, error);
3717 return (error);
3718 }
3719 }
3720
3721 if (so->so_family == AF_UNIX) {
3722 if (sti->sti_faddr_noxlate) {
3723 /*
3724 * Already have a transport internal address. Do not
3725 * pass any (transport internal) source address.
3726 */
3727 addr = name;
3728 addrlen = namelen;
3729 src = NULL;
3730 srclen = 0;
3731 } else if (flags & MSG_SENDTO_NOXLATE) {
3732 /*
3733 * Have an internal form dest. address.
3734 * Pass the source address as usual.
3735 */
3736 addr = name;
3737 addrlen = namelen;
3738 src = sti->sti_laddr_sa;
3739 srclen = (socklen_t)sti->sti_laddr_len;
3740 } else {
3741 /*
3742 * Pass the sockaddr_un source address as an option
3743 * and translate the remote address.
3744 *
3745 * Note that this code does not prevent sti_laddr_sa
3746 * from changing while it is being used. Thus
3747 * if an unbind+bind occurs concurrently with this
3748 * send the peer might see a partially new and a
3749 * partially old "from" address.
3750 */
3751 src = sti->sti_laddr_sa;
3752 srclen = (socklen_t)sti->sti_laddr_len;
3753 dprintso(so, 1,
3754 ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3755 srclen, src));
3756 /*
3757 * The sendmsg caller specified a destination
3758 * address, which we must translate into our
3759 * internal form. addr = &sti->sti_ux_taddr
3760 */
3761 error = so_ux_addr_xlate(so, name, namelen,
3762 (flags & MSG_XPG4_2),
3763 &addr, &addrlen);
3764 if (error) {
3765 eprintsoline(so, error);
3766 return (error);
3767 }
3768 }
3769 } else {
3770 addr = name;
3771 addrlen = namelen;
3772 src = NULL;
3773 srclen = 0;
3774 }
3775 optlen = so_optlen(control, controllen,
3776 !(flags & MSG_XPG4_2));
3777 tudr.PRIM_type = T_UNITDATA_REQ;
3778 tudr.DEST_length = addrlen;
3779 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3780 if (srclen != 0)
3781 tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3782 _TPI_ALIGN_TOPT(srclen));
3783 else
3784 tudr.OPT_length = optlen;
3785 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3786 _TPI_ALIGN_TOPT(addrlen));
3787
3788 size = tudr.OPT_offset + tudr.OPT_length;
3789
3790 /*
3791 * File descriptors only when SM_FDPASSING set.
3792 */
3793 error = so_getfdopt(control, controllen,
3794 !(flags & MSG_XPG4_2), &fds, &fdlen);
3795 if (error)
3796 return (error);
3797 if (fdlen != -1) {
3798 if (!(so->so_mode & SM_FDPASSING))
3799 return (EOPNOTSUPP);
3800
3801 error = fdbuf_create(fds, fdlen, &fdbuf);
3802 if (error)
3803 return (error);
3804
3805 /*
3806 * Pre-allocate enough additional space for lower level modules
3807 * to append an option (e.g. see tl_unitdata). The following
3808 * is enough extra space for the largest option we might append.
3809 */
3810 size += sizeof (struct T_opthdr) + ucredsize;
3811 mp = fdbuf_allocmsg(size, fdbuf);
3812 } else {
3813 mp = soallocproto(size, _ALLOC_INTR, CRED());
3814 if (mp == NULL) {
3815 /*
3816 * Caught a signal waiting for memory.
3817 * Let send* return EINTR.
3818 */
3819 return (EINTR);
3820 }
3821 }
3822 soappendmsg(mp, &tudr, sizeof (tudr));
3823 soappendmsg(mp, addr, addrlen);
3824 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3825
3826 if (fdlen != -1) {
3827 ASSERT(fdbuf != NULL);
3828 toh.level = SOL_SOCKET;
3829 toh.name = SO_FILEP;
3830 toh.len = fdbuf->fd_size +
3831 (t_uscalar_t)sizeof (struct T_opthdr);
3832 toh.status = 0;
3833 soappendmsg(mp, &toh, sizeof (toh));
3834 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3835 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3836 }
3837 if (srclen != 0) {
3838 /*
3839 * There is a AF_UNIX sockaddr_un to include as a source
3840 * address option.
3841 */
3842 toh.level = SOL_SOCKET;
3843 toh.name = SO_SRCADDR;
3844 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3845 toh.status = 0;
3846 soappendmsg(mp, &toh, sizeof (toh));
3847 soappendmsg(mp, src, srclen);
3848 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3849 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3850 }
3851 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3852 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3853 /*
3854 * Normally at most 3 bytes left in the message, but we might have
3855 * allowed for extra space if we're passing fd's through.
3856 */
3857 ASSERT(MBLKL(mp) <= (ssize_t)size);
3858
3859 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3860 if (AU_AUDITING())
3861 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3862
3863 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3864 #ifdef SOCK_DEBUG
3865 if (error) {
3866 eprintsoline(so, error);
3867 }
3868 #endif /* SOCK_DEBUG */
3869 return (error);
3870 }
3871
3872 /*
3873 * Sending data with options on a connected stream socket.
3874 * Assumes caller has verified that SS_ISCONNECTED is set.
3875 */
3876 static int
3877 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3878 t_uscalar_t controllen, int flags)
3879 {
3880 struct T_optdata_req tdr;
3881 mblk_t *mp;
3882 int error;
3883 ssize_t iosize;
3884 int size;
3885 struct fdbuf *fdbuf;
3886 t_uscalar_t optlen;
3887 void *fds;
3888 int fdlen;
3889 struct T_opthdr toh;
3890 sotpi_info_t *sti = SOTOTPI(so);
3891
3892 dprintso(so, 1,
3893 ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3894
3895 /*
3896 * Has to be bound and connected. However, since no locks are
3897 * held the state could have changed after sotpi_sendmsg checked it
3898 * thus it is not possible to ASSERT on the state.
3899 */
3900
3901 /* Options on connection-oriented only when SM_OPTDATA set. */
3902 if (!(so->so_mode & SM_OPTDATA))
3903 return (EOPNOTSUPP);
3904
3905 do {
3906 /*
3907 * Set the MORE flag if uio_resid does not fit in this
3908 * message or if the caller passed in "more".
3909 * Error for transports with zero tidu_size.
3910 */
3911 tdr.PRIM_type = T_OPTDATA_REQ;
3912 iosize = sti->sti_tidu_size;
3913 if (iosize <= 0)
3914 return (EMSGSIZE);
3915 if (uiop->uio_resid > iosize) {
3916 tdr.DATA_flag = 1;
3917 } else {
3918 if (more)
3919 tdr.DATA_flag = 1;
3920 else
3921 tdr.DATA_flag = 0;
3922 iosize = uiop->uio_resid;
3923 }
3924 dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3925 tdr.DATA_flag, iosize));
3926
3927 optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3928 tdr.OPT_length = optlen;
3929 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3930
3931 size = (int)sizeof (tdr) + optlen;
3932 /*
3933 * File descriptors only when SM_FDPASSING set.
3934 */
3935 error = so_getfdopt(control, controllen,
3936 !(flags & MSG_XPG4_2), &fds, &fdlen);
3937 if (error)
3938 return (error);
3939 if (fdlen != -1) {
3940 if (!(so->so_mode & SM_FDPASSING))
3941 return (EOPNOTSUPP);
3942
3943 error = fdbuf_create(fds, fdlen, &fdbuf);
3944 if (error)
3945 return (error);
3946
3947 /*
3948 * Pre-allocate enough additional space for lower level
3949 * modules to append an option (e.g. see tl_unitdata).
3950 * The following is enough extra space for the largest
3951 * option we might append.
3952 */
3953 size += sizeof (struct T_opthdr) + ucredsize;
3954 mp = fdbuf_allocmsg(size, fdbuf);
3955 } else {
3956 mp = soallocproto(size, _ALLOC_INTR, CRED());
3957 if (mp == NULL) {
3958 /*
3959 * Caught a signal waiting for memory.
3960 * Let send* return EINTR.
3961 */
3962 return (EINTR);
3963 }
3964 }
3965 soappendmsg(mp, &tdr, sizeof (tdr));
3966
3967 if (fdlen != -1) {
3968 ASSERT(fdbuf != NULL);
3969 toh.level = SOL_SOCKET;
3970 toh.name = SO_FILEP;
3971 toh.len = fdbuf->fd_size +
3972 (t_uscalar_t)sizeof (struct T_opthdr);
3973 toh.status = 0;
3974 soappendmsg(mp, &toh, sizeof (toh));
3975 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3976 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3977 }
3978 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3979 /*
3980 * Normally at most 3 bytes left in the message, but we might
3981 * have allowed for extra space if we're passing fd's through.
3982 */
3983 ASSERT(MBLKL(mp) <= (ssize_t)size);
3984
3985 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3986
3987 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3988 0, MSG_BAND, 0);
3989 if (error) {
3990 eprintsoline(so, error);
3991 return (error);
3992 }
3993 control = NULL;
3994 if (uiop->uio_resid > 0) {
3995 /*
3996 * Recheck for fatal errors. Fail write even though
3997 * some data have been written. This is consistent
3998 * with strwrite semantics and BSD sockets semantics.
3999 */
4000 if (so->so_state & SS_CANTSENDMORE) {
4001 eprintsoline(so, error);
4002 return (EPIPE);
4003 }
4004 if (so->so_error != 0) {
4005 mutex_enter(&so->so_lock);
4006 error = sogeterr(so, B_TRUE);
4007 mutex_exit(&so->so_lock);
4008 if (error != 0) {
4009 eprintsoline(so, error);
4010 return (error);
4011 }
4012 }
4013 }
4014 } while (uiop->uio_resid > 0);
4015 return (0);
4016 }
4017
4018 /*
4019 * Sending data on a datagram socket.
4020 * Assumes caller has verified that SS_ISBOUND etc. are set.
4021 *
4022 * For AF_UNIX the destination address may be already in
4023 * internal form, as indicated by sti->sti_faddr_noxlate
4024 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
4025 * translate the destination address to internal form.
4026 *
4027 * The source address is passed as an option.
4028 */
4029 int
4030 sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
4031 struct uio *uiop, int flags)
4032 {
4033 struct T_unitdata_req tudr;
4034 mblk_t *mp;
4035 int error;
4036 void *addr;
4037 socklen_t addrlen;
4038 void *src;
4039 socklen_t srclen;
4040 ssize_t len;
4041 sotpi_info_t *sti = SOTOTPI(so);
4042
4043 ASSERT(name != NULL && namelen != 0);
4044
4045 len = uiop->uio_resid;
4046 if (len > sti->sti_tidu_size) {
4047 error = EMSGSIZE;
4048 goto done;
4049 }
4050
4051 if (sti->sti_faddr_noxlate == 0 &&
4052 (flags & MSG_SENDTO_NOXLATE) == 0) {
4053 /*
4054 * Length and family checks.
4055 * Don't verify internal form.
4056 */
4057 error = so_addr_verify(so, name, namelen);
4058 if (error != 0)
4059 goto done;
4060 }
4061
4062 if (sti->sti_direct) /* Never on AF_UNIX */
4063 return (sodgram_direct(so, name, namelen, uiop, flags));
4064
4065 if (so->so_family == AF_UNIX) {
4066 if (sti->sti_faddr_noxlate) {
4067 /*
4068 * Already have a transport internal address. Do not
4069 * pass any (transport internal) source address.
4070 */
4071 addr = name;
4072 addrlen = namelen;
4073 src = NULL;
4074 srclen = 0;
4075 } else if (flags & MSG_SENDTO_NOXLATE) {
4076 /*
4077 * Have an internal form dest. address.
4078 * Pass the source address as usual.
4079 */
4080 addr = name;
4081 addrlen = namelen;
4082 src = sti->sti_laddr_sa;
4083 srclen = (socklen_t)sti->sti_laddr_len;
4084 } else {
4085 /*
4086 * Pass the sockaddr_un source address as an option
4087 * and translate the remote address.
4088 *
4089 * Note that this code does not prevent sti_laddr_sa
4090 * from changing while it is being used. Thus
4091 * if an unbind+bind occurs concurrently with this
4092 * send the peer might see a partially new and a
4093 * partially old "from" address.
4094 */
4095 src = sti->sti_laddr_sa;
4096 srclen = (socklen_t)sti->sti_laddr_len;
4097 dprintso(so, 1,
4098 ("sosend_dgram UNIX: srclen %d, src %p\n",
4099 srclen, src));
4100 /*
4101 * The sendmsg caller specified a destination
4102 * address, which we must translate into our
4103 * internal form. addr = &sti->sti_ux_taddr
4104 */
4105 error = so_ux_addr_xlate(so, name, namelen,
4106 (flags & MSG_XPG4_2),
4107 &addr, &addrlen);
4108 if (error) {
4109 eprintsoline(so, error);
4110 goto done;
4111 }
4112 }
4113 } else {
4114 addr = name;
4115 addrlen = namelen;
4116 src = NULL;
4117 srclen = 0;
4118 }
4119 tudr.PRIM_type = T_UNITDATA_REQ;
4120 tudr.DEST_length = addrlen;
4121 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4122 if (srclen == 0) {
4123 tudr.OPT_length = 0;
4124 tudr.OPT_offset = 0;
4125
4126 mp = soallocproto2(&tudr, sizeof (tudr),
4127 addr, addrlen, 0, _ALLOC_INTR, CRED());
4128 if (mp == NULL) {
4129 /*
4130 * Caught a signal waiting for memory.
4131 * Let send* return EINTR.
4132 */
4133 error = EINTR;
4134 goto done;
4135 }
4136 } else {
4137 /*
4138 * There is a AF_UNIX sockaddr_un to include as a source
4139 * address option.
4140 */
4141 struct T_opthdr toh;
4142 ssize_t size;
4143
4144 tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4145 _TPI_ALIGN_TOPT(srclen));
4146 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4147 _TPI_ALIGN_TOPT(addrlen));
4148
4149 toh.level = SOL_SOCKET;
4150 toh.name = SO_SRCADDR;
4151 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4152 toh.status = 0;
4153
4154 size = tudr.OPT_offset + tudr.OPT_length;
4155 mp = soallocproto2(&tudr, sizeof (tudr),
4156 addr, addrlen, size, _ALLOC_INTR, CRED());
4157 if (mp == NULL) {
4158 /*
4159 * Caught a signal waiting for memory.
4160 * Let send* return EINTR.
4161 */
4162 error = EINTR;
4163 goto done;
4164 }
4165 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4166 soappendmsg(mp, &toh, sizeof (toh));
4167 soappendmsg(mp, src, srclen);
4168 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4169 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4170 }
4171
4172 if (AU_AUDITING())
4173 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4174
4175 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4176 done:
4177 #ifdef SOCK_DEBUG
4178 if (error) {
4179 eprintsoline(so, error);
4180 }
4181 #endif /* SOCK_DEBUG */
4182 return (error);
4183 }
4184
4185 /*
4186 * Sending data on a connected stream socket.
4187 * Assumes caller has verified that SS_ISCONNECTED is set.
4188 */
4189 int
4190 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4191 int sflag)
4192 {
4193 struct T_data_req tdr;
4194 mblk_t *mp;
4195 int error;
4196 ssize_t iosize;
4197 sotpi_info_t *sti = SOTOTPI(so);
4198
4199 dprintso(so, 1,
4200 ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4201 (void *)so, uiop->uio_resid, prim, sflag));
4202
4203 /*
4204 * Has to be bound and connected. However, since no locks are
4205 * held the state could have changed after sotpi_sendmsg checked it
4206 * thus it is not possible to ASSERT on the state.
4207 */
4208
4209 do {
4210 /*
4211 * Set the MORE flag if uio_resid does not fit in this
4212 * message or if the caller passed in "more".
4213 * Error for transports with zero tidu_size.
4214 */
4215 tdr.PRIM_type = prim;
4216 iosize = sti->sti_tidu_size;
4217 if (iosize <= 0)
4218 return (EMSGSIZE);
4219 if (uiop->uio_resid > iosize) {
4220 tdr.MORE_flag = 1;
4221 } else {
4222 if (more)
4223 tdr.MORE_flag = 1;
4224 else
4225 tdr.MORE_flag = 0;
4226 iosize = uiop->uio_resid;
4227 }
4228 dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4229 prim, tdr.MORE_flag, iosize));
4230 mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4231 if (mp == NULL) {
4232 /*
4233 * Caught a signal waiting for memory.
4234 * Let send* return EINTR.
4235 */
4236 return (EINTR);
4237 }
4238
4239 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4240 0, sflag | MSG_BAND, 0);
4241 if (error) {
4242 eprintsoline(so, error);
4243 return (error);
4244 }
4245 if (uiop->uio_resid > 0) {
4246 /*
4247 * Recheck for fatal errors. Fail write even though
4248 * some data have been written. This is consistent
4249 * with strwrite semantics and BSD sockets semantics.
4250 */
4251 if (so->so_state & SS_CANTSENDMORE) {
4252 eprintsoline(so, error);
4253 return (EPIPE);
4254 }
4255 if (so->so_error != 0) {
4256 mutex_enter(&so->so_lock);
4257 error = sogeterr(so, B_TRUE);
4258 mutex_exit(&so->so_lock);
4259 if (error != 0) {
4260 eprintsoline(so, error);
4261 return (error);
4262 }
4263 }
4264 }
4265 } while (uiop->uio_resid > 0);
4266 return (0);
4267 }
4268
4269 /*
4270 * Check the state for errors and call the appropriate send function.
4271 *
4272 * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4273 * this function issues a setsockopt to toggle SO_DONTROUTE before and
4274 * after sending the message.
4275 *
4276 * The caller may optionally specify a destination address, for either
4277 * stream or datagram sockets. This table summarizes the cases:
4278 *
4279 * Socket type Dest. given Connected Result
4280 * ----------- ----------- --------- --------------
4281 * Stream * Yes send to conn. addr.
4282 * Stream * No error ENOTCONN
4283 * Dgram yes * send to given addr.
4284 * Dgram no yes send to conn. addr.
4285 * Dgram no no error EDESTADDRREQ
4286 *
4287 * There are subtleties around the destination address when using
4288 * AF_UNIX datagram sockets. When the sendmsg call specifies the
4289 * destination address, it's in (struct sockaddr_un) form and we
4290 * need to translate it to our internal form (struct so_ux_addr).
4291 *
4292 * When the sendmsg call does not specify a destination address
4293 * we're using the peer address saved during sotpi_connect, and
4294 * that address is already in internal form. In this case, the
4295 * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4296 * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4297 * those functions should skip translation to internal form.
4298 * Avoiding that translation is not only more efficient, but it's
4299 * also necessary when a process does a connect on an AF_UNIX
4300 * datagram socket and then drops privileges. After the process
 * has dropped privileges, it may no longer be able to look up the
 * external name in the filesystem, but it should still be
4303 * able to send messages on the connected socket by leaving the
4304 * destination name unspecified.
4305 *
4306 * Yet more subtleties arise with sockets connected by socketpair(),
4307 * which puts internal form addresses in the fields where normally
4308 * the external form is found, and sets sti_faddr_noxlate=1, which
4309 * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4310 * to skip translation of destination addresses to internal form.
4311 * However, beware that the flag sti_faddr_noxlate=1 also triggers
4312 * different behaviour almost everywhere AF_UNIX addresses appear.
4313 */
4314 static int
4315 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4316 struct cred *cr)
4317 {
4318 int so_state;
4319 int so_mode;
4320 int error;
4321 struct sockaddr *name;
4322 t_uscalar_t namelen;
4323 int dontroute;
4324 int flags;
4325 sotpi_info_t *sti = SOTOTPI(so);
4326
4327 dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4328 (void *)so, (void *)msg, msg->msg_flags,
4329 pr_state(so->so_state, so->so_mode), so->so_error));
4330
4331 if (so->so_version == SOV_STREAM) {
4332 /* The imaginary "sockmod" has been popped - act as a stream */
4333 so_update_attrs(so, SOMOD);
4334 return (strwrite(SOTOV(so), uiop, cr));
4335 }
4336
4337 mutex_enter(&so->so_lock);
4338 so_state = so->so_state;
4339
4340 if (so_state & SS_CANTSENDMORE) {
4341 mutex_exit(&so->so_lock);
4342 return (EPIPE);
4343 }
4344
4345 if (so->so_error != 0) {
4346 error = sogeterr(so, B_TRUE);
4347 if (error != 0) {
4348 mutex_exit(&so->so_lock);
4349 return (error);
4350 }
4351 }
4352
4353 name = (struct sockaddr *)msg->msg_name;
4354 namelen = msg->msg_namelen;
4355 flags = msg->msg_flags;
4356
4357 /*
4358 * Historically, this function does not validate the flags
4359 * passed in, and any errant bits are ignored. However,
4360 * we would not want any such errant flag bits accidently
4361 * being treated as one of the internal-only flags, so
4362 * clear the internal-only flag bits.
4363 */
4364 flags &= ~MSG_SENDTO_NOXLATE;
4365
4366 so_mode = so->so_mode;
4367
4368 if (name == NULL) {
4369 if (!(so_state & SS_ISCONNECTED)) {
4370 mutex_exit(&so->so_lock);
4371 if (so_mode & SM_CONNREQUIRED)
4372 return (ENOTCONN);
4373 else
4374 return (EDESTADDRREQ);
4375 }
4376 /*
4377 * This is a connected socket.
4378 */
4379 if (so_mode & SM_CONNREQUIRED) {
4380 /*
4381 * This is a connected STREAM socket,
4382 * destination not specified.
4383 */
4384 name = NULL;
4385 namelen = 0;
4386 } else {
4387 /*
4388 * Datagram send on connected socket with
4389 * the destination name not specified.
4390 * Use the peer address from connect.
4391 */
4392 if (so->so_family == AF_UNIX) {
4393 /*
4394 * Use the (internal form) address saved
4395 * in sotpi_connect. See above.
4396 */
4397 name = (void *)&sti->sti_ux_faddr;
4398 namelen = sizeof (sti->sti_ux_faddr);
4399 flags |= MSG_SENDTO_NOXLATE;
4400 } else {
4401 ASSERT(sti->sti_faddr_sa);
4402 name = sti->sti_faddr_sa;
4403 namelen = (t_uscalar_t)sti->sti_faddr_len;
4404 }
4405 }
4406 } else {
4407 /*
4408 * Sendmsg specifies a destination name
4409 */
4410 if (!(so_state & SS_ISCONNECTED) &&
4411 (so_mode & SM_CONNREQUIRED)) {
4412 /* i.e. TCP not connected */
4413 mutex_exit(&so->so_lock);
4414 return (ENOTCONN);
4415 }
4416 /*
4417 * Ignore the address on connection-oriented sockets.
4418 * Just like BSD this code does not generate an error for
4419 * TCP (a CONNREQUIRED socket) when sending to an address
4420 * passed in with sendto/sendmsg. Instead the data is
4421 * delivered on the connection as if no address had been
4422 * supplied.
4423 */
4424 if ((so_state & SS_ISCONNECTED) &&
4425 !(so_mode & SM_CONNREQUIRED)) {
4426 mutex_exit(&so->so_lock);
4427 return (EISCONN);
4428 }
4429 if (!(so_state & SS_ISBOUND)) {
4430 so_lock_single(so); /* Set SOLOCKED */
4431 error = sotpi_bind(so, NULL, 0,
4432 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4433 so_unlock_single(so, SOLOCKED);
4434 if (error) {
4435 mutex_exit(&so->so_lock);
4436 eprintsoline(so, error);
4437 return (error);
4438 }
4439 }
4440 /*
4441 * Handle delayed datagram errors. These are only queued
4442 * when the application sets SO_DGRAM_ERRIND.
4443 * Return the error if we are sending to the address
4444 * that was returned in the last T_UDERROR_IND.
4445 * If sending to some other address discard the delayed
4446 * error indication.
4447 */
4448 if (sti->sti_delayed_error) {
4449 struct T_uderror_ind *tudi;
4450 void *addr;
4451 t_uscalar_t addrlen;
4452 boolean_t match = B_FALSE;
4453
4454 ASSERT(sti->sti_eaddr_mp);
4455 error = sti->sti_delayed_error;
4456 sti->sti_delayed_error = 0;
4457 tudi =
4458 (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4459 addrlen = tudi->DEST_length;
4460 addr = sogetoff(sti->sti_eaddr_mp,
4461 tudi->DEST_offset, addrlen, 1);
4462 ASSERT(addr); /* Checked by strsock_proto */
4463 switch (so->so_family) {
4464 case AF_INET: {
4465 /* Compare just IP address and port */
4466 sin_t *sin1 = (sin_t *)name;
4467 sin_t *sin2 = (sin_t *)addr;
4468
4469 if (addrlen == sizeof (sin_t) &&
4470 namelen == addrlen &&
4471 sin1->sin_port == sin2->sin_port &&
4472 sin1->sin_addr.s_addr ==
4473 sin2->sin_addr.s_addr)
4474 match = B_TRUE;
4475 break;
4476 }
4477 case AF_INET6: {
4478 /* Compare just IP address and port. Not flow */
4479 sin6_t *sin1 = (sin6_t *)name;
4480 sin6_t *sin2 = (sin6_t *)addr;
4481
4482 if (addrlen == sizeof (sin6_t) &&
4483 namelen == addrlen &&
4484 sin1->sin6_port == sin2->sin6_port &&
4485 IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4486 &sin2->sin6_addr))
4487 match = B_TRUE;
4488 break;
4489 }
4490 case AF_UNIX:
4491 default:
4492 if (namelen == addrlen &&
4493 bcmp(name, addr, namelen) == 0)
4494 match = B_TRUE;
4495 }
4496 if (match) {
4497 freemsg(sti->sti_eaddr_mp);
4498 sti->sti_eaddr_mp = NULL;
4499 mutex_exit(&so->so_lock);
4500 #ifdef DEBUG
4501 dprintso(so, 0,
4502 ("sockfs delayed error %d for %s\n",
4503 error,
4504 pr_addr(so->so_family, name, namelen)));
4505 #endif /* DEBUG */
4506 return (error);
4507 }
4508 freemsg(sti->sti_eaddr_mp);
4509 sti->sti_eaddr_mp = NULL;
4510 }
4511 }
4512 mutex_exit(&so->so_lock);
4513
4514 dontroute = 0;
4515 if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4516 uint32_t val;
4517
4518 val = 1;
4519 error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4520 &val, (t_uscalar_t)sizeof (val), cr);
4521 if (error)
4522 return (error);
4523 dontroute = 1;
4524 }
4525
4526 if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4527 error = EOPNOTSUPP;
4528 goto done;
4529 }
4530 if (msg->msg_controllen != 0) {
4531 if (!(so_mode & SM_CONNREQUIRED)) {
4532 so_update_attrs(so, SOMOD);
4533 error = sosend_dgramcmsg(so, name, namelen, uiop,
4534 msg->msg_control, msg->msg_controllen, flags);
4535 } else {
4536 if (flags & MSG_OOB) {
4537 /* Can't generate T_EXDATA_REQ with options */
4538 error = EOPNOTSUPP;
4539 goto done;
4540 }
4541 so_update_attrs(so, SOMOD);
4542 error = sosend_svccmsg(so, uiop,
4543 !(flags & MSG_EOR),
4544 msg->msg_control, msg->msg_controllen,
4545 flags);
4546 }
4547 goto done;
4548 }
4549
4550 so_update_attrs(so, SOMOD);
4551 if (!(so_mode & SM_CONNREQUIRED)) {
4552 /*
4553 * If there is no SO_DONTROUTE to turn off return immediately
4554 * from send_dgram. This can allow tail-call optimizations.
4555 */
4556 if (!dontroute) {
4557 return (sosend_dgram(so, name, namelen, uiop, flags));
4558 }
4559 error = sosend_dgram(so, name, namelen, uiop, flags);
4560 } else {
4561 t_scalar_t prim;
4562 int sflag;
4563
4564 /* Ignore msg_name in the connected state */
4565 if (flags & MSG_OOB) {
4566 prim = T_EXDATA_REQ;
4567 /*
4568 * Send down T_EXDATA_REQ even if there is flow
4569 * control for data.
4570 */
4571 sflag = MSG_IGNFLOW;
4572 } else {
4573 if (so_mode & SM_BYTESTREAM) {
4574 /* Byte stream transport - use write */
4575 dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4576
4577 /* Send M_DATA messages */
4578 if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4579 (error = nl7c_data(so, uiop)) >= 0) {
4580 /* NL7C consumed the data */
4581 return (error);
4582 }
4583 /*
4584 * If there is no SO_DONTROUTE to turn off,
4585 * sti_direct is on, and there is no flow
4586 * control, we can take the fast path.
4587 */
4588 if (!dontroute && sti->sti_direct != 0 &&
4589 canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4590 return (sostream_direct(so, uiop,
4591 NULL, cr));
4592 }
4593 error = strwrite(SOTOV(so), uiop, cr);
4594 goto done;
4595 }
4596 prim = T_DATA_REQ;
4597 sflag = 0;
4598 }
4599 /*
4600 * If there is no SO_DONTROUTE to turn off return immediately
4601 * from sosend_svc. This can allow tail-call optimizations.
4602 */
4603 if (!dontroute)
4604 return (sosend_svc(so, uiop, prim,
4605 !(flags & MSG_EOR), sflag));
4606 error = sosend_svc(so, uiop, prim,
4607 !(flags & MSG_EOR), sflag);
4608 }
4609 ASSERT(dontroute);
4610 done:
4611 if (dontroute) {
4612 uint32_t val;
4613
4614 val = 0;
4615 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4616 &val, (t_uscalar_t)sizeof (val), cr);
4617 }
4618 return (error);
4619 }
4620
4621 /*
4622 * kstrwritemp() has very similar semantics as that of strwrite().
4623 * The main difference is it obtains mblks from the caller and also
4624 * does not do any copy as done in strwrite() from user buffers to
4625 * kernel buffers.
4626 *
4627 * Currently, this routine is used by sendfile to send data allocated
4628 * within the kernel without any copying. This interface does not use the
4629 * synchronous stream interface as synch. stream interface implies
4630 * copying.
4631 */
4632 int
4633 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4634 {
4635 struct stdata *stp;
4636 struct queue *wqp;
4637 mblk_t *newmp;
4638 char waitflag;
4639 int tempmode;
4640 int error = 0;
4641 int done = 0;
4642 struct sonode *so;
4643 boolean_t direct;
4644
4645 ASSERT(vp->v_stream);
4646 stp = vp->v_stream;
4647
4648 so = VTOSO(vp);
4649 direct = _SOTOTPI(so)->sti_direct;
4650
4651 /*
4652 * This is the sockfs direct fast path. canputnext() need
4653 * not be accurate so we don't grab the sd_lock here. If
4654 * we get flow-controlled, we grab sd_lock just before the
4655 * do..while loop below to emulate what strwrite() does.
4656 */
4657 wqp = stp->sd_wrq;
4658 if (canputnext(wqp) && direct &&
4659 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4660 return (sostream_direct(so, NULL, mp, CRED()));
4661 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4662 /* Fast check of flags before acquiring the lock */
4663 mutex_enter(&stp->sd_lock);
4664 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4665 mutex_exit(&stp->sd_lock);
4666 if (error != 0) {
4667 if (!(stp->sd_flag & STPLEX) &&
4668 (stp->sd_wput_opt & SW_SIGPIPE)) {
4669 error = EPIPE;
4670 }
4671 return (error);
4672 }
4673 }
4674
4675 waitflag = WRITEWAIT;
4676 if (stp->sd_flag & OLDNDELAY)
4677 tempmode = fmode & ~FNDELAY;
4678 else
4679 tempmode = fmode;
4680
4681 mutex_enter(&stp->sd_lock);
4682 do {
4683 if (canputnext(wqp)) {
4684 mutex_exit(&stp->sd_lock);
4685 if (stp->sd_wputdatafunc != NULL) {
4686 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4687 NULL, NULL, NULL);
4688 if (newmp == NULL) {
4689 /* The caller will free mp */
4690 return (ECOMM);
4691 }
4692 mp = newmp;
4693 }
4694 putnext(wqp, mp);
4695 return (0);
4696 }
4697 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4698 &done);
4699 } while (error == 0 && !done);
4700
4701 mutex_exit(&stp->sd_lock);
4702 /*
4703 * EAGAIN tells the application to try again. ENOMEM
4704 * is returned only if the memory allocation size
4705 * exceeds the physical limits of the system. ENOMEM
4706 * can't be true here.
4707 */
4708 if (error == ENOMEM)
4709 error = EAGAIN;
4710 return (error);
4711 }
4712
4713 /* ARGSUSED */
4714 static int
4715 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4716 struct cred *cr, mblk_t **mpp)
4717 {
4718 int error;
4719
4720 switch (so->so_family) {
4721 case AF_INET:
4722 case AF_INET6:
4723 case AF_UNIX:
4724 break;
4725 default:
4726 return (EAFNOSUPPORT);
4727
4728 }
4729
4730 if (so->so_state & SS_CANTSENDMORE)
4731 return (EPIPE);
4732
4733 if (so->so_type != SOCK_STREAM)
4734 return (EOPNOTSUPP);
4735
4736 if ((so->so_state & SS_ISCONNECTED) == 0)
4737 return (ENOTCONN);
4738
4739 error = kstrwritemp(so->so_vnode, *mpp, fflag);
4740 if (error == 0)
4741 *mpp = NULL;
4742 return (error);
4743 }
4744
4745 /*
4746 * Sending data on a datagram socket.
4747 * Assumes caller has verified that SS_ISBOUND etc. are set.
4748 */
4749 /* ARGSUSED */
4750 static int
4751 sodgram_direct(struct sonode *so, struct sockaddr *name,
4752 socklen_t namelen, struct uio *uiop, int flags)
4753 {
4754 struct T_unitdata_req tudr;
4755 mblk_t *mp = NULL;
4756 int error = 0;
4757 void *addr;
4758 socklen_t addrlen;
4759 ssize_t len;
4760 struct stdata *stp = SOTOV(so)->v_stream;
4761 int so_state;
4762 queue_t *udp_wq;
4763 boolean_t connected;
4764 mblk_t *mpdata = NULL;
4765 sotpi_info_t *sti = SOTOTPI(so);
4766 uint32_t auditing = AU_AUDITING();
4767
4768 ASSERT(name != NULL && namelen != 0);
4769 ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4770 ASSERT(!(so->so_mode & SM_EXDATA));
4771 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4772 ASSERT(SOTOV(so)->v_type == VSOCK);
4773
4774 /* Caller checked for proper length */
4775 len = uiop->uio_resid;
4776 ASSERT(len <= sti->sti_tidu_size);
4777
4778 /* Length and family checks have been done by caller */
4779 ASSERT(name->sa_family == so->so_family);
4780 ASSERT(so->so_family == AF_INET ||
4781 (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4782 ASSERT(so->so_family == AF_INET6 ||
4783 (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4784
4785 addr = name;
4786 addrlen = namelen;
4787
4788 if (stp->sd_sidp != NULL &&
4789 (error = straccess(stp, JCWRITE)) != 0)
4790 goto done;
4791
4792 so_state = so->so_state;
4793
4794 connected = so_state & SS_ISCONNECTED;
4795 if (!connected) {
4796 tudr.PRIM_type = T_UNITDATA_REQ;
4797 tudr.DEST_length = addrlen;
4798 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4799 tudr.OPT_length = 0;
4800 tudr.OPT_offset = 0;
4801
4802 mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4803 _ALLOC_INTR, CRED());
4804 if (mp == NULL) {
4805 /*
4806 * Caught a signal waiting for memory.
4807 * Let send* return EINTR.
4808 */
4809 error = EINTR;
4810 goto done;
4811 }
4812 }
4813
4814 /*
4815 * For UDP we don't break up the copyin into smaller pieces
4816 * as in the TCP case. That means if ENOMEM is returned by
4817 * mcopyinuio() then the uio vector has not been modified at
4818 * all and we fallback to either strwrite() or kstrputmsg()
4819 * below. Note also that we never generate priority messages
4820 * from here.
4821 */
4822 udp_wq = stp->sd_wrq->q_next;
4823 if (canput(udp_wq) &&
4824 (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4825 ASSERT(DB_TYPE(mpdata) == M_DATA);
4826 ASSERT(uiop->uio_resid == 0);
4827 if (!connected)
4828 linkb(mp, mpdata);
4829 else
4830 mp = mpdata;
4831 if (auditing)
4832 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4833
4834 /* Always returns 0... */
4835 return (udp_wput(udp_wq, mp));
4836 }
4837
4838 ASSERT(mpdata == NULL);
4839 if (error != 0 && error != ENOMEM) {
4840 freemsg(mp);
4841 return (error);
4842 }
4843
4844 /*
4845 * For connected, let strwrite() handle the blocking case.
4846 * Otherwise we fall thru and use kstrputmsg().
4847 */
4848 if (connected)
4849 return (strwrite(SOTOV(so), uiop, CRED()));
4850
4851 if (auditing)
4852 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4853
4854 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4855 done:
4856 #ifdef SOCK_DEBUG
4857 if (error != 0) {
4858 eprintsoline(so, error);
4859 }
4860 #endif /* SOCK_DEBUG */
4861 return (error);
4862 }
4863
4864 int
4865 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4866 {
4867 struct stdata *stp = SOTOV(so)->v_stream;
4868 ssize_t iosize, rmax, maxblk;
4869 queue_t *tcp_wq = stp->sd_wrq->q_next;
4870 mblk_t *newmp;
4871 int error = 0, wflag = 0;
4872
4873 ASSERT(so->so_mode & SM_BYTESTREAM);
4874 ASSERT(SOTOV(so)->v_type == VSOCK);
4875
4876 if (stp->sd_sidp != NULL &&
4877 (error = straccess(stp, JCWRITE)) != 0)
4878 return (error);
4879
4880 if (uiop == NULL) {
4881 /*
4882 * kstrwritemp() should have checked sd_flag and
4883 * flow-control before coming here. If we end up
4884 * here it means that we can simply pass down the
4885 * data to tcp.
4886 */
4887 ASSERT(mp != NULL);
4888 if (stp->sd_wputdatafunc != NULL) {
4889 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4890 NULL, NULL, NULL);
4891 if (newmp == NULL) {
4892 /* The caller will free mp */
4893 return (ECOMM);
4894 }
4895 mp = newmp;
4896 }
4897 /* Always returns 0... */
4898 return (tcp_wput(tcp_wq, mp));
4899 }
4900
4901 /* Fallback to strwrite() to do proper error handling */
4902 if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4903 return (strwrite(SOTOV(so), uiop, cr));
4904
4905 rmax = stp->sd_qn_maxpsz;
4906 ASSERT(rmax >= 0 || rmax == INFPSZ);
4907 if (rmax == 0 || uiop->uio_resid <= 0)
4908 return (0);
4909
4910 if (rmax == INFPSZ)
4911 rmax = uiop->uio_resid;
4912
4913 maxblk = stp->sd_maxblk;
4914
4915 for (;;) {
4916 iosize = MIN(uiop->uio_resid, rmax);
4917
4918 mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4919 if (mp == NULL) {
4920 /*
4921 * Fallback to strwrite() for ENOMEM; if this
4922 * is our first time in this routine and the uio
4923 * vector has not been modified, we will end up
4924 * calling strwrite() without any flag set.
4925 */
4926 if (error == ENOMEM)
4927 goto slow_send;
4928 else
4929 return (error);
4930 }
4931 ASSERT(uiop->uio_resid >= 0);
4932 /*
4933 * If mp is non-NULL and ENOMEM is set, it means that
4934 * mcopyinuio() was able to break down some of the user
4935 * data into one or more mblks. Send the partial data
4936 * to tcp and let the rest be handled in strwrite().
4937 */
4938 ASSERT(error == 0 || error == ENOMEM);
4939 if (stp->sd_wputdatafunc != NULL) {
4940 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4941 NULL, NULL, NULL);
4942 if (newmp == NULL) {
4943 /* The caller will free mp */
4944 return (ECOMM);
4945 }
4946 mp = newmp;
4947 }
4948 (void) tcp_wput(tcp_wq, mp); /* Always returns 0 anyway. */
4949
4950 wflag |= NOINTR;
4951
4952 if (uiop->uio_resid == 0) { /* No more data; we're done */
4953 ASSERT(error == 0);
4954 break;
4955 } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4956 (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4957 slow_send:
4958 /*
4959 * We were able to send down partial data using
4960 * the direct call interface, but are now relying
4961 * on strwrite() to handle the non-fastpath cases.
4962 * If the socket is blocking we will sleep in
4963 * strwaitq() until write is permitted, otherwise,
4964 * we will need to return the amount of bytes
4965 * written so far back to the app. This is the
4966 * reason why we pass NOINTR flag to strwrite()
4967 * for non-blocking socket, because we don't want
4968 * to return EAGAIN when portion of the user data
4969 * has actually been sent down.
4970 */
4971 return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4972 }
4973 }
4974 return (0);
4975 }
4976
4977 /*
4978 * Update sti_faddr by asking the transport (unless AF_UNIX).
4979 */
4980 /* ARGSUSED */
4981 int
4982 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4983 boolean_t accept, struct cred *cr)
4984 {
4985 struct strbuf strbuf;
4986 int error = 0, res;
4987 void *addr;
4988 t_uscalar_t addrlen;
4989 k_sigset_t smask;
4990 sotpi_info_t *sti = SOTOTPI(so);
4991 vnode_t *vn;
4992
4993 dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4994 (void *)so, pr_state(so->so_state, so->so_mode)));
4995
4996 ASSERT(*namelen > 0);
4997 mutex_enter(&so->so_lock);
4998 so_lock_single(so); /* Set SOLOCKED */
4999 vn = SOTOV(so);
5000 if (SOTPI_VN_NOSTREAM(vn)) {
5001 error = EBADF;
5002 goto done;
5003 }
5004
5005 if (accept) {
5006 bcopy(sti->sti_faddr_sa, name,
5007 MIN(*namelen, sti->sti_faddr_len));
5008 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
5009 goto done;
5010 }
5011
5012 if (!(so->so_state & SS_ISCONNECTED)) {
5013 error = ENOTCONN;
5014 goto done;
5015 }
5016 /* Added this check for X/Open */
5017 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5018 error = EINVAL;
5019 if (xnet_check_print) {
5020 printf("sockfs: X/Open getpeername check => EINVAL\n");
5021 }
5022 goto done;
5023 }
5024
5025 if (sti->sti_faddr_valid) {
5026 bcopy(sti->sti_faddr_sa, name,
5027 MIN(*namelen, sti->sti_faddr_len));
5028 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
5029 goto done;
5030 }
5031
5032 #ifdef DEBUG
5033 dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
5034 pr_addr(so->so_family, sti->sti_faddr_sa,
5035 (t_uscalar_t)sti->sti_faddr_len)));
5036 #endif /* DEBUG */
5037
5038 if (so->so_family == AF_UNIX) {
5039 /* Transport has different name space - return local info */
5040 if (sti->sti_faddr_noxlate)
5041 *namelen = 0;
5042 error = 0;
5043 goto done;
5044 }
5045
5046 ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
5047
5048 ASSERT(sti->sti_faddr_sa);
5049 /* Allocate local buffer to use with ioctl */
5050 addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
5051 mutex_exit(&so->so_lock);
5052 addr = kmem_alloc(addrlen, KM_SLEEP);
5053
5054 /*
5055 * Issue TI_GETPEERNAME with signals masked.
5056 * Put the result in sti_faddr_sa so that getpeername works after
5057 * a shutdown(output).
5058 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5059 * back to the socket.
5060 */
5061 strbuf.buf = addr;
5062 strbuf.maxlen = addrlen;
5063 strbuf.len = 0;
5064
5065 sigintr(&smask, 0);
5066 res = 0;
5067 ASSERT(cr);
5068 error = strioctl(vn, TI_GETPEERNAME, (intptr_t)&strbuf,
5069 0, K_TO_K, cr, &res);
5070 sigunintr(&smask);
5071
5072 mutex_enter(&so->so_lock);
5073 /*
5074 * If there is an error record the error in so_error put don't fail
5075 * the getpeername. Instead fallback on the recorded
5076 * sti->sti_faddr_sa.
5077 */
5078 if (error) {
5079 /*
5080 * Various stream head errors can be returned to the ioctl.
5081 * However, it is impossible to determine which ones of
5082 * these are really socket level errors that were incorrectly
5083 * consumed by the ioctl. Thus this code silently ignores the
5084 * error - to code explicitly does not reinstate the error
5085 * using soseterror().
5086 * Experiments have shows that at least this set of
5087 * errors are reported and should not be reinstated on the
5088 * socket:
5089 * EINVAL E.g. if an I_LINK was in effect when
5090 * getpeername was called.
5091 * EPIPE The ioctl error semantics prefer the write
5092 * side error over the read side error.
5093 * ENOTCONN The transport just got disconnected but
5094 * sockfs had not yet seen the T_DISCON_IND
5095 * when issuing the ioctl.
5096 */
5097 error = 0;
5098 } else if (res == 0 && strbuf.len > 0 &&
5099 (so->so_state & SS_ISCONNECTED)) {
5100 ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
5101 sti->sti_faddr_len = (socklen_t)strbuf.len;
5102 bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
5103 sti->sti_faddr_valid = 1;
5104
5105 bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
5106 *namelen = sti->sti_faddr_len;
5107 }
5108 kmem_free(addr, addrlen);
5109 #ifdef DEBUG
5110 dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
5111 pr_addr(so->so_family, sti->sti_faddr_sa,
5112 (t_uscalar_t)sti->sti_faddr_len)));
5113 #endif /* DEBUG */
5114 done:
5115 so_unlock_single(so, SOLOCKED);
5116 mutex_exit(&so->so_lock);
5117 return (error);
5118 }
5119
5120 /*
5121 * Update sti_laddr by asking the transport (unless AF_UNIX).
5122 */
5123 int
5124 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5125 struct cred *cr)
5126 {
5127 struct strbuf strbuf;
5128 int error = 0, res;
5129 void *addr;
5130 t_uscalar_t addrlen;
5131 k_sigset_t smask;
5132 sotpi_info_t *sti = SOTOTPI(so);
5133 vnode_t *vn;
5134
5135 dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5136 (void *)so, pr_state(so->so_state, so->so_mode)));
5137
5138 ASSERT(*namelen > 0);
5139 mutex_enter(&so->so_lock);
5140 so_lock_single(so); /* Set SOLOCKED */
5141 vn = SOTOV(so);
5142 if (SOTPI_VN_NOSTREAM(vn)) {
5143 error = EBADF;
5144 goto done;
5145 }
5146
5147 #ifdef DEBUG
5148
5149 dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5150 pr_addr(so->so_family, sti->sti_laddr_sa,
5151 (t_uscalar_t)sti->sti_laddr_len)));
5152 #endif /* DEBUG */
5153 if (sti->sti_laddr_valid) {
5154 bcopy(sti->sti_laddr_sa, name,
5155 MIN(*namelen, sti->sti_laddr_len));
5156 *namelen = sti->sti_laddr_len;
5157 goto done;
5158 }
5159
5160 if (so->so_family == AF_UNIX) {
5161 /*
5162 * Transport has different name space - return local info. If we
5163 * have enough space, let consumers know the family.
5164 */
5165 if (*namelen >= sizeof (sa_family_t)) {
5166 name->sa_family = AF_UNIX;
5167 *namelen = sizeof (sa_family_t);
5168 } else {
5169 *namelen = 0;
5170 }
5171 error = 0;
5172 goto done;
5173 }
5174 if (!(so->so_state & SS_ISBOUND)) {
5175 /* If not bound, then nothing to return. */
5176 error = 0;
5177 goto done;
5178 }
5179
5180 /* Allocate local buffer to use with ioctl */
5181 addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5182 mutex_exit(&so->so_lock);
5183 addr = kmem_alloc(addrlen, KM_SLEEP);
5184
5185 /*
5186 * Issue TI_GETMYNAME with signals masked.
5187 * Put the result in sti_laddr_sa so that getsockname works after
5188 * a shutdown(output).
5189 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5190 * back to the socket.
5191 */
5192 strbuf.buf = addr;
5193 strbuf.maxlen = addrlen;
5194 strbuf.len = 0;
5195
5196 sigintr(&smask, 0);
5197 res = 0;
5198 ASSERT(cr);
5199 error = strioctl(vn, TI_GETMYNAME, (intptr_t)&strbuf,
5200 0, K_TO_K, cr, &res);
5201 sigunintr(&smask);
5202
5203 mutex_enter(&so->so_lock);
5204 /*
5205 * If there is an error record the error in so_error put don't fail
5206 * the getsockname. Instead fallback on the recorded
5207 * sti->sti_laddr_sa.
5208 */
5209 if (error) {
5210 /*
5211 * Various stream head errors can be returned to the ioctl.
5212 * However, it is impossible to determine which ones of
5213 * these are really socket level errors that were incorrectly
5214 * consumed by the ioctl. Thus this code silently ignores the
5215 * error - to code explicitly does not reinstate the error
5216 * using soseterror().
5217 * Experiments have shows that at least this set of
5218 * errors are reported and should not be reinstated on the
5219 * socket:
5220 * EINVAL E.g. if an I_LINK was in effect when
5221 * getsockname was called.
5222 * EPIPE The ioctl error semantics prefer the write
5223 * side error over the read side error.
5224 */
5225 error = 0;
5226 } else if (res == 0 && strbuf.len > 0 &&
5227 (so->so_state & SS_ISBOUND)) {
5228 ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5229 sti->sti_laddr_len = (socklen_t)strbuf.len;
5230 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5231 sti->sti_laddr_valid = 1;
5232
5233 bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5234 *namelen = sti->sti_laddr_len;
5235 }
5236 kmem_free(addr, addrlen);
5237 #ifdef DEBUG
5238 dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5239 pr_addr(so->so_family, sti->sti_laddr_sa,
5240 (t_uscalar_t)sti->sti_laddr_len)));
5241 #endif /* DEBUG */
5242 done:
5243 so_unlock_single(so, SOLOCKED);
5244 mutex_exit(&so->so_lock);
5245 return (error);
5246 }
5247
5248 /*
5249 * Get socket options. For SOL_SOCKET options some options are handled
5250 * by the sockfs while others use the value recorded in the sonode as a
5251 * fallback should the T_SVR4_OPTMGMT_REQ fail.
5252 *
5253 * On the return most *optlenp bytes are copied to optval.
5254 */
5255 /* ARGSUSED */
5256 int
5257 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5258 void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5259 {
5260 struct T_optmgmt_req optmgmt_req;
5261 struct T_optmgmt_ack *optmgmt_ack;
5262 struct opthdr oh;
5263 struct opthdr *opt_res;
5264 mblk_t *mp = NULL;
5265 int error = 0;
5266 void *option = NULL; /* Set if fallback value */
5267 t_uscalar_t maxlen = *optlenp;
5268 t_uscalar_t len;
5269 uint32_t value;
5270 struct timeval tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5271 struct timeval32 tmo_val32;
5272 struct so_snd_bufinfo snd_bufinfo; /* used for zero copy */
5273 vnode_t *vn;
5274
5275 dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5276 (void *)so, level, option_name, optval, (void *)optlenp,
5277 pr_state(so->so_state, so->so_mode)));
5278
5279 mutex_enter(&so->so_lock);
5280 so_lock_single(so); /* Set SOLOCKED */
5281 vn = SOTOV(so);
5282 if (SOTPI_VN_NOSTREAM(vn)) {
5283 error = EBADF;
5284 eprintsoline(so, error);
5285 goto done2;
5286 }
5287
5288 /*
5289 * Check for SOL_SOCKET options.
5290 * Certain SOL_SOCKET options are returned directly whereas
5291 * others only provide a default (fallback) value should
5292 * the T_SVR4_OPTMGMT_REQ fail.
5293 */
5294 if (level == SOL_SOCKET) {
5295 /* Check parameters */
5296 switch (option_name) {
5297 case SO_TYPE:
5298 case SO_ERROR:
5299 case SO_DEBUG:
5300 case SO_ACCEPTCONN:
5301 case SO_REUSEADDR:
5302 case SO_KEEPALIVE:
5303 case SO_DONTROUTE:
5304 case SO_BROADCAST:
5305 case SO_USELOOPBACK:
5306 case SO_OOBINLINE:
5307 case SO_SNDBUF:
5308 case SO_RCVBUF:
5309 #ifdef notyet
5310 case SO_SNDLOWAT:
5311 case SO_RCVLOWAT:
5312 #endif /* notyet */
5313 case SO_DOMAIN:
5314 case SO_DGRAM_ERRIND:
5315 if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5316 error = EINVAL;
5317 eprintsoline(so, error);
5318 goto done2;
5319 }
5320 break;
5321 case SO_RCVTIMEO:
5322 case SO_SNDTIMEO:
5323 if (get_udatamodel() == DATAMODEL_NONE ||
5324 get_udatamodel() == DATAMODEL_NATIVE) {
5325 if (maxlen < sizeof (struct timeval)) {
5326 error = EINVAL;
5327 eprintsoline(so, error);
5328 goto done2;
5329 }
5330 } else {
5331 if (maxlen < sizeof (struct timeval32)) {
5332 error = EINVAL;
5333 eprintsoline(so, error);
5334 goto done2;
5335 }
5336
5337 }
5338 break;
5339 case SO_LINGER:
5340 if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5341 error = EINVAL;
5342 eprintsoline(so, error);
5343 goto done2;
5344 }
5345 break;
5346 case SO_SND_BUFINFO:
5347 if (maxlen < (t_uscalar_t)
5348 sizeof (struct so_snd_bufinfo)) {
5349 error = EINVAL;
5350 eprintsoline(so, error);
5351 goto done2;
5352 }
5353 break;
5354 }
5355
5356 len = (t_uscalar_t)sizeof (uint32_t); /* Default */
5357
5358 switch (option_name) {
5359 case SO_TYPE:
5360 value = so->so_type;
5361 option = &value;
5362 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5363
5364 case SO_ERROR:
5365 value = sogeterr(so, B_TRUE);
5366 option = &value;
5367 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5368
5369 case SO_ACCEPTCONN:
5370 if (so->so_state & SS_ACCEPTCONN)
5371 value = SO_ACCEPTCONN;
5372 else
5373 value = 0;
5374 #ifdef DEBUG
5375 if (value) {
5376 dprintso(so, 1,
5377 ("sotpi_getsockopt: 0x%x is set\n",
5378 option_name));
5379 } else {
5380 dprintso(so, 1,
5381 ("sotpi_getsockopt: 0x%x not set\n",
5382 option_name));
5383 }
5384 #endif /* DEBUG */
5385 option = &value;
5386 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5387
5388 case SO_DEBUG:
5389 case SO_REUSEADDR:
5390 case SO_KEEPALIVE:
5391 case SO_DONTROUTE:
5392 case SO_BROADCAST:
5393 case SO_USELOOPBACK:
5394 case SO_OOBINLINE:
5395 case SO_DGRAM_ERRIND:
5396 value = (so->so_options & option_name);
5397 #ifdef DEBUG
5398 if (value) {
5399 dprintso(so, 1,
5400 ("sotpi_getsockopt: 0x%x is set\n",
5401 option_name));
5402 } else {
5403 dprintso(so, 1,
5404 ("sotpi_getsockopt: 0x%x not set\n",
5405 option_name));
5406 }
5407 #endif /* DEBUG */
5408 option = &value;
5409 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5410
5411 /*
5412 * The following options are only returned by sockfs when the
5413 * T_SVR4_OPTMGMT_REQ fails.
5414 */
5415 case SO_LINGER:
5416 option = &so->so_linger;
5417 len = (t_uscalar_t)sizeof (struct linger);
5418 break;
5419 case SO_SNDBUF: {
5420 ssize_t lvalue;
5421
5422 /*
5423 * If the option has not been set then get a default
5424 * value from the read queue. This value is
5425 * returned if the transport fails
5426 * the T_SVR4_OPTMGMT_REQ.
5427 */
5428 lvalue = so->so_sndbuf;
5429 if (lvalue == 0) {
5430 mutex_exit(&so->so_lock);
5431 (void) strqget(strvp2wq(vn)->q_next,
5432 QHIWAT, 0, &lvalue);
5433 mutex_enter(&so->so_lock);
5434 dprintso(so, 1,
5435 ("got SO_SNDBUF %ld from q\n", lvalue));
5436 }
5437 value = (int)lvalue;
5438 option = &value;
5439 len = (t_uscalar_t)sizeof (so->so_sndbuf);
5440 break;
5441 }
5442 case SO_RCVBUF: {
5443 ssize_t lvalue;
5444
5445 /*
5446 * If the option has not been set then get a default
5447 * value from the read queue. This value is
5448 * returned if the transport fails
5449 * the T_SVR4_OPTMGMT_REQ.
5450 *
5451 * XXX If SO_RCVBUF has been set and this is an
5452 * XPG 4.2 application then do not ask the transport
5453 * since the transport might adjust the value and not
5454 * return exactly what was set by the application.
5455 * For non-XPG 4.2 application we return the value
5456 * that the transport is actually using.
5457 */
5458 lvalue = so->so_rcvbuf;
5459 if (lvalue == 0) {
5460 mutex_exit(&so->so_lock);
5461 (void) strqget(RD(strvp2wq(vn)),
5462 QHIWAT, 0, &lvalue);
5463 mutex_enter(&so->so_lock);
5464 dprintso(so, 1,
5465 ("got SO_RCVBUF %ld from q\n", lvalue));
5466 } else if (flags & _SOGETSOCKOPT_XPG4_2) {
5467 value = (int)lvalue;
5468 option = &value;
5469 goto copyout; /* skip asking transport */
5470 }
5471 value = (int)lvalue;
5472 option = &value;
5473 len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5474 break;
5475 }
5476 case SO_DOMAIN:
5477 value = so->so_family;
5478 option = &value;
5479 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5480
5481 #ifdef notyet
5482 /*
5483 * We do not implement the semantics of these options
5484 * thus we shouldn't implement the options either.
5485 */
5486 case SO_SNDLOWAT:
5487 value = so->so_sndlowat;
5488 option = &value;
5489 break;
5490 case SO_RCVLOWAT:
5491 value = so->so_rcvlowat;
5492 option = &value;
5493 break;
5494 #endif /* notyet */
5495 case SO_SNDTIMEO:
5496 case SO_RCVTIMEO: {
5497 clock_t val;
5498
5499 if (option_name == SO_RCVTIMEO)
5500 val = drv_hztousec(so->so_rcvtimeo);
5501 else
5502 val = drv_hztousec(so->so_sndtimeo);
5503 tmo_val.tv_sec = val / (1000 * 1000);
5504 tmo_val.tv_usec = val % (1000 * 1000);
5505 if (get_udatamodel() == DATAMODEL_NONE ||
5506 get_udatamodel() == DATAMODEL_NATIVE) {
5507 option = &tmo_val;
5508 len = sizeof (struct timeval);
5509 } else {
5510 TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5511 option = &tmo_val32;
5512 len = sizeof (struct timeval32);
5513 }
5514 break;
5515 }
5516 case SO_SND_BUFINFO: {
5517 snd_bufinfo.sbi_wroff =
5518 (so->so_proto_props).sopp_wroff;
5519 snd_bufinfo.sbi_maxblk =
5520 (so->so_proto_props).sopp_maxblk;
5521 snd_bufinfo.sbi_maxpsz =
5522 (so->so_proto_props).sopp_maxpsz;
5523 snd_bufinfo.sbi_tail =
5524 (so->so_proto_props).sopp_tail;
5525 option = &snd_bufinfo;
5526 len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5527 break;
5528 }
5529 }
5530 }
5531
5532 mutex_exit(&so->so_lock);
5533
5534 /* Send request */
5535 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5536 optmgmt_req.MGMT_flags = T_CHECK;
5537 optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5538 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5539
5540 oh.level = level;
5541 oh.name = option_name;
5542 oh.len = maxlen;
5543
5544 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5545 &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5546 /* Let option management work in the presence of data flow control */
5547 error = kstrputmsg(vn, mp, NULL, 0, 0,
5548 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5549 mp = NULL;
5550 mutex_enter(&so->so_lock);
5551 if (error) {
5552 eprintsoline(so, error);
5553 goto done2;
5554 }
5555 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5556 (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5557 if (error) {
5558 if (option != NULL) {
5559 /* We have a fallback value */
5560 error = 0;
5561 goto copyout;
5562 }
5563 eprintsoline(so, error);
5564 goto done2;
5565 }
5566 ASSERT(mp);
5567 optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5568 opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5569 optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5570 if (opt_res == NULL) {
5571 if (option != NULL) {
5572 /* We have a fallback value */
5573 error = 0;
5574 goto copyout;
5575 }
5576 error = EPROTO;
5577 eprintsoline(so, error);
5578 goto done;
5579 }
5580 option = &opt_res[1];
5581
5582 /* check to ensure that the option is within bounds */
5583 if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5584 (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5585 if (option != NULL) {
5586 /* We have a fallback value */
5587 error = 0;
5588 goto copyout;
5589 }
5590 error = EPROTO;
5591 eprintsoline(so, error);
5592 goto done;
5593 }
5594
5595 len = opt_res->len;
5596
5597 copyout: {
5598 t_uscalar_t size = MIN(len, maxlen);
5599 bcopy(option, optval, size);
5600 bcopy(&size, optlenp, sizeof (size));
5601 }
5602 done:
5603 freemsg(mp);
5604 done2:
5605 so_unlock_single(so, SOLOCKED);
5606 mutex_exit(&so->so_lock);
5607
5608 return (error);
5609 }
5610
5611 /*
5612 * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5613 * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5614 * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5615 * setsockopt has to work even if the transport does not support the option.
5616 */
5617 /* ARGSUSED */
5618 int
5619 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5620 const void *optval, t_uscalar_t optlen, struct cred *cr)
5621 {
5622 struct T_optmgmt_req optmgmt_req;
5623 struct opthdr oh;
5624 mblk_t *mp;
5625 int error = 0;
5626 boolean_t handled = B_FALSE;
5627
5628 dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5629 (void *)so, level, option_name, optval, optlen,
5630 pr_state(so->so_state, so->so_mode)));
5631
5632 /* X/Open requires this check */
5633 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5634 if (xnet_check_print)
5635 printf("sockfs: X/Open setsockopt check => EINVAL\n");
5636 return (EINVAL);
5637 }
5638
5639 mutex_enter(&so->so_lock);
5640 so_lock_single(so); /* Set SOLOCKED */
5641 mutex_exit(&so->so_lock);
5642
5643 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5644 optmgmt_req.MGMT_flags = T_NEGOTIATE;
5645 optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5646 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5647
5648 oh.level = level;
5649 oh.name = option_name;
5650 oh.len = optlen;
5651
5652 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5653 &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5654 /* Let option management work in the presence of data flow control */
5655 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5656 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5657 mp = NULL;
5658 mutex_enter(&so->so_lock);
5659 if (error) {
5660 eprintsoline(so, error);
5661 goto done2;
5662 }
5663 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5664 (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5665 if (error) {
5666 eprintsoline(so, error);
5667 goto done;
5668 }
5669 ASSERT(mp);
5670 /* No need to verify T_optmgmt_ack */
5671 freemsg(mp);
5672 done:
5673 /*
5674 * Check for SOL_SOCKET options and record their values.
5675 * If we know about a SOL_SOCKET parameter and the transport
5676 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5677 * EPROTO) we let the setsockopt succeed.
5678 */
5679 if (level == SOL_SOCKET) {
5680 /* Check parameters */
5681 switch (option_name) {
5682 case SO_DEBUG:
5683 case SO_REUSEADDR:
5684 case SO_KEEPALIVE:
5685 case SO_DONTROUTE:
5686 case SO_BROADCAST:
5687 case SO_USELOOPBACK:
5688 case SO_OOBINLINE:
5689 case SO_SNDBUF:
5690 case SO_RCVBUF:
5691 #ifdef notyet
5692 case SO_SNDLOWAT:
5693 case SO_RCVLOWAT:
5694 #endif /* notyet */
5695 case SO_DGRAM_ERRIND:
5696 if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5697 error = EINVAL;
5698 eprintsoline(so, error);
5699 goto done2;
5700 }
5701 ASSERT(optval);
5702 handled = B_TRUE;
5703 break;
5704 case SO_SNDTIMEO:
5705 case SO_RCVTIMEO:
5706 if (get_udatamodel() == DATAMODEL_NONE ||
5707 get_udatamodel() == DATAMODEL_NATIVE) {
5708 if (optlen != sizeof (struct timeval)) {
5709 error = EINVAL;
5710 eprintsoline(so, error);
5711 goto done2;
5712 }
5713 } else {
5714 if (optlen != sizeof (struct timeval32)) {
5715 error = EINVAL;
5716 eprintsoline(so, error);
5717 goto done2;
5718 }
5719 }
5720 ASSERT(optval);
5721 handled = B_TRUE;
5722 break;
5723 case SO_LINGER:
5724 if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5725 error = EINVAL;
5726 eprintsoline(so, error);
5727 goto done2;
5728 }
5729 ASSERT(optval);
5730 handled = B_TRUE;
5731 break;
5732 }
5733
5734 #define intvalue (*(int32_t *)optval)
5735
5736 switch (option_name) {
5737 case SO_TYPE:
5738 case SO_ERROR:
5739 case SO_ACCEPTCONN:
5740 /* Can't be set */
5741 error = ENOPROTOOPT;
5742 goto done2;
5743 case SO_LINGER: {
5744 struct linger *l = (struct linger *)optval;
5745
5746 so->so_linger.l_linger = l->l_linger;
5747 if (l->l_onoff) {
5748 so->so_linger.l_onoff = SO_LINGER;
5749 so->so_options |= SO_LINGER;
5750 } else {
5751 so->so_linger.l_onoff = 0;
5752 so->so_options &= ~SO_LINGER;
5753 }
5754 break;
5755 }
5756
5757 case SO_DEBUG:
5758 #ifdef SOCK_TEST
5759 if (intvalue & 2)
5760 sock_test_timelimit = 10 * hz;
5761 else
5762 sock_test_timelimit = 0;
5763
5764 if (intvalue & 4)
5765 do_useracc = 0;
5766 else
5767 do_useracc = 1;
5768 #endif /* SOCK_TEST */
5769 /* FALLTHRU */
5770 case SO_REUSEADDR:
5771 case SO_KEEPALIVE:
5772 case SO_DONTROUTE:
5773 case SO_BROADCAST:
5774 case SO_USELOOPBACK:
5775 case SO_OOBINLINE:
5776 case SO_DGRAM_ERRIND:
5777 if (intvalue != 0) {
5778 dprintso(so, 1,
5779 ("socket_setsockopt: setting 0x%x\n",
5780 option_name));
5781 so->so_options |= option_name;
5782 } else {
5783 dprintso(so, 1,
5784 ("socket_setsockopt: clearing 0x%x\n",
5785 option_name));
5786 so->so_options &= ~option_name;
5787 }
5788 break;
5789 /*
5790 * The following options are only returned by us when the
5791 * transport layer fails.
5792 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5793 * since the transport might adjust the value and not
5794 * return exactly what was set by the application.
5795 */
5796 case SO_SNDBUF:
5797 so->so_sndbuf = intvalue;
5798 break;
5799 case SO_RCVBUF:
5800 so->so_rcvbuf = intvalue;
5801 break;
5802 case SO_RCVPSH:
5803 so->so_rcv_timer_interval = intvalue;
5804 break;
5805 #ifdef notyet
5806 /*
5807 * We do not implement the semantics of these options
5808 * thus we shouldn't implement the options either.
5809 */
5810 case SO_SNDLOWAT:
5811 so->so_sndlowat = intvalue;
5812 break;
5813 case SO_RCVLOWAT:
5814 so->so_rcvlowat = intvalue;
5815 break;
5816 #endif /* notyet */
5817 case SO_SNDTIMEO:
5818 case SO_RCVTIMEO: {
5819 struct timeval tl;
5820 clock_t val;
5821
5822 if (get_udatamodel() == DATAMODEL_NONE ||
5823 get_udatamodel() == DATAMODEL_NATIVE)
5824 bcopy(&tl, (struct timeval *)optval,
5825 sizeof (struct timeval));
5826 else
5827 TIMEVAL32_TO_TIMEVAL(&tl,
5828 (struct timeval32 *)optval);
5829 val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5830 if (option_name == SO_RCVTIMEO)
5831 so->so_rcvtimeo = drv_usectohz(val);
5832 else
5833 so->so_sndtimeo = drv_usectohz(val);
5834 break;
5835 }
5836 }
5837 #undef intvalue
5838
5839 if (error) {
5840 if ((error == ENOPROTOOPT || error == EPROTO ||
5841 error == EINVAL) && handled) {
5842 dprintso(so, 1,
5843 ("setsockopt: ignoring error %d for 0x%x\n",
5844 error, option_name));
5845 error = 0;
5846 }
5847 }
5848 }
5849 done2:
5850 so_unlock_single(so, SOLOCKED);
5851 mutex_exit(&so->so_lock);
5852 return (error);
5853 }
5854
5855 /*
5856 * sotpi_close() is called when the last open reference goes away.
5857 */
5858 /* ARGSUSED */
5859 int
5860 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5861 {
5862 struct vnode *vp = SOTOV(so);
5863 dev_t dev;
5864 int error = 0;
5865 sotpi_info_t *sti = SOTOTPI(so);
5866
5867 dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5868 (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5869
5870 dev = sti->sti_dev;
5871
5872 ASSERT(STREAMSTAB(getmajor(dev)));
5873
5874 mutex_enter(&so->so_lock);
5875 so_lock_single(so); /* Set SOLOCKED */
5876
5877 ASSERT(so_verify_oobstate(so));
5878
5879 if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5880 sti->sti_nl7c_flags = 0;
5881 nl7c_close(so);
5882 }
5883
5884 if (vp->v_stream != NULL) {
5885 vnode_t *ux_vp;
5886
5887 if (so->so_family == AF_UNIX) {
5888 /* Could avoid this when CANTSENDMORE for !dgram */
5889 so_unix_close(so);
5890 }
5891
5892 mutex_exit(&so->so_lock);
5893 /*
5894 * Disassemble the linkage from the AF_UNIX underlying file
5895 * system vnode to this socket (by atomically clearing
5896 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5897 * and frees the stream head.
5898 */
5899 if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5900 ASSERT(ux_vp->v_stream);
5901 sti->sti_ux_bound_vp = NULL;
5902 vn_rele_stream(ux_vp);
5903 }
5904 error = strclose(vp, flag, cr);
5905 vp->v_stream = NULL;
5906 mutex_enter(&so->so_lock);
5907 }
5908
5909 /*
5910 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5911 */
5912 so_flush_discon_ind(so);
5913
5914 so_unlock_single(so, SOLOCKED);
5915 mutex_exit(&so->so_lock);
5916
5917 /*
5918 * Needed for STREAMs.
5919 * Decrement the device driver's reference count for streams
5920 * opened via the clone dip. The driver was held in clone_open().
5921 * The absence of clone_close() forces this asymmetry.
5922 */
5923 if (so->so_flag & SOCLONE)
5924 ddi_rele_driver(getmajor(dev));
5925
5926 return (error);
5927 }
5928
5929 static int
5930 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5931 struct cred *cr, int32_t *rvalp)
5932 {
5933 struct vnode *vp = SOTOV(so);
5934 sotpi_info_t *sti = SOTOTPI(so);
5935 int error = 0;
5936
5937 dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5938 cmd, arg, pr_state(so->so_state, so->so_mode)));
5939
5940 switch (cmd) {
5941 case SIOCSQPTR:
5942 /*
5943 * SIOCSQPTR is valid only when helper stream is created
5944 * by the protocol.
5945 */
5946 case _I_INSERT:
5947 case _I_REMOVE:
5948 /*
5949 * Since there's no compelling reason to support these ioctls
5950 * on sockets, and doing so would increase the complexity
5951 * markedly, prevent it.
5952 */
5953 return (EOPNOTSUPP);
5954
5955 case I_FIND:
5956 case I_LIST:
5957 case I_LOOK:
5958 case I_POP:
5959 case I_PUSH:
5960 /*
5961 * To prevent races and inconsistencies between the actual
5962 * state of the stream and the state according to the sonode,
5963 * we serialize all operations which modify or operate on the
5964 * list of modules on the socket's stream.
5965 */
5966 mutex_enter(&sti->sti_plumb_lock);
5967 error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5968 mutex_exit(&sti->sti_plumb_lock);
5969 return (error);
5970
5971 default:
5972 if (so->so_version != SOV_STREAM)
5973 break;
5974
5975 /*
5976 * The imaginary "sockmod" has been popped; act as a stream.
5977 */
5978 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5979 }
5980
5981 ASSERT(so->so_version != SOV_STREAM);
5982
5983 /*
5984 * Process socket-specific ioctls.
5985 */
5986 switch (cmd) {
5987 case FIONBIO: {
5988 int32_t value;
5989
5990 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5991 (mode & (int)FKIOCTL)))
5992 return (EFAULT);
5993
5994 mutex_enter(&so->so_lock);
5995 if (value) {
5996 so->so_state |= SS_NDELAY;
5997 } else {
5998 so->so_state &= ~SS_NDELAY;
5999 }
6000 mutex_exit(&so->so_lock);
6001 return (0);
6002 }
6003
6004 case FIOASYNC: {
6005 int32_t value;
6006
6007 if (so_copyin((void *)arg, &value, sizeof (int32_t),
6008 (mode & (int)FKIOCTL)))
6009 return (EFAULT);
6010
6011 mutex_enter(&so->so_lock);
6012 /*
6013 * SS_ASYNC flag not already set correctly?
6014 * (!value != !(so->so_state & SS_ASYNC))
6015 * but some engineers find that too hard to read.
6016 */
6017 if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
6018 value != 0 && (so->so_state & SS_ASYNC) == 0)
6019 error = so_flip_async(so, vp, mode, cr);
6020 mutex_exit(&so->so_lock);
6021 return (error);
6022 }
6023
6024 case SIOCSPGRP:
6025 case FIOSETOWN: {
6026 pid_t pgrp;
6027
6028 if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
6029 (mode & (int)FKIOCTL)))
6030 return (EFAULT);
6031
6032 mutex_enter(&so->so_lock);
6033 dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
6034 /* Any change? */
6035 if (pgrp != so->so_pgrp)
6036 error = so_set_siggrp(so, vp, pgrp, mode, cr);
6037 mutex_exit(&so->so_lock);
6038 return (error);
6039 }
6040 case SIOCGPGRP:
6041 case FIOGETOWN:
6042 if (so_copyout(&so->so_pgrp, (void *)arg,
6043 sizeof (pid_t), (mode & (int)FKIOCTL)))
6044 return (EFAULT);
6045 return (0);
6046
6047 case SIOCATMARK: {
6048 int retval;
6049 uint_t so_state;
6050
6051 /*
6052 * strwaitmark has a finite timeout after which it
6053 * returns -1 if the mark state is undetermined.
6054 * In order to avoid any race between the mark state
6055 * in sockfs and the mark state in the stream head this
6056 * routine loops until the mark state can be determined
6057 * (or the urgent data indication has been removed by some
6058 * other thread).
6059 */
6060 do {
6061 mutex_enter(&so->so_lock);
6062 so_state = so->so_state;
6063 mutex_exit(&so->so_lock);
6064 if (so_state & SS_RCVATMARK) {
6065 retval = 1;
6066 } else if (!(so_state & SS_OOBPEND)) {
6067 /*
6068 * No SIGURG has been generated -- there is no
6069 * pending or present urgent data. Thus can't
6070 * possibly be at the mark.
6071 */
6072 retval = 0;
6073 } else {
6074 /*
6075 * Have the stream head wait until there is
6076 * either some messages on the read queue, or
6077 * STRATMARK or STRNOTATMARK gets set. The
6078 * STRNOTATMARK flag is used so that the
6079 * transport can send up a MSGNOTMARKNEXT
6080 * M_DATA to indicate that it is not
6081 * at the mark and additional data is not about
6082 * to be send upstream.
6083 *
6084 * If the mark state is undetermined this will
6085 * return -1 and we will loop rechecking the
6086 * socket state.
6087 */
6088 retval = strwaitmark(vp);
6089 }
6090 } while (retval == -1);
6091
6092 if (so_copyout(&retval, (void *)arg, sizeof (int),
6093 (mode & (int)FKIOCTL)))
6094 return (EFAULT);
6095 return (0);
6096 }
6097
6098 case I_FDINSERT:
6099 case I_SENDFD:
6100 case I_RECVFD:
6101 case I_ATMARK:
6102 case _SIOCSOCKFALLBACK:
6103 /*
6104 * These ioctls do not apply to sockets. I_FDINSERT can be
6105 * used to send M_PROTO messages without modifying the socket
6106 * state. I_SENDFD/RECVFD should not be used for socket file
6107 * descriptor passing since they assume a twisted stream.
6108 * SIOCATMARK must be used instead of I_ATMARK.
6109 *
6110 * _SIOCSOCKFALLBACK from an application should never be
6111 * processed. It is only generated by socktpi_open() or
6112 * in response to I_POP or I_PUSH.
6113 */
6114 #ifdef DEBUG
6115 zcmn_err(getzoneid(), CE_WARN,
6116 "Unsupported STREAMS ioctl 0x%x on socket. "
6117 "Pid = %d\n", cmd, curproc->p_pid);
6118 #endif /* DEBUG */
6119 return (EOPNOTSUPP);
6120
6121 case _I_GETPEERCRED:
6122 if ((mode & FKIOCTL) == 0)
6123 return (EINVAL);
6124
6125 mutex_enter(&so->so_lock);
6126 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6127 error = ENOTSUP;
6128 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
6129 error = ENOTCONN;
6130 } else if (so->so_peercred != NULL) {
6131 k_peercred_t *kp = (k_peercred_t *)arg;
6132 kp->pc_cr = so->so_peercred;
6133 kp->pc_cpid = so->so_cpid;
6134 crhold(so->so_peercred);
6135 } else {
6136 error = EINVAL;
6137 }
6138 mutex_exit(&so->so_lock);
6139 return (error);
6140
6141 default:
6142 /*
6143 * Do the higher-order bits of the ioctl cmd indicate
6144 * that it is an I_* streams ioctl?
6145 */
6146 if ((cmd & 0xffffff00U) == STR &&
6147 so->so_version == SOV_SOCKBSD) {
6148 #ifdef DEBUG
6149 zcmn_err(getzoneid(), CE_WARN,
6150 "Unsupported STREAMS ioctl 0x%x on socket. "
6151 "Pid = %d\n", cmd, curproc->p_pid);
6152 #endif /* DEBUG */
6153 return (EOPNOTSUPP);
6154 }
6155 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6156 }
6157 }
6158
6159 /*
6160 * Handle plumbing-related ioctls.
6161 */
6162 static int
6163 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6164 struct cred *cr, int32_t *rvalp)
6165 {
6166 static const char sockmod_name[] = "sockmod";
6167 struct sonode *so = VTOSO(vp);
6168 char mname[FMNAMESZ + 1];
6169 int error;
6170 sotpi_info_t *sti = SOTOTPI(so);
6171
6172 ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6173
6174 if (so->so_version == SOV_SOCKBSD)
6175 return (EOPNOTSUPP);
6176
6177 if (so->so_version == SOV_STREAM) {
6178 /*
6179 * The imaginary "sockmod" has been popped - act as a stream.
6180 * If this is a push of sockmod then change back to a socket.
6181 */
6182 if (cmd == I_PUSH) {
6183 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6184 (void *)arg, mname, sizeof (mname), NULL);
6185
6186 if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6187 dprintso(so, 0, ("socktpi_ioctl: going to "
6188 "socket version\n"));
6189 so_stream2sock(so);
6190 return (0);
6191 }
6192 }
6193 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6194 }
6195
6196 switch (cmd) {
6197 case I_PUSH:
6198 if (sti->sti_direct) {
6199 mutex_enter(&so->so_lock);
6200 so_lock_single(so);
6201 mutex_exit(&so->so_lock);
6202
6203 error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6204 cr, rvalp);
6205
6206 mutex_enter(&so->so_lock);
6207 if (error == 0)
6208 sti->sti_direct = 0;
6209 so_unlock_single(so, SOLOCKED);
6210 mutex_exit(&so->so_lock);
6211
6212 if (error != 0)
6213 return (error);
6214 }
6215
6216 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6217 if (error == 0)
6218 sti->sti_pushcnt++;
6219 return (error);
6220
6221 case I_POP:
6222 if (sti->sti_pushcnt == 0) {
6223 /* Emulate sockmod being popped */
6224 dprintso(so, 0,
6225 ("socktpi_ioctl: going to STREAMS version\n"));
6226 return (so_sock2stream(so));
6227 }
6228
6229 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6230 if (error == 0)
6231 sti->sti_pushcnt--;
6232 return (error);
6233
6234 case I_LIST: {
6235 struct str_mlist *kmlistp, *umlistp;
6236 struct str_list kstrlist;
6237 ssize_t kstrlistsize;
6238 int i, nmods;
6239
6240 STRUCT_DECL(str_list, ustrlist);
6241 STRUCT_INIT(ustrlist, mode);
6242
6243 if (arg == 0) {
6244 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6245 if (error == 0)
6246 (*rvalp)++; /* Add one for sockmod */
6247 return (error);
6248 }
6249
6250 error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6251 STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6252 if (error != 0)
6253 return (error);
6254
6255 nmods = STRUCT_FGET(ustrlist, sl_nmods);
6256 if (nmods <= 0)
6257 return (EINVAL);
6258 /*
6259 * Ceiling nmods at nstrpush to prevent someone from
6260 * maliciously consuming lots of kernel memory.
6261 */
6262 nmods = MIN(nmods, nstrpush);
6263
6264 kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6265 kstrlist.sl_nmods = nmods;
6266 kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6267
6268 error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6269 cr, rvalp);
6270 if (error != 0)
6271 goto done;
6272
6273 /*
6274 * Considering the module list as a 0-based array of sl_nmods
6275 * modules, sockmod should conceptually exist at slot
6276 * sti_pushcnt. Insert sockmod at this location by sliding all
6277 * of the module names after so_pushcnt over by one. We know
6278 * that there will be room to do this since we allocated
6279 * sl_modlist with an additional slot.
6280 */
6281 for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6282 kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6283
6284 (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6285 kstrlist.sl_nmods++;
6286
6287 /*
6288 * Copy all of the entries out to ustrlist.
6289 */
6290 kmlistp = kstrlist.sl_modlist;
6291 umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6292 for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6293 error = so_copyout(kmlistp++, umlistp++,
6294 sizeof (struct str_mlist), mode & FKIOCTL);
6295 if (error != 0)
6296 goto done;
6297 }
6298
6299 error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6300 mode & FKIOCTL);
6301 if (error == 0)
6302 *rvalp = 0;
6303 done:
6304 kmem_free(kstrlist.sl_modlist, kstrlistsize);
6305 return (error);
6306 }
6307 case I_LOOK:
6308 if (sti->sti_pushcnt == 0) {
6309 return (so_copyout(sockmod_name, (void *)arg,
6310 sizeof (sockmod_name), mode & FKIOCTL));
6311 }
6312 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6313
6314 case I_FIND:
6315 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6316 if (error && error != EINVAL)
6317 return (error);
6318
6319 /* if not found and string was sockmod return 1 */
6320 if (*rvalp == 0 || error == EINVAL) {
6321 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6322 (void *)arg, mname, sizeof (mname), NULL);
6323 if (error == ENAMETOOLONG)
6324 error = EINVAL;
6325
6326 if (error == 0 && strcmp(mname, sockmod_name) == 0)
6327 *rvalp = 1;
6328 }
6329 return (error);
6330
6331 default:
6332 panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6333 break;
6334 }
6335
6336 return (0);
6337 }
6338
6339 /*
6340 * Wrapper around the streams poll routine that implements socket poll
6341 * semantics.
6342 * The sockfs never calls pollwakeup itself - the stream head take care
6343 * of all pollwakeups. Since sockfs never holds so_lock when calling the
6344 * stream head there can never be a deadlock due to holding so_lock across
6345 * pollwakeup and acquiring so_lock in this routine.
6346 *
6347 * However, since the performance of VOP_POLL is critical we avoid
6348 * acquiring so_lock here. This is based on two assumptions:
6349 * - The poll implementation holds locks to serialize the VOP_POLL call
6350 * and a pollwakeup for the same pollhead. This ensures that should
6351 * e.g. so_state change during a socktpi_poll call the pollwakeup
6352 * (which strsock_* and strrput conspire to issue) is issued after
6353 * the state change. Thus the pollwakeup will block until VOP_POLL has
6354 * returned and then wake up poll and have it call VOP_POLL again.
6355 * - The reading of so_state without holding so_lock does not result in
6356 * stale data that is older than the latest state change that has dropped
6357 * so_lock. This is ensured by the mutex_exit issuing the appropriate
6358 * memory barrier to force the data into the coherency domain.
6359 */
6360 static int
6361 sotpi_poll(
6362 struct sonode *so,
6363 short events,
6364 int anyyet,
6365 short *reventsp,
6366 struct pollhead **phpp)
6367 {
6368 short origevents = events;
6369 struct vnode *vp = SOTOV(so);
6370 int error;
6371 int so_state = so->so_state; /* snapshot */
6372 sotpi_info_t *sti = SOTOTPI(so);
6373
6374 dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6375 (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6376
6377 ASSERT(vp->v_type == VSOCK);
6378 ASSERT(vp->v_stream != NULL);
6379
6380 if (so->so_version == SOV_STREAM) {
6381 /* The imaginary "sockmod" has been popped - act as a stream */
6382 return (strpoll(vp->v_stream, events, anyyet,
6383 reventsp, phpp));
6384 }
6385
6386 if (!(so_state & SS_ISCONNECTED) &&
6387 (so->so_mode & SM_CONNREQUIRED)) {
6388 /* Not connected yet - turn off write side events */
6389 events &= ~(POLLOUT|POLLWRBAND);
6390 }
6391 /*
6392 * Check for errors without calling strpoll if the caller wants them.
6393 * In sockets the errors are represented as input/output events
6394 * and there is no need to ask the stream head for this information.
6395 */
6396 if (so->so_error != 0 &&
6397 ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
6398 *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6399 return (0);
6400 }
6401 /*
6402 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6403 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6404 * will not trigger a POLLIN event with POLLRDDATA set.
6405 * The handling of urgent data (causing POLLRDBAND) is done by
6406 * inspecting SS_OOBPEND below.
6407 */
6408 events |= POLLRDDATA;
6409
6410 /*
6411 * After shutdown(output) a stream head write error is set.
6412 * However, we should not return output events.
6413 */
6414 events |= POLLNOERR;
6415 error = strpoll(vp->v_stream, events, anyyet,
6416 reventsp, phpp);
6417 if (error)
6418 return (error);
6419
6420 ASSERT(!(*reventsp & POLLERR));
6421
6422 /*
6423 * Notes on T_CONN_IND handling for sockets.
6424 *
6425 * If strpoll() returned without events, SR_POLLIN is guaranteed
6426 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6427 *
6428 * Since the so_lock is not held, soqueueconnind() may have run
6429 * and a T_CONN_IND may be waiting. We now check for any queued
6430 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6431 * to ensure poll returns.
6432 *
6433 * However:
6434 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6435 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6436 * the following actions will occur; taken together they ensure the
6437 * syscall will return.
6438 *
6439 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6440 * the accept() was run on a non-blocking socket sowaitconnind()
6441 * may have already returned EWOULDBLOCK, so not be waiting to
6442 * process the message. Additionally socktpi_poll() has probably
6443 * proceeded past the sti_conn_ind_head check below.
6444 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6445 * this thread, however that could occur before poll_common()
6446 * has entered cv_wait.
6447 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6448 *
6449 * Before proceeding to cv_wait() in poll_common() for an event,
6450 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6451 * and if set, re-calls strpoll() to ensure the late arriving
6452 * T_CONN_IND is recognized, and pollsys() returns.
6453 */
6454
6455 if (sti->sti_conn_ind_head != NULL)
6456 *reventsp |= (POLLIN|POLLRDNORM) & events;
6457
6458 if (so->so_state & SS_CANTRCVMORE) {
6459 *reventsp |= POLLRDHUP & events;
6460
6461 if (so->so_state & SS_CANTSENDMORE)
6462 *reventsp |= POLLHUP;
6463 }
6464
6465 if (so->so_state & SS_OOBPEND)
6466 *reventsp |= POLLRDBAND & events;
6467
6468 if (sti->sti_nl7c_rcv_mp != NULL) {
6469 *reventsp |= (POLLIN|POLLRDNORM) & events;
6470 }
6471 if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6472 ((POLLIN|POLLRDNORM) & *reventsp)) {
6473 sti->sti_nl7c_flags |= NL7C_POLLIN;
6474 }
6475
6476 return (0);
6477 }
6478
6479 /*ARGSUSED*/
6480 static int
6481 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6482 {
6483 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6484 int error = 0;
6485
6486 error = sonode_constructor(buf, cdrarg, kmflags);
6487 if (error != 0)
6488 return (error);
6489
6490 error = i_sotpi_info_constructor(&st->st_info);
6491 if (error != 0)
6492 sonode_destructor(buf, cdrarg);
6493
6494 st->st_sonode.so_priv = &st->st_info;
6495
6496 return (error);
6497 }
6498
6499 /*ARGSUSED1*/
6500 static void
6501 socktpi_destructor(void *buf, void *cdrarg)
6502 {
6503 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6504
6505 ASSERT(st->st_sonode.so_priv == &st->st_info);
6506 st->st_sonode.so_priv = NULL;
6507
6508 i_sotpi_info_destructor(&st->st_info);
6509 sonode_destructor(buf, cdrarg);
6510 }
6511
6512 static int
6513 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6514 {
6515 int retval;
6516
6517 if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6518 struct sonode *so = (struct sonode *)buf;
6519 sotpi_info_t *sti = SOTOTPI(so);
6520
6521 mutex_enter(&socklist.sl_lock);
6522
6523 sti->sti_next_so = socklist.sl_list;
6524 sti->sti_prev_so = NULL;
6525 if (sti->sti_next_so != NULL)
6526 SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6527 socklist.sl_list = so;
6528
6529 mutex_exit(&socklist.sl_lock);
6530
6531 }
6532 return (retval);
6533 }
6534
6535 static void
6536 socktpi_unix_destructor(void *buf, void *cdrarg)
6537 {
6538 struct sonode *so = (struct sonode *)buf;
6539 sotpi_info_t *sti = SOTOTPI(so);
6540
6541 mutex_enter(&socklist.sl_lock);
6542
6543 if (sti->sti_next_so != NULL)
6544 SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6545 if (sti->sti_prev_so != NULL)
6546 SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6547 else
6548 socklist.sl_list = sti->sti_next_so;
6549
6550 mutex_exit(&socklist.sl_lock);
6551
6552 socktpi_destructor(buf, cdrarg);
6553 }
6554
6555 int
6556 socktpi_init(void)
6557 {
6558 /*
6559 * Create sonode caches. We create a special one for AF_UNIX so
6560 * that we can track them for netstat(8).
6561 */
6562 socktpi_cache = kmem_cache_create("socktpi_cache",
6563 sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6564 socktpi_destructor, NULL, NULL, NULL, 0);
6565
6566 socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6567 sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6568 socktpi_unix_destructor, NULL, NULL, NULL, 0);
6569
6570 return (0);
6571 }
6572
6573 /*
6574 * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6575 *
6576 * Caller must still update state and mode using sotpi_update_state().
6577 */
6578 int
6579 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6580 boolean_t *direct, queue_t **qp, struct cred *cr)
6581 {
6582 sotpi_info_t *sti;
6583 struct sockparams *origsp = so->so_sockparams;
6584 sock_lower_handle_t handle = so->so_proto_handle;
6585 struct stdata *stp;
6586 struct vnode *vp;
6587 queue_t *q;
6588 int error = 0;
6589
6590 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6591 SS_FALLBACK_PENDING);
6592 ASSERT(SOCK_IS_NONSTR(so));
6593
6594 *qp = NULL;
6595 *direct = B_FALSE;
6596 so->so_sockparams = newsp;
6597 /*
6598 * Allocate and initalize fields required by TPI.
6599 */
6600 (void) sotpi_info_create(so, KM_SLEEP);
6601 sotpi_info_init(so);
6602
6603 if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6604 sotpi_info_fini(so);
6605 sotpi_info_destroy(so);
6606 return (error);
6607 }
6608 ASSERT(handle == so->so_proto_handle);
6609 sti = SOTOTPI(so);
6610 if (sti->sti_direct != 0)
6611 *direct = B_TRUE;
6612
6613 /*
6614 * Keep the original sp around so we can properly dispose of the
6615 * sonode when the socket is being closed.
6616 */
6617 sti->sti_orig_sp = origsp;
6618
6619 so_basic_strinit(so); /* skips the T_CAPABILITY_REQ */
6620 so_alloc_addr(so, so->so_max_addr_len);
6621
6622 /*
6623 * If the application has done a SIOCSPGRP, make sure the
6624 * STREAM head is aware. This needs to take place before
6625 * the protocol start sending up messages. Otherwise we
6626 * might miss to generate SIGPOLL.
6627 *
6628 * It is possible that the application will receive duplicate
6629 * signals if some were already generated for either data or
6630 * connection indications.
6631 */
6632 if (so->so_pgrp != 0) {
6633 if (so_set_events(so, so->so_vnode, cr) != 0)
6634 so->so_pgrp = 0;
6635 }
6636
6637 /*
6638 * Determine which queue to use.
6639 */
6640 vp = SOTOV(so);
6641 stp = vp->v_stream;
6642 ASSERT(stp != NULL);
6643 q = stp->sd_wrq->q_next;
6644
6645 /*
6646 * Skip any modules that may have been auto pushed when the device
6647 * was opened
6648 */
6649 while (q->q_next != NULL)
6650 q = q->q_next;
6651 *qp = _RD(q);
6652
6653 /* This is now a STREAMS sockets */
6654 so->so_not_str = B_FALSE;
6655
6656 return (error);
6657 }
6658
6659 /*
6660 * Revert a TPI sonode. It is only allowed to revert the sonode during
6661 * the fallback process.
6662 */
6663 void
6664 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6665 {
6666 vnode_t *vp = SOTOV(so);
6667
6668 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6669 SS_FALLBACK_PENDING);
6670 ASSERT(!SOCK_IS_NONSTR(so));
6671 ASSERT(vp->v_stream != NULL);
6672
6673 strclean(vp);
6674 (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6675
6676 /*
6677 * Restore the original sockparams. The caller is responsible for
6678 * dropping the ref to the new sp.
6679 */
6680 so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6681
6682 sotpi_info_fini(so);
6683 sotpi_info_destroy(so);
6684
6685 /* This is no longer a STREAMS sockets */
6686 so->so_not_str = B_TRUE;
6687 }
6688
6689 void
6690 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6691 struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6692 socklen_t faddrlen, short opts)
6693 {
6694 sotpi_info_t *sti = SOTOTPI(so);
6695
6696 so_proc_tcapability_ack(so, tcap);
6697
6698 so->so_options |= opts;
6699
6700 /*
6701 * Determine whether the foreign and local address are valid
6702 */
6703 if (laddrlen != 0) {
6704 ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6705 sti->sti_laddr_len = laddrlen;
6706 bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6707 sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6708 }
6709
6710 if (faddrlen != 0) {
6711 ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6712 sti->sti_faddr_len = faddrlen;
6713 bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6714 sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6715 }
6716
6717 }
6718
6719 /*
6720 * Allocate enough space to cache the local and foreign addresses.
6721 */
6722 void
6723 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6724 {
6725 sotpi_info_t *sti = SOTOTPI(so);
6726
6727 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6728 ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6729 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6730 P2ROUNDUP(maxlen, KMEM_ALIGN);
6731 so->so_max_addr_len = sti->sti_laddr_maxlen;
6732 sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6733 sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6734 + sti->sti_laddr_maxlen);
6735
6736 if (so->so_family == AF_UNIX) {
6737 /*
6738 * Initialize AF_UNIX related fields.
6739 */
6740 bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6741 bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6742 }
6743 }
6744
6745
6746 sotpi_info_t *
6747 sotpi_sototpi(struct sonode *so)
6748 {
6749 sotpi_info_t *sti;
6750
6751 ASSERT(so != NULL);
6752
6753 sti = (sotpi_info_t *)so->so_priv;
6754
6755 ASSERT(sti != NULL);
6756 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6757
6758 return (sti);
6759 }
6760
6761 static int
6762 i_sotpi_info_constructor(sotpi_info_t *sti)
6763 {
6764 sti->sti_magic = SOTPI_INFO_MAGIC;
6765 sti->sti_ack_mp = NULL;
6766 sti->sti_discon_ind_mp = NULL;
6767 sti->sti_ux_bound_vp = NULL;
6768 sti->sti_unbind_mp = NULL;
6769
6770 sti->sti_conn_ind_head = NULL;
6771 sti->sti_conn_ind_tail = NULL;
6772
6773 sti->sti_laddr_sa = NULL;
6774 sti->sti_faddr_sa = NULL;
6775
6776 sti->sti_nl7c_flags = 0;
6777 sti->sti_nl7c_uri = NULL;
6778 sti->sti_nl7c_rcv_mp = NULL;
6779
6780 mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6781 cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6782
6783 return (0);
6784 }
6785
6786 static void
6787 i_sotpi_info_destructor(sotpi_info_t *sti)
6788 {
6789 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6790 ASSERT(sti->sti_ack_mp == NULL);
6791 ASSERT(sti->sti_discon_ind_mp == NULL);
6792 ASSERT(sti->sti_ux_bound_vp == NULL);
6793 ASSERT(sti->sti_unbind_mp == NULL);
6794
6795 ASSERT(sti->sti_conn_ind_head == NULL);
6796 ASSERT(sti->sti_conn_ind_tail == NULL);
6797
6798 ASSERT(sti->sti_laddr_sa == NULL);
6799 ASSERT(sti->sti_faddr_sa == NULL);
6800
6801 ASSERT(sti->sti_nl7c_flags == 0);
6802 ASSERT(sti->sti_nl7c_uri == NULL);
6803 ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6804
6805 mutex_destroy(&sti->sti_plumb_lock);
6806 cv_destroy(&sti->sti_ack_cv);
6807 }
6808
6809 /*
6810 * Creates and attaches TPI information to the given sonode
6811 */
6812 static boolean_t
6813 sotpi_info_create(struct sonode *so, int kmflags)
6814 {
6815 sotpi_info_t *sti;
6816
6817 ASSERT(so->so_priv == NULL);
6818
6819 if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6820 return (B_FALSE);
6821
6822 if (i_sotpi_info_constructor(sti) != 0) {
6823 kmem_free(sti, sizeof (*sti));
6824 return (B_FALSE);
6825 }
6826
6827 so->so_priv = (void *)sti;
6828 return (B_TRUE);
6829 }
6830
6831 /*
6832 * Initializes the TPI information.
6833 */
6834 static void
6835 sotpi_info_init(struct sonode *so)
6836 {
6837 struct vnode *vp = SOTOV(so);
6838 sotpi_info_t *sti = SOTOTPI(so);
6839 time_t now;
6840
6841 sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6842 vp->v_rdev = sti->sti_dev;
6843
6844 sti->sti_orig_sp = NULL;
6845
6846 sti->sti_pushcnt = 0;
6847
6848 now = gethrestime_sec();
6849 sti->sti_atime = now;
6850 sti->sti_mtime = now;
6851 sti->sti_ctime = now;
6852
6853 sti->sti_eaddr_mp = NULL;
6854 sti->sti_delayed_error = 0;
6855
6856 sti->sti_provinfo = NULL;
6857
6858 sti->sti_oobcnt = 0;
6859 sti->sti_oobsigcnt = 0;
6860
6861 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6862
6863 sti->sti_laddr_sa = 0;
6864 sti->sti_faddr_sa = 0;
6865 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6866 sti->sti_laddr_len = sti->sti_faddr_len = 0;
6867
6868 sti->sti_laddr_valid = 0;
6869 sti->sti_faddr_valid = 0;
6870 sti->sti_faddr_noxlate = 0;
6871
6872 sti->sti_direct = 0;
6873
6874 ASSERT(sti->sti_ack_mp == NULL);
6875 ASSERT(sti->sti_ux_bound_vp == NULL);
6876 ASSERT(sti->sti_unbind_mp == NULL);
6877
6878 ASSERT(sti->sti_conn_ind_head == NULL);
6879 ASSERT(sti->sti_conn_ind_tail == NULL);
6880 }
6881
6882 /*
6883 * Given a sonode, grab the TPI info and free any data.
6884 */
6885 static void
6886 sotpi_info_fini(struct sonode *so)
6887 {
6888 sotpi_info_t *sti = SOTOTPI(so);
6889 mblk_t *mp;
6890
6891 ASSERT(sti->sti_discon_ind_mp == NULL);
6892
6893 if ((mp = sti->sti_conn_ind_head) != NULL) {
6894 mblk_t *mp1;
6895
6896 while (mp) {
6897 mp1 = mp->b_next;
6898 mp->b_next = NULL;
6899 freemsg(mp);
6900 mp = mp1;
6901 }
6902 sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6903 }
6904
6905 /*
6906 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6907 * indirect them. It also uses so_count as a validity test.
6908 */
6909 mutex_enter(&so->so_lock);
6910
6911 if (sti->sti_laddr_sa) {
6912 ASSERT((caddr_t)sti->sti_faddr_sa ==
6913 (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6914 ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6915 sti->sti_laddr_valid = 0;
6916 sti->sti_faddr_valid = 0;
6917 kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6918 sti->sti_laddr_sa = NULL;
6919 sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6920 sti->sti_faddr_sa = NULL;
6921 sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6922 }
6923
6924 mutex_exit(&so->so_lock);
6925
6926 if ((mp = sti->sti_eaddr_mp) != NULL) {
6927 freemsg(mp);
6928 sti->sti_eaddr_mp = NULL;
6929 sti->sti_delayed_error = 0;
6930 }
6931
6932 if ((mp = sti->sti_ack_mp) != NULL) {
6933 freemsg(mp);
6934 sti->sti_ack_mp = NULL;
6935 }
6936
6937 if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6938 sti->sti_nl7c_rcv_mp = NULL;
6939 freemsg(mp);
6940 }
6941 sti->sti_nl7c_rcv_rval = 0;
6942 if (sti->sti_nl7c_uri != NULL) {
6943 nl7c_urifree(so);
6944 /* urifree() cleared nl7c_uri */
6945 }
6946 if (sti->sti_nl7c_flags) {
6947 sti->sti_nl7c_flags = 0;
6948 }
6949
6950 ASSERT(sti->sti_ux_bound_vp == NULL);
6951 if ((mp = sti->sti_unbind_mp) != NULL) {
6952 freemsg(mp);
6953 sti->sti_unbind_mp = NULL;
6954 }
6955 }
6956
6957 /*
6958 * Destroys the TPI information attached to a sonode.
6959 */
6960 static void
6961 sotpi_info_destroy(struct sonode *so)
6962 {
6963 sotpi_info_t *sti = SOTOTPI(so);
6964
6965 i_sotpi_info_destructor(sti);
6966 kmem_free(sti, sizeof (*sti));
6967
6968 so->so_priv = NULL;
6969 }
6970
6971 /*
6972 * Create the global sotpi socket module entry. It will never be freed.
6973 */
6974 smod_info_t *
6975 sotpi_smod_create(void)
6976 {
6977 smod_info_t *smodp;
6978
6979 smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6980 smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6981 (void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6982 /*
6983 * Initialize the smod_refcnt to 1 so it will never be freed.
6984 */
6985 smodp->smod_refcnt = 1;
6986 smodp->smod_uc_version = SOCK_UC_VERSION;
6987 smodp->smod_dc_version = SOCK_DC_VERSION;
6988 smodp->smod_sock_create_func = &sotpi_create;
6989 smodp->smod_sock_destroy_func = &sotpi_destroy;
6990 return (smodp);
6991 }