1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/kmem_impl.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/suntpi.h>
51 #include <sys/ddi.h>
52 #include <sys/esunddi.h>
53 #include <sys/flock.h>
54 #include <sys/modctl.h>
55 #include <sys/vtrace.h>
56 #include <sys/cmn_err.h>
57 #include <sys/pathname.h>
58
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/sockio.h>
62 #include <netinet/in.h>
63 #include <sys/un.h>
64 #include <sys/strsun.h>
65
66 #include <sys/tiuser.h>
67 #define _SUN_TPI_VERSION 2
68 #include <sys/tihdr.h>
69 #include <sys/timod.h> /* TI_GETMYNAME, TI_GETPEERNAME */
70
71 #include <c2/audit.h>
72
73 #include <inet/common.h>
74 #include <inet/ip.h>
75 #include <inet/ip6.h>
76 #include <inet/tcp.h>
77 #include <inet/udp_impl.h>
78
79 #include <sys/zone.h>
80
81 #include <fs/sockfs/nl7c.h>
82 #include <fs/sockfs/nl7curi.h>
83
84 #include <fs/sockfs/sockcommon.h>
85 #include <fs/sockfs/socktpi.h>
86 #include <fs/sockfs/socktpi_impl.h>
87
88 /*
89 * Possible failures when memory can't be allocated. The documented behavior:
90 *
91 * 5.5: 4.X: XNET:
92 * accept: ENOMEM/ENOSR/EINTR - (EINTR) ENOMEM/ENOBUFS/ENOSR/
93 * EINTR
94 * (4.X does not document EINTR but returns it)
95 * bind: ENOSR - ENOBUFS/ENOSR
96 * connect: EINTR EINTR ENOBUFS/ENOSR/EINTR
97 * getpeername: ENOMEM/ENOSR ENOBUFS (-) ENOBUFS/ENOSR
98 * getsockname: ENOMEM/ENOSR ENOBUFS (-) ENOBUFS/ENOSR
99 * (4.X getpeername and getsockname do not fail in practice)
100 * getsockopt: ENOMEM/ENOSR - ENOBUFS/ENOSR
101 * listen: - - ENOBUFS
102 * recv: ENOMEM/ENOSR/EINTR EINTR ENOBUFS/ENOMEM/ENOSR/
103 * EINTR
104 * send: ENOMEM/ENOSR/EINTR ENOBUFS/EINTR ENOBUFS/ENOMEM/ENOSR/
105 * EINTR
106 * setsockopt: ENOMEM/ENOSR - ENOBUFS/ENOMEM/ENOSR
107 * shutdown: ENOMEM/ENOSR - ENOBUFS/ENOSR
108 * socket: ENOMEM/ENOSR ENOBUFS ENOBUFS/ENOMEM/ENOSR
109 * socketpair: ENOMEM/ENOSR - ENOBUFS/ENOMEM/ENOSR
110 *
111 * Resolution. When allocation fails:
112 * recv: return EINTR
113 * send: return EINTR
114 * connect, accept: EINTR
115 * bind, listen, shutdown (unbind, unix_close, disconnect): sleep
116 * socket, socketpair: ENOBUFS
117 * getpeername, getsockname: sleep
118 * getsockopt, setsockopt: sleep
119 */
120
/*
 * Tunables that let sockfs depart from standard TPI when talking to the
 * AF_INET transports.  In SOCK_TEST builds they are ordinary variables;
 * in every other build they are fixed at compile time.
 *
 * solisten_tpi_tcp:
 *	TCP can handle an O_T_BIND_REQ with an increased backlog even
 *	though the transport is already bound.  This avoids losing the
 *	port number, which could happen should listen() do a
 *	T_UNBIND_REQ followed by an O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.  This is needed so that
 *	connect() followed by getsockname() returns the local IP address
 *	used to send packets to the connected destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing an O_T_BIND_REQ
 *	(a performance optimization).  Set this to non-zero to send
 *	TPI conformant messages to TCP in this respect.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When the accepting socket inherits SOL_SOCKET options from the
 *	listener, send them as a single message for AF_INET{,6}.
 */
#ifdef SOCK_TEST

int soconnect_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int solisten_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;

/* Additional hooks that exist only in SOCK_TEST builds. */
extern int do_useracc;
extern clock_t sock_test_timelimit;

#else	/* !SOCK_TEST */

#define	solisten_tpi_tcp		0
#define	soconnect_tpi_udp		0
#define	soconnect_tpi_tcp		0
#define	soaccept_tpi_tcp		0
#define	soaccept_tpi_multioptions	1

#endif	/* SOCK_TEST */
168
169 extern uint32_t ucredsize;
170
171 /*
172 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
173 * applications working. Turn on this flag to disable these checks.
174 */
175 int xnet_skip_checks = 0;
176 int xnet_check_print = 0;
177 int xnet_truncate_print = 0;
178
179 static void sotpi_destroy(struct sonode *);
180 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
181 int, int *, cred_t *cr);
182
183 static boolean_t sotpi_info_create(struct sonode *, int);
184 static void sotpi_info_init(struct sonode *);
185 static void sotpi_info_fini(struct sonode *);
186 static void sotpi_info_destroy(struct sonode *);
187
188 /*
189 * Do direct function call to the transport layer below; this would
190 * also allow the transport to utilize read-side synchronous stream
191 * interface if necessary. This is a /etc/system tunable that must
192 * not be modified on a running system. By default this is enabled
193 * for performance reasons and may be disabled for debugging purposes.
194 */
195 boolean_t socktpi_direct = B_TRUE;
196
197 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
198
199 extern void sigintr(k_sigset_t *, int);
200 extern void sigunintr(k_sigset_t *);
201
202 static int sotpi_unbind(struct sonode *, int);
203
204 /* TPI sockfs sonode operations */
205 int sotpi_init(struct sonode *, struct sonode *, struct cred *,
206 int);
207 static int sotpi_accept(struct sonode *, int, struct cred *,
208 struct sonode **);
209 static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
210 int, struct cred *);
211 static int sotpi_listen(struct sonode *, int, struct cred *);
212 static int sotpi_connect(struct sonode *, struct sockaddr *,
213 socklen_t, int, int, struct cred *);
214 extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
215 struct uio *, struct cred *);
216 static int sotpi_sendmsg(struct sonode *, struct nmsghdr *,
217 struct uio *, struct cred *);
218 static int sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
219 struct cred *, mblk_t **);
220 static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
221 struct uio *, void *, t_uscalar_t, int);
222 static int sodgram_direct(struct sonode *, struct sockaddr *,
223 socklen_t, struct uio *, int);
224 extern int sotpi_getpeername(struct sonode *, struct sockaddr *,
225 socklen_t *, boolean_t, struct cred *);
226 static int sotpi_getsockname(struct sonode *, struct sockaddr *,
227 socklen_t *, struct cred *);
228 static int sotpi_shutdown(struct sonode *, int, struct cred *);
229 extern int sotpi_getsockopt(struct sonode *, int, int, void *,
230 socklen_t *, int, struct cred *);
231 extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
232 socklen_t, struct cred *);
233 static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
234 int32_t *);
235 static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
236 struct cred *, int32_t *);
237 static int sotpi_poll(struct sonode *, short, int, short *,
238 struct pollhead **);
239 static int sotpi_close(struct sonode *, int, struct cred *);
240
241 static int i_sotpi_info_constructor(sotpi_info_t *);
242 static void i_sotpi_info_destructor(sotpi_info_t *);
243
244 sonodeops_t sotpi_sonodeops = {
245 sotpi_init, /* sop_init */
246 sotpi_accept, /* sop_accept */
247 sotpi_bind, /* sop_bind */
248 sotpi_listen, /* sop_listen */
249 sotpi_connect, /* sop_connect */
250 sotpi_recvmsg, /* sop_recvmsg */
251 sotpi_sendmsg, /* sop_sendmsg */
252 sotpi_sendmblk, /* sop_sendmblk */
253 sotpi_getpeername, /* sop_getpeername */
254 sotpi_getsockname, /* sop_getsockname */
255 sotpi_shutdown, /* sop_shutdown */
256 sotpi_getsockopt, /* sop_getsockopt */
257 sotpi_setsockopt, /* sop_setsockopt */
258 sotpi_ioctl, /* sop_ioctl */
259 sotpi_poll, /* sop_poll */
260 sotpi_close, /* sop_close */
261 };
262
263 /*
264 * Return a TPI socket vnode.
265 *
266 * Note that sockets assume that the driver will clone (either itself
267 * or by using the clone driver) i.e. a socket() call will always
268 * result in a new vnode being created.
269 */
270
271 /*
272 * Common create code for socket and accept. If tso is set the values
273 * from that node is used instead of issuing a T_INFO_REQ.
274 */
275
276 /* ARGSUSED */
277 static struct sonode *
278 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
279 int version, int sflags, int *errorp, cred_t *cr)
280 {
281 struct sonode *so;
282 kmem_cache_t *cp;
283 int sfamily = family;
284
285 ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
286
287 if (family == AF_NCA) {
288 /*
289 * The request is for an NCA socket so for NL7C use the
290 * INET domain instead and mark NL7C_AF_NCA below.
291 */
292 family = AF_INET;
293 /*
294 * NL7C is not supported in the non-global zone,
295 * we enforce this restriction here.
296 */
297 if (getzoneid() != GLOBAL_ZONEID) {
298 *errorp = ENOTSUP;
299 return (NULL);
300 }
301 }
302
303 /*
304 * to be compatible with old tpi socket implementation ignore
305 * sleep flag (sflags) passed in
306 */
307 cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
308 so = kmem_cache_alloc(cp, KM_SLEEP);
309 if (so == NULL) {
310 *errorp = ENOMEM;
311 return (NULL);
312 }
313
314 sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
315 sotpi_info_init(so);
316
317 if (sfamily == AF_NCA) {
318 SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
319 }
320
321 if (version == SOV_DEFAULT)
322 version = so_default_version;
323
324 so->so_version = (short)version;
325 *errorp = 0;
326
327 return (so);
328 }
329
330 static void
331 sotpi_destroy(struct sonode *so)
332 {
333 kmem_cache_t *cp;
334 struct sockparams *origsp;
335
336 /*
337 * If there is a new dealloc function (ie. smod_destroy_func),
338 * then it should check the correctness of the ops.
339 */
340
341 ASSERT(so->so_ops == &sotpi_sonodeops);
342
343 origsp = SOTOTPI(so)->sti_orig_sp;
344
345 sotpi_info_fini(so);
346
347 if (so->so_state & SS_FALLBACK_COMP) {
348 /*
349 * A fallback happend, which means that a sotpi_info_t struct
350 * was allocated (as opposed to being allocated from the TPI
351 * sonode cache. Therefore we explicitly free the struct
352 * here.
353 */
354 sotpi_info_destroy(so);
355 ASSERT(origsp != NULL);
356
357 origsp->sp_smod_info->smod_sock_destroy_func(so);
358 SOCKPARAMS_DEC_REF(origsp);
359 } else {
360 sonode_fini(so);
361 cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
362 socktpi_cache;
363 kmem_cache_free(cp, so);
364 }
365 }
366
367 /* ARGSUSED1 */
368 int
369 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
370 {
371 major_t maj;
372 dev_t newdev;
373 struct vnode *vp;
374 int error = 0;
375 struct stdata *stp;
376
377 sotpi_info_t *sti = SOTOTPI(so);
378
379 dprint(1, ("sotpi_init()\n"));
380
381 /*
382 * over write the sleep flag passed in but that is ok
383 * as tpi socket does not honor sleep flag.
384 */
385 flags |= FREAD|FWRITE;
386
387 /*
388 * Record in so_flag that it is a clone.
389 */
390 if (getmajor(sti->sti_dev) == clone_major)
391 so->so_flag |= SOCLONE;
392
393 if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
394 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
395 (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
396 so->so_protocol == IPPROTO_IP)) {
397 /* Tell tcp or udp that it's talking to sockets */
398 flags |= SO_SOCKSTR;
399
400 /*
401 * Here we indicate to socktpi_open() our attempt to
402 * make direct calls between sockfs and transport.
403 * The final decision is left to socktpi_open().
404 */
405 sti->sti_direct = 1;
406
407 ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
408 if (so->so_type == SOCK_STREAM && tso != NULL) {
409 if (SOTOTPI(tso)->sti_direct) {
410 /*
411 * Inherit sti_direct from listener and pass
412 * SO_ACCEPTOR open flag to tcp, indicating
413 * that this is an accept fast-path instance.
414 */
415 flags |= SO_ACCEPTOR;
416 } else {
417 /*
418 * sti_direct is not set on listener, meaning
419 * that the listener has been converted from
420 * a socket to a stream. Ensure that the
421 * acceptor inherits these settings.
422 */
423 sti->sti_direct = 0;
424 flags &= ~SO_SOCKSTR;
425 }
426 }
427 }
428
429 /*
430 * Tell local transport that it is talking to sockets.
431 */
432 if (so->so_family == AF_UNIX) {
433 flags |= SO_SOCKSTR;
434 }
435
436 vp = SOTOV(so);
437 newdev = vp->v_rdev;
438 maj = getmajor(newdev);
439 ASSERT(STREAMSTAB(maj));
440
441 error = stropen(vp, &newdev, flags, cr);
442
443 stp = vp->v_stream;
444 if (error == 0) {
445 if (so->so_flag & SOCLONE)
446 ASSERT(newdev != vp->v_rdev);
447 mutex_enter(&so->so_lock);
448 sti->sti_dev = newdev;
449 vp->v_rdev = newdev;
450 mutex_exit(&so->so_lock);
451
452 if (stp->sd_flag & STRISTTY) {
453 /*
454 * this is a post SVR4 tty driver - a socket can not
455 * be a controlling terminal. Fail the open.
456 */
457 (void) sotpi_close(so, flags, cr);
458 return (ENOTTY); /* XXX */
459 }
460
461 ASSERT(stp->sd_wrq != NULL);
462 sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
463
464 /*
465 * If caller is interested in doing direct function call
466 * interface to/from transport module, probe the module
467 * directly beneath the streamhead to see if it qualifies.
468 *
469 * We turn off the direct interface when qualifications fail.
470 * In the acceptor case, we simply turn off the sti_direct
471 * flag on the socket. We do the fallback after the accept
472 * has completed, before the new socket is returned to the
473 * application.
474 */
475 if (sti->sti_direct) {
476 queue_t *tq = stp->sd_wrq->q_next;
477
478 /*
479 * sti_direct is currently supported and tested
480 * only for tcp/udp; this is the main reason to
481 * have the following assertions.
482 */
483 ASSERT(so->so_family == AF_INET ||
484 so->so_family == AF_INET6);
485 ASSERT(so->so_protocol == IPPROTO_UDP ||
486 so->so_protocol == IPPROTO_TCP ||
487 so->so_protocol == IPPROTO_IP);
488 ASSERT(so->so_type == SOCK_DGRAM ||
489 so->so_type == SOCK_STREAM);
490
491 /*
492 * Abort direct call interface if the module directly
493 * underneath the stream head is not defined with the
494 * _D_DIRECT flag. This could happen in the tcp or
495 * udp case, when some other module is autopushed
496 * above it, or for some reasons the expected module
497 * isn't purely D_MP (which is the main requirement).
498 */
499 if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
500 !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
501 int rval;
502
503 /* Continue on without direct calls */
504 sti->sti_direct = 0;
505
506 /*
507 * Cannot issue ioctl on fallback socket since
508 * there is no conn associated with the queue.
509 * The fallback downcall will notify the proto
510 * of the change.
511 */
512 if (!(flags & SO_ACCEPTOR) &&
513 !(flags & SO_FALLBACK)) {
514 if ((error = strioctl(vp,
515 _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
516 cr, &rval)) != 0) {
517 (void) sotpi_close(so, flags,
518 cr);
519 return (error);
520 }
521 }
522 }
523 }
524
525 if (flags & SO_FALLBACK) {
526 /*
527 * The stream created does not have a conn.
528 * do stream set up after conn has been assigned
529 */
530 return (error);
531 }
532 if (error = so_strinit(so, tso)) {
533 (void) sotpi_close(so, flags, cr);
534 return (error);
535 }
536
537 /* Enable sendfile() on AF_UNIX streams */
538 if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
539 mutex_enter(&so->so_lock);
540 so->so_mode |= SM_SENDFILESUPP;
541 mutex_exit(&so->so_lock);
542 }
543
544 /* Wildcard */
545 if (so->so_protocol != so->so_sockparams->sp_protocol) {
546 int protocol = so->so_protocol;
547 /*
548 * Issue SO_PROTOTYPE setsockopt.
549 */
550 error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
551 &protocol, (t_uscalar_t)sizeof (protocol), cr);
552 if (error != 0) {
553 (void) sotpi_close(so, flags, cr);
554 /*
555 * Setsockopt often fails with ENOPROTOOPT but
556 * socket() should fail with
557 * EPROTONOSUPPORT/EPROTOTYPE.
558 */
559 return (EPROTONOSUPPORT);
560 }
561 }
562
563 } else {
564 /*
565 * While the same socket can not be reopened (unlike specfs)
566 * the stream head sets STREOPENFAIL when the autopush fails.
567 */
568 if ((stp != NULL) &&
569 (stp->sd_flag & STREOPENFAIL)) {
570 /*
571 * Open failed part way through.
572 */
573 mutex_enter(&stp->sd_lock);
574 stp->sd_flag &= ~STREOPENFAIL;
575 mutex_exit(&stp->sd_lock);
576 (void) sotpi_close(so, flags, cr);
577 return (error);
578 /*NOTREACHED*/
579 }
580 ASSERT(stp == NULL);
581 }
582 TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
583 "sockfs open:maj %d vp %p so %p error %d",
584 maj, vp, so, error);
585 return (error);
586 }
587
588 /*
589 * Bind the socket to an unspecified address in sockfs only.
590 * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
591 * required in all cases.
592 */
593 static void
594 so_automatic_bind(struct sonode *so)
595 {
596 sotpi_info_t *sti = SOTOTPI(so);
597 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
598
599 ASSERT(MUTEX_HELD(&so->so_lock));
600 ASSERT(!(so->so_state & SS_ISBOUND));
601 ASSERT(sti->sti_unbind_mp);
602
603 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
604 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
605 sti->sti_laddr_sa->sa_family = so->so_family;
606 so->so_state |= SS_ISBOUND;
607 }
608
609
610 /*
611 * bind the socket.
612 *
613 * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
614 * are passed in we allow rebinding. Note that for backwards compatibility
615 * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
616 * Thus the rebinding code is currently not executed.
617 *
618 * The constraints for rebinding are:
619 * - it is a SOCK_DGRAM, or
620 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
621 * and no listen() has been done.
622 * This rebinding code was added based on some language in the XNET book
623 * about not returning EINVAL if the protocol allows rebinding. However,
624 * this language is not present in the Posix socket draft. Thus maybe the
625 * rebinding logic should be deleted from the source.
626 *
627 * A null "name" can be used to unbind the socket if:
628 * - it is a SOCK_DGRAM, or
629 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
630 * and no listen() has been done.
631 */
632 /* ARGSUSED */
633 static int
634 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
635 socklen_t namelen, int backlog, int flags, struct cred *cr)
636 {
637 struct T_bind_req bind_req;
638 struct T_bind_ack *bind_ack;
639 int error = 0;
640 mblk_t *mp;
641 void *addr;
642 t_uscalar_t addrlen;
643 int unbind_on_err = 1;
644 boolean_t clear_acceptconn_on_err = B_FALSE;
645 boolean_t restore_backlog_on_err = B_FALSE;
646 int save_so_backlog;
647 t_scalar_t PRIM_type = O_T_BIND_REQ;
648 boolean_t tcp_udp_xport;
649 void *nl7c = NULL;
650 sotpi_info_t *sti = SOTOTPI(so);
651
652 dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
653 (void *)so, (void *)name, namelen, backlog, flags,
654 pr_state(so->so_state, so->so_mode)));
655
656 tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
657
658 if (!(flags & _SOBIND_LOCK_HELD)) {
659 mutex_enter(&so->so_lock);
660 so_lock_single(so); /* Set SOLOCKED */
661 } else {
662 ASSERT(MUTEX_HELD(&so->so_lock));
663 ASSERT(so->so_flag & SOLOCKED);
664 }
665
666 /*
667 * Make sure that there is a preallocated unbind_req message
668 * before binding. This message allocated when the socket is
669 * created but it might be have been consumed.
670 */
671 if (sti->sti_unbind_mp == NULL) {
672 dprintso(so, 1, ("sobind: allocating unbind_req\n"));
673 /* NOTE: holding so_lock while sleeping */
674 sti->sti_unbind_mp =
675 soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
676 cr);
677 }
678
679 if (flags & _SOBIND_REBIND) {
680 /*
681 * Called from solisten after doing an sotpi_unbind() or
682 * potentially without the unbind (latter for AF_INET{,6}).
683 */
684 ASSERT(name == NULL && namelen == 0);
685
686 if (so->so_family == AF_UNIX) {
687 ASSERT(sti->sti_ux_bound_vp);
688 addr = &sti->sti_ux_laddr;
689 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
690 dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
691 "addr 0x%p, vp %p\n",
692 addrlen,
693 (void *)((struct so_ux_addr *)addr)->soua_vp,
694 (void *)sti->sti_ux_bound_vp));
695 } else {
696 addr = sti->sti_laddr_sa;
697 addrlen = (t_uscalar_t)sti->sti_laddr_len;
698 }
699 } else if (flags & _SOBIND_UNSPEC) {
700 ASSERT(name == NULL && namelen == 0);
701
702 /*
703 * The caller checked SS_ISBOUND but not necessarily
704 * under so_lock
705 */
706 if (so->so_state & SS_ISBOUND) {
707 /* No error */
708 goto done;
709 }
710
711 /* Set an initial local address */
712 switch (so->so_family) {
713 case AF_UNIX:
714 /*
715 * Use an address with same size as struct sockaddr
716 * just like BSD.
717 */
718 sti->sti_laddr_len =
719 (socklen_t)sizeof (struct sockaddr);
720 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
721 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
722 sti->sti_laddr_sa->sa_family = so->so_family;
723
724 /*
725 * Pass down an address with the implicit bind
726 * magic number and the rest all zeros.
727 * The transport will return a unique address.
728 */
729 sti->sti_ux_laddr.soua_vp = NULL;
730 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
731 addr = &sti->sti_ux_laddr;
732 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
733 break;
734
735 case AF_INET:
736 case AF_INET6:
737 /*
738 * An unspecified bind in TPI has a NULL address.
739 * Set the address in sockfs to have the sa_family.
740 */
741 sti->sti_laddr_len = (so->so_family == AF_INET) ?
742 (socklen_t)sizeof (sin_t) :
743 (socklen_t)sizeof (sin6_t);
744 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
745 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
746 sti->sti_laddr_sa->sa_family = so->so_family;
747 addr = NULL;
748 addrlen = 0;
749 break;
750
751 default:
752 /*
753 * An unspecified bind in TPI has a NULL address.
754 * Set the address in sockfs to be zero length.
755 *
756 * Can not assume there is a sa_family for all
757 * protocol families. For example, AF_X25 does not
758 * have a family field.
759 */
760 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
761 sti->sti_laddr_len = 0; /* XXX correct? */
762 addr = NULL;
763 addrlen = 0;
764 break;
765 }
766
767 } else {
768 if (so->so_state & SS_ISBOUND) {
769 /*
770 * If it is ok to rebind the socket, first unbind
771 * with the transport. A rebind to the NULL address
772 * is interpreted as an unbind.
773 * Note that a bind to NULL in BSD does unbind the
774 * socket but it fails with EINVAL.
775 * Note that regular sockets set SOV_SOCKBSD i.e.
776 * _SOBIND_SOCKBSD gets set here hence no type of
777 * socket does currently allow rebinding.
778 *
779 * If the name is NULL just do an unbind.
780 */
781 if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
782 name != NULL) {
783 error = EINVAL;
784 unbind_on_err = 0;
785 eprintsoline(so, error);
786 goto done;
787 }
788 if ((so->so_mode & SM_CONNREQUIRED) &&
789 (so->so_state & SS_CANTREBIND)) {
790 error = EINVAL;
791 unbind_on_err = 0;
792 eprintsoline(so, error);
793 goto done;
794 }
795 error = sotpi_unbind(so, 0);
796 if (error) {
797 eprintsoline(so, error);
798 goto done;
799 }
800 ASSERT(!(so->so_state & SS_ISBOUND));
801 if (name == NULL) {
802 so->so_state &=
803 ~(SS_ISCONNECTED|SS_ISCONNECTING);
804 goto done;
805 }
806 }
807
808 /* X/Open requires this check */
809 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
810 if (xnet_check_print) {
811 printf("sockfs: X/Open bind state check "
812 "caused EINVAL\n");
813 }
814 error = EINVAL;
815 goto done;
816 }
817
818 switch (so->so_family) {
819 case AF_UNIX:
820 /*
821 * All AF_UNIX addresses are nul terminated
822 * when copied (copyin_name) in so the minimum
823 * length is 3 bytes.
824 */
825 if (name == NULL ||
826 (ssize_t)namelen <= sizeof (short) + 1) {
827 error = EISDIR;
828 eprintsoline(so, error);
829 goto done;
830 }
831 /*
832 * Verify so_family matches the bound family.
833 * BSD does not check this for AF_UNIX resulting
834 * in funny mknods.
835 */
836 if (name->sa_family != so->so_family) {
837 error = EAFNOSUPPORT;
838 goto done;
839 }
840 break;
841 case AF_INET:
842 if (name == NULL) {
843 error = EINVAL;
844 eprintsoline(so, error);
845 goto done;
846 }
847 if ((size_t)namelen != sizeof (sin_t)) {
848 error = name->sa_family != so->so_family ?
849 EAFNOSUPPORT : EINVAL;
850 eprintsoline(so, error);
851 goto done;
852 }
853 if ((flags & _SOBIND_XPG4_2) &&
854 (name->sa_family != so->so_family)) {
855 /*
856 * This check has to be made for X/Open
857 * sockets however application failures have
858 * been observed when it is applied to
859 * all sockets.
860 */
861 error = EAFNOSUPPORT;
862 eprintsoline(so, error);
863 goto done;
864 }
865 /*
866 * Force a zero sa_family to match so_family.
867 *
868 * Some programs like inetd(1M) don't set the
869 * family field. Other programs leave
870 * sin_family set to garbage - SunOS 4.X does
871 * not check the family field on a bind.
872 * We use the family field that
873 * was passed in to the socket() call.
874 */
875 name->sa_family = so->so_family;
876 break;
877
878 case AF_INET6: {
879 #ifdef DEBUG
880 sin6_t *sin6 = (sin6_t *)name;
881 #endif /* DEBUG */
882
883 if (name == NULL) {
884 error = EINVAL;
885 eprintsoline(so, error);
886 goto done;
887 }
888 if ((size_t)namelen != sizeof (sin6_t)) {
889 error = name->sa_family != so->so_family ?
890 EAFNOSUPPORT : EINVAL;
891 eprintsoline(so, error);
892 goto done;
893 }
894 if (name->sa_family != so->so_family) {
895 /*
896 * With IPv6 we require the family to match
897 * unlike in IPv4.
898 */
899 error = EAFNOSUPPORT;
900 eprintsoline(so, error);
901 goto done;
902 }
903 #ifdef DEBUG
904 /*
905 * Verify that apps don't forget to clear
906 * sin6_scope_id etc
907 */
908 if (sin6->sin6_scope_id != 0 &&
909 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
910 zcmn_err(getzoneid(), CE_WARN,
911 "bind with uninitialized sin6_scope_id "
912 "(%d) on socket. Pid = %d\n",
913 (int)sin6->sin6_scope_id,
914 (int)curproc->p_pid);
915 }
916 if (sin6->__sin6_src_id != 0) {
917 zcmn_err(getzoneid(), CE_WARN,
918 "bind with uninitialized __sin6_src_id "
919 "(%d) on socket. Pid = %d\n",
920 (int)sin6->__sin6_src_id,
921 (int)curproc->p_pid);
922 }
923 #endif /* DEBUG */
924 break;
925 }
926 default:
927 /*
928 * Don't do any length or sa_family check to allow
929 * non-sockaddr style addresses.
930 */
931 if (name == NULL) {
932 error = EINVAL;
933 eprintsoline(so, error);
934 goto done;
935 }
936 break;
937 }
938
939 if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
940 error = ENAMETOOLONG;
941 eprintsoline(so, error);
942 goto done;
943 }
944 /*
945 * Save local address.
946 */
947 sti->sti_laddr_len = (socklen_t)namelen;
948 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
949 bcopy(name, sti->sti_laddr_sa, namelen);
950
951 addr = sti->sti_laddr_sa;
952 addrlen = (t_uscalar_t)sti->sti_laddr_len;
953 switch (so->so_family) {
954 case AF_INET6:
955 case AF_INET:
956 break;
957 case AF_UNIX: {
958 struct sockaddr_un *soun =
959 (struct sockaddr_un *)sti->sti_laddr_sa;
960 struct vnode *vp, *rvp;
961 struct vattr vattr;
962
963 ASSERT(sti->sti_ux_bound_vp == NULL);
964 /*
965 * Create vnode for the specified path name.
966 * Keep vnode held with a reference in sti_ux_bound_vp.
967 * Use the vnode pointer as the address used in the
968 * bind with the transport.
969 *
970 * Use the same mode as in BSD. In particular this does
971 * not observe the umask.
972 */
973 /* MAXPATHLEN + soun_family + nul termination */
974 if (sti->sti_laddr_len >
975 (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
976 error = ENAMETOOLONG;
977 eprintsoline(so, error);
978 goto done;
979 }
980 vattr.va_type = VSOCK;
981 vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
982 vattr.va_mask = AT_TYPE|AT_MODE;
983 /* NOTE: holding so_lock */
984 error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
985 EXCL, 0, &vp, CRMKNOD, 0, 0);
986 if (error) {
987 if (error == EEXIST)
988 error = EADDRINUSE;
989 eprintsoline(so, error);
990 goto done;
991 }
992 /*
993 * Establish pointer from the underlying filesystem
994 * vnode to the socket node.
995 * sti_ux_bound_vp and v_stream->sd_vnode form the
996 * cross-linkage between the underlying filesystem
997 * node and the socket node.
998 */
999
1000 if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
1001 VN_HOLD(rvp);
1002 VN_RELE(vp);
1003 vp = rvp;
1004 }
1005
1006 ASSERT(SOTOV(so)->v_stream);
1007 mutex_enter(&vp->v_lock);
1008 vp->v_stream = SOTOV(so)->v_stream;
1009 sti->sti_ux_bound_vp = vp;
1010 mutex_exit(&vp->v_lock);
1011
1012 /*
1013 * Use the vnode pointer value as a unique address
1014 * (together with the magic number to avoid conflicts
1015 * with implicit binds) in the transport provider.
1016 */
1017 sti->sti_ux_laddr.soua_vp =
1018 (void *)sti->sti_ux_bound_vp;
1019 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1020 addr = &sti->sti_ux_laddr;
1021 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1022 dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1023 addrlen,
1024 (void *)((struct so_ux_addr *)addr)->soua_vp));
1025 break;
1026 }
1027 } /* end switch (so->so_family) */
1028 }
1029
1030 /*
1031 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1032 * the transport can start passing up T_CONN_IND messages
1033 * as soon as it receives the bind req and strsock_proto()
1034 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1035 */
1036 if (flags & _SOBIND_LISTEN) {
1037 if ((so->so_state & SS_ACCEPTCONN) == 0)
1038 clear_acceptconn_on_err = B_TRUE;
1039 save_so_backlog = so->so_backlog;
1040 restore_backlog_on_err = B_TRUE;
1041 so->so_state |= SS_ACCEPTCONN;
1042 so->so_backlog = backlog;
1043 }
1044
1045 /*
1046 * If NL7C addr(s) have been configured check for addr/port match,
1047 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1048 *
1049 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1050 * family sockets only. If match mark as such.
1051 */
1052 if (nl7c_enabled && ((addr != NULL &&
1053 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1054 (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1055 sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1056 /*
1057 * NL7C is not supported in non-global zones,
1058 * we enforce this restriction here.
1059 */
1060 if (so->so_zoneid == GLOBAL_ZONEID) {
1061 /* An NL7C socket, mark it */
1062 sti->sti_nl7c_flags |= NL7C_ENABLED;
1063 if (nl7c == NULL) {
1064 /*
1065 * Was an AF_NCA bind() so add it to the
1066 * addr list for reporting purposes.
1067 */
1068 nl7c = nl7c_add_addr(addr, addrlen);
1069 }
1070 } else
1071 nl7c = NULL;
1072 }
1073
1074 /*
1075 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1076 * for other transports we will send in a O_T_BIND_REQ.
1077 */
1078 if (tcp_udp_xport &&
1079 (so->so_family == AF_INET || so->so_family == AF_INET6))
1080 PRIM_type = T_BIND_REQ;
1081
1082 bind_req.PRIM_type = PRIM_type;
1083 bind_req.ADDR_length = addrlen;
1084 bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1085 bind_req.CONIND_number = backlog;
1086 /* NOTE: holding so_lock while sleeping */
1087 mp = soallocproto2(&bind_req, sizeof (bind_req),
1088 addr, addrlen, 0, _ALLOC_SLEEP, cr);
1089 sti->sti_laddr_valid = 0;
1090
1091 /* Done using sti_laddr_sa - can drop the lock */
1092 mutex_exit(&so->so_lock);
1093
1094 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1095 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1096 if (error) {
1097 eprintsoline(so, error);
1098 mutex_enter(&so->so_lock);
1099 goto done;
1100 }
1101
1102 mutex_enter(&so->so_lock);
1103 error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1104 (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1105 if (error) {
1106 eprintsoline(so, error);
1107 goto done;
1108 }
1109 ASSERT(mp);
1110 /*
1111 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1112 * strsock_proto while the lock was dropped above, the bind
1113 * is allowed to complete.
1114 */
1115
1116 /* Mark as bound. This will be undone if we detect errors below. */
1117 if (flags & _SOBIND_NOXLATE) {
1118 ASSERT(so->so_family == AF_UNIX);
1119 sti->sti_faddr_noxlate = 1;
1120 }
1121 ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1122 so->so_state |= SS_ISBOUND;
1123 ASSERT(sti->sti_unbind_mp);
1124
1125 /* note that we've already set SS_ACCEPTCONN above */
1126
1127 /*
1128 * Recompute addrlen - an unspecied bind sent down an
1129 * address of length zero but we expect the appropriate length
1130 * in return.
1131 */
1132 addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1133 sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1134
1135 bind_ack = (struct T_bind_ack *)mp->b_rptr;
1136 /*
1137 * The alignment restriction is really too strict but
1138 * we want enough alignment to inspect the fields of
1139 * a sockaddr_in.
1140 */
1141 addr = sogetoff(mp, bind_ack->ADDR_offset,
1142 bind_ack->ADDR_length,
1143 __TPI_ALIGN_SIZE);
1144 if (addr == NULL) {
1145 freemsg(mp);
1146 error = EPROTO;
1147 eprintsoline(so, error);
1148 goto done;
1149 }
1150 if (!(flags & _SOBIND_UNSPEC)) {
1151 /*
1152 * Verify that the transport didn't return something we
1153 * did not want e.g. an address other than what we asked for.
1154 *
1155 * NOTE: These checks would go away if/when we switch to
1156 * using the new TPI (in which the transport would fail
1157 * the request instead of assigning a different address).
1158 *
1159 * NOTE2: For protocols that we don't know (i.e. any
1160 * other than AF_INET6, AF_INET and AF_UNIX), we
1161 * cannot know if the transport should be expected to
1162 * return the same address as that requested.
1163 *
1164 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1165 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1166 *
1167 * For example, in the case of netatalk it may be
1168 * inappropriate for the transport to return the
1169 * requested address (as it may have allocated a local
1170 * port number in behaviour similar to that of an
1171 * AF_INET bind request with a port number of zero).
1172 *
1173 * Given the definition of O_T_BIND_REQ, where the
1174 * transport may bind to an address other than the
1175 * requested address, it's not possible to determine
1176 * whether a returned address that differs from the
1177 * requested address is a reason to fail (because the
1178 * requested address was not available) or succeed
1179 * (because the transport allocated an appropriate
1180 * address and/or port).
1181 *
1182 * sockfs currently requires that the transport return
1183 * the requested address in the T_BIND_ACK, unless
1184 * there is code here to allow for any discrepancy.
1185 * Such code exists for AF_INET and AF_INET6.
1186 *
1187 * Netatalk chooses to return the requested address
1188 * rather than the (correct) allocated address. This
1189 * means that netatalk violates the TPI specification
1190 * (and would not function correctly if used from a
1191 * TLI application), but it does mean that it works
1192 * with sockfs.
1193 *
1194 * As noted above, using the newer XTI bind primitive
1195 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1196 * allow sockfs to be more sure about whether or not
1197 * the bind request had succeeded (as transports are
1198 * not permitted to bind to a different address than
1199 * that requested - they must return failure).
1200 * Unfortunately, support for T_BIND_REQ may not be
1201 * present in all transport implementations (netatalk,
1202 * for example, doesn't have it), making the
1203 * transition difficult.
1204 */
1205 if (bind_ack->ADDR_length != addrlen) {
1206 /* Assumes that the requested address was in use */
1207 freemsg(mp);
1208 error = EADDRINUSE;
1209 eprintsoline(so, error);
1210 goto done;
1211 }
1212
1213 switch (so->so_family) {
1214 case AF_INET6:
1215 case AF_INET: {
1216 sin_t *rname, *aname;
1217
1218 rname = (sin_t *)addr;
1219 aname = (sin_t *)sti->sti_laddr_sa;
1220
1221 /*
1222 * Take advantage of the alignment
1223 * of sin_port and sin6_port which fall
1224 * in the same place in their data structures.
1225 * Just use sin_port for either address family.
1226 *
1227 * This may become a problem if (heaven forbid)
1228 * there's a separate ipv6port_reserved... :-P
1229 *
1230 * Binding to port 0 has the semantics of letting
1231 * the transport bind to any port.
1232 *
1233 * If the transport is TCP or UDP since we had sent
1234 * a T_BIND_REQ we would not get a port other than
1235 * what we asked for.
1236 */
1237 if (tcp_udp_xport) {
1238 /*
1239 * Pick up the new port number if we bound to
1240 * port 0.
1241 */
1242 if (aname->sin_port == 0)
1243 aname->sin_port = rname->sin_port;
1244 sti->sti_laddr_valid = 1;
1245 break;
1246 }
1247 if (aname->sin_port != 0 &&
1248 aname->sin_port != rname->sin_port) {
1249 freemsg(mp);
1250 error = EADDRINUSE;
1251 eprintsoline(so, error);
1252 goto done;
1253 }
1254 /*
1255 * Pick up the new port number if we bound to port 0.
1256 */
1257 aname->sin_port = rname->sin_port;
1258
1259 /*
1260 * Unfortunately, addresses aren't _quite_ the same.
1261 */
1262 if (so->so_family == AF_INET) {
1263 if (aname->sin_addr.s_addr !=
1264 rname->sin_addr.s_addr) {
1265 freemsg(mp);
1266 error = EADDRNOTAVAIL;
1267 eprintsoline(so, error);
1268 goto done;
1269 }
1270 } else {
1271 sin6_t *rname6 = (sin6_t *)rname;
1272 sin6_t *aname6 = (sin6_t *)aname;
1273
1274 if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1275 &rname6->sin6_addr)) {
1276 freemsg(mp);
1277 error = EADDRNOTAVAIL;
1278 eprintsoline(so, error);
1279 goto done;
1280 }
1281 }
1282 break;
1283 }
1284 case AF_UNIX:
1285 if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1286 freemsg(mp);
1287 error = EADDRINUSE;
1288 eprintsoline(so, error);
1289 eprintso(so,
1290 ("addrlen %d, addr 0x%x, vp %p\n",
1291 addrlen, *((int *)addr),
1292 (void *)sti->sti_ux_bound_vp));
1293 goto done;
1294 }
1295 sti->sti_laddr_valid = 1;
1296 break;
1297 default:
1298 /*
1299 * NOTE: This assumes that addresses can be
1300 * byte-compared for equivalence.
1301 */
1302 if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1303 freemsg(mp);
1304 error = EADDRINUSE;
1305 eprintsoline(so, error);
1306 goto done;
1307 }
1308 /*
1309 * Don't mark sti_laddr_valid, as we cannot be
1310 * sure that the returned address is the real
1311 * bound address when talking to an unknown
1312 * transport.
1313 */
1314 break;
1315 }
1316 } else {
1317 /*
1318 * Save for returned address for getsockname.
1319 * Needed for unspecific bind unless transport supports
1320 * the TI_GETMYNAME ioctl.
1321 * Do this for AF_INET{,6} even though they do, as
1322 * caching info here is much better performance than
1323 * a TPI/STREAMS trip to the transport for getsockname.
1324 * Any which can't for some reason _must_ _not_ set
1325 * sti_laddr_valid here for the caching version of
1326 * getsockname to not break;
1327 */
1328 switch (so->so_family) {
1329 case AF_UNIX:
1330 /*
1331 * Record the address bound with the transport
1332 * for use by socketpair.
1333 */
1334 bcopy(addr, &sti->sti_ux_laddr, addrlen);
1335 sti->sti_laddr_valid = 1;
1336 break;
1337 case AF_INET:
1338 case AF_INET6:
1339 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1340 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1341 sti->sti_laddr_valid = 1;
1342 break;
1343 default:
1344 /*
1345 * Don't mark sti_laddr_valid, as we cannot be
1346 * sure that the returned address is the real
1347 * bound address when talking to an unknown
1348 * transport.
1349 */
1350 break;
1351 }
1352 }
1353
1354 if (nl7c != NULL) {
1355 /* Register listen()er sonode pointer with NL7C */
1356 nl7c_listener_addr(nl7c, so);
1357 }
1358
1359 freemsg(mp);
1360
1361 done:
1362 if (error) {
1363 /* reset state & backlog to values held on entry */
1364 if (clear_acceptconn_on_err == B_TRUE)
1365 so->so_state &= ~SS_ACCEPTCONN;
1366 if (restore_backlog_on_err == B_TRUE)
1367 so->so_backlog = save_so_backlog;
1368
1369 if (unbind_on_err && so->so_state & SS_ISBOUND) {
1370 int err;
1371
1372 err = sotpi_unbind(so, 0);
1373 /* LINTED - statement has no consequent: if */
1374 if (err) {
1375 eprintsoline(so, error);
1376 } else {
1377 ASSERT(!(so->so_state & SS_ISBOUND));
1378 }
1379 }
1380 }
1381 if (!(flags & _SOBIND_LOCK_HELD)) {
1382 so_unlock_single(so, SOLOCKED);
1383 mutex_exit(&so->so_lock);
1384 } else {
1385 ASSERT(MUTEX_HELD(&so->so_lock));
1386 ASSERT(so->so_flag & SOLOCKED);
1387 }
1388 return (error);
1389 }
1390
1391 /* bind the socket */
1392 static int
1393 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1394 int flags, struct cred *cr)
1395 {
1396 if ((flags & _SOBIND_SOCKETPAIR) == 0)
1397 return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1398
1399 flags &= ~_SOBIND_SOCKETPAIR;
1400 return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1401 }
1402
1403 /*
1404 * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1405 * address, or when listen needs to unbind and bind.
1406 * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1407 * so that a sobind can pick them up.
1408 */
1409 static int
1410 sotpi_unbind(struct sonode *so, int flags)
1411 {
1412 struct T_unbind_req unbind_req;
1413 int error = 0;
1414 mblk_t *mp;
1415 sotpi_info_t *sti = SOTOTPI(so);
1416
1417 dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1418 (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1419
1420 ASSERT(MUTEX_HELD(&so->so_lock));
1421 ASSERT(so->so_flag & SOLOCKED);
1422
1423 if (!(so->so_state & SS_ISBOUND)) {
1424 error = EINVAL;
1425 eprintsoline(so, error);
1426 goto done;
1427 }
1428
1429 mutex_exit(&so->so_lock);
1430
1431 /*
1432 * Flush the read and write side (except stream head read queue)
1433 * and send down T_UNBIND_REQ.
1434 */
1435 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1436
1437 unbind_req.PRIM_type = T_UNBIND_REQ;
1438 mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1439 0, _ALLOC_SLEEP, CRED());
1440 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1441 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1442 mutex_enter(&so->so_lock);
1443 if (error) {
1444 eprintsoline(so, error);
1445 goto done;
1446 }
1447
1448 error = sowaitokack(so, T_UNBIND_REQ);
1449 if (error) {
1450 eprintsoline(so, error);
1451 goto done;
1452 }
1453
1454 /*
1455 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1456 * strsock_proto while the lock was dropped above, the unbind
1457 * is allowed to complete.
1458 */
1459 if (!(flags & _SOUNBIND_REBIND)) {
1460 /*
1461 * Clear out bound address.
1462 */
1463 vnode_t *vp;
1464
1465 if ((vp = sti->sti_ux_bound_vp) != NULL) {
1466 sti->sti_ux_bound_vp = NULL;
1467 vn_rele_stream(vp);
1468 }
1469 /* Clear out address */
1470 sti->sti_laddr_len = 0;
1471 }
1472 so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1473 sti->sti_laddr_valid = 0;
1474
1475 done:
1476
1477 /* If the caller held the lock don't release it here */
1478 ASSERT(MUTEX_HELD(&so->so_lock));
1479 ASSERT(so->so_flag & SOLOCKED);
1480
1481 return (error);
1482 }
1483
1484 /*
1485 * listen on the socket.
1486 * For TPI conforming transports this has to first unbind with the transport
1487 * and then bind again using the new backlog.
1488 */
1489 /* ARGSUSED */
1490 int
1491 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1492 {
1493 int error = 0;
1494 sotpi_info_t *sti = SOTOTPI(so);
1495
1496 dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1497 (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1498
1499 if (sti->sti_serv_type == T_CLTS)
1500 return (EOPNOTSUPP);
1501
1502 /*
1503 * If the socket is ready to accept connections already, then
1504 * return without doing anything. This avoids a problem where
1505 * a second listen() call fails if a connection is pending and
1506 * leaves the socket unbound. Only when we are not unbinding
1507 * with the transport can we safely increase the backlog.
1508 */
1509 if (so->so_state & SS_ACCEPTCONN &&
1510 !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1511 /*CONSTCOND*/
1512 !solisten_tpi_tcp))
1513 return (0);
1514
1515 if (so->so_state & SS_ISCONNECTED)
1516 return (EINVAL);
1517
1518 mutex_enter(&so->so_lock);
1519 so_lock_single(so); /* Set SOLOCKED */
1520
1521 /*
1522 * If the listen doesn't change the backlog we do nothing.
1523 * This avoids an EPROTO error from the transport.
1524 */
1525 if ((so->so_state & SS_ACCEPTCONN) &&
1526 so->so_backlog == backlog)
1527 goto done;
1528
1529 if (!(so->so_state & SS_ISBOUND)) {
1530 /*
1531 * Must have been explicitly bound in the UNIX domain.
1532 */
1533 if (so->so_family == AF_UNIX) {
1534 error = EINVAL;
1535 goto done;
1536 }
1537 error = sotpi_bindlisten(so, NULL, 0, backlog,
1538 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1539 } else if (backlog > 0) {
1540 /*
1541 * AF_INET{,6} hack to avoid losing the port.
1542 * Assumes that all AF_INET{,6} transports can handle a
1543 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1544 * has already bound thus it is possible to avoid the unbind.
1545 */
1546 if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1547 /*CONSTCOND*/
1548 !solisten_tpi_tcp)) {
1549 error = sotpi_unbind(so, _SOUNBIND_REBIND);
1550 if (error)
1551 goto done;
1552 }
1553 error = sotpi_bindlisten(so, NULL, 0, backlog,
1554 _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1555 } else {
1556 so->so_state |= SS_ACCEPTCONN;
1557 so->so_backlog = backlog;
1558 }
1559 if (error)
1560 goto done;
1561 ASSERT(so->so_state & SS_ACCEPTCONN);
1562 done:
1563 so_unlock_single(so, SOLOCKED);
1564 mutex_exit(&so->so_lock);
1565 return (error);
1566 }
1567
1568 /*
1569 * Disconnect either a specified seqno or all (-1).
1570 * The former is used on listening sockets only.
1571 *
1572 * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1573 * the current use of sodisconnect(seqno == -1) is only for shutdown
1574 * so there is no point (and potentially incorrect) to unbind.
1575 */
1576 static int
1577 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1578 {
1579 struct T_discon_req discon_req;
1580 int error = 0;
1581 mblk_t *mp;
1582
1583 dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1584 (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1585
1586 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1587 mutex_enter(&so->so_lock);
1588 so_lock_single(so); /* Set SOLOCKED */
1589 } else {
1590 ASSERT(MUTEX_HELD(&so->so_lock));
1591 ASSERT(so->so_flag & SOLOCKED);
1592 }
1593
1594 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1595 error = EINVAL;
1596 eprintsoline(so, error);
1597 goto done;
1598 }
1599
1600 mutex_exit(&so->so_lock);
1601 /*
1602 * Flush the write side (unless this is a listener)
1603 * and then send down a T_DISCON_REQ.
1604 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1605 * and other messages.)
1606 */
1607 if (!(so->so_state & SS_ACCEPTCONN))
1608 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1609
1610 discon_req.PRIM_type = T_DISCON_REQ;
1611 discon_req.SEQ_number = seqno;
1612 mp = soallocproto1(&discon_req, sizeof (discon_req),
1613 0, _ALLOC_SLEEP, CRED());
1614 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1615 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1616 mutex_enter(&so->so_lock);
1617 if (error) {
1618 eprintsoline(so, error);
1619 goto done;
1620 }
1621
1622 error = sowaitokack(so, T_DISCON_REQ);
1623 if (error) {
1624 eprintsoline(so, error);
1625 goto done;
1626 }
1627 /*
1628 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1629 * strsock_proto while the lock was dropped above, the disconnect
1630 * is allowed to complete. However, it is not possible to
1631 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1632 */
1633 so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1634 SOTOTPI(so)->sti_laddr_valid = 0;
1635 SOTOTPI(so)->sti_faddr_valid = 0;
1636 done:
1637 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1638 so_unlock_single(so, SOLOCKED);
1639 mutex_exit(&so->so_lock);
1640 } else {
1641 /* If the caller held the lock don't release it here */
1642 ASSERT(MUTEX_HELD(&so->so_lock));
1643 ASSERT(so->so_flag & SOLOCKED);
1644 }
1645 return (error);
1646 }
1647
1648 /* ARGSUSED */
1649 int
1650 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1651 struct sonode **nsop)
1652 {
1653 struct T_conn_ind *conn_ind;
1654 struct T_conn_res *conn_res;
1655 int error = 0;
1656 mblk_t *mp, *ack_mp;
1657 struct sonode *nso;
1658 vnode_t *nvp;
1659 void *src;
1660 t_uscalar_t srclen;
1661 void *opt;
1662 t_uscalar_t optlen;
1663 t_scalar_t PRIM_type;
1664 t_scalar_t SEQ_number;
1665 size_t sinlen;
1666 sotpi_info_t *sti = SOTOTPI(so);
1667 sotpi_info_t *nsti;
1668
1669 dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1670 (void *)so, fflag, (void *)nsop,
1671 pr_state(so->so_state, so->so_mode)));
1672
1673 /*
1674 * Defer single-threading the accepting socket until
1675 * the T_CONN_IND has been received and parsed and the
1676 * new sonode has been opened.
1677 */
1678
1679 /* Check that we are not already connected */
1680 if ((so->so_state & SS_ACCEPTCONN) == 0)
1681 goto conn_bad;
1682 again:
1683 if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1684 goto e_bad;
1685
1686 ASSERT(mp != NULL);
1687 conn_ind = (struct T_conn_ind *)mp->b_rptr;
1688
1689 /*
1690 * Save SEQ_number for error paths.
1691 */
1692 SEQ_number = conn_ind->SEQ_number;
1693
1694 srclen = conn_ind->SRC_length;
1695 src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1696 if (src == NULL) {
1697 error = EPROTO;
1698 freemsg(mp);
1699 eprintsoline(so, error);
1700 goto disconnect_unlocked;
1701 }
1702 optlen = conn_ind->OPT_length;
1703 switch (so->so_family) {
1704 case AF_INET:
1705 case AF_INET6:
1706 if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1707 bcopy(mp->b_rptr + conn_ind->OPT_offset,
1708 &opt, conn_ind->OPT_length);
1709 } else {
1710 /*
1711 * The transport (in this case TCP) hasn't sent up
1712 * a pointer to an instance for the accept fast-path.
1713 * Disable fast-path completely because the call to
1714 * sotpi_create() below would otherwise create an
1715 * incomplete TCP instance, which would lead to
1716 * problems when sockfs sends a normal T_CONN_RES
1717 * message down the new stream.
1718 */
1719 if (sti->sti_direct) {
1720 int rval;
1721 /*
1722 * For consistency we inform tcp to disable
1723 * direct interface on the listener, though
1724 * we can certainly live without doing this
1725 * because no data will ever travel upstream
1726 * on the listening socket.
1727 */
1728 sti->sti_direct = 0;
1729 (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1730 0, 0, K_TO_K, cr, &rval);
1731 }
1732 opt = NULL;
1733 optlen = 0;
1734 }
1735 break;
1736 case AF_UNIX:
1737 default:
1738 if (optlen != 0) {
1739 opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1740 __TPI_ALIGN_SIZE);
1741 if (opt == NULL) {
1742 error = EPROTO;
1743 freemsg(mp);
1744 eprintsoline(so, error);
1745 goto disconnect_unlocked;
1746 }
1747 }
1748 if (so->so_family == AF_UNIX) {
1749 if (!sti->sti_faddr_noxlate) {
1750 src = NULL;
1751 srclen = 0;
1752 }
1753 /* Extract src address from options */
1754 if (optlen != 0)
1755 so_getopt_srcaddr(opt, optlen, &src, &srclen);
1756 }
1757 break;
1758 }
1759
1760 /*
1761 * Create the new socket.
1762 */
1763 nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1764 if (nso == NULL) {
1765 ASSERT(error != 0);
1766 /*
1767 * Accept can not fail with ENOBUFS. sotpi_create
1768 * sleeps waiting for memory until a signal is caught
1769 * so return EINTR.
1770 */
1771 freemsg(mp);
1772 if (error == ENOBUFS)
1773 error = EINTR;
1774 goto e_disc_unl;
1775 }
1776 nvp = SOTOV(nso);
1777 nsti = SOTOTPI(nso);
1778
1779 #ifdef DEBUG
1780 /*
1781 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1782 * it's inherited early to allow debugging of the accept code itself.
1783 */
1784 nso->so_options |= so->so_options & SO_DEBUG;
1785 #endif /* DEBUG */
1786
1787 /*
1788 * Save the SRC address from the T_CONN_IND
1789 * for getpeername to work on AF_UNIX and on transports that do not
1790 * support TI_GETPEERNAME.
1791 *
1792 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1793 * copyin_name().
1794 */
1795 if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1796 error = EINVAL;
1797 freemsg(mp);
1798 eprintsoline(so, error);
1799 goto disconnect_vp_unlocked;
1800 }
1801 nsti->sti_faddr_len = (socklen_t)srclen;
1802 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1803 bcopy(src, nsti->sti_faddr_sa, srclen);
1804 nsti->sti_faddr_valid = 1;
1805
1806 /*
1807 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1808 */
1809 if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1810 (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1811 cred_t *cr;
1812 pid_t cpid;
1813
1814 cr = msg_getcred(mp, &cpid);
1815 if (cr != NULL) {
1816 crhold(cr);
1817 nso->so_peercred = cr;
1818 nso->so_cpid = cpid;
1819 }
1820 freemsg(mp);
1821
1822 mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1823 sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1824 if (mp == NULL) {
1825 /*
1826 * Accept can not fail with ENOBUFS.
1827 * A signal was caught so return EINTR.
1828 */
1829 error = EINTR;
1830 eprintsoline(so, error);
1831 goto disconnect_vp_unlocked;
1832 }
1833 conn_res = (struct T_conn_res *)mp->b_rptr;
1834 } else {
1835 /*
1836 * For efficency reasons we use msg_extractcred; no crhold
1837 * needed since db_credp is cleared (i.e., we move the cred
1838 * from the message to so_peercred.
1839 */
1840 nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1841
1842 mp->b_rptr = DB_BASE(mp);
1843 conn_res = (struct T_conn_res *)mp->b_rptr;
1844 mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1845
1846 mblk_setcred(mp, cr, curproc->p_pid);
1847 }
1848
1849 /*
1850 * New socket must be bound at least in sockfs and, except for AF_INET,
1851 * (or AF_INET6) it also has to be bound in the transport provider.
1852 * We set the local address in the sonode from the T_OK_ACK of the
1853 * T_CONN_RES. For this reason the address we bind to here isn't
1854 * important.
1855 */
1856 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1857 /*CONSTCOND*/
1858 nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1859 /*
1860 * Optimization for AF_INET{,6} transports
1861 * that can handle a T_CONN_RES without being bound.
1862 */
1863 mutex_enter(&nso->so_lock);
1864 so_automatic_bind(nso);
1865 mutex_exit(&nso->so_lock);
1866 } else {
1867 /* Perform NULL bind with the transport provider. */
1868 if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1869 cr)) != 0) {
1870 ASSERT(error != ENOBUFS);
1871 freemsg(mp);
1872 eprintsoline(nso, error);
1873 goto disconnect_vp_unlocked;
1874 }
1875 }
1876
1877 /*
1878 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1879 * so that any data arriving on the new socket will cause the
1880 * appropriate signals to be delivered for the new socket.
1881 *
1882 * No other thread (except strsock_proto and strsock_misc)
1883 * can access the new socket thus we relax the locking.
1884 */
1885 nso->so_pgrp = so->so_pgrp;
1886 nso->so_state |= so->so_state & SS_ASYNC;
1887 nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1888
1889 if (nso->so_pgrp != 0) {
1890 if ((error = so_set_events(nso, nvp, cr)) != 0) {
1891 eprintsoline(nso, error);
1892 error = 0;
1893 nso->so_pgrp = 0;
1894 }
1895 }
1896
1897 /*
1898 * Make note of the socket level options. TCP and IP level options
1899 * are already inherited. We could do all this after accept is
1900 * successful but doing it here simplifies code and no harm done
1901 * for error case.
1902 */
1903 nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1904 SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1905 SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1906 nso->so_sndbuf = so->so_sndbuf;
1907 nso->so_rcvbuf = so->so_rcvbuf;
1908 if (nso->so_options & SO_LINGER)
1909 nso->so_linger = so->so_linger;
1910
1911 /*
1912 * Note that the following sti_direct code path should be
1913 * removed once we are confident that the direct sockets
1914 * do not result in any degradation.
1915 */
1916 if (sti->sti_direct) {
1917
1918 ASSERT(opt != NULL);
1919
1920 conn_res->OPT_length = optlen;
1921 conn_res->OPT_offset = MBLKL(mp);
1922 bcopy(&opt, mp->b_wptr, optlen);
1923 mp->b_wptr += optlen;
1924 conn_res->PRIM_type = T_CONN_RES;
1925 conn_res->ACCEPTOR_id = 0;
1926 PRIM_type = T_CONN_RES;
1927
1928 /* Send down the T_CONN_RES on acceptor STREAM */
1929 error = kstrputmsg(SOTOV(nso), mp, NULL,
1930 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1931 if (error) {
1932 mutex_enter(&so->so_lock);
1933 so_lock_single(so);
1934 eprintsoline(so, error);
1935 goto disconnect_vp;
1936 }
1937 mutex_enter(&nso->so_lock);
1938 error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1939 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1940 if (error) {
1941 mutex_exit(&nso->so_lock);
1942 mutex_enter(&so->so_lock);
1943 so_lock_single(so);
1944 eprintsoline(so, error);
1945 goto disconnect_vp;
1946 }
1947 if (nso->so_family == AF_INET) {
1948 sin_t *sin;
1949
1950 sin = (sin_t *)(ack_mp->b_rptr +
1951 sizeof (struct T_ok_ack));
1952 bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1953 nsti->sti_laddr_len = sizeof (sin_t);
1954 } else {
1955 sin6_t *sin6;
1956
1957 sin6 = (sin6_t *)(ack_mp->b_rptr +
1958 sizeof (struct T_ok_ack));
1959 bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1960 nsti->sti_laddr_len = sizeof (sin6_t);
1961 }
1962 freemsg(ack_mp);
1963
1964 nso->so_state |= SS_ISCONNECTED;
1965 nso->so_proto_handle = (sock_lower_handle_t)opt;
1966 nsti->sti_laddr_valid = 1;
1967
1968 if (sti->sti_nl7c_flags & NL7C_ENABLED) {
1969 /*
1970 * A NL7C marked listen()er so the new socket
1971 * inherits the listen()er's NL7C state, except
1972 * for NL7C_POLLIN.
1973 *
1974 * Only call NL7C to process the new socket if
1975 * the listen socket allows blocking i/o.
1976 */
1977 nsti->sti_nl7c_flags =
1978 sti->sti_nl7c_flags & (~NL7C_POLLIN);
1979 if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1980 /*
1981 * Nonblocking accept() just make it
1982 * persist to defer processing to the
1983 * read-side syscall (e.g. read).
1984 */
1985 nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
1986 } else if (nl7c_process(nso, B_FALSE)) {
1987 /*
1988 * NL7C has completed processing on the
1989 * socket, close the socket and back to
1990 * the top to await the next T_CONN_IND.
1991 */
1992 mutex_exit(&nso->so_lock);
1993 (void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1994 cr, NULL);
1995 VN_RELE(nvp);
1996 goto again;
1997 }
1998 /* Pass the new socket out */
1999 }
2000
2001 mutex_exit(&nso->so_lock);
2002
2003 /*
2004 * It's possible, through the use of autopush for example,
2005 * that the acceptor stream may not support sti_direct
2006 * semantics. If the new socket does not support sti_direct
2007 * we issue a _SIOCSOCKFALLBACK to inform the transport
2008 * as we would in the I_PUSH case.
2009 */
2010 if (nsti->sti_direct == 0) {
2011 int rval;
2012
2013 if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2014 0, 0, K_TO_K, cr, &rval)) != 0) {
2015 mutex_enter(&so->so_lock);
2016 so_lock_single(so);
2017 eprintsoline(so, error);
2018 goto disconnect_vp;
2019 }
2020 }
2021
2022 /*
2023 * Pass out new socket.
2024 */
2025 if (nsop != NULL)
2026 *nsop = nso;
2027
2028 return (0);
2029 }
2030
2031 /*
2032 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2033 * which don't support the FireEngine accept fast-path. It is also
2034 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2035 * again. Neither sockfs nor TCP attempt to find out if some other
2036 * random module has been inserted in between (in which case we
2037 * should follow TLI accept behaviour). We blindly assume the worst
2038 * case and revert back to old behaviour i.e. TCP will not send us
2039 * any option (eager) and the accept should happen on the listener
2040 * queue. Any queued T_conn_ind have already got their options removed
2041 * by so_sock2_stream() when "sockmod" was I_POP'd.
2042 */
2043 /*
2044 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2045 */
2046 if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2047 #ifdef _ILP32
2048 queue_t *q;
2049
2050 /*
2051 * Find read queue in driver
2052 * Can safely do this since we "own" nso/nvp.
2053 */
2054 q = strvp2wq(nvp)->q_next;
2055 while (SAMESTR(q))
2056 q = q->q_next;
2057 q = RD(q);
2058 conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2059 #else
2060 conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2061 #endif /* _ILP32 */
2062 conn_res->PRIM_type = O_T_CONN_RES;
2063 PRIM_type = O_T_CONN_RES;
2064 } else {
2065 conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2066 conn_res->PRIM_type = T_CONN_RES;
2067 PRIM_type = T_CONN_RES;
2068 }
2069 conn_res->SEQ_number = SEQ_number;
2070 conn_res->OPT_length = 0;
2071 conn_res->OPT_offset = 0;
2072
2073 mutex_enter(&so->so_lock);
2074 so_lock_single(so); /* Set SOLOCKED */
2075 mutex_exit(&so->so_lock);
2076
2077 error = kstrputmsg(SOTOV(so), mp, NULL,
2078 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2079 mutex_enter(&so->so_lock);
2080 if (error) {
2081 eprintsoline(so, error);
2082 goto disconnect_vp;
2083 }
2084 error = sowaitprim(so, PRIM_type, T_OK_ACK,
2085 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2086 if (error) {
2087 eprintsoline(so, error);
2088 goto disconnect_vp;
2089 }
2090 mutex_exit(&so->so_lock);
2091 /*
2092 * If there is a sin/sin6 appended onto the T_OK_ACK use
2093 * that to set the local address. If this is not present
2094 * then we zero out the address and don't set the
2095 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2096 * the pathname from the listening socket.
2097 * In the case where this is TCP or an AF_UNIX socket the
2098 * client side may have queued data or a T_ORDREL in the
2099 * transport. Having now sent the T_CONN_RES we may receive
2100 * those queued messages at any time. Hold the acceptor
2101 * so_lock until its state and laddr are finalized.
2102 */
2103 mutex_enter(&nso->so_lock);
2104 sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2105 if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2106 MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2107 ack_mp->b_rptr += sizeof (struct T_ok_ack);
2108 bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2109 nsti->sti_laddr_len = sinlen;
2110 nsti->sti_laddr_valid = 1;
2111 } else if (nso->so_family == AF_UNIX) {
2112 ASSERT(so->so_family == AF_UNIX);
2113 nsti->sti_laddr_len = sti->sti_laddr_len;
2114 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2115 bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2116 nsti->sti_laddr_len);
2117 nsti->sti_laddr_valid = 1;
2118 } else {
2119 nsti->sti_laddr_len = sti->sti_laddr_len;
2120 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2121 bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2122 nsti->sti_laddr_sa->sa_family = nso->so_family;
2123 }
2124 nso->so_state |= SS_ISCONNECTED;
2125 mutex_exit(&nso->so_lock);
2126
2127 freemsg(ack_mp);
2128
2129 mutex_enter(&so->so_lock);
2130 so_unlock_single(so, SOLOCKED);
2131 mutex_exit(&so->so_lock);
2132
2133 /*
2134 * Pass out new socket.
2135 */
2136 if (nsop != NULL)
2137 *nsop = nso;
2138
2139 return (0);
2140
2141
2142 eproto_disc_unl:
2143 error = EPROTO;
2144 e_disc_unl:
2145 eprintsoline(so, error);
2146 goto disconnect_unlocked;
2147
2148 pr_disc_vp_unl:
2149 eprintsoline(so, error);
2150 disconnect_vp_unlocked:
2151 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2152 VN_RELE(nvp);
2153 disconnect_unlocked:
2154 (void) sodisconnect(so, SEQ_number, 0);
2155 return (error);
2156
2157 pr_disc_vp:
2158 eprintsoline(so, error);
2159 disconnect_vp:
2160 (void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2161 so_unlock_single(so, SOLOCKED);
2162 mutex_exit(&so->so_lock);
2163 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2164 VN_RELE(nvp);
2165 return (error);
2166
2167 conn_bad: /* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2168 error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2169 ? EOPNOTSUPP : EINVAL;
2170 e_bad:
2171 eprintsoline(so, error);
2172 return (error);
2173 }
2174
/*
 * connect a socket.
 *
 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
 * unconnect (by specifying a null address).
 *
 * Arguments:
 *	so	- socket to connect.
 *	name	- peer address; NULL, or an address whose sa_family is
 *		  AF_UNSPEC, is treated as a request to unconnect.
 *	namelen	- length of name in bytes.
 *	fflag	- file flags handed to sowaitconnected(); forced to 0 on the
 *		  datagram T_CONN_REQ path.
 *	flags	- _SOCONNECT_* flags.  _SOCONNECT_NOXLATE and
 *		  _SOCONNECT_XPG4_2 are consumed here; _SOCONNECT_DID_BIND is
 *		  set locally when this routine had to bind the socket.
 *	cr	- credentials passed to soallocproto(), sotpi_bind() and
 *		  sotpi_setsockopt().
 *
 * Returns 0 on success or an errno value.  EINTR is returned when either
 * preallocation fails, EOPNOTSUPP when the socket is listening, EISCONN or
 * EALREADY when a connection-oriented socket is already connected or
 * connecting, and EINVAL for a missing, mis-sized or oversized address.
 *
 * Locking: so_lock is acquired here and released before returning.
 * SOLOCKED is held for most of the routine; it is dropped around the
 * sotpi_setsockopt() calls and before sowaitconnected().  need_unlock
 * records whether SOLOCKED is held when control reaches "done".
 */
int
sotpi_connect(struct sonode *so,
	struct sockaddr *name,
	socklen_t namelen,
	int fflag,
	int flags,
	struct cred *cr)
{
	struct T_conn_req conn_req;
	int error = 0;
	mblk_t *mp;
	void *src;
	socklen_t srclen;
	void *addr;
	socklen_t addrlen;
	boolean_t need_unlock;
	sotpi_info_t *sti = SOTOTPI(so);

	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
	    (void *)so, (void *)name, namelen, fflag, flags,
	    pr_state(so->so_state, so->so_mode)));

	/*
	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
	 * avoid sleeping for memory with SOLOCKED held.
	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
	 * + sizeof (struct T_opthdr).
	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
	 * exceed sti_faddr_maxlen).
	 */
	mp = soallocproto(sizeof (struct T_conn_req) +
	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
	    cr);
	if (mp == NULL) {
		/*
		 * Connect can not fail with ENOBUFS. A signal was
		 * caught so return EINTR.
		 */
		error = EINTR;
		eprintsoline(so, error);
		return (error);
	}

	mutex_enter(&so->so_lock);
	/*
	 * Make sure there is a preallocated T_unbind_req message
	 * before any binding. This message is allocated when the
	 * socket is created. Since another thread can consume
	 * so_unbind_mp by the time we return from so_lock_single(),
	 * we should check the availability of so_unbind_mp after
	 * we return from so_lock_single().
	 */

	so_lock_single(so);	/* Set SOLOCKED */
	need_unlock = B_TRUE;

	if (sti->sti_unbind_mp == NULL) {
		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
		/* NOTE: holding so_lock while sleeping */
		sti->sti_unbind_mp =
		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
		if (sti->sti_unbind_mp == NULL) {
			error = EINTR;
			goto done;
		}
	}

	/*
	 * Can't have done a listen before connecting.
	 */
	if (so->so_state & SS_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto done;
	}

	/*
	 * Must be bound with the transport
	 */
	if (!(so->so_state & SS_ISBOUND)) {
		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
		    /*CONSTCOND*/
		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
			/*
			 * Optimization for AF_INET{,6} transports
			 * that can handle a T_CONN_REQ without being bound.
			 */
			so_automatic_bind(so);
		} else {
			error = sotpi_bind(so, NULL, 0,
			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
			if (error)
				goto done;
		}
		ASSERT(so->so_state & SS_ISBOUND);
		/*
		 * Remember that the bind was done here so that the fatal
		 * error path at "done" can undo it.
		 */
		flags |= _SOCONNECT_DID_BIND;
	}

	/*
	 * Handle a connect to a name parameter of type AF_UNSPEC like a
	 * connect to a null address. This is the portable method to
	 * unconnect a socket.
	 */
	if ((namelen >= sizeof (sa_family_t)) &&
	    (name->sa_family == AF_UNSPEC)) {
		name = NULL;
		namelen = 0;
	}

	/*
	 * Check that we are not already connected.
	 * A connection-oriented socket cannot be reconnected.
	 * A connected connection-less socket can be
	 * - connected to a different address by a subsequent connect
	 * - "unconnected" by a connect to the NULL address
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
		ASSERT(!(flags & _SOCONNECT_DID_BIND));
		if (so->so_mode & SM_CONNREQUIRED) {
			/* Connection-oriented socket */
			error = so->so_state & SS_ISCONNECTED ?
			    EISCONN : EALREADY;
			goto done;
		}
		/* Connection-less socket */
		if (name == NULL) {
			/*
			 * Remove the connected state and clear SO_DGRAM_ERRIND
			 * since it was set when the socket was connected.
			 * If this is UDP also send down a T_DISCON_REQ.
			 */
			int val;

			if ((so->so_family == AF_INET ||
			    so->so_family == AF_INET6) &&
			    (so->so_type == SOCK_DGRAM ||
			    so->so_type == SOCK_RAW) &&
			    /*CONSTCOND*/
			    !soconnect_tpi_udp) {
				/* XXX What about implicitly unbinding here? */
				error = sodisconnect(so, -1,
				    _SODISCONNECT_LOCK_HELD);
			} else {
				so->so_state &=
				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
				sti->sti_faddr_valid = 0;
				sti->sti_faddr_len = 0;
			}

			/* Remove SOLOCKED since setsockopt will grab it */
			so_unlock_single(so, SOLOCKED);
			mutex_exit(&so->so_lock);

			/* The setsockopt result is deliberately ignored. */
			val = 0;
			(void) sotpi_setsockopt(so, SOL_SOCKET,
			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
			    cr);

			/* Retake SOLOCKED; "done" expects it (need_unlock). */
			mutex_enter(&so->so_lock);
			so_lock_single(so);	/* Set SOLOCKED */
			goto done;
		}
	}
	ASSERT(so->so_state & SS_ISBOUND);

	/* A null address is only meaningful for the unconnect case above. */
	if (name == NULL || namelen == 0) {
		error = EINVAL;
		goto done;
	}
	/*
	 * Mark the socket if sti_faddr_sa represents the transport level
	 * address.
	 */
	if (flags & _SOCONNECT_NOXLATE) {
		struct sockaddr_ux *soaddr_ux;

		ASSERT(so->so_family == AF_UNIX);
		if (namelen != sizeof (struct sockaddr_ux)) {
			error = EINVAL;
			goto done;
		}
		/* From here on name/namelen describe the embedded sou_addr. */
		soaddr_ux = (struct sockaddr_ux *)name;
		name = (struct sockaddr *)&soaddr_ux->sou_addr;
		namelen = sizeof (soaddr_ux->sou_addr);
		sti->sti_faddr_noxlate = 1;
	}

	/*
	 * Length and family checks.
	 */
	error = so_addr_verify(so, name, namelen);
	if (error)
		goto bad;

	/*
	 * Save foreign address. Needed for AF_UNIX as well as
	 * transport providers that do not support TI_GETPEERNAME.
	 * Also used for cached foreign address for TCP and UDP.
	 */
	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
		error = EINVAL;
		goto done;
	}
	sti->sti_faddr_len = (socklen_t)namelen;
	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
	bcopy(name, sti->sti_faddr_sa, namelen);
	sti->sti_faddr_valid = 1;

	/*
	 * Choose the destination (addr/addrlen) placed in the T_CONN_REQ
	 * and the optional source address (src/srclen) sent as an option.
	 * Only the translated AF_UNIX case supplies a source address.
	 */
	if (so->so_family == AF_UNIX) {
		if (sti->sti_faddr_noxlate) {
			/*
			 * sti_faddr is a transport-level address, so
			 * don't pass it as an option. Do save it in
			 * sti_ux_faddr, used for connected DG send.
			 */
			src = NULL;
			srclen = 0;
			addr = sti->sti_faddr_sa;
			addrlen = (t_uscalar_t)sti->sti_faddr_len;
			bcopy(addr, &sti->sti_ux_faddr,
			    sizeof (sti->sti_ux_faddr));
		} else {
			/*
			 * Pass the sockaddr_un source address as an option
			 * and translate the remote address.
			 * Holding so_lock thus sti_laddr_sa can not change.
			 */
			src = sti->sti_laddr_sa;
			srclen = (t_uscalar_t)sti->sti_laddr_len;
			dprintso(so, 1,
			    ("sotpi_connect UNIX: srclen %d, src %p\n",
			    srclen, src));
			/*
			 * Translate the destination address into our
			 * internal form, and save it in sti_ux_faddr.
			 * After this call, addr==&sti->sti_ux_taddr,
			 * and we copy that to sti->sti_ux_faddr so
			 * we save the connected peer address.
			 */
			error = so_ux_addr_xlate(so,
			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
			    (flags & _SOCONNECT_XPG4_2),
			    &addr, &addrlen);
			if (error)
				goto bad;
			bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
			    sizeof (sti->sti_ux_faddr));
		}
	} else {
		addr = sti->sti_faddr_sa;
		addrlen = (t_uscalar_t)sti->sti_faddr_len;
		src = NULL;
		srclen = 0;
	}
	/*
	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
	 * option which asks the transport provider to send T_UDERR_IND
	 * messages. These T_UDERR_IND messages are used to return connected
	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
	 *
	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
	 * we send down a T_CONN_REQ. This is needed to let the
	 * transport assign a local address that is consistent with
	 * the remote address. Applications depend on a getsockname()
	 * after a connect() to retrieve the "source" IP address for
	 * the connected socket. Invalidate the cached local address
	 * to force getsockname() to enquire of the transport.
	 */
	if (!(so->so_mode & SM_CONNREQUIRED)) {
		/*
		 * Datagram socket.
		 */
		int32_t val;

		/* SOLOCKED is dropped around the setsockopt call. */
		so_unlock_single(so, SOLOCKED);
		mutex_exit(&so->so_lock);

		val = 1;
		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
		    &val, (t_uscalar_t)sizeof (val), cr);

		mutex_enter(&so->so_lock);
		so_lock_single(so);	/* Set SOLOCKED */
		/*
		 * Anything other than an AF_INET{,6} SOCK_DGRAM/SOCK_RAW
		 * socket (or any socket when soconnect_tpi_udp is set) is
		 * marked connected here without sending a T_CONN_REQ.
		 */
		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
		    soconnect_tpi_udp) {
			soisconnected(so);
			goto done;
		}
		/*
		 * Send down T_CONN_REQ etc.
		 * Clear fflag to avoid returning EWOULDBLOCK.
		 */
		fflag = 0;
		ASSERT(so->so_family != AF_UNIX);
		sti->sti_laddr_valid = 0;
	} else if (sti->sti_laddr_len != 0) {
		/*
		 * If the local address or port was "any" then it may be
		 * changed by the transport as a result of the
		 * connect. Invalidate the cached version if we have one.
		 */
		switch (so->so_family) {
		case AF_INET:
			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
			    INADDR_ANY ||
			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
				sti->sti_laddr_valid = 0;
			break;

		case AF_INET6:
			ASSERT(sti->sti_laddr_len ==
			    (socklen_t)sizeof (sin6_t));
			if (IN6_IS_ADDR_UNSPECIFIED(
			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
			    IN6_IS_ADDR_V4MAPPED_ANY(
			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
				sti->sti_laddr_valid = 0;
			break;

		default:
			break;
		}
	}

	/*
	 * Check for failure of an earlier call
	 */
	if (so->so_error != 0)
		goto so_bad;

	/*
	 * Send down T_CONN_REQ. Message was allocated above.
	 */
	conn_req.PRIM_type = T_CONN_REQ;
	conn_req.DEST_length = addrlen;
	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
	if (srclen == 0) {
		conn_req.OPT_length = 0;
		conn_req.OPT_offset = 0;
		soappendmsg(mp, &conn_req, sizeof (conn_req));
		soappendmsg(mp, addr, addrlen);
	} else {
		/*
		 * There is a AF_UNIX sockaddr_un to include as a source
		 * address option.
		 */
		struct T_opthdr toh;

		toh.level = SOL_SOCKET;
		toh.name = SO_SRCADDR;
		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
		toh.status = 0;
		conn_req.OPT_length =
		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
		    _TPI_ALIGN_TOPT(addrlen));

		/*
		 * Layout: header, destination address, padding, option
		 * header, source address, padding.  b_wptr is advanced by
		 * hand over each padding gap.
		 */
		soappendmsg(mp, &conn_req, sizeof (conn_req));
		soappendmsg(mp, addr, addrlen);
		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
		soappendmsg(mp, &toh, sizeof (toh));
		soappendmsg(mp, src, srclen);
		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
	}
	/*
	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
	 * in order to have the right state when the T_CONN_CON shows up.
	 */
	soisconnecting(so);
	mutex_exit(&so->so_lock);

	if (AU_AUDITING())
		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);

	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
	/*
	 * Forget mp so that the freemsg() at "done" does not act on the
	 * message that was just passed to kstrputmsg().
	 */
	mp = NULL;
	mutex_enter(&so->so_lock);
	if (error != 0)
		goto bad;

	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
		goto bad;

	/* Allow other threads to access the socket */
	so_unlock_single(so, SOLOCKED);
	need_unlock = B_FALSE;

	/*
	 * Wait until we get a T_CONN_CON or an error
	 */
	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
		/* Retake SOLOCKED for the error handling at "done". */
		so_lock_single(so);	/* Set SOLOCKED */
		need_unlock = B_TRUE;
	}

done:
	/* mp is non-NULL here only if the T_CONN_REQ was never sent. */
	freemsg(mp);
	switch (error) {
	case EINPROGRESS:
	case EALREADY:
	case EISCONN:
	case EINTR:
		/* Non-fatal errors */
		sti->sti_laddr_valid = 0;
		/* FALLTHRU */
	case 0:
		break;
	default:
		ASSERT(need_unlock);
		/*
		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
		 * and invalidate local-address cache
		 */
		so->so_state &= ~SS_ISCONNECTING;
		sti->sti_laddr_valid = 0;
		/* A discon_ind might have already unbound us */
		if ((flags & _SOCONNECT_DID_BIND) &&
		    (so->so_state & SS_ISBOUND)) {
			int err;

			/* Undo the bind done by this call; log any failure. */
			err = sotpi_unbind(so, 0);
			/* LINTED - statement has no conseq */
			if (err) {
				eprintsoline(so, err);
			}
		}
		break;
	}
	if (need_unlock)
		so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	return (error);

	/*
	 * Error exits that log before joining "done": so_bad first fetches
	 * the pending socket error via sogeterr().
	 */
so_bad:	error = sogeterr(so, B_TRUE);
bad:	eprintsoline(so, error);
	goto done;
}
2622
2623 /* ARGSUSED */
2624 int
2625 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2626 {
2627 struct T_ordrel_req ordrel_req;
2628 mblk_t *mp;
2629 uint_t old_state, state_change;
2630 int error = 0;
2631 sotpi_info_t *sti = SOTOTPI(so);
2632
2633 dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2634 (void *)so, how, pr_state(so->so_state, so->so_mode)));
2635
2636 mutex_enter(&so->so_lock);
2637 so_lock_single(so); /* Set SOLOCKED */
2638
2639 /*
2640 * SunOS 4.X has no check for datagram sockets.
2641 * 5.X checks that it is connected (ENOTCONN)
2642 * X/Open requires that we check the connected state.
2643 */
2644 if (!(so->so_state & SS_ISCONNECTED)) {
2645 if (!xnet_skip_checks) {
2646 error = ENOTCONN;
2647 if (xnet_check_print) {
2648 printf("sockfs: X/Open shutdown check "
2649 "caused ENOTCONN\n");
2650 }
2651 }
2652 goto done;
2653 }
2654 /*
2655 * Record the current state and then perform any state changes.
2656 * Then use the difference between the old and new states to
2657 * determine which messages need to be sent.
2658 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2659 * duplicate calls to shutdown().
2660 */
2661 old_state = so->so_state;
2662
2663 switch (how) {
2664 case 0:
2665 socantrcvmore(so);
2666 break;
2667 case 1:
2668 socantsendmore(so);
2669 break;
2670 case 2:
2671 socantsendmore(so);
2672 socantrcvmore(so);
2673 break;
2674 default:
2675 error = EINVAL;
2676 goto done;
2677 }
2678
2679 /*
2680 * Assumes that the SS_CANT* flags are never cleared in the above code.
2681 */
2682 state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2683 (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2684 ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2685
2686 switch (state_change) {
2687 case 0:
2688 dprintso(so, 1,
2689 ("sotpi_shutdown: nothing to send in state 0x%x\n",
2690 so->so_state));
2691 goto done;
2692
2693 case SS_CANTRCVMORE:
2694 mutex_exit(&so->so_lock);
2695 strseteof(SOTOV(so), 1);
2696 /*
2697 * strseteof takes care of read side wakeups,
2698 * pollwakeups, and signals.
2699 */
2700 /*
2701 * Get the read lock before flushing data to avoid problems
2702 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2703 */
2704 mutex_enter(&so->so_lock);
2705 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2706 mutex_exit(&so->so_lock);
2707
2708 /* Flush read side queue */
2709 strflushrq(SOTOV(so), FLUSHALL);
2710
2711 mutex_enter(&so->so_lock);
2712 so_unlock_read(so); /* Clear SOREADLOCKED */
2713 break;
2714
2715 case SS_CANTSENDMORE:
2716 mutex_exit(&so->so_lock);
2717 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2718 mutex_enter(&so->so_lock);
2719 break;
2720
2721 case SS_CANTSENDMORE|SS_CANTRCVMORE:
2722 mutex_exit(&so->so_lock);
2723 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2724 strseteof(SOTOV(so), 1);
2725 /*
2726 * strseteof takes care of read side wakeups,
2727 * pollwakeups, and signals.
2728 */
2729 /*
2730 * Get the read lock before flushing data to avoid problems
2731 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2732 */
2733 mutex_enter(&so->so_lock);
2734 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2735 mutex_exit(&so->so_lock);
2736
2737 /* Flush read side queue */
2738 strflushrq(SOTOV(so), FLUSHALL);
2739
2740 mutex_enter(&so->so_lock);
2741 so_unlock_read(so); /* Clear SOREADLOCKED */
2742 break;
2743 }
2744
2745 ASSERT(MUTEX_HELD(&so->so_lock));
2746
2747 /*
2748 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2749 * was set due to this call and the new state has both of them set:
2750 * Send the AF_UNIX close indication
2751 * For T_COTS send a discon_ind
2752 *
2753 * If cantsend was set due to this call:
2754 * For T_COTSORD send an ordrel_ind
2755 *
2756 * Note that for T_CLTS there is no message sent here.
2757 */
2758 if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2759 (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2760 /*
2761 * For SunOS 4.X compatibility we tell the other end
2762 * that we are unable to receive at this point.
2763 */
2764 if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2765 so_unix_close(so);
2766
2767 if (sti->sti_serv_type == T_COTS)
2768 error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2769 }
2770 if ((state_change & SS_CANTSENDMORE) &&
2771 (sti->sti_serv_type == T_COTS_ORD)) {
2772 /* Send an orderly release */
2773 ordrel_req.PRIM_type = T_ORDREL_REQ;
2774
2775 mutex_exit(&so->so_lock);
2776 mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2777 0, _ALLOC_SLEEP, cr);
2778 /*
2779 * Send down the T_ORDREL_REQ even if there is flow control.
2780 * This prevents shutdown from blocking.
2781 * Note that there is no T_OK_ACK for ordrel_req.
2782 */
2783 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2784 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2785 mutex_enter(&so->so_lock);
2786 if (error) {
2787 eprintsoline(so, error);
2788 goto done;
2789 }
2790 }
2791
2792 done:
2793 so_unlock_single(so, SOLOCKED);
2794 mutex_exit(&so->so_lock);
2795 return (error);
2796 }
2797
2798 /*
2799 * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2800 * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2801 * that we have closed.
2802 * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2803 * T_UNITDATA_REQ containing the same option.
2804 *
2805 * For SOCK_DGRAM half-connections (somebody connected to this end
2806 * but this end is not connect) we don't know where to send any
2807 * SO_UNIX_CLOSE.
2808 *
2809 * We have to ignore stream head errors just in case there has been
2810 * a shutdown(output).
2811 * Ignore any flow control to try to get the message more quickly to the peer.
2812 * While locally ignoring flow control solves the problem when there
2813 * is only the loopback transport on the stream it would not provide
2814 * the correct AF_UNIX socket semantics when one or more modules have
2815 * been pushed.
2816 */
2817 void
2818 so_unix_close(struct sonode *so)
2819 {
2820 struct T_opthdr toh;
2821 mblk_t *mp;
2822 sotpi_info_t *sti = SOTOTPI(so);
2823
2824 ASSERT(MUTEX_HELD(&so->so_lock));
2825
2826 ASSERT(so->so_family == AF_UNIX);
2827
2828 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2829 (SS_ISCONNECTED|SS_ISBOUND))
2830 return;
2831
2832 dprintso(so, 1, ("so_unix_close(%p) %s\n",
2833 (void *)so, pr_state(so->so_state, so->so_mode)));
2834
2835 toh.level = SOL_SOCKET;
2836 toh.name = SO_UNIX_CLOSE;
2837
2838 /* zero length + header */
2839 toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2840 toh.status = 0;
2841
2842 if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2843 struct T_optdata_req tdr;
2844
2845 tdr.PRIM_type = T_OPTDATA_REQ;
2846 tdr.DATA_flag = 0;
2847
2848 tdr.OPT_length = (t_scalar_t)sizeof (toh);
2849 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2850
2851 /* NOTE: holding so_lock while sleeping */
2852 mp = soallocproto2(&tdr, sizeof (tdr),
2853 &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2854 } else {
2855 struct T_unitdata_req tudr;
2856 void *addr;
2857 socklen_t addrlen;
2858 void *src;
2859 socklen_t srclen;
2860 struct T_opthdr toh2;
2861 t_scalar_t size;
2862
2863 /*
2864 * We know this is an AF_UNIX connected DGRAM socket.
2865 * We therefore already have the destination address
2866 * in the internal form needed for this send. This is
2867 * similar to the sosend_dgram call later in this file
2868 * when there's no user-specified destination address.
2869 */
2870 if (sti->sti_faddr_noxlate) {
2871 /*
2872 * Already have a transport internal address. Do not
2873 * pass any (transport internal) source address.
2874 */
2875 addr = sti->sti_faddr_sa;
2876 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2877 src = NULL;
2878 srclen = 0;
2879 } else {
2880 /*
2881 * Pass the sockaddr_un source address as an option
2882 * and translate the remote address.
2883 * Holding so_lock thus sti_laddr_sa can not change.
2884 */
2885 src = sti->sti_laddr_sa;
2886 srclen = (socklen_t)sti->sti_laddr_len;
2887 dprintso(so, 1,
2888 ("so_ux_close: srclen %d, src %p\n",
2889 srclen, src));
2890 /*
2891 * Use the destination address saved in connect.
2892 */
2893 addr = &sti->sti_ux_faddr;
2894 addrlen = sizeof (sti->sti_ux_faddr);
2895 }
2896 tudr.PRIM_type = T_UNITDATA_REQ;
2897 tudr.DEST_length = addrlen;
2898 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2899 if (srclen == 0) {
2900 tudr.OPT_length = (t_scalar_t)sizeof (toh);
2901 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2902 _TPI_ALIGN_TOPT(addrlen));
2903
2904 size = tudr.OPT_offset + tudr.OPT_length;
2905 /* NOTE: holding so_lock while sleeping */
2906 mp = soallocproto2(&tudr, sizeof (tudr),
2907 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2908 mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2909 soappendmsg(mp, &toh, sizeof (toh));
2910 } else {
2911 /*
2912 * There is a AF_UNIX sockaddr_un to include as a
2913 * source address option.
2914 */
2915 tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2916 _TPI_ALIGN_TOPT(srclen));
2917 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2918 _TPI_ALIGN_TOPT(addrlen));
2919
2920 toh2.level = SOL_SOCKET;
2921 toh2.name = SO_SRCADDR;
2922 toh2.len = (t_uscalar_t)(srclen +
2923 sizeof (struct T_opthdr));
2924 toh2.status = 0;
2925
2926 size = tudr.OPT_offset + tudr.OPT_length;
2927
2928 /* NOTE: holding so_lock while sleeping */
2929 mp = soallocproto2(&tudr, sizeof (tudr),
2930 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2931 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2932 soappendmsg(mp, &toh, sizeof (toh));
2933 soappendmsg(mp, &toh2, sizeof (toh2));
2934 soappendmsg(mp, src, srclen);
2935 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2936 }
2937 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2938 }
2939 mutex_exit(&so->so_lock);
2940 (void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2941 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2942 mutex_enter(&so->so_lock);
2943 }
2944
2945 /*
2946 * Called by sotpi_recvmsg when reading a non-zero amount of data.
2947 * In addition, the caller typically verifies that there is some
2948 * potential state to clear by checking
2949 * if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2950 * before calling this routine.
2951 * Note that such a check can be made without holding so_lock since
2952 * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2953 * decrements sti_oobsigcnt.
2954 *
2955 * When data is read *after* the point that all pending
2956 * oob data has been consumed the oob indication is cleared.
2957 *
2958 * This logic keeps select/poll returning POLLRDBAND and
2959 * SIOCATMARK returning true until we have read past
2960 * the mark.
2961 */
2962 static void
2963 sorecv_update_oobstate(struct sonode *so)
2964 {
2965 sotpi_info_t *sti = SOTOTPI(so);
2966
2967 mutex_enter(&so->so_lock);
2968 ASSERT(so_verify_oobstate(so));
2969 dprintso(so, 1,
2970 ("sorecv_update_oobstate: counts %d/%d state %s\n",
2971 sti->sti_oobsigcnt,
2972 sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2973 if (sti->sti_oobsigcnt == 0) {
2974 /* No more pending oob indications */
2975 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2976 freemsg(so->so_oobmsg);
2977 so->so_oobmsg = NULL;
2978 }
2979 ASSERT(so_verify_oobstate(so));
2980 mutex_exit(&so->so_lock);
2981 }
2982
2983 /*
2984 * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2985 */
2986 static int
2987 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2988 {
2989 sotpi_info_t *sti = SOTOTPI(so);
2990 int error = 0;
2991 mblk_t *tmp = NULL;
2992 mblk_t *pmp = NULL;
2993 mblk_t *nmp = sti->sti_nl7c_rcv_mp;
2994
2995 ASSERT(nmp != NULL);
2996
2997 while (nmp != NULL && uiop->uio_resid > 0) {
2998 ssize_t n;
2999
3000 if (DB_TYPE(nmp) == M_DATA) {
3001 /*
3002 * We have some data, uiomove up to resid bytes.
3003 */
3004 n = MIN(MBLKL(nmp), uiop->uio_resid);
3005 if (n > 0)
3006 error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3007 nmp->b_rptr += n;
3008 if (nmp->b_rptr == nmp->b_wptr) {
3009 pmp = nmp;
3010 nmp = nmp->b_cont;
3011 }
3012 if (error)
3013 break;
3014 } else {
3015 /*
3016 * We only handle data, save for caller to handle.
3017 */
3018 if (pmp != NULL) {
3019 pmp->b_cont = nmp->b_cont;
3020 }
3021 nmp->b_cont = NULL;
3022 if (*rmp == NULL) {
3023 *rmp = nmp;
3024 } else {
3025 tmp->b_cont = nmp;
3026 }
3027 nmp = nmp->b_cont;
3028 tmp = nmp;
3029 }
3030 }
3031 if (pmp != NULL) {
3032 /* Free any mblk_t(s) which we have consumed */
3033 pmp->b_cont = NULL;
3034 freemsg(sti->sti_nl7c_rcv_mp);
3035 }
3036 if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3037 /* Last mblk_t so return the saved kstrgetmsg() rval/error */
3038 if (error == 0) {
3039 rval_t *p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3040
3041 error = p->r_v.r_v2;
3042 p->r_v.r_v2 = 0;
3043 }
3044 rp->r_vals = sti->sti_nl7c_rcv_rval;
3045 sti->sti_nl7c_rcv_rval = 0;
3046 } else {
3047 /* More mblk_t(s) to process so no rval to return */
3048 rp->r_vals = 0;
3049 }
3050 return (error);
3051 }
3052 /*
3053 * Receive the next message on the queue.
3054 * If msg_controllen is non-zero when called the caller is interested in
3055 * any received control info (options).
3056 * If msg_namelen is non-zero when called the caller is interested in
3057 * any received source address.
3058 * The routine returns with msg_control and msg_name pointing to
3059 * kmem_alloc'ed memory which the caller has to free.
3060 */
3061 /* ARGSUSED */
3062 int
3063 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3064 struct cred *cr)
3065 {
3066 union T_primitives *tpr;
3067 mblk_t *mp;
3068 uchar_t pri;
3069 int pflag, opflag;
3070 void *control;
3071 t_uscalar_t controllen;
3072 t_uscalar_t namelen;
3073 int so_state = so->so_state; /* Snapshot */
3074 ssize_t saved_resid;
3075 rval_t rval;
3076 int flags;
3077 clock_t timout;
3078 int error = 0;
3079 sotpi_info_t *sti = SOTOTPI(so);
3080
3081 flags = msg->msg_flags;
3082 msg->msg_flags = 0;
3083
3084 dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3085 (void *)so, (void *)msg, flags,
3086 pr_state(so->so_state, so->so_mode), so->so_error));
3087
3088 if (so->so_version == SOV_STREAM) {
3089 so_update_attrs(so, SOACC);
3090 /* The imaginary "sockmod" has been popped - act as a stream */
3091 return (strread(SOTOV(so), uiop, cr));
3092 }
3093
3094 /*
3095 * If we are not connected because we have never been connected
3096 * we return ENOTCONN. If we have been connected (but are no longer
3097 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3098 * the EOF.
3099 *
3100 * An alternative would be to post an ENOTCONN error in stream head
3101 * (read+write) and clear it when we're connected. However, that error
3102 * would cause incorrect poll/select behavior!
3103 */
3104 if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3105 (so->so_mode & SM_CONNREQUIRED)) {
3106 return (ENOTCONN);
3107 }
3108
3109 /*
3110 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3111 * after checking that the read queue is empty) and returns zero.
3112 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3113 * is zero.
3114 */
3115
3116 if (flags & MSG_OOB) {
3117 /* Check that the transport supports OOB */
3118 if (!(so->so_mode & SM_EXDATA))
3119 return (EOPNOTSUPP);
3120 so_update_attrs(so, SOACC);
3121 return (sorecvoob(so, msg, uiop, flags,
3122 (so->so_options & SO_OOBINLINE)));
3123 }
3124
3125 so_update_attrs(so, SOACC);
3126
3127 /*
3128 * Set msg_controllen and msg_namelen to zero here to make it
3129 * simpler in the cases that no control or name is returned.
3130 */
3131 controllen = msg->msg_controllen;
3132 namelen = msg->msg_namelen;
3133 msg->msg_controllen = 0;
3134 msg->msg_namelen = 0;
3135
3136 dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3137 namelen, controllen));
3138
3139 mutex_enter(&so->so_lock);
3140 /*
3141 * If an NL7C enabled socket and not waiting for write data.
3142 */
3143 if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3144 NL7C_ENABLED) {
3145 if (sti->sti_nl7c_uri) {
3146 /* Close uri processing for a previous request */
3147 nl7c_close(so);
3148 }
3149 if ((so_state & SS_CANTRCVMORE) &&
3150 sti->sti_nl7c_rcv_mp == NULL) {
3151 /* Nothing to process, EOF */
3152 mutex_exit(&so->so_lock);
3153 return (0);
3154 } else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3155 /* Persistent NL7C socket, try to process request */
3156 boolean_t ret;
3157
3158 ret = nl7c_process(so,
3159 (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3160 rval.r_vals = sti->sti_nl7c_rcv_rval;
3161 error = rval.r_v.r_v2;
3162 if (error) {
3163 /* Error of some sort, return it */
3164 mutex_exit(&so->so_lock);
3165 return (error);
3166 }
3167 if (sti->sti_nl7c_flags &&
3168 ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3169 /*
3170 * Still an NL7C socket and no data
3171 * to pass up to the caller.
3172 */
3173 mutex_exit(&so->so_lock);
3174 if (ret) {
3175 /* EOF */
3176 return (0);
3177 } else {
3178 /* Need more data */
3179 return (EAGAIN);
3180 }
3181 }
3182 } else {
3183 /*
3184 * Not persistent so no further NL7C processing.
3185 */
3186 sti->sti_nl7c_flags = 0;
3187 }
3188 }
3189 /*
3190 * Only one reader is allowed at any given time. This is needed
3191 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3192 *
	 * This is slightly different than BSD behavior in that it fails with
3194 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3195 * is single-threaded using sblock(), which is dropped while waiting
3196 * for data to appear. The difference shows up e.g. if one
3197 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3198 * does use nonblocking io and different threads are reading each
3199 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3200 * in this case as long as the read queue doesn't get empty.
3201 * In this implementation the thread using nonblocking io can
3202 * get an EWOULDBLOCK error due to the blocking thread executing
3203 * e.g. in the uiomove in kstrgetmsg.
3204 * This difference is not believed to be significant.
3205 */
3206 /* Set SOREADLOCKED */
3207 error = so_lock_read_intr(so,
3208 uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3209 mutex_exit(&so->so_lock);
3210 if (error)
3211 return (error);
3212
3213 /*
3214 * Tell kstrgetmsg to not inspect the stream head errors until all
3215 * queued data has been consumed.
3216 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3217 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3218 *
3219 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3220 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3221 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3222 */
3223 pflag = MSG_ANY | MSG_DELAYERROR;
3224 if (flags & MSG_PEEK) {
3225 pflag |= MSG_IPEEK;
3226 flags &= ~MSG_WAITALL;
3227 }
3228 if (so->so_mode & SM_ATOMIC)
3229 pflag |= MSG_DISCARDTAIL;
3230
3231 if (flags & MSG_DONTWAIT)
3232 timout = 0;
3233 else if (so->so_rcvtimeo != 0)
3234 timout = TICK_TO_MSEC(so->so_rcvtimeo);
3235 else
3236 timout = -1;
3237 opflag = pflag;
3238 retry:
3239 saved_resid = uiop->uio_resid;
3240 pri = 0;
3241 mp = NULL;
3242 if (sti->sti_nl7c_rcv_mp != NULL) {
3243 /* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3244 error = nl7c_sorecv(so, &mp, uiop, &rval);
3245 } else {
3246 error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3247 timout, &rval);
3248 }
3249 if (error != 0) {
3250 /* kstrgetmsg returns ETIME when timeout expires */
3251 if (error == ETIME)
3252 error = EWOULDBLOCK;
3253 goto out;
3254 }
3255 /*
3256 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3257 * For non-datagrams MOREDATA is used to set MSG_EOR.
3258 */
3259 ASSERT(!(rval.r_val1 & MORECTL));
3260 if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3261 msg->msg_flags |= MSG_TRUNC;
3262
3263 if (mp == NULL) {
3264 dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3265 /*
3266 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3267 * The draft Posix socket spec states that the mark should
3268 * not be cleared when peeking. We follow the latter.
3269 */
3270 if ((so->so_state &
3271 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3272 (uiop->uio_resid != saved_resid) &&
3273 !(flags & MSG_PEEK)) {
3274 sorecv_update_oobstate(so);
3275 }
3276
3277 mutex_enter(&so->so_lock);
3278 /* Set MSG_EOR based on MOREDATA */
3279 if (!(rval.r_val1 & MOREDATA)) {
3280 if (so->so_state & SS_SAVEDEOR) {
3281 msg->msg_flags |= MSG_EOR;
3282 so->so_state &= ~SS_SAVEDEOR;
3283 }
3284 }
3285 /*
3286 * If some data was received (i.e. not EOF) and the
3287 * read/recv* has not been satisfied wait for some more.
3288 */
3289 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3290 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3291 mutex_exit(&so->so_lock);
3292 pflag = opflag | MSG_NOMARK;
3293 goto retry;
3294 }
3295 goto out_locked;
3296 }
3297
3298 /* strsock_proto has already verified length and alignment */
3299 tpr = (union T_primitives *)mp->b_rptr;
3300 dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3301
3302 switch (tpr->type) {
3303 case T_DATA_IND: {
3304 if ((so->so_state &
3305 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3306 (uiop->uio_resid != saved_resid) &&
3307 !(flags & MSG_PEEK)) {
3308 sorecv_update_oobstate(so);
3309 }
3310
3311 /*
3312 * Set msg_flags to MSG_EOR based on
3313 * MORE_flag and MOREDATA.
3314 */
3315 mutex_enter(&so->so_lock);
3316 so->so_state &= ~SS_SAVEDEOR;
3317 if (!(tpr->data_ind.MORE_flag & 1)) {
3318 if (!(rval.r_val1 & MOREDATA))
3319 msg->msg_flags |= MSG_EOR;
3320 else
3321 so->so_state |= SS_SAVEDEOR;
3322 }
3323 freemsg(mp);
3324 /*
3325 * If some data was received (i.e. not EOF) and the
3326 * read/recv* has not been satisfied wait for some more.
3327 */
3328 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3329 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3330 mutex_exit(&so->so_lock);
3331 pflag = opflag | MSG_NOMARK;
3332 goto retry;
3333 }
3334 goto out_locked;
3335 }
3336 case T_UNITDATA_IND: {
3337 void *addr;
3338 t_uscalar_t addrlen;
3339 void *abuf;
3340 t_uscalar_t optlen;
3341 void *opt;
3342
3343 if ((so->so_state &
3344 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3345 (uiop->uio_resid != saved_resid) &&
3346 !(flags & MSG_PEEK)) {
3347 sorecv_update_oobstate(so);
3348 }
3349
3350 if (namelen != 0) {
3351 /* Caller wants source address */
3352 addrlen = tpr->unitdata_ind.SRC_length;
3353 addr = sogetoff(mp,
3354 tpr->unitdata_ind.SRC_offset,
3355 addrlen, 1);
3356 if (addr == NULL) {
3357 freemsg(mp);
3358 error = EPROTO;
3359 eprintsoline(so, error);
3360 goto out;
3361 }
3362 if (so->so_family == AF_UNIX) {
3363 /*
3364 * Can not use the transport level address.
3365 * If there is a SO_SRCADDR option carrying
3366 * the socket level address it will be
3367 * extracted below.
3368 */
3369 addr = NULL;
3370 addrlen = 0;
3371 }
3372 }
3373 optlen = tpr->unitdata_ind.OPT_length;
3374 if (optlen != 0) {
3375 t_uscalar_t ncontrollen;
3376
3377 /*
3378 * Extract any source address option.
3379 * Determine how large cmsg buffer is needed.
3380 */
3381 opt = sogetoff(mp,
3382 tpr->unitdata_ind.OPT_offset,
3383 optlen, __TPI_ALIGN_SIZE);
3384
3385 if (opt == NULL) {
3386 freemsg(mp);
3387 error = EPROTO;
3388 eprintsoline(so, error);
3389 goto out;
3390 }
3391 if (so->so_family == AF_UNIX)
3392 so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3393 ncontrollen = so_cmsglen(mp, opt, optlen,
3394 !(flags & MSG_XPG4_2));
3395 if (controllen != 0)
3396 controllen = ncontrollen;
3397 else if (ncontrollen != 0)
3398 msg->msg_flags |= MSG_CTRUNC;
3399 } else {
3400 controllen = 0;
3401 }
3402
3403 if (namelen != 0) {
3404 /*
3405 * Return address to caller.
3406 * Caller handles truncation if length
3407 * exceeds msg_namelen.
3408 * NOTE: AF_UNIX NUL termination is ensured by
3409 * the sender's copyin_name().
3410 */
3411 abuf = kmem_alloc(addrlen, KM_SLEEP);
3412
3413 bcopy(addr, abuf, addrlen);
3414 msg->msg_name = abuf;
3415 msg->msg_namelen = addrlen;
3416 }
3417
3418 if (controllen != 0) {
3419 /*
3420 * Return control msg to caller.
3421 * Caller handles truncation if length
3422 * exceeds msg_controllen.
3423 */
3424 control = kmem_zalloc(controllen, KM_SLEEP);
3425
3426 error = so_opt2cmsg(mp, opt, optlen,
3427 !(flags & MSG_XPG4_2),
3428 control, controllen);
3429 if (error) {
3430 freemsg(mp);
3431 if (msg->msg_namelen != 0)
3432 kmem_free(msg->msg_name,
3433 msg->msg_namelen);
3434 kmem_free(control, controllen);
3435 eprintsoline(so, error);
3436 goto out;
3437 }
3438 msg->msg_control = control;
3439 msg->msg_controllen = controllen;
3440 }
3441
3442 freemsg(mp);
3443 goto out;
3444 }
3445 case T_OPTDATA_IND: {
3446 struct T_optdata_req *tdr;
3447 void *opt;
3448 t_uscalar_t optlen;
3449
3450 if ((so->so_state &
3451 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3452 (uiop->uio_resid != saved_resid) &&
3453 !(flags & MSG_PEEK)) {
3454 sorecv_update_oobstate(so);
3455 }
3456
3457 tdr = (struct T_optdata_req *)mp->b_rptr;
3458 optlen = tdr->OPT_length;
3459 if (optlen != 0) {
3460 t_uscalar_t ncontrollen;
3461 /*
3462 * Determine how large cmsg buffer is needed.
3463 */
3464 opt = sogetoff(mp,
3465 tpr->optdata_ind.OPT_offset,
3466 optlen, __TPI_ALIGN_SIZE);
3467
3468 if (opt == NULL) {
3469 freemsg(mp);
3470 error = EPROTO;
3471 eprintsoline(so, error);
3472 goto out;
3473 }
3474
3475 ncontrollen = so_cmsglen(mp, opt, optlen,
3476 !(flags & MSG_XPG4_2));
3477 if (controllen != 0)
3478 controllen = ncontrollen;
3479 else if (ncontrollen != 0)
3480 msg->msg_flags |= MSG_CTRUNC;
3481 } else {
3482 controllen = 0;
3483 }
3484
3485 if (controllen != 0) {
3486 /*
3487 * Return control msg to caller.
3488 * Caller handles truncation if length
3489 * exceeds msg_controllen.
3490 */
3491 control = kmem_zalloc(controllen, KM_SLEEP);
3492
3493 error = so_opt2cmsg(mp, opt, optlen,
3494 !(flags & MSG_XPG4_2),
3495 control, controllen);
3496 if (error) {
3497 freemsg(mp);
3498 kmem_free(control, controllen);
3499 eprintsoline(so, error);
3500 goto out;
3501 }
3502 msg->msg_control = control;
3503 msg->msg_controllen = controllen;
3504 }
3505
3506 /*
3507 * Set msg_flags to MSG_EOR based on
3508 * DATA_flag and MOREDATA.
3509 */
3510 mutex_enter(&so->so_lock);
3511 so->so_state &= ~SS_SAVEDEOR;
3512 if (!(tpr->data_ind.MORE_flag & 1)) {
3513 if (!(rval.r_val1 & MOREDATA))
3514 msg->msg_flags |= MSG_EOR;
3515 else
3516 so->so_state |= SS_SAVEDEOR;
3517 }
3518 freemsg(mp);
3519 /*
3520 * If some data was received (i.e. not EOF) and the
3521 * read/recv* has not been satisfied wait for some more.
3522 * Not possible to wait if control info was received.
3523 */
3524 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3525 controllen == 0 &&
3526 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3527 mutex_exit(&so->so_lock);
3528 pflag = opflag | MSG_NOMARK;
3529 goto retry;
3530 }
3531 goto out_locked;
3532 }
3533 case T_EXDATA_IND: {
3534 dprintso(so, 1,
3535 ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3536 "state %s\n",
3537 sti->sti_oobsigcnt, sti->sti_oobcnt,
3538 saved_resid - uiop->uio_resid,
3539 pr_state(so->so_state, so->so_mode)));
3540 /*
3541 * kstrgetmsg handles MSGMARK so there is nothing to
3542 * inspect in the T_EXDATA_IND.
3543 * strsock_proto makes the stream head queue the T_EXDATA_IND
3544 * as a separate message with no M_DATA component. Furthermore,
3545 * the stream head does not consolidate M_DATA messages onto
3546 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3547 * remains a message by itself. This is needed since MSGMARK
3548 * marks both the whole message as well as the last byte
3549 * of the message.
3550 */
3551 freemsg(mp);
3552 ASSERT(uiop->uio_resid == saved_resid); /* No data */
3553 if (flags & MSG_PEEK) {
3554 /*
3555 * Even though we are peeking we consume the
3556 * T_EXDATA_IND thereby moving the mark information
3557 * to SS_RCVATMARK. Then the oob code below will
3558 * retry the peeking kstrgetmsg.
3559 * Note that the stream head read queue is
3560 * never flushed without holding SOREADLOCKED
3561 * thus the T_EXDATA_IND can not disappear
3562 * underneath us.
3563 */
3564 dprintso(so, 1,
3565 ("sotpi_recvmsg: consume EXDATA_IND "
3566 "counts %d/%d state %s\n",
3567 sti->sti_oobsigcnt,
3568 sti->sti_oobcnt,
3569 pr_state(so->so_state, so->so_mode)));
3570
3571 pflag = MSG_ANY | MSG_DELAYERROR;
3572 if (so->so_mode & SM_ATOMIC)
3573 pflag |= MSG_DISCARDTAIL;
3574
3575 pri = 0;
3576 mp = NULL;
3577
3578 error = kstrgetmsg(SOTOV(so), &mp, uiop,
3579 &pri, &pflag, (clock_t)-1, &rval);
3580 ASSERT(uiop->uio_resid == saved_resid);
3581
3582 if (error) {
3583 #ifdef SOCK_DEBUG
3584 if (error != EWOULDBLOCK && error != EINTR) {
3585 eprintsoline(so, error);
3586 }
3587 #endif /* SOCK_DEBUG */
3588 goto out;
3589 }
3590 ASSERT(mp);
3591 tpr = (union T_primitives *)mp->b_rptr;
3592 ASSERT(tpr->type == T_EXDATA_IND);
3593 freemsg(mp);
3594 } /* end "if (flags & MSG_PEEK)" */
3595
3596 /*
3597 * Decrement the number of queued and pending oob.
3598 *
3599 * SS_RCVATMARK is cleared when we read past a mark.
3600 * SS_HAVEOOBDATA is cleared when we've read past the
3601 * last mark.
3602 * SS_OOBPEND is cleared if we've read past the last
3603 * mark and no (new) SIGURG has been posted.
3604 */
3605 mutex_enter(&so->so_lock);
3606 ASSERT(so_verify_oobstate(so));
3607 ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3608 ASSERT(sti->sti_oobsigcnt > 0);
3609 sti->sti_oobsigcnt--;
3610 ASSERT(sti->sti_oobcnt > 0);
3611 sti->sti_oobcnt--;
3612 /*
3613 * Since the T_EXDATA_IND has been removed from the stream
3614 * head, but we have not read data past the mark,
3615 * sockfs needs to track that the socket is still at the mark.
3616 *
3617 * Since no data was received call kstrgetmsg again to wait
3618 * for data.
3619 */
3620 so->so_state |= SS_RCVATMARK;
3621 mutex_exit(&so->so_lock);
3622 dprintso(so, 1,
3623 ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3624 sti->sti_oobsigcnt, sti->sti_oobcnt,
3625 pr_state(so->so_state, so->so_mode)));
3626 pflag = opflag;
3627 goto retry;
3628 }
3629 default:
3630 cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3631 (void *)so, tpr->type, (void *)mp);
3632 ASSERT(0);
3633 freemsg(mp);
3634 error = EPROTO;
3635 eprintsoline(so, error);
3636 goto out;
3637 }
3638 /* NOTREACHED */
3639 out:
3640 mutex_enter(&so->so_lock);
3641 out_locked:
3642 so_unlock_read(so); /* Clear SOREADLOCKED */
3643 mutex_exit(&so->so_lock);
3644 return (error);
3645 }
3646
3647 /*
3648 * Sending data with options on a datagram socket.
3649 * Assumes caller has verified that SS_ISBOUND etc. are set.
3650 *
3651 * For AF_UNIX the destination address may be already in
3652 * internal form, as indicated by sti->sti_faddr_noxlate
3653 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3654 * translate the destination address to internal form.
3655 *
3656 * The source address is passed as an option. If passing
3657 * file descriptors, those are passed as file pointers in
3658 * another option.
3659 */
3660 static int
3661 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3662 struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3663 {
3664 struct T_unitdata_req tudr;
3665 mblk_t *mp;
3666 int error;
3667 void *addr;
3668 socklen_t addrlen;
3669 void *src;
3670 socklen_t srclen;
3671 ssize_t len;
3672 int size;
3673 struct T_opthdr toh;
3674 struct fdbuf *fdbuf;
3675 t_uscalar_t optlen;
3676 void *fds;
3677 int fdlen;
3678 sotpi_info_t *sti = SOTOTPI(so);
3679
3680 ASSERT(name && namelen);
3681 ASSERT(control && controllen);
3682
3683 len = uiop->uio_resid;
3684 if (len > (ssize_t)sti->sti_tidu_size) {
3685 return (EMSGSIZE);
3686 }
3687
3688 if (sti->sti_faddr_noxlate == 0 &&
3689 (flags & MSG_SENDTO_NOXLATE) == 0) {
3690 /*
3691 * Length and family checks.
3692 * Don't verify internal form.
3693 */
3694 error = so_addr_verify(so, name, namelen);
3695 if (error) {
3696 eprintsoline(so, error);
3697 return (error);
3698 }
3699 }
3700
3701 if (so->so_family == AF_UNIX) {
3702 if (sti->sti_faddr_noxlate) {
3703 /*
3704 * Already have a transport internal address. Do not
3705 * pass any (transport internal) source address.
3706 */
3707 addr = name;
3708 addrlen = namelen;
3709 src = NULL;
3710 srclen = 0;
3711 } else if (flags & MSG_SENDTO_NOXLATE) {
3712 /*
3713 * Have an internal form dest. address.
3714 * Pass the source address as usual.
3715 */
3716 addr = name;
3717 addrlen = namelen;
3718 src = sti->sti_laddr_sa;
3719 srclen = (socklen_t)sti->sti_laddr_len;
3720 } else {
3721 /*
3722 * Pass the sockaddr_un source address as an option
3723 * and translate the remote address.
3724 *
3725 * Note that this code does not prevent sti_laddr_sa
3726 * from changing while it is being used. Thus
3727 * if an unbind+bind occurs concurrently with this
3728 * send the peer might see a partially new and a
3729 * partially old "from" address.
3730 */
3731 src = sti->sti_laddr_sa;
3732 srclen = (socklen_t)sti->sti_laddr_len;
3733 dprintso(so, 1,
3734 ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3735 srclen, src));
3736 /*
3737 * The sendmsg caller specified a destination
3738 * address, which we must translate into our
3739 * internal form. addr = &sti->sti_ux_taddr
3740 */
3741 error = so_ux_addr_xlate(so, name, namelen,
3742 (flags & MSG_XPG4_2),
3743 &addr, &addrlen);
3744 if (error) {
3745 eprintsoline(so, error);
3746 return (error);
3747 }
3748 }
3749 } else {
3750 addr = name;
3751 addrlen = namelen;
3752 src = NULL;
3753 srclen = 0;
3754 }
3755 optlen = so_optlen(control, controllen,
3756 !(flags & MSG_XPG4_2));
3757 tudr.PRIM_type = T_UNITDATA_REQ;
3758 tudr.DEST_length = addrlen;
3759 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3760 if (srclen != 0)
3761 tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3762 _TPI_ALIGN_TOPT(srclen));
3763 else
3764 tudr.OPT_length = optlen;
3765 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3766 _TPI_ALIGN_TOPT(addrlen));
3767
3768 size = tudr.OPT_offset + tudr.OPT_length;
3769
3770 /*
3771 * File descriptors only when SM_FDPASSING set.
3772 */
3773 error = so_getfdopt(control, controllen,
3774 !(flags & MSG_XPG4_2), &fds, &fdlen);
3775 if (error)
3776 return (error);
3777 if (fdlen != -1) {
3778 if (!(so->so_mode & SM_FDPASSING))
3779 return (EOPNOTSUPP);
3780
3781 error = fdbuf_create(fds, fdlen, &fdbuf);
3782 if (error)
3783 return (error);
3784
3785 /*
3786 * Pre-allocate enough additional space for lower level modules
3787 * to append an option (e.g. see tl_unitdata). The following
3788 * is enough extra space for the largest option we might append.
3789 */
3790 size += sizeof (struct T_opthdr) + ucredsize;
3791 mp = fdbuf_allocmsg(size, fdbuf);
3792 } else {
3793 mp = soallocproto(size, _ALLOC_INTR, CRED());
3794 if (mp == NULL) {
3795 /*
3796 * Caught a signal waiting for memory.
3797 * Let send* return EINTR.
3798 */
3799 return (EINTR);
3800 }
3801 }
3802 soappendmsg(mp, &tudr, sizeof (tudr));
3803 soappendmsg(mp, addr, addrlen);
3804 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3805
3806 if (fdlen != -1) {
3807 ASSERT(fdbuf != NULL);
3808 toh.level = SOL_SOCKET;
3809 toh.name = SO_FILEP;
3810 toh.len = fdbuf->fd_size +
3811 (t_uscalar_t)sizeof (struct T_opthdr);
3812 toh.status = 0;
3813 soappendmsg(mp, &toh, sizeof (toh));
3814 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3815 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3816 }
3817 if (srclen != 0) {
3818 /*
3819 * There is a AF_UNIX sockaddr_un to include as a source
3820 * address option.
3821 */
3822 toh.level = SOL_SOCKET;
3823 toh.name = SO_SRCADDR;
3824 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3825 toh.status = 0;
3826 soappendmsg(mp, &toh, sizeof (toh));
3827 soappendmsg(mp, src, srclen);
3828 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3829 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3830 }
3831 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3832 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3833 /*
3834 * Normally at most 3 bytes left in the message, but we might have
3835 * allowed for extra space if we're passing fd's through.
3836 */
3837 ASSERT(MBLKL(mp) <= (ssize_t)size);
3838
3839 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3840 if (AU_AUDITING())
3841 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3842
3843 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3844 #ifdef SOCK_DEBUG
3845 if (error) {
3846 eprintsoline(so, error);
3847 }
3848 #endif /* SOCK_DEBUG */
3849 return (error);
3850 }
3851
3852 /*
3853 * Sending data with options on a connected stream socket.
3854 * Assumes caller has verified that SS_ISCONNECTED is set.
3855 */
3856 static int
3857 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3858 t_uscalar_t controllen, int flags)
3859 {
3860 struct T_optdata_req tdr;
3861 mblk_t *mp;
3862 int error;
3863 ssize_t iosize;
3864 int size;
3865 struct fdbuf *fdbuf;
3866 t_uscalar_t optlen;
3867 void *fds;
3868 int fdlen;
3869 struct T_opthdr toh;
3870 sotpi_info_t *sti = SOTOTPI(so);
3871
3872 dprintso(so, 1,
3873 ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3874
3875 /*
3876 * Has to be bound and connected. However, since no locks are
3877 * held the state could have changed after sotpi_sendmsg checked it
3878 * thus it is not possible to ASSERT on the state.
3879 */
3880
3881 /* Options on connection-oriented only when SM_OPTDATA set. */
3882 if (!(so->so_mode & SM_OPTDATA))
3883 return (EOPNOTSUPP);
3884
3885 do {
3886 /*
3887 * Set the MORE flag if uio_resid does not fit in this
3888 * message or if the caller passed in "more".
3889 * Error for transports with zero tidu_size.
3890 */
3891 tdr.PRIM_type = T_OPTDATA_REQ;
3892 iosize = sti->sti_tidu_size;
3893 if (iosize <= 0)
3894 return (EMSGSIZE);
3895 if (uiop->uio_resid > iosize) {
3896 tdr.DATA_flag = 1;
3897 } else {
3898 if (more)
3899 tdr.DATA_flag = 1;
3900 else
3901 tdr.DATA_flag = 0;
3902 iosize = uiop->uio_resid;
3903 }
3904 dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3905 tdr.DATA_flag, iosize));
3906
3907 optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3908 tdr.OPT_length = optlen;
3909 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3910
3911 size = (int)sizeof (tdr) + optlen;
3912 /*
3913 * File descriptors only when SM_FDPASSING set.
3914 */
3915 error = so_getfdopt(control, controllen,
3916 !(flags & MSG_XPG4_2), &fds, &fdlen);
3917 if (error)
3918 return (error);
3919 if (fdlen != -1) {
3920 if (!(so->so_mode & SM_FDPASSING))
3921 return (EOPNOTSUPP);
3922
3923 error = fdbuf_create(fds, fdlen, &fdbuf);
3924 if (error)
3925 return (error);
3926
3927 /*
3928 * Pre-allocate enough additional space for lower level
3929 * modules to append an option (e.g. see tl_unitdata).
3930 * The following is enough extra space for the largest
3931 * option we might append.
3932 */
3933 size += sizeof (struct T_opthdr) + ucredsize;
3934 mp = fdbuf_allocmsg(size, fdbuf);
3935 } else {
3936 mp = soallocproto(size, _ALLOC_INTR, CRED());
3937 if (mp == NULL) {
3938 /*
3939 * Caught a signal waiting for memory.
3940 * Let send* return EINTR.
3941 */
3942 return (EINTR);
3943 }
3944 }
3945 soappendmsg(mp, &tdr, sizeof (tdr));
3946
3947 if (fdlen != -1) {
3948 ASSERT(fdbuf != NULL);
3949 toh.level = SOL_SOCKET;
3950 toh.name = SO_FILEP;
3951 toh.len = fdbuf->fd_size +
3952 (t_uscalar_t)sizeof (struct T_opthdr);
3953 toh.status = 0;
3954 soappendmsg(mp, &toh, sizeof (toh));
3955 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3956 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3957 }
3958 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3959 /*
3960 * Normally at most 3 bytes left in the message, but we might
3961 * have allowed for extra space if we're passing fd's through.
3962 */
3963 ASSERT(MBLKL(mp) <= (ssize_t)size);
3964
3965 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3966
3967 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3968 0, MSG_BAND, 0);
3969 if (error) {
3970 eprintsoline(so, error);
3971 return (error);
3972 }
3973 control = NULL;
3974 if (uiop->uio_resid > 0) {
3975 /*
3976 * Recheck for fatal errors. Fail write even though
3977 * some data have been written. This is consistent
3978 * with strwrite semantics and BSD sockets semantics.
3979 */
3980 if (so->so_state & SS_CANTSENDMORE) {
3981 eprintsoline(so, error);
3982 return (EPIPE);
3983 }
3984 if (so->so_error != 0) {
3985 mutex_enter(&so->so_lock);
3986 error = sogeterr(so, B_TRUE);
3987 mutex_exit(&so->so_lock);
3988 if (error != 0) {
3989 eprintsoline(so, error);
3990 return (error);
3991 }
3992 }
3993 }
3994 } while (uiop->uio_resid > 0);
3995 return (0);
3996 }
3997
3998 /*
3999 * Sending data on a datagram socket.
4000 * Assumes caller has verified that SS_ISBOUND etc. are set.
4001 *
4002 * For AF_UNIX the destination address may be already in
4003 * internal form, as indicated by sti->sti_faddr_noxlate
4004 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
4005 * translate the destination address to internal form.
4006 *
4007 * The source address is passed as an option.
4008 */
4009 int
4010 sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
4011 struct uio *uiop, int flags)
4012 {
4013 struct T_unitdata_req tudr;
4014 mblk_t *mp;
4015 int error;
4016 void *addr;
4017 socklen_t addrlen;
4018 void *src;
4019 socklen_t srclen;
4020 ssize_t len;
4021 sotpi_info_t *sti = SOTOTPI(so);
4022
4023 ASSERT(name != NULL && namelen != 0);
4024
4025 len = uiop->uio_resid;
4026 if (len > sti->sti_tidu_size) {
4027 error = EMSGSIZE;
4028 goto done;
4029 }
4030
4031 if (sti->sti_faddr_noxlate == 0 &&
4032 (flags & MSG_SENDTO_NOXLATE) == 0) {
4033 /*
4034 * Length and family checks.
4035 * Don't verify internal form.
4036 */
4037 error = so_addr_verify(so, name, namelen);
4038 if (error != 0)
4039 goto done;
4040 }
4041
4042 if (sti->sti_direct) /* Never on AF_UNIX */
4043 return (sodgram_direct(so, name, namelen, uiop, flags));
4044
4045 if (so->so_family == AF_UNIX) {
4046 if (sti->sti_faddr_noxlate) {
4047 /*
4048 * Already have a transport internal address. Do not
4049 * pass any (transport internal) source address.
4050 */
4051 addr = name;
4052 addrlen = namelen;
4053 src = NULL;
4054 srclen = 0;
4055 } else if (flags & MSG_SENDTO_NOXLATE) {
4056 /*
4057 * Have an internal form dest. address.
4058 * Pass the source address as usual.
4059 */
4060 addr = name;
4061 addrlen = namelen;
4062 src = sti->sti_laddr_sa;
4063 srclen = (socklen_t)sti->sti_laddr_len;
4064 } else {
4065 /*
4066 * Pass the sockaddr_un source address as an option
4067 * and translate the remote address.
4068 *
4069 * Note that this code does not prevent sti_laddr_sa
4070 * from changing while it is being used. Thus
4071 * if an unbind+bind occurs concurrently with this
4072 * send the peer might see a partially new and a
4073 * partially old "from" address.
4074 */
4075 src = sti->sti_laddr_sa;
4076 srclen = (socklen_t)sti->sti_laddr_len;
4077 dprintso(so, 1,
4078 ("sosend_dgram UNIX: srclen %d, src %p\n",
4079 srclen, src));
4080 /*
4081 * The sendmsg caller specified a destination
4082 * address, which we must translate into our
4083 * internal form. addr = &sti->sti_ux_taddr
4084 */
4085 error = so_ux_addr_xlate(so, name, namelen,
4086 (flags & MSG_XPG4_2),
4087 &addr, &addrlen);
4088 if (error) {
4089 eprintsoline(so, error);
4090 goto done;
4091 }
4092 }
4093 } else {
4094 addr = name;
4095 addrlen = namelen;
4096 src = NULL;
4097 srclen = 0;
4098 }
4099 tudr.PRIM_type = T_UNITDATA_REQ;
4100 tudr.DEST_length = addrlen;
4101 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4102 if (srclen == 0) {
4103 tudr.OPT_length = 0;
4104 tudr.OPT_offset = 0;
4105
4106 mp = soallocproto2(&tudr, sizeof (tudr),
4107 addr, addrlen, 0, _ALLOC_INTR, CRED());
4108 if (mp == NULL) {
4109 /*
4110 * Caught a signal waiting for memory.
4111 * Let send* return EINTR.
4112 */
4113 error = EINTR;
4114 goto done;
4115 }
4116 } else {
4117 /*
4118 * There is a AF_UNIX sockaddr_un to include as a source
4119 * address option.
4120 */
4121 struct T_opthdr toh;
4122 ssize_t size;
4123
4124 tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4125 _TPI_ALIGN_TOPT(srclen));
4126 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4127 _TPI_ALIGN_TOPT(addrlen));
4128
4129 toh.level = SOL_SOCKET;
4130 toh.name = SO_SRCADDR;
4131 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4132 toh.status = 0;
4133
4134 size = tudr.OPT_offset + tudr.OPT_length;
4135 mp = soallocproto2(&tudr, sizeof (tudr),
4136 addr, addrlen, size, _ALLOC_INTR, CRED());
4137 if (mp == NULL) {
4138 /*
4139 * Caught a signal waiting for memory.
4140 * Let send* return EINTR.
4141 */
4142 error = EINTR;
4143 goto done;
4144 }
4145 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4146 soappendmsg(mp, &toh, sizeof (toh));
4147 soappendmsg(mp, src, srclen);
4148 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4149 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4150 }
4151
4152 if (AU_AUDITING())
4153 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4154
4155 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4156 done:
4157 #ifdef SOCK_DEBUG
4158 if (error) {
4159 eprintsoline(so, error);
4160 }
4161 #endif /* SOCK_DEBUG */
4162 return (error);
4163 }
4164
4165 /*
4166 * Sending data on a connected stream socket.
4167 * Assumes caller has verified that SS_ISCONNECTED is set.
4168 */
4169 int
4170 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4171 int sflag)
4172 {
4173 struct T_data_req tdr;
4174 mblk_t *mp;
4175 int error;
4176 ssize_t iosize;
4177 sotpi_info_t *sti = SOTOTPI(so);
4178
4179 dprintso(so, 1,
4180 ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4181 (void *)so, uiop->uio_resid, prim, sflag));
4182
4183 /*
4184 * Has to be bound and connected. However, since no locks are
4185 * held the state could have changed after sotpi_sendmsg checked it
4186 * thus it is not possible to ASSERT on the state.
4187 */
4188
4189 do {
4190 /*
4191 * Set the MORE flag if uio_resid does not fit in this
4192 * message or if the caller passed in "more".
4193 * Error for transports with zero tidu_size.
4194 */
4195 tdr.PRIM_type = prim;
4196 iosize = sti->sti_tidu_size;
4197 if (iosize <= 0)
4198 return (EMSGSIZE);
4199 if (uiop->uio_resid > iosize) {
4200 tdr.MORE_flag = 1;
4201 } else {
4202 if (more)
4203 tdr.MORE_flag = 1;
4204 else
4205 tdr.MORE_flag = 0;
4206 iosize = uiop->uio_resid;
4207 }
4208 dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4209 prim, tdr.MORE_flag, iosize));
4210 mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4211 if (mp == NULL) {
4212 /*
4213 * Caught a signal waiting for memory.
4214 * Let send* return EINTR.
4215 */
4216 return (EINTR);
4217 }
4218
4219 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4220 0, sflag | MSG_BAND, 0);
4221 if (error) {
4222 eprintsoline(so, error);
4223 return (error);
4224 }
4225 if (uiop->uio_resid > 0) {
4226 /*
4227 * Recheck for fatal errors. Fail write even though
4228 * some data have been written. This is consistent
4229 * with strwrite semantics and BSD sockets semantics.
4230 */
4231 if (so->so_state & SS_CANTSENDMORE) {
4232 eprintsoline(so, error);
4233 return (EPIPE);
4234 }
4235 if (so->so_error != 0) {
4236 mutex_enter(&so->so_lock);
4237 error = sogeterr(so, B_TRUE);
4238 mutex_exit(&so->so_lock);
4239 if (error != 0) {
4240 eprintsoline(so, error);
4241 return (error);
4242 }
4243 }
4244 }
4245 } while (uiop->uio_resid > 0);
4246 return (0);
4247 }
4248
4249 /*
4250 * Check the state for errors and call the appropriate send function.
4251 *
4252 * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4253 * this function issues a setsockopt to toggle SO_DONTROUTE before and
4254 * after sending the message.
4255 *
4256 * The caller may optionally specify a destination address, for either
4257 * stream or datagram sockets. This table summarizes the cases:
4258 *
4259 * Socket type Dest. given Connected Result
4260 * ----------- ----------- --------- --------------
4261 * Stream * Yes send to conn. addr.
4262 * Stream * No error ENOTCONN
4263 * Dgram yes * send to given addr.
4264 * Dgram no yes send to conn. addr.
4265 * Dgram no no error EDESTADDRREQ
4266 *
4267 * There are subtleties around the destination address when using
4268 * AF_UNIX datagram sockets. When the sendmsg call specifies the
4269 * destination address, it's in (struct sockaddr_un) form and we
4270 * need to translate it to our internal form (struct so_ux_addr).
4271 *
4272 * When the sendmsg call does not specify a destination address
4273 * we're using the peer address saved during sotpi_connect, and
4274 * that address is already in internal form. In this case, the
4275 * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4276 * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4277 * those functions should skip translation to internal form.
4278 * Avoiding that translation is not only more efficient, but it's
4279 * also necessary when a process does a connect on an AF_UNIX
4280 * datagram socket and then drops privileges. After the process
 * has dropped privileges, it may no longer be able to look up the
 * external name in the filesystem, but it should still be
4283 * able to send messages on the connected socket by leaving the
4284 * destination name unspecified.
4285 *
4286 * Yet more subtleties arise with sockets connected by socketpair(),
4287 * which puts internal form addresses in the fields where normally
4288 * the external form is found, and sets sti_faddr_noxlate=1, which
4289 * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4290 * to skip translation of destination addresses to internal form.
4291 * However, beware that the flag sti_faddr_noxlate=1 also triggers
4292 * different behaviour almost everywhere AF_UNIX addresses appear.
4293 */
4294 static int
4295 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4296 struct cred *cr)
4297 {
4298 int so_state;
4299 int so_mode;
4300 int error;
4301 struct sockaddr *name;
4302 t_uscalar_t namelen;
4303 int dontroute;
4304 int flags;
4305 sotpi_info_t *sti = SOTOTPI(so);
4306
4307 dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4308 (void *)so, (void *)msg, msg->msg_flags,
4309 pr_state(so->so_state, so->so_mode), so->so_error));
4310
4311 if (so->so_version == SOV_STREAM) {
4312 /* The imaginary "sockmod" has been popped - act as a stream */
4313 so_update_attrs(so, SOMOD);
4314 return (strwrite(SOTOV(so), uiop, cr));
4315 }
4316
4317 mutex_enter(&so->so_lock);
4318 so_state = so->so_state;
4319
4320 if (so_state & SS_CANTSENDMORE) {
4321 mutex_exit(&so->so_lock);
4322 return (EPIPE);
4323 }
4324
4325 if (so->so_error != 0) {
4326 error = sogeterr(so, B_TRUE);
4327 if (error != 0) {
4328 mutex_exit(&so->so_lock);
4329 return (error);
4330 }
4331 }
4332
4333 name = (struct sockaddr *)msg->msg_name;
4334 namelen = msg->msg_namelen;
4335 flags = msg->msg_flags;
4336
4337 /*
4338 * Historically, this function does not validate the flags
4339 * passed in, and any errant bits are ignored. However,
4340 * we would not want any such errant flag bits accidently
4341 * being treated as one of the internal-only flags, so
4342 * clear the internal-only flag bits.
4343 */
4344 flags &= ~MSG_SENDTO_NOXLATE;
4345
4346 so_mode = so->so_mode;
4347
4348 if (name == NULL) {
4349 if (!(so_state & SS_ISCONNECTED)) {
4350 mutex_exit(&so->so_lock);
4351 if (so_mode & SM_CONNREQUIRED)
4352 return (ENOTCONN);
4353 else
4354 return (EDESTADDRREQ);
4355 }
4356 /*
4357 * This is a connected socket.
4358 */
4359 if (so_mode & SM_CONNREQUIRED) {
4360 /*
4361 * This is a connected STREAM socket,
4362 * destination not specified.
4363 */
4364 name = NULL;
4365 namelen = 0;
4366 } else {
4367 /*
4368 * Datagram send on connected socket with
4369 * the destination name not specified.
4370 * Use the peer address from connect.
4371 */
4372 if (so->so_family == AF_UNIX) {
4373 /*
4374 * Use the (internal form) address saved
4375 * in sotpi_connect. See above.
4376 */
4377 name = (void *)&sti->sti_ux_faddr;
4378 namelen = sizeof (sti->sti_ux_faddr);
4379 flags |= MSG_SENDTO_NOXLATE;
4380 } else {
4381 ASSERT(sti->sti_faddr_sa);
4382 name = sti->sti_faddr_sa;
4383 namelen = (t_uscalar_t)sti->sti_faddr_len;
4384 }
4385 }
4386 } else {
4387 /*
4388 * Sendmsg specifies a destination name
4389 */
4390 if (!(so_state & SS_ISCONNECTED) &&
4391 (so_mode & SM_CONNREQUIRED)) {
4392 /* i.e. TCP not connected */
4393 mutex_exit(&so->so_lock);
4394 return (ENOTCONN);
4395 }
4396 /*
4397 * Ignore the address on connection-oriented sockets.
4398 * Just like BSD this code does not generate an error for
4399 * TCP (a CONNREQUIRED socket) when sending to an address
4400 * passed in with sendto/sendmsg. Instead the data is
4401 * delivered on the connection as if no address had been
4402 * supplied.
4403 */
4404 if ((so_state & SS_ISCONNECTED) &&
4405 !(so_mode & SM_CONNREQUIRED)) {
4406 mutex_exit(&so->so_lock);
4407 return (EISCONN);
4408 }
4409 if (!(so_state & SS_ISBOUND)) {
4410 so_lock_single(so); /* Set SOLOCKED */
4411 error = sotpi_bind(so, NULL, 0,
4412 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4413 so_unlock_single(so, SOLOCKED);
4414 if (error) {
4415 mutex_exit(&so->so_lock);
4416 eprintsoline(so, error);
4417 return (error);
4418 }
4419 }
4420 /*
4421 * Handle delayed datagram errors. These are only queued
4422 * when the application sets SO_DGRAM_ERRIND.
4423 * Return the error if we are sending to the address
4424 * that was returned in the last T_UDERROR_IND.
4425 * If sending to some other address discard the delayed
4426 * error indication.
4427 */
4428 if (sti->sti_delayed_error) {
4429 struct T_uderror_ind *tudi;
4430 void *addr;
4431 t_uscalar_t addrlen;
4432 boolean_t match = B_FALSE;
4433
4434 ASSERT(sti->sti_eaddr_mp);
4435 error = sti->sti_delayed_error;
4436 sti->sti_delayed_error = 0;
4437 tudi =
4438 (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4439 addrlen = tudi->DEST_length;
4440 addr = sogetoff(sti->sti_eaddr_mp,
4441 tudi->DEST_offset, addrlen, 1);
4442 ASSERT(addr); /* Checked by strsock_proto */
4443 switch (so->so_family) {
4444 case AF_INET: {
4445 /* Compare just IP address and port */
4446 sin_t *sin1 = (sin_t *)name;
4447 sin_t *sin2 = (sin_t *)addr;
4448
4449 if (addrlen == sizeof (sin_t) &&
4450 namelen == addrlen &&
4451 sin1->sin_port == sin2->sin_port &&
4452 sin1->sin_addr.s_addr ==
4453 sin2->sin_addr.s_addr)
4454 match = B_TRUE;
4455 break;
4456 }
4457 case AF_INET6: {
4458 /* Compare just IP address and port. Not flow */
4459 sin6_t *sin1 = (sin6_t *)name;
4460 sin6_t *sin2 = (sin6_t *)addr;
4461
4462 if (addrlen == sizeof (sin6_t) &&
4463 namelen == addrlen &&
4464 sin1->sin6_port == sin2->sin6_port &&
4465 IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4466 &sin2->sin6_addr))
4467 match = B_TRUE;
4468 break;
4469 }
4470 case AF_UNIX:
4471 default:
4472 if (namelen == addrlen &&
4473 bcmp(name, addr, namelen) == 0)
4474 match = B_TRUE;
4475 }
4476 if (match) {
4477 freemsg(sti->sti_eaddr_mp);
4478 sti->sti_eaddr_mp = NULL;
4479 mutex_exit(&so->so_lock);
4480 #ifdef DEBUG
4481 dprintso(so, 0,
4482 ("sockfs delayed error %d for %s\n",
4483 error,
4484 pr_addr(so->so_family, name, namelen)));
4485 #endif /* DEBUG */
4486 return (error);
4487 }
4488 freemsg(sti->sti_eaddr_mp);
4489 sti->sti_eaddr_mp = NULL;
4490 }
4491 }
4492 mutex_exit(&so->so_lock);
4493
4494 dontroute = 0;
4495 if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4496 uint32_t val;
4497
4498 val = 1;
4499 error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4500 &val, (t_uscalar_t)sizeof (val), cr);
4501 if (error)
4502 return (error);
4503 dontroute = 1;
4504 }
4505
4506 if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4507 error = EOPNOTSUPP;
4508 goto done;
4509 }
4510 if (msg->msg_controllen != 0) {
4511 if (!(so_mode & SM_CONNREQUIRED)) {
4512 so_update_attrs(so, SOMOD);
4513 error = sosend_dgramcmsg(so, name, namelen, uiop,
4514 msg->msg_control, msg->msg_controllen, flags);
4515 } else {
4516 if (flags & MSG_OOB) {
4517 /* Can't generate T_EXDATA_REQ with options */
4518 error = EOPNOTSUPP;
4519 goto done;
4520 }
4521 so_update_attrs(so, SOMOD);
4522 error = sosend_svccmsg(so, uiop,
4523 !(flags & MSG_EOR),
4524 msg->msg_control, msg->msg_controllen,
4525 flags);
4526 }
4527 goto done;
4528 }
4529
4530 so_update_attrs(so, SOMOD);
4531 if (!(so_mode & SM_CONNREQUIRED)) {
4532 /*
4533 * If there is no SO_DONTROUTE to turn off return immediately
4534 * from send_dgram. This can allow tail-call optimizations.
4535 */
4536 if (!dontroute) {
4537 return (sosend_dgram(so, name, namelen, uiop, flags));
4538 }
4539 error = sosend_dgram(so, name, namelen, uiop, flags);
4540 } else {
4541 t_scalar_t prim;
4542 int sflag;
4543
4544 /* Ignore msg_name in the connected state */
4545 if (flags & MSG_OOB) {
4546 prim = T_EXDATA_REQ;
4547 /*
4548 * Send down T_EXDATA_REQ even if there is flow
4549 * control for data.
4550 */
4551 sflag = MSG_IGNFLOW;
4552 } else {
4553 if (so_mode & SM_BYTESTREAM) {
4554 /* Byte stream transport - use write */
4555 dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4556
4557 /* Send M_DATA messages */
4558 if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4559 (error = nl7c_data(so, uiop)) >= 0) {
4560 /* NL7C consumed the data */
4561 return (error);
4562 }
4563 /*
4564 * If there is no SO_DONTROUTE to turn off,
4565 * sti_direct is on, and there is no flow
4566 * control, we can take the fast path.
4567 */
4568 if (!dontroute && sti->sti_direct != 0 &&
4569 canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4570 return (sostream_direct(so, uiop,
4571 NULL, cr));
4572 }
4573 error = strwrite(SOTOV(so), uiop, cr);
4574 goto done;
4575 }
4576 prim = T_DATA_REQ;
4577 sflag = 0;
4578 }
4579 /*
4580 * If there is no SO_DONTROUTE to turn off return immediately
4581 * from sosend_svc. This can allow tail-call optimizations.
4582 */
4583 if (!dontroute)
4584 return (sosend_svc(so, uiop, prim,
4585 !(flags & MSG_EOR), sflag));
4586 error = sosend_svc(so, uiop, prim,
4587 !(flags & MSG_EOR), sflag);
4588 }
4589 ASSERT(dontroute);
4590 done:
4591 if (dontroute) {
4592 uint32_t val;
4593
4594 val = 0;
4595 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4596 &val, (t_uscalar_t)sizeof (val), cr);
4597 }
4598 return (error);
4599 }
4600
4601 /*
4602 * kstrwritemp() has very similar semantics as that of strwrite().
4603 * The main difference is it obtains mblks from the caller and also
4604 * does not do any copy as done in strwrite() from user buffers to
4605 * kernel buffers.
4606 *
4607 * Currently, this routine is used by sendfile to send data allocated
4608 * within the kernel without any copying. This interface does not use the
4609 * synchronous stream interface as synch. stream interface implies
4610 * copying.
4611 */
4612 int
4613 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4614 {
4615 struct stdata *stp;
4616 struct queue *wqp;
4617 mblk_t *newmp;
4618 char waitflag;
4619 int tempmode;
4620 int error = 0;
4621 int done = 0;
4622 struct sonode *so;
4623 boolean_t direct;
4624
4625 ASSERT(vp->v_stream);
4626 stp = vp->v_stream;
4627
4628 so = VTOSO(vp);
4629 direct = _SOTOTPI(so)->sti_direct;
4630
4631 /*
4632 * This is the sockfs direct fast path. canputnext() need
4633 * not be accurate so we don't grab the sd_lock here. If
4634 * we get flow-controlled, we grab sd_lock just before the
4635 * do..while loop below to emulate what strwrite() does.
4636 */
4637 wqp = stp->sd_wrq;
4638 if (canputnext(wqp) && direct &&
4639 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4640 return (sostream_direct(so, NULL, mp, CRED()));
4641 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4642 /* Fast check of flags before acquiring the lock */
4643 mutex_enter(&stp->sd_lock);
4644 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4645 mutex_exit(&stp->sd_lock);
4646 if (error != 0) {
4647 if (!(stp->sd_flag & STPLEX) &&
4648 (stp->sd_wput_opt & SW_SIGPIPE)) {
4649 error = EPIPE;
4650 }
4651 return (error);
4652 }
4653 }
4654
4655 waitflag = WRITEWAIT;
4656 if (stp->sd_flag & OLDNDELAY)
4657 tempmode = fmode & ~FNDELAY;
4658 else
4659 tempmode = fmode;
4660
4661 mutex_enter(&stp->sd_lock);
4662 do {
4663 if (canputnext(wqp)) {
4664 mutex_exit(&stp->sd_lock);
4665 if (stp->sd_wputdatafunc != NULL) {
4666 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4667 NULL, NULL, NULL);
4668 if (newmp == NULL) {
4669 /* The caller will free mp */
4670 return (ECOMM);
4671 }
4672 mp = newmp;
4673 }
4674 putnext(wqp, mp);
4675 return (0);
4676 }
4677 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4678 &done);
4679 } while (error == 0 && !done);
4680
4681 mutex_exit(&stp->sd_lock);
4682 /*
4683 * EAGAIN tells the application to try again. ENOMEM
4684 * is returned only if the memory allocation size
4685 * exceeds the physical limits of the system. ENOMEM
4686 * can't be true here.
4687 */
4688 if (error == ENOMEM)
4689 error = EAGAIN;
4690 return (error);
4691 }
4692
4693 /* ARGSUSED */
4694 static int
4695 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4696 struct cred *cr, mblk_t **mpp)
4697 {
4698 int error;
4699
4700 switch (so->so_family) {
4701 case AF_INET:
4702 case AF_INET6:
4703 case AF_UNIX:
4704 break;
4705 default:
4706 return (EAFNOSUPPORT);
4707
4708 }
4709
4710 if (so->so_state & SS_CANTSENDMORE)
4711 return (EPIPE);
4712
4713 if (so->so_type != SOCK_STREAM)
4714 return (EOPNOTSUPP);
4715
4716 if ((so->so_state & SS_ISCONNECTED) == 0)
4717 return (ENOTCONN);
4718
4719 error = kstrwritemp(so->so_vnode, *mpp, fflag);
4720 if (error == 0)
4721 *mpp = NULL;
4722 return (error);
4723 }
4724
4725 /*
4726 * Sending data on a datagram socket.
4727 * Assumes caller has verified that SS_ISBOUND etc. are set.
4728 */
4729 /* ARGSUSED */
4730 static int
4731 sodgram_direct(struct sonode *so, struct sockaddr *name,
4732 socklen_t namelen, struct uio *uiop, int flags)
4733 {
4734 struct T_unitdata_req tudr;
4735 mblk_t *mp = NULL;
4736 int error = 0;
4737 void *addr;
4738 socklen_t addrlen;
4739 ssize_t len;
4740 struct stdata *stp = SOTOV(so)->v_stream;
4741 int so_state;
4742 queue_t *udp_wq;
4743 boolean_t connected;
4744 mblk_t *mpdata = NULL;
4745 sotpi_info_t *sti = SOTOTPI(so);
4746 uint32_t auditing = AU_AUDITING();
4747
4748 ASSERT(name != NULL && namelen != 0);
4749 ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4750 ASSERT(!(so->so_mode & SM_EXDATA));
4751 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4752 ASSERT(SOTOV(so)->v_type == VSOCK);
4753
4754 /* Caller checked for proper length */
4755 len = uiop->uio_resid;
4756 ASSERT(len <= sti->sti_tidu_size);
4757
4758 /* Length and family checks have been done by caller */
4759 ASSERT(name->sa_family == so->so_family);
4760 ASSERT(so->so_family == AF_INET ||
4761 (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4762 ASSERT(so->so_family == AF_INET6 ||
4763 (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4764
4765 addr = name;
4766 addrlen = namelen;
4767
4768 if (stp->sd_sidp != NULL &&
4769 (error = straccess(stp, JCWRITE)) != 0)
4770 goto done;
4771
4772 so_state = so->so_state;
4773
4774 connected = so_state & SS_ISCONNECTED;
4775 if (!connected) {
4776 tudr.PRIM_type = T_UNITDATA_REQ;
4777 tudr.DEST_length = addrlen;
4778 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4779 tudr.OPT_length = 0;
4780 tudr.OPT_offset = 0;
4781
4782 mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4783 _ALLOC_INTR, CRED());
4784 if (mp == NULL) {
4785 /*
4786 * Caught a signal waiting for memory.
4787 * Let send* return EINTR.
4788 */
4789 error = EINTR;
4790 goto done;
4791 }
4792 }
4793
4794 /*
4795 * For UDP we don't break up the copyin into smaller pieces
4796 * as in the TCP case. That means if ENOMEM is returned by
4797 * mcopyinuio() then the uio vector has not been modified at
4798 * all and we fallback to either strwrite() or kstrputmsg()
4799 * below. Note also that we never generate priority messages
4800 * from here.
4801 */
4802 udp_wq = stp->sd_wrq->q_next;
4803 if (canput(udp_wq) &&
4804 (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4805 ASSERT(DB_TYPE(mpdata) == M_DATA);
4806 ASSERT(uiop->uio_resid == 0);
4807 if (!connected)
4808 linkb(mp, mpdata);
4809 else
4810 mp = mpdata;
4811 if (auditing)
4812 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4813
4814 udp_wput(udp_wq, mp);
4815 return (0);
4816 }
4817
4818 ASSERT(mpdata == NULL);
4819 if (error != 0 && error != ENOMEM) {
4820 freemsg(mp);
4821 return (error);
4822 }
4823
4824 /*
4825 * For connected, let strwrite() handle the blocking case.
4826 * Otherwise we fall thru and use kstrputmsg().
4827 */
4828 if (connected)
4829 return (strwrite(SOTOV(so), uiop, CRED()));
4830
4831 if (auditing)
4832 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4833
4834 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4835 done:
4836 #ifdef SOCK_DEBUG
4837 if (error != 0) {
4838 eprintsoline(so, error);
4839 }
4840 #endif /* SOCK_DEBUG */
4841 return (error);
4842 }
4843
4844 int
4845 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4846 {
4847 struct stdata *stp = SOTOV(so)->v_stream;
4848 ssize_t iosize, rmax, maxblk;
4849 queue_t *tcp_wq = stp->sd_wrq->q_next;
4850 mblk_t *newmp;
4851 int error = 0, wflag = 0;
4852
4853 ASSERT(so->so_mode & SM_BYTESTREAM);
4854 ASSERT(SOTOV(so)->v_type == VSOCK);
4855
4856 if (stp->sd_sidp != NULL &&
4857 (error = straccess(stp, JCWRITE)) != 0)
4858 return (error);
4859
4860 if (uiop == NULL) {
4861 /*
4862 * kstrwritemp() should have checked sd_flag and
4863 * flow-control before coming here. If we end up
4864 * here it means that we can simply pass down the
4865 * data to tcp.
4866 */
4867 ASSERT(mp != NULL);
4868 if (stp->sd_wputdatafunc != NULL) {
4869 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4870 NULL, NULL, NULL);
4871 if (newmp == NULL) {
4872 /* The caller will free mp */
4873 return (ECOMM);
4874 }
4875 mp = newmp;
4876 }
4877 tcp_wput(tcp_wq, mp);
4878 return (0);
4879 }
4880
4881 /* Fallback to strwrite() to do proper error handling */
4882 if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4883 return (strwrite(SOTOV(so), uiop, cr));
4884
4885 rmax = stp->sd_qn_maxpsz;
4886 ASSERT(rmax >= 0 || rmax == INFPSZ);
4887 if (rmax == 0 || uiop->uio_resid <= 0)
4888 return (0);
4889
4890 if (rmax == INFPSZ)
4891 rmax = uiop->uio_resid;
4892
4893 maxblk = stp->sd_maxblk;
4894
4895 for (;;) {
4896 iosize = MIN(uiop->uio_resid, rmax);
4897
4898 mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4899 if (mp == NULL) {
4900 /*
4901 * Fallback to strwrite() for ENOMEM; if this
4902 * is our first time in this routine and the uio
4903 * vector has not been modified, we will end up
4904 * calling strwrite() without any flag set.
4905 */
4906 if (error == ENOMEM)
4907 goto slow_send;
4908 else
4909 return (error);
4910 }
4911 ASSERT(uiop->uio_resid >= 0);
4912 /*
4913 * If mp is non-NULL and ENOMEM is set, it means that
4914 * mcopyinuio() was able to break down some of the user
4915 * data into one or more mblks. Send the partial data
4916 * to tcp and let the rest be handled in strwrite().
4917 */
4918 ASSERT(error == 0 || error == ENOMEM);
4919 if (stp->sd_wputdatafunc != NULL) {
4920 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4921 NULL, NULL, NULL);
4922 if (newmp == NULL) {
4923 /* The caller will free mp */
4924 return (ECOMM);
4925 }
4926 mp = newmp;
4927 }
4928 tcp_wput(tcp_wq, mp);
4929
4930 wflag |= NOINTR;
4931
4932 if (uiop->uio_resid == 0) { /* No more data; we're done */
4933 ASSERT(error == 0);
4934 break;
4935 } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4936 (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4937 slow_send:
4938 /*
4939 * We were able to send down partial data using
4940 * the direct call interface, but are now relying
4941 * on strwrite() to handle the non-fastpath cases.
4942 * If the socket is blocking we will sleep in
4943 * strwaitq() until write is permitted, otherwise,
4944 * we will need to return the amount of bytes
4945 * written so far back to the app. This is the
4946 * reason why we pass NOINTR flag to strwrite()
4947 * for non-blocking socket, because we don't want
4948 * to return EAGAIN when portion of the user data
4949 * has actually been sent down.
4950 */
4951 return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4952 }
4953 }
4954 return (0);
4955 }
4956
4957 /*
4958 * Update sti_faddr by asking the transport (unless AF_UNIX).
4959 */
4960 /* ARGSUSED */
4961 int
4962 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4963 boolean_t accept, struct cred *cr)
4964 {
4965 struct strbuf strbuf;
4966 int error = 0, res;
4967 void *addr;
4968 t_uscalar_t addrlen;
4969 k_sigset_t smask;
4970 sotpi_info_t *sti = SOTOTPI(so);
4971
4972 dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4973 (void *)so, pr_state(so->so_state, so->so_mode)));
4974
4975 ASSERT(*namelen > 0);
4976 mutex_enter(&so->so_lock);
4977 so_lock_single(so); /* Set SOLOCKED */
4978
4979 if (accept) {
4980 bcopy(sti->sti_faddr_sa, name,
4981 MIN(*namelen, sti->sti_faddr_len));
4982 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4983 goto done;
4984 }
4985
4986 if (!(so->so_state & SS_ISCONNECTED)) {
4987 error = ENOTCONN;
4988 goto done;
4989 }
4990 /* Added this check for X/Open */
4991 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4992 error = EINVAL;
4993 if (xnet_check_print) {
4994 printf("sockfs: X/Open getpeername check => EINVAL\n");
4995 }
4996 goto done;
4997 }
4998
4999 if (sti->sti_faddr_valid) {
5000 bcopy(sti->sti_faddr_sa, name,
5001 MIN(*namelen, sti->sti_faddr_len));
5002 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
5003 goto done;
5004 }
5005
5006 #ifdef DEBUG
5007 dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
5008 pr_addr(so->so_family, sti->sti_faddr_sa,
5009 (t_uscalar_t)sti->sti_faddr_len)));
5010 #endif /* DEBUG */
5011
5012 if (so->so_family == AF_UNIX) {
5013 /* Transport has different name space - return local info */
5014 if (sti->sti_faddr_noxlate)
5015 *namelen = 0;
5016 error = 0;
5017 goto done;
5018 }
5019
5020 ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
5021
5022 ASSERT(sti->sti_faddr_sa);
5023 /* Allocate local buffer to use with ioctl */
5024 addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
5025 mutex_exit(&so->so_lock);
5026 addr = kmem_alloc(addrlen, KM_SLEEP);
5027
5028 /*
5029 * Issue TI_GETPEERNAME with signals masked.
5030 * Put the result in sti_faddr_sa so that getpeername works after
5031 * a shutdown(output).
5032 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5033 * back to the socket.
5034 */
5035 strbuf.buf = addr;
5036 strbuf.maxlen = addrlen;
5037 strbuf.len = 0;
5038
5039 sigintr(&smask, 0);
5040 res = 0;
5041 ASSERT(cr);
5042 error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
5043 0, K_TO_K, cr, &res);
5044 sigunintr(&smask);
5045
5046 mutex_enter(&so->so_lock);
5047 /*
5048 * If there is an error record the error in so_error put don't fail
5049 * the getpeername. Instead fallback on the recorded
5050 * sti->sti_faddr_sa.
5051 */
5052 if (error) {
5053 /*
5054 * Various stream head errors can be returned to the ioctl.
5055 * However, it is impossible to determine which ones of
5056 * these are really socket level errors that were incorrectly
5057 * consumed by the ioctl. Thus this code silently ignores the
5058 * error - to code explicitly does not reinstate the error
5059 * using soseterror().
5060 * Experiments have shows that at least this set of
5061 * errors are reported and should not be reinstated on the
5062 * socket:
5063 * EINVAL E.g. if an I_LINK was in effect when
5064 * getpeername was called.
5065 * EPIPE The ioctl error semantics prefer the write
5066 * side error over the read side error.
5067 * ENOTCONN The transport just got disconnected but
5068 * sockfs had not yet seen the T_DISCON_IND
5069 * when issuing the ioctl.
5070 */
5071 error = 0;
5072 } else if (res == 0 && strbuf.len > 0 &&
5073 (so->so_state & SS_ISCONNECTED)) {
5074 ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
5075 sti->sti_faddr_len = (socklen_t)strbuf.len;
5076 bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
5077 sti->sti_faddr_valid = 1;
5078
5079 bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
5080 *namelen = sti->sti_faddr_len;
5081 }
5082 kmem_free(addr, addrlen);
5083 #ifdef DEBUG
5084 dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
5085 pr_addr(so->so_family, sti->sti_faddr_sa,
5086 (t_uscalar_t)sti->sti_faddr_len)));
5087 #endif /* DEBUG */
5088 done:
5089 so_unlock_single(so, SOLOCKED);
5090 mutex_exit(&so->so_lock);
5091 return (error);
5092 }
5093
5094 /*
5095 * Update sti_laddr by asking the transport (unless AF_UNIX).
5096 */
5097 int
5098 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5099 struct cred *cr)
5100 {
5101 struct strbuf strbuf;
5102 int error = 0, res;
5103 void *addr;
5104 t_uscalar_t addrlen;
5105 k_sigset_t smask;
5106 sotpi_info_t *sti = SOTOTPI(so);
5107
5108 dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5109 (void *)so, pr_state(so->so_state, so->so_mode)));
5110
5111 ASSERT(*namelen > 0);
5112 mutex_enter(&so->so_lock);
5113 so_lock_single(so); /* Set SOLOCKED */
5114
5115 #ifdef DEBUG
5116
5117 dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5118 pr_addr(so->so_family, sti->sti_laddr_sa,
5119 (t_uscalar_t)sti->sti_laddr_len)));
5120 #endif /* DEBUG */
5121 if (sti->sti_laddr_valid) {
5122 bcopy(sti->sti_laddr_sa, name,
5123 MIN(*namelen, sti->sti_laddr_len));
5124 *namelen = sti->sti_laddr_len;
5125 goto done;
5126 }
5127
5128 if (so->so_family == AF_UNIX) {
5129 /*
5130 * Transport has different name space - return local info. If we
5131 * have enough space, let consumers know the family.
5132 */
5133 if (*namelen >= sizeof (sa_family_t)) {
5134 name->sa_family = AF_UNIX;
5135 *namelen = sizeof (sa_family_t);
5136 } else {
5137 *namelen = 0;
5138 }
5139 error = 0;
5140 goto done;
5141 }
5142 if (!(so->so_state & SS_ISBOUND)) {
5143 /* If not bound, then nothing to return. */
5144 error = 0;
5145 goto done;
5146 }
5147
5148 /* Allocate local buffer to use with ioctl */
5149 addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5150 mutex_exit(&so->so_lock);
5151 addr = kmem_alloc(addrlen, KM_SLEEP);
5152
5153 /*
5154 * Issue TI_GETMYNAME with signals masked.
5155 * Put the result in sti_laddr_sa so that getsockname works after
5156 * a shutdown(output).
5157 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5158 * back to the socket.
5159 */
5160 strbuf.buf = addr;
5161 strbuf.maxlen = addrlen;
5162 strbuf.len = 0;
5163
5164 sigintr(&smask, 0);
5165 res = 0;
5166 ASSERT(cr);
5167 error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5168 0, K_TO_K, cr, &res);
5169 sigunintr(&smask);
5170
5171 mutex_enter(&so->so_lock);
5172 /*
5173 * If there is an error record the error in so_error put don't fail
5174 * the getsockname. Instead fallback on the recorded
5175 * sti->sti_laddr_sa.
5176 */
5177 if (error) {
5178 /*
5179 * Various stream head errors can be returned to the ioctl.
5180 * However, it is impossible to determine which ones of
5181 * these are really socket level errors that were incorrectly
5182 * consumed by the ioctl. Thus this code silently ignores the
5183 * error - to code explicitly does not reinstate the error
5184 * using soseterror().
5185 * Experiments have shows that at least this set of
5186 * errors are reported and should not be reinstated on the
5187 * socket:
5188 * EINVAL E.g. if an I_LINK was in effect when
5189 * getsockname was called.
5190 * EPIPE The ioctl error semantics prefer the write
5191 * side error over the read side error.
5192 */
5193 error = 0;
5194 } else if (res == 0 && strbuf.len > 0 &&
5195 (so->so_state & SS_ISBOUND)) {
5196 ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5197 sti->sti_laddr_len = (socklen_t)strbuf.len;
5198 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5199 sti->sti_laddr_valid = 1;
5200
5201 bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5202 *namelen = sti->sti_laddr_len;
5203 }
5204 kmem_free(addr, addrlen);
5205 #ifdef DEBUG
5206 dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5207 pr_addr(so->so_family, sti->sti_laddr_sa,
5208 (t_uscalar_t)sti->sti_laddr_len)));
5209 #endif /* DEBUG */
5210 done:
5211 so_unlock_single(so, SOLOCKED);
5212 mutex_exit(&so->so_lock);
5213 return (error);
5214 }
5215
5216 /*
5217 * Get socket options. For SOL_SOCKET options some options are handled
5218 * by the sockfs while others use the value recorded in the sonode as a
5219 * fallback should the T_SVR4_OPTMGMT_REQ fail.
5220 *
 * On return, at most *optlenp bytes are copied to optval.
5222 */
5223 /* ARGSUSED */
5224 int
5225 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5226 void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5227 {
5228 struct T_optmgmt_req optmgmt_req;
5229 struct T_optmgmt_ack *optmgmt_ack;
5230 struct opthdr oh;
5231 struct opthdr *opt_res;
5232 mblk_t *mp = NULL;
5233 int error = 0;
5234 void *option = NULL; /* Set if fallback value */
5235 t_uscalar_t maxlen = *optlenp;
5236 t_uscalar_t len;
5237 uint32_t value;
5238 struct timeval tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5239 struct timeval32 tmo_val32;
5240 struct so_snd_bufinfo snd_bufinfo; /* used for zero copy */
5241
5242 dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5243 (void *)so, level, option_name, optval, (void *)optlenp,
5244 pr_state(so->so_state, so->so_mode)));
5245
5246 mutex_enter(&so->so_lock);
5247 so_lock_single(so); /* Set SOLOCKED */
5248
5249 /*
5250 * Check for SOL_SOCKET options.
5251 * Certain SOL_SOCKET options are returned directly whereas
5252 * others only provide a default (fallback) value should
5253 * the T_SVR4_OPTMGMT_REQ fail.
5254 */
5255 if (level == SOL_SOCKET) {
5256 /* Check parameters */
5257 switch (option_name) {
5258 case SO_TYPE:
5259 case SO_ERROR:
5260 case SO_DEBUG:
5261 case SO_ACCEPTCONN:
5262 case SO_REUSEADDR:
5263 case SO_KEEPALIVE:
5264 case SO_DONTROUTE:
5265 case SO_BROADCAST:
5266 case SO_USELOOPBACK:
5267 case SO_OOBINLINE:
5268 case SO_SNDBUF:
5269 case SO_RCVBUF:
5270 #ifdef notyet
5271 case SO_SNDLOWAT:
5272 case SO_RCVLOWAT:
5273 #endif /* notyet */
5274 case SO_DOMAIN:
5275 case SO_DGRAM_ERRIND:
5276 if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5277 error = EINVAL;
5278 eprintsoline(so, error);
5279 goto done2;
5280 }
5281 break;
5282 case SO_RCVTIMEO:
5283 case SO_SNDTIMEO:
5284 if (get_udatamodel() == DATAMODEL_NONE ||
5285 get_udatamodel() == DATAMODEL_NATIVE) {
5286 if (maxlen < sizeof (struct timeval)) {
5287 error = EINVAL;
5288 eprintsoline(so, error);
5289 goto done2;
5290 }
5291 } else {
5292 if (maxlen < sizeof (struct timeval32)) {
5293 error = EINVAL;
5294 eprintsoline(so, error);
5295 goto done2;
5296 }
5297
5298 }
5299 break;
5300 case SO_LINGER:
5301 if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5302 error = EINVAL;
5303 eprintsoline(so, error);
5304 goto done2;
5305 }
5306 break;
5307 case SO_SND_BUFINFO:
5308 if (maxlen < (t_uscalar_t)
5309 sizeof (struct so_snd_bufinfo)) {
5310 error = EINVAL;
5311 eprintsoline(so, error);
5312 goto done2;
5313 }
5314 break;
5315 }
5316
5317 len = (t_uscalar_t)sizeof (uint32_t); /* Default */
5318
5319 switch (option_name) {
5320 case SO_TYPE:
5321 value = so->so_type;
5322 option = &value;
5323 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5324
5325 case SO_ERROR:
5326 value = sogeterr(so, B_TRUE);
5327 option = &value;
5328 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5329
5330 case SO_ACCEPTCONN:
5331 if (so->so_state & SS_ACCEPTCONN)
5332 value = SO_ACCEPTCONN;
5333 else
5334 value = 0;
5335 #ifdef DEBUG
5336 if (value) {
5337 dprintso(so, 1,
5338 ("sotpi_getsockopt: 0x%x is set\n",
5339 option_name));
5340 } else {
5341 dprintso(so, 1,
5342 ("sotpi_getsockopt: 0x%x not set\n",
5343 option_name));
5344 }
5345 #endif /* DEBUG */
5346 option = &value;
5347 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5348
5349 case SO_DEBUG:
5350 case SO_REUSEADDR:
5351 case SO_KEEPALIVE:
5352 case SO_DONTROUTE:
5353 case SO_BROADCAST:
5354 case SO_USELOOPBACK:
5355 case SO_OOBINLINE:
5356 case SO_DGRAM_ERRIND:
5357 value = (so->so_options & option_name);
5358 #ifdef DEBUG
5359 if (value) {
5360 dprintso(so, 1,
5361 ("sotpi_getsockopt: 0x%x is set\n",
5362 option_name));
5363 } else {
5364 dprintso(so, 1,
5365 ("sotpi_getsockopt: 0x%x not set\n",
5366 option_name));
5367 }
5368 #endif /* DEBUG */
5369 option = &value;
5370 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5371
5372 /*
5373 * The following options are only returned by sockfs when the
5374 * T_SVR4_OPTMGMT_REQ fails.
5375 */
5376 case SO_LINGER:
5377 option = &so->so_linger;
5378 len = (t_uscalar_t)sizeof (struct linger);
5379 break;
5380 case SO_SNDBUF: {
5381 ssize_t lvalue;
5382
5383 /*
5384 * If the option has not been set then get a default
5385 * value from the read queue. This value is
5386 * returned if the transport fails
5387 * the T_SVR4_OPTMGMT_REQ.
5388 */
5389 lvalue = so->so_sndbuf;
5390 if (lvalue == 0) {
5391 mutex_exit(&so->so_lock);
5392 (void) strqget(strvp2wq(SOTOV(so))->q_next,
5393 QHIWAT, 0, &lvalue);
5394 mutex_enter(&so->so_lock);
5395 dprintso(so, 1,
5396 ("got SO_SNDBUF %ld from q\n", lvalue));
5397 }
5398 value = (int)lvalue;
5399 option = &value;
5400 len = (t_uscalar_t)sizeof (so->so_sndbuf);
5401 break;
5402 }
5403 case SO_RCVBUF: {
5404 ssize_t lvalue;
5405
5406 /*
5407 * If the option has not been set then get a default
5408 * value from the read queue. This value is
5409 * returned if the transport fails
5410 * the T_SVR4_OPTMGMT_REQ.
5411 *
5412 * XXX If SO_RCVBUF has been set and this is an
5413 * XPG 4.2 application then do not ask the transport
5414 * since the transport might adjust the value and not
5415 * return exactly what was set by the application.
5416 * For non-XPG 4.2 application we return the value
5417 * that the transport is actually using.
5418 */
5419 lvalue = so->so_rcvbuf;
5420 if (lvalue == 0) {
5421 mutex_exit(&so->so_lock);
5422 (void) strqget(RD(strvp2wq(SOTOV(so))),
5423 QHIWAT, 0, &lvalue);
5424 mutex_enter(&so->so_lock);
5425 dprintso(so, 1,
5426 ("got SO_RCVBUF %ld from q\n", lvalue));
5427 } else if (flags & _SOGETSOCKOPT_XPG4_2) {
5428 value = (int)lvalue;
5429 option = &value;
5430 goto copyout; /* skip asking transport */
5431 }
5432 value = (int)lvalue;
5433 option = &value;
5434 len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5435 break;
5436 }
5437 case SO_DOMAIN:
5438 value = so->so_family;
5439 option = &value;
5440 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5441
5442 #ifdef notyet
5443 /*
5444 * We do not implement the semantics of these options
5445 * thus we shouldn't implement the options either.
5446 */
5447 case SO_SNDLOWAT:
5448 value = so->so_sndlowat;
5449 option = &value;
5450 break;
5451 case SO_RCVLOWAT:
5452 value = so->so_rcvlowat;
5453 option = &value;
5454 break;
5455 #endif /* notyet */
5456 case SO_SNDTIMEO:
5457 case SO_RCVTIMEO: {
5458 clock_t val;
5459
5460 if (option_name == SO_RCVTIMEO)
5461 val = drv_hztousec(so->so_rcvtimeo);
5462 else
5463 val = drv_hztousec(so->so_sndtimeo);
5464 tmo_val.tv_sec = val / (1000 * 1000);
5465 tmo_val.tv_usec = val % (1000 * 1000);
5466 if (get_udatamodel() == DATAMODEL_NONE ||
5467 get_udatamodel() == DATAMODEL_NATIVE) {
5468 option = &tmo_val;
5469 len = sizeof (struct timeval);
5470 } else {
5471 TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5472 option = &tmo_val32;
5473 len = sizeof (struct timeval32);
5474 }
5475 break;
5476 }
5477 case SO_SND_BUFINFO: {
5478 snd_bufinfo.sbi_wroff =
5479 (so->so_proto_props).sopp_wroff;
5480 snd_bufinfo.sbi_maxblk =
5481 (so->so_proto_props).sopp_maxblk;
5482 snd_bufinfo.sbi_maxpsz =
5483 (so->so_proto_props).sopp_maxpsz;
5484 snd_bufinfo.sbi_tail =
5485 (so->so_proto_props).sopp_tail;
5486 option = &snd_bufinfo;
5487 len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5488 break;
5489 }
5490 }
5491 }
5492
5493 mutex_exit(&so->so_lock);
5494
5495 /* Send request */
5496 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5497 optmgmt_req.MGMT_flags = T_CHECK;
5498 optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5499 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5500
5501 oh.level = level;
5502 oh.name = option_name;
5503 oh.len = maxlen;
5504
5505 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5506 &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5507 /* Let option management work in the presence of data flow control */
5508 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5509 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5510 mp = NULL;
5511 mutex_enter(&so->so_lock);
5512 if (error) {
5513 eprintsoline(so, error);
5514 goto done2;
5515 }
5516 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5517 (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5518 if (error) {
5519 if (option != NULL) {
5520 /* We have a fallback value */
5521 error = 0;
5522 goto copyout;
5523 }
5524 eprintsoline(so, error);
5525 goto done2;
5526 }
5527 ASSERT(mp);
5528 optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5529 opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5530 optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5531 if (opt_res == NULL) {
5532 if (option != NULL) {
5533 /* We have a fallback value */
5534 error = 0;
5535 goto copyout;
5536 }
5537 error = EPROTO;
5538 eprintsoline(so, error);
5539 goto done;
5540 }
5541 option = &opt_res[1];
5542
5543 /* check to ensure that the option is within bounds */
5544 if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5545 (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5546 if (option != NULL) {
5547 /* We have a fallback value */
5548 error = 0;
5549 goto copyout;
5550 }
5551 error = EPROTO;
5552 eprintsoline(so, error);
5553 goto done;
5554 }
5555
5556 len = opt_res->len;
5557
5558 copyout: {
5559 t_uscalar_t size = MIN(len, maxlen);
5560 bcopy(option, optval, size);
5561 bcopy(&size, optlenp, sizeof (size));
5562 }
5563 done:
5564 freemsg(mp);
5565 done2:
5566 so_unlock_single(so, SOLOCKED);
5567 mutex_exit(&so->so_lock);
5568
5569 return (error);
5570 }
5571
5572 /*
5573 * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5574 * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5575 * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5576 * setsockopt has to work even if the transport does not support the option.
5577 */
5578 /* ARGSUSED */
5579 int
5580 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5581 const void *optval, t_uscalar_t optlen, struct cred *cr)
5582 {
5583 struct T_optmgmt_req optmgmt_req;
5584 struct opthdr oh;
5585 mblk_t *mp;
5586 int error = 0;
5587 boolean_t handled = B_FALSE;
5588
5589 dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5590 (void *)so, level, option_name, optval, optlen,
5591 pr_state(so->so_state, so->so_mode)));
5592
5593 /* X/Open requires this check */
5594 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5595 if (xnet_check_print)
5596 printf("sockfs: X/Open setsockopt check => EINVAL\n");
5597 return (EINVAL);
5598 }
5599
5600 mutex_enter(&so->so_lock);
5601 so_lock_single(so); /* Set SOLOCKED */
5602 mutex_exit(&so->so_lock);
5603
5604 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5605 optmgmt_req.MGMT_flags = T_NEGOTIATE;
5606 optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5607 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5608
5609 oh.level = level;
5610 oh.name = option_name;
5611 oh.len = optlen;
5612
5613 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5614 &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5615 /* Let option management work in the presence of data flow control */
5616 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5617 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5618 mp = NULL;
5619 mutex_enter(&so->so_lock);
5620 if (error) {
5621 eprintsoline(so, error);
5622 goto done2;
5623 }
5624 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5625 (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5626 if (error) {
5627 eprintsoline(so, error);
5628 goto done;
5629 }
5630 ASSERT(mp);
5631 /* No need to verify T_optmgmt_ack */
5632 freemsg(mp);
5633 done:
5634 /*
5635 * Check for SOL_SOCKET options and record their values.
5636 * If we know about a SOL_SOCKET parameter and the transport
5637 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5638 * EPROTO) we let the setsockopt succeed.
5639 */
5640 if (level == SOL_SOCKET) {
5641 /* Check parameters */
5642 switch (option_name) {
5643 case SO_DEBUG:
5644 case SO_REUSEADDR:
5645 case SO_KEEPALIVE:
5646 case SO_DONTROUTE:
5647 case SO_BROADCAST:
5648 case SO_USELOOPBACK:
5649 case SO_OOBINLINE:
5650 case SO_SNDBUF:
5651 case SO_RCVBUF:
5652 #ifdef notyet
5653 case SO_SNDLOWAT:
5654 case SO_RCVLOWAT:
5655 #endif /* notyet */
5656 case SO_DGRAM_ERRIND:
5657 if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5658 error = EINVAL;
5659 eprintsoline(so, error);
5660 goto done2;
5661 }
5662 ASSERT(optval);
5663 handled = B_TRUE;
5664 break;
5665 case SO_SNDTIMEO:
5666 case SO_RCVTIMEO:
5667 if (get_udatamodel() == DATAMODEL_NONE ||
5668 get_udatamodel() == DATAMODEL_NATIVE) {
5669 if (optlen != sizeof (struct timeval)) {
5670 error = EINVAL;
5671 eprintsoline(so, error);
5672 goto done2;
5673 }
5674 } else {
5675 if (optlen != sizeof (struct timeval32)) {
5676 error = EINVAL;
5677 eprintsoline(so, error);
5678 goto done2;
5679 }
5680 }
5681 ASSERT(optval);
5682 handled = B_TRUE;
5683 break;
5684 case SO_LINGER:
5685 if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5686 error = EINVAL;
5687 eprintsoline(so, error);
5688 goto done2;
5689 }
5690 ASSERT(optval);
5691 handled = B_TRUE;
5692 break;
5693 }
5694
5695 #define intvalue (*(int32_t *)optval)
5696
5697 switch (option_name) {
5698 case SO_TYPE:
5699 case SO_ERROR:
5700 case SO_ACCEPTCONN:
5701 /* Can't be set */
5702 error = ENOPROTOOPT;
5703 goto done2;
5704 case SO_LINGER: {
5705 struct linger *l = (struct linger *)optval;
5706
5707 so->so_linger.l_linger = l->l_linger;
5708 if (l->l_onoff) {
5709 so->so_linger.l_onoff = SO_LINGER;
5710 so->so_options |= SO_LINGER;
5711 } else {
5712 so->so_linger.l_onoff = 0;
5713 so->so_options &= ~SO_LINGER;
5714 }
5715 break;
5716 }
5717
5718 case SO_DEBUG:
5719 #ifdef SOCK_TEST
5720 if (intvalue & 2)
5721 sock_test_timelimit = 10 * hz;
5722 else
5723 sock_test_timelimit = 0;
5724
5725 if (intvalue & 4)
5726 do_useracc = 0;
5727 else
5728 do_useracc = 1;
5729 #endif /* SOCK_TEST */
5730 /* FALLTHRU */
5731 case SO_REUSEADDR:
5732 case SO_KEEPALIVE:
5733 case SO_DONTROUTE:
5734 case SO_BROADCAST:
5735 case SO_USELOOPBACK:
5736 case SO_OOBINLINE:
5737 case SO_DGRAM_ERRIND:
5738 if (intvalue != 0) {
5739 dprintso(so, 1,
5740 ("socket_setsockopt: setting 0x%x\n",
5741 option_name));
5742 so->so_options |= option_name;
5743 } else {
5744 dprintso(so, 1,
5745 ("socket_setsockopt: clearing 0x%x\n",
5746 option_name));
5747 so->so_options &= ~option_name;
5748 }
5749 break;
5750 /*
5751 * The following options are only returned by us when the
5752 * transport layer fails.
5753 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5754 * since the transport might adjust the value and not
5755 * return exactly what was set by the application.
5756 */
5757 case SO_SNDBUF:
5758 so->so_sndbuf = intvalue;
5759 break;
5760 case SO_RCVBUF:
5761 so->so_rcvbuf = intvalue;
5762 break;
5763 case SO_RCVPSH:
5764 so->so_rcv_timer_interval = intvalue;
5765 break;
5766 #ifdef notyet
5767 /*
5768 * We do not implement the semantics of these options
5769 * thus we shouldn't implement the options either.
5770 */
5771 case SO_SNDLOWAT:
5772 so->so_sndlowat = intvalue;
5773 break;
5774 case SO_RCVLOWAT:
5775 so->so_rcvlowat = intvalue;
5776 break;
5777 #endif /* notyet */
5778 case SO_SNDTIMEO:
5779 case SO_RCVTIMEO: {
5780 struct timeval tl;
5781 clock_t val;
5782
5783 if (get_udatamodel() == DATAMODEL_NONE ||
5784 get_udatamodel() == DATAMODEL_NATIVE)
5785 bcopy(&tl, (struct timeval *)optval,
5786 sizeof (struct timeval));
5787 else
5788 TIMEVAL32_TO_TIMEVAL(&tl,
5789 (struct timeval32 *)optval);
5790 val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5791 if (option_name == SO_RCVTIMEO)
5792 so->so_rcvtimeo = drv_usectohz(val);
5793 else
5794 so->so_sndtimeo = drv_usectohz(val);
5795 break;
5796 }
5797 }
5798 #undef intvalue
5799
5800 if (error) {
5801 if ((error == ENOPROTOOPT || error == EPROTO ||
5802 error == EINVAL) && handled) {
5803 dprintso(so, 1,
5804 ("setsockopt: ignoring error %d for 0x%x\n",
5805 error, option_name));
5806 error = 0;
5807 }
5808 }
5809 }
5810 done2:
5811 so_unlock_single(so, SOLOCKED);
5812 mutex_exit(&so->so_lock);
5813 return (error);
5814 }
5815
5816 /*
5817 * sotpi_close() is called when the last open reference goes away.
5818 */
5819 /* ARGSUSED */
5820 int
5821 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5822 {
5823 struct vnode *vp = SOTOV(so);
5824 dev_t dev;
5825 int error = 0;
5826 sotpi_info_t *sti = SOTOTPI(so);
5827
5828 dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5829 (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5830
5831 dev = sti->sti_dev;
5832
5833 ASSERT(STREAMSTAB(getmajor(dev)));
5834
5835 mutex_enter(&so->so_lock);
5836 so_lock_single(so); /* Set SOLOCKED */
5837
5838 ASSERT(so_verify_oobstate(so));
5839
5840 if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5841 sti->sti_nl7c_flags = 0;
5842 nl7c_close(so);
5843 }
5844
5845 if (vp->v_stream != NULL) {
5846 vnode_t *ux_vp;
5847
5848 if (so->so_family == AF_UNIX) {
5849 /* Could avoid this when CANTSENDMORE for !dgram */
5850 so_unix_close(so);
5851 }
5852
5853 mutex_exit(&so->so_lock);
5854 /*
5855 * Disassemble the linkage from the AF_UNIX underlying file
5856 * system vnode to this socket (by atomically clearing
5857 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5858 * and frees the stream head.
5859 */
5860 if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5861 ASSERT(ux_vp->v_stream);
5862 sti->sti_ux_bound_vp = NULL;
5863 vn_rele_stream(ux_vp);
5864 }
5865 error = strclose(vp, flag, cr);
5866 vp->v_stream = NULL;
5867 mutex_enter(&so->so_lock);
5868 }
5869
5870 /*
5871 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5872 */
5873 so_flush_discon_ind(so);
5874
5875 so_unlock_single(so, SOLOCKED);
5876 mutex_exit(&so->so_lock);
5877
5878 /*
5879 * Needed for STREAMs.
5880 * Decrement the device driver's reference count for streams
5881 * opened via the clone dip. The driver was held in clone_open().
5882 * The absence of clone_close() forces this asymmetry.
5883 */
5884 if (so->so_flag & SOCLONE)
5885 ddi_rele_driver(getmajor(dev));
5886
5887 return (error);
5888 }
5889
5890 static int
5891 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5892 struct cred *cr, int32_t *rvalp)
5893 {
5894 struct vnode *vp = SOTOV(so);
5895 sotpi_info_t *sti = SOTOTPI(so);
5896 int error = 0;
5897
5898 dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5899 cmd, arg, pr_state(so->so_state, so->so_mode)));
5900
5901 switch (cmd) {
5902 case SIOCSQPTR:
5903 /*
5904 * SIOCSQPTR is valid only when helper stream is created
5905 * by the protocol.
5906 */
5907 case _I_INSERT:
5908 case _I_REMOVE:
5909 /*
5910 * Since there's no compelling reason to support these ioctls
5911 * on sockets, and doing so would increase the complexity
5912 * markedly, prevent it.
5913 */
5914 return (EOPNOTSUPP);
5915
5916 case I_FIND:
5917 case I_LIST:
5918 case I_LOOK:
5919 case I_POP:
5920 case I_PUSH:
5921 /*
5922 * To prevent races and inconsistencies between the actual
5923 * state of the stream and the state according to the sonode,
5924 * we serialize all operations which modify or operate on the
5925 * list of modules on the socket's stream.
5926 */
5927 mutex_enter(&sti->sti_plumb_lock);
5928 error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5929 mutex_exit(&sti->sti_plumb_lock);
5930 return (error);
5931
5932 default:
5933 if (so->so_version != SOV_STREAM)
5934 break;
5935
5936 /*
5937 * The imaginary "sockmod" has been popped; act as a stream.
5938 */
5939 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5940 }
5941
5942 ASSERT(so->so_version != SOV_STREAM);
5943
5944 /*
5945 * Process socket-specific ioctls.
5946 */
5947 switch (cmd) {
5948 case FIONBIO: {
5949 int32_t value;
5950
5951 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5952 (mode & (int)FKIOCTL)))
5953 return (EFAULT);
5954
5955 mutex_enter(&so->so_lock);
5956 if (value) {
5957 so->so_state |= SS_NDELAY;
5958 } else {
5959 so->so_state &= ~SS_NDELAY;
5960 }
5961 mutex_exit(&so->so_lock);
5962 return (0);
5963 }
5964
5965 case FIOASYNC: {
5966 int32_t value;
5967
5968 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5969 (mode & (int)FKIOCTL)))
5970 return (EFAULT);
5971
5972 mutex_enter(&so->so_lock);
5973 /*
5974 * SS_ASYNC flag not already set correctly?
5975 * (!value != !(so->so_state & SS_ASYNC))
5976 * but some engineers find that too hard to read.
5977 */
5978 if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5979 value != 0 && (so->so_state & SS_ASYNC) == 0)
5980 error = so_flip_async(so, vp, mode, cr);
5981 mutex_exit(&so->so_lock);
5982 return (error);
5983 }
5984
5985 case SIOCSPGRP:
5986 case FIOSETOWN: {
5987 pid_t pgrp;
5988
5989 if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5990 (mode & (int)FKIOCTL)))
5991 return (EFAULT);
5992
5993 mutex_enter(&so->so_lock);
5994 dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5995 /* Any change? */
5996 if (pgrp != so->so_pgrp)
5997 error = so_set_siggrp(so, vp, pgrp, mode, cr);
5998 mutex_exit(&so->so_lock);
5999 return (error);
6000 }
6001 case SIOCGPGRP:
6002 case FIOGETOWN:
6003 if (so_copyout(&so->so_pgrp, (void *)arg,
6004 sizeof (pid_t), (mode & (int)FKIOCTL)))
6005 return (EFAULT);
6006 return (0);
6007
6008 case SIOCATMARK: {
6009 int retval;
6010 uint_t so_state;
6011
6012 /*
6013 * strwaitmark has a finite timeout after which it
6014 * returns -1 if the mark state is undetermined.
6015 * In order to avoid any race between the mark state
6016 * in sockfs and the mark state in the stream head this
6017 * routine loops until the mark state can be determined
6018 * (or the urgent data indication has been removed by some
6019 * other thread).
6020 */
6021 do {
6022 mutex_enter(&so->so_lock);
6023 so_state = so->so_state;
6024 mutex_exit(&so->so_lock);
6025 if (so_state & SS_RCVATMARK) {
6026 retval = 1;
6027 } else if (!(so_state & SS_OOBPEND)) {
6028 /*
6029 * No SIGURG has been generated -- there is no
6030 * pending or present urgent data. Thus can't
6031 * possibly be at the mark.
6032 */
6033 retval = 0;
6034 } else {
6035 /*
6036 * Have the stream head wait until there is
6037 * either some messages on the read queue, or
6038 * STRATMARK or STRNOTATMARK gets set. The
6039 * STRNOTATMARK flag is used so that the
6040 * transport can send up a MSGNOTMARKNEXT
6041 * M_DATA to indicate that it is not
6042 * at the mark and additional data is not about
6043 * to be send upstream.
6044 *
6045 * If the mark state is undetermined this will
6046 * return -1 and we will loop rechecking the
6047 * socket state.
6048 */
6049 retval = strwaitmark(vp);
6050 }
6051 } while (retval == -1);
6052
6053 if (so_copyout(&retval, (void *)arg, sizeof (int),
6054 (mode & (int)FKIOCTL)))
6055 return (EFAULT);
6056 return (0);
6057 }
6058
6059 case I_FDINSERT:
6060 case I_SENDFD:
6061 case I_RECVFD:
6062 case I_ATMARK:
6063 case _SIOCSOCKFALLBACK:
6064 /*
6065 * These ioctls do not apply to sockets. I_FDINSERT can be
6066 * used to send M_PROTO messages without modifying the socket
6067 * state. I_SENDFD/RECVFD should not be used for socket file
6068 * descriptor passing since they assume a twisted stream.
6069 * SIOCATMARK must be used instead of I_ATMARK.
6070 *
6071 * _SIOCSOCKFALLBACK from an application should never be
6072 * processed. It is only generated by socktpi_open() or
6073 * in response to I_POP or I_PUSH.
6074 */
6075 #ifdef DEBUG
6076 zcmn_err(getzoneid(), CE_WARN,
6077 "Unsupported STREAMS ioctl 0x%x on socket. "
6078 "Pid = %d\n", cmd, curproc->p_pid);
6079 #endif /* DEBUG */
6080 return (EOPNOTSUPP);
6081
6082 case _I_GETPEERCRED:
6083 if ((mode & FKIOCTL) == 0)
6084 return (EINVAL);
6085
6086 mutex_enter(&so->so_lock);
6087 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6088 error = ENOTSUP;
6089 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
6090 error = ENOTCONN;
6091 } else if (so->so_peercred != NULL) {
6092 k_peercred_t *kp = (k_peercred_t *)arg;
6093 kp->pc_cr = so->so_peercred;
6094 kp->pc_cpid = so->so_cpid;
6095 crhold(so->so_peercred);
6096 } else {
6097 error = EINVAL;
6098 }
6099 mutex_exit(&so->so_lock);
6100 return (error);
6101
6102 default:
6103 /*
6104 * Do the higher-order bits of the ioctl cmd indicate
6105 * that it is an I_* streams ioctl?
6106 */
6107 if ((cmd & 0xffffff00U) == STR &&
6108 so->so_version == SOV_SOCKBSD) {
6109 #ifdef DEBUG
6110 zcmn_err(getzoneid(), CE_WARN,
6111 "Unsupported STREAMS ioctl 0x%x on socket. "
6112 "Pid = %d\n", cmd, curproc->p_pid);
6113 #endif /* DEBUG */
6114 return (EOPNOTSUPP);
6115 }
6116 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6117 }
6118 }
6119
6120 /*
6121 * Handle plumbing-related ioctls.
6122 */
6123 static int
6124 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6125 struct cred *cr, int32_t *rvalp)
6126 {
6127 static const char sockmod_name[] = "sockmod";
6128 struct sonode *so = VTOSO(vp);
6129 char mname[FMNAMESZ + 1];
6130 int error;
6131 sotpi_info_t *sti = SOTOTPI(so);
6132
6133 ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6134
6135 if (so->so_version == SOV_SOCKBSD)
6136 return (EOPNOTSUPP);
6137
6138 if (so->so_version == SOV_STREAM) {
6139 /*
6140 * The imaginary "sockmod" has been popped - act as a stream.
6141 * If this is a push of sockmod then change back to a socket.
6142 */
6143 if (cmd == I_PUSH) {
6144 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6145 (void *)arg, mname, sizeof (mname), NULL);
6146
6147 if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6148 dprintso(so, 0, ("socktpi_ioctl: going to "
6149 "socket version\n"));
6150 so_stream2sock(so);
6151 return (0);
6152 }
6153 }
6154 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6155 }
6156
6157 switch (cmd) {
6158 case I_PUSH:
6159 if (sti->sti_direct) {
6160 mutex_enter(&so->so_lock);
6161 so_lock_single(so);
6162 mutex_exit(&so->so_lock);
6163
6164 error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6165 cr, rvalp);
6166
6167 mutex_enter(&so->so_lock);
6168 if (error == 0)
6169 sti->sti_direct = 0;
6170 so_unlock_single(so, SOLOCKED);
6171 mutex_exit(&so->so_lock);
6172
6173 if (error != 0)
6174 return (error);
6175 }
6176
6177 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6178 if (error == 0)
6179 sti->sti_pushcnt++;
6180 return (error);
6181
6182 case I_POP:
6183 if (sti->sti_pushcnt == 0) {
6184 /* Emulate sockmod being popped */
6185 dprintso(so, 0,
6186 ("socktpi_ioctl: going to STREAMS version\n"));
6187 return (so_sock2stream(so));
6188 }
6189
6190 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6191 if (error == 0)
6192 sti->sti_pushcnt--;
6193 return (error);
6194
6195 case I_LIST: {
6196 struct str_mlist *kmlistp, *umlistp;
6197 struct str_list kstrlist;
6198 ssize_t kstrlistsize;
6199 int i, nmods;
6200
6201 STRUCT_DECL(str_list, ustrlist);
6202 STRUCT_INIT(ustrlist, mode);
6203
6204 if (arg == 0) {
6205 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6206 if (error == 0)
6207 (*rvalp)++; /* Add one for sockmod */
6208 return (error);
6209 }
6210
6211 error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6212 STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6213 if (error != 0)
6214 return (error);
6215
6216 nmods = STRUCT_FGET(ustrlist, sl_nmods);
6217 if (nmods <= 0)
6218 return (EINVAL);
6219 /*
6220 * Ceiling nmods at nstrpush to prevent someone from
6221 * maliciously consuming lots of kernel memory.
6222 */
6223 nmods = MIN(nmods, nstrpush);
6224
6225 kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6226 kstrlist.sl_nmods = nmods;
6227 kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6228
6229 error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6230 cr, rvalp);
6231 if (error != 0)
6232 goto done;
6233
6234 /*
6235 * Considering the module list as a 0-based array of sl_nmods
6236 * modules, sockmod should conceptually exist at slot
6237 * sti_pushcnt. Insert sockmod at this location by sliding all
6238 * of the module names after so_pushcnt over by one. We know
6239 * that there will be room to do this since we allocated
6240 * sl_modlist with an additional slot.
6241 */
6242 for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6243 kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6244
6245 (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6246 kstrlist.sl_nmods++;
6247
6248 /*
6249 * Copy all of the entries out to ustrlist.
6250 */
6251 kmlistp = kstrlist.sl_modlist;
6252 umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6253 for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6254 error = so_copyout(kmlistp++, umlistp++,
6255 sizeof (struct str_mlist), mode & FKIOCTL);
6256 if (error != 0)
6257 goto done;
6258 }
6259
6260 error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6261 mode & FKIOCTL);
6262 if (error == 0)
6263 *rvalp = 0;
6264 done:
6265 kmem_free(kstrlist.sl_modlist, kstrlistsize);
6266 return (error);
6267 }
6268 case I_LOOK:
6269 if (sti->sti_pushcnt == 0) {
6270 return (so_copyout(sockmod_name, (void *)arg,
6271 sizeof (sockmod_name), mode & FKIOCTL));
6272 }
6273 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6274
6275 case I_FIND:
6276 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6277 if (error && error != EINVAL)
6278 return (error);
6279
6280 /* if not found and string was sockmod return 1 */
6281 if (*rvalp == 0 || error == EINVAL) {
6282 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6283 (void *)arg, mname, sizeof (mname), NULL);
6284 if (error == ENAMETOOLONG)
6285 error = EINVAL;
6286
6287 if (error == 0 && strcmp(mname, sockmod_name) == 0)
6288 *rvalp = 1;
6289 }
6290 return (error);
6291
6292 default:
6293 panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6294 break;
6295 }
6296
6297 return (0);
6298 }
6299
6300 /*
6301 * Wrapper around the streams poll routine that implements socket poll
6302 * semantics.
6303 * The sockfs never calls pollwakeup itself - the stream head take care
6304 * of all pollwakeups. Since sockfs never holds so_lock when calling the
6305 * stream head there can never be a deadlock due to holding so_lock across
6306 * pollwakeup and acquiring so_lock in this routine.
6307 *
6308 * However, since the performance of VOP_POLL is critical we avoid
6309 * acquiring so_lock here. This is based on two assumptions:
6310 * - The poll implementation holds locks to serialize the VOP_POLL call
6311 * and a pollwakeup for the same pollhead. This ensures that should
6312 * e.g. so_state change during a socktpi_poll call the pollwakeup
6313 * (which strsock_* and strrput conspire to issue) is issued after
6314 * the state change. Thus the pollwakeup will block until VOP_POLL has
6315 * returned and then wake up poll and have it call VOP_POLL again.
6316 * - The reading of so_state without holding so_lock does not result in
6317 * stale data that is older than the latest state change that has dropped
6318 * so_lock. This is ensured by the mutex_exit issuing the appropriate
6319 * memory barrier to force the data into the coherency domain.
6320 */
6321 static int
6322 sotpi_poll(
6323 struct sonode *so,
6324 short events,
6325 int anyyet,
6326 short *reventsp,
6327 struct pollhead **phpp)
6328 {
6329 short origevents = events;
6330 struct vnode *vp = SOTOV(so);
6331 int error;
6332 int so_state = so->so_state; /* snapshot */
6333 sotpi_info_t *sti = SOTOTPI(so);
6334
6335 dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6336 (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6337
6338 ASSERT(vp->v_type == VSOCK);
6339 ASSERT(vp->v_stream != NULL);
6340
6341 if (so->so_version == SOV_STREAM) {
6342 /* The imaginary "sockmod" has been popped - act as a stream */
6343 return (strpoll(vp->v_stream, events, anyyet,
6344 reventsp, phpp));
6345 }
6346
6347 if (!(so_state & SS_ISCONNECTED) &&
6348 (so->so_mode & SM_CONNREQUIRED)) {
6349 /* Not connected yet - turn off write side events */
6350 events &= ~(POLLOUT|POLLWRBAND);
6351 }
6352 /*
6353 * Check for errors without calling strpoll if the caller wants them.
6354 * In sockets the errors are represented as input/output events
6355 * and there is no need to ask the stream head for this information.
6356 */
6357 if (so->so_error != 0 &&
6358 ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
6359 *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6360 return (0);
6361 }
6362 /*
6363 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6364 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6365 * will not trigger a POLLIN event with POLLRDDATA set.
6366 * The handling of urgent data (causing POLLRDBAND) is done by
6367 * inspecting SS_OOBPEND below.
6368 */
6369 events |= POLLRDDATA;
6370
6371 /*
6372 * After shutdown(output) a stream head write error is set.
6373 * However, we should not return output events.
6374 */
6375 events |= POLLNOERR;
6376 error = strpoll(vp->v_stream, events, anyyet,
6377 reventsp, phpp);
6378 if (error)
6379 return (error);
6380
6381 ASSERT(!(*reventsp & POLLERR));
6382
6383 /*
6384 * Notes on T_CONN_IND handling for sockets.
6385 *
6386 * If strpoll() returned without events, SR_POLLIN is guaranteed
6387 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6388 *
6389 * Since the so_lock is not held, soqueueconnind() may have run
6390 * and a T_CONN_IND may be waiting. We now check for any queued
6391 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6392 * to ensure poll returns.
6393 *
6394 * However:
6395 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6396 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6397 * the following actions will occur; taken together they ensure the
6398 * syscall will return.
6399 *
6400 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6401 * the accept() was run on a non-blocking socket sowaitconnind()
6402 * may have already returned EWOULDBLOCK, so not be waiting to
6403 * process the message. Additionally socktpi_poll() has probably
6404 * proceeded past the sti_conn_ind_head check below.
6405 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6406 * this thread, however that could occur before poll_common()
6407 * has entered cv_wait.
6408 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6409 *
6410 * Before proceeding to cv_wait() in poll_common() for an event,
6411 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6412 * and if set, re-calls strpoll() to ensure the late arriving
6413 * T_CONN_IND is recognized, and pollsys() returns.
6414 */
6415
6416 if (sti->sti_conn_ind_head != NULL)
6417 *reventsp |= (POLLIN|POLLRDNORM) & events;
6418
6419 if (so->so_state & SS_CANTRCVMORE) {
6420 *reventsp |= POLLRDHUP & events;
6421
6422 if (so->so_state & SS_CANTSENDMORE)
6423 *reventsp |= POLLHUP;
6424 }
6425
6426 if (so->so_state & SS_OOBPEND)
6427 *reventsp |= POLLRDBAND & events;
6428
6429 if (sti->sti_nl7c_rcv_mp != NULL) {
6430 *reventsp |= (POLLIN|POLLRDNORM) & events;
6431 }
6432 if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6433 ((POLLIN|POLLRDNORM) & *reventsp)) {
6434 sti->sti_nl7c_flags |= NL7C_POLLIN;
6435 }
6436
6437 return (0);
6438 }
6439
6440 /*ARGSUSED*/
6441 static int
6442 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6443 {
6444 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6445 int error = 0;
6446
6447 error = sonode_constructor(buf, cdrarg, kmflags);
6448 if (error != 0)
6449 return (error);
6450
6451 error = i_sotpi_info_constructor(&st->st_info);
6452 if (error != 0)
6453 sonode_destructor(buf, cdrarg);
6454
6455 st->st_sonode.so_priv = &st->st_info;
6456
6457 return (error);
6458 }
6459
6460 /*ARGSUSED1*/
6461 static void
6462 socktpi_destructor(void *buf, void *cdrarg)
6463 {
6464 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6465
6466 ASSERT(st->st_sonode.so_priv == &st->st_info);
6467 st->st_sonode.so_priv = NULL;
6468
6469 i_sotpi_info_destructor(&st->st_info);
6470 sonode_destructor(buf, cdrarg);
6471 }
6472
6473 static int
6474 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6475 {
6476 int retval;
6477
6478 if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6479 struct sonode *so = (struct sonode *)buf;
6480 sotpi_info_t *sti = SOTOTPI(so);
6481
6482 mutex_enter(&socklist.sl_lock);
6483
6484 sti->sti_next_so = socklist.sl_list;
6485 sti->sti_prev_so = NULL;
6486 if (sti->sti_next_so != NULL)
6487 SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6488 socklist.sl_list = so;
6489
6490 mutex_exit(&socklist.sl_lock);
6491
6492 }
6493 return (retval);
6494 }
6495
6496 static void
6497 socktpi_unix_destructor(void *buf, void *cdrarg)
6498 {
6499 struct sonode *so = (struct sonode *)buf;
6500 sotpi_info_t *sti = SOTOTPI(so);
6501
6502 mutex_enter(&socklist.sl_lock);
6503
6504 if (sti->sti_next_so != NULL)
6505 SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6506 if (sti->sti_prev_so != NULL)
6507 SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6508 else
6509 socklist.sl_list = sti->sti_next_so;
6510
6511 mutex_exit(&socklist.sl_lock);
6512
6513 socktpi_destructor(buf, cdrarg);
6514 }
6515
6516 int
6517 socktpi_init(void)
6518 {
6519 /*
6520 * Create sonode caches. We create a special one for AF_UNIX so
6521 * that we can track them for netstat(1m).
6522 */
6523 socktpi_cache = kmem_cache_create("socktpi_cache",
6524 sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6525 socktpi_destructor, NULL, NULL, NULL, 0);
6526
6527 socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6528 sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6529 socktpi_unix_destructor, NULL, NULL, NULL, 0);
6530
6531 return (0);
6532 }
6533
6534 /*
6535 * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6536 *
6537 * Caller must still update state and mode using sotpi_update_state().
6538 */
6539 int
6540 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6541 boolean_t *direct, queue_t **qp, struct cred *cr)
6542 {
6543 sotpi_info_t *sti;
6544 struct sockparams *origsp = so->so_sockparams;
6545 sock_lower_handle_t handle = so->so_proto_handle;
6546 struct stdata *stp;
6547 struct vnode *vp;
6548 queue_t *q;
6549 int error = 0;
6550
6551 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6552 SS_FALLBACK_PENDING);
6553 ASSERT(SOCK_IS_NONSTR(so));
6554
6555 *qp = NULL;
6556 *direct = B_FALSE;
6557 so->so_sockparams = newsp;
6558 /*
6559 * Allocate and initalize fields required by TPI.
6560 */
6561 (void) sotpi_info_create(so, KM_SLEEP);
6562 sotpi_info_init(so);
6563
6564 if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6565 sotpi_info_fini(so);
6566 sotpi_info_destroy(so);
6567 return (error);
6568 }
6569 ASSERT(handle == so->so_proto_handle);
6570 sti = SOTOTPI(so);
6571 if (sti->sti_direct != 0)
6572 *direct = B_TRUE;
6573
6574 /*
6575 * Keep the original sp around so we can properly dispose of the
6576 * sonode when the socket is being closed.
6577 */
6578 sti->sti_orig_sp = origsp;
6579
6580 so_basic_strinit(so); /* skips the T_CAPABILITY_REQ */
6581 so_alloc_addr(so, so->so_max_addr_len);
6582
6583 /*
6584 * If the application has done a SIOCSPGRP, make sure the
6585 * STREAM head is aware. This needs to take place before
6586 * the protocol start sending up messages. Otherwise we
6587 * might miss to generate SIGPOLL.
6588 *
6589 * It is possible that the application will receive duplicate
6590 * signals if some were already generated for either data or
6591 * connection indications.
6592 */
6593 if (so->so_pgrp != 0) {
6594 if (so_set_events(so, so->so_vnode, cr) != 0)
6595 so->so_pgrp = 0;
6596 }
6597
6598 /*
6599 * Determine which queue to use.
6600 */
6601 vp = SOTOV(so);
6602 stp = vp->v_stream;
6603 ASSERT(stp != NULL);
6604 q = stp->sd_wrq->q_next;
6605
6606 /*
6607 * Skip any modules that may have been auto pushed when the device
6608 * was opened
6609 */
6610 while (q->q_next != NULL)
6611 q = q->q_next;
6612 *qp = _RD(q);
6613
6614 /* This is now a STREAMS sockets */
6615 so->so_not_str = B_FALSE;
6616
6617 return (error);
6618 }
6619
6620 /*
6621 * Revert a TPI sonode. It is only allowed to revert the sonode during
6622 * the fallback process.
6623 */
6624 void
6625 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6626 {
6627 vnode_t *vp = SOTOV(so);
6628
6629 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6630 SS_FALLBACK_PENDING);
6631 ASSERT(!SOCK_IS_NONSTR(so));
6632 ASSERT(vp->v_stream != NULL);
6633
6634 strclean(vp);
6635 (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6636
6637 /*
6638 * Restore the original sockparams. The caller is responsible for
6639 * dropping the ref to the new sp.
6640 */
6641 so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6642
6643 sotpi_info_fini(so);
6644 sotpi_info_destroy(so);
6645
6646 /* This is no longer a STREAMS sockets */
6647 so->so_not_str = B_TRUE;
6648 }
6649
6650 void
6651 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6652 struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6653 socklen_t faddrlen, short opts)
6654 {
6655 sotpi_info_t *sti = SOTOTPI(so);
6656
6657 so_proc_tcapability_ack(so, tcap);
6658
6659 so->so_options |= opts;
6660
6661 /*
6662 * Determine whether the foreign and local address are valid
6663 */
6664 if (laddrlen != 0) {
6665 ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6666 sti->sti_laddr_len = laddrlen;
6667 bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6668 sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6669 }
6670
6671 if (faddrlen != 0) {
6672 ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6673 sti->sti_faddr_len = faddrlen;
6674 bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6675 sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6676 }
6677
6678 }
6679
6680 /*
6681 * Allocate enough space to cache the local and foreign addresses.
6682 */
6683 void
6684 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6685 {
6686 sotpi_info_t *sti = SOTOTPI(so);
6687
6688 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6689 ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6690 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6691 P2ROUNDUP(maxlen, KMEM_ALIGN);
6692 so->so_max_addr_len = sti->sti_laddr_maxlen;
6693 sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6694 sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6695 + sti->sti_laddr_maxlen);
6696
6697 if (so->so_family == AF_UNIX) {
6698 /*
6699 * Initialize AF_UNIX related fields.
6700 */
6701 bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6702 bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6703 }
6704 }
6705
6706
6707 sotpi_info_t *
6708 sotpi_sototpi(struct sonode *so)
6709 {
6710 sotpi_info_t *sti;
6711
6712 ASSERT(so != NULL);
6713
6714 sti = (sotpi_info_t *)so->so_priv;
6715
6716 ASSERT(sti != NULL);
6717 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6718
6719 return (sti);
6720 }
6721
6722 static int
6723 i_sotpi_info_constructor(sotpi_info_t *sti)
6724 {
6725 sti->sti_magic = SOTPI_INFO_MAGIC;
6726 sti->sti_ack_mp = NULL;
6727 sti->sti_discon_ind_mp = NULL;
6728 sti->sti_ux_bound_vp = NULL;
6729 sti->sti_unbind_mp = NULL;
6730
6731 sti->sti_conn_ind_head = NULL;
6732 sti->sti_conn_ind_tail = NULL;
6733
6734 sti->sti_laddr_sa = NULL;
6735 sti->sti_faddr_sa = NULL;
6736
6737 sti->sti_nl7c_flags = 0;
6738 sti->sti_nl7c_uri = NULL;
6739 sti->sti_nl7c_rcv_mp = NULL;
6740
6741 mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6742 cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6743
6744 return (0);
6745 }
6746
6747 static void
6748 i_sotpi_info_destructor(sotpi_info_t *sti)
6749 {
6750 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6751 ASSERT(sti->sti_ack_mp == NULL);
6752 ASSERT(sti->sti_discon_ind_mp == NULL);
6753 ASSERT(sti->sti_ux_bound_vp == NULL);
6754 ASSERT(sti->sti_unbind_mp == NULL);
6755
6756 ASSERT(sti->sti_conn_ind_head == NULL);
6757 ASSERT(sti->sti_conn_ind_tail == NULL);
6758
6759 ASSERT(sti->sti_laddr_sa == NULL);
6760 ASSERT(sti->sti_faddr_sa == NULL);
6761
6762 ASSERT(sti->sti_nl7c_flags == 0);
6763 ASSERT(sti->sti_nl7c_uri == NULL);
6764 ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6765
6766 mutex_destroy(&sti->sti_plumb_lock);
6767 cv_destroy(&sti->sti_ack_cv);
6768 }
6769
6770 /*
6771 * Creates and attaches TPI information to the given sonode
6772 */
6773 static boolean_t
6774 sotpi_info_create(struct sonode *so, int kmflags)
6775 {
6776 sotpi_info_t *sti;
6777
6778 ASSERT(so->so_priv == NULL);
6779
6780 if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6781 return (B_FALSE);
6782
6783 if (i_sotpi_info_constructor(sti) != 0) {
6784 kmem_free(sti, sizeof (*sti));
6785 return (B_FALSE);
6786 }
6787
6788 so->so_priv = (void *)sti;
6789 return (B_TRUE);
6790 }
6791
6792 /*
6793 * Initializes the TPI information.
6794 */
6795 static void
6796 sotpi_info_init(struct sonode *so)
6797 {
6798 struct vnode *vp = SOTOV(so);
6799 sotpi_info_t *sti = SOTOTPI(so);
6800 time_t now;
6801
6802 sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6803 vp->v_rdev = sti->sti_dev;
6804
6805 sti->sti_orig_sp = NULL;
6806
6807 sti->sti_pushcnt = 0;
6808
6809 now = gethrestime_sec();
6810 sti->sti_atime = now;
6811 sti->sti_mtime = now;
6812 sti->sti_ctime = now;
6813
6814 sti->sti_eaddr_mp = NULL;
6815 sti->sti_delayed_error = 0;
6816
6817 sti->sti_provinfo = NULL;
6818
6819 sti->sti_oobcnt = 0;
6820 sti->sti_oobsigcnt = 0;
6821
6822 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6823
6824 sti->sti_laddr_sa = 0;
6825 sti->sti_faddr_sa = 0;
6826 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6827 sti->sti_laddr_len = sti->sti_faddr_len = 0;
6828
6829 sti->sti_laddr_valid = 0;
6830 sti->sti_faddr_valid = 0;
6831 sti->sti_faddr_noxlate = 0;
6832
6833 sti->sti_direct = 0;
6834
6835 ASSERT(sti->sti_ack_mp == NULL);
6836 ASSERT(sti->sti_ux_bound_vp == NULL);
6837 ASSERT(sti->sti_unbind_mp == NULL);
6838
6839 ASSERT(sti->sti_conn_ind_head == NULL);
6840 ASSERT(sti->sti_conn_ind_tail == NULL);
6841 }
6842
6843 /*
6844 * Given a sonode, grab the TPI info and free any data.
6845 */
6846 static void
6847 sotpi_info_fini(struct sonode *so)
6848 {
6849 sotpi_info_t *sti = SOTOTPI(so);
6850 mblk_t *mp;
6851
6852 ASSERT(sti->sti_discon_ind_mp == NULL);
6853
6854 if ((mp = sti->sti_conn_ind_head) != NULL) {
6855 mblk_t *mp1;
6856
6857 while (mp) {
6858 mp1 = mp->b_next;
6859 mp->b_next = NULL;
6860 freemsg(mp);
6861 mp = mp1;
6862 }
6863 sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6864 }
6865
6866 /*
6867 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6868 * indirect them. It also uses so_count as a validity test.
6869 */
6870 mutex_enter(&so->so_lock);
6871
6872 if (sti->sti_laddr_sa) {
6873 ASSERT((caddr_t)sti->sti_faddr_sa ==
6874 (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6875 ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6876 sti->sti_laddr_valid = 0;
6877 sti->sti_faddr_valid = 0;
6878 kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6879 sti->sti_laddr_sa = NULL;
6880 sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6881 sti->sti_faddr_sa = NULL;
6882 sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6883 }
6884
6885 mutex_exit(&so->so_lock);
6886
6887 if ((mp = sti->sti_eaddr_mp) != NULL) {
6888 freemsg(mp);
6889 sti->sti_eaddr_mp = NULL;
6890 sti->sti_delayed_error = 0;
6891 }
6892
6893 if ((mp = sti->sti_ack_mp) != NULL) {
6894 freemsg(mp);
6895 sti->sti_ack_mp = NULL;
6896 }
6897
6898 if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6899 sti->sti_nl7c_rcv_mp = NULL;
6900 freemsg(mp);
6901 }
6902 sti->sti_nl7c_rcv_rval = 0;
6903 if (sti->sti_nl7c_uri != NULL) {
6904 nl7c_urifree(so);
6905 /* urifree() cleared nl7c_uri */
6906 }
6907 if (sti->sti_nl7c_flags) {
6908 sti->sti_nl7c_flags = 0;
6909 }
6910
6911 ASSERT(sti->sti_ux_bound_vp == NULL);
6912 if ((mp = sti->sti_unbind_mp) != NULL) {
6913 freemsg(mp);
6914 sti->sti_unbind_mp = NULL;
6915 }
6916 }
6917
6918 /*
6919 * Destroys the TPI information attached to a sonode.
6920 */
6921 static void
6922 sotpi_info_destroy(struct sonode *so)
6923 {
6924 sotpi_info_t *sti = SOTOTPI(so);
6925
6926 i_sotpi_info_destructor(sti);
6927 kmem_free(sti, sizeof (*sti));
6928
6929 so->so_priv = NULL;
6930 }
6931
6932 /*
6933 * Create the global sotpi socket module entry. It will never be freed.
6934 */
6935 smod_info_t *
6936 sotpi_smod_create(void)
6937 {
6938 smod_info_t *smodp;
6939
6940 smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6941 smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6942 (void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6943 /*
6944 * Initialize the smod_refcnt to 1 so it will never be freed.
6945 */
6946 smodp->smod_refcnt = 1;
6947 smodp->smod_uc_version = SOCK_UC_VERSION;
6948 smodp->smod_dc_version = SOCK_DC_VERSION;
6949 smodp->smod_sock_create_func = &sotpi_create;
6950 smodp->smod_sock_destroy_func = &sotpi_destroy;
6951 return (smodp);
6952 }