1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent, Inc.
  25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/t_lock.h>
  30 #include <sys/param.h>
  31 #include <sys/systm.h>
  32 #include <sys/buf.h>
  33 #include <sys/conf.h>
  34 #include <sys/cred.h>
  35 #include <sys/kmem.h>
  36 #include <sys/kmem_impl.h>
  37 #include <sys/sysmacros.h>
  38 #include <sys/vfs.h>
  39 #include <sys/vnode.h>
  40 #include <sys/debug.h>
  41 #include <sys/errno.h>
  42 #include <sys/time.h>
  43 #include <sys/file.h>
  44 #include <sys/open.h>
  45 #include <sys/user.h>
  46 #include <sys/termios.h>
  47 #include <sys/stream.h>
  48 #include <sys/strsubr.h>
  49 #include <sys/strsun.h>
  50 #include <sys/suntpi.h>
  51 #include <sys/ddi.h>
  52 #include <sys/esunddi.h>
  53 #include <sys/flock.h>
  54 #include <sys/modctl.h>
  55 #include <sys/vtrace.h>
  56 #include <sys/cmn_err.h>
  57 #include <sys/pathname.h>
  58 
  59 #include <sys/socket.h>
  60 #include <sys/socketvar.h>
  61 #include <sys/sockio.h>
  62 #include <netinet/in.h>
  63 #include <sys/un.h>
  64 #include <sys/strsun.h>
  65 
  66 #include <sys/tiuser.h>
  67 #define _SUN_TPI_VERSION        2
  68 #include <sys/tihdr.h>
  69 #include <sys/timod.h>            /* TI_GETMYNAME, TI_GETPEERNAME */
  70 
  71 #include <c2/audit.h>
  72 
  73 #include <inet/common.h>
  74 #include <inet/ip.h>
  75 #include <inet/ip6.h>
  76 #include <inet/tcp.h>
  77 #include <inet/udp_impl.h>
  78 
  79 #include <sys/zone.h>
  80 
  81 #include <fs/sockfs/nl7c.h>
  82 #include <fs/sockfs/nl7curi.h>
  83 
  84 #include <fs/sockfs/sockcommon.h>
  85 #include <fs/sockfs/socktpi.h>
  86 #include <fs/sockfs/socktpi_impl.h>
  87 
  88 /*
  89  * Possible failures when memory can't be allocated. The documented behavior:
  90  *
  91  *              5.5:                    4.X:            XNET:
  92  * accept:      ENOMEM/ENOSR/EINTR      - (EINTR)       ENOMEM/ENOBUFS/ENOSR/
  93  *                                                      EINTR
  94  *      (4.X does not document EINTR but returns it)
  95  * bind:        ENOSR                   -               ENOBUFS/ENOSR
  96  * connect:     EINTR                   EINTR           ENOBUFS/ENOSR/EINTR
  97  * getpeername: ENOMEM/ENOSR            ENOBUFS (-)     ENOBUFS/ENOSR
  98  * getsockname: ENOMEM/ENOSR            ENOBUFS (-)     ENOBUFS/ENOSR
  99  *      (4.X getpeername and getsockname do not fail in practice)
 100  * getsockopt:  ENOMEM/ENOSR            -               ENOBUFS/ENOSR
 101  * listen:      -                       -               ENOBUFS
 102  * recv:        ENOMEM/ENOSR/EINTR      EINTR           ENOBUFS/ENOMEM/ENOSR/
 103  *                                                      EINTR
 104  * send:        ENOMEM/ENOSR/EINTR      ENOBUFS/EINTR   ENOBUFS/ENOMEM/ENOSR/
 105  *                                                      EINTR
 106  * setsockopt:  ENOMEM/ENOSR            -               ENOBUFS/ENOMEM/ENOSR
 107  * shutdown:    ENOMEM/ENOSR            -               ENOBUFS/ENOSR
 108  * socket:      ENOMEM/ENOSR            ENOBUFS         ENOBUFS/ENOMEM/ENOSR
 109  * socketpair:  ENOMEM/ENOSR            -               ENOBUFS/ENOMEM/ENOSR
 110  *
 111  * Resolution. When allocation fails:
 112  *      recv: return EINTR
 113  *      send: return EINTR
 114  *      connect, accept: EINTR
 115  *      bind, listen, shutdown (unbind, unix_close, disconnect): sleep
 116  *      socket, socketpair: ENOBUFS
 117  *      getpeername, getsockname: sleep
 118  *      getsockopt, setsockopt: sleep
 119  */
 120 
 121 #ifdef SOCK_TEST
 122 /*
 123  * Variables that make sockfs do something other than the standard TPI
 124  * for the AF_INET transports.
 125  *
 126  * solisten_tpi_tcp:
 127  *      TCP can handle a O_T_BIND_REQ with an increased backlog even though
 128  *      the transport is already bound. This is needed to avoid loosing the
 129  *      port number should listen() do a T_UNBIND_REQ followed by a
 130  *      O_T_BIND_REQ.
 131  *
 132  * soconnect_tpi_udp:
 133  *      UDP and ICMP can handle a T_CONN_REQ.
 134  *      This is needed to make the sequence of connect(), getsockname()
 135  *      return the local IP address used to send packets to the connected to
 136  *      destination.
 137  *
 138  * soconnect_tpi_tcp:
 139  *      TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
 140  *      Set this to non-zero to send TPI conformant messages to TCP in this
 141  *      respect. This is a performance optimization.
 142  *
 143  * soaccept_tpi_tcp:
 144  *      TCP can handle a T_CONN_REQ without the acceptor being bound.
 145  *      This is a performance optimization that has been picked up in XTI.
 146  *
 147  * soaccept_tpi_multioptions:
 148  *      When inheriting SOL_SOCKET options from the listener to the accepting
 149  *      socket send them as a single message for AF_INET{,6}.
 150  */
 151 int solisten_tpi_tcp = 0;
 152 int soconnect_tpi_udp = 0;
 153 int soconnect_tpi_tcp = 0;
 154 int soaccept_tpi_tcp = 0;
 155 int soaccept_tpi_multioptions = 1;
 156 #else /* SOCK_TEST */
 157 #define soconnect_tpi_tcp       0
 158 #define soconnect_tpi_udp       0
 159 #define solisten_tpi_tcp        0
 160 #define soaccept_tpi_tcp        0
 161 #define soaccept_tpi_multioptions       1
 162 #endif /* SOCK_TEST */
 163 
/*
 * Test hooks that are only declared when SOCK_TEST is defined; they are
 * defined outside this file.
 */
#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

/* Defined outside this file. */
extern uint32_t ucredsize;

/*
 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
 * applications working. Set xnet_skip_checks to non-zero to disable these
 * checks.
 *
 * NOTE(review): xnet_check_print and xnet_truncate_print are not referenced
 * in this part of the file; presumably they enable diagnostic printing for
 * the checks and for truncation respectively -- confirm against their users.
 */
int xnet_skip_checks = 0;
int xnet_check_print = 0;
int xnet_truncate_print = 0;
 178 
/* Allocation and release of TPI sonodes. */
static void sotpi_destroy(struct sonode *);
static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
    int, int *, cred_t *cr);

/* Management of the TPI-specific state (sotpi_info_t) of a sonode. */
static boolean_t        sotpi_info_create(struct sonode *, int);
static void             sotpi_info_init(struct sonode *);
static void             sotpi_info_fini(struct sonode *);
static void             sotpi_info_destroy(struct sonode *);

/*
 * Do direct function calls to the transport layer below; this also
 * allows the transport to utilize the read-side synchronous stream
 * interface if necessary.  This is an /etc/system tunable that must
 * not be modified on a running system.  By default this is enabled
 * for performance reasons and may be disabled for debugging purposes.
 */
boolean_t socktpi_direct = B_TRUE;

/*
 * Object caches that sonodes are allocated from: socktpi_unix_cache for
 * AF_UNIX sockets, socktpi_cache for every other family (see sotpi_create()
 * and sotpi_destroy()).
 */
static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;

/* Signal helpers defined outside this file. */
extern  void sigintr(k_sigset_t *, int);
extern  void sigunintr(k_sigset_t *);

static int      sotpi_unbind(struct sonode *, int);
 203 
/*
 * TPI sockfs sonode operations.
 *
 * The sotpi_* routines up to and including sotpi_close() are the entries
 * of the sotpi_sonodeops table, except for sosend_dgramcmsg(),
 * sodgram_direct() and socktpi_plumbioctl(), which are internal helpers.
 * sotpi_init() and the routines declared "extern" have external linkage;
 * the rest are private to this file.
 */
int             sotpi_init(struct sonode *, struct sonode *, struct cred *,
                    int);
static int      sotpi_accept(struct sonode *, int, struct cred *,
                    struct sonode **);
static int      sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
                    int, struct cred *);
static int      sotpi_listen(struct sonode *, int, struct cred *);
static int      sotpi_connect(struct sonode *, struct sockaddr *,
                    socklen_t, int, int, struct cred *);
extern int      sotpi_recvmsg(struct sonode *, struct nmsghdr *,
                    struct uio *, struct cred *);
static int      sotpi_sendmsg(struct sonode *, struct nmsghdr *,
                    struct uio *, struct cred *);
static int      sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
                    struct cred *, mblk_t **);
static int      sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
                    struct uio *, void *, t_uscalar_t, int);
static int      sodgram_direct(struct sonode *, struct sockaddr *,
                    socklen_t, struct uio *, int);
extern int      sotpi_getpeername(struct sonode *, struct sockaddr *,
                    socklen_t *, boolean_t, struct cred *);
static int      sotpi_getsockname(struct sonode *, struct sockaddr *,
                    socklen_t *, struct cred *);
static int      sotpi_shutdown(struct sonode *, int, struct cred *);
extern int      sotpi_getsockopt(struct sonode *, int, int, void *,
                    socklen_t *, int, struct cred *);
extern int      sotpi_setsockopt(struct sonode *, int, int, const void *,
                    socklen_t, struct cred *);
static int      sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
                    int32_t *);
static int      socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
                    struct cred *, int32_t *);
static int      sotpi_poll(struct sonode *, short, int, short *,
                    struct pollhead **);
static int      sotpi_close(struct sonode *, int, struct cred *);

/* Constructor and destructor for a sotpi_info_t. */
static int      i_sotpi_info_constructor(sotpi_info_t *);
static void     i_sotpi_info_destructor(sotpi_info_t *);
 243 
/*
 * Operations vector for TPI sockets.  sotpi_create() hands its address to
 * sonode_init(), and sotpi_destroy() asserts that a sonode's so_ops points
 * at it.  The initializer is positional: the entries must stay in the
 * member order named by the trailing comments.
 */
sonodeops_t sotpi_sonodeops = {
        sotpi_init,             /* sop_init             */
        sotpi_accept,           /* sop_accept           */
        sotpi_bind,             /* sop_bind             */
        sotpi_listen,           /* sop_listen           */
        sotpi_connect,          /* sop_connect          */
        sotpi_recvmsg,          /* sop_recvmsg          */
        sotpi_sendmsg,          /* sop_sendmsg          */
        sotpi_sendmblk,         /* sop_sendmblk         */
        sotpi_getpeername,      /* sop_getpeername      */
        sotpi_getsockname,      /* sop_getsockname      */
        sotpi_shutdown,         /* sop_shutdown         */
        sotpi_getsockopt,       /* sop_getsockopt       */
        sotpi_setsockopt,       /* sop_setsockopt       */
        sotpi_ioctl,            /* sop_ioctl            */
        sotpi_poll,             /* sop_poll             */
        sotpi_close,            /* sop_close            */
};
 262 
 263 /*
 264  * Return a TPI socket vnode.
 265  *
 266  * Note that sockets assume that the driver will clone (either itself
 267  * or by using the clone driver) i.e. a socket() call will always
 268  * result in a new vnode being created.
 269  */
 270 
/*
 * Common create code for socket and accept. If tso is set, the values
 * from that node are used instead of issuing a T_INFO_REQ.
 */
 275 
 276 /* ARGSUSED */
 277 static struct sonode *
 278 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
 279     int version, int sflags, int *errorp, cred_t *cr)
 280 {
 281         struct sonode   *so;
 282         kmem_cache_t    *cp;
 283         int             sfamily = family;
 284 
 285         ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
 286 
 287         if (family == AF_NCA) {
 288                 /*
 289                  * The request is for an NCA socket so for NL7C use the
 290                  * INET domain instead and mark NL7C_AF_NCA below.
 291                  */
 292                 family = AF_INET;
 293                 /*
 294                  * NL7C is not supported in the non-global zone,
 295                  * we enforce this restriction here.
 296                  */
 297                 if (getzoneid() != GLOBAL_ZONEID) {
 298                         *errorp = ENOTSUP;
 299                         return (NULL);
 300                 }
 301         }
 302 
 303         /*
 304          * to be compatible with old tpi socket implementation ignore
 305          * sleep flag (sflags) passed in
 306          */
 307         cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
 308         so = kmem_cache_alloc(cp, KM_SLEEP);
 309         if (so == NULL) {
 310                 *errorp = ENOMEM;
 311                 return (NULL);
 312         }
 313 
 314         sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
 315         sotpi_info_init(so);
 316 
 317         if (sfamily == AF_NCA) {
 318                 SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
 319         }
 320 
 321         if (version == SOV_DEFAULT)
 322                 version = so_default_version;
 323 
 324         so->so_version = (short)version;
 325         *errorp = 0;
 326 
 327         return (so);
 328 }
 329 
 330 static void
 331 sotpi_destroy(struct sonode *so)
 332 {
 333         kmem_cache_t *cp;
 334         struct sockparams *origsp;
 335 
 336         /*
 337          * If there is a new dealloc function (ie. smod_destroy_func),
 338          * then it should check the correctness of the ops.
 339          */
 340 
 341         ASSERT(so->so_ops == &sotpi_sonodeops);
 342 
 343         origsp = SOTOTPI(so)->sti_orig_sp;
 344 
 345         sotpi_info_fini(so);
 346 
 347         if (so->so_state & SS_FALLBACK_COMP) {
 348                 /*
 349                  * A fallback happend, which means that a sotpi_info_t struct
 350                  * was allocated (as opposed to being allocated from the TPI
 351                  * sonode cache. Therefore we explicitly free the struct
 352                  * here.
 353                  */
 354                 sotpi_info_destroy(so);
 355                 ASSERT(origsp != NULL);
 356 
 357                 origsp->sp_smod_info->smod_sock_destroy_func(so);
 358                 SOCKPARAMS_DEC_REF(origsp);
 359         } else {
 360                 sonode_fini(so);
 361                 cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
 362                     socktpi_cache;
 363                 kmem_cache_free(cp, so);
 364         }
 365 }
 366 
 367 /* ARGSUSED1 */
 368 int
 369 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
 370 {
 371         major_t maj;
 372         dev_t newdev;
 373         struct vnode *vp;
 374         int error = 0;
 375         struct stdata *stp;
 376 
 377         sotpi_info_t *sti = SOTOTPI(so);
 378 
 379         dprint(1, ("sotpi_init()\n"));
 380 
 381         /*
 382          * over write the sleep flag passed in but that is ok
 383          * as tpi socket does not honor sleep flag.
 384          */
 385         flags |= FREAD|FWRITE;
 386 
 387         /*
 388          * Record in so_flag that it is a clone.
 389          */
 390         if (getmajor(sti->sti_dev) == clone_major)
 391                 so->so_flag |= SOCLONE;
 392 
 393         if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
 394             (so->so_family == AF_INET || so->so_family == AF_INET6) &&
 395             (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
 396             so->so_protocol == IPPROTO_IP)) {
 397                 /* Tell tcp or udp that it's talking to sockets */
 398                 flags |= SO_SOCKSTR;
 399 
 400                 /*
 401                  * Here we indicate to socktpi_open() our attempt to
 402                  * make direct calls between sockfs and transport.
 403                  * The final decision is left to socktpi_open().
 404                  */
 405                 sti->sti_direct = 1;
 406 
 407                 ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
 408                 if (so->so_type == SOCK_STREAM && tso != NULL) {
 409                         if (SOTOTPI(tso)->sti_direct) {
 410                                 /*
 411                                  * Inherit sti_direct from listener and pass
 412                                  * SO_ACCEPTOR open flag to tcp, indicating
 413                                  * that this is an accept fast-path instance.
 414                                  */
 415                                 flags |= SO_ACCEPTOR;
 416                         } else {
 417                                 /*
 418                                  * sti_direct is not set on listener, meaning
 419                                  * that the listener has been converted from
 420                                  * a socket to a stream.  Ensure that the
 421                                  * acceptor inherits these settings.
 422                                  */
 423                                 sti->sti_direct = 0;
 424                                 flags &= ~SO_SOCKSTR;
 425                         }
 426                 }
 427         }
 428 
 429         /*
 430          * Tell local transport that it is talking to sockets.
 431          */
 432         if (so->so_family == AF_UNIX) {
 433                 flags |= SO_SOCKSTR;
 434         }
 435 
 436         vp = SOTOV(so);
 437         newdev = vp->v_rdev;
 438         maj = getmajor(newdev);
 439         ASSERT(STREAMSTAB(maj));
 440 
 441         error = stropen(vp, &newdev, flags, cr);
 442 
 443         stp = vp->v_stream;
 444         if (error == 0) {
 445                 if (so->so_flag & SOCLONE)
 446                         ASSERT(newdev != vp->v_rdev);
 447                 mutex_enter(&so->so_lock);
 448                 sti->sti_dev = newdev;
 449                 vp->v_rdev = newdev;
 450                 mutex_exit(&so->so_lock);
 451 
 452                 if (stp->sd_flag & STRISTTY) {
 453                         /*
 454                          * this is a post SVR4 tty driver - a socket can not
 455                          * be a controlling terminal. Fail the open.
 456                          */
 457                         (void) sotpi_close(so, flags, cr);
 458                         return (ENOTTY);        /* XXX */
 459                 }
 460 
 461                 ASSERT(stp->sd_wrq != NULL);
 462                 sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
 463 
 464                 /*
 465                  * If caller is interested in doing direct function call
 466                  * interface to/from transport module, probe the module
 467                  * directly beneath the streamhead to see if it qualifies.
 468                  *
 469                  * We turn off the direct interface when qualifications fail.
 470                  * In the acceptor case, we simply turn off the sti_direct
 471                  * flag on the socket. We do the fallback after the accept
 472                  * has completed, before the new socket is returned to the
 473                  * application.
 474                  */
 475                 if (sti->sti_direct) {
 476                         queue_t *tq = stp->sd_wrq->q_next;
 477 
 478                         /*
 479                          * sti_direct is currently supported and tested
 480                          * only for tcp/udp; this is the main reason to
 481                          * have the following assertions.
 482                          */
 483                         ASSERT(so->so_family == AF_INET ||
 484                             so->so_family == AF_INET6);
 485                         ASSERT(so->so_protocol == IPPROTO_UDP ||
 486                             so->so_protocol == IPPROTO_TCP ||
 487                             so->so_protocol == IPPROTO_IP);
 488                         ASSERT(so->so_type == SOCK_DGRAM ||
 489                             so->so_type == SOCK_STREAM);
 490 
 491                         /*
 492                          * Abort direct call interface if the module directly
 493                          * underneath the stream head is not defined with the
 494                          * _D_DIRECT flag.  This could happen in the tcp or
 495                          * udp case, when some other module is autopushed
 496                          * above it, or for some reasons the expected module
 497                          * isn't purely D_MP (which is the main requirement).
 498                          */
 499                         if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
 500                             !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
 501                                 int rval;
 502 
 503                                 /* Continue on without direct calls */
 504                                 sti->sti_direct = 0;
 505 
 506                                 /*
 507                                  * Cannot issue ioctl on fallback socket since
 508                                  * there is no conn associated with the queue.
 509                                  * The fallback downcall will notify the proto
 510                                  * of the change.
 511                                  */
 512                                 if (!(flags & SO_ACCEPTOR) &&
 513                                     !(flags & SO_FALLBACK)) {
 514                                         if ((error = strioctl(vp,
 515                                             _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
 516                                             cr, &rval)) != 0) {
 517                                                 (void) sotpi_close(so, flags,
 518                                                     cr);
 519                                                 return (error);
 520                                         }
 521                                 }
 522                         }
 523                 }
 524 
 525                 if (flags & SO_FALLBACK) {
 526                         /*
 527                          * The stream created does not have a conn.
 528                          * do stream set up after conn has been assigned
 529                          */
 530                         return (error);
 531                 }
 532                 if (error = so_strinit(so, tso)) {
 533                         (void) sotpi_close(so, flags, cr);
 534                         return (error);
 535                 }
 536 
 537                 /* Enable sendfile() on AF_UNIX streams */
 538                 if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
 539                         mutex_enter(&so->so_lock);
 540                         so->so_mode |= SM_SENDFILESUPP;
 541                         mutex_exit(&so->so_lock);
 542                 }
 543 
 544                 /* Wildcard */
 545                 if (so->so_protocol != so->so_sockparams->sp_protocol) {
 546                         int protocol = so->so_protocol;
 547                         /*
 548                          * Issue SO_PROTOTYPE setsockopt.
 549                          */
 550                         error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
 551                             &protocol, (t_uscalar_t)sizeof (protocol), cr);
 552                         if (error != 0) {
 553                                 (void) sotpi_close(so, flags, cr);
 554                                 /*
 555                                  * Setsockopt often fails with ENOPROTOOPT but
 556                                  * socket() should fail with
 557                                  * EPROTONOSUPPORT/EPROTOTYPE.
 558                                  */
 559                                 return (EPROTONOSUPPORT);
 560                         }
 561                 }
 562 
 563         } else {
 564                 /*
 565                  * While the same socket can not be reopened (unlike specfs)
 566                  * the stream head sets STREOPENFAIL when the autopush fails.
 567                  */
 568                 if ((stp != NULL) &&
 569                     (stp->sd_flag & STREOPENFAIL)) {
 570                         /*
 571                          * Open failed part way through.
 572                          */
 573                         mutex_enter(&stp->sd_lock);
 574                         stp->sd_flag &= ~STREOPENFAIL;
 575                         mutex_exit(&stp->sd_lock);
 576                         (void) sotpi_close(so, flags, cr);
 577                         return (error);
 578                         /*NOTREACHED*/
 579                 }
 580                 ASSERT(stp == NULL);
 581         }
 582         TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
 583             "sockfs open:maj %d vp %p so %p error %d",
 584             maj, vp, so, error);
 585         return (error);
 586 }
 587 
 588 /*
 589  * Bind the socket to an unspecified address in sockfs only.
 590  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
 591  * required in all cases.
 592  */
 593 static void
 594 so_automatic_bind(struct sonode *so)
 595 {
 596         sotpi_info_t *sti = SOTOTPI(so);
 597         ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
 598 
 599         ASSERT(MUTEX_HELD(&so->so_lock));
 600         ASSERT(!(so->so_state & SS_ISBOUND));
 601         ASSERT(sti->sti_unbind_mp);
 602 
 603         ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
 604         bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
 605         sti->sti_laddr_sa->sa_family = so->so_family;
 606         so->so_state |= SS_ISBOUND;
 607 }
 608 
 609 
 610 /*
 611  * bind the socket.
 612  *
 613  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
 614  * are passed in we allow rebinding. Note that for backwards compatibility
 615  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
 616  * Thus the rebinding code is currently not executed.
 617  *
 618  * The constraints for rebinding are:
 619  * - it is a SOCK_DGRAM, or
 620  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
 621  *   and no listen() has been done.
 * This rebinding code was added based on some language in the XNET book
 * about not returning EINVAL if the protocol allows rebinding. However,
 * this language is not present in the Posix socket draft. Thus maybe the
 * rebinding logic should be deleted from the source.
 626  *
 627  * A null "name" can be used to unbind the socket if:
 628  * - it is a SOCK_DGRAM, or
 629  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
 630  *   and no listen() has been done.
 631  */
 632 /* ARGSUSED */
 633 static int
 634 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
 635     socklen_t namelen, int backlog, int flags, struct cred *cr)
 636 {
 637         struct T_bind_req       bind_req;
 638         struct T_bind_ack       *bind_ack;
 639         int                     error = 0;
 640         mblk_t                  *mp;
 641         void                    *addr;
 642         t_uscalar_t             addrlen;
 643         int                     unbind_on_err = 1;
 644         boolean_t               clear_acceptconn_on_err = B_FALSE;
 645         boolean_t               restore_backlog_on_err = B_FALSE;
 646         int                     save_so_backlog;
 647         t_scalar_t              PRIM_type = O_T_BIND_REQ;
 648         boolean_t               tcp_udp_xport;
 649         void                    *nl7c = NULL;
 650         sotpi_info_t            *sti = SOTOTPI(so);
 651 
 652         dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
 653             (void *)so, (void *)name, namelen, backlog, flags,
 654             pr_state(so->so_state, so->so_mode)));
 655 
 656         tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
 657 
 658         if (!(flags & _SOBIND_LOCK_HELD)) {
 659                 mutex_enter(&so->so_lock);
 660                 so_lock_single(so);     /* Set SOLOCKED */
 661         } else {
 662                 ASSERT(MUTEX_HELD(&so->so_lock));
 663                 ASSERT(so->so_flag & SOLOCKED);
 664         }
 665 
 666         /*
 667          * Make sure that there is a preallocated unbind_req message
 668          * before binding. This message allocated when the socket is
 669          * created  but it might be have been consumed.
 670          */
 671         if (sti->sti_unbind_mp == NULL) {
 672                 dprintso(so, 1, ("sobind: allocating unbind_req\n"));
 673                 /* NOTE: holding so_lock while sleeping */
 674                 sti->sti_unbind_mp =
 675                     soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
 676                     cr);
 677         }
 678 
 679         if (flags & _SOBIND_REBIND) {
 680                 /*
 681                  * Called from solisten after doing an sotpi_unbind() or
 682                  * potentially without the unbind (latter for AF_INET{,6}).
 683                  */
 684                 ASSERT(name == NULL && namelen == 0);
 685 
 686                 if (so->so_family == AF_UNIX) {
 687                         ASSERT(sti->sti_ux_bound_vp);
 688                         addr = &sti->sti_ux_laddr;
 689                         addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
 690                         dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
 691                             "addr 0x%p, vp %p\n",
 692                             addrlen,
 693                             (void *)((struct so_ux_addr *)addr)->soua_vp,
 694                             (void *)sti->sti_ux_bound_vp));
 695                 } else {
 696                         addr = sti->sti_laddr_sa;
 697                         addrlen = (t_uscalar_t)sti->sti_laddr_len;
 698                 }
 699         } else if (flags & _SOBIND_UNSPEC) {
 700                 ASSERT(name == NULL && namelen == 0);
 701 
 702                 /*
 703                  * The caller checked SS_ISBOUND but not necessarily
 704                  * under so_lock
 705                  */
 706                 if (so->so_state & SS_ISBOUND) {
 707                         /* No error */
 708                         goto done;
 709                 }
 710 
 711                 /* Set an initial local address */
 712                 switch (so->so_family) {
 713                 case AF_UNIX:
 714                         /*
 715                          * Use an address with same size as struct sockaddr
 716                          * just like BSD.
 717                          */
 718                         sti->sti_laddr_len =
 719                             (socklen_t)sizeof (struct sockaddr);
 720                         ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
 721                         bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
 722                         sti->sti_laddr_sa->sa_family = so->so_family;
 723 
 724                         /*
 725                          * Pass down an address with the implicit bind
 726                          * magic number and the rest all zeros.
 727                          * The transport will return a unique address.
 728                          */
 729                         sti->sti_ux_laddr.soua_vp = NULL;
 730                         sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
 731                         addr = &sti->sti_ux_laddr;
 732                         addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
 733                         break;
 734 
 735                 case AF_INET:
 736                 case AF_INET6:
 737                         /*
 738                          * An unspecified bind in TPI has a NULL address.
 739                          * Set the address in sockfs to have the sa_family.
 740                          */
 741                         sti->sti_laddr_len = (so->so_family == AF_INET) ?
 742                             (socklen_t)sizeof (sin_t) :
 743                             (socklen_t)sizeof (sin6_t);
 744                         ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
 745                         bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
 746                         sti->sti_laddr_sa->sa_family = so->so_family;
 747                         addr = NULL;
 748                         addrlen = 0;
 749                         break;
 750 
 751                 default:
 752                         /*
 753                          * An unspecified bind in TPI has a NULL address.
 754                          * Set the address in sockfs to be zero length.
 755                          *
 756                          * Can not assume there is a sa_family for all
 757                          * protocol families. For example, AF_X25 does not
 758                          * have a family field.
 759                          */
 760                         bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
 761                         sti->sti_laddr_len = 0;      /* XXX correct? */
 762                         addr = NULL;
 763                         addrlen = 0;
 764                         break;
 765                 }
 766 
 767         } else {
 768                 if (so->so_state & SS_ISBOUND) {
 769                         /*
 770                          * If it is ok to rebind the socket, first unbind
 771                          * with the transport. A rebind to the NULL address
 772                          * is interpreted as an unbind.
 773                          * Note that a bind to NULL in BSD does unbind the
 774                          * socket but it fails with EINVAL.
 775                          * Note that regular sockets set SOV_SOCKBSD i.e.
 776                          * _SOBIND_SOCKBSD gets set here hence no type of
 777                          * socket does currently allow rebinding.
 778                          *
 779                          * If the name is NULL just do an unbind.
 780                          */
 781                         if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
 782                             name != NULL) {
 783                                 error = EINVAL;
 784                                 unbind_on_err = 0;
 785                                 eprintsoline(so, error);
 786                                 goto done;
 787                         }
 788                         if ((so->so_mode & SM_CONNREQUIRED) &&
 789                             (so->so_state & SS_CANTREBIND)) {
 790                                 error = EINVAL;
 791                                 unbind_on_err = 0;
 792                                 eprintsoline(so, error);
 793                                 goto done;
 794                         }
 795                         error = sotpi_unbind(so, 0);
 796                         if (error) {
 797                                 eprintsoline(so, error);
 798                                 goto done;
 799                         }
 800                         ASSERT(!(so->so_state & SS_ISBOUND));
 801                         if (name == NULL) {
 802                                 so->so_state &=
 803                                     ~(SS_ISCONNECTED|SS_ISCONNECTING);
 804                                 goto done;
 805                         }
 806                 }
 807 
 808                 /* X/Open requires this check */
 809                 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
 810                         if (xnet_check_print) {
 811                                 printf("sockfs: X/Open bind state check "
 812                                     "caused EINVAL\n");
 813                         }
 814                         error = EINVAL;
 815                         goto done;
 816                 }
 817 
 818                 switch (so->so_family) {
 819                 case AF_UNIX:
 820                         /*
 821                          * All AF_UNIX addresses are nul terminated
 822                          * when copied (copyin_name) in so the minimum
 823                          * length is 3 bytes.
 824                          */
 825                         if (name == NULL ||
 826                             (ssize_t)namelen <= sizeof (short) + 1) {
 827                                 error = EISDIR;
 828                                 eprintsoline(so, error);
 829                                 goto done;
 830                         }
 831                         /*
 832                          * Verify so_family matches the bound family.
 833                          * BSD does not check this for AF_UNIX resulting
 834                          * in funny mknods.
 835                          */
 836                         if (name->sa_family != so->so_family) {
 837                                 error = EAFNOSUPPORT;
 838                                 goto done;
 839                         }
 840                         break;
 841                 case AF_INET:
 842                         if (name == NULL) {
 843                                 error = EINVAL;
 844                                 eprintsoline(so, error);
 845                                 goto done;
 846                         }
 847                         if ((size_t)namelen != sizeof (sin_t)) {
 848                                 error = name->sa_family != so->so_family ?
 849                                     EAFNOSUPPORT : EINVAL;
 850                                 eprintsoline(so, error);
 851                                 goto done;
 852                         }
 853                         if ((flags & _SOBIND_XPG4_2) &&
 854                             (name->sa_family != so->so_family)) {
 855                                 /*
 856                                  * This check has to be made for X/Open
 857                                  * sockets however application failures have
 858                                  * been observed when it is applied to
 859                                  * all sockets.
 860                                  */
 861                                 error = EAFNOSUPPORT;
 862                                 eprintsoline(so, error);
 863                                 goto done;
 864                         }
 865                         /*
 866                          * Force a zero sa_family to match so_family.
 867                          *
 868                          * Some programs like inetd(1M) don't set the
 869                          * family field. Other programs leave
 870                          * sin_family set to garbage - SunOS 4.X does
 871                          * not check the family field on a bind.
 872                          * We use the family field that
 873                          * was passed in to the socket() call.
 874                          */
 875                         name->sa_family = so->so_family;
 876                         break;
 877 
 878                 case AF_INET6: {
 879 #ifdef DEBUG
 880                         sin6_t *sin6 = (sin6_t *)name;
 881 #endif /* DEBUG */
 882 
 883                         if (name == NULL) {
 884                                 error = EINVAL;
 885                                 eprintsoline(so, error);
 886                                 goto done;
 887                         }
 888                         if ((size_t)namelen != sizeof (sin6_t)) {
 889                                 error = name->sa_family != so->so_family ?
 890                                     EAFNOSUPPORT : EINVAL;
 891                                 eprintsoline(so, error);
 892                                 goto done;
 893                         }
 894                         if (name->sa_family != so->so_family) {
 895                                 /*
 896                                  * With IPv6 we require the family to match
 897                                  * unlike in IPv4.
 898                                  */
 899                                 error = EAFNOSUPPORT;
 900                                 eprintsoline(so, error);
 901                                 goto done;
 902                         }
 903 #ifdef DEBUG
 904                         /*
 905                          * Verify that apps don't forget to clear
 906                          * sin6_scope_id etc
 907                          */
 908                         if (sin6->sin6_scope_id != 0 &&
 909                             !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
 910                                 zcmn_err(getzoneid(), CE_WARN,
 911                                     "bind with uninitialized sin6_scope_id "
 912                                     "(%d) on socket. Pid = %d\n",
 913                                     (int)sin6->sin6_scope_id,
 914                                     (int)curproc->p_pid);
 915                         }
 916                         if (sin6->__sin6_src_id != 0) {
 917                                 zcmn_err(getzoneid(), CE_WARN,
 918                                     "bind with uninitialized __sin6_src_id "
 919                                     "(%d) on socket. Pid = %d\n",
 920                                     (int)sin6->__sin6_src_id,
 921                                     (int)curproc->p_pid);
 922                         }
 923 #endif /* DEBUG */
 924                         break;
 925                 }
 926                 default:
 927                         /*
 928                          * Don't do any length or sa_family check to allow
 929                          * non-sockaddr style addresses.
 930                          */
 931                         if (name == NULL) {
 932                                 error = EINVAL;
 933                                 eprintsoline(so, error);
 934                                 goto done;
 935                         }
 936                         break;
 937                 }
 938 
 939                 if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
 940                         error = ENAMETOOLONG;
 941                         eprintsoline(so, error);
 942                         goto done;
 943                 }
 944                 /*
 945                  * Save local address.
 946                  */
 947                 sti->sti_laddr_len = (socklen_t)namelen;
 948                 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
 949                 bcopy(name, sti->sti_laddr_sa, namelen);
 950 
 951                 addr = sti->sti_laddr_sa;
 952                 addrlen = (t_uscalar_t)sti->sti_laddr_len;
 953                 switch (so->so_family) {
 954                 case AF_INET6:
 955                 case AF_INET:
 956                         break;
 957                 case AF_UNIX: {
 958                         struct sockaddr_un *soun =
 959                             (struct sockaddr_un *)sti->sti_laddr_sa;
 960                         struct vnode *vp, *rvp;
 961                         struct vattr vattr;
 962 
 963                         ASSERT(sti->sti_ux_bound_vp == NULL);
 964                         /*
 965                          * Create vnode for the specified path name.
 966                          * Keep vnode held with a reference in sti_ux_bound_vp.
 967                          * Use the vnode pointer as the address used in the
 968                          * bind with the transport.
 969                          *
                         * Use mode 0777 as in BSD, masked by the process
                         * umask (u_cmask).
 972                          */
 973                         /* MAXPATHLEN + soun_family + nul termination */
 974                         if (sti->sti_laddr_len >
 975                             (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
 976                                 error = ENAMETOOLONG;
 977                                 eprintsoline(so, error);
 978                                 goto done;
 979                         }
 980                         vattr.va_type = VSOCK;
 981                         vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
 982                         vattr.va_mask = AT_TYPE|AT_MODE;
 983                         /* NOTE: holding so_lock */
 984                         error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
 985                             EXCL, 0, &vp, CRMKNOD, 0, 0);
 986                         if (error) {
 987                                 if (error == EEXIST)
 988                                         error = EADDRINUSE;
 989                                 eprintsoline(so, error);
 990                                 goto done;
 991                         }
 992                         /*
 993                          * Establish pointer from the underlying filesystem
 994                          * vnode to the socket node.
 995                          * sti_ux_bound_vp and v_stream->sd_vnode form the
 996                          * cross-linkage between the underlying filesystem
 997                          * node and the socket node.
 998                          */
 999 
1000                         if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
1001                                 VN_HOLD(rvp);
1002                                 VN_RELE(vp);
1003                                 vp = rvp;
1004                         }
1005 
1006                         ASSERT(SOTOV(so)->v_stream);
1007                         mutex_enter(&vp->v_lock);
1008                         vp->v_stream = SOTOV(so)->v_stream;
1009                         sti->sti_ux_bound_vp = vp;
1010                         mutex_exit(&vp->v_lock);
1011 
1012                         /*
1013                          * Use the vnode pointer value as a unique address
1014                          * (together with the magic number to avoid conflicts
1015                          * with implicit binds) in the transport provider.
1016                          */
1017                         sti->sti_ux_laddr.soua_vp =
1018                             (void *)sti->sti_ux_bound_vp;
1019                         sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1020                         addr = &sti->sti_ux_laddr;
1021                         addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1022                         dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1023                             addrlen,
1024                             (void *)((struct so_ux_addr *)addr)->soua_vp));
1025                         break;
1026                 }
1027                 } /* end switch (so->so_family) */
1028         }
1029 
1030         /*
1031          * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1032          * the transport can start passing up T_CONN_IND messages
1033          * as soon as it receives the bind req and strsock_proto()
1034          * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1035          */
1036         if (flags & _SOBIND_LISTEN) {
1037                 if ((so->so_state & SS_ACCEPTCONN) == 0)
1038                         clear_acceptconn_on_err = B_TRUE;
1039                 save_so_backlog = so->so_backlog;
1040                 restore_backlog_on_err = B_TRUE;
1041                 so->so_state |= SS_ACCEPTCONN;
1042                 so->so_backlog = backlog;
1043         }
1044 
1045         /*
1046          * If NL7C addr(s) have been configured check for addr/port match,
1047          * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1048          *
1049          * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1050          * family sockets only. If match mark as such.
1051          */
1052         if (nl7c_enabled && ((addr != NULL &&
1053             (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1054             (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1055             sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1056                 /*
1057                  * NL7C is not supported in non-global zones,
1058                  * we enforce this restriction here.
1059                  */
1060                 if (so->so_zoneid == GLOBAL_ZONEID) {
1061                         /* An NL7C socket, mark it */
1062                         sti->sti_nl7c_flags |= NL7C_ENABLED;
1063                         if (nl7c == NULL) {
1064                                 /*
1065                                  * Was an AF_NCA bind() so add it to the
1066                                  * addr list for reporting purposes.
1067                                  */
1068                                 nl7c = nl7c_add_addr(addr, addrlen);
1069                         }
1070                 } else
1071                         nl7c = NULL;
1072         }
1073 
1074         /*
1075          * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1076          * for other transports we will send in a O_T_BIND_REQ.
1077          */
1078         if (tcp_udp_xport &&
1079             (so->so_family == AF_INET || so->so_family == AF_INET6))
1080                 PRIM_type = T_BIND_REQ;
1081 
1082         bind_req.PRIM_type = PRIM_type;
1083         bind_req.ADDR_length = addrlen;
1084         bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1085         bind_req.CONIND_number = backlog;
1086         /* NOTE: holding so_lock while sleeping */
1087         mp = soallocproto2(&bind_req, sizeof (bind_req),
1088             addr, addrlen, 0, _ALLOC_SLEEP, cr);
1089         sti->sti_laddr_valid = 0;
1090 
1091         /* Done using sti_laddr_sa - can drop the lock */
1092         mutex_exit(&so->so_lock);
1093 
1094         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1095             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1096         if (error) {
1097                 eprintsoline(so, error);
1098                 mutex_enter(&so->so_lock);
1099                 goto done;
1100         }
1101 
1102         mutex_enter(&so->so_lock);
1103         error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1104             (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1105         if (error) {
1106                 eprintsoline(so, error);
1107                 goto done;
1108         }
1109         ASSERT(mp);
1110         /*
1111          * Even if some TPI message (e.g. T_DISCON_IND) was received in
1112          * strsock_proto while the lock was dropped above, the bind
1113          * is allowed to complete.
1114          */
1115 
1116         /* Mark as bound. This will be undone if we detect errors below. */
1117         if (flags & _SOBIND_NOXLATE) {
1118                 ASSERT(so->so_family == AF_UNIX);
1119                 sti->sti_faddr_noxlate = 1;
1120         }
1121         ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1122         so->so_state |= SS_ISBOUND;
1123         ASSERT(sti->sti_unbind_mp);
1124 
1125         /* note that we've already set SS_ACCEPTCONN above */
1126 
1127         /*
         * Recompute addrlen - an unspecified bind sent down an
1129          * address of length zero but we expect the appropriate length
1130          * in return.
1131          */
1132         addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1133             sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1134 
1135         bind_ack = (struct T_bind_ack *)mp->b_rptr;
1136         /*
1137          * The alignment restriction is really too strict but
1138          * we want enough alignment to inspect the fields of
1139          * a sockaddr_in.
1140          */
1141         addr = sogetoff(mp, bind_ack->ADDR_offset,
1142             bind_ack->ADDR_length,
1143             __TPI_ALIGN_SIZE);
1144         if (addr == NULL) {
1145                 freemsg(mp);
1146                 error = EPROTO;
1147                 eprintsoline(so, error);
1148                 goto done;
1149         }
1150         if (!(flags & _SOBIND_UNSPEC)) {
1151                 /*
1152                  * Verify that the transport didn't return something we
1153                  * did not want e.g. an address other than what we asked for.
1154                  *
1155                  * NOTE: These checks would go away if/when we switch to
1156                  * using the new TPI (in which the transport would fail
1157                  * the request instead of assigning a different address).
1158                  *
1159                  * NOTE2: For protocols that we don't know (i.e. any
1160                  * other than AF_INET6, AF_INET and AF_UNIX), we
1161                  * cannot know if the transport should be expected to
1162                  * return the same address as that requested.
1163                  *
1164                  * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1165                  * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1166                  *
1167                  * For example, in the case of netatalk it may be
1168                  * inappropriate for the transport to return the
1169                  * requested address (as it may have allocated a local
1170                  * port number in behaviour similar to that of an
1171                  * AF_INET bind request with a port number of zero).
1172                  *
1173                  * Given the definition of O_T_BIND_REQ, where the
1174                  * transport may bind to an address other than the
1175                  * requested address, it's not possible to determine
1176                  * whether a returned address that differs from the
1177                  * requested address is a reason to fail (because the
1178                  * requested address was not available) or succeed
1179                  * (because the transport allocated an appropriate
1180                  * address and/or port).
1181                  *
1182                  * sockfs currently requires that the transport return
1183                  * the requested address in the T_BIND_ACK, unless
1184                  * there is code here to allow for any discrepancy.
1185                  * Such code exists for AF_INET and AF_INET6.
1186                  *
1187                  * Netatalk chooses to return the requested address
1188                  * rather than the (correct) allocated address.  This
1189                  * means that netatalk violates the TPI specification
1190                  * (and would not function correctly if used from a
1191                  * TLI application), but it does mean that it works
1192                  * with sockfs.
1193                  *
1194                  * As noted above, using the newer XTI bind primitive
1195                  * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1196                  * allow sockfs to be more sure about whether or not
1197                  * the bind request had succeeded (as transports are
1198                  * not permitted to bind to a different address than
1199                  * that requested - they must return failure).
1200                  * Unfortunately, support for T_BIND_REQ may not be
1201                  * present in all transport implementations (netatalk,
1202                  * for example, doesn't have it), making the
1203                  * transition difficult.
1204                  */
1205                 if (bind_ack->ADDR_length != addrlen) {
1206                         /* Assumes that the requested address was in use */
1207                         freemsg(mp);
1208                         error = EADDRINUSE;
1209                         eprintsoline(so, error);
1210                         goto done;
1211                 }
1212 
1213                 switch (so->so_family) {
1214                 case AF_INET6:
1215                 case AF_INET: {
1216                         sin_t *rname, *aname;
1217 
1218                         rname = (sin_t *)addr;
1219                         aname = (sin_t *)sti->sti_laddr_sa;
1220 
1221                         /*
1222                          * Take advantage of the alignment
1223                          * of sin_port and sin6_port which fall
1224                          * in the same place in their data structures.
1225                          * Just use sin_port for either address family.
1226                          *
1227                          * This may become a problem if (heaven forbid)
1228                          * there's a separate ipv6port_reserved... :-P
1229                          *
1230                          * Binding to port 0 has the semantics of letting
1231                          * the transport bind to any port.
1232                          *
1233                          * If the transport is TCP or UDP since we had sent
1234                          * a T_BIND_REQ we would not get a port other than
1235                          * what we asked for.
1236                          */
1237                         if (tcp_udp_xport) {
1238                                 /*
1239                                  * Pick up the new port number if we bound to
1240                                  * port 0.
1241                                  */
1242                                 if (aname->sin_port == 0)
1243                                         aname->sin_port = rname->sin_port;
1244                                 sti->sti_laddr_valid = 1;
1245                                 break;
1246                         }
1247                         if (aname->sin_port != 0 &&
1248                             aname->sin_port != rname->sin_port) {
1249                                 freemsg(mp);
1250                                 error = EADDRINUSE;
1251                                 eprintsoline(so, error);
1252                                 goto done;
1253                         }
1254                         /*
1255                          * Pick up the new port number if we bound to port 0.
1256                          */
1257                         aname->sin_port = rname->sin_port;
1258 
1259                         /*
1260                          * Unfortunately, addresses aren't _quite_ the same.
1261                          */
1262                         if (so->so_family == AF_INET) {
1263                                 if (aname->sin_addr.s_addr !=
1264                                     rname->sin_addr.s_addr) {
1265                                         freemsg(mp);
1266                                         error = EADDRNOTAVAIL;
1267                                         eprintsoline(so, error);
1268                                         goto done;
1269                                 }
1270                         } else {
1271                                 sin6_t *rname6 = (sin6_t *)rname;
1272                                 sin6_t *aname6 = (sin6_t *)aname;
1273 
1274                                 if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1275                                     &rname6->sin6_addr)) {
1276                                         freemsg(mp);
1277                                         error = EADDRNOTAVAIL;
1278                                         eprintsoline(so, error);
1279                                         goto done;
1280                                 }
1281                         }
1282                         break;
1283                 }
1284                 case AF_UNIX:
1285                         if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1286                                 freemsg(mp);
1287                                 error = EADDRINUSE;
1288                                 eprintsoline(so, error);
1289                                 eprintso(so,
1290                                     ("addrlen %d, addr 0x%x, vp %p\n",
1291                                     addrlen, *((int *)addr),
1292                                     (void *)sti->sti_ux_bound_vp));
1293                                 goto done;
1294                         }
1295                         sti->sti_laddr_valid = 1;
1296                         break;
1297                 default:
1298                         /*
1299                          * NOTE: This assumes that addresses can be
1300                          * byte-compared for equivalence.
1301                          */
1302                         if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1303                                 freemsg(mp);
1304                                 error = EADDRINUSE;
1305                                 eprintsoline(so, error);
1306                                 goto done;
1307                         }
1308                         /*
1309                          * Don't mark sti_laddr_valid, as we cannot be
1310                          * sure that the returned address is the real
1311                          * bound address when talking to an unknown
1312                          * transport.
1313                          */
1314                         break;
1315                 }
1316         } else {
1317                 /*
                 * Save the returned address for getsockname.
1319                  * Needed for unspecific bind unless transport supports
1320                  * the TI_GETMYNAME ioctl.
1321                  * Do this for AF_INET{,6} even though they do, as
1322                  * caching info here is much better performance than
1323                  * a TPI/STREAMS trip to the transport for getsockname.
1324                  * Any which can't for some reason _must_ _not_ set
1325                  * sti_laddr_valid here for the caching version of
1326                  * getsockname to not break;
1327                  */
1328                 switch (so->so_family) {
1329                 case AF_UNIX:
1330                         /*
1331                          * Record the address bound with the transport
1332                          * for use by socketpair.
1333                          */
1334                         bcopy(addr, &sti->sti_ux_laddr, addrlen);
1335                         sti->sti_laddr_valid = 1;
1336                         break;
1337                 case AF_INET:
1338                 case AF_INET6:
1339                         ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1340                         bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1341                         sti->sti_laddr_valid = 1;
1342                         break;
1343                 default:
1344                         /*
1345                          * Don't mark sti_laddr_valid, as we cannot be
1346                          * sure that the returned address is the real
1347                          * bound address when talking to an unknown
1348                          * transport.
1349                          */
1350                         break;
1351                 }
1352         }
1353 
1354         if (nl7c != NULL) {
1355                 /* Register listen()er sonode pointer with NL7C */
1356                 nl7c_listener_addr(nl7c, so);
1357         }
1358 
1359         freemsg(mp);
1360 
1361 done:
1362         if (error) {
1363                 /* reset state & backlog to values held on entry */
1364                 if (clear_acceptconn_on_err == B_TRUE)
1365                         so->so_state &= ~SS_ACCEPTCONN;
1366                 if (restore_backlog_on_err == B_TRUE)
1367                         so->so_backlog = save_so_backlog;
1368 
1369                 if (unbind_on_err && so->so_state & SS_ISBOUND) {
1370                         int err;
1371 
1372                         err = sotpi_unbind(so, 0);
1373                         /* LINTED - statement has no consequent: if */
1374                         if (err) {
1375                                 eprintsoline(so, error);
1376                         } else {
1377                                 ASSERT(!(so->so_state & SS_ISBOUND));
1378                         }
1379                 }
1380         }
1381         if (!(flags & _SOBIND_LOCK_HELD)) {
1382                 so_unlock_single(so, SOLOCKED);
1383                 mutex_exit(&so->so_lock);
1384         } else {
1385                 ASSERT(MUTEX_HELD(&so->so_lock));
1386                 ASSERT(so->so_flag & SOLOCKED);
1387         }
1388         return (error);
1389 }
1390 
1391 /*
      * Bind the socket.
      *
      * Thin wrapper around sotpi_bindlisten(); its return value is passed
      * back unchanged.
      *
      * If _SOBIND_SOCKETPAIR is not set in flags, everything is handed to
      * sotpi_bindlisten() as-is with 0 as the fourth argument (the argument
      * position in which sotpi_listen() passes its backlog).
      *
      * If _SOBIND_SOCKETPAIR is set, that bit is removed from flags before
      * the call and the fourth argument is 1 instead of 0.
      */
1392 static int
1393 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1394     int flags, struct cred *cr)
1395 {
          /* Ordinary bind: flags pass through untouched, backlog is 0. */
1396         if ((flags & _SOBIND_SOCKETPAIR) == 0)
1397                 return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1398 
          /*
           * socketpair bind: sotpi_bindlisten() never sees the
           * _SOBIND_SOCKETPAIR bit; it is translated into a backlog of 1.
           * NOTE(review): presumably so the bound endpoint can take the
           * peer's connection - confirm against sotpi_bindlisten().
           */
1399         flags &= ~_SOBIND_SOCKETPAIR;
1400         return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1401 }
1402 
1403 /*
1404  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1405  * address, or when listen needs to unbind and bind.
1406  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1407  * so that a sobind can pick them up.
      *
      * Locking: the caller must hold so_lock and have SOLOCKED set; both are
      * asserted on entry and again before returning.  so_lock is dropped
      * while the M_FLUSH and T_UNBIND_REQ are sent and re-taken before
      * waiting for the acknowledgement.
      *
      * Returns EINVAL if the socket is not SS_ISBOUND, otherwise the error
      * from kstrputmsg() or sowaitokack(), or 0 on success.  Socket state is
      * only modified on success.
1408  */
1409 static int
1410 sotpi_unbind(struct sonode *so, int flags)
1411 {
1412         struct T_unbind_req     unbind_req;
1413         int                     error = 0;
1414         mblk_t                  *mp;
1415         sotpi_info_t            *sti = SOTOTPI(so);
1416 
1417         dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1418             (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1419 
1420         ASSERT(MUTEX_HELD(&so->so_lock));
1421         ASSERT(so->so_flag & SOLOCKED);
1422 
          /* Nothing to unbind. */
1423         if (!(so->so_state & SS_ISBOUND)) {
1424                 error = EINVAL;
1425                 eprintsoline(so, error);
1426                 goto done;
1427         }
1428 
          /*
           * so_lock is not held across the downstream calls below; SOLOCKED
           * remains set throughout.
           */
1429         mutex_exit(&so->so_lock);
1430 
1431         /*
1432          * Flush the read and write side (except stream head read queue)
1433          * and send down T_UNBIND_REQ.
1434          */
1435         (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1436 
1437         unbind_req.PRIM_type = T_UNBIND_REQ;
1438         mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1439             0, _ALLOC_SLEEP, CRED());
1440         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1441             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
          /* Re-take so_lock before any exit path; "done" asserts it is held. */
1442         mutex_enter(&so->so_lock);
1443         if (error) {
1444                 eprintsoline(so, error);
1445                 goto done;
1446         }
1447 
          /* Wait for the transport to acknowledge the T_UNBIND_REQ. */
1448         error = sowaitokack(so, T_UNBIND_REQ);
1449         if (error) {
1450                 eprintsoline(so, error);
1451                 goto done;
1452         }
1453 
1454         /*
1455          * Even if some TPI message (e.g. T_DISCON_IND) was received in
1456          * strsock_proto while the lock was dropped above, the unbind
1457          * is allowed to complete.
1458          */
1459         if (!(flags & _SOUNBIND_REBIND)) {
1460                 /*
1461                  * Clear out bound address.
1462                  */
1463                 vnode_t *vp;
1464 
                  /*
                   * Detach the bound vnode from the socket before releasing
                   * the hold on it.
                   */
1465                 if ((vp = sti->sti_ux_bound_vp) != NULL) {
1466                         sti->sti_ux_bound_vp = NULL;
1467                         vn_rele_stream(vp);
1468                 }
1469                 /* Clear out address */
1470                 sti->sti_laddr_len = 0;
1471         }
          /*
           * Done in both the rebind and non-rebind cases: the socket is no
           * longer bound or listening and the cached local address is
           * marked invalid.
           */
1472         so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1473         sti->sti_laddr_valid = 0;
1474 
1475 done:
1476 
1477         /* If the caller held the lock don't release it here */
1478         ASSERT(MUTEX_HELD(&so->so_lock));
1479         ASSERT(so->so_flag & SOLOCKED);
1480 
1481         return (error);
1482 }
1483 
1484 /*
1485  * listen on the socket.
1486  * For TPI conforming transports this has to first unbind with the transport
1487  * and then bind again using the new backlog.
      *
      * Returns:
      *   EOPNOTSUPP  the socket's sti_serv_type is T_CLTS.
      *   EINVAL      the socket is SS_ISCONNECTED, or it is an unbound
      *               AF_UNIX socket.
      *   0           on success, including the cases where nothing had to
      *               be done because the socket was already listening.
      *   otherwise   the error from sotpi_unbind() or sotpi_bindlisten().
      *
      * The first three checks are made without so_lock.  so_lock and SOLOCKED
      * are taken here and released before returning; the helpers are called
      * with _SOBIND_LOCK_HELD.
1488  */
1489 /* ARGSUSED */
1490 int
1491 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1492 {
1493         int             error = 0;
1494         sotpi_info_t    *sti = SOTOTPI(so);
1495 
1496         dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1497             (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1498 
1499         if (sti->sti_serv_type == T_CLTS)
1500                 return (EOPNOTSUPP);
1501 
1502         /*
1503          * If the socket is ready to accept connections already, then
1504          * return without doing anything.  This avoids a problem where
1505          * a second listen() call fails if a connection is pending and
1506          * leaves the socket unbound. Only when we are not unbinding
1507          * with the transport can we safely increase the backlog.
          *
          * That is: an already-listening socket returns here unless it is
          * AF_INET/AF_INET6 and solisten_tpi_tcp is zero, in which case
          * processing continues so the backlog can be changed.
1508          */
1509         if (so->so_state & SS_ACCEPTCONN &&
1510             !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1511             /*CONSTCOND*/
1512             !solisten_tpi_tcp))
1513                 return (0);
1514 
1515         if (so->so_state & SS_ISCONNECTED)
1516                 return (EINVAL);
1517 
1518         mutex_enter(&so->so_lock);
1519         so_lock_single(so);     /* Set SOLOCKED */
1520 
1521         /*
1522          * If the listen doesn't change the backlog we do nothing.
1523          * This avoids an EPROTO error from the transport.
1524          */
1525         if ((so->so_state & SS_ACCEPTCONN) &&
1526             so->so_backlog == backlog)
1527                 goto done;
1528 
1529         if (!(so->so_state & SS_ISBOUND)) {
1530                 /*
1531                  * Must have been explicitly bound in the UNIX domain.
1532                  */
1533                 if (so->so_family == AF_UNIX) {
1534                         error = EINVAL;
1535                         goto done;
1536                 }
                  /*
                   * Not yet bound: bind with no address (_SOBIND_UNSPEC)
                   * and the requested backlog in one step.
                   */
1537                 error = sotpi_bindlisten(so, NULL, 0, backlog,
1538                     _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1539         } else if (backlog > 0) {
1540                 /*
1541                  * AF_INET{,6} hack to avoid losing the port.
1542                  * Assumes that all AF_INET{,6} transports can handle a
1543                  * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1544                  * has already bound thus it is possible to avoid the unbind.
1545                  */
1546                 if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1547                     /*CONSTCOND*/
1548                     !solisten_tpi_tcp)) {
                          /*
                           * Unbind but keep the addresses (_SOUNBIND_REBIND)
                           * for the rebind that follows.
                           */
1549                         error = sotpi_unbind(so, _SOUNBIND_REBIND);
1550                         if (error)
1551                                 goto done;
1552                 }
1553                 error = sotpi_bindlisten(so, NULL, 0, backlog,
1554                     _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1555         } else {
                  /*
                   * Already bound and backlog <= 0: only the sonode is
                   * updated; no unbind or bind is issued on this path.
                   */
1556                 so->so_state |= SS_ACCEPTCONN;
1557                 so->so_backlog = backlog;
1558         }
1559         if (error)
1560                 goto done;
1561         ASSERT(so->so_state & SS_ACCEPTCONN);
1562 done:
1563         so_unlock_single(so, SOLOCKED);
1564         mutex_exit(&so->so_lock);
1565         return (error);
1566 }
1567 
1568 /*
1569  * Disconnect either a specified seqno or all (-1).
1570  * The former is used on listening sockets only.
1571  *
1572  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1573  * the current use of sodisconnect(seqno == -1) is only for shutdown
1574  * so there is no point (and potentially incorrect) to unbind.
      *
      * Locking: if _SODISCONNECT_LOCK_HELD is set in flags the caller must
      * already hold so_lock with SOLOCKED set, and both are still held on
      * return.  Otherwise they are acquired and released here.  In either
      * case so_lock is dropped while the T_DISCON_REQ is sent downstream.
      *
      * Returns EINVAL if the socket is neither connected, connecting nor
      * listening; otherwise the error from kstrputmsg() or sowaitokack(),
      * or 0 on success.  Socket state is only modified on success.
1575  */
1576 static int
1577 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1578 {
1579         struct T_discon_req     discon_req;
1580         int                     error = 0;
1581         mblk_t                  *mp;
1582 
1583         dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1584             (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1585 
1586         if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1587                 mutex_enter(&so->so_lock);
1588                 so_lock_single(so);     /* Set SOLOCKED */
1589         } else {
1590                 ASSERT(MUTEX_HELD(&so->so_lock));
1591                 ASSERT(so->so_flag & SOLOCKED);
1592         }
1593 
          /* Nothing to disconnect. */
1594         if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1595                 error = EINVAL;
1596                 eprintsoline(so, error);
1597                 goto done;
1598         }
1599 
          /*
           * so_lock is not held across the downstream calls below; SOLOCKED
           * remains set.  Note that the SS_ACCEPTCONN test that follows
           * reads so_state after so_lock has been dropped.
           */
1600         mutex_exit(&so->so_lock);
1601         /*
1602          * Flush the write side (unless this is a listener)
1603          * and then send down a T_DISCON_REQ.
1604          * (Don't flush on listener since it could flush {O_}T_CONN_RES
1605          * and other messages.)
1606          */
1607         if (!(so->so_state & SS_ACCEPTCONN))
1608                 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1609 
1610         discon_req.PRIM_type = T_DISCON_REQ;
1611         discon_req.SEQ_number = seqno;
1612         mp = soallocproto1(&discon_req, sizeof (discon_req),
1613             0, _ALLOC_SLEEP, CRED());
1614         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1615             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
          /* Re-take so_lock before any exit path; "done" expects it held. */
1616         mutex_enter(&so->so_lock);
1617         if (error) {
1618                 eprintsoline(so, error);
1619                 goto done;
1620         }
1621 
          /* Wait for the transport to acknowledge the T_DISCON_REQ. */
1622         error = sowaitokack(so, T_DISCON_REQ);
1623         if (error) {
1624                 eprintsoline(so, error);
1625                 goto done;
1626         }
1627         /*
1628          * Even if some TPI message (e.g. T_DISCON_IND) was received in
1629          * strsock_proto while the lock was dropped above, the disconnect
1630          * is allowed to complete. However, it is not possible to
1631          * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1632          */
1633         so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
          /* Neither cached address is valid after the disconnect. */
1634         SOTOTPI(so)->sti_laddr_valid = 0;
1635         SOTOTPI(so)->sti_faddr_valid = 0;
1636 done:
1637         if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1638                 so_unlock_single(so, SOLOCKED);
1639                 mutex_exit(&so->so_lock);
1640         } else {
1641                 /* If the caller held the lock don't release it here */
1642                 ASSERT(MUTEX_HELD(&so->so_lock));
1643                 ASSERT(so->so_flag & SOLOCKED);
1644         }
1645         return (error);
1646 }
1647 
1648 /* ARGSUSED */
1649 int
1650 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1651     struct sonode **nsop)
1652 {
1653         struct T_conn_ind       *conn_ind;
1654         struct T_conn_res       *conn_res;
1655         int                     error = 0;
1656         mblk_t                  *mp, *ack_mp;
1657         struct sonode           *nso;
1658         vnode_t                 *nvp;
1659         void                    *src;
1660         t_uscalar_t             srclen;
1661         void                    *opt;
1662         t_uscalar_t             optlen;
1663         t_scalar_t              PRIM_type;
1664         t_scalar_t              SEQ_number;
1665         size_t                  sinlen;
1666         sotpi_info_t            *sti = SOTOTPI(so);
1667         sotpi_info_t            *nsti;
1668 
1669         dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1670             (void *)so, fflag, (void *)nsop,
1671             pr_state(so->so_state, so->so_mode)));
1672 
1673         /*
1674          * Defer single-threading the accepting socket until
1675          * the T_CONN_IND has been received and parsed and the
1676          * new sonode has been opened.
1677          */
1678 
1679         /* Check that the socket is listening (SS_ACCEPTCONN is set) */
1680         if ((so->so_state & SS_ACCEPTCONN) == 0)
1681                 goto conn_bad;
1682 again:
1683         if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1684                 goto e_bad;
1685 
1686         ASSERT(mp != NULL);
1687         conn_ind = (struct T_conn_ind *)mp->b_rptr;
1688 
1689         /*
1690          * Save SEQ_number for error paths.
1691          */
1692         SEQ_number = conn_ind->SEQ_number;
1693 
1694         srclen = conn_ind->SRC_length;
1695         src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1696         if (src == NULL) {
1697                 error = EPROTO;
1698                 freemsg(mp);
1699                 eprintsoline(so, error);
1700                 goto disconnect_unlocked;
1701         }
1702         optlen = conn_ind->OPT_length;
1703         switch (so->so_family) {
1704         case AF_INET:
1705         case AF_INET6:
1706                 if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1707                         bcopy(mp->b_rptr + conn_ind->OPT_offset,
1708                             &opt, conn_ind->OPT_length);
1709                 } else {
1710                         /*
1711                          * The transport (in this case TCP) hasn't sent up
1712                          * a pointer to an instance for the accept fast-path.
1713                          * Disable fast-path completely because the call to
1714                          * sotpi_create() below would otherwise create an
1715                          * incomplete TCP instance, which would lead to
1716                          * problems when sockfs sends a normal T_CONN_RES
1717                          * message down the new stream.
1718                          */
1719                         if (sti->sti_direct) {
1720                                 int rval;
1721                                 /*
1722                                  * For consistency we inform tcp to disable
1723                                  * direct interface on the listener, though
1724                                  * we can certainly live without doing this
1725                                  * because no data will ever travel upstream
1726                                  * on the listening socket.
1727                                  */
1728                                 sti->sti_direct = 0;
1729                                 (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1730                                     0, 0, K_TO_K, cr, &rval);
1731                         }
1732                         opt = NULL;
1733                         optlen = 0;
1734                 }
1735                 break;
1736         case AF_UNIX:
1737         default:
1738                 if (optlen != 0) {
1739                         opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1740                             __TPI_ALIGN_SIZE);
1741                         if (opt == NULL) {
1742                                 error = EPROTO;
1743                                 freemsg(mp);
1744                                 eprintsoline(so, error);
1745                                 goto disconnect_unlocked;
1746                         }
1747                 }
1748                 if (so->so_family == AF_UNIX) {
1749                         if (!sti->sti_faddr_noxlate) {
1750                                 src = NULL;
1751                                 srclen = 0;
1752                         }
1753                         /* Extract src address from options */
1754                         if (optlen != 0)
1755                                 so_getopt_srcaddr(opt, optlen, &src, &srclen);
1756                 }
1757                 break;
1758         }
1759 
1760         /*
1761          * Create the new socket.
1762          */
1763         nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1764         if (nso == NULL) {
1765                 ASSERT(error != 0);
1766                 /*
1767                  * Accept can not fail with ENOBUFS. sotpi_create
1768                  * sleeps waiting for memory until a signal is caught
1769                  * so return EINTR.
1770                  */
1771                 freemsg(mp);
1772                 if (error == ENOBUFS)
1773                         error = EINTR;
1774                 goto e_disc_unl;
1775         }
1776         nvp = SOTOV(nso);
1777         nsti = SOTOTPI(nso);
1778 
1779 #ifdef DEBUG
1780         /*
1781          * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1782          * it's inherited early to allow debugging of the accept code itself.
1783          */
1784         nso->so_options |= so->so_options & SO_DEBUG;
1785 #endif /* DEBUG */
1786 
1787         /*
1788          * Save the SRC address from the T_CONN_IND
1789          * for getpeername to work on AF_UNIX and on transports that do not
1790          * support TI_GETPEERNAME.
1791          *
1792          * NOTE: AF_UNIX NUL termination is ensured by the sender's
1793          * copyin_name().
1794          */
1795         if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1796                 error = EINVAL;
1797                 freemsg(mp);
1798                 eprintsoline(so, error);
1799                 goto disconnect_vp_unlocked;
1800         }
1801         nsti->sti_faddr_len = (socklen_t)srclen;
1802         ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1803         bcopy(src, nsti->sti_faddr_sa, srclen);
1804         nsti->sti_faddr_valid = 1;
1805 
1806         /*
1807          * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1808          */
1809         if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1810             (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1811                 cred_t  *cr;
1812                 pid_t   cpid;
1813 
1814                 cr = msg_getcred(mp, &cpid);
1815                 if (cr != NULL) {
1816                         crhold(cr);
1817                         nso->so_peercred = cr;
1818                         nso->so_cpid = cpid;
1819                 }
1820                 freemsg(mp);
1821 
1822                 mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1823                     sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1824                 if (mp == NULL) {
1825                         /*
1826                          * Accept can not fail with ENOBUFS.
1827                          * A signal was caught so return EINTR.
1828                          */
1829                         error = EINTR;
1830                         eprintsoline(so, error);
1831                         goto disconnect_vp_unlocked;
1832                 }
1833                 conn_res = (struct T_conn_res *)mp->b_rptr;
1834         } else {
1835                 /*
1836                  * For efficiency reasons we use msg_extractcred; no crhold
1837                  * needed since db_credp is cleared (i.e., we move the cred
1838                  * from the message to so_peercred).
1839                  */
1840                 nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1841 
1842                 mp->b_rptr = DB_BASE(mp);
1843                 conn_res = (struct T_conn_res *)mp->b_rptr;
1844                 mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1845 
1846                 mblk_setcred(mp, cr, curproc->p_pid);
1847         }
1848 
1849         /*
1850          * New socket must be bound at least in sockfs and, except for AF_INET,
1851          * (or AF_INET6) it also has to be bound in the transport provider.
1852          * We set the local address in the sonode from the T_OK_ACK of the
1853          * T_CONN_RES. For this reason the address we bind to here isn't
1854          * important.
1855          */
1856         if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1857             /*CONSTCOND*/
1858             nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1859                 /*
1860                  * Optimization for AF_INET{,6} transports
1861                  * that can handle a T_CONN_RES without being bound.
1862                  */
1863                 mutex_enter(&nso->so_lock);
1864                 so_automatic_bind(nso);
1865                 mutex_exit(&nso->so_lock);
1866         } else {
1867                 /* Perform NULL bind with the transport provider. */
1868                 if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1869                     cr)) != 0) {
1870                         ASSERT(error != ENOBUFS);
1871                         freemsg(mp);
1872                         eprintsoline(nso, error);
1873                         goto disconnect_vp_unlocked;
1874                 }
1875         }
1876 
1877         /*
1878          * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1879          * so that any data arriving on the new socket will cause the
1880          * appropriate signals to be delivered for the new socket.
1881          *
1882          * No other thread (except strsock_proto and strsock_misc)
1883          * can access the new socket thus we relax the locking.
1884          */
1885         nso->so_pgrp = so->so_pgrp;
1886         nso->so_state |= so->so_state & SS_ASYNC;
1887         nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1888 
1889         if (nso->so_pgrp != 0) {
1890                 if ((error = so_set_events(nso, nvp, cr)) != 0) {
1891                         eprintsoline(nso, error);
1892                         error = 0;
1893                         nso->so_pgrp = 0;
1894                 }
1895         }
1896 
1897         /*
1898          * Make note of the socket level options. TCP and IP level options
1899          * are already inherited. We could do all this after accept is
1900          * successful but doing it here simplifies code and no harm done
1901          * for error case.
1902          */
1903         nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1904             SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1905             SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1906         nso->so_sndbuf = so->so_sndbuf;
1907         nso->so_rcvbuf = so->so_rcvbuf;
1908         if (nso->so_options & SO_LINGER)
1909                 nso->so_linger = so->so_linger;
1910 
1911         /*
1912          * Note that the following sti_direct code path should be
1913          * removed once we are confident that the direct sockets
1914          * do not result in any degradation.
1915          */
1916         if (sti->sti_direct) {
1917 
1918                 ASSERT(opt != NULL);
1919 
1920                 conn_res->OPT_length = optlen;
1921                 conn_res->OPT_offset = MBLKL(mp);
1922                 bcopy(&opt, mp->b_wptr, optlen);
1923                 mp->b_wptr += optlen;
1924                 conn_res->PRIM_type = T_CONN_RES;
1925                 conn_res->ACCEPTOR_id = 0;
1926                 PRIM_type = T_CONN_RES;
1927 
1928                 /* Send down the T_CONN_RES on acceptor STREAM */
1929                 error = kstrputmsg(SOTOV(nso), mp, NULL,
1930                     0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1931                 if (error) {
1932                         mutex_enter(&so->so_lock);
1933                         so_lock_single(so);
1934                         eprintsoline(so, error);
1935                         goto disconnect_vp;
1936                 }
1937                 mutex_enter(&nso->so_lock);
1938                 error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1939                     (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1940                 if (error) {
1941                         mutex_exit(&nso->so_lock);
1942                         mutex_enter(&so->so_lock);
1943                         so_lock_single(so);
1944                         eprintsoline(so, error);
1945                         goto disconnect_vp;
1946                 }
1947                 if (nso->so_family == AF_INET) {
1948                         sin_t *sin;
1949 
1950                         sin = (sin_t *)(ack_mp->b_rptr +
1951                             sizeof (struct T_ok_ack));
1952                         bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1953                         nsti->sti_laddr_len = sizeof (sin_t);
1954                 } else {
1955                         sin6_t *sin6;
1956 
1957                         sin6 = (sin6_t *)(ack_mp->b_rptr +
1958                             sizeof (struct T_ok_ack));
1959                         bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1960                         nsti->sti_laddr_len = sizeof (sin6_t);
1961                 }
1962                 freemsg(ack_mp);
1963 
1964                 nso->so_state |= SS_ISCONNECTED;
1965                 nso->so_proto_handle = (sock_lower_handle_t)opt;
1966                 nsti->sti_laddr_valid = 1;
1967 
1968                 if (sti->sti_nl7c_flags & NL7C_ENABLED) {
1969                         /*
1970                          * A NL7C marked listen()er so the new socket
1971                          * inherits the listen()er's NL7C state, except
1972                          * for NL7C_POLLIN.
1973                          *
1974                          * Only call NL7C to process the new socket if
1975                          * the listen socket allows blocking i/o.
1976                          */
1977                         nsti->sti_nl7c_flags =
1978                             sti->sti_nl7c_flags & (~NL7C_POLLIN);
1979                         if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1980                                 /*
1981                                  * Nonblocking accept() just make it
1982                                  * persist to defer processing to the
1983                                  * read-side syscall (e.g. read).
1984                                  */
1985                                 nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
1986                         } else if (nl7c_process(nso, B_FALSE)) {
1987                                 /*
1988                                  * NL7C has completed processing on the
1989                                  * socket, close the socket and back to
1990                                  * the top to await the next T_CONN_IND.
1991                                  */
1992                                 mutex_exit(&nso->so_lock);
1993                                 (void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1994                                     cr, NULL);
1995                                 VN_RELE(nvp);
1996                                 goto again;
1997                         }
1998                         /* Pass the new socket out */
1999                 }
2000 
2001                 mutex_exit(&nso->so_lock);
2002 
2003                 /*
2004                  * It's possible, through the use of autopush for example,
2005                  * that the acceptor stream may not support sti_direct
2006                  * semantics. If the new socket does not support sti_direct
2007                  * we issue a _SIOCSOCKFALLBACK to inform the transport
2008                  * as we would in the I_PUSH case.
2009                  */
2010                 if (nsti->sti_direct == 0) {
2011                         int     rval;
2012 
2013                         if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2014                             0, 0, K_TO_K, cr, &rval)) != 0) {
2015                                 mutex_enter(&so->so_lock);
2016                                 so_lock_single(so);
2017                                 eprintsoline(so, error);
2018                                 goto disconnect_vp;
2019                         }
2020                 }
2021 
2022                 /*
2023                  * Pass out new socket.
2024                  */
2025                 if (nsop != NULL)
2026                         *nsop = nso;
2027 
2028                 return (0);
2029         }
2030 
2031         /*
2032          * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2033          * which don't support the FireEngine accept fast-path. It is also
2034          * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2035          * again. Neither sockfs nor TCP attempt to find out if some other
2036          * random module has been inserted in between (in which case we
2037          * should follow TLI accept behaviour). We blindly assume the worst
2038          * case and revert back to old behaviour i.e. TCP will not send us
2039          * any option (eager) and the accept should happen on the listener
2040          * queue. Any queued T_conn_ind have already got their options removed
2041          * by so_sock2_stream() when "sockmod" was I_POP'd.
2042          */
2043         /*
2044          * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2045          */
2046         if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2047 #ifdef  _ILP32
2048                 queue_t *q;
2049 
2050                 /*
2051                  * Find read queue in driver
2052                  * Can safely do this since we "own" nso/nvp.
2053                  */
2054                 q = strvp2wq(nvp)->q_next;
2055                 while (SAMESTR(q))
2056                         q = q->q_next;
2057                 q = RD(q);
2058                 conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2059 #else
2060                 conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2061 #endif  /* _ILP32 */
2062                 conn_res->PRIM_type = O_T_CONN_RES;
2063                 PRIM_type = O_T_CONN_RES;
2064         } else {
2065                 conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2066                 conn_res->PRIM_type = T_CONN_RES;
2067                 PRIM_type = T_CONN_RES;
2068         }
2069         conn_res->SEQ_number = SEQ_number;
2070         conn_res->OPT_length = 0;
2071         conn_res->OPT_offset = 0;
2072 
2073         mutex_enter(&so->so_lock);
2074         so_lock_single(so);     /* Set SOLOCKED */
2075         mutex_exit(&so->so_lock);
2076 
2077         error = kstrputmsg(SOTOV(so), mp, NULL,
2078             0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2079         mutex_enter(&so->so_lock);
2080         if (error) {
2081                 eprintsoline(so, error);
2082                 goto disconnect_vp;
2083         }
2084         error = sowaitprim(so, PRIM_type, T_OK_ACK,
2085             (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2086         if (error) {
2087                 eprintsoline(so, error);
2088                 goto disconnect_vp;
2089         }
2090         mutex_exit(&so->so_lock);
2091         /*
2092          * If there is a sin/sin6 appended onto the T_OK_ACK use
2093          * that to set the local address. If this is not present
2094          * then we zero out the address and don't set the
2095          * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2096          * the pathname from the listening socket.
2097          * In the case where this is TCP or an AF_UNIX socket the
2098          * client side may have queued data or a T_ORDREL in the
2099          * transport. Having now sent the T_CONN_RES we may receive
2100          * those queued messages at any time. Hold the acceptor
2101          * so_lock until its state and laddr are finalized.
2102          */
2103         mutex_enter(&nso->so_lock);
2104         sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2105         if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2106             MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2107                 ack_mp->b_rptr += sizeof (struct T_ok_ack);
2108                 bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2109                 nsti->sti_laddr_len = sinlen;
2110                 nsti->sti_laddr_valid = 1;
2111         } else if (nso->so_family == AF_UNIX) {
2112                 ASSERT(so->so_family == AF_UNIX);
2113                 nsti->sti_laddr_len = sti->sti_laddr_len;
2114                 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2115                 bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2116                     nsti->sti_laddr_len);
2117                 nsti->sti_laddr_valid = 1;
2118         } else {
2119                 nsti->sti_laddr_len = sti->sti_laddr_len;
2120                 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2121                 bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2122                 nsti->sti_laddr_sa->sa_family = nso->so_family;
2123         }
2124         nso->so_state |= SS_ISCONNECTED;
2125         mutex_exit(&nso->so_lock);
2126 
2127         freemsg(ack_mp);
2128 
2129         mutex_enter(&so->so_lock);
2130         so_unlock_single(so, SOLOCKED);
2131         mutex_exit(&so->so_lock);
2132 
2133         /*
2134          * Pass out new socket.
2135          */
2136         if (nsop != NULL)
2137                 *nsop = nso;
2138 
2139         return (0);
2140 
2141 
2142 eproto_disc_unl:
2143         error = EPROTO;
2144 e_disc_unl:
2145         eprintsoline(so, error);
2146         goto disconnect_unlocked;
2147 
2148 pr_disc_vp_unl:
2149         eprintsoline(so, error);
2150 disconnect_vp_unlocked:
2151         (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2152         VN_RELE(nvp);
2153 disconnect_unlocked:
2154         (void) sodisconnect(so, SEQ_number, 0);
2155         return (error);
2156 
2157 pr_disc_vp:
2158         eprintsoline(so, error);
2159 disconnect_vp:
2160         (void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2161         so_unlock_single(so, SOLOCKED);
2162         mutex_exit(&so->so_lock);
2163         (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2164         VN_RELE(nvp);
2165         return (error);
2166 
2167 conn_bad:       /* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2168         error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2169             ? EOPNOTSUPP : EINVAL;
2170 e_bad:
2171         eprintsoline(so, error);
2172         return (error);
2173 }
2174 
2175 /*
2176  * connect a socket.
2177  *
2178  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2179  * unconnect (by specifying a null address).
2180  */
2181 int
2182 sotpi_connect(struct sonode *so,
2183     struct sockaddr *name,
2184     socklen_t namelen,
2185     int fflag,
2186     int flags,
2187     struct cred *cr)
2188 {
2189         struct T_conn_req       conn_req;
2190         int                     error = 0;
2191         mblk_t                  *mp;
2192         void                    *src;
2193         socklen_t               srclen;
2194         void                    *addr;
2195         socklen_t               addrlen;
2196         boolean_t               need_unlock;
2197         sotpi_info_t            *sti = SOTOTPI(so);
2198 
2199         dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2200             (void *)so, (void *)name, namelen, fflag, flags,
2201             pr_state(so->so_state, so->so_mode)));
2202 
2203         /*
2204          * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2205          * avoid sleeping for memory with SOLOCKED held.
2206          * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2207          * + sizeof (struct T_opthdr).
2208          * (the AF_UNIX so_ux_addr_xlate() does not make the address
2209          * exceed sti_faddr_maxlen).
2210          */
2211         mp = soallocproto(sizeof (struct T_conn_req) +
2212             2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2213             cr);
2214         if (mp == NULL) {
2215                 /*
2216                  * Connect can not fail with ENOBUFS. A signal was
2217                  * caught so return EINTR.
2218                  */
2219                 error = EINTR;
2220                 eprintsoline(so, error);
2221                 return (error);
2222         }
2223 
2224         mutex_enter(&so->so_lock);
2225         /*
2226          * Make sure there is a preallocated T_unbind_req message
2227          * before any binding. This message is allocated when the
2228          * socket is created. Since another thread can consume
2229          * so_unbind_mp by the time we return from so_lock_single(),
2230          * we should check the availability of so_unbind_mp after
2231          * we return from so_lock_single().
2232          */
2233 
2234         so_lock_single(so);     /* Set SOLOCKED */
2235         need_unlock = B_TRUE;
2236 
2237         if (sti->sti_unbind_mp == NULL) {
2238                 dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2239                 /* NOTE: holding so_lock while sleeping */
2240                 sti->sti_unbind_mp =
2241                     soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2242                 if (sti->sti_unbind_mp == NULL) {
2243                         error = EINTR;
2244                         goto done;
2245                 }
2246         }
2247 
2248         /*
2249          * Can't have done a listen before connecting.
2250          */
2251         if (so->so_state & SS_ACCEPTCONN) {
2252                 error = EOPNOTSUPP;
2253                 goto done;
2254         }
2255 
2256         /*
2257          * Must be bound with the transport
2258          */
2259         if (!(so->so_state & SS_ISBOUND)) {
2260                 if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2261                     /*CONSTCOND*/
2262                     so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2263                         /*
2264                          * Optimization for AF_INET{,6} transports
2265                          * that can handle a T_CONN_REQ without being bound.
2266                          */
2267                         so_automatic_bind(so);
2268                 } else {
2269                         error = sotpi_bind(so, NULL, 0,
2270                             _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2271                         if (error)
2272                                 goto done;
2273                 }
2274                 ASSERT(so->so_state & SS_ISBOUND);
2275                 flags |= _SOCONNECT_DID_BIND;
2276         }
2277 
2278         /*
2279          * Handle a connect to a name parameter of type AF_UNSPEC like a
2280          * connect to a null address. This is the portable method to
2281          * unconnect a socket.
2282          */
2283         if ((namelen >= sizeof (sa_family_t)) &&
2284             (name->sa_family == AF_UNSPEC)) {
2285                 name = NULL;
2286                 namelen = 0;
2287         }
2288 
2289         /*
2290          * Check that we are not already connected.
2291          * A connection-oriented socket cannot be reconnected.
2292          * A connected connection-less socket can be
2293          * - connected to a different address by a subsequent connect
2294          * - "unconnected" by a connect to the NULL address
2295          */
2296         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2297                 ASSERT(!(flags & _SOCONNECT_DID_BIND));
2298                 if (so->so_mode & SM_CONNREQUIRED) {
2299                         /* Connection-oriented socket */
2300                         error = so->so_state & SS_ISCONNECTED ?
2301                             EISCONN : EALREADY;
2302                         goto done;
2303                 }
2304                 /* Connection-less socket */
2305                 if (name == NULL) {
2306                         /*
2307                          * Remove the connected state and clear SO_DGRAM_ERRIND
2308                          * since it was set when the socket was connected.
2309                          * If this is UDP also send down a T_DISCON_REQ.
2310                          */
2311                         int val;
2312 
2313                         if ((so->so_family == AF_INET ||
2314                             so->so_family == AF_INET6) &&
2315                             (so->so_type == SOCK_DGRAM ||
2316                             so->so_type == SOCK_RAW) &&
2317                             /*CONSTCOND*/
2318                             !soconnect_tpi_udp) {
2319                                 /* XXX What about implicitly unbinding here? */
2320                                 error = sodisconnect(so, -1,
2321                                     _SODISCONNECT_LOCK_HELD);
2322                         } else {
2323                                 so->so_state &=
2324                                     ~(SS_ISCONNECTED | SS_ISCONNECTING);
2325                                 sti->sti_faddr_valid = 0;
2326                                 sti->sti_faddr_len = 0;
2327                         }
2328 
2329                         /* Remove SOLOCKED since setsockopt will grab it */
2330                         so_unlock_single(so, SOLOCKED);
2331                         mutex_exit(&so->so_lock);
2332 
2333                         val = 0;
2334                         (void) sotpi_setsockopt(so, SOL_SOCKET,
2335                             SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2336                             cr);
2337 
2338                         mutex_enter(&so->so_lock);
2339                         so_lock_single(so);     /* Set SOLOCKED */
2340                         goto done;
2341                 }
2342         }
2343         ASSERT(so->so_state & SS_ISBOUND);
2344 
2345         if (name == NULL || namelen == 0) {
2346                 error = EINVAL;
2347                 goto done;
2348         }
2349         /*
2350          * Mark the socket if sti_faddr_sa represents the transport level
2351          * address.
2352          */
2353         if (flags & _SOCONNECT_NOXLATE) {
2354                 struct sockaddr_ux      *soaddr_ux;
2355 
2356                 ASSERT(so->so_family == AF_UNIX);
2357                 if (namelen != sizeof (struct sockaddr_ux)) {
2358                         error = EINVAL;
2359                         goto done;
2360                 }
2361                 soaddr_ux = (struct sockaddr_ux *)name;
2362                 name = (struct sockaddr *)&soaddr_ux->sou_addr;
2363                 namelen = sizeof (soaddr_ux->sou_addr);
2364                 sti->sti_faddr_noxlate = 1;
2365         }
2366 
2367         /*
2368          * Length and family checks.
2369          */
2370         error = so_addr_verify(so, name, namelen);
2371         if (error)
2372                 goto bad;
2373 
2374         /*
2375          * Save foreign address. Needed for AF_UNIX as well as
2376          * transport providers that do not support TI_GETPEERNAME.
2377          * Also used for cached foreign address for TCP and UDP.
2378          */
2379         if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2380                 error = EINVAL;
2381                 goto done;
2382         }
2383         sti->sti_faddr_len = (socklen_t)namelen;
2384         ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2385         bcopy(name, sti->sti_faddr_sa, namelen);
2386         sti->sti_faddr_valid = 1;
2387 
2388         if (so->so_family == AF_UNIX) {
2389                 if (sti->sti_faddr_noxlate) {
2390                         /*
2391                          * sti_faddr is a transport-level address, so
2392                          * don't pass it as an option.  Do save it in
2393                          * sti_ux_faddr, used for connected DG send.
2394                          */
2395                         src = NULL;
2396                         srclen = 0;
2397                         addr = sti->sti_faddr_sa;
2398                         addrlen = (t_uscalar_t)sti->sti_faddr_len;
2399                         bcopy(addr, &sti->sti_ux_faddr,
2400                             sizeof (sti->sti_ux_faddr));
2401                 } else {
2402                         /*
2403                          * Pass the sockaddr_un source address as an option
2404                          * and translate the remote address.
2405                          * Holding so_lock thus sti_laddr_sa can not change.
2406                          */
2407                         src = sti->sti_laddr_sa;
2408                         srclen = (t_uscalar_t)sti->sti_laddr_len;
2409                         dprintso(so, 1,
2410                             ("sotpi_connect UNIX: srclen %d, src %p\n",
2411                             srclen, src));
2412                         /*
2413                          * Translate the destination address into our
2414                          * internal form, and save it in sti_ux_faddr.
2415                          * After this call, addr==&sti->sti_ux_taddr,
2416                          * and we copy that to sti->sti_ux_faddr so
2417                          * we save the connected peer address.
2418                          */
2419                         error = so_ux_addr_xlate(so,
2420                             sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2421                             (flags & _SOCONNECT_XPG4_2),
2422                             &addr, &addrlen);
2423                         if (error)
2424                                 goto bad;
2425                         bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2426                             sizeof (sti->sti_ux_faddr));
2427                 }
2428         } else {
2429                 addr = sti->sti_faddr_sa;
2430                 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2431                 src = NULL;
2432                 srclen = 0;
2433         }
2434         /*
2435          * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2436          * option which asks the transport provider to send T_UDERR_IND
2437          * messages. These T_UDERR_IND messages are used to return connected
2438          * style errors (e.g. ECONNRESET) for connected datagram sockets.
2439          *
2440          * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2441          * we send down a T_CONN_REQ. This is needed to let the
2442          * transport assign a local address that is consistent with
2443          * the remote address. Applications depend on a getsockname()
2444          * after a connect() to retrieve the "source" IP address for
2445          * the connected socket.  Invalidate the cached local address
2446          * to force getsockname() to enquire of the transport.
2447          */
2448         if (!(so->so_mode & SM_CONNREQUIRED)) {
2449                 /*
2450                  * Datagram socket.
2451                  */
2452                 int32_t val;
2453 
2454                 so_unlock_single(so, SOLOCKED);
2455                 mutex_exit(&so->so_lock);
2456 
2457                 val = 1;
2458                 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2459                     &val, (t_uscalar_t)sizeof (val), cr);
2460 
2461                 mutex_enter(&so->so_lock);
2462                 so_lock_single(so);     /* Set SOLOCKED */
2463                 if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2464                     (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2465                     soconnect_tpi_udp) {
2466                         soisconnected(so);
2467                         goto done;
2468                 }
2469                 /*
2470                  * Send down T_CONN_REQ etc.
2471                  * Clear fflag to avoid returning EWOULDBLOCK.
2472                  */
2473                 fflag = 0;
2474                 ASSERT(so->so_family != AF_UNIX);
2475                 sti->sti_laddr_valid = 0;
2476         } else if (sti->sti_laddr_len != 0) {
2477                 /*
2478                  * If the local address or port was "any" then it may be
2479                  * changed by the transport as a result of the
2480                  * connect.  Invalidate the cached version if we have one.
2481                  */
2482                 switch (so->so_family) {
2483                 case AF_INET:
2484                         ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2485                         if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2486                             INADDR_ANY ||
2487                             ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2488                                 sti->sti_laddr_valid = 0;
2489                         break;
2490 
2491                 case AF_INET6:
2492                         ASSERT(sti->sti_laddr_len ==
2493                             (socklen_t)sizeof (sin6_t));
2494                         if (IN6_IS_ADDR_UNSPECIFIED(
2495                             &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2496                             IN6_IS_ADDR_V4MAPPED_ANY(
2497                             &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2498                             ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2499                                 sti->sti_laddr_valid = 0;
2500                         break;
2501 
2502                 default:
2503                         break;
2504                 }
2505         }
2506 
2507         /*
2508          * Check for failure of an earlier call
2509          */
2510         if (so->so_error != 0)
2511                 goto so_bad;
2512 
2513         /*
2514          * Send down T_CONN_REQ. Message was allocated above.
2515          */
2516         conn_req.PRIM_type = T_CONN_REQ;
2517         conn_req.DEST_length = addrlen;
2518         conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2519         if (srclen == 0) {
2520                 conn_req.OPT_length = 0;
2521                 conn_req.OPT_offset = 0;
2522                 soappendmsg(mp, &conn_req, sizeof (conn_req));
2523                 soappendmsg(mp, addr, addrlen);
2524         } else {
2525                 /*
2526                  * There is a AF_UNIX sockaddr_un to include as a source
2527                  * address option.
2528                  */
2529                 struct T_opthdr toh;
2530 
2531                 toh.level = SOL_SOCKET;
2532                 toh.name = SO_SRCADDR;
2533                 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2534                 toh.status = 0;
2535                 conn_req.OPT_length =
2536                     (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2537                 conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2538                     _TPI_ALIGN_TOPT(addrlen));
2539 
2540                 soappendmsg(mp, &conn_req, sizeof (conn_req));
2541                 soappendmsg(mp, addr, addrlen);
2542                 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2543                 soappendmsg(mp, &toh, sizeof (toh));
2544                 soappendmsg(mp, src, srclen);
2545                 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2546                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2547         }
2548         /*
2549          * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2550          * in order to have the right state when the T_CONN_CON shows up.
2551          */
2552         soisconnecting(so);
2553         mutex_exit(&so->so_lock);
2554 
2555         if (AU_AUDITING())
2556                 audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2557 
2558         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2559             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2560         mp = NULL;
2561         mutex_enter(&so->so_lock);
2562         if (error != 0)
2563                 goto bad;
2564 
2565         if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2566                 goto bad;
2567 
2568         /* Allow other threads to access the socket */
2569         so_unlock_single(so, SOLOCKED);
2570         need_unlock = B_FALSE;
2571 
2572         /*
2573          * Wait until we get a T_CONN_CON or an error
2574          */
2575         if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2576                 so_lock_single(so);     /* Set SOLOCKED */
2577                 need_unlock = B_TRUE;
2578         }
2579 
2580 done:
2581         freemsg(mp);
2582         switch (error) {
2583         case EINPROGRESS:
2584         case EALREADY:
2585         case EISCONN:
2586         case EINTR:
2587                 /* Non-fatal errors */
2588                 sti->sti_laddr_valid = 0;
2589                 /* FALLTHRU */
2590         case 0:
2591                 break;
2592         default:
2593                 ASSERT(need_unlock);
2594                 /*
2595                  * Fatal errors: clear SS_ISCONNECTING in case it was set,
2596                  * and invalidate local-address cache
2597                  */
2598                 so->so_state &= ~SS_ISCONNECTING;
2599                 sti->sti_laddr_valid = 0;
2600                 /* A discon_ind might have already unbound us */
2601                 if ((flags & _SOCONNECT_DID_BIND) &&
2602                     (so->so_state & SS_ISBOUND)) {
2603                         int err;
2604 
2605                         err = sotpi_unbind(so, 0);
2606                         /* LINTED - statement has no conseq */
2607                         if (err) {
2608                                 eprintsoline(so, err);
2609                         }
2610                 }
2611                 break;
2612         }
2613         if (need_unlock)
2614                 so_unlock_single(so, SOLOCKED);
2615         mutex_exit(&so->so_lock);
2616         return (error);
2617 
2618 so_bad: error = sogeterr(so, B_TRUE);
2619 bad:    eprintsoline(so, error);
2620         goto done;
2621 }
2622 
2623 /* ARGSUSED */
2624 int
2625 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2626 {
2627         struct T_ordrel_req     ordrel_req;
2628         mblk_t                  *mp;
2629         uint_t                  old_state, state_change;
2630         int                     error = 0;
2631         sotpi_info_t            *sti = SOTOTPI(so);
2632 
2633         dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2634             (void *)so, how, pr_state(so->so_state, so->so_mode)));
2635 
2636         mutex_enter(&so->so_lock);
2637         so_lock_single(so);     /* Set SOLOCKED */
2638 
2639         /*
2640          * SunOS 4.X has no check for datagram sockets.
2641          * 5.X checks that it is connected (ENOTCONN)
2642          * X/Open requires that we check the connected state.
2643          */
2644         if (!(so->so_state & SS_ISCONNECTED)) {
2645                 if (!xnet_skip_checks) {
2646                         error = ENOTCONN;
2647                         if (xnet_check_print) {
2648                                 printf("sockfs: X/Open shutdown check "
2649                                     "caused ENOTCONN\n");
2650                         }
2651                 }
2652                 goto done;
2653         }
2654         /*
2655          * Record the current state and then perform any state changes.
2656          * Then use the difference between the old and new states to
2657          * determine which messages need to be sent.
2658          * This prevents e.g. duplicate T_ORDREL_REQ when there are
2659          * duplicate calls to shutdown().
2660          */
2661         old_state = so->so_state;
2662 
2663         switch (how) {
2664         case 0:
2665                 socantrcvmore(so);
2666                 break;
2667         case 1:
2668                 socantsendmore(so);
2669                 break;
2670         case 2:
2671                 socantsendmore(so);
2672                 socantrcvmore(so);
2673                 break;
2674         default:
2675                 error = EINVAL;
2676                 goto done;
2677         }
2678 
2679         /*
2680          * Assumes that the SS_CANT* flags are never cleared in the above code.
2681          */
2682         state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2683             (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2684         ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2685 
2686         switch (state_change) {
2687         case 0:
2688                 dprintso(so, 1,
2689                     ("sotpi_shutdown: nothing to send in state 0x%x\n",
2690                     so->so_state));
2691                 goto done;
2692 
2693         case SS_CANTRCVMORE:
2694                 mutex_exit(&so->so_lock);
2695                 strseteof(SOTOV(so), 1);
2696                 /*
2697                  * strseteof takes care of read side wakeups,
2698                  * pollwakeups, and signals.
2699                  */
2700                 /*
2701                  * Get the read lock before flushing data to avoid problems
2702                  * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2703                  */
2704                 mutex_enter(&so->so_lock);
2705                 (void) so_lock_read(so, 0);     /* Set SOREADLOCKED */
2706                 mutex_exit(&so->so_lock);
2707 
2708                 /* Flush read side queue */
2709                 strflushrq(SOTOV(so), FLUSHALL);
2710 
2711                 mutex_enter(&so->so_lock);
2712                 so_unlock_read(so);             /* Clear SOREADLOCKED */
2713                 break;
2714 
2715         case SS_CANTSENDMORE:
2716                 mutex_exit(&so->so_lock);
2717                 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2718                 mutex_enter(&so->so_lock);
2719                 break;
2720 
2721         case SS_CANTSENDMORE|SS_CANTRCVMORE:
2722                 mutex_exit(&so->so_lock);
2723                 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2724                 strseteof(SOTOV(so), 1);
2725                 /*
2726                  * strseteof takes care of read side wakeups,
2727                  * pollwakeups, and signals.
2728                  */
2729                 /*
2730                  * Get the read lock before flushing data to avoid problems
2731                  * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2732                  */
2733                 mutex_enter(&so->so_lock);
2734                 (void) so_lock_read(so, 0);     /* Set SOREADLOCKED */
2735                 mutex_exit(&so->so_lock);
2736 
2737                 /* Flush read side queue */
2738                 strflushrq(SOTOV(so), FLUSHALL);
2739 
2740                 mutex_enter(&so->so_lock);
2741                 so_unlock_read(so);             /* Clear SOREADLOCKED */
2742                 break;
2743         }
2744 
2745         ASSERT(MUTEX_HELD(&so->so_lock));
2746 
2747         /*
2748          * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2749          * was set due to this call and the new state has both of them set:
2750          *      Send the AF_UNIX close indication
2751          *      For T_COTS send a discon_ind
2752          *
2753          * If cantsend was set due to this call:
2754          *      For T_COTSORD send an ordrel_ind
2755          *
2756          * Note that for T_CLTS there is no message sent here.
2757          */
2758         if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2759             (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2760                 /*
2761                  * For SunOS 4.X compatibility we tell the other end
2762                  * that we are unable to receive at this point.
2763                  */
2764                 if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2765                         so_unix_close(so);
2766 
2767                 if (sti->sti_serv_type == T_COTS)
2768                         error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2769         }
2770         if ((state_change & SS_CANTSENDMORE) &&
2771             (sti->sti_serv_type == T_COTS_ORD)) {
2772                 /* Send an orderly release */
2773                 ordrel_req.PRIM_type = T_ORDREL_REQ;
2774 
2775                 mutex_exit(&so->so_lock);
2776                 mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2777                     0, _ALLOC_SLEEP, cr);
2778                 /*
2779                  * Send down the T_ORDREL_REQ even if there is flow control.
2780                  * This prevents shutdown from blocking.
2781                  * Note that there is no T_OK_ACK for ordrel_req.
2782                  */
2783                 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2784                     MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2785                 mutex_enter(&so->so_lock);
2786                 if (error) {
2787                         eprintsoline(so, error);
2788                         goto done;
2789                 }
2790         }
2791 
2792 done:
2793         so_unlock_single(so, SOLOCKED);
2794         mutex_exit(&so->so_lock);
2795         return (error);
2796 }
2797 
2798 /*
2799  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2800  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2801  * that we have closed.
2802  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2803  * T_UNITDATA_REQ containing the same option.
2804  *
 * For SOCK_DGRAM half-connections (somebody connected to this end
 * but this end is not connected) we don't know where to send any
 * SO_UNIX_CLOSE.
2808  *
2809  * We have to ignore stream head errors just in case there has been
2810  * a shutdown(output).
2811  * Ignore any flow control to try to get the message more quickly to the peer.
2812  * While locally ignoring flow control solves the problem when there
2813  * is only the loopback transport on the stream it would not provide
2814  * the correct AF_UNIX socket semantics when one or more modules have
2815  * been pushed.
2816  */
2817 void
2818 so_unix_close(struct sonode *so)
2819 {
2820         struct T_opthdr toh;
2821         mblk_t          *mp;
2822         sotpi_info_t    *sti = SOTOTPI(so);
2823 
2824         ASSERT(MUTEX_HELD(&so->so_lock));
2825 
2826         ASSERT(so->so_family == AF_UNIX);
2827 
2828         if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2829             (SS_ISCONNECTED|SS_ISBOUND))
2830                 return;
2831 
2832         dprintso(so, 1, ("so_unix_close(%p) %s\n",
2833             (void *)so, pr_state(so->so_state, so->so_mode)));
2834 
2835         toh.level = SOL_SOCKET;
2836         toh.name = SO_UNIX_CLOSE;
2837 
2838         /* zero length + header */
2839         toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2840         toh.status = 0;
2841 
2842         if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2843                 struct T_optdata_req tdr;
2844 
2845                 tdr.PRIM_type = T_OPTDATA_REQ;
2846                 tdr.DATA_flag = 0;
2847 
2848                 tdr.OPT_length = (t_scalar_t)sizeof (toh);
2849                 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2850 
2851                 /* NOTE: holding so_lock while sleeping */
2852                 mp = soallocproto2(&tdr, sizeof (tdr),
2853                     &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2854         } else {
2855                 struct T_unitdata_req   tudr;
2856                 void                    *addr;
2857                 socklen_t               addrlen;
2858                 void                    *src;
2859                 socklen_t               srclen;
2860                 struct T_opthdr         toh2;
2861                 t_scalar_t              size;
2862 
2863                 /*
2864                  * We know this is an AF_UNIX connected DGRAM socket.
2865                  * We therefore already have the destination address
2866                  * in the internal form needed for this send.  This is
2867                  * similar to the sosend_dgram call later in this file
2868                  * when there's no user-specified destination address.
2869                  */
2870                 if (sti->sti_faddr_noxlate) {
2871                         /*
2872                          * Already have a transport internal address. Do not
2873                          * pass any (transport internal) source address.
2874                          */
2875                         addr = sti->sti_faddr_sa;
2876                         addrlen = (t_uscalar_t)sti->sti_faddr_len;
2877                         src = NULL;
2878                         srclen = 0;
2879                 } else {
2880                         /*
2881                          * Pass the sockaddr_un source address as an option
2882                          * and translate the remote address.
2883                          * Holding so_lock thus sti_laddr_sa can not change.
2884                          */
2885                         src = sti->sti_laddr_sa;
2886                         srclen = (socklen_t)sti->sti_laddr_len;
2887                         dprintso(so, 1,
2888                             ("so_ux_close: srclen %d, src %p\n",
2889                             srclen, src));
2890                         /*
2891                          * Use the destination address saved in connect.
2892                          */
2893                         addr = &sti->sti_ux_faddr;
2894                         addrlen = sizeof (sti->sti_ux_faddr);
2895                 }
2896                 tudr.PRIM_type = T_UNITDATA_REQ;
2897                 tudr.DEST_length = addrlen;
2898                 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2899                 if (srclen == 0) {
2900                         tudr.OPT_length = (t_scalar_t)sizeof (toh);
2901                         tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2902                             _TPI_ALIGN_TOPT(addrlen));
2903 
2904                         size = tudr.OPT_offset + tudr.OPT_length;
2905                         /* NOTE: holding so_lock while sleeping */
2906                         mp = soallocproto2(&tudr, sizeof (tudr),
2907                             addr, addrlen, size, _ALLOC_SLEEP, CRED());
2908                         mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2909                         soappendmsg(mp, &toh, sizeof (toh));
2910                 } else {
2911                         /*
2912                          * There is a AF_UNIX sockaddr_un to include as a
2913                          * source address option.
2914                          */
2915                         tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2916                             _TPI_ALIGN_TOPT(srclen));
2917                         tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2918                             _TPI_ALIGN_TOPT(addrlen));
2919 
2920                         toh2.level = SOL_SOCKET;
2921                         toh2.name = SO_SRCADDR;
2922                         toh2.len = (t_uscalar_t)(srclen +
2923                             sizeof (struct T_opthdr));
2924                         toh2.status = 0;
2925 
2926                         size = tudr.OPT_offset + tudr.OPT_length;
2927 
2928                         /* NOTE: holding so_lock while sleeping */
2929                         mp = soallocproto2(&tudr, sizeof (tudr),
2930                             addr, addrlen, size, _ALLOC_SLEEP, CRED());
2931                         mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2932                         soappendmsg(mp, &toh, sizeof (toh));
2933                         soappendmsg(mp, &toh2, sizeof (toh2));
2934                         soappendmsg(mp, src, srclen);
2935                         mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2936                 }
2937                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2938         }
2939         mutex_exit(&so->so_lock);
2940         (void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2941             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2942         mutex_enter(&so->so_lock);
2943 }
2944 
2945 /*
2946  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2947  * In addition, the caller typically verifies that there is some
2948  * potential state to clear by checking
2949  *      if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2950  * before calling this routine.
2951  * Note that such a check can be made without holding so_lock since
2952  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2953  * decrements sti_oobsigcnt.
2954  *
2955  * When data is read *after* the point that all pending
2956  * oob data has been consumed the oob indication is cleared.
2957  *
2958  * This logic keeps select/poll returning POLLRDBAND and
2959  * SIOCATMARK returning true until we have read past
2960  * the mark.
2961  */
2962 static void
2963 sorecv_update_oobstate(struct sonode *so)
2964 {
2965         sotpi_info_t *sti = SOTOTPI(so);
2966 
2967         mutex_enter(&so->so_lock);
2968         ASSERT(so_verify_oobstate(so));
2969         dprintso(so, 1,
2970             ("sorecv_update_oobstate: counts %d/%d state %s\n",
2971             sti->sti_oobsigcnt,
2972             sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2973         if (sti->sti_oobsigcnt == 0) {
2974                 /* No more pending oob indications */
2975                 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2976                 freemsg(so->so_oobmsg);
2977                 so->so_oobmsg = NULL;
2978         }
2979         ASSERT(so_verify_oobstate(so));
2980         mutex_exit(&so->so_lock);
2981 }
2982 
2983 /*
2984  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2985  */
2986 static int
2987 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2988 {
2989         sotpi_info_t *sti = SOTOTPI(so);
2990         int     error = 0;
2991         mblk_t *tmp = NULL;
2992         mblk_t *pmp = NULL;
2993         mblk_t *nmp = sti->sti_nl7c_rcv_mp;
2994 
2995         ASSERT(nmp != NULL);
2996 
2997         while (nmp != NULL && uiop->uio_resid > 0) {
2998                 ssize_t n;
2999 
3000                 if (DB_TYPE(nmp) == M_DATA) {
3001                         /*
3002                          * We have some data, uiomove up to resid bytes.
3003                          */
3004                         n = MIN(MBLKL(nmp), uiop->uio_resid);
3005                         if (n > 0)
3006                                 error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3007                         nmp->b_rptr += n;
3008                         if (nmp->b_rptr == nmp->b_wptr) {
3009                                 pmp = nmp;
3010                                 nmp = nmp->b_cont;
3011                         }
3012                         if (error)
3013                                 break;
3014                 } else {
3015                         /*
3016                          * We only handle data, save for caller to handle.
3017                          */
3018                         if (pmp != NULL) {
3019                                 pmp->b_cont = nmp->b_cont;
3020                         }
3021                         nmp->b_cont = NULL;
3022                         if (*rmp == NULL) {
3023                                 *rmp = nmp;
3024                         } else {
3025                                 tmp->b_cont = nmp;
3026                         }
3027                         nmp = nmp->b_cont;
3028                         tmp = nmp;
3029                 }
3030         }
3031         if (pmp != NULL) {
3032                 /* Free any mblk_t(s) which we have consumed */
3033                 pmp->b_cont = NULL;
3034                 freemsg(sti->sti_nl7c_rcv_mp);
3035         }
3036         if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3037                 /* Last mblk_t so return the saved kstrgetmsg() rval/error */
3038                 if (error == 0) {
3039                         rval_t  *p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3040 
3041                         error = p->r_v.r_v2;
3042                         p->r_v.r_v2 = 0;
3043                 }
3044                 rp->r_vals = sti->sti_nl7c_rcv_rval;
3045                 sti->sti_nl7c_rcv_rval = 0;
3046         } else {
3047                 /* More mblk_t(s) to process so no rval to return */
3048                 rp->r_vals = 0;
3049         }
3050         return (error);
3051 }
3052 /*
3053  * Receive the next message on the queue.
3054  * If msg_controllen is non-zero when called the caller is interested in
3055  * any received control info (options).
3056  * If msg_namelen is non-zero when called the caller is interested in
3057  * any received source address.
3058  * The routine returns with msg_control and msg_name pointing to
3059  * kmem_alloc'ed memory which the caller has to free.
3060  */
3061 /* ARGSUSED */
3062 int
3063 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3064     struct cred *cr)
3065 {
3066         union T_primitives      *tpr;
3067         mblk_t                  *mp;
3068         uchar_t                 pri;
3069         int                     pflag, opflag;
3070         void                    *control;
3071         t_uscalar_t             controllen;
3072         t_uscalar_t             namelen;
3073         int                     so_state = so->so_state; /* Snapshot */
3074         ssize_t                 saved_resid;
3075         rval_t                  rval;
3076         int                     flags;
3077         clock_t                 timout;
3078         int                     error = 0;
3079         sotpi_info_t            *sti = SOTOTPI(so);
3080 
3081         flags = msg->msg_flags;
3082         msg->msg_flags = 0;
3083 
3084         dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3085             (void *)so, (void *)msg, flags,
3086             pr_state(so->so_state, so->so_mode), so->so_error));
3087 
3088         if (so->so_version == SOV_STREAM) {
3089                 so_update_attrs(so, SOACC);
3090                 /* The imaginary "sockmod" has been popped - act as a stream */
3091                 return (strread(SOTOV(so), uiop, cr));
3092         }
3093 
3094         /*
3095          * If we are not connected because we have never been connected
3096          * we return ENOTCONN. If we have been connected (but are no longer
3097          * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3098          * the EOF.
3099          *
3100          * An alternative would be to post an ENOTCONN error in stream head
3101          * (read+write) and clear it when we're connected. However, that error
3102          * would cause incorrect poll/select behavior!
3103          */
3104         if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3105             (so->so_mode & SM_CONNREQUIRED)) {
3106                 return (ENOTCONN);
3107         }
3108 
3109         /*
3110          * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3111          * after checking that the read queue is empty) and returns zero.
3112          * This implementation will sleep (in kstrgetmsg) even if uio_resid
3113          * is zero.
3114          */
3115 
3116         if (flags & MSG_OOB) {
3117                 /* Check that the transport supports OOB */
3118                 if (!(so->so_mode & SM_EXDATA))
3119                         return (EOPNOTSUPP);
3120                 so_update_attrs(so, SOACC);
3121                 return (sorecvoob(so, msg, uiop, flags,
3122                     (so->so_options & SO_OOBINLINE)));
3123         }
3124 
3125         so_update_attrs(so, SOACC);
3126 
3127         /*
3128          * Set msg_controllen and msg_namelen to zero here to make it
3129          * simpler in the cases that no control or name is returned.
3130          */
3131         controllen = msg->msg_controllen;
3132         namelen = msg->msg_namelen;
3133         msg->msg_controllen = 0;
3134         msg->msg_namelen = 0;
3135 
3136         dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3137             namelen, controllen));
3138 
3139         mutex_enter(&so->so_lock);
3140         /*
3141          * If an NL7C enabled socket and not waiting for write data.
3142          */
3143         if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3144             NL7C_ENABLED) {
3145                 if (sti->sti_nl7c_uri) {
3146                         /* Close uri processing for a previous request */
3147                         nl7c_close(so);
3148                 }
3149                 if ((so_state & SS_CANTRCVMORE) &&
3150                     sti->sti_nl7c_rcv_mp == NULL) {
3151                         /* Nothing to process, EOF */
3152                         mutex_exit(&so->so_lock);
3153                         return (0);
3154                 } else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3155                         /* Persistent NL7C socket, try to process request */
3156                         boolean_t ret;
3157 
3158                         ret = nl7c_process(so,
3159                             (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3160                         rval.r_vals = sti->sti_nl7c_rcv_rval;
3161                         error = rval.r_v.r_v2;
3162                         if (error) {
3163                                 /* Error of some sort, return it */
3164                                 mutex_exit(&so->so_lock);
3165                                 return (error);
3166                         }
3167                         if (sti->sti_nl7c_flags &&
3168                             ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3169                                 /*
3170                                  * Still an NL7C socket and no data
3171                                  * to pass up to the caller.
3172                                  */
3173                                 mutex_exit(&so->so_lock);
3174                                 if (ret) {
3175                                         /* EOF */
3176                                         return (0);
3177                                 } else {
3178                                         /* Need more data */
3179                                         return (EAGAIN);
3180                                 }
3181                         }
3182                 } else {
3183                         /*
3184                          * Not persistent so no further NL7C processing.
3185                          */
3186                         sti->sti_nl7c_flags = 0;
3187                 }
3188         }
3189         /*
3190          * Only one reader is allowed at any given time. This is needed
3191          * for T_EXDATA handling and, in the future, MSG_WAITALL.
3192          *
         * This is slightly different than BSD behavior in that it fails with
3194          * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3195          * is single-threaded using sblock(), which is dropped while waiting
3196          * for data to appear. The difference shows up e.g. if one
3197          * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3198          * does use nonblocking io and different threads are reading each
3199          * file descriptor. In BSD there would never be an EWOULDBLOCK error
3200          * in this case as long as the read queue doesn't get empty.
3201          * In this implementation the thread using nonblocking io can
3202          * get an EWOULDBLOCK error due to the blocking thread executing
3203          * e.g. in the uiomove in kstrgetmsg.
3204          * This difference is not believed to be significant.
3205          */
3206         /* Set SOREADLOCKED */
3207         error = so_lock_read_intr(so,
3208             uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3209         mutex_exit(&so->so_lock);
3210         if (error)
3211                 return (error);
3212 
3213         /*
3214          * Tell kstrgetmsg to not inspect the stream head errors until all
3215          * queued data has been consumed.
3216          * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3217          * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3218          *
3219          * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3220          * to T_OPTDATA_IND that do not contain any user-visible control msg.
3221          * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3222          */
3223         pflag = MSG_ANY | MSG_DELAYERROR;
3224         if (flags & MSG_PEEK) {
3225                 pflag |= MSG_IPEEK;
3226                 flags &= ~MSG_WAITALL;
3227         }
3228         if (so->so_mode & SM_ATOMIC)
3229                 pflag |= MSG_DISCARDTAIL;
3230 
3231         if (flags & MSG_DONTWAIT)
3232                 timout = 0;
3233         else if (so->so_rcvtimeo != 0)
3234                 timout = TICK_TO_MSEC(so->so_rcvtimeo);
3235         else
3236                 timout = -1;
3237         opflag = pflag;
3238 retry:
3239         saved_resid = uiop->uio_resid;
3240         pri = 0;
3241         mp = NULL;
3242         if (sti->sti_nl7c_rcv_mp != NULL) {
3243                 /* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3244                 error = nl7c_sorecv(so, &mp, uiop, &rval);
3245         } else {
3246                 error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3247                     timout, &rval);
3248         }
3249         if (error != 0) {
3250                 /* kstrgetmsg returns ETIME when timeout expires */
3251                 if (error == ETIME)
3252                         error = EWOULDBLOCK;
3253                 goto out;
3254         }
3255         /*
3256          * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3257          * For non-datagrams MOREDATA is used to set MSG_EOR.
3258          */
3259         ASSERT(!(rval.r_val1 & MORECTL));
3260         if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3261                 msg->msg_flags |= MSG_TRUNC;
3262 
3263         if (mp == NULL) {
3264                 dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3265                 /*
3266                  * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3267                  * The draft Posix socket spec states that the mark should
3268                  * not be cleared when peeking. We follow the latter.
3269                  */
3270                 if ((so->so_state &
3271                     (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3272                     (uiop->uio_resid != saved_resid) &&
3273                     !(flags & MSG_PEEK)) {
3274                         sorecv_update_oobstate(so);
3275                 }
3276 
3277                 mutex_enter(&so->so_lock);
3278                 /* Set MSG_EOR based on MOREDATA */
3279                 if (!(rval.r_val1 & MOREDATA)) {
3280                         if (so->so_state & SS_SAVEDEOR) {
3281                                 msg->msg_flags |= MSG_EOR;
3282                                 so->so_state &= ~SS_SAVEDEOR;
3283                         }
3284                 }
3285                 /*
3286                  * If some data was received (i.e. not EOF) and the
3287                  * read/recv* has not been satisfied wait for some more.
3288                  */
3289                 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3290                     uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3291                         mutex_exit(&so->so_lock);
3292                         pflag = opflag | MSG_NOMARK;
3293                         goto retry;
3294                 }
3295                 goto out_locked;
3296         }
3297 
3298         /* strsock_proto has already verified length and alignment */
3299         tpr = (union T_primitives *)mp->b_rptr;
3300         dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3301 
3302         switch (tpr->type) {
3303         case T_DATA_IND: {
3304                 if ((so->so_state &
3305                     (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3306                     (uiop->uio_resid != saved_resid) &&
3307                     !(flags & MSG_PEEK)) {
3308                         sorecv_update_oobstate(so);
3309                 }
3310 
3311                 /*
3312                  * Set msg_flags to MSG_EOR based on
3313                  * MORE_flag and MOREDATA.
3314                  */
3315                 mutex_enter(&so->so_lock);
3316                 so->so_state &= ~SS_SAVEDEOR;
3317                 if (!(tpr->data_ind.MORE_flag & 1)) {
3318                         if (!(rval.r_val1 & MOREDATA))
3319                                 msg->msg_flags |= MSG_EOR;
3320                         else
3321                                 so->so_state |= SS_SAVEDEOR;
3322                 }
3323                 freemsg(mp);
3324                 /*
3325                  * If some data was received (i.e. not EOF) and the
3326                  * read/recv* has not been satisfied wait for some more.
3327                  */
3328                 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3329                     uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3330                         mutex_exit(&so->so_lock);
3331                         pflag = opflag | MSG_NOMARK;
3332                         goto retry;
3333                 }
3334                 goto out_locked;
3335         }
3336         case T_UNITDATA_IND: {
3337                 void *addr;
3338                 t_uscalar_t addrlen;
3339                 void *abuf;
3340                 t_uscalar_t optlen;
3341                 void *opt;
3342 
3343                 if ((so->so_state &
3344                     (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3345                     (uiop->uio_resid != saved_resid) &&
3346                     !(flags & MSG_PEEK)) {
3347                         sorecv_update_oobstate(so);
3348                 }
3349 
3350                 if (namelen != 0) {
3351                         /* Caller wants source address */
3352                         addrlen = tpr->unitdata_ind.SRC_length;
3353                         addr = sogetoff(mp,
3354                             tpr->unitdata_ind.SRC_offset,
3355                             addrlen, 1);
3356                         if (addr == NULL) {
3357                                 freemsg(mp);
3358                                 error = EPROTO;
3359                                 eprintsoline(so, error);
3360                                 goto out;
3361                         }
3362                         if (so->so_family == AF_UNIX) {
3363                                 /*
3364                                  * Can not use the transport level address.
3365                                  * If there is a SO_SRCADDR option carrying
3366                                  * the socket level address it will be
3367                                  * extracted below.
3368                                  */
3369                                 addr = NULL;
3370                                 addrlen = 0;
3371                         }
3372                 }
3373                 optlen = tpr->unitdata_ind.OPT_length;
3374                 if (optlen != 0) {
3375                         t_uscalar_t ncontrollen;
3376 
3377                         /*
3378                          * Extract any source address option.
3379                          * Determine how large cmsg buffer is needed.
3380                          */
3381                         opt = sogetoff(mp,
3382                             tpr->unitdata_ind.OPT_offset,
3383                             optlen, __TPI_ALIGN_SIZE);
3384 
3385                         if (opt == NULL) {
3386                                 freemsg(mp);
3387                                 error = EPROTO;
3388                                 eprintsoline(so, error);
3389                                 goto out;
3390                         }
3391                         if (so->so_family == AF_UNIX)
3392                                 so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3393                         ncontrollen = so_cmsglen(mp, opt, optlen,
3394                             !(flags & MSG_XPG4_2));
3395                         if (controllen != 0)
3396                                 controllen = ncontrollen;
3397                         else if (ncontrollen != 0)
3398                                 msg->msg_flags |= MSG_CTRUNC;
3399                 } else {
3400                         controllen = 0;
3401                 }
3402 
3403                 if (namelen != 0) {
3404                         /*
3405                          * Return address to caller.
3406                          * Caller handles truncation if length
3407                          * exceeds msg_namelen.
3408                          * NOTE: AF_UNIX NUL termination is ensured by
3409                          * the sender's copyin_name().
3410                          */
3411                         abuf = kmem_alloc(addrlen, KM_SLEEP);
3412 
3413                         bcopy(addr, abuf, addrlen);
3414                         msg->msg_name = abuf;
3415                         msg->msg_namelen = addrlen;
3416                 }
3417 
3418                 if (controllen != 0) {
3419                         /*
3420                          * Return control msg to caller.
3421                          * Caller handles truncation if length
3422                          * exceeds msg_controllen.
3423                          */
3424                         control = kmem_zalloc(controllen, KM_SLEEP);
3425 
3426                         error = so_opt2cmsg(mp, opt, optlen,
3427                             !(flags & MSG_XPG4_2),
3428                             control, controllen);
3429                         if (error) {
3430                                 freemsg(mp);
3431                                 if (msg->msg_namelen != 0)
3432                                         kmem_free(msg->msg_name,
3433                                             msg->msg_namelen);
3434                                 kmem_free(control, controllen);
3435                                 eprintsoline(so, error);
3436                                 goto out;
3437                         }
3438                         msg->msg_control = control;
3439                         msg->msg_controllen = controllen;
3440                 }
3441 
3442                 freemsg(mp);
3443                 goto out;
3444         }
3445         case T_OPTDATA_IND: {
3446                 struct T_optdata_req *tdr;
3447                 void *opt;
3448                 t_uscalar_t optlen;
3449 
3450                 if ((so->so_state &
3451                     (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3452                     (uiop->uio_resid != saved_resid) &&
3453                     !(flags & MSG_PEEK)) {
3454                         sorecv_update_oobstate(so);
3455                 }
3456 
3457                 tdr = (struct T_optdata_req *)mp->b_rptr;
3458                 optlen = tdr->OPT_length;
3459                 if (optlen != 0) {
3460                         t_uscalar_t ncontrollen;
3461                         /*
3462                          * Determine how large cmsg buffer is needed.
3463                          */
3464                         opt = sogetoff(mp,
3465                             tpr->optdata_ind.OPT_offset,
3466                             optlen, __TPI_ALIGN_SIZE);
3467 
3468                         if (opt == NULL) {
3469                                 freemsg(mp);
3470                                 error = EPROTO;
3471                                 eprintsoline(so, error);
3472                                 goto out;
3473                         }
3474 
3475                         ncontrollen = so_cmsglen(mp, opt, optlen,
3476                             !(flags & MSG_XPG4_2));
3477                         if (controllen != 0)
3478                                 controllen = ncontrollen;
3479                         else if (ncontrollen != 0)
3480                                 msg->msg_flags |= MSG_CTRUNC;
3481                 } else {
3482                         controllen = 0;
3483                 }
3484 
3485                 if (controllen != 0) {
3486                         /*
3487                          * Return control msg to caller.
3488                          * Caller handles truncation if length
3489                          * exceeds msg_controllen.
3490                          */
3491                         control = kmem_zalloc(controllen, KM_SLEEP);
3492 
3493                         error = so_opt2cmsg(mp, opt, optlen,
3494                             !(flags & MSG_XPG4_2),
3495                             control, controllen);
3496                         if (error) {
3497                                 freemsg(mp);
3498                                 kmem_free(control, controllen);
3499                                 eprintsoline(so, error);
3500                                 goto out;
3501                         }
3502                         msg->msg_control = control;
3503                         msg->msg_controllen = controllen;
3504                 }
3505 
3506                 /*
3507                  * Set msg_flags to MSG_EOR based on
3508                  * DATA_flag and MOREDATA.
3509                  */
3510                 mutex_enter(&so->so_lock);
3511                 so->so_state &= ~SS_SAVEDEOR;
3512                 if (!(tpr->data_ind.MORE_flag & 1)) {
3513                         if (!(rval.r_val1 & MOREDATA))
3514                                 msg->msg_flags |= MSG_EOR;
3515                         else
3516                                 so->so_state |= SS_SAVEDEOR;
3517                 }
3518                 freemsg(mp);
3519                 /*
3520                  * If some data was received (i.e. not EOF) and the
3521                  * read/recv* has not been satisfied wait for some more.
3522                  * Not possible to wait if control info was received.
3523                  */
3524                 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3525                     controllen == 0 &&
3526                     uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3527                         mutex_exit(&so->so_lock);
3528                         pflag = opflag | MSG_NOMARK;
3529                         goto retry;
3530                 }
3531                 goto out_locked;
3532         }
3533         case T_EXDATA_IND: {
3534                 dprintso(so, 1,
3535                     ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3536                     "state %s\n",
3537                     sti->sti_oobsigcnt, sti->sti_oobcnt,
3538                     saved_resid - uiop->uio_resid,
3539                     pr_state(so->so_state, so->so_mode)));
3540                 /*
3541                  * kstrgetmsg handles MSGMARK so there is nothing to
3542                  * inspect in the T_EXDATA_IND.
3543                  * strsock_proto makes the stream head queue the T_EXDATA_IND
3544                  * as a separate message with no M_DATA component. Furthermore,
3545                  * the stream head does not consolidate M_DATA messages onto
3546                  * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3547                  * remains a message by itself. This is needed since MSGMARK
3548                  * marks both the whole message as well as the last byte
3549                  * of the message.
3550                  */
3551                 freemsg(mp);
3552                 ASSERT(uiop->uio_resid == saved_resid);      /* No data */
3553                 if (flags & MSG_PEEK) {
3554                         /*
3555                          * Even though we are peeking we consume the
3556                          * T_EXDATA_IND thereby moving the mark information
3557                          * to SS_RCVATMARK. Then the oob code below will
3558                          * retry the peeking kstrgetmsg.
3559                          * Note that the stream head read queue is
3560                          * never flushed without holding SOREADLOCKED
3561                          * thus the T_EXDATA_IND can not disappear
3562                          * underneath us.
3563                          */
3564                         dprintso(so, 1,
3565                             ("sotpi_recvmsg: consume EXDATA_IND "
3566                             "counts %d/%d state %s\n",
3567                             sti->sti_oobsigcnt,
3568                             sti->sti_oobcnt,
3569                             pr_state(so->so_state, so->so_mode)));
3570 
3571                         pflag = MSG_ANY | MSG_DELAYERROR;
3572                         if (so->so_mode & SM_ATOMIC)
3573                                 pflag |= MSG_DISCARDTAIL;
3574 
3575                         pri = 0;
3576                         mp = NULL;
3577 
3578                         error = kstrgetmsg(SOTOV(so), &mp, uiop,
3579                             &pri, &pflag, (clock_t)-1, &rval);
3580                         ASSERT(uiop->uio_resid == saved_resid);
3581 
3582                         if (error) {
3583 #ifdef SOCK_DEBUG
3584                                 if (error != EWOULDBLOCK && error != EINTR) {
3585                                         eprintsoline(so, error);
3586                                 }
3587 #endif /* SOCK_DEBUG */
3588                                 goto out;
3589                         }
3590                         ASSERT(mp);
3591                         tpr = (union T_primitives *)mp->b_rptr;
3592                         ASSERT(tpr->type == T_EXDATA_IND);
3593                         freemsg(mp);
3594                 } /* end "if (flags & MSG_PEEK)" */
3595 
3596                 /*
3597                  * Decrement the number of queued and pending oob.
3598                  *
3599                  * SS_RCVATMARK is cleared when we read past a mark.
3600                  * SS_HAVEOOBDATA is cleared when we've read past the
3601                  * last mark.
3602                  * SS_OOBPEND is cleared if we've read past the last
3603                  * mark and no (new) SIGURG has been posted.
3604                  */
3605                 mutex_enter(&so->so_lock);
3606                 ASSERT(so_verify_oobstate(so));
3607                 ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3608                 ASSERT(sti->sti_oobsigcnt > 0);
3609                 sti->sti_oobsigcnt--;
3610                 ASSERT(sti->sti_oobcnt > 0);
3611                 sti->sti_oobcnt--;
3612                 /*
3613                  * Since the T_EXDATA_IND has been removed from the stream
3614                  * head, but we have not read data past the mark,
3615                  * sockfs needs to track that the socket is still at the mark.
3616                  *
3617                  * Since no data was received call kstrgetmsg again to wait
3618                  * for data.
3619                  */
3620                 so->so_state |= SS_RCVATMARK;
3621                 mutex_exit(&so->so_lock);
3622                 dprintso(so, 1,
3623                     ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3624                     sti->sti_oobsigcnt, sti->sti_oobcnt,
3625                     pr_state(so->so_state, so->so_mode)));
3626                 pflag = opflag;
3627                 goto retry;
3628         }
3629         default:
3630                 cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3631                     (void *)so, tpr->type, (void *)mp);
3632                 ASSERT(0);
3633                 freemsg(mp);
3634                 error = EPROTO;
3635                 eprintsoline(so, error);
3636                 goto out;
3637         }
3638         /* NOTREACHED */
3639 out:
3640         mutex_enter(&so->so_lock);
3641 out_locked:
3642         so_unlock_read(so);     /* Clear SOREADLOCKED */
3643         mutex_exit(&so->so_lock);
3644         return (error);
3645 }
3646 
3647 /*
3648  * Sending data with options on a datagram socket.
3649  * Assumes caller has verified that SS_ISBOUND etc. are set.
3650  *
3651  * For AF_UNIX the destination address may be already in
3652  * internal form, as indicated by sti->sti_faddr_noxlate
3653  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3654  * translate the destination address to internal form.
3655  *
3656  * The source address is passed as an option.  If passing
3657  * file descriptors, those are passed as file pointers in
3658  * another option.
3659  */
3660 static int
3661 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3662     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3663 {
3664         struct T_unitdata_req   tudr;
3665         mblk_t                  *mp;
3666         int                     error;
3667         void                    *addr;
3668         socklen_t               addrlen;
3669         void                    *src;
3670         socklen_t               srclen;
3671         ssize_t                 len;
3672         int                     size;
3673         struct T_opthdr         toh;
3674         struct fdbuf            *fdbuf;
3675         t_uscalar_t             optlen;
3676         void                    *fds;
3677         int                     fdlen;
3678         sotpi_info_t            *sti = SOTOTPI(so);
3679 
3680         ASSERT(name && namelen);
3681         ASSERT(control && controllen);
3682 
3683         len = uiop->uio_resid;
3684         if (len > (ssize_t)sti->sti_tidu_size) {
3685                 return (EMSGSIZE);
3686         }
3687 
3688         if (sti->sti_faddr_noxlate == 0 &&
3689             (flags & MSG_SENDTO_NOXLATE) == 0) {
3690                 /*
3691                  * Length and family checks.
3692                  * Don't verify internal form.
3693                  */
3694                 error = so_addr_verify(so, name, namelen);
3695                 if (error) {
3696                         eprintsoline(so, error);
3697                         return (error);
3698                 }
3699         }
3700 
3701         if (so->so_family == AF_UNIX) {
3702                 if (sti->sti_faddr_noxlate) {
3703                         /*
3704                          * Already have a transport internal address. Do not
3705                          * pass any (transport internal) source address.
3706                          */
3707                         addr = name;
3708                         addrlen = namelen;
3709                         src = NULL;
3710                         srclen = 0;
3711                 } else if (flags & MSG_SENDTO_NOXLATE) {
3712                         /*
3713                          * Have an internal form dest. address.
3714                          * Pass the source address as usual.
3715                          */
3716                         addr = name;
3717                         addrlen = namelen;
3718                         src = sti->sti_laddr_sa;
3719                         srclen = (socklen_t)sti->sti_laddr_len;
3720                 } else {
3721                         /*
3722                          * Pass the sockaddr_un source address as an option
3723                          * and translate the remote address.
3724                          *
3725                          * Note that this code does not prevent sti_laddr_sa
3726                          * from changing while it is being used. Thus
3727                          * if an unbind+bind occurs concurrently with this
3728                          * send the peer might see a partially new and a
3729                          * partially old "from" address.
3730                          */
3731                         src = sti->sti_laddr_sa;
3732                         srclen = (socklen_t)sti->sti_laddr_len;
3733                         dprintso(so, 1,
3734                             ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3735                             srclen, src));
3736                         /*
3737                          * The sendmsg caller specified a destination
3738                          * address, which we must translate into our
3739                          * internal form.  addr = &sti->sti_ux_taddr
3740                          */
3741                         error = so_ux_addr_xlate(so, name, namelen,
3742                             (flags & MSG_XPG4_2),
3743                             &addr, &addrlen);
3744                         if (error) {
3745                                 eprintsoline(so, error);
3746                                 return (error);
3747                         }
3748                 }
3749         } else {
3750                 addr = name;
3751                 addrlen = namelen;
3752                 src = NULL;
3753                 srclen = 0;
3754         }
3755         optlen = so_optlen(control, controllen,
3756             !(flags & MSG_XPG4_2));
3757         tudr.PRIM_type = T_UNITDATA_REQ;
3758         tudr.DEST_length = addrlen;
3759         tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3760         if (srclen != 0)
3761                 tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3762                     _TPI_ALIGN_TOPT(srclen));
3763         else
3764                 tudr.OPT_length = optlen;
3765         tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3766             _TPI_ALIGN_TOPT(addrlen));
3767 
3768         size = tudr.OPT_offset + tudr.OPT_length;
3769 
3770         /*
3771          * File descriptors only when SM_FDPASSING set.
3772          */
3773         error = so_getfdopt(control, controllen,
3774             !(flags & MSG_XPG4_2), &fds, &fdlen);
3775         if (error)
3776                 return (error);
3777         if (fdlen != -1) {
3778                 if (!(so->so_mode & SM_FDPASSING))
3779                         return (EOPNOTSUPP);
3780 
3781                 error = fdbuf_create(fds, fdlen, &fdbuf);
3782                 if (error)
3783                         return (error);
3784 
3785                 /*
3786                  * Pre-allocate enough additional space for lower level modules
3787                  * to append an option (e.g. see tl_unitdata). The following
3788                  * is enough extra space for the largest option we might append.
3789                  */
3790                 size += sizeof (struct T_opthdr) + ucredsize;
3791                 mp = fdbuf_allocmsg(size, fdbuf);
3792         } else {
3793                 mp = soallocproto(size, _ALLOC_INTR, CRED());
3794                 if (mp == NULL) {
3795                         /*
3796                          * Caught a signal waiting for memory.
3797                          * Let send* return EINTR.
3798                          */
3799                         return (EINTR);
3800                 }
3801         }
3802         soappendmsg(mp, &tudr, sizeof (tudr));
3803         soappendmsg(mp, addr, addrlen);
3804         mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3805 
3806         if (fdlen != -1) {
3807                 ASSERT(fdbuf != NULL);
3808                 toh.level = SOL_SOCKET;
3809                 toh.name = SO_FILEP;
3810                 toh.len = fdbuf->fd_size +
3811                     (t_uscalar_t)sizeof (struct T_opthdr);
3812                 toh.status = 0;
3813                 soappendmsg(mp, &toh, sizeof (toh));
3814                 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3815                 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3816         }
3817         if (srclen != 0) {
3818                 /*
3819                  * There is a AF_UNIX sockaddr_un to include as a source
3820                  * address option.
3821                  */
3822                 toh.level = SOL_SOCKET;
3823                 toh.name = SO_SRCADDR;
3824                 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3825                 toh.status = 0;
3826                 soappendmsg(mp, &toh, sizeof (toh));
3827                 soappendmsg(mp, src, srclen);
3828                 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3829                 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3830         }
3831         ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3832         so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3833         /*
3834          * Normally at most 3 bytes left in the message, but we might have
3835          * allowed for extra space if we're passing fd's through.
3836          */
3837         ASSERT(MBLKL(mp) <= (ssize_t)size);
3838 
3839         ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3840         if (AU_AUDITING())
3841                 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3842 
3843         error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3844 #ifdef SOCK_DEBUG
3845         if (error) {
3846                 eprintsoline(so, error);
3847         }
3848 #endif /* SOCK_DEBUG */
3849         return (error);
3850 }
3851 
3852 /*
3853  * Sending data with options on a connected stream socket.
3854  * Assumes caller has verified that SS_ISCONNECTED is set.
3855  */
3856 static int
3857 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3858     t_uscalar_t controllen, int flags)
3859 {
3860         struct T_optdata_req    tdr;
3861         mblk_t                  *mp;
3862         int                     error;
3863         ssize_t                 iosize;
3864         int                     size;
3865         struct fdbuf            *fdbuf;
3866         t_uscalar_t             optlen;
3867         void                    *fds;
3868         int                     fdlen;
3869         struct T_opthdr         toh;
3870         sotpi_info_t            *sti = SOTOTPI(so);
3871 
3872         dprintso(so, 1,
3873             ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3874 
3875         /*
3876          * Has to be bound and connected. However, since no locks are
3877          * held the state could have changed after sotpi_sendmsg checked it
3878          * thus it is not possible to ASSERT on the state.
3879          */
3880 
3881         /* Options on connection-oriented only when SM_OPTDATA set. */
3882         if (!(so->so_mode & SM_OPTDATA))
3883                 return (EOPNOTSUPP);
3884 
3885         do {
3886                 /*
3887                  * Set the MORE flag if uio_resid does not fit in this
3888                  * message or if the caller passed in "more".
3889                  * Error for transports with zero tidu_size.
3890                  */
3891                 tdr.PRIM_type = T_OPTDATA_REQ;
3892                 iosize = sti->sti_tidu_size;
3893                 if (iosize <= 0)
3894                         return (EMSGSIZE);
3895                 if (uiop->uio_resid > iosize) {
3896                         tdr.DATA_flag = 1;
3897                 } else {
3898                         if (more)
3899                                 tdr.DATA_flag = 1;
3900                         else
3901                                 tdr.DATA_flag = 0;
3902                         iosize = uiop->uio_resid;
3903                 }
3904                 dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3905                     tdr.DATA_flag, iosize));
3906 
3907                 optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3908                 tdr.OPT_length = optlen;
3909                 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3910 
3911                 size = (int)sizeof (tdr) + optlen;
3912                 /*
3913                  * File descriptors only when SM_FDPASSING set.
3914                  */
3915                 error = so_getfdopt(control, controllen,
3916                     !(flags & MSG_XPG4_2), &fds, &fdlen);
3917                 if (error)
3918                         return (error);
3919                 if (fdlen != -1) {
3920                         if (!(so->so_mode & SM_FDPASSING))
3921                                 return (EOPNOTSUPP);
3922 
3923                         error = fdbuf_create(fds, fdlen, &fdbuf);
3924                         if (error)
3925                                 return (error);
3926 
3927                         /*
3928                          * Pre-allocate enough additional space for lower level
3929                          * modules to append an option (e.g. see tl_unitdata).
3930                          * The following is enough extra space for the largest
3931                          * option we might append.
3932                          */
3933                         size += sizeof (struct T_opthdr) + ucredsize;
3934                         mp = fdbuf_allocmsg(size, fdbuf);
3935                 } else {
3936                         mp = soallocproto(size, _ALLOC_INTR, CRED());
3937                         if (mp == NULL) {
3938                                 /*
3939                                  * Caught a signal waiting for memory.
3940                                  * Let send* return EINTR.
3941                                  */
3942                                 return (EINTR);
3943                         }
3944                 }
3945                 soappendmsg(mp, &tdr, sizeof (tdr));
3946 
3947                 if (fdlen != -1) {
3948                         ASSERT(fdbuf != NULL);
3949                         toh.level = SOL_SOCKET;
3950                         toh.name = SO_FILEP;
3951                         toh.len = fdbuf->fd_size +
3952                             (t_uscalar_t)sizeof (struct T_opthdr);
3953                         toh.status = 0;
3954                         soappendmsg(mp, &toh, sizeof (toh));
3955                         soappendmsg(mp, fdbuf, fdbuf->fd_size);
3956                         ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3957                 }
3958                 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3959                 /*
3960                  * Normally at most 3 bytes left in the message, but we might
3961                  * have allowed for extra space if we're passing fd's through.
3962                  */
3963                 ASSERT(MBLKL(mp) <= (ssize_t)size);
3964 
3965                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3966 
3967                 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3968                     0, MSG_BAND, 0);
3969                 if (error) {
3970                         eprintsoline(so, error);
3971                         return (error);
3972                 }
3973                 control = NULL;
3974                 if (uiop->uio_resid > 0) {
3975                         /*
3976                          * Recheck for fatal errors. Fail write even though
3977                          * some data have been written. This is consistent
3978                          * with strwrite semantics and BSD sockets semantics.
3979                          */
3980                         if (so->so_state & SS_CANTSENDMORE) {
3981                                 eprintsoline(so, error);
3982                                 return (EPIPE);
3983                         }
3984                         if (so->so_error != 0) {
3985                                 mutex_enter(&so->so_lock);
3986                                 error = sogeterr(so, B_TRUE);
3987                                 mutex_exit(&so->so_lock);
3988                                 if (error != 0) {
3989                                         eprintsoline(so, error);
3990                                         return (error);
3991                                 }
3992                         }
3993                 }
3994         } while (uiop->uio_resid > 0);
3995         return (0);
3996 }
3997 
3998 /*
3999  * Sending data on a datagram socket.
4000  * Assumes caller has verified that SS_ISBOUND etc. are set.
4001  *
4002  * For AF_UNIX the destination address may be already in
4003  * internal form, as indicated by sti->sti_faddr_noxlate
4004  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
4005  * translate the destination address to internal form.
4006  *
4007  * The source address is passed as an option.
4008  */
4009 int
4010 sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
4011     struct uio *uiop, int flags)
4012 {
4013         struct T_unitdata_req   tudr;
4014         mblk_t                  *mp;
4015         int                     error;
4016         void                    *addr;
4017         socklen_t               addrlen;
4018         void                    *src;
4019         socklen_t               srclen;
4020         ssize_t                 len;
4021         sotpi_info_t            *sti = SOTOTPI(so);
4022 
4023         ASSERT(name != NULL && namelen != 0);
4024 
4025         len = uiop->uio_resid;
4026         if (len > sti->sti_tidu_size) {
4027                 error = EMSGSIZE;
4028                 goto done;
4029         }
4030 
4031         if (sti->sti_faddr_noxlate == 0 &&
4032             (flags & MSG_SENDTO_NOXLATE) == 0) {
4033                 /*
4034                  * Length and family checks.
4035                  * Don't verify internal form.
4036                  */
4037                 error = so_addr_verify(so, name, namelen);
4038                 if (error != 0)
4039                         goto done;
4040         }
4041 
4042         if (sti->sti_direct) /* Never on AF_UNIX */
4043                 return (sodgram_direct(so, name, namelen, uiop, flags));
4044 
4045         if (so->so_family == AF_UNIX) {
4046                 if (sti->sti_faddr_noxlate) {
4047                         /*
4048                          * Already have a transport internal address. Do not
4049                          * pass any (transport internal) source address.
4050                          */
4051                         addr = name;
4052                         addrlen = namelen;
4053                         src = NULL;
4054                         srclen = 0;
4055                 } else if (flags & MSG_SENDTO_NOXLATE) {
4056                         /*
4057                          * Have an internal form dest. address.
4058                          * Pass the source address as usual.
4059                          */
4060                         addr = name;
4061                         addrlen = namelen;
4062                         src = sti->sti_laddr_sa;
4063                         srclen = (socklen_t)sti->sti_laddr_len;
4064                 } else {
4065                         /*
4066                          * Pass the sockaddr_un source address as an option
4067                          * and translate the remote address.
4068                          *
4069                          * Note that this code does not prevent sti_laddr_sa
4070                          * from changing while it is being used. Thus
4071                          * if an unbind+bind occurs concurrently with this
4072                          * send the peer might see a partially new and a
4073                          * partially old "from" address.
4074                          */
4075                         src = sti->sti_laddr_sa;
4076                         srclen = (socklen_t)sti->sti_laddr_len;
4077                         dprintso(so, 1,
4078                             ("sosend_dgram UNIX: srclen %d, src %p\n",
4079                             srclen, src));
4080                         /*
4081                          * The sendmsg caller specified a destination
4082                          * address, which we must translate into our
4083                          * internal form.  addr = &sti->sti_ux_taddr
4084                          */
4085                         error = so_ux_addr_xlate(so, name, namelen,
4086                             (flags & MSG_XPG4_2),
4087                             &addr, &addrlen);
4088                         if (error) {
4089                                 eprintsoline(so, error);
4090                                 goto done;
4091                         }
4092                 }
4093         } else {
4094                 addr = name;
4095                 addrlen = namelen;
4096                 src = NULL;
4097                 srclen = 0;
4098         }
4099         tudr.PRIM_type = T_UNITDATA_REQ;
4100         tudr.DEST_length = addrlen;
4101         tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4102         if (srclen == 0) {
4103                 tudr.OPT_length = 0;
4104                 tudr.OPT_offset = 0;
4105 
4106                 mp = soallocproto2(&tudr, sizeof (tudr),
4107                     addr, addrlen, 0, _ALLOC_INTR, CRED());
4108                 if (mp == NULL) {
4109                         /*
4110                          * Caught a signal waiting for memory.
4111                          * Let send* return EINTR.
4112                          */
4113                         error = EINTR;
4114                         goto done;
4115                 }
4116         } else {
4117                 /*
4118                  * There is a AF_UNIX sockaddr_un to include as a source
4119                  * address option.
4120                  */
4121                 struct T_opthdr toh;
4122                 ssize_t size;
4123 
4124                 tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4125                     _TPI_ALIGN_TOPT(srclen));
4126                 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4127                     _TPI_ALIGN_TOPT(addrlen));
4128 
4129                 toh.level = SOL_SOCKET;
4130                 toh.name = SO_SRCADDR;
4131                 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4132                 toh.status = 0;
4133 
4134                 size = tudr.OPT_offset + tudr.OPT_length;
4135                 mp = soallocproto2(&tudr, sizeof (tudr),
4136                     addr, addrlen, size, _ALLOC_INTR, CRED());
4137                 if (mp == NULL) {
4138                         /*
4139                          * Caught a signal waiting for memory.
4140                          * Let send* return EINTR.
4141                          */
4142                         error = EINTR;
4143                         goto done;
4144                 }
4145                 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4146                 soappendmsg(mp, &toh, sizeof (toh));
4147                 soappendmsg(mp, src, srclen);
4148                 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4149                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4150         }
4151 
4152         if (AU_AUDITING())
4153                 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4154 
4155         error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4156 done:
4157 #ifdef SOCK_DEBUG
4158         if (error) {
4159                 eprintsoline(so, error);
4160         }
4161 #endif /* SOCK_DEBUG */
4162         return (error);
4163 }
4164 
4165 /*
4166  * Sending data on a connected stream socket.
4167  * Assumes caller has verified that SS_ISCONNECTED is set.
4168  */
4169 int
4170 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4171     int sflag)
4172 {
4173         struct T_data_req       tdr;
4174         mblk_t                  *mp;
4175         int                     error;
4176         ssize_t                 iosize;
4177         sotpi_info_t            *sti = SOTOTPI(so);
4178 
4179         dprintso(so, 1,
4180             ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4181             (void *)so, uiop->uio_resid, prim, sflag));
4182 
4183         /*
4184          * Has to be bound and connected. However, since no locks are
4185          * held the state could have changed after sotpi_sendmsg checked it
4186          * thus it is not possible to ASSERT on the state.
4187          */
4188 
4189         do {
4190                 /*
4191                  * Set the MORE flag if uio_resid does not fit in this
4192                  * message or if the caller passed in "more".
4193                  * Error for transports with zero tidu_size.
4194                  */
4195                 tdr.PRIM_type = prim;
4196                 iosize = sti->sti_tidu_size;
4197                 if (iosize <= 0)
4198                         return (EMSGSIZE);
4199                 if (uiop->uio_resid > iosize) {
4200                         tdr.MORE_flag = 1;
4201                 } else {
4202                         if (more)
4203                                 tdr.MORE_flag = 1;
4204                         else
4205                                 tdr.MORE_flag = 0;
4206                         iosize = uiop->uio_resid;
4207                 }
4208                 dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4209                     prim, tdr.MORE_flag, iosize));
4210                 mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4211                 if (mp == NULL) {
4212                         /*
4213                          * Caught a signal waiting for memory.
4214                          * Let send* return EINTR.
4215                          */
4216                         return (EINTR);
4217                 }
4218 
4219                 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4220                     0, sflag | MSG_BAND, 0);
4221                 if (error) {
4222                         eprintsoline(so, error);
4223                         return (error);
4224                 }
4225                 if (uiop->uio_resid > 0) {
4226                         /*
4227                          * Recheck for fatal errors. Fail write even though
4228                          * some data have been written. This is consistent
4229                          * with strwrite semantics and BSD sockets semantics.
4230                          */
4231                         if (so->so_state & SS_CANTSENDMORE) {
4232                                 eprintsoline(so, error);
4233                                 return (EPIPE);
4234                         }
4235                         if (so->so_error != 0) {
4236                                 mutex_enter(&so->so_lock);
4237                                 error = sogeterr(so, B_TRUE);
4238                                 mutex_exit(&so->so_lock);
4239                                 if (error != 0) {
4240                                         eprintsoline(so, error);
4241                                         return (error);
4242                                 }
4243                         }
4244                 }
4245         } while (uiop->uio_resid > 0);
4246         return (0);
4247 }
4248 
4249 /*
4250  * Check the state for errors and call the appropriate send function.
4251  *
4252  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4253  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4254  * after sending the message.
4255  *
4256  * The caller may optionally specify a destination address, for either
4257  * stream or datagram sockets.  This table summarizes the cases:
4258  *
4259  *    Socket type    Dest. given    Connected    Result
4260  *    -----------    -----------    ---------    --------------
4261  *    Stream         *              Yes          send to conn. addr.
4262  *    Stream         *              No           error ENOTCONN
4263  *    Dgram          yes            *            send to given addr.
4264  *    Dgram          no             yes          send to conn. addr.
4265  *    Dgram          no             no           error EDESTADDRREQ
4266  *
4267  * There are subtleties around the destination address when using
4268  * AF_UNIX datagram sockets.  When the sendmsg call specifies the
4269  * destination address, it's in (struct sockaddr_un) form and we
4270  * need to translate it to our internal form (struct so_ux_addr).
4271  *
4272  * When the sendmsg call does not specify a destination address
4273  * we're using the peer address saved during sotpi_connect, and
4274  * that address is already in internal form.  In this case, the
4275  * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4276  * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4277  * those functions should skip translation to internal form.
4278  * Avoiding that translation is not only more efficient, but it's
4279  * also necessary when a process does a connect on an AF_UNIX
4280  * datagram socket and then drops privileges.  After the process
4281  * has dropped privileges, it may no longer be able to lookup the
4282  * the external name in the filesystem, but it should still be
4283  * able to send messages on the connected socket by leaving the
4284  * destination name unspecified.
4285  *
4286  * Yet more subtleties arise with sockets connected by socketpair(),
4287  * which puts internal form addresses in the fields where normally
4288  * the external form is found, and sets sti_faddr_noxlate=1, which
4289  * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4290  * to skip translation of destination addresses to internal form.
4291  * However, beware that the flag sti_faddr_noxlate=1 also triggers
4292  * different behaviour almost everywhere AF_UNIX addresses appear.
4293  */
4294 static int
4295 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4296     struct cred *cr)
4297 {
4298         int             so_state;
4299         int             so_mode;
4300         int             error;
4301         struct sockaddr *name;
4302         t_uscalar_t     namelen;
4303         int             dontroute;
4304         int             flags;
4305         sotpi_info_t    *sti = SOTOTPI(so);
4306 
4307         dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4308             (void *)so, (void *)msg, msg->msg_flags,
4309             pr_state(so->so_state, so->so_mode), so->so_error));
4310 
4311         if (so->so_version == SOV_STREAM) {
4312                 /* The imaginary "sockmod" has been popped - act as a stream */
4313                 so_update_attrs(so, SOMOD);
4314                 return (strwrite(SOTOV(so), uiop, cr));
4315         }
4316 
4317         mutex_enter(&so->so_lock);
4318         so_state = so->so_state;
4319 
4320         if (so_state & SS_CANTSENDMORE) {
4321                 mutex_exit(&so->so_lock);
4322                 return (EPIPE);
4323         }
4324 
4325         if (so->so_error != 0) {
4326                 error = sogeterr(so, B_TRUE);
4327                 if (error != 0) {
4328                         mutex_exit(&so->so_lock);
4329                         return (error);
4330                 }
4331         }
4332 
4333         name = (struct sockaddr *)msg->msg_name;
4334         namelen = msg->msg_namelen;
4335         flags = msg->msg_flags;
4336 
4337         /*
4338          * Historically, this function does not validate the flags
4339          * passed in, and any errant bits are ignored.  However,
4340          * we would not want any such errant flag bits accidently
4341          * being treated as one of the internal-only flags, so
4342          * clear the internal-only flag bits.
4343          */
4344         flags &= ~MSG_SENDTO_NOXLATE;
4345 
4346         so_mode = so->so_mode;
4347 
4348         if (name == NULL) {
4349                 if (!(so_state & SS_ISCONNECTED)) {
4350                         mutex_exit(&so->so_lock);
4351                         if (so_mode & SM_CONNREQUIRED)
4352                                 return (ENOTCONN);
4353                         else
4354                                 return (EDESTADDRREQ);
4355                 }
4356                 /*
4357                  * This is a connected socket.
4358                  */
4359                 if (so_mode & SM_CONNREQUIRED) {
4360                         /*
4361                          * This is a connected STREAM socket,
4362                          * destination not specified.
4363                          */
4364                         name = NULL;
4365                         namelen = 0;
4366                 } else {
4367                         /*
4368                          * Datagram send on connected socket with
4369                          * the destination name not specified.
4370                          * Use the peer address from connect.
4371                          */
4372                         if (so->so_family == AF_UNIX) {
4373                                 /*
4374                                  * Use the (internal form) address saved
4375                                  * in sotpi_connect.  See above.
4376                                  */
4377                                 name = (void *)&sti->sti_ux_faddr;
4378                                 namelen = sizeof (sti->sti_ux_faddr);
4379                                 flags |= MSG_SENDTO_NOXLATE;
4380                         } else {
4381                                 ASSERT(sti->sti_faddr_sa);
4382                                 name = sti->sti_faddr_sa;
4383                                 namelen = (t_uscalar_t)sti->sti_faddr_len;
4384                         }
4385                 }
4386         } else {
4387                 /*
4388                  * Sendmsg specifies a destination name
4389                  */
4390                 if (!(so_state & SS_ISCONNECTED) &&
4391                     (so_mode & SM_CONNREQUIRED)) {
4392                         /* i.e. TCP not connected */
4393                         mutex_exit(&so->so_lock);
4394                         return (ENOTCONN);
4395                 }
4396                 /*
4397                  * Ignore the address on connection-oriented sockets.
4398                  * Just like BSD this code does not generate an error for
4399                  * TCP (a CONNREQUIRED socket) when sending to an address
4400                  * passed in with sendto/sendmsg. Instead the data is
4401                  * delivered on the connection as if no address had been
4402                  * supplied.
4403                  */
4404                 if ((so_state & SS_ISCONNECTED) &&
4405                     !(so_mode & SM_CONNREQUIRED)) {
4406                         mutex_exit(&so->so_lock);
4407                         return (EISCONN);
4408                 }
4409                 if (!(so_state & SS_ISBOUND)) {
4410                         so_lock_single(so);     /* Set SOLOCKED */
4411                         error = sotpi_bind(so, NULL, 0,
4412                             _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4413                         so_unlock_single(so, SOLOCKED);
4414                         if (error) {
4415                                 mutex_exit(&so->so_lock);
4416                                 eprintsoline(so, error);
4417                                 return (error);
4418                         }
4419                 }
4420                 /*
4421                  * Handle delayed datagram errors. These are only queued
4422                  * when the application sets SO_DGRAM_ERRIND.
4423                  * Return the error if we are sending to the address
4424                  * that was returned in the last T_UDERROR_IND.
4425                  * If sending to some other address discard the delayed
4426                  * error indication.
4427                  */
4428                 if (sti->sti_delayed_error) {
4429                         struct T_uderror_ind    *tudi;
4430                         void                    *addr;
4431                         t_uscalar_t             addrlen;
4432                         boolean_t               match = B_FALSE;
4433 
4434                         ASSERT(sti->sti_eaddr_mp);
4435                         error = sti->sti_delayed_error;
4436                         sti->sti_delayed_error = 0;
4437                         tudi =
4438                             (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4439                         addrlen = tudi->DEST_length;
4440                         addr = sogetoff(sti->sti_eaddr_mp,
4441                             tudi->DEST_offset, addrlen, 1);
4442                         ASSERT(addr);   /* Checked by strsock_proto */
4443                         switch (so->so_family) {
4444                         case AF_INET: {
4445                                 /* Compare just IP address and port */
4446                                 sin_t *sin1 = (sin_t *)name;
4447                                 sin_t *sin2 = (sin_t *)addr;
4448 
4449                                 if (addrlen == sizeof (sin_t) &&
4450                                     namelen == addrlen &&
4451                                     sin1->sin_port == sin2->sin_port &&
4452                                     sin1->sin_addr.s_addr ==
4453                                     sin2->sin_addr.s_addr)
4454                                         match = B_TRUE;
4455                                 break;
4456                         }
4457                         case AF_INET6: {
4458                                 /* Compare just IP address and port. Not flow */
4459                                 sin6_t *sin1 = (sin6_t *)name;
4460                                 sin6_t *sin2 = (sin6_t *)addr;
4461 
4462                                 if (addrlen == sizeof (sin6_t) &&
4463                                     namelen == addrlen &&
4464                                     sin1->sin6_port == sin2->sin6_port &&
4465                                     IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4466                                     &sin2->sin6_addr))
4467                                         match = B_TRUE;
4468                                 break;
4469                         }
4470                         case AF_UNIX:
4471                         default:
4472                                 if (namelen == addrlen &&
4473                                     bcmp(name, addr, namelen) == 0)
4474                                         match = B_TRUE;
4475                         }
4476                         if (match) {
4477                                 freemsg(sti->sti_eaddr_mp);
4478                                 sti->sti_eaddr_mp = NULL;
4479                                 mutex_exit(&so->so_lock);
4480 #ifdef DEBUG
4481                                 dprintso(so, 0,
4482                                     ("sockfs delayed error %d for %s\n",
4483                                     error,
4484                                     pr_addr(so->so_family, name, namelen)));
4485 #endif /* DEBUG */
4486                                 return (error);
4487                         }
4488                         freemsg(sti->sti_eaddr_mp);
4489                         sti->sti_eaddr_mp = NULL;
4490                 }
4491         }
4492         mutex_exit(&so->so_lock);
4493 
4494         dontroute = 0;
4495         if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4496                 uint32_t        val;
4497 
4498                 val = 1;
4499                 error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4500                     &val, (t_uscalar_t)sizeof (val), cr);
4501                 if (error)
4502                         return (error);
4503                 dontroute = 1;
4504         }
4505 
4506         if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4507                 error = EOPNOTSUPP;
4508                 goto done;
4509         }
4510         if (msg->msg_controllen != 0) {
4511                 if (!(so_mode & SM_CONNREQUIRED)) {
4512                         so_update_attrs(so, SOMOD);
4513                         error = sosend_dgramcmsg(so, name, namelen, uiop,
4514                             msg->msg_control, msg->msg_controllen, flags);
4515                 } else {
4516                         if (flags & MSG_OOB) {
4517                                 /* Can't generate T_EXDATA_REQ with options */
4518                                 error = EOPNOTSUPP;
4519                                 goto done;
4520                         }
4521                         so_update_attrs(so, SOMOD);
4522                         error = sosend_svccmsg(so, uiop,
4523                             !(flags & MSG_EOR),
4524                             msg->msg_control, msg->msg_controllen,
4525                             flags);
4526                 }
4527                 goto done;
4528         }
4529 
4530         so_update_attrs(so, SOMOD);
4531         if (!(so_mode & SM_CONNREQUIRED)) {
4532                 /*
4533                  * If there is no SO_DONTROUTE to turn off return immediately
4534                  * from send_dgram. This can allow tail-call optimizations.
4535                  */
4536                 if (!dontroute) {
4537                         return (sosend_dgram(so, name, namelen, uiop, flags));
4538                 }
4539                 error = sosend_dgram(so, name, namelen, uiop, flags);
4540         } else {
4541                 t_scalar_t prim;
4542                 int sflag;
4543 
4544                 /* Ignore msg_name in the connected state */
4545                 if (flags & MSG_OOB) {
4546                         prim = T_EXDATA_REQ;
4547                         /*
4548                          * Send down T_EXDATA_REQ even if there is flow
4549                          * control for data.
4550                          */
4551                         sflag = MSG_IGNFLOW;
4552                 } else {
4553                         if (so_mode & SM_BYTESTREAM) {
4554                                 /* Byte stream transport - use write */
4555                                 dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4556 
4557                                 /* Send M_DATA messages */
4558                                 if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4559                                     (error = nl7c_data(so, uiop)) >= 0) {
4560                                         /* NL7C consumed the data */
4561                                         return (error);
4562                                 }
4563                                 /*
4564                                  * If there is no SO_DONTROUTE to turn off,
4565                                  * sti_direct is on, and there is no flow
4566                                  * control, we can take the fast path.
4567                                  */
4568                                 if (!dontroute && sti->sti_direct != 0 &&
4569                                     canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4570                                         return (sostream_direct(so, uiop,
4571                                             NULL, cr));
4572                                 }
4573                                 error = strwrite(SOTOV(so), uiop, cr);
4574                                 goto done;
4575                         }
4576                         prim = T_DATA_REQ;
4577                         sflag = 0;
4578                 }
4579                 /*
4580                  * If there is no SO_DONTROUTE to turn off return immediately
4581                  * from sosend_svc. This can allow tail-call optimizations.
4582                  */
4583                 if (!dontroute)
4584                         return (sosend_svc(so, uiop, prim,
4585                             !(flags & MSG_EOR), sflag));
4586                 error = sosend_svc(so, uiop, prim,
4587                     !(flags & MSG_EOR), sflag);
4588         }
4589         ASSERT(dontroute);
4590 done:
4591         if (dontroute) {
4592                 uint32_t        val;
4593 
4594                 val = 0;
4595                 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4596                     &val, (t_uscalar_t)sizeof (val), cr);
4597         }
4598         return (error);
4599 }
4600 
4601 /*
4602  * kstrwritemp() has very similar semantics as that of strwrite().
4603  * The main difference is it obtains mblks from the caller and also
4604  * does not do any copy as done in strwrite() from user buffers to
4605  * kernel buffers.
4606  *
4607  * Currently, this routine is used by sendfile to send data allocated
4608  * within the kernel without any copying. This interface does not use the
4609  * synchronous stream interface as synch. stream interface implies
4610  * copying.
4611  */
4612 int
4613 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4614 {
4615         struct stdata *stp;
4616         struct queue *wqp;
4617         mblk_t *newmp;
4618         char waitflag;
4619         int tempmode;
4620         int error = 0;
4621         int done = 0;
4622         struct sonode *so;
4623         boolean_t direct;
4624 
4625         ASSERT(vp->v_stream);
4626         stp = vp->v_stream;
4627 
4628         so = VTOSO(vp);
4629         direct = _SOTOTPI(so)->sti_direct;
4630 
4631         /*
4632          * This is the sockfs direct fast path. canputnext() need
4633          * not be accurate so we don't grab the sd_lock here. If
4634          * we get flow-controlled, we grab sd_lock just before the
4635          * do..while loop below to emulate what strwrite() does.
4636          */
4637         wqp = stp->sd_wrq;
4638         if (canputnext(wqp) && direct &&
4639             !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4640                 return (sostream_direct(so, NULL, mp, CRED()));
4641         } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4642                 /* Fast check of flags before acquiring the lock */
4643                 mutex_enter(&stp->sd_lock);
4644                 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4645                 mutex_exit(&stp->sd_lock);
4646                 if (error != 0) {
4647                         if (!(stp->sd_flag & STPLEX) &&
4648                             (stp->sd_wput_opt & SW_SIGPIPE)) {
4649                                 error = EPIPE;
4650                         }
4651                         return (error);
4652                 }
4653         }
4654 
4655         waitflag = WRITEWAIT;
4656         if (stp->sd_flag & OLDNDELAY)
4657                 tempmode = fmode & ~FNDELAY;
4658         else
4659                 tempmode = fmode;
4660 
4661         mutex_enter(&stp->sd_lock);
4662         do {
4663                 if (canputnext(wqp)) {
4664                         mutex_exit(&stp->sd_lock);
4665                         if (stp->sd_wputdatafunc != NULL) {
4666                                 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4667                                     NULL, NULL, NULL);
4668                                 if (newmp == NULL) {
4669                                         /* The caller will free mp */
4670                                         return (ECOMM);
4671                                 }
4672                                 mp = newmp;
4673                         }
4674                         putnext(wqp, mp);
4675                         return (0);
4676                 }
4677                 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4678                     &done);
4679         } while (error == 0 && !done);
4680 
4681         mutex_exit(&stp->sd_lock);
4682         /*
4683          * EAGAIN tells the application to try again. ENOMEM
4684          * is returned only if the memory allocation size
4685          * exceeds the physical limits of the system. ENOMEM
4686          * can't be true here.
4687          */
4688         if (error == ENOMEM)
4689                 error = EAGAIN;
4690         return (error);
4691 }
4692 
4693 /* ARGSUSED */
4694 static int
4695 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4696     struct cred *cr, mblk_t **mpp)
4697 {
4698         int error;
4699 
4700         switch (so->so_family) {
4701         case AF_INET:
4702         case AF_INET6:
4703         case AF_UNIX:
4704                 break;
4705         default:
4706                 return (EAFNOSUPPORT);
4707 
4708         }
4709 
4710         if (so->so_state & SS_CANTSENDMORE)
4711                 return (EPIPE);
4712 
4713         if (so->so_type != SOCK_STREAM)
4714                 return (EOPNOTSUPP);
4715 
4716         if ((so->so_state & SS_ISCONNECTED) == 0)
4717                 return (ENOTCONN);
4718 
4719         error = kstrwritemp(so->so_vnode, *mpp, fflag);
4720         if (error == 0)
4721                 *mpp = NULL;
4722         return (error);
4723 }
4724 
4725 /*
4726  * Sending data on a datagram socket.
4727  * Assumes caller has verified that SS_ISBOUND etc. are set.
4728  */
4729 /* ARGSUSED */
4730 static int
4731 sodgram_direct(struct sonode *so, struct sockaddr *name,
4732     socklen_t namelen, struct uio *uiop, int flags)
4733 {
4734         struct T_unitdata_req   tudr;
4735         mblk_t                  *mp = NULL;
4736         int                     error = 0;
4737         void                    *addr;
4738         socklen_t               addrlen;
4739         ssize_t                 len;
4740         struct stdata           *stp = SOTOV(so)->v_stream;
4741         int                     so_state;
4742         queue_t                 *udp_wq;
4743         boolean_t               connected;
4744         mblk_t                  *mpdata = NULL;
4745         sotpi_info_t            *sti = SOTOTPI(so);
4746         uint32_t                auditing = AU_AUDITING();
4747 
4748         ASSERT(name != NULL && namelen != 0);
4749         ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4750         ASSERT(!(so->so_mode & SM_EXDATA));
4751         ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4752         ASSERT(SOTOV(so)->v_type == VSOCK);
4753 
4754         /* Caller checked for proper length */
4755         len = uiop->uio_resid;
4756         ASSERT(len <= sti->sti_tidu_size);
4757 
4758         /* Length and family checks have been done by caller */
4759         ASSERT(name->sa_family == so->so_family);
4760         ASSERT(so->so_family == AF_INET ||
4761             (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4762         ASSERT(so->so_family == AF_INET6 ||
4763             (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4764 
4765         addr = name;
4766         addrlen = namelen;
4767 
4768         if (stp->sd_sidp != NULL &&
4769             (error = straccess(stp, JCWRITE)) != 0)
4770                 goto done;
4771 
4772         so_state = so->so_state;
4773 
4774         connected = so_state & SS_ISCONNECTED;
4775         if (!connected) {
4776                 tudr.PRIM_type = T_UNITDATA_REQ;
4777                 tudr.DEST_length = addrlen;
4778                 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4779                 tudr.OPT_length = 0;
4780                 tudr.OPT_offset = 0;
4781 
4782                 mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4783                     _ALLOC_INTR, CRED());
4784                 if (mp == NULL) {
4785                         /*
4786                          * Caught a signal waiting for memory.
4787                          * Let send* return EINTR.
4788                          */
4789                         error = EINTR;
4790                         goto done;
4791                 }
4792         }
4793 
4794         /*
4795          * For UDP we don't break up the copyin into smaller pieces
4796          * as in the TCP case.  That means if ENOMEM is returned by
4797          * mcopyinuio() then the uio vector has not been modified at
4798          * all and we fallback to either strwrite() or kstrputmsg()
4799          * below.  Note also that we never generate priority messages
4800          * from here.
4801          */
4802         udp_wq = stp->sd_wrq->q_next;
4803         if (canput(udp_wq) &&
4804             (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4805                 ASSERT(DB_TYPE(mpdata) == M_DATA);
4806                 ASSERT(uiop->uio_resid == 0);
4807                 if (!connected)
4808                         linkb(mp, mpdata);
4809                 else
4810                         mp = mpdata;
4811                 if (auditing)
4812                         audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4813 
4814                 /* Always returns 0... */
4815                 return (udp_wput(udp_wq, mp));
4816         }
4817 
4818         ASSERT(mpdata == NULL);
4819         if (error != 0 && error != ENOMEM) {
4820                 freemsg(mp);
4821                 return (error);
4822         }
4823 
4824         /*
4825          * For connected, let strwrite() handle the blocking case.
4826          * Otherwise we fall thru and use kstrputmsg().
4827          */
4828         if (connected)
4829                 return (strwrite(SOTOV(so), uiop, CRED()));
4830 
4831         if (auditing)
4832                 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4833 
4834         error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4835 done:
4836 #ifdef SOCK_DEBUG
4837         if (error != 0) {
4838                 eprintsoline(so, error);
4839         }
4840 #endif /* SOCK_DEBUG */
4841         return (error);
4842 }
4843 
4844 int
4845 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4846 {
4847         struct stdata *stp = SOTOV(so)->v_stream;
4848         ssize_t iosize, rmax, maxblk;
4849         queue_t *tcp_wq = stp->sd_wrq->q_next;
4850         mblk_t *newmp;
4851         int error = 0, wflag = 0;
4852 
4853         ASSERT(so->so_mode & SM_BYTESTREAM);
4854         ASSERT(SOTOV(so)->v_type == VSOCK);
4855 
4856         if (stp->sd_sidp != NULL &&
4857             (error = straccess(stp, JCWRITE)) != 0)
4858                 return (error);
4859 
4860         if (uiop == NULL) {
4861                 /*
4862                  * kstrwritemp() should have checked sd_flag and
4863                  * flow-control before coming here.  If we end up
4864                  * here it means that we can simply pass down the
4865                  * data to tcp.
4866                  */
4867                 ASSERT(mp != NULL);
4868                 if (stp->sd_wputdatafunc != NULL) {
4869                         newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4870                             NULL, NULL, NULL);
4871                         if (newmp == NULL) {
4872                                 /* The caller will free mp */
4873                                 return (ECOMM);
4874                         }
4875                         mp = newmp;
4876                 }
4877                 /* Always returns 0... */
4878                 return (tcp_wput(tcp_wq, mp));
4879         }
4880 
4881         /* Fallback to strwrite() to do proper error handling */
4882         if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4883                 return (strwrite(SOTOV(so), uiop, cr));
4884 
4885         rmax = stp->sd_qn_maxpsz;
4886         ASSERT(rmax >= 0 || rmax == INFPSZ);
4887         if (rmax == 0 || uiop->uio_resid <= 0)
4888                 return (0);
4889 
4890         if (rmax == INFPSZ)
4891                 rmax = uiop->uio_resid;
4892 
4893         maxblk = stp->sd_maxblk;
4894 
4895         for (;;) {
4896                 iosize = MIN(uiop->uio_resid, rmax);
4897 
4898                 mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4899                 if (mp == NULL) {
4900                         /*
4901                          * Fallback to strwrite() for ENOMEM; if this
4902                          * is our first time in this routine and the uio
4903                          * vector has not been modified, we will end up
4904                          * calling strwrite() without any flag set.
4905                          */
4906                         if (error == ENOMEM)
4907                                 goto slow_send;
4908                         else
4909                                 return (error);
4910                 }
4911                 ASSERT(uiop->uio_resid >= 0);
4912                 /*
4913                  * If mp is non-NULL and ENOMEM is set, it means that
4914                  * mcopyinuio() was able to break down some of the user
4915                  * data into one or more mblks.  Send the partial data
4916                  * to tcp and let the rest be handled in strwrite().
4917                  */
4918                 ASSERT(error == 0 || error == ENOMEM);
4919                 if (stp->sd_wputdatafunc != NULL) {
4920                         newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4921                             NULL, NULL, NULL);
4922                         if (newmp == NULL) {
4923                                 /* The caller will free mp */
4924                                 return (ECOMM);
4925                         }
4926                         mp = newmp;
4927                 }
4928                 (void) tcp_wput(tcp_wq, mp);    /* Always returns 0 anyway. */
4929 
4930                 wflag |= NOINTR;
4931 
4932                 if (uiop->uio_resid == 0) {  /* No more data; we're done */
4933                         ASSERT(error == 0);
4934                         break;
4935                 } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4936                     (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4937 slow_send:
4938                         /*
4939                          * We were able to send down partial data using
4940                          * the direct call interface, but are now relying
4941                          * on strwrite() to handle the non-fastpath cases.
4942                          * If the socket is blocking we will sleep in
4943                          * strwaitq() until write is permitted, otherwise,
4944                          * we will need to return the amount of bytes
4945                          * written so far back to the app.  This is the
4946                          * reason why we pass NOINTR flag to strwrite()
4947                          * for non-blocking socket, because we don't want
4948                          * to return EAGAIN when portion of the user data
4949                          * has actually been sent down.
4950                          */
4951                         return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4952                 }
4953         }
4954         return (0);
4955 }
4956 
4957 /*
4958  * Update sti_faddr by asking the transport (unless AF_UNIX).
4959  */
4960 /* ARGSUSED */
4961 int
4962 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4963     boolean_t accept, struct cred *cr)
4964 {
4965         struct strbuf   strbuf;
4966         int             error = 0, res;
4967         void            *addr;
4968         t_uscalar_t     addrlen;
4969         k_sigset_t      smask;
4970         sotpi_info_t    *sti = SOTOTPI(so);
4971 
4972         dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4973             (void *)so, pr_state(so->so_state, so->so_mode)));
4974 
4975         ASSERT(*namelen > 0);
4976         mutex_enter(&so->so_lock);
4977         so_lock_single(so);     /* Set SOLOCKED */
4978 
4979         if (accept) {
4980                 bcopy(sti->sti_faddr_sa, name,
4981                     MIN(*namelen, sti->sti_faddr_len));
4982                 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4983                 goto done;
4984         }
4985 
4986         if (!(so->so_state & SS_ISCONNECTED)) {
4987                 error = ENOTCONN;
4988                 goto done;
4989         }
4990         /* Added this check for X/Open */
4991         if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4992                 error = EINVAL;
4993                 if (xnet_check_print) {
4994                         printf("sockfs: X/Open getpeername check => EINVAL\n");
4995                 }
4996                 goto done;
4997         }
4998 
4999         if (sti->sti_faddr_valid) {
5000                 bcopy(sti->sti_faddr_sa, name,
5001                     MIN(*namelen, sti->sti_faddr_len));
5002                 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
5003                 goto done;
5004         }
5005 
5006 #ifdef DEBUG
5007         dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
5008             pr_addr(so->so_family, sti->sti_faddr_sa,
5009             (t_uscalar_t)sti->sti_faddr_len)));
5010 #endif /* DEBUG */
5011 
5012         if (so->so_family == AF_UNIX) {
5013                 /* Transport has different name space - return local info */
5014                 if (sti->sti_faddr_noxlate)
5015                         *namelen = 0;
5016                 error = 0;
5017                 goto done;
5018         }
5019 
5020         ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
5021 
5022         ASSERT(sti->sti_faddr_sa);
5023         /* Allocate local buffer to use with ioctl */
5024         addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
5025         mutex_exit(&so->so_lock);
5026         addr = kmem_alloc(addrlen, KM_SLEEP);
5027 
5028         /*
5029          * Issue TI_GETPEERNAME with signals masked.
5030          * Put the result in sti_faddr_sa so that getpeername works after
5031          * a shutdown(output).
5032          * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5033          * back to the socket.
5034          */
5035         strbuf.buf = addr;
5036         strbuf.maxlen = addrlen;
5037         strbuf.len = 0;
5038 
5039         sigintr(&smask, 0);
5040         res = 0;
5041         ASSERT(cr);
5042         error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
5043             0, K_TO_K, cr, &res);
5044         sigunintr(&smask);
5045 
5046         mutex_enter(&so->so_lock);
5047         /*
5048          * If there is an error record the error in so_error put don't fail
5049          * the getpeername. Instead fallback on the recorded
5050          * sti->sti_faddr_sa.
5051          */
5052         if (error) {
5053                 /*
5054                  * Various stream head errors can be returned to the ioctl.
5055                  * However, it is impossible to determine which ones of
5056                  * these are really socket level errors that were incorrectly
5057                  * consumed by the ioctl. Thus this code silently ignores the
5058                  * error - to code explicitly does not reinstate the error
5059                  * using soseterror().
5060                  * Experiments have shows that at least this set of
5061                  * errors are reported and should not be reinstated on the
5062                  * socket:
5063                  *      EINVAL  E.g. if an I_LINK was in effect when
5064                  *              getpeername was called.
5065                  *      EPIPE   The ioctl error semantics prefer the write
5066                  *              side error over the read side error.
5067                  *      ENOTCONN The transport just got disconnected but
5068                  *              sockfs had not yet seen the T_DISCON_IND
5069                  *              when issuing the ioctl.
5070                  */
5071                 error = 0;
5072         } else if (res == 0 && strbuf.len > 0 &&
5073             (so->so_state & SS_ISCONNECTED)) {
5074                 ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
5075                 sti->sti_faddr_len = (socklen_t)strbuf.len;
5076                 bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
5077                 sti->sti_faddr_valid = 1;
5078 
5079                 bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
5080                 *namelen = sti->sti_faddr_len;
5081         }
5082         kmem_free(addr, addrlen);
5083 #ifdef DEBUG
5084         dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
5085             pr_addr(so->so_family, sti->sti_faddr_sa,
5086             (t_uscalar_t)sti->sti_faddr_len)));
5087 #endif /* DEBUG */
5088 done:
5089         so_unlock_single(so, SOLOCKED);
5090         mutex_exit(&so->so_lock);
5091         return (error);
5092 }
5093 
5094 /*
5095  * Update sti_laddr by asking the transport (unless AF_UNIX).
5096  */
5097 int
5098 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5099     struct cred *cr)
5100 {
5101         struct strbuf   strbuf;
5102         int             error = 0, res;
5103         void            *addr;
5104         t_uscalar_t     addrlen;
5105         k_sigset_t      smask;
5106         sotpi_info_t    *sti = SOTOTPI(so);
5107 
5108         dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5109             (void *)so, pr_state(so->so_state, so->so_mode)));
5110 
5111         ASSERT(*namelen > 0);
5112         mutex_enter(&so->so_lock);
5113         so_lock_single(so);     /* Set SOLOCKED */
5114 
5115 #ifdef DEBUG
5116 
5117         dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5118             pr_addr(so->so_family, sti->sti_laddr_sa,
5119             (t_uscalar_t)sti->sti_laddr_len)));
5120 #endif /* DEBUG */
5121         if (sti->sti_laddr_valid) {
5122                 bcopy(sti->sti_laddr_sa, name,
5123                     MIN(*namelen, sti->sti_laddr_len));
5124                 *namelen = sti->sti_laddr_len;
5125                 goto done;
5126         }
5127 
5128         if (so->so_family == AF_UNIX) {
5129                 /*
5130                  * Transport has different name space - return local info. If we
5131                  * have enough space, let consumers know the family.
5132                  */
5133                 if (*namelen >= sizeof (sa_family_t)) {
5134                         name->sa_family = AF_UNIX;
5135                         *namelen = sizeof (sa_family_t);
5136                 } else {
5137                         *namelen = 0;
5138                 }
5139                 error = 0;
5140                 goto done;
5141         }
5142         if (!(so->so_state & SS_ISBOUND)) {
5143                 /* If not bound, then nothing to return. */
5144                 error = 0;
5145                 goto done;
5146         }
5147 
5148         /* Allocate local buffer to use with ioctl */
5149         addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5150         mutex_exit(&so->so_lock);
5151         addr = kmem_alloc(addrlen, KM_SLEEP);
5152 
5153         /*
5154          * Issue TI_GETMYNAME with signals masked.
5155          * Put the result in sti_laddr_sa so that getsockname works after
5156          * a shutdown(output).
5157          * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5158          * back to the socket.
5159          */
5160         strbuf.buf = addr;
5161         strbuf.maxlen = addrlen;
5162         strbuf.len = 0;
5163 
5164         sigintr(&smask, 0);
5165         res = 0;
5166         ASSERT(cr);
5167         error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5168             0, K_TO_K, cr, &res);
5169         sigunintr(&smask);
5170 
5171         mutex_enter(&so->so_lock);
5172         /*
5173          * If there is an error record the error in so_error put don't fail
5174          * the getsockname. Instead fallback on the recorded
5175          * sti->sti_laddr_sa.
5176          */
5177         if (error) {
5178                 /*
5179                  * Various stream head errors can be returned to the ioctl.
5180                  * However, it is impossible to determine which ones of
5181                  * these are really socket level errors that were incorrectly
5182                  * consumed by the ioctl. Thus this code silently ignores the
5183                  * error - to code explicitly does not reinstate the error
5184                  * using soseterror().
5185                  * Experiments have shows that at least this set of
5186                  * errors are reported and should not be reinstated on the
5187                  * socket:
5188                  *      EINVAL  E.g. if an I_LINK was in effect when
5189                  *              getsockname was called.
5190                  *      EPIPE   The ioctl error semantics prefer the write
5191                  *              side error over the read side error.
5192                  */
5193                 error = 0;
5194         } else if (res == 0 && strbuf.len > 0 &&
5195             (so->so_state & SS_ISBOUND)) {
5196                 ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5197                 sti->sti_laddr_len = (socklen_t)strbuf.len;
5198                 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5199                 sti->sti_laddr_valid = 1;
5200 
5201                 bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5202                 *namelen = sti->sti_laddr_len;
5203         }
5204         kmem_free(addr, addrlen);
5205 #ifdef DEBUG
5206         dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5207             pr_addr(so->so_family, sti->sti_laddr_sa,
5208             (t_uscalar_t)sti->sti_laddr_len)));
5209 #endif /* DEBUG */
5210 done:
5211         so_unlock_single(so, SOLOCKED);
5212         mutex_exit(&so->so_lock);
5213         return (error);
5214 }
5215 
5216 /*
5217  * Get socket options. For SOL_SOCKET options some options are handled
5218  * by the sockfs while others use the value recorded in the sonode as a
5219  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5220  *
5221  * On the return most *optlenp bytes are copied to optval.
5222  */
5223 /* ARGSUSED */
5224 int
5225 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5226     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5227 {
5228         struct T_optmgmt_req    optmgmt_req;
5229         struct T_optmgmt_ack    *optmgmt_ack;
5230         struct opthdr           oh;
5231         struct opthdr           *opt_res;
5232         mblk_t                  *mp = NULL;
5233         int                     error = 0;
5234         void                    *option = NULL; /* Set if fallback value */
5235         t_uscalar_t             maxlen = *optlenp;
5236         t_uscalar_t             len;
5237         uint32_t                value;
5238         struct timeval          tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5239         struct timeval32        tmo_val32;
5240         struct so_snd_bufinfo   snd_bufinfo;    /* used for zero copy */
5241 
5242         dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5243             (void *)so, level, option_name, optval, (void *)optlenp,
5244             pr_state(so->so_state, so->so_mode)));
5245 
5246         mutex_enter(&so->so_lock);
5247         so_lock_single(so);     /* Set SOLOCKED */
5248 
5249         /*
5250          * Check for SOL_SOCKET options.
5251          * Certain SOL_SOCKET options are returned directly whereas
5252          * others only provide a default (fallback) value should
5253          * the T_SVR4_OPTMGMT_REQ fail.
5254          */
5255         if (level == SOL_SOCKET) {
5256                 /* Check parameters */
5257                 switch (option_name) {
5258                 case SO_TYPE:
5259                 case SO_ERROR:
5260                 case SO_DEBUG:
5261                 case SO_ACCEPTCONN:
5262                 case SO_REUSEADDR:
5263                 case SO_KEEPALIVE:
5264                 case SO_DONTROUTE:
5265                 case SO_BROADCAST:
5266                 case SO_USELOOPBACK:
5267                 case SO_OOBINLINE:
5268                 case SO_SNDBUF:
5269                 case SO_RCVBUF:
5270 #ifdef notyet
5271                 case SO_SNDLOWAT:
5272                 case SO_RCVLOWAT:
5273 #endif /* notyet */
5274                 case SO_DOMAIN:
5275                 case SO_DGRAM_ERRIND:
5276                         if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5277                                 error = EINVAL;
5278                                 eprintsoline(so, error);
5279                                 goto done2;
5280                         }
5281                         break;
5282                 case SO_RCVTIMEO:
5283                 case SO_SNDTIMEO:
5284                         if (get_udatamodel() == DATAMODEL_NONE ||
5285                             get_udatamodel() == DATAMODEL_NATIVE) {
5286                                 if (maxlen < sizeof (struct timeval)) {
5287                                         error = EINVAL;
5288                                         eprintsoline(so, error);
5289                                         goto done2;
5290                                 }
5291                         } else {
5292                                 if (maxlen < sizeof (struct timeval32)) {
5293                                         error = EINVAL;
5294                                         eprintsoline(so, error);
5295                                         goto done2;
5296                                 }
5297 
5298                         }
5299                         break;
5300                 case SO_LINGER:
5301                         if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5302                                 error = EINVAL;
5303                                 eprintsoline(so, error);
5304                                 goto done2;
5305                         }
5306                         break;
5307                 case SO_SND_BUFINFO:
5308                         if (maxlen < (t_uscalar_t)
5309                             sizeof (struct so_snd_bufinfo)) {
5310                                 error = EINVAL;
5311                                 eprintsoline(so, error);
5312                                 goto done2;
5313                         }
5314                         break;
5315                 }
5316 
5317                 len = (t_uscalar_t)sizeof (uint32_t);   /* Default */
5318 
5319                 switch (option_name) {
5320                 case SO_TYPE:
5321                         value = so->so_type;
5322                         option = &value;
5323                         goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5324 
5325                 case SO_ERROR:
5326                         value = sogeterr(so, B_TRUE);
5327                         option = &value;
5328                         goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5329 
5330                 case SO_ACCEPTCONN:
5331                         if (so->so_state & SS_ACCEPTCONN)
5332                                 value = SO_ACCEPTCONN;
5333                         else
5334                                 value = 0;
5335 #ifdef DEBUG
5336                         if (value) {
5337                                 dprintso(so, 1,
5338                                     ("sotpi_getsockopt: 0x%x is set\n",
5339                                     option_name));
5340                         } else {
5341                                 dprintso(so, 1,
5342                                     ("sotpi_getsockopt: 0x%x not set\n",
5343                                     option_name));
5344                         }
5345 #endif /* DEBUG */
5346                         option = &value;
5347                         goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5348 
5349                 case SO_DEBUG:
5350                 case SO_REUSEADDR:
5351                 case SO_KEEPALIVE:
5352                 case SO_DONTROUTE:
5353                 case SO_BROADCAST:
5354                 case SO_USELOOPBACK:
5355                 case SO_OOBINLINE:
5356                 case SO_DGRAM_ERRIND:
5357                         value = (so->so_options & option_name);
5358 #ifdef DEBUG
5359                         if (value) {
5360                                 dprintso(so, 1,
5361                                     ("sotpi_getsockopt: 0x%x is set\n",
5362                                     option_name));
5363                         } else {
5364                                 dprintso(so, 1,
5365                                     ("sotpi_getsockopt: 0x%x not set\n",
5366                                     option_name));
5367                         }
5368 #endif /* DEBUG */
5369                         option = &value;
5370                         goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5371 
5372                 /*
5373                  * The following options are only returned by sockfs when the
5374                  * T_SVR4_OPTMGMT_REQ fails.
5375                  */
5376                 case SO_LINGER:
5377                         option = &so->so_linger;
5378                         len = (t_uscalar_t)sizeof (struct linger);
5379                         break;
5380                 case SO_SNDBUF: {
5381                         ssize_t lvalue;
5382 
5383                         /*
5384                          * If the option has not been set then get a default
5385                          * value from the read queue. This value is
5386                          * returned if the transport fails
5387                          * the T_SVR4_OPTMGMT_REQ.
5388                          */
5389                         lvalue = so->so_sndbuf;
5390                         if (lvalue == 0) {
5391                                 mutex_exit(&so->so_lock);
5392                                 (void) strqget(strvp2wq(SOTOV(so))->q_next,
5393                                     QHIWAT, 0, &lvalue);
5394                                 mutex_enter(&so->so_lock);
5395                                 dprintso(so, 1,
5396                                     ("got SO_SNDBUF %ld from q\n", lvalue));
5397                         }
5398                         value = (int)lvalue;
5399                         option = &value;
5400                         len = (t_uscalar_t)sizeof (so->so_sndbuf);
5401                         break;
5402                 }
5403                 case SO_RCVBUF: {
5404                         ssize_t lvalue;
5405 
5406                         /*
5407                          * If the option has not been set then get a default
5408                          * value from the read queue. This value is
5409                          * returned if the transport fails
5410                          * the T_SVR4_OPTMGMT_REQ.
5411                          *
5412                          * XXX If SO_RCVBUF has been set and this is an
5413                          * XPG 4.2 application then do not ask the transport
5414                          * since the transport might adjust the value and not
5415                          * return exactly what was set by the application.
5416                          * For non-XPG 4.2 application we return the value
5417                          * that the transport is actually using.
5418                          */
5419                         lvalue = so->so_rcvbuf;
5420                         if (lvalue == 0) {
5421                                 mutex_exit(&so->so_lock);
5422                                 (void) strqget(RD(strvp2wq(SOTOV(so))),
5423                                     QHIWAT, 0, &lvalue);
5424                                 mutex_enter(&so->so_lock);
5425                                 dprintso(so, 1,
5426                                     ("got SO_RCVBUF %ld from q\n", lvalue));
5427                         } else if (flags & _SOGETSOCKOPT_XPG4_2) {
5428                                 value = (int)lvalue;
5429                                 option = &value;
5430                                 goto copyout;   /* skip asking transport */
5431                         }
5432                         value = (int)lvalue;
5433                         option = &value;
5434                         len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5435                         break;
5436                 }
5437                 case SO_DOMAIN:
5438                         value = so->so_family;
5439                         option = &value;
5440                         goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5441 
5442 #ifdef notyet
5443                 /*
5444                  * We do not implement the semantics of these options
5445                  * thus we shouldn't implement the options either.
5446                  */
5447                 case SO_SNDLOWAT:
5448                         value = so->so_sndlowat;
5449                         option = &value;
5450                         break;
5451                 case SO_RCVLOWAT:
5452                         value = so->so_rcvlowat;
5453                         option = &value;
5454                         break;
5455 #endif /* notyet */
5456                 case SO_SNDTIMEO:
5457                 case SO_RCVTIMEO: {
5458                         clock_t val;
5459 
5460                         if (option_name == SO_RCVTIMEO)
5461                                 val = drv_hztousec(so->so_rcvtimeo);
5462                         else
5463                                 val = drv_hztousec(so->so_sndtimeo);
5464                         tmo_val.tv_sec = val / (1000 * 1000);
5465                         tmo_val.tv_usec = val % (1000 * 1000);
5466                         if (get_udatamodel() == DATAMODEL_NONE ||
5467                             get_udatamodel() == DATAMODEL_NATIVE) {
5468                                 option = &tmo_val;
5469                                 len = sizeof (struct timeval);
5470                         } else {
5471                                 TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5472                                 option = &tmo_val32;
5473                                 len = sizeof (struct timeval32);
5474                         }
5475                         break;
5476                 }
5477                 case SO_SND_BUFINFO: {
5478                         snd_bufinfo.sbi_wroff =
5479                             (so->so_proto_props).sopp_wroff;
5480                         snd_bufinfo.sbi_maxblk =
5481                             (so->so_proto_props).sopp_maxblk;
5482                         snd_bufinfo.sbi_maxpsz =
5483                             (so->so_proto_props).sopp_maxpsz;
5484                         snd_bufinfo.sbi_tail =
5485                             (so->so_proto_props).sopp_tail;
5486                         option = &snd_bufinfo;
5487                         len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5488                         break;
5489                 }
5490                 }
5491         }
5492 
5493         mutex_exit(&so->so_lock);
5494 
5495         /* Send request */
5496         optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5497         optmgmt_req.MGMT_flags = T_CHECK;
5498         optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5499         optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5500 
5501         oh.level = level;
5502         oh.name = option_name;
5503         oh.len = maxlen;
5504 
5505         mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5506             &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5507         /* Let option management work in the presence of data flow control */
5508         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5509             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5510         mp = NULL;
5511         mutex_enter(&so->so_lock);
5512         if (error) {
5513                 eprintsoline(so, error);
5514                 goto done2;
5515         }
5516         error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5517             (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5518         if (error) {
5519                 if (option != NULL) {
5520                         /* We have a fallback value */
5521                         error = 0;
5522                         goto copyout;
5523                 }
5524                 eprintsoline(so, error);
5525                 goto done2;
5526         }
5527         ASSERT(mp);
5528         optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5529         opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5530             optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5531         if (opt_res == NULL) {
5532                 if (option != NULL) {
5533                         /* We have a fallback value */
5534                         error = 0;
5535                         goto copyout;
5536                 }
5537                 error = EPROTO;
5538                 eprintsoline(so, error);
5539                 goto done;
5540         }
5541         option = &opt_res[1];
5542 
5543         /* check to ensure that the option is within bounds */
5544         if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5545             (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5546                 if (option != NULL) {
5547                         /* We have a fallback value */
5548                         error = 0;
5549                         goto copyout;
5550                 }
5551                 error = EPROTO;
5552                 eprintsoline(so, error);
5553                 goto done;
5554         }
5555 
5556         len = opt_res->len;
5557 
5558 copyout: {
5559                 t_uscalar_t size = MIN(len, maxlen);
5560                 bcopy(option, optval, size);
5561                 bcopy(&size, optlenp, sizeof (size));
5562         }
5563 done:
5564         freemsg(mp);
5565 done2:
5566         so_unlock_single(so, SOLOCKED);
5567         mutex_exit(&so->so_lock);
5568 
5569         return (error);
5570 }
5571 
5572 /*
5573  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5574  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5575  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5576  * setsockopt has to work even if the transport does not support the option.
5577  */
5578 /* ARGSUSED */
5579 int
5580 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5581     const void *optval, t_uscalar_t optlen, struct cred *cr)
5582 {
5583         struct T_optmgmt_req    optmgmt_req;
5584         struct opthdr           oh;
5585         mblk_t                  *mp;
5586         int                     error = 0;
5587         boolean_t               handled = B_FALSE;
5588 
5589         dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5590             (void *)so, level, option_name, optval, optlen,
5591             pr_state(so->so_state, so->so_mode)));
5592 
5593         /* X/Open requires this check */
5594         if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5595                 if (xnet_check_print)
5596                         printf("sockfs: X/Open setsockopt check => EINVAL\n");
5597                 return (EINVAL);
5598         }
5599 
5600         mutex_enter(&so->so_lock);
5601         so_lock_single(so);     /* Set SOLOCKED */
5602         mutex_exit(&so->so_lock);
5603 
5604         optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5605         optmgmt_req.MGMT_flags = T_NEGOTIATE;
5606         optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5607         optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5608 
5609         oh.level = level;
5610         oh.name = option_name;
5611         oh.len = optlen;
5612 
5613         mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5614             &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5615         /* Let option management work in the presence of data flow control */
5616         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5617             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5618         mp = NULL;
5619         mutex_enter(&so->so_lock);
5620         if (error) {
5621                 eprintsoline(so, error);
5622                 goto done2;
5623         }
5624         error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5625             (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5626         if (error) {
5627                 eprintsoline(so, error);
5628                 goto done;
5629         }
5630         ASSERT(mp);
5631         /* No need to verify T_optmgmt_ack */
5632         freemsg(mp);
5633 done:
5634         /*
5635          * Check for SOL_SOCKET options and record their values.
5636          * If we know about a SOL_SOCKET parameter and the transport
5637          * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5638          * EPROTO) we let the setsockopt succeed.
5639          */
5640         if (level == SOL_SOCKET) {
5641                 /* Check parameters */
5642                 switch (option_name) {
5643                 case SO_DEBUG:
5644                 case SO_REUSEADDR:
5645                 case SO_KEEPALIVE:
5646                 case SO_DONTROUTE:
5647                 case SO_BROADCAST:
5648                 case SO_USELOOPBACK:
5649                 case SO_OOBINLINE:
5650                 case SO_SNDBUF:
5651                 case SO_RCVBUF:
5652 #ifdef notyet
5653                 case SO_SNDLOWAT:
5654                 case SO_RCVLOWAT:
5655 #endif /* notyet */
5656                 case SO_DGRAM_ERRIND:
5657                         if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5658                                 error = EINVAL;
5659                                 eprintsoline(so, error);
5660                                 goto done2;
5661                         }
5662                         ASSERT(optval);
5663                         handled = B_TRUE;
5664                         break;
5665                 case SO_SNDTIMEO:
5666                 case SO_RCVTIMEO:
5667                         if (get_udatamodel() == DATAMODEL_NONE ||
5668                             get_udatamodel() == DATAMODEL_NATIVE) {
5669                                 if (optlen != sizeof (struct timeval)) {
5670                                         error = EINVAL;
5671                                         eprintsoline(so, error);
5672                                         goto done2;
5673                                 }
5674                         } else {
5675                                 if (optlen != sizeof (struct timeval32)) {
5676                                         error = EINVAL;
5677                                         eprintsoline(so, error);
5678                                         goto done2;
5679                                 }
5680                         }
5681                         ASSERT(optval);
5682                         handled = B_TRUE;
5683                         break;
5684                 case SO_LINGER:
5685                         if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5686                                 error = EINVAL;
5687                                 eprintsoline(so, error);
5688                                 goto done2;
5689                         }
5690                         ASSERT(optval);
5691                         handled = B_TRUE;
5692                         break;
5693                 }
5694 
5695 #define intvalue        (*(int32_t *)optval)
5696 
5697                 switch (option_name) {
5698                 case SO_TYPE:
5699                 case SO_ERROR:
5700                 case SO_ACCEPTCONN:
5701                         /* Can't be set */
5702                         error = ENOPROTOOPT;
5703                         goto done2;
5704                 case SO_LINGER: {
5705                         struct linger *l = (struct linger *)optval;
5706 
5707                         so->so_linger.l_linger = l->l_linger;
5708                         if (l->l_onoff) {
5709                                 so->so_linger.l_onoff = SO_LINGER;
5710                                 so->so_options |= SO_LINGER;
5711                         } else {
5712                                 so->so_linger.l_onoff = 0;
5713                                 so->so_options &= ~SO_LINGER;
5714                         }
5715                         break;
5716                 }
5717 
5718                 case SO_DEBUG:
5719 #ifdef SOCK_TEST
5720                         if (intvalue & 2)
5721                                 sock_test_timelimit = 10 * hz;
5722                         else
5723                                 sock_test_timelimit = 0;
5724 
5725                         if (intvalue & 4)
5726                                 do_useracc = 0;
5727                         else
5728                                 do_useracc = 1;
5729 #endif /* SOCK_TEST */
5730                         /* FALLTHRU */
5731                 case SO_REUSEADDR:
5732                 case SO_KEEPALIVE:
5733                 case SO_DONTROUTE:
5734                 case SO_BROADCAST:
5735                 case SO_USELOOPBACK:
5736                 case SO_OOBINLINE:
5737                 case SO_DGRAM_ERRIND:
5738                         if (intvalue != 0) {
5739                                 dprintso(so, 1,
5740                                     ("socket_setsockopt: setting 0x%x\n",
5741                                     option_name));
5742                                 so->so_options |= option_name;
5743                         } else {
5744                                 dprintso(so, 1,
5745                                     ("socket_setsockopt: clearing 0x%x\n",
5746                                     option_name));
5747                                 so->so_options &= ~option_name;
5748                         }
5749                         break;
5750                 /*
5751                  * The following options are only returned by us when the
5752                  * transport layer fails.
5753                  * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5754                  * since the transport might adjust the value and not
5755                  * return exactly what was set by the application.
5756                  */
5757                 case SO_SNDBUF:
5758                         so->so_sndbuf = intvalue;
5759                         break;
5760                 case SO_RCVBUF:
5761                         so->so_rcvbuf = intvalue;
5762                         break;
5763                 case SO_RCVPSH:
5764                         so->so_rcv_timer_interval = intvalue;
5765                         break;
5766 #ifdef notyet
5767                 /*
5768                  * We do not implement the semantics of these options
5769                  * thus we shouldn't implement the options either.
5770                  */
5771                 case SO_SNDLOWAT:
5772                         so->so_sndlowat = intvalue;
5773                         break;
5774                 case SO_RCVLOWAT:
5775                         so->so_rcvlowat = intvalue;
5776                         break;
5777 #endif /* notyet */
5778                 case SO_SNDTIMEO:
5779                 case SO_RCVTIMEO: {
5780                         struct timeval tl;
5781                         clock_t val;
5782 
5783                         if (get_udatamodel() == DATAMODEL_NONE ||
5784                             get_udatamodel() == DATAMODEL_NATIVE)
5785                                 bcopy(&tl, (struct timeval *)optval,
5786                                     sizeof (struct timeval));
5787                         else
5788                                 TIMEVAL32_TO_TIMEVAL(&tl,
5789                                     (struct timeval32 *)optval);
5790                         val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5791                         if (option_name == SO_RCVTIMEO)
5792                                 so->so_rcvtimeo = drv_usectohz(val);
5793                         else
5794                                 so->so_sndtimeo = drv_usectohz(val);
5795                         break;
5796                 }
5797                 }
5798 #undef  intvalue
5799 
5800                 if (error) {
5801                         if ((error == ENOPROTOOPT || error == EPROTO ||
5802                             error == EINVAL) && handled) {
5803                                 dprintso(so, 1,
5804                                     ("setsockopt: ignoring error %d for 0x%x\n",
5805                                     error, option_name));
5806                                 error = 0;
5807                         }
5808                 }
5809         }
5810 done2:
5811         so_unlock_single(so, SOLOCKED);
5812         mutex_exit(&so->so_lock);
5813         return (error);
5814 }
5815 
5816 /*
5817  * sotpi_close() is called when the last open reference goes away.
5818  */
5819 /* ARGSUSED */
5820 int
5821 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5822 {
5823         struct vnode *vp = SOTOV(so);
5824         dev_t dev;
5825         int error = 0;
5826         sotpi_info_t *sti = SOTOTPI(so);
5827 
5828         dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5829             (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5830 
5831         dev = sti->sti_dev;
5832 
5833         ASSERT(STREAMSTAB(getmajor(dev)));
5834 
5835         mutex_enter(&so->so_lock);
5836         so_lock_single(so);     /* Set SOLOCKED */
5837 
5838         ASSERT(so_verify_oobstate(so));
5839 
5840         if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5841                 sti->sti_nl7c_flags = 0;
5842                 nl7c_close(so);
5843         }
5844 
5845         if (vp->v_stream != NULL) {
5846                 vnode_t *ux_vp;
5847 
5848                 if (so->so_family == AF_UNIX) {
5849                         /* Could avoid this when CANTSENDMORE for !dgram */
5850                         so_unix_close(so);
5851                 }
5852 
5853                 mutex_exit(&so->so_lock);
5854                 /*
5855                  * Disassemble the linkage from the AF_UNIX underlying file
5856                  * system vnode to this socket (by atomically clearing
5857                  * v_stream in vn_rele_stream) before strclose clears sd_vnode
5858                  * and frees the stream head.
5859                  */
5860                 if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5861                         ASSERT(ux_vp->v_stream);
5862                         sti->sti_ux_bound_vp = NULL;
5863                         vn_rele_stream(ux_vp);
5864                 }
5865                 error = strclose(vp, flag, cr);
5866                 vp->v_stream = NULL;
5867                 mutex_enter(&so->so_lock);
5868         }
5869 
5870         /*
5871          * Flush the T_DISCON_IND on sti_discon_ind_mp.
5872          */
5873         so_flush_discon_ind(so);
5874 
5875         so_unlock_single(so, SOLOCKED);
5876         mutex_exit(&so->so_lock);
5877 
5878         /*
5879          * Needed for STREAMs.
5880          * Decrement the device driver's reference count for streams
5881          * opened via the clone dip. The driver was held in clone_open().
5882          * The absence of clone_close() forces this asymmetry.
5883          */
5884         if (so->so_flag & SOCLONE)
5885                 ddi_rele_driver(getmajor(dev));
5886 
5887         return (error);
5888 }
5889 
5890 static int
5891 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5892     struct cred *cr, int32_t *rvalp)
5893 {
5894         struct vnode *vp = SOTOV(so);
5895         sotpi_info_t *sti = SOTOTPI(so);
5896         int error = 0;
5897 
5898         dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5899             cmd, arg, pr_state(so->so_state, so->so_mode)));
5900 
5901         switch (cmd) {
5902         case SIOCSQPTR:
5903                 /*
5904                  * SIOCSQPTR is valid only when helper stream is created
5905                  * by the protocol.
5906                  */
5907         case _I_INSERT:
5908         case _I_REMOVE:
5909                 /*
5910                  * Since there's no compelling reason to support these ioctls
5911                  * on sockets, and doing so would increase the complexity
5912                  * markedly, prevent it.
5913                  */
5914                 return (EOPNOTSUPP);
5915 
5916         case I_FIND:
5917         case I_LIST:
5918         case I_LOOK:
5919         case I_POP:
5920         case I_PUSH:
5921                 /*
5922                  * To prevent races and inconsistencies between the actual
5923                  * state of the stream and the state according to the sonode,
5924                  * we serialize all operations which modify or operate on the
5925                  * list of modules on the socket's stream.
5926                  */
5927                 mutex_enter(&sti->sti_plumb_lock);
5928                 error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5929                 mutex_exit(&sti->sti_plumb_lock);
5930                 return (error);
5931 
5932         default:
5933                 if (so->so_version != SOV_STREAM)
5934                         break;
5935 
5936                 /*
5937                  * The imaginary "sockmod" has been popped; act as a stream.
5938                  */
5939                 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5940         }
5941 
5942         ASSERT(so->so_version != SOV_STREAM);
5943 
5944         /*
5945          * Process socket-specific ioctls.
5946          */
5947         switch (cmd) {
5948         case FIONBIO: {
5949                 int32_t value;
5950 
5951                 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5952                     (mode & (int)FKIOCTL)))
5953                         return (EFAULT);
5954 
5955                 mutex_enter(&so->so_lock);
5956                 if (value) {
5957                         so->so_state |= SS_NDELAY;
5958                 } else {
5959                         so->so_state &= ~SS_NDELAY;
5960                 }
5961                 mutex_exit(&so->so_lock);
5962                 return (0);
5963         }
5964 
5965         case FIOASYNC: {
5966                 int32_t value;
5967 
5968                 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5969                     (mode & (int)FKIOCTL)))
5970                         return (EFAULT);
5971 
5972                 mutex_enter(&so->so_lock);
5973                 /*
5974                  * SS_ASYNC flag not already set correctly?
5975                  * (!value != !(so->so_state & SS_ASYNC))
5976                  * but some engineers find that too hard to read.
5977                  */
5978                 if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5979                     value != 0 && (so->so_state & SS_ASYNC) == 0)
5980                         error = so_flip_async(so, vp, mode, cr);
5981                 mutex_exit(&so->so_lock);
5982                 return (error);
5983         }
5984 
5985         case SIOCSPGRP:
5986         case FIOSETOWN: {
5987                 pid_t pgrp;
5988 
5989                 if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5990                     (mode & (int)FKIOCTL)))
5991                         return (EFAULT);
5992 
5993                 mutex_enter(&so->so_lock);
5994                 dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5995                 /* Any change? */
5996                 if (pgrp != so->so_pgrp)
5997                         error = so_set_siggrp(so, vp, pgrp, mode, cr);
5998                 mutex_exit(&so->so_lock);
5999                 return (error);
6000         }
6001         case SIOCGPGRP:
6002         case FIOGETOWN:
6003                 if (so_copyout(&so->so_pgrp, (void *)arg,
6004                     sizeof (pid_t), (mode & (int)FKIOCTL)))
6005                         return (EFAULT);
6006                 return (0);
6007 
6008         case SIOCATMARK: {
6009                 int retval;
6010                 uint_t so_state;
6011 
6012                 /*
6013                  * strwaitmark has a finite timeout after which it
6014                  * returns -1 if the mark state is undetermined.
6015                  * In order to avoid any race between the mark state
6016                  * in sockfs and the mark state in the stream head this
6017                  * routine loops until the mark state can be determined
6018                  * (or the urgent data indication has been removed by some
6019                  * other thread).
6020                  */
6021                 do {
6022                         mutex_enter(&so->so_lock);
6023                         so_state = so->so_state;
6024                         mutex_exit(&so->so_lock);
6025                         if (so_state & SS_RCVATMARK) {
6026                                 retval = 1;
6027                         } else if (!(so_state & SS_OOBPEND)) {
6028                                 /*
6029                                  * No SIGURG has been generated -- there is no
6030                                  * pending or present urgent data. Thus can't
6031                                  * possibly be at the mark.
6032                                  */
6033                                 retval = 0;
6034                         } else {
6035                                 /*
6036                                  * Have the stream head wait until there is
6037                                  * either some messages on the read queue, or
6038                                  * STRATMARK or STRNOTATMARK gets set. The
6039                                  * STRNOTATMARK flag is used so that the
6040                                  * transport can send up a MSGNOTMARKNEXT
6041                                  * M_DATA to indicate that it is not
6042                                  * at the mark and additional data is not about
6043                                  * to be send upstream.
6044                                  *
6045                                  * If the mark state is undetermined this will
6046                                  * return -1 and we will loop rechecking the
6047                                  * socket state.
6048                                  */
6049                                 retval = strwaitmark(vp);
6050                         }
6051                 } while (retval == -1);
6052 
6053                 if (so_copyout(&retval, (void *)arg, sizeof (int),
6054                     (mode & (int)FKIOCTL)))
6055                         return (EFAULT);
6056                 return (0);
6057         }
6058 
6059         case I_FDINSERT:
6060         case I_SENDFD:
6061         case I_RECVFD:
6062         case I_ATMARK:
6063         case _SIOCSOCKFALLBACK:
6064                 /*
6065                  * These ioctls do not apply to sockets. I_FDINSERT can be
6066                  * used to send M_PROTO messages without modifying the socket
6067                  * state. I_SENDFD/RECVFD should not be used for socket file
6068                  * descriptor passing since they assume a twisted stream.
6069                  * SIOCATMARK must be used instead of I_ATMARK.
6070                  *
6071                  * _SIOCSOCKFALLBACK from an application should never be
6072                  * processed.  It is only generated by socktpi_open() or
6073                  * in response to I_POP or I_PUSH.
6074                  */
6075 #ifdef DEBUG
6076                 zcmn_err(getzoneid(), CE_WARN,
6077                     "Unsupported STREAMS ioctl 0x%x on socket. "
6078                     "Pid = %d\n", cmd, curproc->p_pid);
6079 #endif /* DEBUG */
6080                 return (EOPNOTSUPP);
6081 
6082         case _I_GETPEERCRED:
6083                 if ((mode & FKIOCTL) == 0)
6084                         return (EINVAL);
6085 
6086                 mutex_enter(&so->so_lock);
6087                 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6088                         error = ENOTSUP;
6089                 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
6090                         error = ENOTCONN;
6091                 } else if (so->so_peercred != NULL) {
6092                         k_peercred_t *kp = (k_peercred_t *)arg;
6093                         kp->pc_cr = so->so_peercred;
6094                         kp->pc_cpid = so->so_cpid;
6095                         crhold(so->so_peercred);
6096                 } else {
6097                         error = EINVAL;
6098                 }
6099                 mutex_exit(&so->so_lock);
6100                 return (error);
6101 
6102         default:
6103                 /*
6104                  * Do the higher-order bits of the ioctl cmd indicate
6105                  * that it is an I_* streams ioctl?
6106                  */
6107                 if ((cmd & 0xffffff00U) == STR &&
6108                     so->so_version == SOV_SOCKBSD) {
6109 #ifdef DEBUG
6110                         zcmn_err(getzoneid(), CE_WARN,
6111                             "Unsupported STREAMS ioctl 0x%x on socket. "
6112                             "Pid = %d\n", cmd, curproc->p_pid);
6113 #endif /* DEBUG */
6114                         return (EOPNOTSUPP);
6115                 }
6116                 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6117         }
6118 }
6119 
6120 /*
6121  * Handle plumbing-related ioctls.
6122  */
6123 static int
6124 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6125     struct cred *cr, int32_t *rvalp)
6126 {
6127         static const char sockmod_name[] = "sockmod";
6128         struct sonode   *so = VTOSO(vp);
6129         char            mname[FMNAMESZ + 1];
6130         int             error;
6131         sotpi_info_t    *sti = SOTOTPI(so);
6132 
6133         ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6134 
6135         if (so->so_version == SOV_SOCKBSD)
6136                 return (EOPNOTSUPP);
6137 
6138         if (so->so_version == SOV_STREAM) {
6139                 /*
6140                  * The imaginary "sockmod" has been popped - act as a stream.
6141                  * If this is a push of sockmod then change back to a socket.
6142                  */
6143                 if (cmd == I_PUSH) {
6144                         error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6145                             (void *)arg, mname, sizeof (mname), NULL);
6146 
6147                         if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6148                                 dprintso(so, 0, ("socktpi_ioctl: going to "
6149                                     "socket version\n"));
6150                                 so_stream2sock(so);
6151                                 return (0);
6152                         }
6153                 }
6154                 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6155         }
6156 
6157         switch (cmd) {
6158         case I_PUSH:
6159                 if (sti->sti_direct) {
6160                         mutex_enter(&so->so_lock);
6161                         so_lock_single(so);
6162                         mutex_exit(&so->so_lock);
6163 
6164                         error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6165                             cr, rvalp);
6166 
6167                         mutex_enter(&so->so_lock);
6168                         if (error == 0)
6169                                 sti->sti_direct = 0;
6170                         so_unlock_single(so, SOLOCKED);
6171                         mutex_exit(&so->so_lock);
6172 
6173                         if (error != 0)
6174                                 return (error);
6175                 }
6176 
6177                 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6178                 if (error == 0)
6179                         sti->sti_pushcnt++;
6180                 return (error);
6181 
6182         case I_POP:
6183                 if (sti->sti_pushcnt == 0) {
6184                         /* Emulate sockmod being popped */
6185                         dprintso(so, 0,
6186                             ("socktpi_ioctl: going to STREAMS version\n"));
6187                         return (so_sock2stream(so));
6188                 }
6189 
6190                 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6191                 if (error == 0)
6192                         sti->sti_pushcnt--;
6193                 return (error);
6194 
6195         case I_LIST: {
6196                 struct str_mlist *kmlistp, *umlistp;
6197                 struct str_list kstrlist;
6198                 ssize_t         kstrlistsize;
6199                 int             i, nmods;
6200 
6201                 STRUCT_DECL(str_list, ustrlist);
6202                 STRUCT_INIT(ustrlist, mode);
6203 
6204                 if (arg == 0) {
6205                         error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6206                         if (error == 0)
6207                                 (*rvalp)++;     /* Add one for sockmod */
6208                         return (error);
6209                 }
6210 
6211                 error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6212                     STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6213                 if (error != 0)
6214                         return (error);
6215 
6216                 nmods = STRUCT_FGET(ustrlist, sl_nmods);
6217                 if (nmods <= 0)
6218                         return (EINVAL);
6219                 /*
6220                  * Ceiling nmods at nstrpush to prevent someone from
6221                  * maliciously consuming lots of kernel memory.
6222                  */
6223                 nmods = MIN(nmods, nstrpush);
6224 
6225                 kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6226                 kstrlist.sl_nmods = nmods;
6227                 kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6228 
6229                 error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6230                     cr, rvalp);
6231                 if (error != 0)
6232                         goto done;
6233 
6234                 /*
6235                  * Considering the module list as a 0-based array of sl_nmods
6236                  * modules, sockmod should conceptually exist at slot
6237                  * sti_pushcnt.  Insert sockmod at this location by sliding all
6238                  * of the module names after so_pushcnt over by one.  We know
6239                  * that there will be room to do this since we allocated
6240                  * sl_modlist with an additional slot.
6241                  */
6242                 for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6243                         kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6244 
6245                 (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6246                 kstrlist.sl_nmods++;
6247 
6248                 /*
6249                  * Copy all of the entries out to ustrlist.
6250                  */
6251                 kmlistp = kstrlist.sl_modlist;
6252                 umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6253                 for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6254                         error = so_copyout(kmlistp++, umlistp++,
6255                             sizeof (struct str_mlist), mode & FKIOCTL);
6256                         if (error != 0)
6257                                 goto done;
6258                 }
6259 
6260                 error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6261                     mode & FKIOCTL);
6262                 if (error == 0)
6263                         *rvalp = 0;
6264         done:
6265                 kmem_free(kstrlist.sl_modlist, kstrlistsize);
6266                 return (error);
6267         }
6268         case I_LOOK:
6269                 if (sti->sti_pushcnt == 0) {
6270                         return (so_copyout(sockmod_name, (void *)arg,
6271                             sizeof (sockmod_name), mode & FKIOCTL));
6272                 }
6273                 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6274 
6275         case I_FIND:
6276                 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6277                 if (error && error != EINVAL)
6278                         return (error);
6279 
6280                 /* if not found and string was sockmod return 1 */
6281                 if (*rvalp == 0 || error == EINVAL) {
6282                         error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6283                             (void *)arg, mname, sizeof (mname), NULL);
6284                         if (error == ENAMETOOLONG)
6285                                 error = EINVAL;
6286 
6287                         if (error == 0 && strcmp(mname, sockmod_name) == 0)
6288                                 *rvalp = 1;
6289                 }
6290                 return (error);
6291 
6292         default:
6293                 panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6294                 break;
6295         }
6296 
6297         return (0);
6298 }
6299 
6300 /*
6301  * Wrapper around the streams poll routine that implements socket poll
6302  * semantics.
6303  * The sockfs never calls pollwakeup itself - the stream head take care
6304  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6305  * stream head there can never be a deadlock due to holding so_lock across
6306  * pollwakeup and acquiring so_lock in this routine.
6307  *
6308  * However, since the performance of VOP_POLL is critical we avoid
6309  * acquiring so_lock here. This is based on two assumptions:
6310  *  - The poll implementation holds locks to serialize the VOP_POLL call
6311  *    and a pollwakeup for the same pollhead. This ensures that should
6312  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6313  *    (which strsock_* and strrput conspire to issue) is issued after
6314  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6315  *    returned and then wake up poll and have it call VOP_POLL again.
6316  *  - The reading of so_state without holding so_lock does not result in
6317  *    stale data that is older than the latest state change that has dropped
6318  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6319  *    memory barrier to force the data into the coherency domain.
6320  */
6321 static int
6322 sotpi_poll(
6323         struct sonode   *so,
6324         short           events,
6325         int             anyyet,
6326         short           *reventsp,
6327         struct pollhead **phpp)
6328 {
6329         short origevents = events;
6330         struct vnode *vp = SOTOV(so);
6331         int error;
6332         int so_state = so->so_state; /* snapshot */
6333         sotpi_info_t *sti = SOTOTPI(so);
6334 
6335         dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6336             (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6337 
6338         ASSERT(vp->v_type == VSOCK);
6339         ASSERT(vp->v_stream != NULL);
6340 
6341         if (so->so_version == SOV_STREAM) {
6342                 /* The imaginary "sockmod" has been popped - act as a stream */
6343                 return (strpoll(vp->v_stream, events, anyyet,
6344                     reventsp, phpp));
6345         }
6346 
6347         if (!(so_state & SS_ISCONNECTED) &&
6348             (so->so_mode & SM_CONNREQUIRED)) {
6349                 /* Not connected yet - turn off write side events */
6350                 events &= ~(POLLOUT|POLLWRBAND);
6351         }
6352         /*
6353          * Check for errors without calling strpoll if the caller wants them.
6354          * In sockets the errors are represented as input/output events
6355          * and there is no need to ask the stream head for this information.
6356          */
6357         if (so->so_error != 0 &&
6358             ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6359                 *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6360                 return (0);
6361         }
6362         /*
6363          * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6364          * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6365          * will not trigger a POLLIN event with POLLRDDATA set.
6366          * The handling of urgent data (causing POLLRDBAND) is done by
6367          * inspecting SS_OOBPEND below.
6368          */
6369         events |= POLLRDDATA;
6370 
6371         /*
6372          * After shutdown(output) a stream head write error is set.
6373          * However, we should not return output events.
6374          */
6375         events |= POLLNOERR;
6376         error = strpoll(vp->v_stream, events, anyyet,
6377             reventsp, phpp);
6378         if (error)
6379                 return (error);
6380 
6381         ASSERT(!(*reventsp & POLLERR));
6382 
6383         /*
6384          * Notes on T_CONN_IND handling for sockets.
6385          *
6386          * If strpoll() returned without events, SR_POLLIN is guaranteed
6387          * to be set, ensuring any subsequent strrput() runs pollwakeup().
6388          *
6389          * Since the so_lock is not held, soqueueconnind() may have run
6390          * and a T_CONN_IND may be waiting. We now check for any queued
6391          * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6392          * to ensure poll returns.
6393          *
6394          * However:
6395          * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6396          * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6397          * the following actions will occur; taken together they ensure the
6398          * syscall will return.
6399          *
6400          * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6401          *    the accept() was run on a non-blocking socket sowaitconnind()
6402          *    may have already returned EWOULDBLOCK, so not be waiting to
6403          *    process the message. Additionally socktpi_poll() has probably
6404          *    proceeded past the sti_conn_ind_head check below.
6405          * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6406          *    this thread,  however that could occur before poll_common()
6407          *    has entered cv_wait.
6408          * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6409          *
6410          * Before proceeding to cv_wait() in poll_common() for an event,
6411          * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6412          * and if set, re-calls strpoll() to ensure the late arriving
6413          * T_CONN_IND is recognized, and pollsys() returns.
6414          */
6415 
6416         if (sti->sti_conn_ind_head != NULL)
6417                 *reventsp |= (POLLIN|POLLRDNORM) & events;
6418 
6419         if (so->so_state & SS_CANTRCVMORE) {
6420                 *reventsp |= POLLRDHUP & events;
6421 
6422                 if (so->so_state & SS_CANTSENDMORE)
6423                         *reventsp |= POLLHUP;
6424         }
6425 
6426         if (so->so_state & SS_OOBPEND)
6427                 *reventsp |= POLLRDBAND & events;
6428 
6429         if (sti->sti_nl7c_rcv_mp != NULL) {
6430                 *reventsp |= (POLLIN|POLLRDNORM) & events;
6431         }
6432         if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6433             ((POLLIN|POLLRDNORM) & *reventsp)) {
6434                 sti->sti_nl7c_flags |= NL7C_POLLIN;
6435         }
6436 
6437         return (0);
6438 }
6439 
6440 /*ARGSUSED*/
6441 static int
6442 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6443 {
6444         sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6445         int error = 0;
6446 
6447         error = sonode_constructor(buf, cdrarg, kmflags);
6448         if (error != 0)
6449                 return (error);
6450 
6451         error = i_sotpi_info_constructor(&st->st_info);
6452         if (error != 0)
6453                 sonode_destructor(buf, cdrarg);
6454 
6455         st->st_sonode.so_priv = &st->st_info;
6456 
6457         return (error);
6458 }
6459 
6460 /*ARGSUSED1*/
6461 static void
6462 socktpi_destructor(void *buf, void *cdrarg)
6463 {
6464         sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6465 
6466         ASSERT(st->st_sonode.so_priv == &st->st_info);
6467         st->st_sonode.so_priv = NULL;
6468 
6469         i_sotpi_info_destructor(&st->st_info);
6470         sonode_destructor(buf, cdrarg);
6471 }
6472 
6473 static int
6474 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6475 {
6476         int retval;
6477 
6478         if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6479                 struct sonode *so = (struct sonode *)buf;
6480                 sotpi_info_t *sti = SOTOTPI(so);
6481 
6482                 mutex_enter(&socklist.sl_lock);
6483 
6484                 sti->sti_next_so = socklist.sl_list;
6485                 sti->sti_prev_so = NULL;
6486                 if (sti->sti_next_so != NULL)
6487                         SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6488                 socklist.sl_list = so;
6489 
6490                 mutex_exit(&socklist.sl_lock);
6491 
6492         }
6493         return (retval);
6494 }
6495 
6496 static void
6497 socktpi_unix_destructor(void *buf, void *cdrarg)
6498 {
6499         struct sonode   *so = (struct sonode *)buf;
6500         sotpi_info_t    *sti = SOTOTPI(so);
6501 
6502         mutex_enter(&socklist.sl_lock);
6503 
6504         if (sti->sti_next_so != NULL)
6505                 SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6506         if (sti->sti_prev_so != NULL)
6507                 SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6508         else
6509                 socklist.sl_list = sti->sti_next_so;
6510 
6511         mutex_exit(&socklist.sl_lock);
6512 
6513         socktpi_destructor(buf, cdrarg);
6514 }
6515 
6516 int
6517 socktpi_init(void)
6518 {
6519         /*
6520          * Create sonode caches.  We create a special one for AF_UNIX so
6521          * that we can track them for netstat(1m).
6522          */
6523         socktpi_cache = kmem_cache_create("socktpi_cache",
6524             sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6525             socktpi_destructor, NULL, NULL, NULL, 0);
6526 
6527         socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6528             sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6529             socktpi_unix_destructor, NULL, NULL, NULL, 0);
6530 
6531         return (0);
6532 }
6533 
6534 /*
6535  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6536  *
6537  * Caller must still update state and mode using sotpi_update_state().
6538  */
6539 int
6540 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6541     boolean_t *direct, queue_t **qp, struct cred *cr)
6542 {
6543         sotpi_info_t *sti;
6544         struct sockparams *origsp = so->so_sockparams;
6545         sock_lower_handle_t handle = so->so_proto_handle;
6546         struct stdata *stp;
6547         struct vnode *vp;
6548         queue_t *q;
6549         int error = 0;
6550 
6551         ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6552             SS_FALLBACK_PENDING);
6553         ASSERT(SOCK_IS_NONSTR(so));
6554 
6555         *qp = NULL;
6556         *direct = B_FALSE;
6557         so->so_sockparams = newsp;
6558         /*
6559          * Allocate and initalize fields required by TPI.
6560          */
6561         (void) sotpi_info_create(so, KM_SLEEP);
6562         sotpi_info_init(so);
6563 
6564         if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6565                 sotpi_info_fini(so);
6566                 sotpi_info_destroy(so);
6567                 return (error);
6568         }
6569         ASSERT(handle == so->so_proto_handle);
6570         sti = SOTOTPI(so);
6571         if (sti->sti_direct != 0)
6572                 *direct = B_TRUE;
6573 
6574         /*
6575          * Keep the original sp around so we can properly dispose of the
6576          * sonode when the socket is being closed.
6577          */
6578         sti->sti_orig_sp = origsp;
6579 
6580         so_basic_strinit(so);   /* skips the T_CAPABILITY_REQ */
6581         so_alloc_addr(so, so->so_max_addr_len);
6582 
6583         /*
6584          * If the application has done a SIOCSPGRP, make sure the
6585          * STREAM head is aware. This needs to take place before
6586          * the protocol start sending up messages. Otherwise we
6587          * might miss to generate SIGPOLL.
6588          *
6589          * It is possible that the application will receive duplicate
6590          * signals if some were already generated for either data or
6591          * connection indications.
6592          */
6593         if (so->so_pgrp != 0) {
6594                 if (so_set_events(so, so->so_vnode, cr) != 0)
6595                         so->so_pgrp = 0;
6596         }
6597 
6598         /*
6599          * Determine which queue to use.
6600          */
6601         vp = SOTOV(so);
6602         stp = vp->v_stream;
6603         ASSERT(stp != NULL);
6604         q = stp->sd_wrq->q_next;
6605 
6606         /*
6607          * Skip any modules that may have been auto pushed when the device
6608          * was opened
6609          */
6610         while (q->q_next != NULL)
6611                 q = q->q_next;
6612         *qp = _RD(q);
6613 
6614         /* This is now a STREAMS sockets */
6615         so->so_not_str = B_FALSE;
6616 
6617         return (error);
6618 }
6619 
6620 /*
6621  * Revert a TPI sonode. It is only allowed to revert the sonode during
6622  * the fallback process.
6623  */
6624 void
6625 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6626 {
6627         vnode_t *vp = SOTOV(so);
6628 
6629         ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6630             SS_FALLBACK_PENDING);
6631         ASSERT(!SOCK_IS_NONSTR(so));
6632         ASSERT(vp->v_stream != NULL);
6633 
6634         strclean(vp);
6635         (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6636 
6637         /*
6638          * Restore the original sockparams. The caller is responsible for
6639          * dropping the ref to the new sp.
6640          */
6641         so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6642 
6643         sotpi_info_fini(so);
6644         sotpi_info_destroy(so);
6645 
6646         /* This is no longer a STREAMS sockets */
6647         so->so_not_str = B_TRUE;
6648 }
6649 
6650 void
6651 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6652     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6653     socklen_t faddrlen, short opts)
6654 {
6655         sotpi_info_t *sti = SOTOTPI(so);
6656 
6657         so_proc_tcapability_ack(so, tcap);
6658 
6659         so->so_options |= opts;
6660 
6661         /*
6662          * Determine whether the foreign and local address are valid
6663          */
6664         if (laddrlen != 0) {
6665                 ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6666                 sti->sti_laddr_len = laddrlen;
6667                 bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6668                 sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6669         }
6670 
6671         if (faddrlen != 0) {
6672                 ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6673                 sti->sti_faddr_len = faddrlen;
6674                 bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6675                 sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6676         }
6677 
6678 }
6679 
6680 /*
6681  * Allocate enough space to cache the local and foreign addresses.
6682  */
6683 void
6684 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6685 {
6686         sotpi_info_t *sti = SOTOTPI(so);
6687 
6688         ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6689         ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6690         sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6691             P2ROUNDUP(maxlen, KMEM_ALIGN);
6692         so->so_max_addr_len = sti->sti_laddr_maxlen;
6693         sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6694         sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6695             + sti->sti_laddr_maxlen);
6696 
6697         if (so->so_family == AF_UNIX) {
6698                 /*
6699                  * Initialize AF_UNIX related fields.
6700                  */
6701                 bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6702                 bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6703         }
6704 }
6705 
6706 
6707 sotpi_info_t *
6708 sotpi_sototpi(struct sonode *so)
6709 {
6710         sotpi_info_t *sti;
6711 
6712         ASSERT(so != NULL);
6713 
6714         sti = (sotpi_info_t *)so->so_priv;
6715 
6716         ASSERT(sti != NULL);
6717         ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6718 
6719         return (sti);
6720 }
6721 
6722 static int
6723 i_sotpi_info_constructor(sotpi_info_t *sti)
6724 {
6725         sti->sti_magic               = SOTPI_INFO_MAGIC;
6726         sti->sti_ack_mp              = NULL;
6727         sti->sti_discon_ind_mp       = NULL;
6728         sti->sti_ux_bound_vp = NULL;
6729         sti->sti_unbind_mp   = NULL;
6730 
6731         sti->sti_conn_ind_head       = NULL;
6732         sti->sti_conn_ind_tail       = NULL;
6733 
6734         sti->sti_laddr_sa    = NULL;
6735         sti->sti_faddr_sa    = NULL;
6736 
6737         sti->sti_nl7c_flags  = 0;
6738         sti->sti_nl7c_uri    = NULL;
6739         sti->sti_nl7c_rcv_mp = NULL;
6740 
6741         mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6742         cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6743 
6744         return (0);
6745 }
6746 
6747 static void
6748 i_sotpi_info_destructor(sotpi_info_t *sti)
6749 {
6750         ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6751         ASSERT(sti->sti_ack_mp == NULL);
6752         ASSERT(sti->sti_discon_ind_mp == NULL);
6753         ASSERT(sti->sti_ux_bound_vp == NULL);
6754         ASSERT(sti->sti_unbind_mp == NULL);
6755 
6756         ASSERT(sti->sti_conn_ind_head == NULL);
6757         ASSERT(sti->sti_conn_ind_tail == NULL);
6758 
6759         ASSERT(sti->sti_laddr_sa == NULL);
6760         ASSERT(sti->sti_faddr_sa == NULL);
6761 
6762         ASSERT(sti->sti_nl7c_flags == 0);
6763         ASSERT(sti->sti_nl7c_uri == NULL);
6764         ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6765 
6766         mutex_destroy(&sti->sti_plumb_lock);
6767         cv_destroy(&sti->sti_ack_cv);
6768 }
6769 
6770 /*
6771  * Creates and attaches TPI information to the given sonode
6772  */
6773 static boolean_t
6774 sotpi_info_create(struct sonode *so, int kmflags)
6775 {
6776         sotpi_info_t *sti;
6777 
6778         ASSERT(so->so_priv == NULL);
6779 
6780         if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6781                 return (B_FALSE);
6782 
6783         if (i_sotpi_info_constructor(sti) != 0) {
6784                 kmem_free(sti, sizeof (*sti));
6785                 return (B_FALSE);
6786         }
6787 
6788         so->so_priv = (void *)sti;
6789         return (B_TRUE);
6790 }
6791 
6792 /*
6793  * Initializes the TPI information.
6794  */
6795 static void
6796 sotpi_info_init(struct sonode *so)
6797 {
6798         struct vnode *vp = SOTOV(so);
6799         sotpi_info_t *sti = SOTOTPI(so);
6800         time_t now;
6801 
6802         sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6803         vp->v_rdev   = sti->sti_dev;
6804 
6805         sti->sti_orig_sp = NULL;
6806 
6807         sti->sti_pushcnt = 0;
6808 
6809         now = gethrestime_sec();
6810         sti->sti_atime       = now;
6811         sti->sti_mtime       = now;
6812         sti->sti_ctime       = now;
6813 
6814         sti->sti_eaddr_mp = NULL;
6815         sti->sti_delayed_error = 0;
6816 
6817         sti->sti_provinfo = NULL;
6818 
6819         sti->sti_oobcnt = 0;
6820         sti->sti_oobsigcnt = 0;
6821 
6822         ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6823 
6824         sti->sti_laddr_sa    = 0;
6825         sti->sti_faddr_sa    = 0;
6826         sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6827         sti->sti_laddr_len = sti->sti_faddr_len = 0;
6828 
6829         sti->sti_laddr_valid = 0;
6830         sti->sti_faddr_valid = 0;
6831         sti->sti_faddr_noxlate = 0;
6832 
6833         sti->sti_direct = 0;
6834 
6835         ASSERT(sti->sti_ack_mp == NULL);
6836         ASSERT(sti->sti_ux_bound_vp == NULL);
6837         ASSERT(sti->sti_unbind_mp == NULL);
6838 
6839         ASSERT(sti->sti_conn_ind_head == NULL);
6840         ASSERT(sti->sti_conn_ind_tail == NULL);
6841 }
6842 
6843 /*
6844  * Given a sonode, grab the TPI info and free any data.
6845  */
6846 static void
6847 sotpi_info_fini(struct sonode *so)
6848 {
6849         sotpi_info_t *sti = SOTOTPI(so);
6850         mblk_t *mp;
6851 
6852         ASSERT(sti->sti_discon_ind_mp == NULL);
6853 
6854         if ((mp = sti->sti_conn_ind_head) != NULL) {
6855                 mblk_t *mp1;
6856 
6857                 while (mp) {
6858                         mp1 = mp->b_next;
6859                         mp->b_next = NULL;
6860                         freemsg(mp);
6861                         mp = mp1;
6862                 }
6863                 sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6864         }
6865 
6866         /*
6867          * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6868          * indirect them.  It also uses so_count as a validity test.
6869          */
6870         mutex_enter(&so->so_lock);
6871 
6872         if (sti->sti_laddr_sa) {
6873                 ASSERT((caddr_t)sti->sti_faddr_sa ==
6874                     (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6875                 ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6876                 sti->sti_laddr_valid = 0;
6877                 sti->sti_faddr_valid = 0;
6878                 kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6879                 sti->sti_laddr_sa = NULL;
6880                 sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6881                 sti->sti_faddr_sa = NULL;
6882                 sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6883         }
6884 
6885         mutex_exit(&so->so_lock);
6886 
6887         if ((mp = sti->sti_eaddr_mp) != NULL) {
6888                 freemsg(mp);
6889                 sti->sti_eaddr_mp = NULL;
6890                 sti->sti_delayed_error = 0;
6891         }
6892 
6893         if ((mp = sti->sti_ack_mp) != NULL) {
6894                 freemsg(mp);
6895                 sti->sti_ack_mp = NULL;
6896         }
6897 
6898         if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6899                 sti->sti_nl7c_rcv_mp = NULL;
6900                 freemsg(mp);
6901         }
6902         sti->sti_nl7c_rcv_rval = 0;
6903         if (sti->sti_nl7c_uri != NULL) {
6904                 nl7c_urifree(so);
6905                 /* urifree() cleared nl7c_uri */
6906         }
6907         if (sti->sti_nl7c_flags) {
6908                 sti->sti_nl7c_flags = 0;
6909         }
6910 
6911         ASSERT(sti->sti_ux_bound_vp == NULL);
6912         if ((mp = sti->sti_unbind_mp) != NULL) {
6913                 freemsg(mp);
6914                 sti->sti_unbind_mp = NULL;
6915         }
6916 }
6917 
6918 /*
6919  * Destroys the TPI information attached to a sonode.
6920  */
6921 static void
6922 sotpi_info_destroy(struct sonode *so)
6923 {
6924         sotpi_info_t *sti = SOTOTPI(so);
6925 
6926         i_sotpi_info_destructor(sti);
6927         kmem_free(sti, sizeof (*sti));
6928 
6929         so->so_priv = NULL;
6930 }
6931 
6932 /*
6933  * Create the global sotpi socket module entry. It will never be freed.
6934  */
6935 smod_info_t *
6936 sotpi_smod_create(void)
6937 {
6938         smod_info_t *smodp;
6939 
6940         smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6941         smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6942         (void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6943         /*
6944          * Initialize the smod_refcnt to 1 so it will never be freed.
6945          */
6946         smodp->smod_refcnt = 1;
6947         smodp->smod_uc_version = SOCK_UC_VERSION;
6948         smodp->smod_dc_version = SOCK_DC_VERSION;
6949         smodp->smod_sock_create_func = &sotpi_create;
6950         smodp->smod_sock_destroy_func = &sotpi_destroy;
6951         return (smodp);
6952 }