1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent, Inc.
  25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26  * Copyright 2022 MNX Cloud, Inc.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/t_lock.h>
  31 #include <sys/param.h>
  32 #include <sys/systm.h>
  33 #include <sys/buf.h>
  34 #include <sys/conf.h>
  35 #include <sys/cred.h>
  36 #include <sys/kmem.h>
  37 #include <sys/kmem_impl.h>
  38 #include <sys/sysmacros.h>
  39 #include <sys/vfs.h>
  40 #include <sys/vnode.h>
  41 #include <sys/debug.h>
  42 #include <sys/errno.h>
  43 #include <sys/time.h>
  44 #include <sys/file.h>
  45 #include <sys/open.h>
  46 #include <sys/user.h>
  47 #include <sys/termios.h>
  48 #include <sys/stream.h>
  49 #include <sys/strsubr.h>
  50 #include <sys/strsun.h>
  51 #include <sys/suntpi.h>
  52 #include <sys/ddi.h>
  53 #include <sys/esunddi.h>
  54 #include <sys/flock.h>
  55 #include <sys/modctl.h>
  56 #include <sys/vtrace.h>
  57 #include <sys/cmn_err.h>
  58 #include <sys/pathname.h>
  59 
  60 #include <sys/socket.h>
  61 #include <sys/socketvar.h>
  62 #include <sys/sockio.h>
  63 #include <netinet/in.h>
  64 #include <sys/un.h>
  65 #include <sys/strsun.h>
  66 
  67 #include <sys/tiuser.h>
  68 #define _SUN_TPI_VERSION        2
  69 #include <sys/tihdr.h>
  70 #include <sys/timod.h>            /* TI_GETMYNAME, TI_GETPEERNAME */
  71 
  72 #include <c2/audit.h>
  73 
  74 #include <inet/common.h>
  75 #include <inet/ip.h>
  76 #include <inet/ip6.h>
  77 #include <inet/tcp.h>
  78 #include <inet/udp_impl.h>
  79 
  80 #include <sys/zone.h>
  81 
  82 #include <fs/sockfs/nl7c.h>
  83 #include <fs/sockfs/nl7curi.h>
  84 
  85 #include <fs/sockfs/sockcommon.h>
  86 #include <fs/sockfs/socktpi.h>
  87 #include <fs/sockfs/socktpi_impl.h>
  88 
  89 /*
  90  * Possible failures when memory can't be allocated. The documented behavior:
  91  *
  92  *              5.5:                    4.X:            XNET:
  93  * accept:      ENOMEM/ENOSR/EINTR      - (EINTR)       ENOMEM/ENOBUFS/ENOSR/
  94  *                                                      EINTR
  95  *      (4.X does not document EINTR but returns it)
  96  * bind:        ENOSR                   -               ENOBUFS/ENOSR
  97  * connect:     EINTR                   EINTR           ENOBUFS/ENOSR/EINTR
  98  * getpeername: ENOMEM/ENOSR            ENOBUFS (-)     ENOBUFS/ENOSR
  99  * getsockname: ENOMEM/ENOSR            ENOBUFS (-)     ENOBUFS/ENOSR
 100  *      (4.X getpeername and getsockname do not fail in practice)
 101  * getsockopt:  ENOMEM/ENOSR            -               ENOBUFS/ENOSR
 102  * listen:      -                       -               ENOBUFS
 103  * recv:        ENOMEM/ENOSR/EINTR      EINTR           ENOBUFS/ENOMEM/ENOSR/
 104  *                                                      EINTR
 105  * send:        ENOMEM/ENOSR/EINTR      ENOBUFS/EINTR   ENOBUFS/ENOMEM/ENOSR/
 106  *                                                      EINTR
 107  * setsockopt:  ENOMEM/ENOSR            -               ENOBUFS/ENOMEM/ENOSR
 108  * shutdown:    ENOMEM/ENOSR            -               ENOBUFS/ENOSR
 109  * socket:      ENOMEM/ENOSR            ENOBUFS         ENOBUFS/ENOMEM/ENOSR
 110  * socketpair:  ENOMEM/ENOSR            -               ENOBUFS/ENOMEM/ENOSR
 111  *
 112  * Resolution. When allocation fails:
 113  *      recv: return EINTR
 114  *      send: return EINTR
 115  *      connect, accept: EINTR
 116  *      bind, listen, shutdown (unbind, unix_close, disconnect): sleep
 117  *      socket, socketpair: ENOBUFS
 118  *      getpeername, getsockname: sleep
 119  *      getsockopt, setsockopt: sleep
 120  */
 121 
#ifdef SOCK_TEST
/*
 * Variables that make sockfs do something other than the standard TPI
 * for the AF_INET transports.
 *
 * In SOCK_TEST builds these are ordinary writable ints; in all other
 * builds (see the #else branch below) they are compile-time constants
 * with the same default values.
 *
 * solisten_tpi_tcp:
 *      TCP can handle a O_T_BIND_REQ with an increased backlog even though
 *      the transport is already bound. This is needed to avoid losing the
 *      port number should listen() do a T_UNBIND_REQ followed by a
 *      O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *      UDP and ICMP can handle a T_CONN_REQ.
 *      This is needed to make the sequence of connect(), getsockname()
 *      return the local IP address used to send packets to the connected to
 *      destination.
 *
 * soconnect_tpi_tcp:
 *      TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
 *      Set this to non-zero to send TPI conformant messages to TCP in this
 *      respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *      TCP can handle a T_CONN_REQ without the acceptor being bound.
 *      This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *      When inheriting SOL_SOCKET options from the listener to the accepting
 *      socket send them as a single message for AF_INET{,6}.
 */
int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;
#else /* SOCK_TEST */
/* Non-test builds: same defaults as above, fixed at compile time. */
#define soconnect_tpi_tcp       0
#define soconnect_tpi_udp       0
#define solisten_tpi_tcp        0
#define soaccept_tpi_tcp        0
#define soaccept_tpi_multioptions       1
#endif /* SOCK_TEST */
 164 
#ifdef SOCK_TEST
/* Test-only symbols defined outside this file; declared only for SOCK_TEST. */
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

/* Defined outside this file. */
extern uint32_t ucredsize;

/*
 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
 * applications working. Turn on this flag to disable these checks.
 *
 * NOTE(review): the sentence above presumably describes xnet_skip_checks;
 * xnet_check_print and xnet_truncate_print look like diagnostic-print
 * switches, but their consumers are not visible here -- confirm.
 * All three default to 0.
 */
int xnet_skip_checks = 0;
int xnet_check_print = 0;
int xnet_truncate_print = 0;
 179 
/* Allocation and teardown of TPI sonodes. */
static void sotpi_destroy(struct sonode *);
static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
    int, int *, cred_t *cr);

/* Lifecycle helpers for the per-socket sotpi_info_t state. */
static boolean_t        sotpi_info_create(struct sonode *, int);
static void             sotpi_info_init(struct sonode *);
static void             sotpi_info_fini(struct sonode *);
static void             sotpi_info_destroy(struct sonode *);

/*
 * Do direct function call to the transport layer below; this would
 * also allow the transport to utilize read-side synchronous stream
 * interface if necessary.  This is a /etc/system tunable that must
 * not be modified on a running system.  By default this is enabled
 * for performance reasons and may be disabled for debugging purposes.
 */
boolean_t socktpi_direct = B_TRUE;

/*
 * kmem caches that back TPI sonodes.  sotpi_create() and sotpi_destroy()
 * use socktpi_unix_cache for AF_UNIX sockets and socktpi_cache for every
 * other family.
 */
static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;

/* Signal helpers defined outside this file. */
extern  void sigintr(k_sigset_t *, int);
extern  void sigunintr(k_sigset_t *);

static int      sotpi_unbind(struct sonode *, int);

/*
 * TPI sockfs sonode operations.
 * These are the entry points collected in sotpi_sonodeops below; the ones
 * declared without "static" have external linkage.
 */
int             sotpi_init(struct sonode *, struct sonode *, struct cred *,
                    int);
static int      sotpi_accept(struct sonode *, int, struct cred *,
                    struct sonode **);
static int      sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
                    int, struct cred *);
static int      sotpi_listen(struct sonode *, int, struct cred *);
static int      sotpi_connect(struct sonode *, struct sockaddr *,
                    socklen_t, int, int, struct cred *);
extern int      sotpi_recvmsg(struct sonode *, struct nmsghdr *,
                    struct uio *, struct cred *);
static int      sotpi_sendmsg(struct sonode *, struct nmsghdr *,
                    struct uio *, struct cred *);
static int      sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
                    struct cred *, mblk_t **);
static int      sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
                    struct uio *, void *, t_uscalar_t, int);
static int      sodgram_direct(struct sonode *, struct sockaddr *,
                    socklen_t, struct uio *, int);
extern int      sotpi_getpeername(struct sonode *, struct sockaddr *,
                    socklen_t *, boolean_t, struct cred *);
static int      sotpi_getsockname(struct sonode *, struct sockaddr *,
                    socklen_t *, struct cred *);
static int      sotpi_shutdown(struct sonode *, int, struct cred *);
extern int      sotpi_getsockopt(struct sonode *, int, int, void *,
                    socklen_t *, int, struct cred *);
extern int      sotpi_setsockopt(struct sonode *, int, int, const void *,
                    socklen_t, struct cred *);
static int      sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
                    int32_t *);
static int      socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
                    struct cred *, int32_t *);
static int      sotpi_poll(struct sonode *, short, int, short *,
                    struct pollhead **);
static int      sotpi_close(struct sonode *, int, struct cred *);

/* Constructor/destructor pair operating directly on a sotpi_info_t. */
static int      i_sotpi_info_constructor(sotpi_info_t *);
static void     i_sotpi_info_destructor(sotpi_info_t *);
 244 
/*
 * Operations vector for TPI-backed sockets.
 *
 * sotpi_create() hands this table to sonode_init() for every sonode it
 * allocates, and sotpi_destroy() asserts that so_ops still points here.
 * The initialisers are positional; the trailing comment on each line names
 * the slot it is meant to fill.
 */
sonodeops_t sotpi_sonodeops = {
        sotpi_init,             /* sop_init             */
        sotpi_accept,           /* sop_accept           */
        sotpi_bind,             /* sop_bind             */
        sotpi_listen,           /* sop_listen           */
        sotpi_connect,          /* sop_connect          */
        sotpi_recvmsg,          /* sop_recvmsg          */
        sotpi_sendmsg,          /* sop_sendmsg          */
        sotpi_sendmblk,         /* sop_sendmblk         */
        sotpi_getpeername,      /* sop_getpeername      */
        sotpi_getsockname,      /* sop_getsockname      */
        sotpi_shutdown,         /* sop_shutdown         */
        sotpi_getsockopt,       /* sop_getsockopt       */
        sotpi_setsockopt,       /* sop_setsockopt       */
        sotpi_ioctl,            /* sop_ioctl            */
        sotpi_poll,             /* sop_poll             */
        sotpi_close,            /* sop_close            */
};
 263 
/*
 * Post-close reality check for NULL v_stream...
 *
 * Kernel callers (e.g. in procfs) may attempt socket operations, after
 * holding the vnode, after it has been closed.  For TPI sockets, post-close
 * operations will have a NULL v_stream (which all functions here assume
 * or even ASSERT() is non-NULL).  See sotpi_close for where we wipe it out.
 *
 * If we are in a state where we lost a race to close(), we need to stop ASAP,
 * and return the acceptable-as-an-errno EBADF.  Because cleanup may be
 * required, this macro only checks the v_stream.
 *
 * Checking should only be relevant for in-kernel other-thread inspectors.
 * Userland ones (i.e. same process that opened the socktpi socket) SHOULD be
 * protected by higher-level mechanisms. The only in-kernel inspector in the
 * source base is procfs, which only accesses get{sockname,peername,sockopt}().
 *
 * The macro takes a vnode pointer and evaluates to non-zero when that vnode
 * has no stream attached.  It takes no locks and returns no errno itself;
 * the caller decides how to fail.
 */
#define SOTPI_VN_NOSTREAM(vn) ((vn)->v_stream == NULL)
 282 
 283 /*
 284  * Return a TPI socket vnode.
 285  *
 286  * Note that sockets assume that the driver will clone (either itself
 287  * or by using the clone driver) i.e. a socket() call will always
 288  * result in a new vnode being created.
 289  */
 290 
 291 /*
 292  * Common create code for socket and accept. If tso is set the values
 293  * from that node is used instead of issuing a T_INFO_REQ.
 294  */
 295 
 296 /* ARGSUSED */
 297 static struct sonode *
 298 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
 299     int version, int sflags, int *errorp, cred_t *cr)
 300 {
 301         struct sonode   *so;
 302         kmem_cache_t    *cp;
 303         int             sfamily = family;
 304 
 305         ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
 306 
 307         if (family == AF_NCA) {
 308                 /*
 309                  * The request is for an NCA socket so for NL7C use the
 310                  * INET domain instead and mark NL7C_AF_NCA below.
 311                  */
 312                 family = AF_INET;
 313                 /*
 314                  * NL7C is not supported in the non-global zone,
 315                  * we enforce this restriction here.
 316                  */
 317                 if (getzoneid() != GLOBAL_ZONEID) {
 318                         *errorp = ENOTSUP;
 319                         return (NULL);
 320                 }
 321         }
 322 
 323         /*
 324          * to be compatible with old tpi socket implementation ignore
 325          * sleep flag (sflags) passed in
 326          */
 327         cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
 328         so = kmem_cache_alloc(cp, KM_SLEEP);
 329         if (so == NULL) {
 330                 *errorp = ENOMEM;
 331                 return (NULL);
 332         }
 333 
 334         sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
 335         sotpi_info_init(so);
 336 
 337         if (sfamily == AF_NCA) {
 338                 SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
 339         }
 340 
 341         if (version == SOV_DEFAULT)
 342                 version = so_default_version;
 343 
 344         so->so_version = (short)version;
 345         *errorp = 0;
 346 
 347         return (so);
 348 }
 349 
 350 static void
 351 sotpi_destroy(struct sonode *so)
 352 {
 353         kmem_cache_t *cp;
 354         struct sockparams *origsp;
 355 
 356         /*
 357          * If there is a new dealloc function (ie. smod_destroy_func),
 358          * then it should check the correctness of the ops.
 359          */
 360 
 361         ASSERT(so->so_ops == &sotpi_sonodeops);
 362 
 363         origsp = SOTOTPI(so)->sti_orig_sp;
 364 
 365         sotpi_info_fini(so);
 366 
 367         if (so->so_state & SS_FALLBACK_COMP) {
 368                 /*
 369                  * A fallback happend, which means that a sotpi_info_t struct
 370                  * was allocated (as opposed to being allocated from the TPI
 371                  * sonode cache. Therefore we explicitly free the struct
 372                  * here.
 373                  */
 374                 sotpi_info_destroy(so);
 375                 ASSERT(origsp != NULL);
 376 
 377                 origsp->sp_smod_info->smod_sock_destroy_func(so);
 378                 SOCKPARAMS_DEC_REF(origsp);
 379         } else {
 380                 sonode_fini(so);
 381                 cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
 382                     socktpi_cache;
 383                 kmem_cache_free(cp, so);
 384         }
 385 }
 386 
 387 /* ARGSUSED1 */
 388 int
 389 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
 390 {
 391         major_t maj;
 392         dev_t newdev;
 393         struct vnode *vp;
 394         int error = 0;
 395         struct stdata *stp;
 396 
 397         sotpi_info_t *sti = SOTOTPI(so);
 398 
 399         dprint(1, ("sotpi_init()\n"));
 400 
 401         /*
 402          * over write the sleep flag passed in but that is ok
 403          * as tpi socket does not honor sleep flag.
 404          */
 405         flags |= FREAD|FWRITE;
 406 
 407         /*
 408          * Record in so_flag that it is a clone.
 409          */
 410         if (getmajor(sti->sti_dev) == clone_major)
 411                 so->so_flag |= SOCLONE;
 412 
 413         if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
 414             (so->so_family == AF_INET || so->so_family == AF_INET6) &&
 415             (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
 416             so->so_protocol == IPPROTO_IP)) {
 417                 /* Tell tcp or udp that it's talking to sockets */
 418                 flags |= SO_SOCKSTR;
 419 
 420                 /*
 421                  * Here we indicate to socktpi_open() our attempt to
 422                  * make direct calls between sockfs and transport.
 423                  * The final decision is left to socktpi_open().
 424                  */
 425                 sti->sti_direct = 1;
 426 
 427                 ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
 428                 if (so->so_type == SOCK_STREAM && tso != NULL) {
 429                         if (SOTOTPI(tso)->sti_direct) {
 430                                 /*
 431                                  * Inherit sti_direct from listener and pass
 432                                  * SO_ACCEPTOR open flag to tcp, indicating
 433                                  * that this is an accept fast-path instance.
 434                                  */
 435                                 flags |= SO_ACCEPTOR;
 436                         } else {
 437                                 /*
 438                                  * sti_direct is not set on listener, meaning
 439                                  * that the listener has been converted from
 440                                  * a socket to a stream.  Ensure that the
 441                                  * acceptor inherits these settings.
 442                                  */
 443                                 sti->sti_direct = 0;
 444                                 flags &= ~SO_SOCKSTR;
 445                         }
 446                 }
 447         }
 448 
 449         /*
 450          * Tell local transport that it is talking to sockets.
 451          */
 452         if (so->so_family == AF_UNIX) {
 453                 flags |= SO_SOCKSTR;
 454         }
 455 
 456         vp = SOTOV(so);
 457         newdev = vp->v_rdev;
 458         maj = getmajor(newdev);
 459         ASSERT(STREAMSTAB(maj));
 460 
 461         error = stropen(vp, &newdev, flags, cr);
 462 
 463         stp = vp->v_stream;
 464         if (error == 0) {
 465                 if (so->so_flag & SOCLONE)
 466                         ASSERT(newdev != vp->v_rdev);
 467                 mutex_enter(&so->so_lock);
 468                 sti->sti_dev = newdev;
 469                 vp->v_rdev = newdev;
 470                 mutex_exit(&so->so_lock);
 471 
 472                 if (stp->sd_flag & STRISTTY) {
 473                         /*
 474                          * this is a post SVR4 tty driver - a socket can not
 475                          * be a controlling terminal. Fail the open.
 476                          */
 477                         (void) sotpi_close(so, flags, cr);
 478                         return (ENOTTY);        /* XXX */
 479                 }
 480 
 481                 ASSERT(stp->sd_wrq != NULL);
 482                 sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
 483 
 484                 /*
 485                  * If caller is interested in doing direct function call
 486                  * interface to/from transport module, probe the module
 487                  * directly beneath the streamhead to see if it qualifies.
 488                  *
 489                  * We turn off the direct interface when qualifications fail.
 490                  * In the acceptor case, we simply turn off the sti_direct
 491                  * flag on the socket. We do the fallback after the accept
 492                  * has completed, before the new socket is returned to the
 493                  * application.
 494                  */
 495                 if (sti->sti_direct) {
 496                         queue_t *tq = stp->sd_wrq->q_next;
 497 
 498                         /*
 499                          * sti_direct is currently supported and tested
 500                          * only for tcp/udp; this is the main reason to
 501                          * have the following assertions.
 502                          */
 503                         ASSERT(so->so_family == AF_INET ||
 504                             so->so_family == AF_INET6);
 505                         ASSERT(so->so_protocol == IPPROTO_UDP ||
 506                             so->so_protocol == IPPROTO_TCP ||
 507                             so->so_protocol == IPPROTO_IP);
 508                         ASSERT(so->so_type == SOCK_DGRAM ||
 509                             so->so_type == SOCK_STREAM);
 510 
 511                         /*
 512                          * Abort direct call interface if the module directly
 513                          * underneath the stream head is not defined with the
 514                          * _D_DIRECT flag.  This could happen in the tcp or
 515                          * udp case, when some other module is autopushed
 516                          * above it, or for some reasons the expected module
 517                          * isn't purely D_MP (which is the main requirement).
 518                          */
 519                         if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
 520                             !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
 521                                 int rval;
 522 
 523                                 /* Continue on without direct calls */
 524                                 sti->sti_direct = 0;
 525 
 526                                 /*
 527                                  * Cannot issue ioctl on fallback socket since
 528                                  * there is no conn associated with the queue.
 529                                  * The fallback downcall will notify the proto
 530                                  * of the change.
 531                                  */
 532                                 if (!(flags & SO_ACCEPTOR) &&
 533                                     !(flags & SO_FALLBACK)) {
 534                                         if ((error = strioctl(vp,
 535                                             _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
 536                                             cr, &rval)) != 0) {
 537                                                 (void) sotpi_close(so, flags,
 538                                                     cr);
 539                                                 return (error);
 540                                         }
 541                                 }
 542                         }
 543                 }
 544 
 545                 if (flags & SO_FALLBACK) {
 546                         /*
 547                          * The stream created does not have a conn.
 548                          * do stream set up after conn has been assigned
 549                          */
 550                         return (error);
 551                 }
 552                 if (error = so_strinit(so, tso)) {
 553                         (void) sotpi_close(so, flags, cr);
 554                         return (error);
 555                 }
 556 
 557                 /* Enable sendfile() on AF_UNIX streams */
 558                 if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
 559                         mutex_enter(&so->so_lock);
 560                         so->so_mode |= SM_SENDFILESUPP;
 561                         mutex_exit(&so->so_lock);
 562                 }
 563 
 564                 /* Wildcard */
 565                 if (so->so_protocol != so->so_sockparams->sp_protocol) {
 566                         int protocol = so->so_protocol;
 567                         /*
 568                          * Issue SO_PROTOTYPE setsockopt.
 569                          */
 570                         error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
 571                             &protocol, (t_uscalar_t)sizeof (protocol), cr);
 572                         if (error != 0) {
 573                                 (void) sotpi_close(so, flags, cr);
 574                                 /*
 575                                  * Setsockopt often fails with ENOPROTOOPT but
 576                                  * socket() should fail with
 577                                  * EPROTONOSUPPORT/EPROTOTYPE.
 578                                  */
 579                                 return (EPROTONOSUPPORT);
 580                         }
 581                 }
 582 
 583         } else {
 584                 /*
 585                  * While the same socket can not be reopened (unlike specfs)
 586                  * the stream head sets STREOPENFAIL when the autopush fails.
 587                  */
 588                 if ((stp != NULL) &&
 589                     (stp->sd_flag & STREOPENFAIL)) {
 590                         /*
 591                          * Open failed part way through.
 592                          */
 593                         mutex_enter(&stp->sd_lock);
 594                         stp->sd_flag &= ~STREOPENFAIL;
 595                         mutex_exit(&stp->sd_lock);
 596                         (void) sotpi_close(so, flags, cr);
 597                         return (error);
 598                         /*NOTREACHED*/
 599                 }
 600                 ASSERT(stp == NULL);
 601         }
 602         TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
 603             "sockfs open:maj %d vp %p so %p error %d",
 604             maj, vp, so, error);
 605         return (error);
 606 }
 607 
 608 /*
 609  * Bind the socket to an unspecified address in sockfs only.
 610  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
 611  * required in all cases.
 612  */
 613 static void
 614 so_automatic_bind(struct sonode *so)
 615 {
 616         sotpi_info_t *sti = SOTOTPI(so);
 617         ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
 618 
 619         ASSERT(MUTEX_HELD(&so->so_lock));
 620         ASSERT(!(so->so_state & SS_ISBOUND));
 621         ASSERT(sti->sti_unbind_mp);
 622 
 623         ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
 624         bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
 625         sti->sti_laddr_sa->sa_family = so->so_family;
 626         so->so_state |= SS_ISBOUND;
 627 }
 628 
 629 
 630 /*
 631  * bind the socket.
 632  *
 633  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
 634  * are passed in we allow rebinding. Note that for backwards compatibility
 635  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
 636  * Thus the rebinding code is currently not executed.
 637  *
 638  * The constraints for rebinding are:
 639  * - it is a SOCK_DGRAM, or
 640  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
 641  *   and no listen() has been done.
 642  * This rebinding code was added based on some language in the XNET book
  643  * about not returning EINVAL if the protocol allows rebinding. However,
 644  * this language is not present in the Posix socket draft. Thus maybe the
 645  * rebinding logic should be deleted from the source.
 646  *
 647  * A null "name" can be used to unbind the socket if:
 648  * - it is a SOCK_DGRAM, or
 649  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
 650  *   and no listen() has been done.
 651  */
 652 /* ARGSUSED */
 653 static int
 654 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
 655     socklen_t namelen, int backlog, int flags, struct cred *cr)
 656 {
 657         struct T_bind_req       bind_req;
 658         struct T_bind_ack       *bind_ack;
 659         int                     error = 0;
 660         mblk_t                  *mp;
 661         void                    *addr;
 662         t_uscalar_t             addrlen;
 663         int                     unbind_on_err = 1;
 664         boolean_t               clear_acceptconn_on_err = B_FALSE;
 665         boolean_t               restore_backlog_on_err = B_FALSE;
 666         int                     save_so_backlog;
 667         t_scalar_t              PRIM_type = O_T_BIND_REQ;
 668         boolean_t               tcp_udp_xport;
 669         void                    *nl7c = NULL;
 670         sotpi_info_t            *sti = SOTOTPI(so);
 671 
 672         dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
 673             (void *)so, (void *)name, namelen, backlog, flags,
 674             pr_state(so->so_state, so->so_mode)));
 675 
 676         tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
 677 
 678         if (!(flags & _SOBIND_LOCK_HELD)) {
 679                 mutex_enter(&so->so_lock);
 680                 so_lock_single(so);     /* Set SOLOCKED */
 681         } else {
 682                 ASSERT(MUTEX_HELD(&so->so_lock));
 683                 ASSERT(so->so_flag & SOLOCKED);
 684         }
 685 
 686         /*
 687          * Make sure that there is a preallocated unbind_req message
 688          * before binding. This message allocated when the socket is
 689          * created  but it might be have been consumed.
 690          */
 691         if (sti->sti_unbind_mp == NULL) {
 692                 dprintso(so, 1, ("sobind: allocating unbind_req\n"));
 693                 /* NOTE: holding so_lock while sleeping */
 694                 sti->sti_unbind_mp =
 695                     soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
 696                     cr);
 697         }
 698 
 699         if (flags & _SOBIND_REBIND) {
 700                 /*
 701                  * Called from solisten after doing an sotpi_unbind() or
 702                  * potentially without the unbind (latter for AF_INET{,6}).
 703                  */
 704                 ASSERT(name == NULL && namelen == 0);
 705 
 706                 if (so->so_family == AF_UNIX) {
 707                         ASSERT(sti->sti_ux_bound_vp);
 708                         addr = &sti->sti_ux_laddr;
 709                         addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
 710                         dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
 711                             "addr 0x%p, vp %p\n",
 712                             addrlen,
 713                             (void *)((struct so_ux_addr *)addr)->soua_vp,
 714                             (void *)sti->sti_ux_bound_vp));
 715                 } else {
 716                         addr = sti->sti_laddr_sa;
 717                         addrlen = (t_uscalar_t)sti->sti_laddr_len;
 718                 }
 719         } else if (flags & _SOBIND_UNSPEC) {
 720                 ASSERT(name == NULL && namelen == 0);
 721 
 722                 /*
 723                  * The caller checked SS_ISBOUND but not necessarily
 724                  * under so_lock
 725                  */
 726                 if (so->so_state & SS_ISBOUND) {
 727                         /* No error */
 728                         goto done;
 729                 }
 730 
 731                 /* Set an initial local address */
 732                 switch (so->so_family) {
 733                 case AF_UNIX:
 734                         /*
 735                          * Use an address with same size as struct sockaddr
 736                          * just like BSD.
 737                          */
 738                         sti->sti_laddr_len =
 739                             (socklen_t)sizeof (struct sockaddr);
 740                         ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
 741                         bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
 742                         sti->sti_laddr_sa->sa_family = so->so_family;
 743 
 744                         /*
 745                          * Pass down an address with the implicit bind
 746                          * magic number and the rest all zeros.
 747                          * The transport will return a unique address.
 748                          */
 749                         sti->sti_ux_laddr.soua_vp = NULL;
 750                         sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
 751                         addr = &sti->sti_ux_laddr;
 752                         addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
 753                         break;
 754 
 755                 case AF_INET:
 756                 case AF_INET6:
 757                         /*
 758                          * An unspecified bind in TPI has a NULL address.
 759                          * Set the address in sockfs to have the sa_family.
 760                          */
 761                         sti->sti_laddr_len = (so->so_family == AF_INET) ?
 762                             (socklen_t)sizeof (sin_t) :
 763                             (socklen_t)sizeof (sin6_t);
 764                         ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
 765                         bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
 766                         sti->sti_laddr_sa->sa_family = so->so_family;
 767                         addr = NULL;
 768                         addrlen = 0;
 769                         break;
 770 
 771                 default:
 772                         /*
 773                          * An unspecified bind in TPI has a NULL address.
 774                          * Set the address in sockfs to be zero length.
 775                          *
 776                          * Can not assume there is a sa_family for all
 777                          * protocol families. For example, AF_X25 does not
 778                          * have a family field.
 779                          */
 780                         bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
 781                         sti->sti_laddr_len = 0;      /* XXX correct? */
 782                         addr = NULL;
 783                         addrlen = 0;
 784                         break;
 785                 }
 786 
 787         } else {
 788                 if (so->so_state & SS_ISBOUND) {
 789                         /*
 790                          * If it is ok to rebind the socket, first unbind
 791                          * with the transport. A rebind to the NULL address
 792                          * is interpreted as an unbind.
 793                          * Note that a bind to NULL in BSD does unbind the
 794                          * socket but it fails with EINVAL.
 795                          * Note that regular sockets set SOV_SOCKBSD i.e.
 796                          * _SOBIND_SOCKBSD gets set here hence no type of
 797                          * socket does currently allow rebinding.
 798                          *
 799                          * If the name is NULL just do an unbind.
 800                          */
 801                         if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
 802                             name != NULL) {
 803                                 error = EINVAL;
 804                                 unbind_on_err = 0;
 805                                 eprintsoline(so, error);
 806                                 goto done;
 807                         }
 808                         if ((so->so_mode & SM_CONNREQUIRED) &&
 809                             (so->so_state & SS_CANTREBIND)) {
 810                                 error = EINVAL;
 811                                 unbind_on_err = 0;
 812                                 eprintsoline(so, error);
 813                                 goto done;
 814                         }
 815                         error = sotpi_unbind(so, 0);
 816                         if (error) {
 817                                 eprintsoline(so, error);
 818                                 goto done;
 819                         }
 820                         ASSERT(!(so->so_state & SS_ISBOUND));
 821                         if (name == NULL) {
 822                                 so->so_state &=
 823                                     ~(SS_ISCONNECTED|SS_ISCONNECTING);
 824                                 goto done;
 825                         }
 826                 }
 827 
 828                 /* X/Open requires this check */
 829                 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
 830                         if (xnet_check_print) {
 831                                 printf("sockfs: X/Open bind state check "
 832                                     "caused EINVAL\n");
 833                         }
 834                         error = EINVAL;
 835                         goto done;
 836                 }
 837 
 838                 switch (so->so_family) {
 839                 case AF_UNIX:
 840                         /*
 841                          * All AF_UNIX addresses are nul terminated
 842                          * when copied (copyin_name) in so the minimum
 843                          * length is 3 bytes.
 844                          */
 845                         if (name == NULL ||
 846                             (ssize_t)namelen <= sizeof (short) + 1) {
 847                                 error = EISDIR;
 848                                 eprintsoline(so, error);
 849                                 goto done;
 850                         }
 851                         /*
 852                          * Verify so_family matches the bound family.
 853                          * BSD does not check this for AF_UNIX resulting
 854                          * in funny mknods.
 855                          */
 856                         if (name->sa_family != so->so_family) {
 857                                 error = EAFNOSUPPORT;
 858                                 goto done;
 859                         }
 860                         break;
 861                 case AF_INET:
 862                         if (name == NULL) {
 863                                 error = EINVAL;
 864                                 eprintsoline(so, error);
 865                                 goto done;
 866                         }
 867                         if ((size_t)namelen != sizeof (sin_t)) {
 868                                 error = name->sa_family != so->so_family ?
 869                                     EAFNOSUPPORT : EINVAL;
 870                                 eprintsoline(so, error);
 871                                 goto done;
 872                         }
 873                         if ((flags & _SOBIND_XPG4_2) &&
 874                             (name->sa_family != so->so_family)) {
 875                                 /*
 876                                  * This check has to be made for X/Open
 877                                  * sockets however application failures have
 878                                  * been observed when it is applied to
 879                                  * all sockets.
 880                                  */
 881                                 error = EAFNOSUPPORT;
 882                                 eprintsoline(so, error);
 883                                 goto done;
 884                         }
 885                         /*
 886                          * Force a zero sa_family to match so_family.
 887                          *
 888                          * Some programs like inetd(8) don't set the
 889                          * family field. Other programs leave
 890                          * sin_family set to garbage - SunOS 4.X does
 891                          * not check the family field on a bind.
 892                          * We use the family field that
 893                          * was passed in to the socket() call.
 894                          */
 895                         name->sa_family = so->so_family;
 896                         break;
 897 
 898                 case AF_INET6: {
 899 #ifdef DEBUG
 900                         sin6_t *sin6 = (sin6_t *)name;
 901 #endif /* DEBUG */
 902 
 903                         if (name == NULL) {
 904                                 error = EINVAL;
 905                                 eprintsoline(so, error);
 906                                 goto done;
 907                         }
 908                         if ((size_t)namelen != sizeof (sin6_t)) {
 909                                 error = name->sa_family != so->so_family ?
 910                                     EAFNOSUPPORT : EINVAL;
 911                                 eprintsoline(so, error);
 912                                 goto done;
 913                         }
 914                         if (name->sa_family != so->so_family) {
 915                                 /*
 916                                  * With IPv6 we require the family to match
 917                                  * unlike in IPv4.
 918                                  */
 919                                 error = EAFNOSUPPORT;
 920                                 eprintsoline(so, error);
 921                                 goto done;
 922                         }
 923 #ifdef DEBUG
 924                         /*
 925                          * Verify that apps don't forget to clear
 926                          * sin6_scope_id etc
 927                          */
 928                         if (sin6->sin6_scope_id != 0 &&
 929                             !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
 930                                 zcmn_err(getzoneid(), CE_WARN,
 931                                     "bind with uninitialized sin6_scope_id "
 932                                     "(%d) on socket. Pid = %d\n",
 933                                     (int)sin6->sin6_scope_id,
 934                                     (int)curproc->p_pid);
 935                         }
 936                         if (sin6->__sin6_src_id != 0) {
 937                                 zcmn_err(getzoneid(), CE_WARN,
 938                                     "bind with uninitialized __sin6_src_id "
 939                                     "(%d) on socket. Pid = %d\n",
 940                                     (int)sin6->__sin6_src_id,
 941                                     (int)curproc->p_pid);
 942                         }
 943 #endif /* DEBUG */
 944                         break;
 945                 }
 946                 default:
 947                         /*
 948                          * Don't do any length or sa_family check to allow
 949                          * non-sockaddr style addresses.
 950                          */
 951                         if (name == NULL) {
 952                                 error = EINVAL;
 953                                 eprintsoline(so, error);
 954                                 goto done;
 955                         }
 956                         break;
 957                 }
 958 
 959                 if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
 960                         error = ENAMETOOLONG;
 961                         eprintsoline(so, error);
 962                         goto done;
 963                 }
 964                 /*
 965                  * Save local address.
 966                  */
 967                 sti->sti_laddr_len = (socklen_t)namelen;
 968                 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
 969                 bcopy(name, sti->sti_laddr_sa, namelen);
 970 
 971                 addr = sti->sti_laddr_sa;
 972                 addrlen = (t_uscalar_t)sti->sti_laddr_len;
 973                 switch (so->so_family) {
 974                 case AF_INET6:
 975                 case AF_INET:
 976                         break;
 977                 case AF_UNIX: {
 978                         struct sockaddr_un *soun =
 979                             (struct sockaddr_un *)sti->sti_laddr_sa;
 980                         struct vnode *vp, *rvp;
 981                         struct vattr vattr;
 982 
 983                         ASSERT(sti->sti_ux_bound_vp == NULL);
 984                         /*
 985                          * Create vnode for the specified path name.
 986                          * Keep vnode held with a reference in sti_ux_bound_vp.
 987                          * Use the vnode pointer as the address used in the
 988                          * bind with the transport.
 989                          *
 990                          * The mode is 0777 with the calling process's file
 991                          * creation mask (u_cmask) applied.
 992                          */
 993                         /* MAXPATHLEN + soun_family + nul termination */
 994                         if (sti->sti_laddr_len >
 995                             (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
 996                                 error = ENAMETOOLONG;
 997                                 eprintsoline(so, error);
 998                                 goto done;
 999                         }
1000                         vattr.va_type = VSOCK;
1001                         vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
1002                         vattr.va_mask = AT_TYPE|AT_MODE;
1003                         /* NOTE: holding so_lock */
1004                         error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
1005                             EXCL, 0, &vp, CRMKNOD, 0, 0);
1006                         if (error) {
1007                                 if (error == EEXIST)
1008                                         error = EADDRINUSE;
1009                                 eprintsoline(so, error);
1010                                 goto done;
1011                         }
1012                         /*
1013                          * Establish pointer from the underlying filesystem
1014                          * vnode to the socket node.
1015                          * sti_ux_bound_vp and v_stream->sd_vnode form the
1016                          * cross-linkage between the underlying filesystem
1017                          * node and the socket node.
1018                          */
1019 
1020                         if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
1021                                 VN_HOLD(rvp);
1022                                 VN_RELE(vp);
1023                                 vp = rvp;
1024                         }
1025 
1026                         ASSERT(SOTOV(so)->v_stream != NULL);
1027                         mutex_enter(&vp->v_lock);
1028                         vp->v_stream = SOTOV(so)->v_stream;
1029                         sti->sti_ux_bound_vp = vp;
1030                         mutex_exit(&vp->v_lock);
1031 
1032                         /*
1033                          * Use the vnode pointer value as a unique address
1034                          * (together with the magic number to avoid conflicts
1035                          * with implicit binds) in the transport provider.
1036                          */
1037                         sti->sti_ux_laddr.soua_vp =
1038                             (void *)sti->sti_ux_bound_vp;
1039                         sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1040                         addr = &sti->sti_ux_laddr;
1041                         addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1042                         dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1043                             addrlen,
1044                             (void *)((struct so_ux_addr *)addr)->soua_vp));
1045                         break;
1046                 }
1047                 } /* end switch (so->so_family) */
1048         }
1049 
1050         /*
1051          * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1052          * the transport can start passing up T_CONN_IND messages
1053          * as soon as it receives the bind req and strsock_proto()
1054          * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1055          */
1056         if (flags & _SOBIND_LISTEN) {
1057                 if ((so->so_state & SS_ACCEPTCONN) == 0)
1058                         clear_acceptconn_on_err = B_TRUE;
1059                 save_so_backlog = so->so_backlog;
1060                 restore_backlog_on_err = B_TRUE;
1061                 so->so_state |= SS_ACCEPTCONN;
1062                 so->so_backlog = backlog;
1063         }
1064 
1065         /*
1066          * If NL7C addr(s) have been configured check for addr/port match,
1067          * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1068          *
1069          * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1070          * family sockets only. If match mark as such.
1071          */
1072         if (nl7c_enabled && ((addr != NULL &&
1073             (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1074             (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1075             sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1076                 /*
1077                  * NL7C is not supported in non-global zones,
1078                  * we enforce this restriction here.
1079                  */
1080                 if (so->so_zoneid == GLOBAL_ZONEID) {
1081                         /* An NL7C socket, mark it */
1082                         sti->sti_nl7c_flags |= NL7C_ENABLED;
1083                         if (nl7c == NULL) {
1084                                 /*
1085                                  * Was an AF_NCA bind() so add it to the
1086                                  * addr list for reporting purposes.
1087                                  */
1088                                 nl7c = nl7c_add_addr(addr, addrlen);
1089                         }
1090                 } else
1091                         nl7c = NULL;
1092         }
1093 
1094         /*
1095          * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1096          * for other transports we will send in a O_T_BIND_REQ.
1097          */
1098         if (tcp_udp_xport &&
1099             (so->so_family == AF_INET || so->so_family == AF_INET6))
1100                 PRIM_type = T_BIND_REQ;
1101 
1102         bind_req.PRIM_type = PRIM_type;
1103         bind_req.ADDR_length = addrlen;
1104         bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1105         bind_req.CONIND_number = backlog;
1106         /* NOTE: holding so_lock while sleeping */
1107         mp = soallocproto2(&bind_req, sizeof (bind_req),
1108             addr, addrlen, 0, _ALLOC_SLEEP, cr);
1109         sti->sti_laddr_valid = 0;
1110 
1111         /* Done using sti_laddr_sa - can drop the lock */
1112         mutex_exit(&so->so_lock);
1113 
1114         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1115             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1116         if (error) {
1117                 eprintsoline(so, error);
1118                 mutex_enter(&so->so_lock);
1119                 goto done;
1120         }
1121 
1122         mutex_enter(&so->so_lock);
1123         error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1124             (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1125         if (error) {
1126                 eprintsoline(so, error);
1127                 goto done;
1128         }
1129         ASSERT(mp);
1130         /*
1131          * Even if some TPI message (e.g. T_DISCON_IND) was received in
1132          * strsock_proto while the lock was dropped above, the bind
1133          * is allowed to complete.
1134          */
1135 
1136         /* Mark as bound. This will be undone if we detect errors below. */
1137         if (flags & _SOBIND_NOXLATE) {
1138                 ASSERT(so->so_family == AF_UNIX);
1139                 sti->sti_faddr_noxlate = 1;
1140         }
1141         ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1142         so->so_state |= SS_ISBOUND;
1143         ASSERT(sti->sti_unbind_mp);
1144 
1145         /* note that we've already set SS_ACCEPTCONN above */
1146 
1147         /*
1148          * Recompute addrlen - an unspecified bind sent down an
1149          * address of length zero but we expect the appropriate length
1150          * in return.
1151          */
1152         addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1153             sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1154 
1155         bind_ack = (struct T_bind_ack *)mp->b_rptr;
1156         /*
1157          * The alignment restriction is really too strict but
1158          * we want enough alignment to inspect the fields of
1159          * a sockaddr_in.
1160          */
1161         addr = sogetoff(mp, bind_ack->ADDR_offset,
1162             bind_ack->ADDR_length,
1163             __TPI_ALIGN_SIZE);
1164         if (addr == NULL) {
1165                 freemsg(mp);
1166                 error = EPROTO;
1167                 eprintsoline(so, error);
1168                 goto done;
1169         }
1170         if (!(flags & _SOBIND_UNSPEC)) {
1171                 /*
1172                  * Verify that the transport didn't return something we
1173                  * did not want e.g. an address other than what we asked for.
1174                  *
1175                  * NOTE: These checks would go away if/when we switch to
1176                  * using the new TPI (in which the transport would fail
1177                  * the request instead of assigning a different address).
1178                  *
1179                  * NOTE2: For protocols that we don't know (i.e. any
1180                  * other than AF_INET6, AF_INET and AF_UNIX), we
1181                  * cannot know if the transport should be expected to
1182                  * return the same address as that requested.
1183                  *
1184                  * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1185                  * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1186                  *
1187                  * For example, in the case of netatalk it may be
1188                  * inappropriate for the transport to return the
1189                  * requested address (as it may have allocated a local
1190                  * port number in behaviour similar to that of an
1191                  * AF_INET bind request with a port number of zero).
1192                  *
1193                  * Given the definition of O_T_BIND_REQ, where the
1194                  * transport may bind to an address other than the
1195                  * requested address, it's not possible to determine
1196                  * whether a returned address that differs from the
1197                  * requested address is a reason to fail (because the
1198                  * requested address was not available) or succeed
1199                  * (because the transport allocated an appropriate
1200                  * address and/or port).
1201                  *
1202                  * sockfs currently requires that the transport return
1203                  * the requested address in the T_BIND_ACK, unless
1204                  * there is code here to allow for any discrepancy.
1205                  * Such code exists for AF_INET and AF_INET6.
1206                  *
1207                  * Netatalk chooses to return the requested address
1208                  * rather than the (correct) allocated address.  This
1209                  * means that netatalk violates the TPI specification
1210                  * (and would not function correctly if used from a
1211                  * TLI application), but it does mean that it works
1212                  * with sockfs.
1213                  *
1214                  * As noted above, using the newer XTI bind primitive
1215                  * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1216                  * allow sockfs to be more sure about whether or not
1217                  * the bind request had succeeded (as transports are
1218                  * not permitted to bind to a different address than
1219                  * that requested - they must return failure).
1220                  * Unfortunately, support for T_BIND_REQ may not be
1221                  * present in all transport implementations (netatalk,
1222                  * for example, doesn't have it), making the
1223                  * transition difficult.
1224                  */
1225                 if (bind_ack->ADDR_length != addrlen) {
1226                         /* Assumes that the requested address was in use */
1227                         freemsg(mp);
1228                         error = EADDRINUSE;
1229                         eprintsoline(so, error);
1230                         goto done;
1231                 }
1232 
1233                 switch (so->so_family) {
1234                 case AF_INET6:
1235                 case AF_INET: {
1236                         sin_t *rname, *aname;
1237 
1238                         rname = (sin_t *)addr;
1239                         aname = (sin_t *)sti->sti_laddr_sa;
1240 
1241                         /*
1242                          * Take advantage of the alignment
1243                          * of sin_port and sin6_port which fall
1244                          * in the same place in their data structures.
1245                          * Just use sin_port for either address family.
1246                          *
1247                          * This may become a problem if (heaven forbid)
1248                          * there's a separate ipv6port_reserved... :-P
1249                          *
1250                          * Binding to port 0 has the semantics of letting
1251                          * the transport bind to any port.
1252                          *
1253                          * If the transport is TCP or UDP since we had sent
1254                          * a T_BIND_REQ we would not get a port other than
1255                          * what we asked for.
1256                          */
1257                         if (tcp_udp_xport) {
1258                                 /*
1259                                  * Pick up the new port number if we bound to
1260                                  * port 0.
1261                                  */
1262                                 if (aname->sin_port == 0)
1263                                         aname->sin_port = rname->sin_port;
1264                                 sti->sti_laddr_valid = 1;
1265                                 break;
1266                         }
1267                         if (aname->sin_port != 0 &&
1268                             aname->sin_port != rname->sin_port) {
1269                                 freemsg(mp);
1270                                 error = EADDRINUSE;
1271                                 eprintsoline(so, error);
1272                                 goto done;
1273                         }
1274                         /*
1275                          * Pick up the new port number if we bound to port 0.
1276                          */
1277                         aname->sin_port = rname->sin_port;
1278 
1279                         /*
1280                          * Unfortunately, addresses aren't _quite_ the same.
1281                          */
1282                         if (so->so_family == AF_INET) {
1283                                 if (aname->sin_addr.s_addr !=
1284                                     rname->sin_addr.s_addr) {
1285                                         freemsg(mp);
1286                                         error = EADDRNOTAVAIL;
1287                                         eprintsoline(so, error);
1288                                         goto done;
1289                                 }
1290                         } else {
1291                                 sin6_t *rname6 = (sin6_t *)rname;
1292                                 sin6_t *aname6 = (sin6_t *)aname;
1293 
1294                                 if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1295                                     &rname6->sin6_addr)) {
1296                                         freemsg(mp);
1297                                         error = EADDRNOTAVAIL;
1298                                         eprintsoline(so, error);
1299                                         goto done;
1300                                 }
1301                         }
1302                         break;
1303                 }
1304                 case AF_UNIX:
1305                         if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1306                                 freemsg(mp);
1307                                 error = EADDRINUSE;
1308                                 eprintsoline(so, error);
1309                                 eprintso(so,
1310                                     ("addrlen %d, addr 0x%x, vp %p\n",
1311                                     addrlen, *((int *)addr),
1312                                     (void *)sti->sti_ux_bound_vp));
1313                                 goto done;
1314                         }
1315                         sti->sti_laddr_valid = 1;
1316                         break;
1317                 default:
1318                         /*
1319                          * NOTE: This assumes that addresses can be
1320                          * byte-compared for equivalence.
1321                          */
1322                         if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1323                                 freemsg(mp);
1324                                 error = EADDRINUSE;
1325                                 eprintsoline(so, error);
1326                                 goto done;
1327                         }
1328                         /*
1329                          * Don't mark sti_laddr_valid, as we cannot be
1330                          * sure that the returned address is the real
1331                          * bound address when talking to an unknown
1332                          * transport.
1333                          */
1334                         break;
1335                 }
1336         } else {
1337                 /*
1338                  * Save the returned address for getsockname.
1339                  * Needed for unspecific bind unless transport supports
1340                  * the TI_GETMYNAME ioctl.
1341                  * Do this for AF_INET{,6} even though they do, as
1342                  * caching info here is much better performance than
1343                  * a TPI/STREAMS trip to the transport for getsockname.
1344                  * Any which can't for some reason _must_ _not_ set
1345                  * sti_laddr_valid here for the caching version of
1346                  * getsockname to not break;
1347                  */
1348                 switch (so->so_family) {
1349                 case AF_UNIX:
1350                         /*
1351                          * Record the address bound with the transport
1352                          * for use by socketpair.
1353                          */
1354                         bcopy(addr, &sti->sti_ux_laddr, addrlen);
1355                         sti->sti_laddr_valid = 1;
1356                         break;
1357                 case AF_INET:
1358                 case AF_INET6:
1359                         ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1360                         bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1361                         sti->sti_laddr_valid = 1;
1362                         break;
1363                 default:
1364                         /*
1365                          * Don't mark sti_laddr_valid, as we cannot be
1366                          * sure that the returned address is the real
1367                          * bound address when talking to an unknown
1368                          * transport.
1369                          */
1370                         break;
1371                 }
1372         }
1373 
1374         if (nl7c != NULL) {
1375                 /* Register listen()er sonode pointer with NL7C */
1376                 nl7c_listener_addr(nl7c, so);
1377         }
1378 
1379         freemsg(mp);
1380 
1381 done:
1382         if (error) {
1383                 /* reset state & backlog to values held on entry */
1384                 if (clear_acceptconn_on_err == B_TRUE)
1385                         so->so_state &= ~SS_ACCEPTCONN;
1386                 if (restore_backlog_on_err == B_TRUE)
1387                         so->so_backlog = save_so_backlog;
1388 
1389                 if (unbind_on_err && so->so_state & SS_ISBOUND) {
1390                         int err;
1391 
1392                         err = sotpi_unbind(so, 0);
1393                         /* LINTED - statement has no consequent: if */
1394                         if (err) {
1395                                 eprintsoline(so, error);
1396                         } else {
1397                                 ASSERT(!(so->so_state & SS_ISBOUND));
1398                         }
1399                 }
1400         }
1401         if (!(flags & _SOBIND_LOCK_HELD)) {
1402                 so_unlock_single(so, SOLOCKED);
1403                 mutex_exit(&so->so_lock);
1404         } else {
1405                 ASSERT(MUTEX_HELD(&so->so_lock));
1406                 ASSERT(so->so_flag & SOLOCKED);
1407         }
1408         return (error);
1409 }
1410 
1411 /* Bind the socket; _SOBIND_SOCKETPAIR selects a backlog of 1, else 0. */
1412 static int
1413 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1414     int flags, struct cred *cr)
1415 {
1416         int     backlog = (flags & _SOBIND_SOCKETPAIR) ? 1 : 0;
1417 
1418         /* _SOBIND_SOCKETPAIR is consumed here and not passed any further. */
1419         flags &= ~_SOBIND_SOCKETPAIR;
1420         return (sotpi_bindlisten(so, name, namelen, backlog, flags, cr));
1421 }
1422 
1423 /*
1424  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1425  * address, or when listen needs to unbind and bind.
1426  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1427  * so that a sobind can pick them up.
      *
      * Locking: the caller must hold so_lock and have set SOLOCKED; both are
      * asserted on entry and again on exit. so_lock is dropped while the
      * M_FLUSH and the T_UNBIND_REQ are sent down the stream and is re-taken
      * before waiting for the acknowledgement.
      *
      * Returns 0 on success, EINVAL if the socket is not bound, or the error
      * returned by kstrputmsg()/sowaitokack(). On every failure path the
      * so_state bits and the sti address fields are left unmodified.
1428  */
1429 static int
1430 sotpi_unbind(struct sonode *so, int flags)
1431 {
1432         struct T_unbind_req     unbind_req;
1433         int                     error = 0;
1434         mblk_t                  *mp;
1435         sotpi_info_t            *sti = SOTOTPI(so);
1436 
1437         dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1438             (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1439 
1440         ASSERT(MUTEX_HELD(&so->so_lock));
1441         ASSERT(so->so_flag & SOLOCKED);
1442 
             /* There is nothing to unbind unless the socket is bound. */
1443         if (!(so->so_state & SS_ISBOUND)) {
1444                 error = EINVAL;
1445                 eprintsoline(so, error);
1446                 goto done;
1447         }
1448 
             /*
              * so_lock is not held across the stream operations below; it is
              * re-acquired right after kstrputmsg() returns.
              */
1449         mutex_exit(&so->so_lock);
1450 
1451         /*
1452          * Flush the read and write side (except stream head read queue)
1453          * and send down T_UNBIND_REQ.
1454          */
1455         (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1456 
             /*
              * NOTE(review): mp is used without a NULL check; presumably an
              * _ALLOC_SLEEP allocation cannot fail - confirm in soallocproto1().
              */
1457         unbind_req.PRIM_type = T_UNBIND_REQ;
1458         mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1459             0, _ALLOC_SLEEP, CRED());
1460         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1461             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1462         mutex_enter(&so->so_lock);
1463         if (error) {
1464                 eprintsoline(so, error);
1465                 goto done;
1466         }
1467 
             /* With so_lock held again, wait for the reply to T_UNBIND_REQ. */
1468         error = sowaitokack(so, T_UNBIND_REQ);
1469         if (error) {
1470                 eprintsoline(so, error);
1471                 goto done;
1472         }
1473 
1474         /*
1475          * Even if some TPI message (e.g. T_DISCON_IND) was received in
1476          * strsock_proto while the lock was dropped above, the unbind
1477          * is allowed to complete.
1478          */
1479         if (!(flags & _SOUNBIND_REBIND)) {
1480                 /*
1481                  * Clear out bound address.
1482                  */
1483                 vnode_t *vp;
1484 
                     /* Detach the vnode from the sonode before releasing it. */
1485                 if ((vp = sti->sti_ux_bound_vp) != NULL) {
1486                         sti->sti_ux_bound_vp = NULL;
1487                         vn_rele_stream(vp);
1488                 }
1489                 /* Clear out address */
1490                 sti->sti_laddr_len = 0;
1491         }
             /*
              * With or without _SOUNBIND_REBIND the socket is now neither bound
              * nor listening, and the cached local address is marked invalid.
              */
1492         so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1493         sti->sti_laddr_valid = 0;
1494 
1495 done:
1496 
1497         /* so_lock and SOLOCKED belong to the caller; never release them here */
1498         ASSERT(MUTEX_HELD(&so->so_lock));
1499         ASSERT(so->so_flag & SOLOCKED);
1500 
1501         return (error);
1502 }
1503 
1504 /*
1505  * listen on the socket.
1506  * For TPI conforming transports this has to first unbind with the transport
1507  * and then bind again using the new backlog.
      *
      * A few early-out checks are made without so_lock; after that so_lock and
      * SOLOCKED are taken here and both are released before returning.
      *
      * Returns 0 on success (including the cases where nothing has to change),
      * EOPNOTSUPP for a T_CLTS transport, EINVAL for a connected socket or an
      * unbound AF_UNIX socket, or the error from sotpi_unbind() or
      * sotpi_bindlisten(). cr is passed through to sotpi_bindlisten().
1508  */
1509 /* ARGSUSED */
1510 int
1511 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1512 {
1513         int             error = 0;
1514         sotpi_info_t    *sti = SOTOTPI(so);
1515 
1516         dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1517             (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1518 
             /* listen() is rejected outright on a T_CLTS service type. */
1519         if (sti->sti_serv_type == T_CLTS)
1520                 return (EOPNOTSUPP);
1521 
1522         /*
1523          * If the socket is ready to accept connections already, then
1524          * return without doing anything.  This avoids a problem where
1525          * a second listen() call fails if a connection is pending and
1526          * leaves the socket unbound. Only when we are not unbinding
1527          * with the transport can we safely increase the backlog.
              *
              * The second half of the condition is true for every family other
              * than AF_INET/AF_INET6, and for those two as well whenever
              * solisten_tpi_tcp is non-zero. so_state is read here (and in the
              * SS_ISCONNECTED check below) before so_lock is taken.
1528          */
1529         if (so->so_state & SS_ACCEPTCONN &&
1530             !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1531             /*CONSTCOND*/
1532             !solisten_tpi_tcp))
1533                 return (0);
1534 
1535         if (so->so_state & SS_ISCONNECTED)
1536                 return (EINVAL);
1537 
1538         mutex_enter(&so->so_lock);
1539         so_lock_single(so);     /* Set SOLOCKED */
1540 
1541         /*
1542          * If the listen doesn't change the backlog we do nothing.
1543          * This avoids an EPROTO error from the transport.
1544          */
1545         if ((so->so_state & SS_ACCEPTCONN) &&
1546             so->so_backlog == backlog)
1547                 goto done;
1548 
1549         if (!(so->so_state & SS_ISBOUND)) {
1550                 /*
1551                  * Must have been explicitly bound in the UNIX domain.
1552                  */
1553                 if (so->so_family == AF_UNIX) {
1554                         error = EINVAL;
1555                         goto done;
1556                 }
                     /*
                      * Not bound yet: bind with no address (_SOBIND_UNSPEC) and
                      * the requested backlog. _SOBIND_LOCK_HELD tells the callee
                      * that so_lock/SOLOCKED are already held by this function.
                      */
1557                 error = sotpi_bindlisten(so, NULL, 0, backlog,
1558                     _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1559         } else if (backlog > 0) {
1560                 /*
1561                  * AF_INET{,6} hack to avoid losing the port.
1562                  * Assumes that all AF_INET{,6} transports can handle a
1563                  * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1564                  * has already bound thus it is possible to avoid the unbind.
1565                  */
1566                 if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1567                     /*CONSTCOND*/
1568                     !solisten_tpi_tcp)) {
                             /* Keep the addresses so the rebind below can use them. */
1569                         error = sotpi_unbind(so, _SOUNBIND_REBIND);
1570                         if (error)
1571                                 goto done;
1572                 }
1573                 error = sotpi_bindlisten(so, NULL, 0, backlog,
1574                     _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1575         } else {
                     /*
                      * Already bound and backlog <= 0: only the sonode is updated
                      * on this path; nothing is sent to the transport.
                      */
1576                 so->so_state |= SS_ACCEPTCONN;
1577                 so->so_backlog = backlog;
1578         }
1579         if (error)
1580                 goto done;
             /* Every successful path above must leave the socket listening. */
1581         ASSERT(so->so_state & SS_ACCEPTCONN);
1582 done:
1583         so_unlock_single(so, SOLOCKED);
1584         mutex_exit(&so->so_lock);
1585         return (error);
1586 }
1587 
1588 /*
1589  * Disconnect either a specified seqno or all (-1).
1590  * The former is used on listening sockets only.
1591  *
1592  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1593  * the current use of sodisconnect(seqno == -1) is only for shutdown
1594  * so unbinding would be pointless (and potentially incorrect).
      *
      * Locking: with _SODISCONNECT_LOCK_HELD in flags the caller already holds
      * so_lock and SOLOCKED and still holds them on return; otherwise both
      * are taken and released here. In either case so_lock is dropped while
      * the T_DISCON_REQ is sent down the stream.
      *
      * Returns 0 on success, EINVAL if the socket is neither connected,
      * connecting nor listening, or the error from kstrputmsg()/sowaitokack().
1595  */
1596 static int
1597 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1598 {
1599         struct T_discon_req     discon_req;
1600         int                     error = 0;
1601         mblk_t                  *mp;
1602 
1603         dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1604             (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1605 
1606         if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1607                 mutex_enter(&so->so_lock);
1608                 so_lock_single(so);     /* Set SOLOCKED */
1609         } else {
1610                 ASSERT(MUTEX_HELD(&so->so_lock));
1611                 ASSERT(so->so_flag & SOLOCKED);
1612         }
1613 
             /* There must be a connection (or a listener) to disconnect. */
1614         if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1615                 error = EINVAL;
1616                 eprintsoline(so, error);
1617                 goto done;
1618         }
1619 
             /* so_lock is re-acquired right after kstrputmsg() below. */
1620         mutex_exit(&so->so_lock);
1621         /*
1622          * Flush the write side (unless this is a listener)
1623          * and then send down a T_DISCON_REQ.
1624          * (Don't flush on listener since it could flush {O_}T_CONN_RES
1625          * and other messages.)
              *
              * Note that so_state is examined here with so_lock dropped.
1626          */
1627         if (!(so->so_state & SS_ACCEPTCONN))
1628                 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1629 
             /*
              * NOTE(review): mp is used without a NULL check; presumably an
              * _ALLOC_SLEEP allocation cannot fail - confirm in soallocproto1().
              */
1630         discon_req.PRIM_type = T_DISCON_REQ;
1631         discon_req.SEQ_number = seqno;
1632         mp = soallocproto1(&discon_req, sizeof (discon_req),
1633             0, _ALLOC_SLEEP, CRED());
1634         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1635             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1636         mutex_enter(&so->so_lock);
1637         if (error) {
1638                 eprintsoline(so, error);
1639                 goto done;
1640         }
1641 
             /* With so_lock held again, wait for the reply to T_DISCON_REQ. */
1642         error = sowaitokack(so, T_DISCON_REQ);
1643         if (error) {
1644                 eprintsoline(so, error);
1645                 goto done;
1646         }
1647         /*
1648          * Even if some TPI message (e.g. T_DISCON_IND) was received in
1649          * strsock_proto while the lock was dropped above, the disconnect
1650          * is allowed to complete. However, it is not possible to
1651          * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1652          */
1653         so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
             /* Invalidate the cached local and peer addresses as well. */
1654         SOTOTPI(so)->sti_laddr_valid = 0;
1655         SOTOTPI(so)->sti_faddr_valid = 0;
1656 done:
1657         if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1658                 so_unlock_single(so, SOLOCKED);
1659                 mutex_exit(&so->so_lock);
1660         } else {
1661                 /* If the caller held the lock don't release it here */
1662                 ASSERT(MUTEX_HELD(&so->so_lock));
1663                 ASSERT(so->so_flag & SOLOCKED);
1664         }
1665         return (error);
1666 }
1667 
1668 /* ARGSUSED */
1669 int
1670 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1671     struct sonode **nsop)
1672 {
1673         struct T_conn_ind       *conn_ind;
1674         struct T_conn_res       *conn_res;
1675         int                     error = 0;
1676         mblk_t                  *mp, *ack_mp;
1677         struct sonode           *nso;
1678         vnode_t                 *nvp;
1679         void                    *src;
1680         t_uscalar_t             srclen;
1681         void                    *opt;
1682         t_uscalar_t             optlen;
1683         t_scalar_t              PRIM_type;
1684         t_scalar_t              SEQ_number;
1685         size_t                  sinlen;
1686         sotpi_info_t            *sti = SOTOTPI(so);
1687         sotpi_info_t            *nsti;
1688 
1689         dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1690             (void *)so, fflag, (void *)nsop,
1691             pr_state(so->so_state, so->so_mode)));
1692 
1693         /*
1694          * Defer single-threading the accepting socket until
1695          * the T_CONN_IND has been received and parsed and the
1696          * new sonode has been opened.
1697          */
1698 
1699         /* Check that the socket is listening (accepting connections) */
1700         if ((so->so_state & SS_ACCEPTCONN) == 0)
1701                 goto conn_bad;
1702 again:
1703         if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1704                 goto e_bad;
1705 
1706         ASSERT(mp != NULL);
1707         conn_ind = (struct T_conn_ind *)mp->b_rptr;
1708 
1709         /*
1710          * Save SEQ_number for error paths.
1711          */
1712         SEQ_number = conn_ind->SEQ_number;
1713 
1714         srclen = conn_ind->SRC_length;
1715         src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1716         if (src == NULL) {
1717                 error = EPROTO;
1718                 freemsg(mp);
1719                 eprintsoline(so, error);
1720                 goto disconnect_unlocked;
1721         }
1722         optlen = conn_ind->OPT_length;
1723         switch (so->so_family) {
1724         case AF_INET:
1725         case AF_INET6:
1726                 if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1727                         bcopy(mp->b_rptr + conn_ind->OPT_offset,
1728                             &opt, conn_ind->OPT_length);
1729                 } else {
1730                         /*
1731                          * The transport (in this case TCP) hasn't sent up
1732                          * a pointer to an instance for the accept fast-path.
1733                          * Disable fast-path completely because the call to
1734                          * sotpi_create() below would otherwise create an
1735                          * incomplete TCP instance, which would lead to
1736                          * problems when sockfs sends a normal T_CONN_RES
1737                          * message down the new stream.
1738                          */
1739                         if (sti->sti_direct) {
1740                                 int rval;
1741                                 /*
1742                                  * For consistency we inform tcp to disable
1743                                  * direct interface on the listener, though
1744                                  * we can certainly live without doing this
1745                                  * because no data will ever travel upstream
1746                                  * on the listening socket.
1747                                  */
1748                                 sti->sti_direct = 0;
1749                                 (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1750                                     0, 0, K_TO_K, cr, &rval);
1751                         }
1752                         opt = NULL;
1753                         optlen = 0;
1754                 }
1755                 break;
1756         case AF_UNIX:
1757         default:
1758                 if (optlen != 0) {
1759                         opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1760                             __TPI_ALIGN_SIZE);
1761                         if (opt == NULL) {
1762                                 error = EPROTO;
1763                                 freemsg(mp);
1764                                 eprintsoline(so, error);
1765                                 goto disconnect_unlocked;
1766                         }
1767                 }
1768                 if (so->so_family == AF_UNIX) {
1769                         if (!sti->sti_faddr_noxlate) {
1770                                 src = NULL;
1771                                 srclen = 0;
1772                         }
1773                         /* Extract src address from options */
1774                         if (optlen != 0)
1775                                 so_getopt_srcaddr(opt, optlen, &src, &srclen);
1776                 }
1777                 break;
1778         }
1779 
1780         /*
1781          * Create the new socket.
1782          */
1783         nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1784         if (nso == NULL) {
1785                 ASSERT(error != 0);
1786                 /*
1787                  * Accept can not fail with ENOBUFS. sotpi_create
1788                  * sleeps waiting for memory until a signal is caught
1789                  * so return EINTR.
1790                  */
1791                 freemsg(mp);
1792                 if (error == ENOBUFS)
1793                         error = EINTR;
1794                 goto e_disc_unl;
1795         }
1796         nvp = SOTOV(nso);
1797         nsti = SOTOTPI(nso);
1798 
1799 #ifdef DEBUG
1800         /*
1801          * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1802          * it's inherited early to allow debugging of the accept code itself.
1803          */
1804         nso->so_options |= so->so_options & SO_DEBUG;
1805 #endif /* DEBUG */
1806 
1807         /*
1808          * Save the SRC address from the T_CONN_IND
1809          * for getpeername to work on AF_UNIX and on transports that do not
1810          * support TI_GETPEERNAME.
1811          *
1812          * NOTE: AF_UNIX NUL termination is ensured by the sender's
1813          * copyin_name().
1814          */
1815         if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1816                 error = EINVAL;
1817                 freemsg(mp);
1818                 eprintsoline(so, error);
1819                 goto disconnect_vp_unlocked;
1820         }
1821         nsti->sti_faddr_len = (socklen_t)srclen;
1822         ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1823         bcopy(src, nsti->sti_faddr_sa, srclen);
1824         nsti->sti_faddr_valid = 1;
1825 
1826         /*
1827          * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1828          */
1829         if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1830             (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1831                 cred_t  *cr;
1832                 pid_t   cpid;
1833 
1834                 cr = msg_getcred(mp, &cpid);
1835                 if (cr != NULL) {
1836                         crhold(cr);
1837                         nso->so_peercred = cr;
1838                         nso->so_cpid = cpid;
1839                 }
1840                 freemsg(mp);
1841 
1842                 mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1843                     sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1844                 if (mp == NULL) {
1845                         /*
1846                          * Accept can not fail with ENOBUFS.
1847                          * A signal was caught so return EINTR.
1848                          */
1849                         error = EINTR;
1850                         eprintsoline(so, error);
1851                         goto disconnect_vp_unlocked;
1852                 }
1853                 conn_res = (struct T_conn_res *)mp->b_rptr;
1854         } else {
1855                 /*
1856                  * For efficiency reasons we use msg_extractcred; no crhold
1857                  * needed since db_credp is cleared (i.e., we move the cred
1858                  * from the message to so_peercred).
1859                  */
1860                 nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1861 
1862                 mp->b_rptr = DB_BASE(mp);
1863                 conn_res = (struct T_conn_res *)mp->b_rptr;
1864                 mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1865 
1866                 mblk_setcred(mp, cr, curproc->p_pid);
1867         }
1868 
1869         /*
1870          * New socket must be bound at least in sockfs and, except for AF_INET,
1871          * (or AF_INET6) it also has to be bound in the transport provider.
1872          * We set the local address in the sonode from the T_OK_ACK of the
1873          * T_CONN_RES. For this reason the address we bind to here isn't
1874          * important.
1875          */
1876         if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1877             /*CONSTCOND*/
1878             nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1879                 /*
1880                  * Optimization for AF_INET{,6} transports
1881                  * that can handle a T_CONN_RES without being bound.
1882                  */
1883                 mutex_enter(&nso->so_lock);
1884                 so_automatic_bind(nso);
1885                 mutex_exit(&nso->so_lock);
1886         } else {
1887                 /* Perform NULL bind with the transport provider. */
1888                 if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1889                     cr)) != 0) {
1890                         ASSERT(error != ENOBUFS);
1891                         freemsg(mp);
1892                         eprintsoline(nso, error);
1893                         goto disconnect_vp_unlocked;
1894                 }
1895         }
1896 
1897         /*
1898          * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1899          * so that any data arriving on the new socket will cause the
1900          * appropriate signals to be delivered for the new socket.
1901          *
1902          * No other thread (except strsock_proto and strsock_misc)
1903          * can access the new socket thus we relax the locking.
1904          */
1905         nso->so_pgrp = so->so_pgrp;
1906         nso->so_state |= so->so_state & SS_ASYNC;
1907         nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1908 
1909         if (nso->so_pgrp != 0) {
1910                 if ((error = so_set_events(nso, nvp, cr)) != 0) {
1911                         eprintsoline(nso, error);
1912                         error = 0;
1913                         nso->so_pgrp = 0;
1914                 }
1915         }
1916 
1917         /*
1918          * Make note of the socket level options. TCP and IP level options
1919          * are already inherited. We could do all this after accept is
1920          * successful but doing it here simplifies code and no harm done
1921          * for error case.
1922          */
1923         nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1924             SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1925             SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1926         nso->so_sndbuf = so->so_sndbuf;
1927         nso->so_rcvbuf = so->so_rcvbuf;
1928         if (nso->so_options & SO_LINGER)
1929                 nso->so_linger = so->so_linger;
1930 
1931         /*
1932          * Note that the following sti_direct code path should be
1933          * removed once we are confident that the direct sockets
1934          * do not result in any degradation.
1935          */
1936         if (sti->sti_direct) {
1937 
1938                 ASSERT(opt != NULL);
1939 
1940                 conn_res->OPT_length = optlen;
1941                 conn_res->OPT_offset = MBLKL(mp);
1942                 bcopy(&opt, mp->b_wptr, optlen);
1943                 mp->b_wptr += optlen;
1944                 conn_res->PRIM_type = T_CONN_RES;
1945                 conn_res->ACCEPTOR_id = 0;
1946                 PRIM_type = T_CONN_RES;
1947 
1948                 /* Send down the T_CONN_RES on acceptor STREAM */
1949                 error = kstrputmsg(SOTOV(nso), mp, NULL,
1950                     0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1951                 if (error) {
1952                         mutex_enter(&so->so_lock);
1953                         so_lock_single(so);
1954                         eprintsoline(so, error);
1955                         goto disconnect_vp;
1956                 }
1957                 mutex_enter(&nso->so_lock);
1958                 error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1959                     (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1960                 if (error) {
1961                         mutex_exit(&nso->so_lock);
1962                         mutex_enter(&so->so_lock);
1963                         so_lock_single(so);
1964                         eprintsoline(so, error);
1965                         goto disconnect_vp;
1966                 }
1967                 if (nso->so_family == AF_INET) {
1968                         sin_t *sin;
1969 
1970                         sin = (sin_t *)(ack_mp->b_rptr +
1971                             sizeof (struct T_ok_ack));
1972                         bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1973                         nsti->sti_laddr_len = sizeof (sin_t);
1974                 } else {
1975                         sin6_t *sin6;
1976 
1977                         sin6 = (sin6_t *)(ack_mp->b_rptr +
1978                             sizeof (struct T_ok_ack));
1979                         bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1980                         nsti->sti_laddr_len = sizeof (sin6_t);
1981                 }
1982                 freemsg(ack_mp);
1983 
1984                 nso->so_state |= SS_ISCONNECTED;
1985                 nso->so_proto_handle = (sock_lower_handle_t)opt;
1986                 nsti->sti_laddr_valid = 1;
1987 
1988                 if (sti->sti_nl7c_flags & NL7C_ENABLED) {
1989                         /*
1990                          * A NL7C marked listen()er so the new socket
1991                          * inherits the listen()er's NL7C state, except
1992                          * for NL7C_POLLIN.
1993                          *
1994                          * Only call NL7C to process the new socket if
1995                          * the listen socket allows blocking i/o.
1996                          */
1997                         nsti->sti_nl7c_flags =
1998                             sti->sti_nl7c_flags & (~NL7C_POLLIN);
1999                         if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
2000                                 /*
2001                                  * Nonblocking accept() just make it
2002                                  * persist to defer processing to the
2003                                  * read-side syscall (e.g. read).
2004                                  */
2005                                 nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
2006                         } else if (nl7c_process(nso, B_FALSE)) {
2007                                 /*
2008                                  * NL7C has completed processing on the
2009                                  * socket, close the socket and back to
2010                                  * the top to await the next T_CONN_IND.
2011                                  */
2012                                 mutex_exit(&nso->so_lock);
2013                                 (void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
2014                                     cr, NULL);
2015                                 VN_RELE(nvp);
2016                                 goto again;
2017                         }
2018                         /* Pass the new socket out */
2019                 }
2020 
2021                 mutex_exit(&nso->so_lock);
2022 
2023                 /*
2024                  * It's possible, through the use of autopush for example,
2025                  * that the acceptor stream may not support sti_direct
2026                  * semantics. If the new socket does not support sti_direct
2027                  * we issue a _SIOCSOCKFALLBACK to inform the transport
2028                  * as we would in the I_PUSH case.
2029                  */
2030                 if (nsti->sti_direct == 0) {
2031                         int     rval;
2032 
2033                         if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2034                             0, 0, K_TO_K, cr, &rval)) != 0) {
2035                                 mutex_enter(&so->so_lock);
2036                                 so_lock_single(so);
2037                                 eprintsoline(so, error);
2038                                 goto disconnect_vp;
2039                         }
2040                 }
2041 
2042                 /*
2043                  * Pass out new socket.
2044                  */
2045                 if (nsop != NULL)
2046                         *nsop = nso;
2047 
2048                 return (0);
2049         }
2050 
2051         /*
2052          * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2053          * which don't support the FireEngine accept fast-path. It is also
2054          * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2055          * again. Neither sockfs nor TCP attempt to find out if some other
2056          * random module has been inserted in between (in which case we
2057          * should follow TLI accept behaviour). We blindly assume the worst
2058          * case and revert back to old behaviour i.e. TCP will not send us
2059          * any option (eager) and the accept should happen on the listener
2060          * queue. Any queued T_conn_ind have already got their options removed
2061          * by so_sock2_stream() when "sockmod" was I_POP'd.
2062          */
2063         /*
2064          * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2065          */
2066         if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2067 #ifdef  _ILP32
2068                 queue_t *q;
2069 
2070                 /*
2071                  * Find read queue in driver
2072                  * Can safely do this since we "own" nso/nvp.
2073                  */
2074                 q = strvp2wq(nvp)->q_next;
2075                 while (SAMESTR(q))
2076                         q = q->q_next;
2077                 q = RD(q);
2078                 conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2079 #else
2080                 conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2081 #endif  /* _ILP32 */
2082                 conn_res->PRIM_type = O_T_CONN_RES;
2083                 PRIM_type = O_T_CONN_RES;
2084         } else {
2085                 conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2086                 conn_res->PRIM_type = T_CONN_RES;
2087                 PRIM_type = T_CONN_RES;
2088         }
2089         conn_res->SEQ_number = SEQ_number;
2090         conn_res->OPT_length = 0;
2091         conn_res->OPT_offset = 0;
2092 
2093         mutex_enter(&so->so_lock);
2094         so_lock_single(so);     /* Set SOLOCKED */
2095         mutex_exit(&so->so_lock);
2096 
2097         error = kstrputmsg(SOTOV(so), mp, NULL,
2098             0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2099         mutex_enter(&so->so_lock);
2100         if (error) {
2101                 eprintsoline(so, error);
2102                 goto disconnect_vp;
2103         }
2104         error = sowaitprim(so, PRIM_type, T_OK_ACK,
2105             (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2106         if (error) {
2107                 eprintsoline(so, error);
2108                 goto disconnect_vp;
2109         }
2110         mutex_exit(&so->so_lock);
2111         /*
2112          * If there is a sin/sin6 appended onto the T_OK_ACK use
2113          * that to set the local address. If this is not present
2114          * then we zero out the address and don't set the
2115          * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2116          * the pathname from the listening socket.
2117          * In the case where this is TCP or an AF_UNIX socket the
2118          * client side may have queued data or a T_ORDREL in the
2119          * transport. Having now sent the T_CONN_RES we may receive
2120          * those queued messages at any time. Hold the acceptor
2121          * so_lock until its state and laddr are finalized.
2122          */
2123         mutex_enter(&nso->so_lock);
2124         sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2125         if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2126             MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2127                 ack_mp->b_rptr += sizeof (struct T_ok_ack);
2128                 bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2129                 nsti->sti_laddr_len = sinlen;
2130                 nsti->sti_laddr_valid = 1;
2131         } else if (nso->so_family == AF_UNIX) {
2132                 ASSERT(so->so_family == AF_UNIX);
2133                 nsti->sti_laddr_len = sti->sti_laddr_len;
2134                 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2135                 bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2136                     nsti->sti_laddr_len);
2137                 nsti->sti_laddr_valid = 1;
2138         } else {
2139                 nsti->sti_laddr_len = sti->sti_laddr_len;
2140                 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2141                 bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2142                 nsti->sti_laddr_sa->sa_family = nso->so_family;
2143         }
2144         nso->so_state |= SS_ISCONNECTED;
2145         mutex_exit(&nso->so_lock);
2146 
2147         freemsg(ack_mp);
2148 
2149         mutex_enter(&so->so_lock);
2150         so_unlock_single(so, SOLOCKED);
2151         mutex_exit(&so->so_lock);
2152 
2153         /*
2154          * Pass out new socket.
2155          */
2156         if (nsop != NULL)
2157                 *nsop = nso;
2158 
2159         return (0);
2160 
2161 
2162 eproto_disc_unl:
2163         error = EPROTO;
2164 e_disc_unl:
2165         eprintsoline(so, error);
2166         goto disconnect_unlocked;
2167 
2168 pr_disc_vp_unl:
2169         eprintsoline(so, error);
2170 disconnect_vp_unlocked:
2171         (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2172         VN_RELE(nvp);
2173 disconnect_unlocked:
2174         (void) sodisconnect(so, SEQ_number, 0);
2175         return (error);
2176 
2177 pr_disc_vp:
2178         eprintsoline(so, error);
2179 disconnect_vp:
2180         (void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2181         so_unlock_single(so, SOLOCKED);
2182         mutex_exit(&so->so_lock);
2183         (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2184         VN_RELE(nvp);
2185         return (error);
2186 
2187 conn_bad:       /* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2188         error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2189             ? EOPNOTSUPP : EINVAL;
2190 e_bad:
2191         eprintsoline(so, error);
2192         return (error);
2193 }
2194 
2195 /*
2196  * connect a socket.
2197  *
2198  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2199  * unconnect (by specifying a null address).
2200  */
2201 int
2202 sotpi_connect(struct sonode *so,
2203     struct sockaddr *name,
2204     socklen_t namelen,
2205     int fflag,
2206     int flags,
2207     struct cred *cr)
2208 {
2209         struct T_conn_req       conn_req;
2210         int                     error = 0;
2211         mblk_t                  *mp;
2212         void                    *src;
2213         socklen_t               srclen;
2214         void                    *addr;
2215         socklen_t               addrlen;
2216         boolean_t               need_unlock;
2217         sotpi_info_t            *sti = SOTOTPI(so);
2218 
2219         dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2220             (void *)so, (void *)name, namelen, fflag, flags,
2221             pr_state(so->so_state, so->so_mode)));
2222 
2223         /*
2224          * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2225          * avoid sleeping for memory with SOLOCKED held.
2226          * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2227          * + sizeof (struct T_opthdr).
2228          * (the AF_UNIX so_ux_addr_xlate() does not make the address
2229          * exceed sti_faddr_maxlen).
2230          */
2231         mp = soallocproto(sizeof (struct T_conn_req) +
2232             2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2233             cr);
2234         if (mp == NULL) {
2235                 /*
2236                  * Connect can not fail with ENOBUFS. A signal was
2237                  * caught so return EINTR.
2238                  */
2239                 error = EINTR;
2240                 eprintsoline(so, error);
2241                 return (error);
2242         }
2243 
2244         mutex_enter(&so->so_lock);
2245         /*
2246          * Make sure there is a preallocated T_unbind_req message
2247          * before any binding. This message is allocated when the
2248          * socket is created. Since another thread can consume
2249          * so_unbind_mp by the time we return from so_lock_single(),
2250          * we should check the availability of so_unbind_mp after
2251          * we return from so_lock_single().
2252          */
2253 
2254         so_lock_single(so);     /* Set SOLOCKED */
2255         need_unlock = B_TRUE;
2256 
2257         if (sti->sti_unbind_mp == NULL) {
2258                 dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2259                 /* NOTE: holding so_lock while sleeping */
2260                 sti->sti_unbind_mp =
2261                     soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2262                 if (sti->sti_unbind_mp == NULL) {
2263                         error = EINTR;
2264                         goto done;
2265                 }
2266         }
2267 
2268         /*
2269          * Can't have done a listen before connecting.
2270          */
2271         if (so->so_state & SS_ACCEPTCONN) {
2272                 error = EOPNOTSUPP;
2273                 goto done;
2274         }
2275 
2276         /*
2277          * Must be bound with the transport
2278          */
2279         if (!(so->so_state & SS_ISBOUND)) {
2280                 if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2281                     /*CONSTCOND*/
2282                     so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2283                         /*
2284                          * Optimization for AF_INET{,6} transports
2285                          * that can handle a T_CONN_REQ without being bound.
2286                          */
2287                         so_automatic_bind(so);
2288                 } else {
2289                         error = sotpi_bind(so, NULL, 0,
2290                             _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2291                         if (error)
2292                                 goto done;
2293                 }
2294                 ASSERT(so->so_state & SS_ISBOUND);
2295                 flags |= _SOCONNECT_DID_BIND;
2296         }
2297 
2298         /*
2299          * Handle a connect to a name parameter of type AF_UNSPEC like a
2300          * connect to a null address. This is the portable method to
2301          * unconnect a socket.
2302          */
2303         if ((namelen >= sizeof (sa_family_t)) &&
2304             (name->sa_family == AF_UNSPEC)) {
2305                 name = NULL;
2306                 namelen = 0;
2307         }
2308 
2309         /*
2310          * Check that we are not already connected.
2311          * A connection-oriented socket cannot be reconnected.
2312          * A connected connection-less socket can be
2313          * - connected to a different address by a subsequent connect
2314          * - "unconnected" by a connect to the NULL address
2315          */
2316         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2317                 ASSERT(!(flags & _SOCONNECT_DID_BIND));
2318                 if (so->so_mode & SM_CONNREQUIRED) {
2319                         /* Connection-oriented socket */
2320                         error = so->so_state & SS_ISCONNECTED ?
2321                             EISCONN : EALREADY;
2322                         goto done;
2323                 }
2324                 /* Connection-less socket */
2325                 if (name == NULL) {
2326                         /*
2327                          * Remove the connected state and clear SO_DGRAM_ERRIND
2328                          * since it was set when the socket was connected.
2329                          * If this is UDP also send down a T_DISCON_REQ.
2330                          */
2331                         int val;
2332 
2333                         if ((so->so_family == AF_INET ||
2334                             so->so_family == AF_INET6) &&
2335                             (so->so_type == SOCK_DGRAM ||
2336                             so->so_type == SOCK_RAW) &&
2337                             /*CONSTCOND*/
2338                             !soconnect_tpi_udp) {
2339                                 /* XXX What about implicitly unbinding here? */
2340                                 error = sodisconnect(so, -1,
2341                                     _SODISCONNECT_LOCK_HELD);
2342                         } else {
2343                                 so->so_state &=
2344                                     ~(SS_ISCONNECTED | SS_ISCONNECTING);
2345                                 sti->sti_faddr_valid = 0;
2346                                 sti->sti_faddr_len = 0;
2347                         }
2348 
2349                         /* Remove SOLOCKED since setsockopt will grab it */
2350                         so_unlock_single(so, SOLOCKED);
2351                         mutex_exit(&so->so_lock);
2352 
2353                         val = 0;
2354                         (void) sotpi_setsockopt(so, SOL_SOCKET,
2355                             SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2356                             cr);
2357 
2358                         mutex_enter(&so->so_lock);
2359                         so_lock_single(so);     /* Set SOLOCKED */
2360                         goto done;
2361                 }
2362         }
2363         ASSERT(so->so_state & SS_ISBOUND);
2364 
2365         if (name == NULL || namelen == 0) {
2366                 error = EINVAL;
2367                 goto done;
2368         }
2369         /*
2370          * Mark the socket if sti_faddr_sa represents the transport level
2371          * address.
2372          */
2373         if (flags & _SOCONNECT_NOXLATE) {
2374                 struct sockaddr_ux      *soaddr_ux;
2375 
2376                 ASSERT(so->so_family == AF_UNIX);
2377                 if (namelen != sizeof (struct sockaddr_ux)) {
2378                         error = EINVAL;
2379                         goto done;
2380                 }
2381                 soaddr_ux = (struct sockaddr_ux *)name;
2382                 name = (struct sockaddr *)&soaddr_ux->sou_addr;
2383                 namelen = sizeof (soaddr_ux->sou_addr);
2384                 sti->sti_faddr_noxlate = 1;
2385         }
2386 
2387         /*
2388          * Length and family checks.
2389          */
2390         error = so_addr_verify(so, name, namelen);
2391         if (error)
2392                 goto bad;
2393 
2394         /*
2395          * Save foreign address. Needed for AF_UNIX as well as
2396          * transport providers that do not support TI_GETPEERNAME.
2397          * Also used for cached foreign address for TCP and UDP.
2398          */
2399         if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2400                 error = EINVAL;
2401                 goto done;
2402         }
2403         sti->sti_faddr_len = (socklen_t)namelen;
2404         ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2405         bcopy(name, sti->sti_faddr_sa, namelen);
2406         sti->sti_faddr_valid = 1;
2407 
2408         if (so->so_family == AF_UNIX) {
2409                 if (sti->sti_faddr_noxlate) {
2410                         /*
2411                          * sti_faddr is a transport-level address, so
2412                          * don't pass it as an option.  Do save it in
2413                          * sti_ux_faddr, used for connected DG send.
2414                          */
2415                         src = NULL;
2416                         srclen = 0;
2417                         addr = sti->sti_faddr_sa;
2418                         addrlen = (t_uscalar_t)sti->sti_faddr_len;
2419                         bcopy(addr, &sti->sti_ux_faddr,
2420                             sizeof (sti->sti_ux_faddr));
2421                 } else {
2422                         /*
2423                          * Pass the sockaddr_un source address as an option
2424                          * and translate the remote address.
2425                          * Holding so_lock thus sti_laddr_sa can not change.
2426                          */
2427                         src = sti->sti_laddr_sa;
2428                         srclen = (t_uscalar_t)sti->sti_laddr_len;
2429                         dprintso(so, 1,
2430                             ("sotpi_connect UNIX: srclen %d, src %p\n",
2431                             srclen, src));
2432                         /*
2433                          * Translate the destination address into our
2434                          * internal form, and save it in sti_ux_faddr.
2435                          * After this call, addr==&sti->sti_ux_taddr,
2436                          * and we copy that to sti->sti_ux_faddr so
2437                          * we save the connected peer address.
2438                          */
2439                         error = so_ux_addr_xlate(so,
2440                             sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2441                             (flags & _SOCONNECT_XPG4_2),
2442                             &addr, &addrlen);
2443                         if (error)
2444                                 goto bad;
2445                         bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2446                             sizeof (sti->sti_ux_faddr));
2447                 }
2448         } else {
2449                 addr = sti->sti_faddr_sa;
2450                 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2451                 src = NULL;
2452                 srclen = 0;
2453         }
2454         /*
2455          * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2456          * option which asks the transport provider to send T_UDERR_IND
2457          * messages. These T_UDERR_IND messages are used to return connected
2458          * style errors (e.g. ECONNRESET) for connected datagram sockets.
2459          *
2460          * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2461          * we send down a T_CONN_REQ. This is needed to let the
2462          * transport assign a local address that is consistent with
2463          * the remote address. Applications depend on a getsockname()
2464          * after a connect() to retrieve the "source" IP address for
2465          * the connected socket.  Invalidate the cached local address
2466          * to force getsockname() to enquire of the transport.
2467          */
2468         if (!(so->so_mode & SM_CONNREQUIRED)) {
2469                 /*
2470                  * Datagram socket.
2471                  */
2472                 int32_t val;
2473 
2474                 so_unlock_single(so, SOLOCKED);
2475                 mutex_exit(&so->so_lock);
2476 
2477                 val = 1;
2478                 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2479                     &val, (t_uscalar_t)sizeof (val), cr);
2480 
2481                 mutex_enter(&so->so_lock);
2482                 so_lock_single(so);     /* Set SOLOCKED */
2483                 if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2484                     (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2485                     soconnect_tpi_udp) {
2486                         soisconnected(so);
2487                         goto done;
2488                 }
2489                 /*
2490                  * Send down T_CONN_REQ etc.
2491                  * Clear fflag to avoid returning EWOULDBLOCK.
2492                  */
2493                 fflag = 0;
2494                 ASSERT(so->so_family != AF_UNIX);
2495                 sti->sti_laddr_valid = 0;
2496         } else if (sti->sti_laddr_len != 0) {
2497                 /*
2498                  * If the local address or port was "any" then it may be
2499                  * changed by the transport as a result of the
2500                  * connect.  Invalidate the cached version if we have one.
2501                  */
2502                 switch (so->so_family) {
2503                 case AF_INET:
2504                         ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2505                         if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2506                             INADDR_ANY ||
2507                             ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2508                                 sti->sti_laddr_valid = 0;
2509                         break;
2510 
2511                 case AF_INET6:
2512                         ASSERT(sti->sti_laddr_len ==
2513                             (socklen_t)sizeof (sin6_t));
2514                         if (IN6_IS_ADDR_UNSPECIFIED(
2515                             &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2516                             IN6_IS_ADDR_V4MAPPED_ANY(
2517                             &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2518                             ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2519                                 sti->sti_laddr_valid = 0;
2520                         break;
2521 
2522                 default:
2523                         break;
2524                 }
2525         }
2526 
2527         /*
2528          * Check for failure of an earlier call
2529          */
2530         if (so->so_error != 0)
2531                 goto so_bad;
2532 
2533         /*
2534          * Send down T_CONN_REQ. Message was allocated above.
2535          */
2536         conn_req.PRIM_type = T_CONN_REQ;
2537         conn_req.DEST_length = addrlen;
2538         conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2539         if (srclen == 0) {
2540                 conn_req.OPT_length = 0;
2541                 conn_req.OPT_offset = 0;
2542                 soappendmsg(mp, &conn_req, sizeof (conn_req));
2543                 soappendmsg(mp, addr, addrlen);
2544         } else {
2545                 /*
2546                  * There is a AF_UNIX sockaddr_un to include as a source
2547                  * address option.
2548                  */
2549                 struct T_opthdr toh;
2550 
2551                 toh.level = SOL_SOCKET;
2552                 toh.name = SO_SRCADDR;
2553                 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2554                 toh.status = 0;
2555                 conn_req.OPT_length =
2556                     (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2557                 conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2558                     _TPI_ALIGN_TOPT(addrlen));
2559 
2560                 soappendmsg(mp, &conn_req, sizeof (conn_req));
2561                 soappendmsg(mp, addr, addrlen);
2562                 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2563                 soappendmsg(mp, &toh, sizeof (toh));
2564                 soappendmsg(mp, src, srclen);
2565                 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2566                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2567         }
2568         /*
2569          * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2570          * in order to have the right state when the T_CONN_CON shows up.
2571          */
2572         soisconnecting(so);
2573         mutex_exit(&so->so_lock);
2574 
2575         if (AU_AUDITING())
2576                 audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2577 
2578         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2579             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2580         mp = NULL;
2581         mutex_enter(&so->so_lock);
2582         if (error != 0)
2583                 goto bad;
2584 
2585         if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2586                 goto bad;
2587 
2588         /* Allow other threads to access the socket */
2589         so_unlock_single(so, SOLOCKED);
2590         need_unlock = B_FALSE;
2591 
2592         /*
2593          * Wait until we get a T_CONN_CON or an error
2594          */
2595         if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2596                 so_lock_single(so);     /* Set SOLOCKED */
2597                 need_unlock = B_TRUE;
2598         }
2599 
2600 done:
2601         freemsg(mp);
2602         switch (error) {
2603         case EINPROGRESS:
2604         case EALREADY:
2605         case EISCONN:
2606         case EINTR:
2607                 /* Non-fatal errors */
2608                 sti->sti_laddr_valid = 0;
2609                 /* FALLTHRU */
2610         case 0:
2611                 break;
2612         default:
2613                 ASSERT(need_unlock);
2614                 /*
2615                  * Fatal errors: clear SS_ISCONNECTING in case it was set,
2616                  * and invalidate local-address cache
2617                  */
2618                 so->so_state &= ~SS_ISCONNECTING;
2619                 sti->sti_laddr_valid = 0;
2620                 /* A discon_ind might have already unbound us */
2621                 if ((flags & _SOCONNECT_DID_BIND) &&
2622                     (so->so_state & SS_ISBOUND)) {
2623                         int err;
2624 
2625                         err = sotpi_unbind(so, 0);
2626                         /* LINTED - statement has no conseq */
2627                         if (err) {
2628                                 eprintsoline(so, err);
2629                         }
2630                 }
2631                 break;
2632         }
2633         if (need_unlock)
2634                 so_unlock_single(so, SOLOCKED);
2635         mutex_exit(&so->so_lock);
2636         return (error);
2637 
2638 so_bad: error = sogeterr(so, B_TRUE);
2639 bad:    eprintsoline(so, error);
2640         goto done;
2641 }
2642 
2643 /* ARGSUSED */
2644 int
2645 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2646 {
2647         struct T_ordrel_req     ordrel_req;
2648         mblk_t                  *mp;
2649         uint_t                  old_state, state_change;
2650         int                     error = 0;
2651         sotpi_info_t            *sti = SOTOTPI(so);
2652 
2653         dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2654             (void *)so, how, pr_state(so->so_state, so->so_mode)));
2655 
2656         mutex_enter(&so->so_lock);
2657         so_lock_single(so);     /* Set SOLOCKED */
2658 
2659         /*
2660          * SunOS 4.X has no check for datagram sockets.
2661          * 5.X checks that it is connected (ENOTCONN)
2662          * X/Open requires that we check the connected state.
2663          */
2664         if (!(so->so_state & SS_ISCONNECTED)) {
2665                 if (!xnet_skip_checks) {
2666                         error = ENOTCONN;
2667                         if (xnet_check_print) {
2668                                 printf("sockfs: X/Open shutdown check "
2669                                     "caused ENOTCONN\n");
2670                         }
2671                 }
2672                 goto done;
2673         }
2674         /*
2675          * Record the current state and then perform any state changes.
2676          * Then use the difference between the old and new states to
2677          * determine which messages need to be sent.
2678          * This prevents e.g. duplicate T_ORDREL_REQ when there are
2679          * duplicate calls to shutdown().
2680          */
2681         old_state = so->so_state;
2682 
2683         switch (how) {
2684         case 0:
2685                 socantrcvmore(so);
2686                 break;
2687         case 1:
2688                 socantsendmore(so);
2689                 break;
2690         case 2:
2691                 socantsendmore(so);
2692                 socantrcvmore(so);
2693                 break;
2694         default:
2695                 error = EINVAL;
2696                 goto done;
2697         }
2698 
2699         /*
2700          * Assumes that the SS_CANT* flags are never cleared in the above code.
2701          */
2702         state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2703             (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2704         ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2705 
2706         switch (state_change) {
2707         case 0:
2708                 dprintso(so, 1,
2709                     ("sotpi_shutdown: nothing to send in state 0x%x\n",
2710                     so->so_state));
2711                 goto done;
2712 
2713         case SS_CANTRCVMORE:
2714                 mutex_exit(&so->so_lock);
2715                 strseteof(SOTOV(so), 1);
2716                 /*
2717                  * strseteof takes care of read side wakeups,
2718                  * pollwakeups, and signals.
2719                  */
2720                 /*
2721                  * Get the read lock before flushing data to avoid problems
2722                  * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2723                  */
2724                 mutex_enter(&so->so_lock);
2725                 (void) so_lock_read(so, 0);     /* Set SOREADLOCKED */
2726                 mutex_exit(&so->so_lock);
2727 
2728                 /* Flush read side queue */
2729                 strflushrq(SOTOV(so), FLUSHALL);
2730 
2731                 mutex_enter(&so->so_lock);
2732                 so_unlock_read(so);             /* Clear SOREADLOCKED */
2733                 break;
2734 
2735         case SS_CANTSENDMORE:
2736                 mutex_exit(&so->so_lock);
2737                 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2738                 mutex_enter(&so->so_lock);
2739                 break;
2740 
2741         case SS_CANTSENDMORE|SS_CANTRCVMORE:
2742                 mutex_exit(&so->so_lock);
2743                 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2744                 strseteof(SOTOV(so), 1);
2745                 /*
2746                  * strseteof takes care of read side wakeups,
2747                  * pollwakeups, and signals.
2748                  */
2749                 /*
2750                  * Get the read lock before flushing data to avoid problems
2751                  * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2752                  */
2753                 mutex_enter(&so->so_lock);
2754                 (void) so_lock_read(so, 0);     /* Set SOREADLOCKED */
2755                 mutex_exit(&so->so_lock);
2756 
2757                 /* Flush read side queue */
2758                 strflushrq(SOTOV(so), FLUSHALL);
2759 
2760                 mutex_enter(&so->so_lock);
2761                 so_unlock_read(so);             /* Clear SOREADLOCKED */
2762                 break;
2763         }
2764 
2765         ASSERT(MUTEX_HELD(&so->so_lock));
2766 
2767         /*
2768          * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2769          * was set due to this call and the new state has both of them set:
2770          *      Send the AF_UNIX close indication
2771          *      For T_COTS send a discon_ind
2772          *
2773          * If cantsend was set due to this call:
2774          *      For T_COTSORD send an ordrel_ind
2775          *
2776          * Note that for T_CLTS there is no message sent here.
2777          */
2778         if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2779             (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2780                 /*
2781                  * For SunOS 4.X compatibility we tell the other end
2782                  * that we are unable to receive at this point.
2783                  */
2784                 if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2785                         so_unix_close(so);
2786 
2787                 if (sti->sti_serv_type == T_COTS)
2788                         error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2789         }
2790         if ((state_change & SS_CANTSENDMORE) &&
2791             (sti->sti_serv_type == T_COTS_ORD)) {
2792                 /* Send an orderly release */
2793                 ordrel_req.PRIM_type = T_ORDREL_REQ;
2794 
2795                 mutex_exit(&so->so_lock);
2796                 mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2797                     0, _ALLOC_SLEEP, cr);
2798                 /*
2799                  * Send down the T_ORDREL_REQ even if there is flow control.
2800                  * This prevents shutdown from blocking.
2801                  * Note that there is no T_OK_ACK for ordrel_req.
2802                  */
2803                 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2804                     MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2805                 mutex_enter(&so->so_lock);
2806                 if (error) {
2807                         eprintsoline(so, error);
2808                         goto done;
2809                 }
2810         }
2811 
2812 done:
2813         so_unlock_single(so, SOLOCKED);
2814         mutex_exit(&so->so_lock);
2815         return (error);
2816 }
2817 
2818 /*
2819  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2820  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2821  * that we have closed.
2822  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2823  * T_UNITDATA_REQ containing the same option.
2824  *
2825  * For SOCK_DGRAM half-connections (somebody connected to this end
2826  * but this end is not connected) we don't know where to send any
2827  * SO_UNIX_CLOSE.
2828  *
2829  * We have to ignore stream head errors just in case there has been
2830  * a shutdown(output).
2831  * Ignore any flow control to try to get the message more quickly to the peer.
2832  * While locally ignoring flow control solves the problem when there
2833  * is only the loopback transport on the stream it would not provide
2834  * the correct AF_UNIX socket semantics when one or more modules have
2835  * been pushed.
2836  */
2837 void
2838 so_unix_close(struct sonode *so)
2839 {
2840         struct T_opthdr toh;
2841         mblk_t          *mp;
2842         sotpi_info_t    *sti = SOTOTPI(so);
2843 
2844         ASSERT(MUTEX_HELD(&so->so_lock));
2845 
2846         ASSERT(so->so_family == AF_UNIX);
2847 
2848         if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2849             (SS_ISCONNECTED|SS_ISBOUND))
2850                 return;
2851 
2852         dprintso(so, 1, ("so_unix_close(%p) %s\n",
2853             (void *)so, pr_state(so->so_state, so->so_mode)));
2854 
2855         toh.level = SOL_SOCKET;
2856         toh.name = SO_UNIX_CLOSE;
2857 
2858         /* zero length + header */
2859         toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2860         toh.status = 0;
2861 
2862         if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2863                 struct T_optdata_req tdr;
2864 
2865                 tdr.PRIM_type = T_OPTDATA_REQ;
2866                 tdr.DATA_flag = 0;
2867 
2868                 tdr.OPT_length = (t_scalar_t)sizeof (toh);
2869                 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2870 
2871                 /* NOTE: holding so_lock while sleeping */
2872                 mp = soallocproto2(&tdr, sizeof (tdr),
2873                     &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2874         } else {
2875                 struct T_unitdata_req   tudr;
2876                 void                    *addr;
2877                 socklen_t               addrlen;
2878                 void                    *src;
2879                 socklen_t               srclen;
2880                 struct T_opthdr         toh2;
2881                 t_scalar_t              size;
2882 
2883                 /*
2884                  * We know this is an AF_UNIX connected DGRAM socket.
2885                  * We therefore already have the destination address
2886                  * in the internal form needed for this send.  This is
2887                  * similar to the sosend_dgram call later in this file
2888                  * when there's no user-specified destination address.
2889                  */
2890                 if (sti->sti_faddr_noxlate) {
2891                         /*
2892                          * Already have a transport internal address. Do not
2893                          * pass any (transport internal) source address.
2894                          */
2895                         addr = sti->sti_faddr_sa;
2896                         addrlen = (t_uscalar_t)sti->sti_faddr_len;
2897                         src = NULL;
2898                         srclen = 0;
2899                 } else {
2900                         /*
2901                          * Pass the sockaddr_un source address as an option
2902                          * and translate the remote address.
2903                          * Holding so_lock thus sti_laddr_sa can not change.
2904                          */
2905                         src = sti->sti_laddr_sa;
2906                         srclen = (socklen_t)sti->sti_laddr_len;
2907                         dprintso(so, 1,
2908                             ("so_ux_close: srclen %d, src %p\n",
2909                             srclen, src));
2910                         /*
2911                          * Use the destination address saved in connect.
2912                          */
2913                         addr = &sti->sti_ux_faddr;
2914                         addrlen = sizeof (sti->sti_ux_faddr);
2915                 }
2916                 tudr.PRIM_type = T_UNITDATA_REQ;
2917                 tudr.DEST_length = addrlen;
2918                 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2919                 if (srclen == 0) {
2920                         tudr.OPT_length = (t_scalar_t)sizeof (toh);
2921                         tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2922                             _TPI_ALIGN_TOPT(addrlen));
2923 
2924                         size = tudr.OPT_offset + tudr.OPT_length;
2925                         /* NOTE: holding so_lock while sleeping */
2926                         mp = soallocproto2(&tudr, sizeof (tudr),
2927                             addr, addrlen, size, _ALLOC_SLEEP, CRED());
2928                         mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2929                         soappendmsg(mp, &toh, sizeof (toh));
2930                 } else {
2931                         /*
2932                          * There is a AF_UNIX sockaddr_un to include as a
2933                          * source address option.
2934                          */
2935                         tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2936                             _TPI_ALIGN_TOPT(srclen));
2937                         tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2938                             _TPI_ALIGN_TOPT(addrlen));
2939 
2940                         toh2.level = SOL_SOCKET;
2941                         toh2.name = SO_SRCADDR;
2942                         toh2.len = (t_uscalar_t)(srclen +
2943                             sizeof (struct T_opthdr));
2944                         toh2.status = 0;
2945 
2946                         size = tudr.OPT_offset + tudr.OPT_length;
2947 
2948                         /* NOTE: holding so_lock while sleeping */
2949                         mp = soallocproto2(&tudr, sizeof (tudr),
2950                             addr, addrlen, size, _ALLOC_SLEEP, CRED());
2951                         mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2952                         soappendmsg(mp, &toh, sizeof (toh));
2953                         soappendmsg(mp, &toh2, sizeof (toh2));
2954                         soappendmsg(mp, src, srclen);
2955                         mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2956                 }
2957                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2958         }
2959         mutex_exit(&so->so_lock);
2960         (void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2961             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2962         mutex_enter(&so->so_lock);
2963 }
2964 
2965 /*
2966  * Clear the out-of-band state once no oob indications remain pending.
2967  * so_lock is acquired and released inside this routine.
2968  *
2969  * sotpi_recvmsg calls this after reading a non-zero amount of data.  The
2970  * caller normally first checks that there may be state to clear, i.e.
2971  *      if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2972  * and that check needs no so_lock: sotpi_recvmsg is single-threaded (using
2973  * SOREADLOCKED) and it alone decrements sti_oobsigcnt.
2974  *
2975  * The oob indication is cleared when data is read *after* the point at
2976  * which all pending oob data has been consumed.  As a result select/poll
2977  * keep returning POLLRDBAND, and SIOCATMARK keeps returning true, until
2978  * the read has gone past the mark.
2979  */
2980 static void
2981 sorecv_update_oobstate(struct sonode *so)
2982 {
2983         sotpi_info_t *tpi = SOTOTPI(so);
2984 
2985         mutex_enter(&so->so_lock);
2986         ASSERT(so_verify_oobstate(so));
2987         dprintso(so, 1,
2988             ("sorecv_update_oobstate: counts %d/%d state %s\n",
2989             tpi->sti_oobsigcnt, tpi->sti_oobcnt,
2990             pr_state(so->so_state, so->so_mode)));
2991         if (tpi->sti_oobsigcnt == 0) {
2992                 mblk_t *stale = so->so_oobmsg;
2993 
2994                 /* Nothing pending: drop the flags and the saved oob msg */
2995                 so->so_oobmsg = NULL;
2996                 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2997                 freemsg(stale);
2998         }
2999         ASSERT(so_verify_oobstate(so));
3000         mutex_exit(&so->so_lock);
3001 }
3002 
3003 /*
3004  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
3005  * M_DATA is copied out through uiop; any other mblk is unlinked and chained
3006  * onto *rmp (which the caller passes in as NULL) for the caller to handle.
3007  * Returns 0 or an error; rp gets the saved rval once the chain is drained.
3008  */
3009 static int
3010 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
3011 {
3012         sotpi_info_t *sti = SOTOTPI(so);
3013         int     error = 0;
3014         mblk_t *head = sti->sti_nl7c_rcv_mp;    /* first mblk still ours */
3015         mblk_t *nmp = head;                     /* mblk being examined */
3016         mblk_t *pmp = NULL;                     /* last M_DATA fully read */
3017         mblk_t *tmp = NULL;                     /* tail of the *rmp chain */
3018 
3019         ASSERT(nmp != NULL);
3020 
3021         while (nmp != NULL && uiop->uio_resid > 0) {
3022                 mblk_t *next = nmp->b_cont;     /* read before unlinking */
3023 
3024                 if (DB_TYPE(nmp) == M_DATA) {
3025                         ssize_t n = MIN(MBLKL(nmp), uiop->uio_resid);
3026 
3027                         if (n > 0)
3028                                 error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3029                         nmp->b_rptr += n;
3030                         if (nmp->b_rptr == nmp->b_wptr) {
3031                                 pmp = nmp;
3032                                 nmp = next;
3033                         }
3034                         if (error)
3035                                 break;
3036                         continue;
3037                 }
3038                 /* We only handle data, save for caller to handle. */
3039                 if (pmp != NULL)
3040                         pmp->b_cont = next;
3041                 else
3042                         head = next;    /* nothing consumed precedes it */
3043                 nmp->b_cont = NULL;
3044                 if (*rmp == NULL)
3045                         *rmp = nmp;
3046                 else
3047                         tmp->b_cont = nmp;
3048                 tmp = nmp;
3049                 nmp = next;
3050         }
3051         if (pmp != NULL) {
3052                 /* Free any mblk_t(s) which we have consumed */
3053                 pmp->b_cont = NULL;
3054                 freemsg(head);
3055         }
3056         if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3057                 /* Last mblk_t so return the saved kstrgetmsg() rval/error */
3058                 if (error == 0) {
3059                         rval_t  *p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3060 
3061                         error = p->r_v.r_v2;
3062                         p->r_v.r_v2 = 0;
3063                 }
3064                 rp->r_vals = sti->sti_nl7c_rcv_rval;
3065                 sti->sti_nl7c_rcv_rval = 0;
3066         } else {
3067                 /* More mblk_t(s) to process so no rval to return */
3068                 rp->r_vals = 0;
3069         }
3070         return (error);
3071 }
3072 /*
3073  * Receive the next message on the queue.
3074  * If msg_controllen is non-zero when called the caller is interested in
3075  * any received control info (options).
3076  * If msg_namelen is non-zero when called the caller is interested in
3077  * any received source address.
3078  * The routine returns with msg_control and msg_name pointing to
3079  * kmem_alloc'ed memory which the caller has to free.
3080  */
3081 /* ARGSUSED */
3082 int
3083 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3084     struct cred *cr)
3085 {
3086         union T_primitives      *tpr;
3087         mblk_t                  *mp;
3088         uchar_t                 pri;
3089         int                     pflag, opflag;
3090         void                    *control;
3091         t_uscalar_t             controllen;
3092         t_uscalar_t             namelen;
3093         int                     so_state = so->so_state; /* Snapshot */
3094         ssize_t                 saved_resid;
3095         rval_t                  rval;
3096         int                     flags;
3097         clock_t                 timout;
3098         int                     error = 0;
3099         sotpi_info_t            *sti = SOTOTPI(so);
3100 
3101         flags = msg->msg_flags;
3102         msg->msg_flags = 0;
3103 
3104         dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3105             (void *)so, (void *)msg, flags,
3106             pr_state(so->so_state, so->so_mode), so->so_error));
3107 
3108         if (so->so_version == SOV_STREAM) {
3109                 so_update_attrs(so, SOACC);
3110                 /* The imaginary "sockmod" has been popped - act as a stream */
3111                 return (strread(SOTOV(so), uiop, cr));
3112         }
3113 
3114         /*
3115          * If we are not connected because we have never been connected
3116          * we return ENOTCONN. If we have been connected (but are no longer
3117          * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3118          * the EOF.
3119          *
3120          * An alternative would be to post an ENOTCONN error in stream head
3121          * (read+write) and clear it when we're connected. However, that error
3122          * would cause incorrect poll/select behavior!
3123          */
3124         if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3125             (so->so_mode & SM_CONNREQUIRED)) {
3126                 return (ENOTCONN);
3127         }
3128 
3129         /*
3130          * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3131          * after checking that the read queue is empty) and returns zero.
3132          * This implementation will sleep (in kstrgetmsg) even if uio_resid
3133          * is zero.
3134          */
3135 
3136         if (flags & MSG_OOB) {
3137                 /* Check that the transport supports OOB */
3138                 if (!(so->so_mode & SM_EXDATA))
3139                         return (EOPNOTSUPP);
3140                 so_update_attrs(so, SOACC);
3141                 return (sorecvoob(so, msg, uiop, flags,
3142                     (so->so_options & SO_OOBINLINE)));
3143         }
3144 
3145         so_update_attrs(so, SOACC);
3146 
3147         /*
3148          * Set msg_controllen and msg_namelen to zero here to make it
3149          * simpler in the cases that no control or name is returned.
3150          */
3151         controllen = msg->msg_controllen;
3152         namelen = msg->msg_namelen;
3153         msg->msg_controllen = 0;
3154         msg->msg_namelen = 0;
3155 
3156         dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3157             namelen, controllen));
3158 
3159         mutex_enter(&so->so_lock);
3160         /*
3161          * If an NL7C enabled socket and not waiting for write data.
3162          */
3163         if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3164             NL7C_ENABLED) {
3165                 if (sti->sti_nl7c_uri) {
3166                         /* Close uri processing for a previous request */
3167                         nl7c_close(so);
3168                 }
3169                 if ((so_state & SS_CANTRCVMORE) &&
3170                     sti->sti_nl7c_rcv_mp == NULL) {
3171                         /* Nothing to process, EOF */
3172                         mutex_exit(&so->so_lock);
3173                         return (0);
3174                 } else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3175                         /* Persistent NL7C socket, try to process request */
3176                         boolean_t ret;
3177 
3178                         ret = nl7c_process(so,
3179                             (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3180                         rval.r_vals = sti->sti_nl7c_rcv_rval;
3181                         error = rval.r_v.r_v2;
3182                         if (error) {
3183                                 /* Error of some sort, return it */
3184                                 mutex_exit(&so->so_lock);
3185                                 return (error);
3186                         }
3187                         if (sti->sti_nl7c_flags &&
3188                             ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3189                                 /*
3190                                  * Still an NL7C socket and no data
3191                                  * to pass up to the caller.
3192                                  */
3193                                 mutex_exit(&so->so_lock);
3194                                 if (ret) {
3195                                         /* EOF */
3196                                         return (0);
3197                                 } else {
3198                                         /* Need more data */
3199                                         return (EAGAIN);
3200                                 }
3201                         }
3202                 } else {
3203                         /*
3204                          * Not persistent so no further NL7C processing.
3205                          */
3206                         sti->sti_nl7c_flags = 0;
3207                 }
3208         }
3209         /*
3210          * Only one reader is allowed at any given time. This is needed
3211          * for T_EXDATA handling and, in the future, MSG_WAITALL.
3212          *
3213          * This is slightly different than BSD behavior in that it fails with
3214          * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3215          * is single-threaded using sblock(), which is dropped while waiting
3216          * for data to appear. The difference shows up e.g. if one
3217          * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3218          * does use nonblocking io and different threads are reading each
3219          * file descriptor. In BSD there would never be an EWOULDBLOCK error
3220          * in this case as long as the read queue doesn't get empty.
3221          * In this implementation the thread using nonblocking io can
3222          * get an EWOULDBLOCK error due to the blocking thread executing
3223          * e.g. in the uiomove in kstrgetmsg.
3224          * This difference is not believed to be significant.
3225          */
3226         /* Set SOREADLOCKED */
3227         error = so_lock_read_intr(so,
3228             uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3229         mutex_exit(&so->so_lock);
3230         if (error)
3231                 return (error);
3232 
3233         /*
3234          * Tell kstrgetmsg to not inspect the stream head errors until all
3235          * queued data has been consumed.
3236          * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3237          * Also, if uio_fmode indicates nonblocking, kstrgetmsg will not block.
3238          *
3239          * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3240          * to T_OPTDATA_IND that do not contain any user-visible control msg.
3241          * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3242          */
3243         pflag = MSG_ANY | MSG_DELAYERROR;
3244         if (flags & MSG_PEEK) {
3245                 pflag |= MSG_IPEEK;
3246                 flags &= ~MSG_WAITALL;
3247         }
3248         if (so->so_mode & SM_ATOMIC)
3249                 pflag |= MSG_DISCARDTAIL;
3250 
3251         if (flags & MSG_DONTWAIT)
3252                 timout = 0;
3253         else if (so->so_rcvtimeo != 0)
3254                 timout = TICK_TO_MSEC(so->so_rcvtimeo);
3255         else
3256                 timout = -1;
3257         opflag = pflag;
3258 retry:
3259         saved_resid = uiop->uio_resid;
3260         pri = 0;
3261         mp = NULL;
3262         if (sti->sti_nl7c_rcv_mp != NULL) {
3263                 /* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3264                 error = nl7c_sorecv(so, &mp, uiop, &rval);
3265         } else {
3266                 error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3267                     timout, &rval);
3268         }
3269         if (error != 0) {
3270                 /* kstrgetmsg returns ETIME when timeout expires */
3271                 if (error == ETIME)
3272                         error = EWOULDBLOCK;
3273                 goto out;
3274         }
3275         /*
3276          * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3277          * For non-datagrams MOREDATA is used to set MSG_EOR.
3278          */
3279         ASSERT(!(rval.r_val1 & MORECTL));
3280         if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3281                 msg->msg_flags |= MSG_TRUNC;
3282 
3283         if (mp == NULL) {
3284                 dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3285                 /*
3286                  * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3287                  * The draft Posix socket spec states that the mark should
3288                  * not be cleared when peeking. We follow the latter.
3289                  */
3290                 if ((so->so_state &
3291                     (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3292                     (uiop->uio_resid != saved_resid) &&
3293                     !(flags & MSG_PEEK)) {
3294                         sorecv_update_oobstate(so);
3295                 }
3296 
3297                 mutex_enter(&so->so_lock);
3298                 /* Set MSG_EOR based on MOREDATA */
3299                 if (!(rval.r_val1 & MOREDATA)) {
3300                         if (so->so_state & SS_SAVEDEOR) {
3301                                 msg->msg_flags |= MSG_EOR;
3302                                 so->so_state &= ~SS_SAVEDEOR;
3303                         }
3304                 }
3305                 /*
3306                  * If some data was received (i.e. not EOF) and the
3307                  * read/recv* has not been satisfied wait for some more.
3308                  */
3309                 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3310                     uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3311                         mutex_exit(&so->so_lock);
3312                         pflag = opflag | MSG_NOMARK;
3313                         goto retry;
3314                 }
3315                 goto out_locked;
3316         }
3317 
3318         /* strsock_proto has already verified length and alignment */
3319         tpr = (union T_primitives *)mp->b_rptr;
3320         dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3321 
3322         switch (tpr->type) {
3323         case T_DATA_IND: {
3324                 if ((so->so_state &
3325                     (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3326                     (uiop->uio_resid != saved_resid) &&
3327                     !(flags & MSG_PEEK)) {
3328                         sorecv_update_oobstate(so);
3329                 }
3330 
3331                 /*
3332                  * Set msg_flags to MSG_EOR based on
3333                  * MORE_flag and MOREDATA.
3334                  */
3335                 mutex_enter(&so->so_lock);
3336                 so->so_state &= ~SS_SAVEDEOR;
3337                 if (!(tpr->data_ind.MORE_flag & 1)) {
3338                         if (!(rval.r_val1 & MOREDATA))
3339                                 msg->msg_flags |= MSG_EOR;
3340                         else
3341                                 so->so_state |= SS_SAVEDEOR;
3342                 }
3343                 freemsg(mp);
3344                 /*
3345                  * If some data was received (i.e. not EOF) and the
3346                  * read/recv* has not been satisfied wait for some more.
3347                  */
3348                 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3349                     uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3350                         mutex_exit(&so->so_lock);
3351                         pflag = opflag | MSG_NOMARK;
3352                         goto retry;
3353                 }
3354                 goto out_locked;
3355         }
3356         case T_UNITDATA_IND: {
3357                 void *addr;
3358                 t_uscalar_t addrlen;
3359                 void *abuf;
3360                 t_uscalar_t optlen;
3361                 void *opt;
3362 
3363                 if ((so->so_state &
3364                     (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3365                     (uiop->uio_resid != saved_resid) &&
3366                     !(flags & MSG_PEEK)) {
3367                         sorecv_update_oobstate(so);
3368                 }
3369 
3370                 if (namelen != 0) {
3371                         /* Caller wants source address */
3372                         addrlen = tpr->unitdata_ind.SRC_length;
3373                         addr = sogetoff(mp,
3374                             tpr->unitdata_ind.SRC_offset,
3375                             addrlen, 1);
3376                         if (addr == NULL) {
3377                                 freemsg(mp);
3378                                 error = EPROTO;
3379                                 eprintsoline(so, error);
3380                                 goto out;
3381                         }
3382                         if (so->so_family == AF_UNIX) {
3383                                 /*
3384                                  * Can not use the transport level address.
3385                                  * If there is a SO_SRCADDR option carrying
3386                                  * the socket level address it will be
3387                                  * extracted below.
3388                                  */
3389                                 addr = NULL;
3390                                 addrlen = 0;
3391                         }
3392                 }
3393                 optlen = tpr->unitdata_ind.OPT_length;
3394                 if (optlen != 0) {
3395                         t_uscalar_t ncontrollen;
3396 
3397                         /*
3398                          * Extract any source address option.
3399                          * Determine how large cmsg buffer is needed.
3400                          */
3401                         opt = sogetoff(mp,
3402                             tpr->unitdata_ind.OPT_offset,
3403                             optlen, __TPI_ALIGN_SIZE);
3404 
3405                         if (opt == NULL) {
3406                                 freemsg(mp);
3407                                 error = EPROTO;
3408                                 eprintsoline(so, error);
3409                                 goto out;
3410                         }
3411                         if (so->so_family == AF_UNIX)
3412                                 so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3413                         ncontrollen = so_cmsglen(mp, opt, optlen,
3414                             !(flags & MSG_XPG4_2));
3415                         if (controllen != 0)
3416                                 controllen = ncontrollen;
3417                         else if (ncontrollen != 0)
3418                                 msg->msg_flags |= MSG_CTRUNC;
3419                 } else {
3420                         controllen = 0;
3421                 }
3422 
3423                 if (namelen != 0) {
3424                         /*
3425                          * Return address to caller.
3426                          * Caller handles truncation if length
3427                          * exceeds msg_namelen.
3428                          * NOTE: AF_UNIX NUL termination is ensured by
3429                          * the sender's copyin_name().
3430                          */
3431                         abuf = kmem_alloc(addrlen, KM_SLEEP);
3432 
3433                         bcopy(addr, abuf, addrlen);
3434                         msg->msg_name = abuf;
3435                         msg->msg_namelen = addrlen;
3436                 }
3437 
3438                 if (controllen != 0) {
3439                         /*
3440                          * Return control msg to caller.
3441                          * Caller handles truncation if length
3442                          * exceeds msg_controllen.
3443                          */
3444                         control = kmem_zalloc(controllen, KM_SLEEP);
3445 
3446                         error = so_opt2cmsg(mp, opt, optlen,
3447                             !(flags & MSG_XPG4_2),
3448                             control, controllen);
3449                         if (error) {
3450                                 freemsg(mp);
3451                                 if (msg->msg_namelen != 0)
3452                                         kmem_free(msg->msg_name,
3453                                             msg->msg_namelen);
3454                                 kmem_free(control, controllen);
3455                                 eprintsoline(so, error);
3456                                 goto out;
3457                         }
3458                         msg->msg_control = control;
3459                         msg->msg_controllen = controllen;
3460                 }
3461 
3462                 freemsg(mp);
3463                 goto out;
3464         }
3465         case T_OPTDATA_IND: {
3466                 struct T_optdata_req *tdr;
3467                 void *opt;
3468                 t_uscalar_t optlen;
3469 
3470                 if ((so->so_state &
3471                     (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3472                     (uiop->uio_resid != saved_resid) &&
3473                     !(flags & MSG_PEEK)) {
3474                         sorecv_update_oobstate(so);
3475                 }
3476 
3477                 tdr = (struct T_optdata_req *)mp->b_rptr;
3478                 optlen = tdr->OPT_length;
3479                 if (optlen != 0) {
3480                         t_uscalar_t ncontrollen;
3481                         /*
3482                          * Determine how large cmsg buffer is needed.
3483                          */
3484                         opt = sogetoff(mp,
3485                             tpr->optdata_ind.OPT_offset,
3486                             optlen, __TPI_ALIGN_SIZE);
3487 
3488                         if (opt == NULL) {
3489                                 freemsg(mp);
3490                                 error = EPROTO;
3491                                 eprintsoline(so, error);
3492                                 goto out;
3493                         }
3494 
3495                         ncontrollen = so_cmsglen(mp, opt, optlen,
3496                             !(flags & MSG_XPG4_2));
3497                         if (controllen != 0)
3498                                 controllen = ncontrollen;
3499                         else if (ncontrollen != 0)
3500                                 msg->msg_flags |= MSG_CTRUNC;
3501                 } else {
3502                         controllen = 0;
3503                 }
3504 
3505                 if (controllen != 0) {
3506                         /*
3507                          * Return control msg to caller.
3508                          * Caller handles truncation if length
3509                          * exceeds msg_controllen.
3510                          */
3511                         control = kmem_zalloc(controllen, KM_SLEEP);
3512 
3513                         error = so_opt2cmsg(mp, opt, optlen,
3514                             !(flags & MSG_XPG4_2),
3515                             control, controllen);
3516                         if (error) {
3517                                 freemsg(mp);
3518                                 kmem_free(control, controllen);
3519                                 eprintsoline(so, error);
3520                                 goto out;
3521                         }
3522                         msg->msg_control = control;
3523                         msg->msg_controllen = controllen;
3524                 }
3525 
3526                 /*
3527                  * Set msg_flags to MSG_EOR based on
3528                  * DATA_flag and MOREDATA.
3529                  */
3530                 mutex_enter(&so->so_lock);
3531                 so->so_state &= ~SS_SAVEDEOR;
3532                 if (!(tpr->data_ind.MORE_flag & 1)) {
3533                         if (!(rval.r_val1 & MOREDATA))
3534                                 msg->msg_flags |= MSG_EOR;
3535                         else
3536                                 so->so_state |= SS_SAVEDEOR;
3537                 }
3538                 freemsg(mp);
3539                 /*
3540                  * If some data was received (i.e. not EOF) and the
3541                  * read/recv* has not been satisfied wait for some more.
3542                  * Not possible to wait if control info was received.
3543                  */
3544                 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3545                     controllen == 0 &&
3546                     uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3547                         mutex_exit(&so->so_lock);
3548                         pflag = opflag | MSG_NOMARK;
3549                         goto retry;
3550                 }
3551                 goto out_locked;
3552         }
3553         case T_EXDATA_IND: {
3554                 dprintso(so, 1,
3555                     ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3556                     "state %s\n",
3557                     sti->sti_oobsigcnt, sti->sti_oobcnt,
3558                     saved_resid - uiop->uio_resid,
3559                     pr_state(so->so_state, so->so_mode)));
3560                 /*
3561                  * kstrgetmsg handles MSGMARK so there is nothing to
3562                  * inspect in the T_EXDATA_IND.
3563                  * strsock_proto makes the stream head queue the T_EXDATA_IND
3564                  * as a separate message with no M_DATA component. Furthermore,
3565                  * the stream head does not consolidate M_DATA messages onto
3566                  * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3567                  * remains a message by itself. This is needed since MSGMARK
3568                  * marks both the whole message as well as the last byte
3569                  * of the message.
3570                  */
3571                 freemsg(mp);
3572                 ASSERT(uiop->uio_resid == saved_resid);      /* No data */
3573                 if (flags & MSG_PEEK) {
3574                         /*
3575                          * Even though we are peeking we consume the
3576                          * T_EXDATA_IND thereby moving the mark information
3577                          * to SS_RCVATMARK. Then the oob code below will
3578                          * retry the peeking kstrgetmsg.
3579                          * Note that the stream head read queue is
3580                          * never flushed without holding SOREADLOCKED
3581                          * thus the T_EXDATA_IND can not disappear
3582                          * underneath us.
3583                          */
3584                         dprintso(so, 1,
3585                             ("sotpi_recvmsg: consume EXDATA_IND "
3586                             "counts %d/%d state %s\n",
3587                             sti->sti_oobsigcnt,
3588                             sti->sti_oobcnt,
3589                             pr_state(so->so_state, so->so_mode)));
3590 
3591                         pflag = MSG_ANY | MSG_DELAYERROR;
3592                         if (so->so_mode & SM_ATOMIC)
3593                                 pflag |= MSG_DISCARDTAIL;
3594 
3595                         pri = 0;
3596                         mp = NULL;
3597 
3598                         error = kstrgetmsg(SOTOV(so), &mp, uiop,
3599                             &pri, &pflag, (clock_t)-1, &rval);
3600                         ASSERT(uiop->uio_resid == saved_resid);
3601 
3602                         if (error) {
3603 #ifdef SOCK_DEBUG
3604                                 if (error != EWOULDBLOCK && error != EINTR) {
3605                                         eprintsoline(so, error);
3606                                 }
3607 #endif /* SOCK_DEBUG */
3608                                 goto out;
3609                         }
3610                         ASSERT(mp);
3611                         tpr = (union T_primitives *)mp->b_rptr;
3612                         ASSERT(tpr->type == T_EXDATA_IND);
3613                         freemsg(mp);
3614                 } /* end "if (flags & MSG_PEEK)" */
3615 
3616                 /*
3617                  * Decrement the number of queued and pending oob.
3618                  *
3619                  * SS_RCVATMARK is cleared when we read past a mark.
3620                  * SS_HAVEOOBDATA is cleared when we've read past the
3621                  * last mark.
3622                  * SS_OOBPEND is cleared if we've read past the last
3623                  * mark and no (new) SIGURG has been posted.
3624                  */
3625                 mutex_enter(&so->so_lock);
3626                 ASSERT(so_verify_oobstate(so));
3627                 ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3628                 ASSERT(sti->sti_oobsigcnt > 0);
3629                 sti->sti_oobsigcnt--;
3630                 ASSERT(sti->sti_oobcnt > 0);
3631                 sti->sti_oobcnt--;
3632                 /*
3633                  * Since the T_EXDATA_IND has been removed from the stream
3634                  * head, but we have not read data past the mark,
3635                  * sockfs needs to track that the socket is still at the mark.
3636                  *
3637                  * Since no data was received call kstrgetmsg again to wait
3638                  * for data.
3639                  */
3640                 so->so_state |= SS_RCVATMARK;
3641                 mutex_exit(&so->so_lock);
3642                 dprintso(so, 1,
3643                     ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3644                     sti->sti_oobsigcnt, sti->sti_oobcnt,
3645                     pr_state(so->so_state, so->so_mode)));
3646                 pflag = opflag;
3647                 goto retry;
3648         }
3649         default:
3650                 cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3651                     (void *)so, tpr->type, (void *)mp);
3652                 ASSERT(0);
3653                 freemsg(mp);
3654                 error = EPROTO;
3655                 eprintsoline(so, error);
3656                 goto out;
3657         }
3658         /* NOTREACHED */
3659 out:
3660         mutex_enter(&so->so_lock);
3661 out_locked:
3662         so_unlock_read(so);     /* Clear SOREADLOCKED */
3663         mutex_exit(&so->so_lock);
3664         return (error);
3665 }
3666 
3667 /*
3668  * Sending data with options on a datagram socket.
3669  * Assumes caller has verified that SS_ISBOUND etc. are set.
3670  *
3671  * For AF_UNIX the destination address may be already in
3672  * internal form, as indicated by sti->sti_faddr_noxlate
3673  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3674  * translate the destination address to internal form.
3675  *
3676  * The source address is passed as an option.  If passing
3677  * file descriptors, those are passed as file pointers in
3678  * another option.
3679  */
3680 static int
3681 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3682     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3683 {
3684         struct T_unitdata_req   tudr;
3685         mblk_t                  *mp;
3686         int                     error;
3687         void                    *addr;
3688         socklen_t               addrlen;
3689         void                    *src;
3690         socklen_t               srclen;
3691         ssize_t                 len;
3692         int                     size;
3693         struct T_opthdr         toh;
3694         struct fdbuf            *fdbuf;
3695         t_uscalar_t             optlen;
3696         void                    *fds;
3697         int                     fdlen;
3698         sotpi_info_t            *sti = SOTOTPI(so);
3699 
3700         ASSERT(name && namelen);
3701         ASSERT(control && controllen);
3702 
3703         len = uiop->uio_resid;
3704         if (len > (ssize_t)sti->sti_tidu_size) {
3705                 return (EMSGSIZE);
3706         }
3707 
3708         if (sti->sti_faddr_noxlate == 0 &&
3709             (flags & MSG_SENDTO_NOXLATE) == 0) {
3710                 /*
3711                  * Length and family checks.
3712                  * Don't verify internal form.
3713                  */
3714                 error = so_addr_verify(so, name, namelen);
3715                 if (error) {
3716                         eprintsoline(so, error);
3717                         return (error);
3718                 }
3719         }
3720 
3721         if (so->so_family == AF_UNIX) {
3722                 if (sti->sti_faddr_noxlate) {
3723                         /*
3724                          * Already have a transport internal address. Do not
3725                          * pass any (transport internal) source address.
3726                          */
3727                         addr = name;
3728                         addrlen = namelen;
3729                         src = NULL;
3730                         srclen = 0;
3731                 } else if (flags & MSG_SENDTO_NOXLATE) {
3732                         /*
3733                          * Have an internal form dest. address.
3734                          * Pass the source address as usual.
3735                          */
3736                         addr = name;
3737                         addrlen = namelen;
3738                         src = sti->sti_laddr_sa;
3739                         srclen = (socklen_t)sti->sti_laddr_len;
3740                 } else {
3741                         /*
3742                          * Pass the sockaddr_un source address as an option
3743                          * and translate the remote address.
3744                          *
3745                          * Note that this code does not prevent sti_laddr_sa
3746                          * from changing while it is being used. Thus
3747                          * if an unbind+bind occurs concurrently with this
3748                          * send the peer might see a partially new and a
3749                          * partially old "from" address.
3750                          */
3751                         src = sti->sti_laddr_sa;
3752                         srclen = (socklen_t)sti->sti_laddr_len;
3753                         dprintso(so, 1,
3754                             ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3755                             srclen, src));
3756                         /*
3757                          * The sendmsg caller specified a destination
3758                          * address, which we must translate into our
3759                          * internal form.  addr = &sti->sti_ux_taddr
3760                          */
3761                         error = so_ux_addr_xlate(so, name, namelen,
3762                             (flags & MSG_XPG4_2),
3763                             &addr, &addrlen);
3764                         if (error) {
3765                                 eprintsoline(so, error);
3766                                 return (error);
3767                         }
3768                 }
3769         } else {
3770                 addr = name;
3771                 addrlen = namelen;
3772                 src = NULL;
3773                 srclen = 0;
3774         }
3775         optlen = so_optlen(control, controllen,
3776             !(flags & MSG_XPG4_2));
3777         tudr.PRIM_type = T_UNITDATA_REQ;
3778         tudr.DEST_length = addrlen;
3779         tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3780         if (srclen != 0)
3781                 tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3782                     _TPI_ALIGN_TOPT(srclen));
3783         else
3784                 tudr.OPT_length = optlen;
3785         tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3786             _TPI_ALIGN_TOPT(addrlen));
3787 
3788         size = tudr.OPT_offset + tudr.OPT_length;
3789 
3790         /*
3791          * File descriptors only when SM_FDPASSING set.
3792          */
3793         error = so_getfdopt(control, controllen,
3794             !(flags & MSG_XPG4_2), &fds, &fdlen);
3795         if (error)
3796                 return (error);
3797         if (fdlen != -1) {
3798                 if (!(so->so_mode & SM_FDPASSING))
3799                         return (EOPNOTSUPP);
3800 
3801                 error = fdbuf_create(fds, fdlen, &fdbuf);
3802                 if (error)
3803                         return (error);
3804 
3805                 /*
3806                  * Pre-allocate enough additional space for lower level modules
3807                  * to append an option (e.g. see tl_unitdata). The following
3808                  * is enough extra space for the largest option we might append.
3809                  */
3810                 size += sizeof (struct T_opthdr) + ucredsize;
3811                 mp = fdbuf_allocmsg(size, fdbuf);
3812         } else {
3813                 mp = soallocproto(size, _ALLOC_INTR, CRED());
3814                 if (mp == NULL) {
3815                         /*
3816                          * Caught a signal waiting for memory.
3817                          * Let send* return EINTR.
3818                          */
3819                         return (EINTR);
3820                 }
3821         }
3822         soappendmsg(mp, &tudr, sizeof (tudr));
3823         soappendmsg(mp, addr, addrlen);
3824         mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3825 
3826         if (fdlen != -1) {
3827                 ASSERT(fdbuf != NULL);
3828                 toh.level = SOL_SOCKET;
3829                 toh.name = SO_FILEP;
3830                 toh.len = fdbuf->fd_size +
3831                     (t_uscalar_t)sizeof (struct T_opthdr);
3832                 toh.status = 0;
3833                 soappendmsg(mp, &toh, sizeof (toh));
3834                 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3835                 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3836         }
3837         if (srclen != 0) {
3838                 /*
3839                  * There is a AF_UNIX sockaddr_un to include as a source
3840                  * address option.
3841                  */
3842                 toh.level = SOL_SOCKET;
3843                 toh.name = SO_SRCADDR;
3844                 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3845                 toh.status = 0;
3846                 soappendmsg(mp, &toh, sizeof (toh));
3847                 soappendmsg(mp, src, srclen);
3848                 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3849                 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3850         }
3851         ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3852         so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3853         /*
3854          * Normally at most 3 bytes left in the message, but we might have
3855          * allowed for extra space if we're passing fd's through.
3856          */
3857         ASSERT(MBLKL(mp) <= (ssize_t)size);
3858 
3859         ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3860         if (AU_AUDITING())
3861                 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3862 
3863         error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3864 #ifdef SOCK_DEBUG
3865         if (error) {
3866                 eprintsoline(so, error);
3867         }
3868 #endif /* SOCK_DEBUG */
3869         return (error);
3870 }
3871 
3872 /*
3873  * Sending data with options on a connected stream socket.
3874  * Assumes caller has verified that SS_ISCONNECTED is set.
3875  */
3876 static int
3877 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3878     t_uscalar_t controllen, int flags)
3879 {
3880         struct T_optdata_req    tdr;
3881         mblk_t                  *mp;
3882         int                     error;
3883         ssize_t                 iosize;
3884         int                     size;
3885         struct fdbuf            *fdbuf;
3886         t_uscalar_t             optlen;
3887         void                    *fds;
3888         int                     fdlen;
3889         struct T_opthdr         toh;
3890         sotpi_info_t            *sti = SOTOTPI(so);
3891 
3892         dprintso(so, 1,
3893             ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3894 
3895         /*
3896          * Has to be bound and connected. However, since no locks are
3897          * held the state could have changed after sotpi_sendmsg checked it
3898          * thus it is not possible to ASSERT on the state.
3899          */
3900 
3901         /* Options on connection-oriented only when SM_OPTDATA set. */
3902         if (!(so->so_mode & SM_OPTDATA))
3903                 return (EOPNOTSUPP);
3904 
3905         do {
3906                 /*
3907                  * Set the MORE flag if uio_resid does not fit in this
3908                  * message or if the caller passed in "more".
3909                  * Error for transports with zero tidu_size.
3910                  */
3911                 tdr.PRIM_type = T_OPTDATA_REQ;
3912                 iosize = sti->sti_tidu_size;
3913                 if (iosize <= 0)
3914                         return (EMSGSIZE);
3915                 if (uiop->uio_resid > iosize) {
3916                         tdr.DATA_flag = 1;
3917                 } else {
3918                         if (more)
3919                                 tdr.DATA_flag = 1;
3920                         else
3921                                 tdr.DATA_flag = 0;
3922                         iosize = uiop->uio_resid;
3923                 }
3924                 dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3925                     tdr.DATA_flag, iosize));
3926 
3927                 optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3928                 tdr.OPT_length = optlen;
3929                 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3930 
3931                 size = (int)sizeof (tdr) + optlen;
3932                 /*
3933                  * File descriptors only when SM_FDPASSING set.
3934                  */
3935                 error = so_getfdopt(control, controllen,
3936                     !(flags & MSG_XPG4_2), &fds, &fdlen);
3937                 if (error)
3938                         return (error);
3939                 if (fdlen != -1) {
3940                         if (!(so->so_mode & SM_FDPASSING))
3941                                 return (EOPNOTSUPP);
3942 
3943                         error = fdbuf_create(fds, fdlen, &fdbuf);
3944                         if (error)
3945                                 return (error);
3946 
3947                         /*
3948                          * Pre-allocate enough additional space for lower level
3949                          * modules to append an option (e.g. see tl_unitdata).
3950                          * The following is enough extra space for the largest
3951                          * option we might append.
3952                          */
3953                         size += sizeof (struct T_opthdr) + ucredsize;
3954                         mp = fdbuf_allocmsg(size, fdbuf);
3955                 } else {
3956                         mp = soallocproto(size, _ALLOC_INTR, CRED());
3957                         if (mp == NULL) {
3958                                 /*
3959                                  * Caught a signal waiting for memory.
3960                                  * Let send* return EINTR.
3961                                  */
3962                                 return (EINTR);
3963                         }
3964                 }
3965                 soappendmsg(mp, &tdr, sizeof (tdr));
3966 
3967                 if (fdlen != -1) {
3968                         ASSERT(fdbuf != NULL);
3969                         toh.level = SOL_SOCKET;
3970                         toh.name = SO_FILEP;
3971                         toh.len = fdbuf->fd_size +
3972                             (t_uscalar_t)sizeof (struct T_opthdr);
3973                         toh.status = 0;
3974                         soappendmsg(mp, &toh, sizeof (toh));
3975                         soappendmsg(mp, fdbuf, fdbuf->fd_size);
3976                         ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3977                 }
3978                 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3979                 /*
3980                  * Normally at most 3 bytes left in the message, but we might
3981                  * have allowed for extra space if we're passing fd's through.
3982                  */
3983                 ASSERT(MBLKL(mp) <= (ssize_t)size);
3984 
3985                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3986 
3987                 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3988                     0, MSG_BAND, 0);
3989                 if (error) {
3990                         eprintsoline(so, error);
3991                         return (error);
3992                 }
3993                 control = NULL;
3994                 if (uiop->uio_resid > 0) {
3995                         /*
3996                          * Recheck for fatal errors. Fail write even though
3997                          * some data have been written. This is consistent
3998                          * with strwrite semantics and BSD sockets semantics.
3999                          */
4000                         if (so->so_state & SS_CANTSENDMORE) {
4001                                 eprintsoline(so, error);
4002                                 return (EPIPE);
4003                         }
4004                         if (so->so_error != 0) {
4005                                 mutex_enter(&so->so_lock);
4006                                 error = sogeterr(so, B_TRUE);
4007                                 mutex_exit(&so->so_lock);
4008                                 if (error != 0) {
4009                                         eprintsoline(so, error);
4010                                         return (error);
4011                                 }
4012                         }
4013                 }
4014         } while (uiop->uio_resid > 0);
4015         return (0);
4016 }
4017 
4018 /*
4019  * Sending data on a datagram socket.
4020  * Assumes caller has verified that SS_ISBOUND etc. are set.
4021  *
4022  * For AF_UNIX the destination address may be already in
4023  * internal form, as indicated by sti->sti_faddr_noxlate
4024  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
4025  * translate the destination address to internal form.
4026  *
4027  * The source address is passed as an option.
4028  */
4029 int
4030 sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
4031     struct uio *uiop, int flags)
4032 {
4033         struct T_unitdata_req   tudr;
4034         mblk_t                  *mp;
4035         int                     error;
4036         void                    *addr;
4037         socklen_t               addrlen;
4038         void                    *src;
4039         socklen_t               srclen;
4040         ssize_t                 len;
4041         sotpi_info_t            *sti = SOTOTPI(so);
4042 
4043         ASSERT(name != NULL && namelen != 0);
4044 
4045         len = uiop->uio_resid;
4046         if (len > sti->sti_tidu_size) {
4047                 error = EMSGSIZE;
4048                 goto done;
4049         }
4050 
4051         if (sti->sti_faddr_noxlate == 0 &&
4052             (flags & MSG_SENDTO_NOXLATE) == 0) {
4053                 /*
4054                  * Length and family checks.
4055                  * Don't verify internal form.
4056                  */
4057                 error = so_addr_verify(so, name, namelen);
4058                 if (error != 0)
4059                         goto done;
4060         }
4061 
4062         if (sti->sti_direct) /* Never on AF_UNIX */
4063                 return (sodgram_direct(so, name, namelen, uiop, flags));
4064 
4065         if (so->so_family == AF_UNIX) {
4066                 if (sti->sti_faddr_noxlate) {
4067                         /*
4068                          * Already have a transport internal address. Do not
4069                          * pass any (transport internal) source address.
4070                          */
4071                         addr = name;
4072                         addrlen = namelen;
4073                         src = NULL;
4074                         srclen = 0;
4075                 } else if (flags & MSG_SENDTO_NOXLATE) {
4076                         /*
4077                          * Have an internal form dest. address.
4078                          * Pass the source address as usual.
4079                          */
4080                         addr = name;
4081                         addrlen = namelen;
4082                         src = sti->sti_laddr_sa;
4083                         srclen = (socklen_t)sti->sti_laddr_len;
4084                 } else {
4085                         /*
4086                          * Pass the sockaddr_un source address as an option
4087                          * and translate the remote address.
4088                          *
4089                          * Note that this code does not prevent sti_laddr_sa
4090                          * from changing while it is being used. Thus
4091                          * if an unbind+bind occurs concurrently with this
4092                          * send the peer might see a partially new and a
4093                          * partially old "from" address.
4094                          */
4095                         src = sti->sti_laddr_sa;
4096                         srclen = (socklen_t)sti->sti_laddr_len;
4097                         dprintso(so, 1,
4098                             ("sosend_dgram UNIX: srclen %d, src %p\n",
4099                             srclen, src));
4100                         /*
4101                          * The sendmsg caller specified a destination
4102                          * address, which we must translate into our
4103                          * internal form.  addr = &sti->sti_ux_taddr
4104                          */
4105                         error = so_ux_addr_xlate(so, name, namelen,
4106                             (flags & MSG_XPG4_2),
4107                             &addr, &addrlen);
4108                         if (error) {
4109                                 eprintsoline(so, error);
4110                                 goto done;
4111                         }
4112                 }
4113         } else {
4114                 addr = name;
4115                 addrlen = namelen;
4116                 src = NULL;
4117                 srclen = 0;
4118         }
4119         tudr.PRIM_type = T_UNITDATA_REQ;
4120         tudr.DEST_length = addrlen;
4121         tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4122         if (srclen == 0) {
4123                 tudr.OPT_length = 0;
4124                 tudr.OPT_offset = 0;
4125 
4126                 mp = soallocproto2(&tudr, sizeof (tudr),
4127                     addr, addrlen, 0, _ALLOC_INTR, CRED());
4128                 if (mp == NULL) {
4129                         /*
4130                          * Caught a signal waiting for memory.
4131                          * Let send* return EINTR.
4132                          */
4133                         error = EINTR;
4134                         goto done;
4135                 }
4136         } else {
4137                 /*
4138                  * There is a AF_UNIX sockaddr_un to include as a source
4139                  * address option.
4140                  */
4141                 struct T_opthdr toh;
4142                 ssize_t size;
4143 
4144                 tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4145                     _TPI_ALIGN_TOPT(srclen));
4146                 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4147                     _TPI_ALIGN_TOPT(addrlen));
4148 
4149                 toh.level = SOL_SOCKET;
4150                 toh.name = SO_SRCADDR;
4151                 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4152                 toh.status = 0;
4153 
4154                 size = tudr.OPT_offset + tudr.OPT_length;
4155                 mp = soallocproto2(&tudr, sizeof (tudr),
4156                     addr, addrlen, size, _ALLOC_INTR, CRED());
4157                 if (mp == NULL) {
4158                         /*
4159                          * Caught a signal waiting for memory.
4160                          * Let send* return EINTR.
4161                          */
4162                         error = EINTR;
4163                         goto done;
4164                 }
4165                 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4166                 soappendmsg(mp, &toh, sizeof (toh));
4167                 soappendmsg(mp, src, srclen);
4168                 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4169                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4170         }
4171 
4172         if (AU_AUDITING())
4173                 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4174 
4175         error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4176 done:
4177 #ifdef SOCK_DEBUG
4178         if (error) {
4179                 eprintsoline(so, error);
4180         }
4181 #endif /* SOCK_DEBUG */
4182         return (error);
4183 }
4184 
4185 /*
4186  * Sending data on a connected stream socket.
4187  * Assumes caller has verified that SS_ISCONNECTED is set.
4188  */
4189 int
4190 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4191     int sflag)
4192 {
4193         struct T_data_req       tdr;
4194         mblk_t                  *mp;
4195         int                     error;
4196         ssize_t                 iosize;
4197         sotpi_info_t            *sti = SOTOTPI(so);
4198 
4199         dprintso(so, 1,
4200             ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4201             (void *)so, uiop->uio_resid, prim, sflag));
4202 
4203         /*
4204          * Has to be bound and connected. However, since no locks are
4205          * held the state could have changed after sotpi_sendmsg checked it
4206          * thus it is not possible to ASSERT on the state.
4207          */
4208 
4209         do {
4210                 /*
4211                  * Set the MORE flag if uio_resid does not fit in this
4212                  * message or if the caller passed in "more".
4213                  * Error for transports with zero tidu_size.
4214                  */
4215                 tdr.PRIM_type = prim;
4216                 iosize = sti->sti_tidu_size;
4217                 if (iosize <= 0)
4218                         return (EMSGSIZE);
4219                 if (uiop->uio_resid > iosize) {
4220                         tdr.MORE_flag = 1;
4221                 } else {
4222                         if (more)
4223                                 tdr.MORE_flag = 1;
4224                         else
4225                                 tdr.MORE_flag = 0;
4226                         iosize = uiop->uio_resid;
4227                 }
4228                 dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4229                     prim, tdr.MORE_flag, iosize));
4230                 mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4231                 if (mp == NULL) {
4232                         /*
4233                          * Caught a signal waiting for memory.
4234                          * Let send* return EINTR.
4235                          */
4236                         return (EINTR);
4237                 }
4238 
4239                 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4240                     0, sflag | MSG_BAND, 0);
4241                 if (error) {
4242                         eprintsoline(so, error);
4243                         return (error);
4244                 }
4245                 if (uiop->uio_resid > 0) {
4246                         /*
4247                          * Recheck for fatal errors. Fail write even though
4248                          * some data have been written. This is consistent
4249                          * with strwrite semantics and BSD sockets semantics.
4250                          */
4251                         if (so->so_state & SS_CANTSENDMORE) {
4252                                 eprintsoline(so, error);
4253                                 return (EPIPE);
4254                         }
4255                         if (so->so_error != 0) {
4256                                 mutex_enter(&so->so_lock);
4257                                 error = sogeterr(so, B_TRUE);
4258                                 mutex_exit(&so->so_lock);
4259                                 if (error != 0) {
4260                                         eprintsoline(so, error);
4261                                         return (error);
4262                                 }
4263                         }
4264                 }
4265         } while (uiop->uio_resid > 0);
4266         return (0);
4267 }
4268 
4269 /*
4270  * Check the state for errors and call the appropriate send function.
4271  *
4272  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4273  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4274  * after sending the message.
4275  *
4276  * The caller may optionally specify a destination address, for either
4277  * stream or datagram sockets.  This table summarizes the cases:
4278  *
4279  *    Socket type    Dest. given    Connected    Result
4280  *    -----------    -----------    ---------    --------------
4281  *    Stream         *              Yes          send to conn. addr.
4282  *    Stream         *              No           error ENOTCONN
4283  *    Dgram          yes            *            send to given addr.
4284  *    Dgram          no             yes          send to conn. addr.
4285  *    Dgram          no             no           error EDESTADDRREQ
4286  *
4287  * There are subtleties around the destination address when using
4288  * AF_UNIX datagram sockets.  When the sendmsg call specifies the
4289  * destination address, it's in (struct sockaddr_un) form and we
4290  * need to translate it to our internal form (struct so_ux_addr).
4291  *
4292  * When the sendmsg call does not specify a destination address
4293  * we're using the peer address saved during sotpi_connect, and
4294  * that address is already in internal form.  In this case, the
4295  * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4296  * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4297  * those functions should skip translation to internal form.
4298  * Avoiding that translation is not only more efficient, but it's
4299  * also necessary when a process does a connect on an AF_UNIX
4300  * datagram socket and then drops privileges.  After the process
4301  * has dropped privileges, it may no longer be able to lookup the
4302  * the external name in the filesystem, but it should still be
4303  * able to send messages on the connected socket by leaving the
4304  * destination name unspecified.
4305  *
4306  * Yet more subtleties arise with sockets connected by socketpair(),
4307  * which puts internal form addresses in the fields where normally
4308  * the external form is found, and sets sti_faddr_noxlate=1, which
4309  * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4310  * to skip translation of destination addresses to internal form.
4311  * However, beware that the flag sti_faddr_noxlate=1 also triggers
4312  * different behaviour almost everywhere AF_UNIX addresses appear.
4313  */
4314 static int
4315 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4316     struct cred *cr)
4317 {
4318         int             so_state;
4319         int             so_mode;
4320         int             error;
4321         struct sockaddr *name;
4322         t_uscalar_t     namelen;
4323         int             dontroute;
4324         int             flags;
4325         sotpi_info_t    *sti = SOTOTPI(so);
4326 
4327         dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4328             (void *)so, (void *)msg, msg->msg_flags,
4329             pr_state(so->so_state, so->so_mode), so->so_error));
4330 
4331         if (so->so_version == SOV_STREAM) {
4332                 /* The imaginary "sockmod" has been popped - act as a stream */
4333                 so_update_attrs(so, SOMOD);
4334                 return (strwrite(SOTOV(so), uiop, cr));
4335         }
4336 
4337         mutex_enter(&so->so_lock);
4338         so_state = so->so_state;
4339 
4340         if (so_state & SS_CANTSENDMORE) {
4341                 mutex_exit(&so->so_lock);
4342                 return (EPIPE);
4343         }
4344 
4345         if (so->so_error != 0) {
4346                 error = sogeterr(so, B_TRUE);
4347                 if (error != 0) {
4348                         mutex_exit(&so->so_lock);
4349                         return (error);
4350                 }
4351         }
4352 
4353         name = (struct sockaddr *)msg->msg_name;
4354         namelen = msg->msg_namelen;
4355         flags = msg->msg_flags;
4356 
4357         /*
4358          * Historically, this function does not validate the flags
4359          * passed in, and any errant bits are ignored.  However,
4360          * we would not want any such errant flag bits accidently
4361          * being treated as one of the internal-only flags, so
4362          * clear the internal-only flag bits.
4363          */
4364         flags &= ~MSG_SENDTO_NOXLATE;
4365 
4366         so_mode = so->so_mode;
4367 
4368         if (name == NULL) {
4369                 if (!(so_state & SS_ISCONNECTED)) {
4370                         mutex_exit(&so->so_lock);
4371                         if (so_mode & SM_CONNREQUIRED)
4372                                 return (ENOTCONN);
4373                         else
4374                                 return (EDESTADDRREQ);
4375                 }
4376                 /*
4377                  * This is a connected socket.
4378                  */
4379                 if (so_mode & SM_CONNREQUIRED) {
4380                         /*
4381                          * This is a connected STREAM socket,
4382                          * destination not specified.
4383                          */
4384                         name = NULL;
4385                         namelen = 0;
4386                 } else {
4387                         /*
4388                          * Datagram send on connected socket with
4389                          * the destination name not specified.
4390                          * Use the peer address from connect.
4391                          */
4392                         if (so->so_family == AF_UNIX) {
4393                                 /*
4394                                  * Use the (internal form) address saved
4395                                  * in sotpi_connect.  See above.
4396                                  */
4397                                 name = (void *)&sti->sti_ux_faddr;
4398                                 namelen = sizeof (sti->sti_ux_faddr);
4399                                 flags |= MSG_SENDTO_NOXLATE;
4400                         } else {
4401                                 ASSERT(sti->sti_faddr_sa);
4402                                 name = sti->sti_faddr_sa;
4403                                 namelen = (t_uscalar_t)sti->sti_faddr_len;
4404                         }
4405                 }
4406         } else {
4407                 /*
4408                  * Sendmsg specifies a destination name
4409                  */
4410                 if (!(so_state & SS_ISCONNECTED) &&
4411                     (so_mode & SM_CONNREQUIRED)) {
4412                         /* i.e. TCP not connected */
4413                         mutex_exit(&so->so_lock);
4414                         return (ENOTCONN);
4415                 }
4416                 /*
4417                  * Ignore the address on connection-oriented sockets.
4418                  * Just like BSD this code does not generate an error for
4419                  * TCP (a CONNREQUIRED socket) when sending to an address
4420                  * passed in with sendto/sendmsg. Instead the data is
4421                  * delivered on the connection as if no address had been
4422                  * supplied.
4423                  */
4424                 if ((so_state & SS_ISCONNECTED) &&
4425                     !(so_mode & SM_CONNREQUIRED)) {
4426                         mutex_exit(&so->so_lock);
4427                         return (EISCONN);
4428                 }
4429                 if (!(so_state & SS_ISBOUND)) {
4430                         so_lock_single(so);     /* Set SOLOCKED */
4431                         error = sotpi_bind(so, NULL, 0,
4432                             _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4433                         so_unlock_single(so, SOLOCKED);
4434                         if (error) {
4435                                 mutex_exit(&so->so_lock);
4436                                 eprintsoline(so, error);
4437                                 return (error);
4438                         }
4439                 }
4440                 /*
4441                  * Handle delayed datagram errors. These are only queued
4442                  * when the application sets SO_DGRAM_ERRIND.
4443                  * Return the error if we are sending to the address
4444                  * that was returned in the last T_UDERROR_IND.
4445                  * If sending to some other address discard the delayed
4446                  * error indication.
4447                  */
4448                 if (sti->sti_delayed_error) {
4449                         struct T_uderror_ind    *tudi;
4450                         void                    *addr;
4451                         t_uscalar_t             addrlen;
4452                         boolean_t               match = B_FALSE;
4453 
4454                         ASSERT(sti->sti_eaddr_mp);
4455                         error = sti->sti_delayed_error;
4456                         sti->sti_delayed_error = 0;
4457                         tudi =
4458                             (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4459                         addrlen = tudi->DEST_length;
4460                         addr = sogetoff(sti->sti_eaddr_mp,
4461                             tudi->DEST_offset, addrlen, 1);
4462                         ASSERT(addr);   /* Checked by strsock_proto */
4463                         switch (so->so_family) {
4464                         case AF_INET: {
4465                                 /* Compare just IP address and port */
4466                                 sin_t *sin1 = (sin_t *)name;
4467                                 sin_t *sin2 = (sin_t *)addr;
4468 
4469                                 if (addrlen == sizeof (sin_t) &&
4470                                     namelen == addrlen &&
4471                                     sin1->sin_port == sin2->sin_port &&
4472                                     sin1->sin_addr.s_addr ==
4473                                     sin2->sin_addr.s_addr)
4474                                         match = B_TRUE;
4475                                 break;
4476                         }
4477                         case AF_INET6: {
4478                                 /* Compare just IP address and port. Not flow */
4479                                 sin6_t *sin1 = (sin6_t *)name;
4480                                 sin6_t *sin2 = (sin6_t *)addr;
4481 
4482                                 if (addrlen == sizeof (sin6_t) &&
4483                                     namelen == addrlen &&
4484                                     sin1->sin6_port == sin2->sin6_port &&
4485                                     IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4486                                     &sin2->sin6_addr))
4487                                         match = B_TRUE;
4488                                 break;
4489                         }
4490                         case AF_UNIX:
4491                         default:
4492                                 if (namelen == addrlen &&
4493                                     bcmp(name, addr, namelen) == 0)
4494                                         match = B_TRUE;
4495                         }
4496                         if (match) {
4497                                 freemsg(sti->sti_eaddr_mp);
4498                                 sti->sti_eaddr_mp = NULL;
4499                                 mutex_exit(&so->so_lock);
4500 #ifdef DEBUG
4501                                 dprintso(so, 0,
4502                                     ("sockfs delayed error %d for %s\n",
4503                                     error,
4504                                     pr_addr(so->so_family, name, namelen)));
4505 #endif /* DEBUG */
4506                                 return (error);
4507                         }
4508                         freemsg(sti->sti_eaddr_mp);
4509                         sti->sti_eaddr_mp = NULL;
4510                 }
4511         }
4512         mutex_exit(&so->so_lock);
4513 
4514         dontroute = 0;
4515         if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4516                 uint32_t        val;
4517 
4518                 val = 1;
4519                 error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4520                     &val, (t_uscalar_t)sizeof (val), cr);
4521                 if (error)
4522                         return (error);
4523                 dontroute = 1;
4524         }
4525 
4526         if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4527                 error = EOPNOTSUPP;
4528                 goto done;
4529         }
4530         if (msg->msg_controllen != 0) {
4531                 if (!(so_mode & SM_CONNREQUIRED)) {
4532                         so_update_attrs(so, SOMOD);
4533                         error = sosend_dgramcmsg(so, name, namelen, uiop,
4534                             msg->msg_control, msg->msg_controllen, flags);
4535                 } else {
4536                         if (flags & MSG_OOB) {
4537                                 /* Can't generate T_EXDATA_REQ with options */
4538                                 error = EOPNOTSUPP;
4539                                 goto done;
4540                         }
4541                         so_update_attrs(so, SOMOD);
4542                         error = sosend_svccmsg(so, uiop,
4543                             !(flags & MSG_EOR),
4544                             msg->msg_control, msg->msg_controllen,
4545                             flags);
4546                 }
4547                 goto done;
4548         }
4549 
4550         so_update_attrs(so, SOMOD);
4551         if (!(so_mode & SM_CONNREQUIRED)) {
4552                 /*
4553                  * If there is no SO_DONTROUTE to turn off return immediately
4554                  * from send_dgram. This can allow tail-call optimizations.
4555                  */
4556                 if (!dontroute) {
4557                         return (sosend_dgram(so, name, namelen, uiop, flags));
4558                 }
4559                 error = sosend_dgram(so, name, namelen, uiop, flags);
4560         } else {
4561                 t_scalar_t prim;
4562                 int sflag;
4563 
4564                 /* Ignore msg_name in the connected state */
4565                 if (flags & MSG_OOB) {
4566                         prim = T_EXDATA_REQ;
4567                         /*
4568                          * Send down T_EXDATA_REQ even if there is flow
4569                          * control for data.
4570                          */
4571                         sflag = MSG_IGNFLOW;
4572                 } else {
4573                         if (so_mode & SM_BYTESTREAM) {
4574                                 /* Byte stream transport - use write */
4575                                 dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4576 
4577                                 /* Send M_DATA messages */
4578                                 if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4579                                     (error = nl7c_data(so, uiop)) >= 0) {
4580                                         /* NL7C consumed the data */
4581                                         return (error);
4582                                 }
4583                                 /*
4584                                  * If there is no SO_DONTROUTE to turn off,
4585                                  * sti_direct is on, and there is no flow
4586                                  * control, we can take the fast path.
4587                                  */
4588                                 if (!dontroute && sti->sti_direct != 0 &&
4589                                     canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4590                                         return (sostream_direct(so, uiop,
4591                                             NULL, cr));
4592                                 }
4593                                 error = strwrite(SOTOV(so), uiop, cr);
4594                                 goto done;
4595                         }
4596                         prim = T_DATA_REQ;
4597                         sflag = 0;
4598                 }
4599                 /*
4600                  * If there is no SO_DONTROUTE to turn off return immediately
4601                  * from sosend_svc. This can allow tail-call optimizations.
4602                  */
4603                 if (!dontroute)
4604                         return (sosend_svc(so, uiop, prim,
4605                             !(flags & MSG_EOR), sflag));
4606                 error = sosend_svc(so, uiop, prim,
4607                     !(flags & MSG_EOR), sflag);
4608         }
4609         ASSERT(dontroute);
4610 done:
4611         if (dontroute) {
4612                 uint32_t        val;
4613 
4614                 val = 0;
4615                 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4616                     &val, (t_uscalar_t)sizeof (val), cr);
4617         }
4618         return (error);
4619 }
4620 
4621 /*
4622  * kstrwritemp() has very similar semantics as that of strwrite().
4623  * The main difference is it obtains mblks from the caller and also
4624  * does not do any copy as done in strwrite() from user buffers to
4625  * kernel buffers.
4626  *
4627  * Currently, this routine is used by sendfile to send data allocated
4628  * within the kernel without any copying. This interface does not use the
4629  * synchronous stream interface as synch. stream interface implies
4630  * copying.
4631  */
4632 int
4633 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4634 {
4635         struct stdata *stp;
4636         struct queue *wqp;
4637         mblk_t *newmp;
4638         char waitflag;
4639         int tempmode;
4640         int error = 0;
4641         int done = 0;
4642         struct sonode *so;
4643         boolean_t direct;
4644 
4645         ASSERT(vp->v_stream);
4646         stp = vp->v_stream;
4647 
4648         so = VTOSO(vp);
4649         direct = _SOTOTPI(so)->sti_direct;
4650 
4651         /*
4652          * This is the sockfs direct fast path. canputnext() need
4653          * not be accurate so we don't grab the sd_lock here. If
4654          * we get flow-controlled, we grab sd_lock just before the
4655          * do..while loop below to emulate what strwrite() does.
4656          */
4657         wqp = stp->sd_wrq;
4658         if (canputnext(wqp) && direct &&
4659             !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4660                 return (sostream_direct(so, NULL, mp, CRED()));
4661         } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4662                 /* Fast check of flags before acquiring the lock */
4663                 mutex_enter(&stp->sd_lock);
4664                 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4665                 mutex_exit(&stp->sd_lock);
4666                 if (error != 0) {
4667                         if (!(stp->sd_flag & STPLEX) &&
4668                             (stp->sd_wput_opt & SW_SIGPIPE)) {
4669                                 error = EPIPE;
4670                         }
4671                         return (error);
4672                 }
4673         }
4674 
4675         waitflag = WRITEWAIT;
4676         if (stp->sd_flag & OLDNDELAY)
4677                 tempmode = fmode & ~FNDELAY;
4678         else
4679                 tempmode = fmode;
4680 
4681         mutex_enter(&stp->sd_lock);
4682         do {
4683                 if (canputnext(wqp)) {
4684                         mutex_exit(&stp->sd_lock);
4685                         if (stp->sd_wputdatafunc != NULL) {
4686                                 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4687                                     NULL, NULL, NULL);
4688                                 if (newmp == NULL) {
4689                                         /* The caller will free mp */
4690                                         return (ECOMM);
4691                                 }
4692                                 mp = newmp;
4693                         }
4694                         putnext(wqp, mp);
4695                         return (0);
4696                 }
4697                 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4698                     &done);
4699         } while (error == 0 && !done);
4700 
4701         mutex_exit(&stp->sd_lock);
4702         /*
4703          * EAGAIN tells the application to try again. ENOMEM
4704          * is returned only if the memory allocation size
4705          * exceeds the physical limits of the system. ENOMEM
4706          * can't be true here.
4707          */
4708         if (error == ENOMEM)
4709                 error = EAGAIN;
4710         return (error);
4711 }
4712 
4713 /* ARGSUSED */
4714 static int
4715 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4716     struct cred *cr, mblk_t **mpp)
4717 {
4718         int error;
4719 
4720         switch (so->so_family) {
4721         case AF_INET:
4722         case AF_INET6:
4723         case AF_UNIX:
4724                 break;
4725         default:
4726                 return (EAFNOSUPPORT);
4727 
4728         }
4729 
4730         if (so->so_state & SS_CANTSENDMORE)
4731                 return (EPIPE);
4732 
4733         if (so->so_type != SOCK_STREAM)
4734                 return (EOPNOTSUPP);
4735 
4736         if ((so->so_state & SS_ISCONNECTED) == 0)
4737                 return (ENOTCONN);
4738 
4739         error = kstrwritemp(so->so_vnode, *mpp, fflag);
4740         if (error == 0)
4741                 *mpp = NULL;
4742         return (error);
4743 }
4744 
4745 /*
4746  * Sending data on a datagram socket.
4747  * Assumes caller has verified that SS_ISBOUND etc. are set.
4748  */
4749 /* ARGSUSED */
4750 static int
4751 sodgram_direct(struct sonode *so, struct sockaddr *name,
4752     socklen_t namelen, struct uio *uiop, int flags)
4753 {
4754         struct T_unitdata_req   tudr;
4755         mblk_t                  *mp = NULL;
4756         int                     error = 0;
4757         void                    *addr;
4758         socklen_t               addrlen;
4759         ssize_t                 len;
4760         struct stdata           *stp = SOTOV(so)->v_stream;
4761         int                     so_state;
4762         queue_t                 *udp_wq;
4763         boolean_t               connected;
4764         mblk_t                  *mpdata = NULL;
4765         sotpi_info_t            *sti = SOTOTPI(so);
4766         uint32_t                auditing = AU_AUDITING();
4767 
4768         ASSERT(name != NULL && namelen != 0);
4769         ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4770         ASSERT(!(so->so_mode & SM_EXDATA));
4771         ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4772         ASSERT(SOTOV(so)->v_type == VSOCK);
4773 
4774         /* Caller checked for proper length */
4775         len = uiop->uio_resid;
4776         ASSERT(len <= sti->sti_tidu_size);
4777 
4778         /* Length and family checks have been done by caller */
4779         ASSERT(name->sa_family == so->so_family);
4780         ASSERT(so->so_family == AF_INET ||
4781             (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4782         ASSERT(so->so_family == AF_INET6 ||
4783             (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4784 
4785         addr = name;
4786         addrlen = namelen;
4787 
4788         if (stp->sd_sidp != NULL &&
4789             (error = straccess(stp, JCWRITE)) != 0)
4790                 goto done;
4791 
4792         so_state = so->so_state;
4793 
4794         connected = so_state & SS_ISCONNECTED;
4795         if (!connected) {
4796                 tudr.PRIM_type = T_UNITDATA_REQ;
4797                 tudr.DEST_length = addrlen;
4798                 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4799                 tudr.OPT_length = 0;
4800                 tudr.OPT_offset = 0;
4801 
4802                 mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4803                     _ALLOC_INTR, CRED());
4804                 if (mp == NULL) {
4805                         /*
4806                          * Caught a signal waiting for memory.
4807                          * Let send* return EINTR.
4808                          */
4809                         error = EINTR;
4810                         goto done;
4811                 }
4812         }
4813 
4814         /*
4815          * For UDP we don't break up the copyin into smaller pieces
4816          * as in the TCP case.  That means if ENOMEM is returned by
4817          * mcopyinuio() then the uio vector has not been modified at
4818          * all and we fallback to either strwrite() or kstrputmsg()
4819          * below.  Note also that we never generate priority messages
4820          * from here.
4821          */
4822         udp_wq = stp->sd_wrq->q_next;
4823         if (canput(udp_wq) &&
4824             (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4825                 ASSERT(DB_TYPE(mpdata) == M_DATA);
4826                 ASSERT(uiop->uio_resid == 0);
4827                 if (!connected)
4828                         linkb(mp, mpdata);
4829                 else
4830                         mp = mpdata;
4831                 if (auditing)
4832                         audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4833 
4834                 /* Always returns 0... */
4835                 return (udp_wput(udp_wq, mp));
4836         }
4837 
4838         ASSERT(mpdata == NULL);
4839         if (error != 0 && error != ENOMEM) {
4840                 freemsg(mp);
4841                 return (error);
4842         }
4843 
4844         /*
4845          * For connected, let strwrite() handle the blocking case.
4846          * Otherwise we fall thru and use kstrputmsg().
4847          */
4848         if (connected)
4849                 return (strwrite(SOTOV(so), uiop, CRED()));
4850 
4851         if (auditing)
4852                 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4853 
4854         error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4855 done:
4856 #ifdef SOCK_DEBUG
4857         if (error != 0) {
4858                 eprintsoline(so, error);
4859         }
4860 #endif /* SOCK_DEBUG */
4861         return (error);
4862 }
4863 
4864 int
4865 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4866 {
4867         struct stdata *stp = SOTOV(so)->v_stream;
4868         ssize_t iosize, rmax, maxblk;
4869         queue_t *tcp_wq = stp->sd_wrq->q_next;
4870         mblk_t *newmp;
4871         int error = 0, wflag = 0;
4872 
4873         ASSERT(so->so_mode & SM_BYTESTREAM);
4874         ASSERT(SOTOV(so)->v_type == VSOCK);
4875 
4876         if (stp->sd_sidp != NULL &&
4877             (error = straccess(stp, JCWRITE)) != 0)
4878                 return (error);
4879 
4880         if (uiop == NULL) {
4881                 /*
4882                  * kstrwritemp() should have checked sd_flag and
4883                  * flow-control before coming here.  If we end up
4884                  * here it means that we can simply pass down the
4885                  * data to tcp.
4886                  */
4887                 ASSERT(mp != NULL);
4888                 if (stp->sd_wputdatafunc != NULL) {
4889                         newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4890                             NULL, NULL, NULL);
4891                         if (newmp == NULL) {
4892                                 /* The caller will free mp */
4893                                 return (ECOMM);
4894                         }
4895                         mp = newmp;
4896                 }
4897                 /* Always returns 0... */
4898                 return (tcp_wput(tcp_wq, mp));
4899         }
4900 
4901         /* Fallback to strwrite() to do proper error handling */
4902         if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4903                 return (strwrite(SOTOV(so), uiop, cr));
4904 
4905         rmax = stp->sd_qn_maxpsz;
4906         ASSERT(rmax >= 0 || rmax == INFPSZ);
4907         if (rmax == 0 || uiop->uio_resid <= 0)
4908                 return (0);
4909 
4910         if (rmax == INFPSZ)
4911                 rmax = uiop->uio_resid;
4912 
4913         maxblk = stp->sd_maxblk;
4914 
4915         for (;;) {
4916                 iosize = MIN(uiop->uio_resid, rmax);
4917 
4918                 mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4919                 if (mp == NULL) {
4920                         /*
4921                          * Fallback to strwrite() for ENOMEM; if this
4922                          * is our first time in this routine and the uio
4923                          * vector has not been modified, we will end up
4924                          * calling strwrite() without any flag set.
4925                          */
4926                         if (error == ENOMEM)
4927                                 goto slow_send;
4928                         else
4929                                 return (error);
4930                 }
4931                 ASSERT(uiop->uio_resid >= 0);
4932                 /*
4933                  * If mp is non-NULL and ENOMEM is set, it means that
4934                  * mcopyinuio() was able to break down some of the user
4935                  * data into one or more mblks.  Send the partial data
4936                  * to tcp and let the rest be handled in strwrite().
4937                  */
4938                 ASSERT(error == 0 || error == ENOMEM);
4939                 if (stp->sd_wputdatafunc != NULL) {
4940                         newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4941                             NULL, NULL, NULL);
4942                         if (newmp == NULL) {
4943                                 /* The caller will free mp */
4944                                 return (ECOMM);
4945                         }
4946                         mp = newmp;
4947                 }
4948                 (void) tcp_wput(tcp_wq, mp);    /* Always returns 0 anyway. */
4949 
4950                 wflag |= NOINTR;
4951 
4952                 if (uiop->uio_resid == 0) {  /* No more data; we're done */
4953                         ASSERT(error == 0);
4954                         break;
4955                 } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4956                     (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4957 slow_send:
4958                         /*
4959                          * We were able to send down partial data using
4960                          * the direct call interface, but are now relying
4961                          * on strwrite() to handle the non-fastpath cases.
4962                          * If the socket is blocking we will sleep in
4963                          * strwaitq() until write is permitted, otherwise,
4964                          * we will need to return the amount of bytes
4965                          * written so far back to the app.  This is the
4966                          * reason why we pass NOINTR flag to strwrite()
4967                          * for non-blocking socket, because we don't want
4968                          * to return EAGAIN when portion of the user data
4969                          * has actually been sent down.
4970                          */
4971                         return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4972                 }
4973         }
4974         return (0);
4975 }
4976 
4977 /*
4978  * Update sti_faddr by asking the transport (unless AF_UNIX).
4979  */
4980 /* ARGSUSED */
4981 int
4982 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4983     boolean_t accept, struct cred *cr)
4984 {
4985         struct strbuf   strbuf;
4986         int             error = 0, res;
4987         void            *addr;
4988         t_uscalar_t     addrlen;
4989         k_sigset_t      smask;
4990         sotpi_info_t    *sti = SOTOTPI(so);
4991         vnode_t         *vn;
4992 
4993         dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4994             (void *)so, pr_state(so->so_state, so->so_mode)));
4995 
4996         ASSERT(*namelen > 0);
4997         mutex_enter(&so->so_lock);
4998         so_lock_single(so);     /* Set SOLOCKED */
4999         vn = SOTOV(so);
5000         if (SOTPI_VN_NOSTREAM(vn)) {
5001                 error = EBADF;
5002                 goto done;
5003         }
5004 
5005         if (accept) {
5006                 bcopy(sti->sti_faddr_sa, name,
5007                     MIN(*namelen, sti->sti_faddr_len));
5008                 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
5009                 goto done;
5010         }
5011 
5012         if (!(so->so_state & SS_ISCONNECTED)) {
5013                 error = ENOTCONN;
5014                 goto done;
5015         }
5016         /* Added this check for X/Open */
5017         if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5018                 error = EINVAL;
5019                 if (xnet_check_print) {
5020                         printf("sockfs: X/Open getpeername check => EINVAL\n");
5021                 }
5022                 goto done;
5023         }
5024 
5025         if (sti->sti_faddr_valid) {
5026                 bcopy(sti->sti_faddr_sa, name,
5027                     MIN(*namelen, sti->sti_faddr_len));
5028                 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
5029                 goto done;
5030         }
5031 
5032 #ifdef DEBUG
5033         dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
5034             pr_addr(so->so_family, sti->sti_faddr_sa,
5035             (t_uscalar_t)sti->sti_faddr_len)));
5036 #endif /* DEBUG */
5037 
5038         if (so->so_family == AF_UNIX) {
5039                 /* Transport has different name space - return local info */
5040                 if (sti->sti_faddr_noxlate)
5041                         *namelen = 0;
5042                 error = 0;
5043                 goto done;
5044         }
5045 
5046         ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
5047 
5048         ASSERT(sti->sti_faddr_sa);
5049         /* Allocate local buffer to use with ioctl */
5050         addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
5051         mutex_exit(&so->so_lock);
5052         addr = kmem_alloc(addrlen, KM_SLEEP);
5053 
5054         /*
5055          * Issue TI_GETPEERNAME with signals masked.
5056          * Put the result in sti_faddr_sa so that getpeername works after
5057          * a shutdown(output).
5058          * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5059          * back to the socket.
5060          */
5061         strbuf.buf = addr;
5062         strbuf.maxlen = addrlen;
5063         strbuf.len = 0;
5064 
5065         sigintr(&smask, 0);
5066         res = 0;
5067         ASSERT(cr);
5068         error = strioctl(vn, TI_GETPEERNAME, (intptr_t)&strbuf,
5069             0, K_TO_K, cr, &res);
5070         sigunintr(&smask);
5071 
5072         mutex_enter(&so->so_lock);
5073         /*
5074          * If there is an error record the error in so_error put don't fail
5075          * the getpeername. Instead fallback on the recorded
5076          * sti->sti_faddr_sa.
5077          */
5078         if (error) {
5079                 /*
5080                  * Various stream head errors can be returned to the ioctl.
5081                  * However, it is impossible to determine which ones of
5082                  * these are really socket level errors that were incorrectly
5083                  * consumed by the ioctl. Thus this code silently ignores the
5084                  * error - to code explicitly does not reinstate the error
5085                  * using soseterror().
5086                  * Experiments have shows that at least this set of
5087                  * errors are reported and should not be reinstated on the
5088                  * socket:
5089                  *      EINVAL  E.g. if an I_LINK was in effect when
5090                  *              getpeername was called.
5091                  *      EPIPE   The ioctl error semantics prefer the write
5092                  *              side error over the read side error.
5093                  *      ENOTCONN The transport just got disconnected but
5094                  *              sockfs had not yet seen the T_DISCON_IND
5095                  *              when issuing the ioctl.
5096                  */
5097                 error = 0;
5098         } else if (res == 0 && strbuf.len > 0 &&
5099             (so->so_state & SS_ISCONNECTED)) {
5100                 ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
5101                 sti->sti_faddr_len = (socklen_t)strbuf.len;
5102                 bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
5103                 sti->sti_faddr_valid = 1;
5104 
5105                 bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
5106                 *namelen = sti->sti_faddr_len;
5107         }
5108         kmem_free(addr, addrlen);
5109 #ifdef DEBUG
5110         dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
5111             pr_addr(so->so_family, sti->sti_faddr_sa,
5112             (t_uscalar_t)sti->sti_faddr_len)));
5113 #endif /* DEBUG */
5114 done:
5115         so_unlock_single(so, SOLOCKED);
5116         mutex_exit(&so->so_lock);
5117         return (error);
5118 }
5119 
5120 /*
5121  * Update sti_laddr by asking the transport (unless AF_UNIX).
5122  */
5123 int
5124 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5125     struct cred *cr)
5126 {
5127         struct strbuf   strbuf;
5128         int             error = 0, res;
5129         void            *addr;
5130         t_uscalar_t     addrlen;
5131         k_sigset_t      smask;
5132         sotpi_info_t    *sti = SOTOTPI(so);
5133         vnode_t         *vn;
5134 
5135         dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5136             (void *)so, pr_state(so->so_state, so->so_mode)));
5137 
5138         ASSERT(*namelen > 0);
5139         mutex_enter(&so->so_lock);
5140         so_lock_single(so);     /* Set SOLOCKED */
5141         vn = SOTOV(so);
5142         if (SOTPI_VN_NOSTREAM(vn)) {
5143                 error = EBADF;
5144                 goto done;
5145         }
5146 
5147 #ifdef DEBUG
5148 
5149         dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5150             pr_addr(so->so_family, sti->sti_laddr_sa,
5151             (t_uscalar_t)sti->sti_laddr_len)));
5152 #endif /* DEBUG */
5153         if (sti->sti_laddr_valid) {
5154                 bcopy(sti->sti_laddr_sa, name,
5155                     MIN(*namelen, sti->sti_laddr_len));
5156                 *namelen = sti->sti_laddr_len;
5157                 goto done;
5158         }
5159 
5160         if (so->so_family == AF_UNIX) {
5161                 /*
5162                  * Transport has different name space - return local info. If we
5163                  * have enough space, let consumers know the family.
5164                  */
5165                 if (*namelen >= sizeof (sa_family_t)) {
5166                         name->sa_family = AF_UNIX;
5167                         *namelen = sizeof (sa_family_t);
5168                 } else {
5169                         *namelen = 0;
5170                 }
5171                 error = 0;
5172                 goto done;
5173         }
5174         if (!(so->so_state & SS_ISBOUND)) {
5175                 /* If not bound, then nothing to return. */
5176                 error = 0;
5177                 goto done;
5178         }
5179 
5180         /* Allocate local buffer to use with ioctl */
5181         addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5182         mutex_exit(&so->so_lock);
5183         addr = kmem_alloc(addrlen, KM_SLEEP);
5184 
5185         /*
5186          * Issue TI_GETMYNAME with signals masked.
5187          * Put the result in sti_laddr_sa so that getsockname works after
5188          * a shutdown(output).
5189          * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5190          * back to the socket.
5191          */
5192         strbuf.buf = addr;
5193         strbuf.maxlen = addrlen;
5194         strbuf.len = 0;
5195 
5196         sigintr(&smask, 0);
5197         res = 0;
5198         ASSERT(cr);
5199         error = strioctl(vn, TI_GETMYNAME, (intptr_t)&strbuf,
5200             0, K_TO_K, cr, &res);
5201         sigunintr(&smask);
5202 
5203         mutex_enter(&so->so_lock);
5204         /*
5205          * If there is an error record the error in so_error put don't fail
5206          * the getsockname. Instead fallback on the recorded
5207          * sti->sti_laddr_sa.
5208          */
5209         if (error) {
5210                 /*
5211                  * Various stream head errors can be returned to the ioctl.
5212                  * However, it is impossible to determine which ones of
5213                  * these are really socket level errors that were incorrectly
5214                  * consumed by the ioctl. Thus this code silently ignores the
5215                  * error - to code explicitly does not reinstate the error
5216                  * using soseterror().
5217                  * Experiments have shows that at least this set of
5218                  * errors are reported and should not be reinstated on the
5219                  * socket:
5220                  *      EINVAL  E.g. if an I_LINK was in effect when
5221                  *              getsockname was called.
5222                  *      EPIPE   The ioctl error semantics prefer the write
5223                  *              side error over the read side error.
5224                  */
5225                 error = 0;
5226         } else if (res == 0 && strbuf.len > 0 &&
5227             (so->so_state & SS_ISBOUND)) {
5228                 ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5229                 sti->sti_laddr_len = (socklen_t)strbuf.len;
5230                 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5231                 sti->sti_laddr_valid = 1;
5232 
5233                 bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5234                 *namelen = sti->sti_laddr_len;
5235         }
5236         kmem_free(addr, addrlen);
5237 #ifdef DEBUG
5238         dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5239             pr_addr(so->so_family, sti->sti_laddr_sa,
5240             (t_uscalar_t)sti->sti_laddr_len)));
5241 #endif /* DEBUG */
5242 done:
5243         so_unlock_single(so, SOLOCKED);
5244         mutex_exit(&so->so_lock);
5245         return (error);
5246 }
5247 
5248 /*
5249  * Get socket options. For SOL_SOCKET options some options are handled
5250  * by the sockfs while others use the value recorded in the sonode as a
5251  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5252  *
5253  * On the return most *optlenp bytes are copied to optval.
5254  */
5255 /* ARGSUSED */
5256 int
5257 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5258     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5259 {
5260         struct T_optmgmt_req    optmgmt_req;
5261         struct T_optmgmt_ack    *optmgmt_ack;
5262         struct opthdr           oh;
5263         struct opthdr           *opt_res;
5264         mblk_t                  *mp = NULL;
5265         int                     error = 0;
5266         void                    *option = NULL; /* Set if fallback value */
5267         t_uscalar_t             maxlen = *optlenp;
5268         t_uscalar_t             len;
5269         uint32_t                value;
5270         struct timeval          tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5271         struct timeval32        tmo_val32;
5272         struct so_snd_bufinfo   snd_bufinfo;    /* used for zero copy */
5273         vnode_t                 *vn;
5274 
5275         dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5276             (void *)so, level, option_name, optval, (void *)optlenp,
5277             pr_state(so->so_state, so->so_mode)));
5278 
5279         mutex_enter(&so->so_lock);
5280         so_lock_single(so);     /* Set SOLOCKED */
5281         vn = SOTOV(so);
5282         if (SOTPI_VN_NOSTREAM(vn)) {
5283                 error = EBADF;
5284                 eprintsoline(so, error);
5285                 goto done2;
5286         }
5287 
5288         /*
5289          * Check for SOL_SOCKET options.
5290          * Certain SOL_SOCKET options are returned directly whereas
5291          * others only provide a default (fallback) value should
5292          * the T_SVR4_OPTMGMT_REQ fail.
5293          */
5294         if (level == SOL_SOCKET) {
5295                 /* Check parameters */
5296                 switch (option_name) {
5297                 case SO_TYPE:
5298                 case SO_ERROR:
5299                 case SO_DEBUG:
5300                 case SO_ACCEPTCONN:
5301                 case SO_REUSEADDR:
5302                 case SO_KEEPALIVE:
5303                 case SO_DONTROUTE:
5304                 case SO_BROADCAST:
5305                 case SO_USELOOPBACK:
5306                 case SO_OOBINLINE:
5307                 case SO_SNDBUF:
5308                 case SO_RCVBUF:
5309 #ifdef notyet
5310                 case SO_SNDLOWAT:
5311                 case SO_RCVLOWAT:
5312 #endif /* notyet */
5313                 case SO_DOMAIN:
5314                 case SO_DGRAM_ERRIND:
5315                         if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5316                                 error = EINVAL;
5317                                 eprintsoline(so, error);
5318                                 goto done2;
5319                         }
5320                         break;
5321                 case SO_RCVTIMEO:
5322                 case SO_SNDTIMEO:
5323                         if (get_udatamodel() == DATAMODEL_NONE ||
5324                             get_udatamodel() == DATAMODEL_NATIVE) {
5325                                 if (maxlen < sizeof (struct timeval)) {
5326                                         error = EINVAL;
5327                                         eprintsoline(so, error);
5328                                         goto done2;
5329                                 }
5330                         } else {
5331                                 if (maxlen < sizeof (struct timeval32)) {
5332                                         error = EINVAL;
5333                                         eprintsoline(so, error);
5334                                         goto done2;
5335                                 }
5336 
5337                         }
5338                         break;
5339                 case SO_LINGER:
5340                         if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5341                                 error = EINVAL;
5342                                 eprintsoline(so, error);
5343                                 goto done2;
5344                         }
5345                         break;
5346                 case SO_SND_BUFINFO:
5347                         if (maxlen < (t_uscalar_t)
5348                             sizeof (struct so_snd_bufinfo)) {
5349                                 error = EINVAL;
5350                                 eprintsoline(so, error);
5351                                 goto done2;
5352                         }
5353                         break;
5354                 }
5355 
5356                 len = (t_uscalar_t)sizeof (uint32_t);   /* Default */
5357 
5358                 switch (option_name) {
5359                 case SO_TYPE:
5360                         value = so->so_type;
5361                         option = &value;
5362                         goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5363 
5364                 case SO_ERROR:
5365                         value = sogeterr(so, B_TRUE);
5366                         option = &value;
5367                         goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5368 
5369                 case SO_ACCEPTCONN:
5370                         if (so->so_state & SS_ACCEPTCONN)
5371                                 value = SO_ACCEPTCONN;
5372                         else
5373                                 value = 0;
5374 #ifdef DEBUG
5375                         if (value) {
5376                                 dprintso(so, 1,
5377                                     ("sotpi_getsockopt: 0x%x is set\n",
5378                                     option_name));
5379                         } else {
5380                                 dprintso(so, 1,
5381                                     ("sotpi_getsockopt: 0x%x not set\n",
5382                                     option_name));
5383                         }
5384 #endif /* DEBUG */
5385                         option = &value;
5386                         goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5387 
5388                 case SO_DEBUG:
5389                 case SO_REUSEADDR:
5390                 case SO_KEEPALIVE:
5391                 case SO_DONTROUTE:
5392                 case SO_BROADCAST:
5393                 case SO_USELOOPBACK:
5394                 case SO_OOBINLINE:
5395                 case SO_DGRAM_ERRIND:
5396                         value = (so->so_options & option_name);
5397 #ifdef DEBUG
5398                         if (value) {
5399                                 dprintso(so, 1,
5400                                     ("sotpi_getsockopt: 0x%x is set\n",
5401                                     option_name));
5402                         } else {
5403                                 dprintso(so, 1,
5404                                     ("sotpi_getsockopt: 0x%x not set\n",
5405                                     option_name));
5406                         }
5407 #endif /* DEBUG */
5408                         option = &value;
5409                         goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5410 
5411                 /*
5412                  * The following options are only returned by sockfs when the
5413                  * T_SVR4_OPTMGMT_REQ fails.
5414                  */
5415                 case SO_LINGER:
5416                         option = &so->so_linger;
5417                         len = (t_uscalar_t)sizeof (struct linger);
5418                         break;
5419                 case SO_SNDBUF: {
5420                         ssize_t lvalue;
5421 
5422                         /*
5423                          * If the option has not been set then get a default
5424                          * value from the read queue. This value is
5425                          * returned if the transport fails
5426                          * the T_SVR4_OPTMGMT_REQ.
5427                          */
5428                         lvalue = so->so_sndbuf;
5429                         if (lvalue == 0) {
5430                                 mutex_exit(&so->so_lock);
5431                                 (void) strqget(strvp2wq(vn)->q_next,
5432                                     QHIWAT, 0, &lvalue);
5433                                 mutex_enter(&so->so_lock);
5434                                 dprintso(so, 1,
5435                                     ("got SO_SNDBUF %ld from q\n", lvalue));
5436                         }
5437                         value = (int)lvalue;
5438                         option = &value;
5439                         len = (t_uscalar_t)sizeof (so->so_sndbuf);
5440                         break;
5441                 }
5442                 case SO_RCVBUF: {
5443                         ssize_t lvalue;
5444 
5445                         /*
5446                          * If the option has not been set then get a default
5447                          * value from the read queue. This value is
5448                          * returned if the transport fails
5449                          * the T_SVR4_OPTMGMT_REQ.
5450                          *
5451                          * XXX If SO_RCVBUF has been set and this is an
5452                          * XPG 4.2 application then do not ask the transport
5453                          * since the transport might adjust the value and not
5454                          * return exactly what was set by the application.
5455                          * For non-XPG 4.2 application we return the value
5456                          * that the transport is actually using.
5457                          */
5458                         lvalue = so->so_rcvbuf;
5459                         if (lvalue == 0) {
5460                                 mutex_exit(&so->so_lock);
5461                                 (void) strqget(RD(strvp2wq(vn)),
5462                                     QHIWAT, 0, &lvalue);
5463                                 mutex_enter(&so->so_lock);
5464                                 dprintso(so, 1,
5465                                     ("got SO_RCVBUF %ld from q\n", lvalue));
5466                         } else if (flags & _SOGETSOCKOPT_XPG4_2) {
5467                                 value = (int)lvalue;
5468                                 option = &value;
5469                                 goto copyout;   /* skip asking transport */
5470                         }
5471                         value = (int)lvalue;
5472                         option = &value;
5473                         len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5474                         break;
5475                 }
5476                 case SO_DOMAIN:
5477                         value = so->so_family;
5478                         option = &value;
5479                         goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5480 
5481 #ifdef notyet
5482                 /*
5483                  * We do not implement the semantics of these options
5484                  * thus we shouldn't implement the options either.
5485                  */
5486                 case SO_SNDLOWAT:
5487                         value = so->so_sndlowat;
5488                         option = &value;
5489                         break;
5490                 case SO_RCVLOWAT:
5491                         value = so->so_rcvlowat;
5492                         option = &value;
5493                         break;
5494 #endif /* notyet */
5495                 case SO_SNDTIMEO:
5496                 case SO_RCVTIMEO: {
5497                         clock_t val;
5498 
5499                         if (option_name == SO_RCVTIMEO)
5500                                 val = drv_hztousec(so->so_rcvtimeo);
5501                         else
5502                                 val = drv_hztousec(so->so_sndtimeo);
5503                         tmo_val.tv_sec = val / (1000 * 1000);
5504                         tmo_val.tv_usec = val % (1000 * 1000);
5505                         if (get_udatamodel() == DATAMODEL_NONE ||
5506                             get_udatamodel() == DATAMODEL_NATIVE) {
5507                                 option = &tmo_val;
5508                                 len = sizeof (struct timeval);
5509                         } else {
5510                                 TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5511                                 option = &tmo_val32;
5512                                 len = sizeof (struct timeval32);
5513                         }
5514                         break;
5515                 }
5516                 case SO_SND_BUFINFO: {
5517                         snd_bufinfo.sbi_wroff =
5518                             (so->so_proto_props).sopp_wroff;
5519                         snd_bufinfo.sbi_maxblk =
5520                             (so->so_proto_props).sopp_maxblk;
5521                         snd_bufinfo.sbi_maxpsz =
5522                             (so->so_proto_props).sopp_maxpsz;
5523                         snd_bufinfo.sbi_tail =
5524                             (so->so_proto_props).sopp_tail;
5525                         option = &snd_bufinfo;
5526                         len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5527                         break;
5528                 }
5529                 }
5530         }
5531 
5532         mutex_exit(&so->so_lock);
5533 
5534         /* Send request */
5535         optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5536         optmgmt_req.MGMT_flags = T_CHECK;
5537         optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5538         optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5539 
5540         oh.level = level;
5541         oh.name = option_name;
5542         oh.len = maxlen;
5543 
5544         mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5545             &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5546         /* Let option management work in the presence of data flow control */
5547         error = kstrputmsg(vn, mp, NULL, 0, 0,
5548             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5549         mp = NULL;
5550         mutex_enter(&so->so_lock);
5551         if (error) {
5552                 eprintsoline(so, error);
5553                 goto done2;
5554         }
5555         error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5556             (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5557         if (error) {
5558                 if (option != NULL) {
5559                         /* We have a fallback value */
5560                         error = 0;
5561                         goto copyout;
5562                 }
5563                 eprintsoline(so, error);
5564                 goto done2;
5565         }
5566         ASSERT(mp);
5567         optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5568         opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5569             optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5570         if (opt_res == NULL) {
5571                 if (option != NULL) {
5572                         /* We have a fallback value */
5573                         error = 0;
5574                         goto copyout;
5575                 }
5576                 error = EPROTO;
5577                 eprintsoline(so, error);
5578                 goto done;
5579         }
5580         option = &opt_res[1];
5581 
5582         /* check to ensure that the option is within bounds */
5583         if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5584             (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5585                 if (option != NULL) {
5586                         /* We have a fallback value */
5587                         error = 0;
5588                         goto copyout;
5589                 }
5590                 error = EPROTO;
5591                 eprintsoline(so, error);
5592                 goto done;
5593         }
5594 
5595         len = opt_res->len;
5596 
5597 copyout: {
5598                 t_uscalar_t size = MIN(len, maxlen);
5599                 bcopy(option, optval, size);
5600                 bcopy(&size, optlenp, sizeof (size));
5601         }
5602 done:
5603         freemsg(mp);
5604 done2:
5605         so_unlock_single(so, SOLOCKED);
5606         mutex_exit(&so->so_lock);
5607 
5608         return (error);
5609 }
5610 
5611 /*
5612  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5613  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5614  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5615  * setsockopt has to work even if the transport does not support the option.
5616  */
5617 /* ARGSUSED */
5618 int
5619 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5620     const void *optval, t_uscalar_t optlen, struct cred *cr)
5621 {
5622         struct T_optmgmt_req    optmgmt_req;
5623         struct opthdr           oh;
5624         mblk_t                  *mp;
5625         int                     error = 0;
5626         boolean_t               handled = B_FALSE;
5627 
5628         dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5629             (void *)so, level, option_name, optval, optlen,
5630             pr_state(so->so_state, so->so_mode)));
5631 
5632         /* X/Open requires this check */
5633         if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5634                 if (xnet_check_print)
5635                         printf("sockfs: X/Open setsockopt check => EINVAL\n");
5636                 return (EINVAL);
5637         }
5638 
5639         mutex_enter(&so->so_lock);
5640         so_lock_single(so);     /* Set SOLOCKED */
5641         mutex_exit(&so->so_lock);
5642 
5643         optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5644         optmgmt_req.MGMT_flags = T_NEGOTIATE;
5645         optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5646         optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5647 
5648         oh.level = level;
5649         oh.name = option_name;
5650         oh.len = optlen;
5651 
5652         mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5653             &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5654         /* Let option management work in the presence of data flow control */
5655         error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5656             MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5657         mp = NULL;
5658         mutex_enter(&so->so_lock);
5659         if (error) {
5660                 eprintsoline(so, error);
5661                 goto done2;
5662         }
5663         error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5664             (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5665         if (error) {
5666                 eprintsoline(so, error);
5667                 goto done;
5668         }
5669         ASSERT(mp);
5670         /* No need to verify T_optmgmt_ack */
5671         freemsg(mp);
5672 done:
5673         /*
5674          * Check for SOL_SOCKET options and record their values.
5675          * If we know about a SOL_SOCKET parameter and the transport
5676          * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5677          * EPROTO) we let the setsockopt succeed.
5678          */
5679         if (level == SOL_SOCKET) {
5680                 /* Check parameters */
5681                 switch (option_name) {
5682                 case SO_DEBUG:
5683                 case SO_REUSEADDR:
5684                 case SO_KEEPALIVE:
5685                 case SO_DONTROUTE:
5686                 case SO_BROADCAST:
5687                 case SO_USELOOPBACK:
5688                 case SO_OOBINLINE:
5689                 case SO_SNDBUF:
5690                 case SO_RCVBUF:
5691 #ifdef notyet
5692                 case SO_SNDLOWAT:
5693                 case SO_RCVLOWAT:
5694 #endif /* notyet */
5695                 case SO_DGRAM_ERRIND:
5696                         if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5697                                 error = EINVAL;
5698                                 eprintsoline(so, error);
5699                                 goto done2;
5700                         }
5701                         ASSERT(optval);
5702                         handled = B_TRUE;
5703                         break;
5704                 case SO_SNDTIMEO:
5705                 case SO_RCVTIMEO:
5706                         if (get_udatamodel() == DATAMODEL_NONE ||
5707                             get_udatamodel() == DATAMODEL_NATIVE) {
5708                                 if (optlen != sizeof (struct timeval)) {
5709                                         error = EINVAL;
5710                                         eprintsoline(so, error);
5711                                         goto done2;
5712                                 }
5713                         } else {
5714                                 if (optlen != sizeof (struct timeval32)) {
5715                                         error = EINVAL;
5716                                         eprintsoline(so, error);
5717                                         goto done2;
5718                                 }
5719                         }
5720                         ASSERT(optval);
5721                         handled = B_TRUE;
5722                         break;
5723                 case SO_LINGER:
5724                         if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5725                                 error = EINVAL;
5726                                 eprintsoline(so, error);
5727                                 goto done2;
5728                         }
5729                         ASSERT(optval);
5730                         handled = B_TRUE;
5731                         break;
5732                 }
5733 
5734 #define intvalue        (*(int32_t *)optval)
5735 
5736                 switch (option_name) {
5737                 case SO_TYPE:
5738                 case SO_ERROR:
5739                 case SO_ACCEPTCONN:
5740                         /* Can't be set */
5741                         error = ENOPROTOOPT;
5742                         goto done2;
5743                 case SO_LINGER: {
5744                         struct linger *l = (struct linger *)optval;
5745 
5746                         so->so_linger.l_linger = l->l_linger;
5747                         if (l->l_onoff) {
5748                                 so->so_linger.l_onoff = SO_LINGER;
5749                                 so->so_options |= SO_LINGER;
5750                         } else {
5751                                 so->so_linger.l_onoff = 0;
5752                                 so->so_options &= ~SO_LINGER;
5753                         }
5754                         break;
5755                 }
5756 
5757                 case SO_DEBUG:
5758 #ifdef SOCK_TEST
5759                         if (intvalue & 2)
5760                                 sock_test_timelimit = 10 * hz;
5761                         else
5762                                 sock_test_timelimit = 0;
5763 
5764                         if (intvalue & 4)
5765                                 do_useracc = 0;
5766                         else
5767                                 do_useracc = 1;
5768 #endif /* SOCK_TEST */
5769                         /* FALLTHRU */
5770                 case SO_REUSEADDR:
5771                 case SO_KEEPALIVE:
5772                 case SO_DONTROUTE:
5773                 case SO_BROADCAST:
5774                 case SO_USELOOPBACK:
5775                 case SO_OOBINLINE:
5776                 case SO_DGRAM_ERRIND:
5777                         if (intvalue != 0) {
5778                                 dprintso(so, 1,
5779                                     ("socket_setsockopt: setting 0x%x\n",
5780                                     option_name));
5781                                 so->so_options |= option_name;
5782                         } else {
5783                                 dprintso(so, 1,
5784                                     ("socket_setsockopt: clearing 0x%x\n",
5785                                     option_name));
5786                                 so->so_options &= ~option_name;
5787                         }
5788                         break;
5789                 /*
5790                  * The following options are only returned by us when the
5791                  * transport layer fails.
5792                  * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5793                  * since the transport might adjust the value and not
5794                  * return exactly what was set by the application.
5795                  */
5796                 case SO_SNDBUF:
5797                         so->so_sndbuf = intvalue;
5798                         break;
5799                 case SO_RCVBUF:
5800                         so->so_rcvbuf = intvalue;
5801                         break;
5802                 case SO_RCVPSH:
5803                         so->so_rcv_timer_interval = intvalue;
5804                         break;
5805 #ifdef notyet
5806                 /*
5807                  * We do not implement the semantics of these options
5808                  * thus we shouldn't implement the options either.
5809                  */
5810                 case SO_SNDLOWAT:
5811                         so->so_sndlowat = intvalue;
5812                         break;
5813                 case SO_RCVLOWAT:
5814                         so->so_rcvlowat = intvalue;
5815                         break;
5816 #endif /* notyet */
5817                 case SO_SNDTIMEO:
5818                 case SO_RCVTIMEO: {
5819                         struct timeval tl;
5820                         clock_t val;
5821 
5822                         if (get_udatamodel() == DATAMODEL_NONE ||
5823                             get_udatamodel() == DATAMODEL_NATIVE)
5824                                 bcopy(&tl, (struct timeval *)optval,
5825                                     sizeof (struct timeval));
5826                         else
5827                                 TIMEVAL32_TO_TIMEVAL(&tl,
5828                                     (struct timeval32 *)optval);
5829                         val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5830                         if (option_name == SO_RCVTIMEO)
5831                                 so->so_rcvtimeo = drv_usectohz(val);
5832                         else
5833                                 so->so_sndtimeo = drv_usectohz(val);
5834                         break;
5835                 }
5836                 }
5837 #undef  intvalue
5838 
5839                 if (error) {
5840                         if ((error == ENOPROTOOPT || error == EPROTO ||
5841                             error == EINVAL) && handled) {
5842                                 dprintso(so, 1,
5843                                     ("setsockopt: ignoring error %d for 0x%x\n",
5844                                     error, option_name));
5845                                 error = 0;
5846                         }
5847                 }
5848         }
5849 done2:
5850         so_unlock_single(so, SOLOCKED);
5851         mutex_exit(&so->so_lock);
5852         return (error);
5853 }
5854 
5855 /*
5856  * sotpi_close() is called when the last open reference goes away.
5857  */
5858 /* ARGSUSED */
5859 int
5860 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5861 {
5862         struct vnode *vp = SOTOV(so);
5863         dev_t dev;
5864         int error = 0;
5865         sotpi_info_t *sti = SOTOTPI(so);
5866 
5867         dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5868             (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5869 
5870         dev = sti->sti_dev;
5871 
5872         ASSERT(STREAMSTAB(getmajor(dev)));
5873 
5874         mutex_enter(&so->so_lock);
5875         so_lock_single(so);     /* Set SOLOCKED */
5876 
5877         ASSERT(so_verify_oobstate(so));
5878 
5879         if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5880                 sti->sti_nl7c_flags = 0;
5881                 nl7c_close(so);
5882         }
5883 
5884         if (vp->v_stream != NULL) {
5885                 vnode_t *ux_vp;
5886 
5887                 if (so->so_family == AF_UNIX) {
5888                         /* Could avoid this when CANTSENDMORE for !dgram */
5889                         so_unix_close(so);
5890                 }
5891 
5892                 mutex_exit(&so->so_lock);
5893                 /*
5894                  * Disassemble the linkage from the AF_UNIX underlying file
5895                  * system vnode to this socket (by atomically clearing
5896                  * v_stream in vn_rele_stream) before strclose clears sd_vnode
5897                  * and frees the stream head.
5898                  */
5899                 if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5900                         ASSERT(ux_vp->v_stream);
5901                         sti->sti_ux_bound_vp = NULL;
5902                         vn_rele_stream(ux_vp);
5903                 }
5904                 error = strclose(vp, flag, cr);
5905                 vp->v_stream = NULL;
5906                 mutex_enter(&so->so_lock);
5907         }
5908 
5909         /*
5910          * Flush the T_DISCON_IND on sti_discon_ind_mp.
5911          */
5912         so_flush_discon_ind(so);
5913 
5914         so_unlock_single(so, SOLOCKED);
5915         mutex_exit(&so->so_lock);
5916 
5917         /*
5918          * Needed for STREAMs.
5919          * Decrement the device driver's reference count for streams
5920          * opened via the clone dip. The driver was held in clone_open().
5921          * The absence of clone_close() forces this asymmetry.
5922          */
5923         if (so->so_flag & SOCLONE)
5924                 ddi_rele_driver(getmajor(dev));
5925 
5926         return (error);
5927 }
5928 
5929 static int
5930 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5931     struct cred *cr, int32_t *rvalp)
5932 {
5933         struct vnode *vp = SOTOV(so);
5934         sotpi_info_t *sti = SOTOTPI(so);
5935         int error = 0;
5936 
5937         dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5938             cmd, arg, pr_state(so->so_state, so->so_mode)));
5939 
5940         switch (cmd) {
5941         case SIOCSQPTR:
5942                 /*
5943                  * SIOCSQPTR is valid only when helper stream is created
5944                  * by the protocol.
5945                  */
5946         case _I_INSERT:
5947         case _I_REMOVE:
5948                 /*
5949                  * Since there's no compelling reason to support these ioctls
5950                  * on sockets, and doing so would increase the complexity
5951                  * markedly, prevent it.
5952                  */
5953                 return (EOPNOTSUPP);
5954 
5955         case I_FIND:
5956         case I_LIST:
5957         case I_LOOK:
5958         case I_POP:
5959         case I_PUSH:
5960                 /*
5961                  * To prevent races and inconsistencies between the actual
5962                  * state of the stream and the state according to the sonode,
5963                  * we serialize all operations which modify or operate on the
5964                  * list of modules on the socket's stream.
5965                  */
5966                 mutex_enter(&sti->sti_plumb_lock);
5967                 error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5968                 mutex_exit(&sti->sti_plumb_lock);
5969                 return (error);
5970 
5971         default:
5972                 if (so->so_version != SOV_STREAM)
5973                         break;
5974 
5975                 /*
5976                  * The imaginary "sockmod" has been popped; act as a stream.
5977                  */
5978                 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5979         }
5980 
5981         ASSERT(so->so_version != SOV_STREAM);
5982 
5983         /*
5984          * Process socket-specific ioctls.
5985          */
5986         switch (cmd) {
5987         case FIONBIO: {
5988                 int32_t value;
5989 
5990                 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5991                     (mode & (int)FKIOCTL)))
5992                         return (EFAULT);
5993 
5994                 mutex_enter(&so->so_lock);
5995                 if (value) {
5996                         so->so_state |= SS_NDELAY;
5997                 } else {
5998                         so->so_state &= ~SS_NDELAY;
5999                 }
6000                 mutex_exit(&so->so_lock);
6001                 return (0);
6002         }
6003 
6004         case FIOASYNC: {
6005                 int32_t value;
6006 
6007                 if (so_copyin((void *)arg, &value, sizeof (int32_t),
6008                     (mode & (int)FKIOCTL)))
6009                         return (EFAULT);
6010 
6011                 mutex_enter(&so->so_lock);
6012                 /*
6013                  * SS_ASYNC flag not already set correctly?
6014                  * (!value != !(so->so_state & SS_ASYNC))
6015                  * but some engineers find that too hard to read.
6016                  */
6017                 if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
6018                     value != 0 && (so->so_state & SS_ASYNC) == 0)
6019                         error = so_flip_async(so, vp, mode, cr);
6020                 mutex_exit(&so->so_lock);
6021                 return (error);
6022         }
6023 
6024         case SIOCSPGRP:
6025         case FIOSETOWN: {
6026                 pid_t pgrp;
6027 
6028                 if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
6029                     (mode & (int)FKIOCTL)))
6030                         return (EFAULT);
6031 
6032                 mutex_enter(&so->so_lock);
6033                 dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
6034                 /* Any change? */
6035                 if (pgrp != so->so_pgrp)
6036                         error = so_set_siggrp(so, vp, pgrp, mode, cr);
6037                 mutex_exit(&so->so_lock);
6038                 return (error);
6039         }
6040         case SIOCGPGRP:
6041         case FIOGETOWN:
6042                 if (so_copyout(&so->so_pgrp, (void *)arg,
6043                     sizeof (pid_t), (mode & (int)FKIOCTL)))
6044                         return (EFAULT);
6045                 return (0);
6046 
6047         case SIOCATMARK: {
6048                 int retval;
6049                 uint_t so_state;
6050 
6051                 /*
6052                  * strwaitmark has a finite timeout after which it
6053                  * returns -1 if the mark state is undetermined.
6054                  * In order to avoid any race between the mark state
6055                  * in sockfs and the mark state in the stream head this
6056                  * routine loops until the mark state can be determined
6057                  * (or the urgent data indication has been removed by some
6058                  * other thread).
6059                  */
6060                 do {
6061                         mutex_enter(&so->so_lock);
6062                         so_state = so->so_state;
6063                         mutex_exit(&so->so_lock);
6064                         if (so_state & SS_RCVATMARK) {
6065                                 retval = 1;
6066                         } else if (!(so_state & SS_OOBPEND)) {
6067                                 /*
6068                                  * No SIGURG has been generated -- there is no
6069                                  * pending or present urgent data. Thus can't
6070                                  * possibly be at the mark.
6071                                  */
6072                                 retval = 0;
6073                         } else {
6074                                 /*
6075                                  * Have the stream head wait until there is
6076                                  * either some messages on the read queue, or
6077                                  * STRATMARK or STRNOTATMARK gets set. The
6078                                  * STRNOTATMARK flag is used so that the
6079                                  * transport can send up a MSGNOTMARKNEXT
6080                                  * M_DATA to indicate that it is not
6081                                  * at the mark and additional data is not about
6082                                  * to be send upstream.
6083                                  *
6084                                  * If the mark state is undetermined this will
6085                                  * return -1 and we will loop rechecking the
6086                                  * socket state.
6087                                  */
6088                                 retval = strwaitmark(vp);
6089                         }
6090                 } while (retval == -1);
6091 
6092                 if (so_copyout(&retval, (void *)arg, sizeof (int),
6093                     (mode & (int)FKIOCTL)))
6094                         return (EFAULT);
6095                 return (0);
6096         }
6097 
6098         case I_FDINSERT:
6099         case I_SENDFD:
6100         case I_RECVFD:
6101         case I_ATMARK:
6102         case _SIOCSOCKFALLBACK:
6103                 /*
6104                  * These ioctls do not apply to sockets. I_FDINSERT can be
6105                  * used to send M_PROTO messages without modifying the socket
6106                  * state. I_SENDFD/RECVFD should not be used for socket file
6107                  * descriptor passing since they assume a twisted stream.
6108                  * SIOCATMARK must be used instead of I_ATMARK.
6109                  *
6110                  * _SIOCSOCKFALLBACK from an application should never be
6111                  * processed.  It is only generated by socktpi_open() or
6112                  * in response to I_POP or I_PUSH.
6113                  */
6114 #ifdef DEBUG
6115                 zcmn_err(getzoneid(), CE_WARN,
6116                     "Unsupported STREAMS ioctl 0x%x on socket. "
6117                     "Pid = %d\n", cmd, curproc->p_pid);
6118 #endif /* DEBUG */
6119                 return (EOPNOTSUPP);
6120 
6121         case _I_GETPEERCRED:
6122                 if ((mode & FKIOCTL) == 0)
6123                         return (EINVAL);
6124 
6125                 mutex_enter(&so->so_lock);
6126                 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6127                         error = ENOTSUP;
6128                 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
6129                         error = ENOTCONN;
6130                 } else if (so->so_peercred != NULL) {
6131                         k_peercred_t *kp = (k_peercred_t *)arg;
6132                         kp->pc_cr = so->so_peercred;
6133                         kp->pc_cpid = so->so_cpid;
6134                         crhold(so->so_peercred);
6135                 } else {
6136                         error = EINVAL;
6137                 }
6138                 mutex_exit(&so->so_lock);
6139                 return (error);
6140 
6141         default:
6142                 /*
6143                  * Do the higher-order bits of the ioctl cmd indicate
6144                  * that it is an I_* streams ioctl?
6145                  */
6146                 if ((cmd & 0xffffff00U) == STR &&
6147                     so->so_version == SOV_SOCKBSD) {
6148 #ifdef DEBUG
6149                         zcmn_err(getzoneid(), CE_WARN,
6150                             "Unsupported STREAMS ioctl 0x%x on socket. "
6151                             "Pid = %d\n", cmd, curproc->p_pid);
6152 #endif /* DEBUG */
6153                         return (EOPNOTSUPP);
6154                 }
6155                 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6156         }
6157 }
6158 
6159 /*
6160  * Handle plumbing-related ioctls.
6161  */
6162 static int
6163 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6164     struct cred *cr, int32_t *rvalp)
6165 {
6166         static const char sockmod_name[] = "sockmod";
6167         struct sonode   *so = VTOSO(vp);
6168         char            mname[FMNAMESZ + 1];
6169         int             error;
6170         sotpi_info_t    *sti = SOTOTPI(so);
6171 
6172         ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6173 
6174         if (so->so_version == SOV_SOCKBSD)
6175                 return (EOPNOTSUPP);
6176 
6177         if (so->so_version == SOV_STREAM) {
6178                 /*
6179                  * The imaginary "sockmod" has been popped - act as a stream.
6180                  * If this is a push of sockmod then change back to a socket.
6181                  */
6182                 if (cmd == I_PUSH) {
6183                         error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6184                             (void *)arg, mname, sizeof (mname), NULL);
6185 
6186                         if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6187                                 dprintso(so, 0, ("socktpi_ioctl: going to "
6188                                     "socket version\n"));
6189                                 so_stream2sock(so);
6190                                 return (0);
6191                         }
6192                 }
6193                 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6194         }
6195 
6196         switch (cmd) {
6197         case I_PUSH:
6198                 if (sti->sti_direct) {
6199                         mutex_enter(&so->so_lock);
6200                         so_lock_single(so);
6201                         mutex_exit(&so->so_lock);
6202 
6203                         error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6204                             cr, rvalp);
6205 
6206                         mutex_enter(&so->so_lock);
6207                         if (error == 0)
6208                                 sti->sti_direct = 0;
6209                         so_unlock_single(so, SOLOCKED);
6210                         mutex_exit(&so->so_lock);
6211 
6212                         if (error != 0)
6213                                 return (error);
6214                 }
6215 
6216                 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6217                 if (error == 0)
6218                         sti->sti_pushcnt++;
6219                 return (error);
6220 
6221         case I_POP:
6222                 if (sti->sti_pushcnt == 0) {
6223                         /* Emulate sockmod being popped */
6224                         dprintso(so, 0,
6225                             ("socktpi_ioctl: going to STREAMS version\n"));
6226                         return (so_sock2stream(so));
6227                 }
6228 
6229                 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6230                 if (error == 0)
6231                         sti->sti_pushcnt--;
6232                 return (error);
6233 
6234         case I_LIST: {
6235                 struct str_mlist *kmlistp, *umlistp;
6236                 struct str_list kstrlist;
6237                 ssize_t         kstrlistsize;
6238                 int             i, nmods;
6239 
6240                 STRUCT_DECL(str_list, ustrlist);
6241                 STRUCT_INIT(ustrlist, mode);
6242 
6243                 if (arg == 0) {
6244                         error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6245                         if (error == 0)
6246                                 (*rvalp)++;     /* Add one for sockmod */
6247                         return (error);
6248                 }
6249 
6250                 error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6251                     STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6252                 if (error != 0)
6253                         return (error);
6254 
6255                 nmods = STRUCT_FGET(ustrlist, sl_nmods);
6256                 if (nmods <= 0)
6257                         return (EINVAL);
6258                 /*
6259                  * Ceiling nmods at nstrpush to prevent someone from
6260                  * maliciously consuming lots of kernel memory.
6261                  */
6262                 nmods = MIN(nmods, nstrpush);
6263 
6264                 kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6265                 kstrlist.sl_nmods = nmods;
6266                 kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6267 
6268                 error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6269                     cr, rvalp);
6270                 if (error != 0)
6271                         goto done;
6272 
6273                 /*
6274                  * Considering the module list as a 0-based array of sl_nmods
6275                  * modules, sockmod should conceptually exist at slot
6276                  * sti_pushcnt.  Insert sockmod at this location by sliding all
6277                  * of the module names after so_pushcnt over by one.  We know
6278                  * that there will be room to do this since we allocated
6279                  * sl_modlist with an additional slot.
6280                  */
6281                 for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6282                         kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6283 
6284                 (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6285                 kstrlist.sl_nmods++;
6286 
6287                 /*
6288                  * Copy all of the entries out to ustrlist.
6289                  */
6290                 kmlistp = kstrlist.sl_modlist;
6291                 umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6292                 for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6293                         error = so_copyout(kmlistp++, umlistp++,
6294                             sizeof (struct str_mlist), mode & FKIOCTL);
6295                         if (error != 0)
6296                                 goto done;
6297                 }
6298 
6299                 error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6300                     mode & FKIOCTL);
6301                 if (error == 0)
6302                         *rvalp = 0;
6303         done:
6304                 kmem_free(kstrlist.sl_modlist, kstrlistsize);
6305                 return (error);
6306         }
6307         case I_LOOK:
6308                 if (sti->sti_pushcnt == 0) {
6309                         return (so_copyout(sockmod_name, (void *)arg,
6310                             sizeof (sockmod_name), mode & FKIOCTL));
6311                 }
6312                 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6313 
6314         case I_FIND:
6315                 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6316                 if (error && error != EINVAL)
6317                         return (error);
6318 
6319                 /* if not found and string was sockmod return 1 */
6320                 if (*rvalp == 0 || error == EINVAL) {
6321                         error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6322                             (void *)arg, mname, sizeof (mname), NULL);
6323                         if (error == ENAMETOOLONG)
6324                                 error = EINVAL;
6325 
6326                         if (error == 0 && strcmp(mname, sockmod_name) == 0)
6327                                 *rvalp = 1;
6328                 }
6329                 return (error);
6330 
6331         default:
6332                 panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6333                 break;
6334         }
6335 
6336         return (0);
6337 }
6338 
6339 /*
6340  * Wrapper around the streams poll routine that implements socket poll
6341  * semantics.
 * The sockfs never calls pollwakeup itself - the stream head takes care
 * of all pollwakeups. Since sockfs never holds so_lock when calling the
6344  * stream head there can never be a deadlock due to holding so_lock across
6345  * pollwakeup and acquiring so_lock in this routine.
6346  *
6347  * However, since the performance of VOP_POLL is critical we avoid
6348  * acquiring so_lock here. This is based on two assumptions:
6349  *  - The poll implementation holds locks to serialize the VOP_POLL call
6350  *    and a pollwakeup for the same pollhead. This ensures that should
6351  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6352  *    (which strsock_* and strrput conspire to issue) is issued after
6353  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6354  *    returned and then wake up poll and have it call VOP_POLL again.
6355  *  - The reading of so_state without holding so_lock does not result in
6356  *    stale data that is older than the latest state change that has dropped
6357  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6358  *    memory barrier to force the data into the coherency domain.
6359  */
6360 static int
6361 sotpi_poll(
6362         struct sonode   *so,
6363         short           events,
6364         int             anyyet,
6365         short           *reventsp,
6366         struct pollhead **phpp)
6367 {
6368         short origevents = events;
6369         struct vnode *vp = SOTOV(so);
6370         int error;
6371         int so_state = so->so_state; /* snapshot */
6372         sotpi_info_t *sti = SOTOTPI(so);
6373 
6374         dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6375             (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6376 
6377         ASSERT(vp->v_type == VSOCK);
6378         ASSERT(vp->v_stream != NULL);
6379 
6380         if (so->so_version == SOV_STREAM) {
6381                 /* The imaginary "sockmod" has been popped - act as a stream */
6382                 return (strpoll(vp->v_stream, events, anyyet,
6383                     reventsp, phpp));
6384         }
6385 
6386         if (!(so_state & SS_ISCONNECTED) &&
6387             (so->so_mode & SM_CONNREQUIRED)) {
6388                 /* Not connected yet - turn off write side events */
6389                 events &= ~(POLLOUT|POLLWRBAND);
6390         }
6391         /*
6392          * Check for errors without calling strpoll if the caller wants them.
6393          * In sockets the errors are represented as input/output events
6394          * and there is no need to ask the stream head for this information.
6395          */
6396         if (so->so_error != 0 &&
6397             ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6398                 *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6399                 return (0);
6400         }
6401         /*
6402          * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
         * These messages with only an M_PROTO/M_PCPROTO part and no M_DATA
6404          * will not trigger a POLLIN event with POLLRDDATA set.
6405          * The handling of urgent data (causing POLLRDBAND) is done by
6406          * inspecting SS_OOBPEND below.
6407          */
6408         events |= POLLRDDATA;
6409 
6410         /*
6411          * After shutdown(output) a stream head write error is set.
6412          * However, we should not return output events.
6413          */
6414         events |= POLLNOERR;
6415         error = strpoll(vp->v_stream, events, anyyet,
6416             reventsp, phpp);
6417         if (error)
6418                 return (error);
6419 
6420         ASSERT(!(*reventsp & POLLERR));
6421 
6422         /*
6423          * Notes on T_CONN_IND handling for sockets.
6424          *
6425          * If strpoll() returned without events, SR_POLLIN is guaranteed
6426          * to be set, ensuring any subsequent strrput() runs pollwakeup().
6427          *
6428          * Since the so_lock is not held, soqueueconnind() may have run
6429          * and a T_CONN_IND may be waiting. We now check for any queued
6430          * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6431          * to ensure poll returns.
6432          *
6433          * However:
6434          * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6435          * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6436          * the following actions will occur; taken together they ensure the
6437          * syscall will return.
6438          *
6439          * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6440          *    the accept() was run on a non-blocking socket sowaitconnind()
6441          *    may have already returned EWOULDBLOCK, so not be waiting to
6442          *    process the message. Additionally socktpi_poll() has probably
6443          *    proceeded past the sti_conn_ind_head check below.
6444          * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6445          *    this thread,  however that could occur before poll_common()
6446          *    has entered cv_wait.
6447          * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6448          *
6449          * Before proceeding to cv_wait() in poll_common() for an event,
6450          * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6451          * and if set, re-calls strpoll() to ensure the late arriving
6452          * T_CONN_IND is recognized, and pollsys() returns.
6453          */
6454 
6455         if (sti->sti_conn_ind_head != NULL)
6456                 *reventsp |= (POLLIN|POLLRDNORM) & events;
6457 
6458         if (so->so_state & SS_CANTRCVMORE) {
6459                 *reventsp |= POLLRDHUP & events;
6460 
6461                 if (so->so_state & SS_CANTSENDMORE)
6462                         *reventsp |= POLLHUP;
6463         }
6464 
6465         if (so->so_state & SS_OOBPEND)
6466                 *reventsp |= POLLRDBAND & events;
6467 
6468         if (sti->sti_nl7c_rcv_mp != NULL) {
6469                 *reventsp |= (POLLIN|POLLRDNORM) & events;
6470         }
6471         if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6472             ((POLLIN|POLLRDNORM) & *reventsp)) {
6473                 sti->sti_nl7c_flags |= NL7C_POLLIN;
6474         }
6475 
6476         return (0);
6477 }
6478 
6479 /*ARGSUSED*/
6480 static int
6481 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6482 {
6483         sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6484         int error = 0;
6485 
6486         error = sonode_constructor(buf, cdrarg, kmflags);
6487         if (error != 0)
6488                 return (error);
6489 
6490         error = i_sotpi_info_constructor(&st->st_info);
6491         if (error != 0)
6492                 sonode_destructor(buf, cdrarg);
6493 
6494         st->st_sonode.so_priv = &st->st_info;
6495 
6496         return (error);
6497 }
6498 
6499 /*ARGSUSED1*/
6500 static void
6501 socktpi_destructor(void *buf, void *cdrarg)
6502 {
6503         sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6504 
6505         ASSERT(st->st_sonode.so_priv == &st->st_info);
6506         st->st_sonode.so_priv = NULL;
6507 
6508         i_sotpi_info_destructor(&st->st_info);
6509         sonode_destructor(buf, cdrarg);
6510 }
6511 
6512 static int
6513 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6514 {
6515         int retval;
6516 
6517         if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6518                 struct sonode *so = (struct sonode *)buf;
6519                 sotpi_info_t *sti = SOTOTPI(so);
6520 
6521                 mutex_enter(&socklist.sl_lock);
6522 
6523                 sti->sti_next_so = socklist.sl_list;
6524                 sti->sti_prev_so = NULL;
6525                 if (sti->sti_next_so != NULL)
6526                         SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6527                 socklist.sl_list = so;
6528 
6529                 mutex_exit(&socklist.sl_lock);
6530 
6531         }
6532         return (retval);
6533 }
6534 
6535 static void
6536 socktpi_unix_destructor(void *buf, void *cdrarg)
6537 {
6538         struct sonode   *so = (struct sonode *)buf;
6539         sotpi_info_t    *sti = SOTOTPI(so);
6540 
6541         mutex_enter(&socklist.sl_lock);
6542 
6543         if (sti->sti_next_so != NULL)
6544                 SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6545         if (sti->sti_prev_so != NULL)
6546                 SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6547         else
6548                 socklist.sl_list = sti->sti_next_so;
6549 
6550         mutex_exit(&socklist.sl_lock);
6551 
6552         socktpi_destructor(buf, cdrarg);
6553 }
6554 
6555 int
6556 socktpi_init(void)
6557 {
6558         /*
6559          * Create sonode caches.  We create a special one for AF_UNIX so
6560          * that we can track them for netstat(8).
6561          */
6562         socktpi_cache = kmem_cache_create("socktpi_cache",
6563             sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6564             socktpi_destructor, NULL, NULL, NULL, 0);
6565 
6566         socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6567             sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6568             socktpi_unix_destructor, NULL, NULL, NULL, 0);
6569 
6570         return (0);
6571 }
6572 
6573 /*
6574  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6575  *
6576  * Caller must still update state and mode using sotpi_update_state().
6577  */
6578 int
6579 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6580     boolean_t *direct, queue_t **qp, struct cred *cr)
6581 {
6582         sotpi_info_t *sti;
6583         struct sockparams *origsp = so->so_sockparams;
6584         sock_lower_handle_t handle = so->so_proto_handle;
6585         struct stdata *stp;
6586         struct vnode *vp;
6587         queue_t *q;
6588         int error = 0;
6589 
6590         ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6591             SS_FALLBACK_PENDING);
6592         ASSERT(SOCK_IS_NONSTR(so));
6593 
6594         *qp = NULL;
6595         *direct = B_FALSE;
6596         so->so_sockparams = newsp;
6597         /*
6598          * Allocate and initalize fields required by TPI.
6599          */
6600         (void) sotpi_info_create(so, KM_SLEEP);
6601         sotpi_info_init(so);
6602 
6603         if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6604                 sotpi_info_fini(so);
6605                 sotpi_info_destroy(so);
6606                 return (error);
6607         }
6608         ASSERT(handle == so->so_proto_handle);
6609         sti = SOTOTPI(so);
6610         if (sti->sti_direct != 0)
6611                 *direct = B_TRUE;
6612 
6613         /*
6614          * Keep the original sp around so we can properly dispose of the
6615          * sonode when the socket is being closed.
6616          */
6617         sti->sti_orig_sp = origsp;
6618 
6619         so_basic_strinit(so);   /* skips the T_CAPABILITY_REQ */
6620         so_alloc_addr(so, so->so_max_addr_len);
6621 
6622         /*
6623          * If the application has done a SIOCSPGRP, make sure the
6624          * STREAM head is aware. This needs to take place before
6625          * the protocol start sending up messages. Otherwise we
6626          * might miss to generate SIGPOLL.
6627          *
6628          * It is possible that the application will receive duplicate
6629          * signals if some were already generated for either data or
6630          * connection indications.
6631          */
6632         if (so->so_pgrp != 0) {
6633                 if (so_set_events(so, so->so_vnode, cr) != 0)
6634                         so->so_pgrp = 0;
6635         }
6636 
6637         /*
6638          * Determine which queue to use.
6639          */
6640         vp = SOTOV(so);
6641         stp = vp->v_stream;
6642         ASSERT(stp != NULL);
6643         q = stp->sd_wrq->q_next;
6644 
6645         /*
6646          * Skip any modules that may have been auto pushed when the device
6647          * was opened
6648          */
6649         while (q->q_next != NULL)
6650                 q = q->q_next;
6651         *qp = _RD(q);
6652 
6653         /* This is now a STREAMS sockets */
6654         so->so_not_str = B_FALSE;
6655 
6656         return (error);
6657 }
6658 
6659 /*
6660  * Revert a TPI sonode. It is only allowed to revert the sonode during
6661  * the fallback process.
6662  */
6663 void
6664 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6665 {
6666         vnode_t *vp = SOTOV(so);
6667 
6668         ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6669             SS_FALLBACK_PENDING);
6670         ASSERT(!SOCK_IS_NONSTR(so));
6671         ASSERT(vp->v_stream != NULL);
6672 
6673         strclean(vp);
6674         (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6675 
6676         /*
6677          * Restore the original sockparams. The caller is responsible for
6678          * dropping the ref to the new sp.
6679          */
6680         so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6681 
6682         sotpi_info_fini(so);
6683         sotpi_info_destroy(so);
6684 
6685         /* This is no longer a STREAMS sockets */
6686         so->so_not_str = B_TRUE;
6687 }
6688 
6689 void
6690 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6691     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6692     socklen_t faddrlen, short opts)
6693 {
6694         sotpi_info_t *sti = SOTOTPI(so);
6695 
6696         so_proc_tcapability_ack(so, tcap);
6697 
6698         so->so_options |= opts;
6699 
6700         /*
6701          * Determine whether the foreign and local address are valid
6702          */
6703         if (laddrlen != 0) {
6704                 ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6705                 sti->sti_laddr_len = laddrlen;
6706                 bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6707                 sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6708         }
6709 
6710         if (faddrlen != 0) {
6711                 ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6712                 sti->sti_faddr_len = faddrlen;
6713                 bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6714                 sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6715         }
6716 
6717 }
6718 
6719 /*
6720  * Allocate enough space to cache the local and foreign addresses.
6721  */
6722 void
6723 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6724 {
6725         sotpi_info_t *sti = SOTOTPI(so);
6726 
6727         ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6728         ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6729         sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6730             P2ROUNDUP(maxlen, KMEM_ALIGN);
6731         so->so_max_addr_len = sti->sti_laddr_maxlen;
6732         sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6733         sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6734             + sti->sti_laddr_maxlen);
6735 
6736         if (so->so_family == AF_UNIX) {
6737                 /*
6738                  * Initialize AF_UNIX related fields.
6739                  */
6740                 bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6741                 bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6742         }
6743 }
6744 
6745 
6746 sotpi_info_t *
6747 sotpi_sototpi(struct sonode *so)
6748 {
6749         sotpi_info_t *sti;
6750 
6751         ASSERT(so != NULL);
6752 
6753         sti = (sotpi_info_t *)so->so_priv;
6754 
6755         ASSERT(sti != NULL);
6756         ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6757 
6758         return (sti);
6759 }
6760 
6761 static int
6762 i_sotpi_info_constructor(sotpi_info_t *sti)
6763 {
6764         sti->sti_magic               = SOTPI_INFO_MAGIC;
6765         sti->sti_ack_mp              = NULL;
6766         sti->sti_discon_ind_mp       = NULL;
6767         sti->sti_ux_bound_vp = NULL;
6768         sti->sti_unbind_mp   = NULL;
6769 
6770         sti->sti_conn_ind_head       = NULL;
6771         sti->sti_conn_ind_tail       = NULL;
6772 
6773         sti->sti_laddr_sa    = NULL;
6774         sti->sti_faddr_sa    = NULL;
6775 
6776         sti->sti_nl7c_flags  = 0;
6777         sti->sti_nl7c_uri    = NULL;
6778         sti->sti_nl7c_rcv_mp = NULL;
6779 
6780         mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6781         cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6782 
6783         return (0);
6784 }
6785 
6786 static void
6787 i_sotpi_info_destructor(sotpi_info_t *sti)
6788 {
6789         ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6790         ASSERT(sti->sti_ack_mp == NULL);
6791         ASSERT(sti->sti_discon_ind_mp == NULL);
6792         ASSERT(sti->sti_ux_bound_vp == NULL);
6793         ASSERT(sti->sti_unbind_mp == NULL);
6794 
6795         ASSERT(sti->sti_conn_ind_head == NULL);
6796         ASSERT(sti->sti_conn_ind_tail == NULL);
6797 
6798         ASSERT(sti->sti_laddr_sa == NULL);
6799         ASSERT(sti->sti_faddr_sa == NULL);
6800 
6801         ASSERT(sti->sti_nl7c_flags == 0);
6802         ASSERT(sti->sti_nl7c_uri == NULL);
6803         ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6804 
6805         mutex_destroy(&sti->sti_plumb_lock);
6806         cv_destroy(&sti->sti_ack_cv);
6807 }
6808 
6809 /*
6810  * Creates and attaches TPI information to the given sonode
6811  */
6812 static boolean_t
6813 sotpi_info_create(struct sonode *so, int kmflags)
6814 {
6815         sotpi_info_t *sti;
6816 
6817         ASSERT(so->so_priv == NULL);
6818 
6819         if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6820                 return (B_FALSE);
6821 
6822         if (i_sotpi_info_constructor(sti) != 0) {
6823                 kmem_free(sti, sizeof (*sti));
6824                 return (B_FALSE);
6825         }
6826 
6827         so->so_priv = (void *)sti;
6828         return (B_TRUE);
6829 }
6830 
6831 /*
6832  * Initializes the TPI information.
6833  */
6834 static void
6835 sotpi_info_init(struct sonode *so)
6836 {
6837         struct vnode *vp = SOTOV(so);
6838         sotpi_info_t *sti = SOTOTPI(so);
6839         time_t now;
6840 
6841         sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6842         vp->v_rdev   = sti->sti_dev;
6843 
6844         sti->sti_orig_sp = NULL;
6845 
6846         sti->sti_pushcnt = 0;
6847 
6848         now = gethrestime_sec();
6849         sti->sti_atime       = now;
6850         sti->sti_mtime       = now;
6851         sti->sti_ctime       = now;
6852 
6853         sti->sti_eaddr_mp = NULL;
6854         sti->sti_delayed_error = 0;
6855 
6856         sti->sti_provinfo = NULL;
6857 
6858         sti->sti_oobcnt = 0;
6859         sti->sti_oobsigcnt = 0;
6860 
6861         ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6862 
6863         sti->sti_laddr_sa    = 0;
6864         sti->sti_faddr_sa    = 0;
6865         sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6866         sti->sti_laddr_len = sti->sti_faddr_len = 0;
6867 
6868         sti->sti_laddr_valid = 0;
6869         sti->sti_faddr_valid = 0;
6870         sti->sti_faddr_noxlate = 0;
6871 
6872         sti->sti_direct = 0;
6873 
6874         ASSERT(sti->sti_ack_mp == NULL);
6875         ASSERT(sti->sti_ux_bound_vp == NULL);
6876         ASSERT(sti->sti_unbind_mp == NULL);
6877 
6878         ASSERT(sti->sti_conn_ind_head == NULL);
6879         ASSERT(sti->sti_conn_ind_tail == NULL);
6880 }
6881 
6882 /*
6883  * Given a sonode, grab the TPI info and free any data.
6884  */
6885 static void
6886 sotpi_info_fini(struct sonode *so)
6887 {
6888         sotpi_info_t *sti = SOTOTPI(so);
6889         mblk_t *mp;
6890 
6891         ASSERT(sti->sti_discon_ind_mp == NULL);
6892 
6893         if ((mp = sti->sti_conn_ind_head) != NULL) {
6894                 mblk_t *mp1;
6895 
6896                 while (mp) {
6897                         mp1 = mp->b_next;
6898                         mp->b_next = NULL;
6899                         freemsg(mp);
6900                         mp = mp1;
6901                 }
6902                 sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6903         }
6904 
6905         /*
6906          * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6907          * indirect them.  It also uses so_count as a validity test.
6908          */
6909         mutex_enter(&so->so_lock);
6910 
6911         if (sti->sti_laddr_sa) {
6912                 ASSERT((caddr_t)sti->sti_faddr_sa ==
6913                     (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6914                 ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6915                 sti->sti_laddr_valid = 0;
6916                 sti->sti_faddr_valid = 0;
6917                 kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6918                 sti->sti_laddr_sa = NULL;
6919                 sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6920                 sti->sti_faddr_sa = NULL;
6921                 sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6922         }
6923 
6924         mutex_exit(&so->so_lock);
6925 
6926         if ((mp = sti->sti_eaddr_mp) != NULL) {
6927                 freemsg(mp);
6928                 sti->sti_eaddr_mp = NULL;
6929                 sti->sti_delayed_error = 0;
6930         }
6931 
6932         if ((mp = sti->sti_ack_mp) != NULL) {
6933                 freemsg(mp);
6934                 sti->sti_ack_mp = NULL;
6935         }
6936 
6937         if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6938                 sti->sti_nl7c_rcv_mp = NULL;
6939                 freemsg(mp);
6940         }
6941         sti->sti_nl7c_rcv_rval = 0;
6942         if (sti->sti_nl7c_uri != NULL) {
6943                 nl7c_urifree(so);
6944                 /* urifree() cleared nl7c_uri */
6945         }
6946         if (sti->sti_nl7c_flags) {
6947                 sti->sti_nl7c_flags = 0;
6948         }
6949 
6950         ASSERT(sti->sti_ux_bound_vp == NULL);
6951         if ((mp = sti->sti_unbind_mp) != NULL) {
6952                 freemsg(mp);
6953                 sti->sti_unbind_mp = NULL;
6954         }
6955 }
6956 
6957 /*
6958  * Destroys the TPI information attached to a sonode.
6959  */
6960 static void
6961 sotpi_info_destroy(struct sonode *so)
6962 {
6963         sotpi_info_t *sti = SOTOTPI(so);
6964 
6965         i_sotpi_info_destructor(sti);
6966         kmem_free(sti, sizeof (*sti));
6967 
6968         so->so_priv = NULL;
6969 }
6970 
6971 /*
6972  * Create the global sotpi socket module entry. It will never be freed.
6973  */
6974 smod_info_t *
6975 sotpi_smod_create(void)
6976 {
6977         smod_info_t *smodp;
6978 
6979         smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6980         smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6981         (void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6982         /*
6983          * Initialize the smod_refcnt to 1 so it will never be freed.
6984          */
6985         smodp->smod_refcnt = 1;
6986         smodp->smod_uc_version = SOCK_UC_VERSION;
6987         smodp->smod_dc_version = SOCK_DC_VERSION;
6988         smodp->smod_sock_create_func = &sotpi_create;
6989         smodp->smod_sock_destroy_func = &sotpi_destroy;
6990         return (smodp);
6991 }