/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/cmn_err.h>

#include <sys/stropts.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/atomic.h>
#include <sys/tihdr.h>

#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/sodirect.h>
#include <sys/ddi.h>
#include <inet/ip.h>
#include <sys/time.h>

#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

#define MBLK_PULL_LEN 64
uint32_t so_mblk_pull_len = MBLK_PULL_LEN;

#ifdef DEBUG
boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
#endif

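/*
 * Dequeue the next pending connection from the accept queue, waiting if
 * necessary. The caller must hold so_acceptq_lock; the wait, if any, is
 * done on so_acceptq_cv under that same lock.
 */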
static int
so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
        struct sonode *nso = NULL;

        *nsop = NULL;
        ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
        while ((nso = list_remove_head(&so->so_acceptq_list)) == NULL) {
                /*
                 * No need to check so_error here, because it is not
                 * possible for a listening socket to be reset or otherwise
                 * disconnected.
                 *
                 * So now we just need to check whether it's OK to wait.
                 */
                if (dontblock)
                        return (EWOULDBLOCK);
                if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
                        return (EINTR);

                if (cv_wait_sig_swap(&so->so_acceptq_cv,
                    &so->so_acceptq_lock) == 0)
                        return (EINTR);
        }

        ASSERT(nso != NULL);
        ASSERT(so->so_acceptq_len > 0);
        so->so_acceptq_len--;
        nso->so_listener = NULL;

        *nsop = nso;

        return (0);
}

/*
 * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 *
 * Pulls a connection off of the accept queue.
 *
 * Arguments:
 *   so        - listening socket
 *   dontblock - indicate whether it's ok to sleep if there are no
 *               connections on the queue
 *   nsop      - Value-return argument
 *
 * Return values:
 *   0 when a connection is successfully dequeued, in which case nsop
 *   is set to point to the new connection. Upon failure a non-zero
 *   value is returned, and the value of nsop is set to NULL.
 *
 * Note:
 *   so_acceptq_dequeue() may return prematurely if the socket is falling
 *   back to TPI.
 */
int
so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
        int error;

        mutex_enter(&so->so_acceptq_lock);
        error = so_acceptq_dequeue_locked(so, dontblock, nsop);
        mutex_exit(&so->so_acceptq_lock);

        return (error);
}

static void
so_acceptq_flush_impl(struct sonode *so, list_t *list, boolean_t doclose)
{
        struct sonode *nso;

        while ((nso = list_remove_head(list)) != NULL) {
                nso->so_listener = NULL;
                if (doclose) {
                        (void) socket_close(nso, 0, CRED());
                } else {
                        /*
                         * Only used for fallback - not possible when filters
                         * are present.
                         */
                        ASSERT(so->so_filter_active == 0);
                        /*
                         * Since the socket is on the accept queue, there can
                         * only be one reference. We drop the reference and
                         * just blow off the socket.
                         */
                        ASSERT(nso->so_count == 1);
                        nso->so_count--;
                        /* drop the proto ref */
                        VN_RELE(SOTOV(nso));
                }
                socket_destroy(nso);
        }
}
/*
 * void so_acceptq_flush(struct sonode *so, boolean_t doclose)
 *
 * Removes all pending connections from a listening socket, and
 * frees the associated resources.
 *
 * Arguments
 *   so      - listening socket
 *   doclose - make a close downcall for each socket on the accept queue
 *
 * Return values:
 *   None.
 *
 * Note:
 *   The caller has to ensure that no calls to so_acceptq_enqueue() or
 *   so_acceptq_dequeue() occur while the accept queue is being flushed.
 *   So either the socket needs to be in a state where no operations
 *   would come in, or so_lock needs to be obtained.
 */
void
so_acceptq_flush(struct sonode *so, boolean_t doclose)
{
        so_acceptq_flush_impl(so, &so->so_acceptq_list, doclose);
        so_acceptq_flush_impl(so, &so->so_acceptq_defer, doclose);

        so->so_acceptq_len = 0;
}

int
so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
    sock_connid_t id)
{
        ASSERT(MUTEX_HELD(&so->so_lock));

        /*
         * The protocol has notified us that a connection attempt is being
         * made, so before we wait for a notification to arrive we must
         * clear out any errors associated with earlier connection attempts.
         */
        if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
                so->so_error = 0;

        while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
                if (nonblock)
                        return (EINPROGRESS);

                if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
                        return (EINTR);

                if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
                        return (EINTR);
        }

        if (so->so_error != 0)
                return (sogeterr(so, B_TRUE));
        /*
         * Under normal circumstances, so_error should contain an error
         * in case the connect failed. However, it is possible for another
         * thread to come in and consume the error, so generate a sensible
         * error in that case.
         */
        if ((so->so_state & SS_ISCONNECTED) == 0)
                return (ECONNREFUSED);

        return (0);
}

/*
 * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 *    sock_connid_t id)
 *
 * Wait until the socket is connected or an error has occurred.
 *
 * Arguments:
 *   so       - socket
 *   nonblock - indicate whether it's ok to sleep if the connection has
 *              not yet been established
 *   id       - generation number that was returned by the protocol
 *              when the operation was started
 *
 * Returns:
 *   0 if the connection attempt was successful, or an error indicating why
 *   the connection attempt failed.
 */
int
so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
{
        int error;

        mutex_enter(&so->so_lock);
        error = so_wait_connected_locked(so, nonblock, id);
        mutex_exit(&so->so_lock);

        return (error);
}

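/*
 * Wait, with so_lock held, until the send side is no longer flow
 * controlled, or until the socket can no longer send, is closing, is
 * falling back to TPI, or the wait is interrupted or times out.
 */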
int
so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
{
        int error;

        ASSERT(MUTEX_HELD(&so->so_lock));
        while (SO_SND_FLOWCTRLD(so)) {
                if (so->so_state & SS_CANTSENDMORE)
                        return (EPIPE);
                if (dontblock)
                        return (EWOULDBLOCK);

                if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
                        return (EINTR);

                if (so->so_sndtimeo == 0) {
                        /*
                         * Zero means disable timeout.
                         */
                        error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
                } else {
                        error = cv_reltimedwait_sig(&so->so_snd_cv,
                            &so->so_lock, so->so_sndtimeo, TR_CLOCK_TICK);
                }
                if (error == 0)
                        return (EINTR);
                else if (error == -1)
                        return (EAGAIN);
        }
        return (0);
}

/*
 * int so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 *
 * Wait for the transport to notify us about send buffers becoming
 * available.
 */
int
so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
{
        int error = 0;

        mutex_enter(&so->so_lock);
        so->so_snd_wakeup = B_TRUE;
        error = so_snd_wait_qnotfull_locked(so, dontblock);
        so->so_snd_wakeup = B_FALSE;
        mutex_exit(&so->so_lock);

        return (error);
}

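/*
 * Mark the socket as flow controlled on the send side.
 */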
void
so_snd_qfull(struct sonode *so)
{
        mutex_enter(&so->so_lock);
        so->so_snd_qfull = B_TRUE;
        mutex_exit(&so->so_lock);
}

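/*
 * Clear send-side flow control and wake up any threads blocked in
 * so_snd_wait_qnotfull_locked().
 */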
void
so_snd_qnotfull(struct sonode *so)
{
        mutex_enter(&so->so_lock);
        so->so_snd_qfull = B_FALSE;
        /* wake up everyone waiting for buffers */
        cv_broadcast(&so->so_snd_cv);
        mutex_exit(&so->so_lock);
}

/*
 * Change the process/process group to which SIGIO is sent.
 */
int
socket_chgpgrp(struct sonode *so, pid_t pid)
{
        int error;

        ASSERT(MUTEX_HELD(&so->so_lock));
        if (pid != 0) {
                /*
                 * Permissions check by sending signal 0.
                 * Note that when kill fails it does a
                 * set_errno causing the system call to fail.
                 */
                error = kill(pid, 0);
                if (error != 0) {
                        return (error);
                }
        }
        so->so_pgrp = pid;
        return (0);
}


/*
 * Generate a SIGIO: for 'writable' events include a siginfo structure,
 * for read events just send the signal.
 */
/*ARGSUSED*/
static void
socket_sigproc(proc_t *proc, int event)
{
        k_siginfo_t info;

        ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));

        if (event & SOCKETSIG_WRITE) {
                info.si_signo = SIGPOLL;
                info.si_code = POLL_OUT;
                info.si_errno = 0;
                info.si_fd = 0;
                info.si_band = 0;
                sigaddq(proc, NULL, &info, KM_NOSLEEP);
        }
        if (event & SOCKETSIG_READ) {
                sigtoproc(proc, NULL, SIGPOLL);
        }
        if (event & SOCKETSIG_URG) {
                sigtoproc(proc, NULL, SIGURG);
        }
}

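/*
 * Deliver the signal for the given event to the process or process group
 * registered in so_pgrp: a positive so_pgrp names a process, a negative
 * value names a process group. Requires so_lock to be held.
 */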
void
socket_sendsig(struct sonode *so, int event)
{
        proc_t *proc;

        ASSERT(MUTEX_HELD(&so->so_lock));

        if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
            event != SOCKETSIG_URG)) {
                return;
        }

        dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));

        if (so->so_pgrp > 0) {
                /*
                 * XXX This unfortunately still generates
                 * a signal when a fd is closed but
                 * the proc is active.
                 */
                mutex_enter(&pidlock);
                /*
                 * Even if the thread started in another zone, we're receiving
                 * on behalf of this socket's zone, so find the proc using the
                 * socket's zone ID.
                 */
                proc = prfind_zone(so->so_pgrp, so->so_zoneid);
                if (proc == NULL) {
                        mutex_exit(&pidlock);
                        return;
                }
                mutex_enter(&proc->p_lock);
                mutex_exit(&pidlock);
                socket_sigproc(proc, event);
                mutex_exit(&proc->p_lock);
        } else {
                /*
                 * Send to process group. Hold pidlock across
                 * calls to socket_sigproc().
                 */
                pid_t pgrp = -so->so_pgrp;

                mutex_enter(&pidlock);
                /*
                 * Even if the thread started in another zone, we're receiving
                 * on behalf of this socket's zone, so find the pgrp using the
                 * socket's zone ID.
                 */
                proc = pgfind_zone(pgrp, so->so_zoneid);
                while (proc != NULL) {
                        mutex_enter(&proc->p_lock);
                        socket_sigproc(proc, event);
                        mutex_exit(&proc->p_lock);
                        proc = proc->p_pglink;
                }
                mutex_exit(&pidlock);
        }
}

#define MIN(a, b) ((a) < (b) ? (a) : (b))
/* Copy userdata into a new mblk_t */
mblk_t *
socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
    size_t tail_len, int *errorp)
{
        mblk_t  *head = NULL, **tail = &head;

        ASSERT(iosize == INFPSZ || iosize > 0);

        if (iosize == INFPSZ || iosize > uiop->uio_resid)
                iosize = uiop->uio_resid;

        if (maxblk == INFPSZ)
                maxblk = iosize;

        /* Nothing to do in these cases, so we're done */
        if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
                goto done;

        /*
         * We will enter the loop below if iosize is 0; it will allocate an
         * empty message block and call uiomove(9F) which will just return.
         * We could avoid that with an extra check, but that would only slow
         * down the much more likely case where iosize is larger than 0.
         */
        do {
                ssize_t blocksize;
                mblk_t  *mp;

                blocksize = MIN(iosize, maxblk);
                ASSERT(blocksize >= 0);
                mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
                if (mp == NULL) {
                        *errorp = ENOMEM;
                        return (head);
                }
                mp->b_rptr += wroff;
                mp->b_wptr = mp->b_rptr + blocksize;

                *tail = mp;
                tail = &mp->b_cont;

                /* uiomove(9F) either returns 0 or EFAULT */
                if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
                    UIO_WRITE, uiop)) != 0) {
                        ASSERT(*errorp != ENOMEM);
                        freemsg(head);
                        return (NULL);
                }

                iosize -= blocksize;
        } while (iosize > 0);

done:
        *errorp = 0;
        return (head);
}

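/*
 * Copy data from an mblk chain out to the supplied uio, up to max_read
 * bytes, freeing fully consumed mblks along the way. Returns the
 * remainder of the chain, or NULL on error with *errorp set.
 */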
mblk_t *
socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
{
        int error;
        ptrdiff_t n;
        mblk_t *nmp;

        ASSERT(mp->b_wptr >= mp->b_rptr);

        /*
         * max_read is the offset of the oobmark, and a read cannot go past
         * the oobmark.
         */
        if (max_read == INFPSZ || max_read > uiop->uio_resid)
                max_read = uiop->uio_resid;

        do {
                if ((n = MIN(max_read, MBLKL(mp))) != 0) {
                        ASSERT(n > 0);

                        error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
                        if (error != 0) {
                                freemsg(mp);
                                *errorp = error;
                                return (NULL);
                        }
                }

                mp->b_rptr += n;
                max_read -= n;
                while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
                        /*
                         * get rid of zero length mblks
                         */
                        nmp = mp;
                        mp = mp->b_cont;
                        freeb(nmp);
                }
        } while (mp != NULL && max_read > 0);

        *errorp = 0;
        return (mp);
}

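/*
 * Put a message back at the head of the socket's receive queue; used by
 * so_dequeue_msg() to return the unread remainder of a partially
 * consumed message.
 */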
static void
so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
{
        ASSERT(last_tail != NULL);
        mp->b_next = so->so_rcv_q_head;
        mp->b_prev = last_tail;
        ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));

        if (so->so_rcv_q_head == NULL) {
                ASSERT(so->so_rcv_q_last_head == NULL);
                so->so_rcv_q_last_head = mp;
#ifdef DEBUG
        } else {
                ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
#endif
        }
        so->so_rcv_q_head = mp;

#ifdef DEBUG
        if (so_debug_length) {
                mutex_enter(&so->so_lock);
                ASSERT(so_check_length(so));
                mutex_exit(&so->so_lock);
        }
#endif
}

/*
 * Move a mblk chain (mp_head, mp_last_head) to the sonode's rcv queue so it
 * can be processed by so_dequeue_msg().
 */
void
so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
{
        if (so->so_filter_active > 0 &&
            (mp_head = sof_filter_data_in_proc(so, mp_head,
            &mp_last_head)) == NULL)
                return;

        ASSERT(mp_head->b_prev != NULL);
        if (so->so_rcv_q_head == NULL) {
                so->so_rcv_q_head = mp_head;
                so->so_rcv_q_last_head = mp_last_head;
                ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
        } else {
                boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
                    (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));

                if (mp_head->b_next == NULL &&
                    DB_TYPE(mp_head) == M_DATA &&
                    DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
                        so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
                        so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
                        mp_head->b_prev = NULL;
                } else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
                        /*
                         * Append to last_head if there is more than one mblk,
                         * and both mp_head and last_head are I/OAT mblks.
                         */
                        ASSERT(mp_head->b_next != NULL);
                        so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
                        so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
                        mp_head->b_prev = NULL;

                        so->so_rcv_q_last_head->b_next = mp_head->b_next;
                        mp_head->b_next = NULL;
                        so->so_rcv_q_last_head = mp_last_head;
                } else {
#ifdef DEBUG
                        {
                                mblk_t *tmp_mblk;
                                tmp_mblk = mp_head;
                                while (tmp_mblk != NULL) {
                                        ASSERT(tmp_mblk->b_prev != NULL);
                                        tmp_mblk = tmp_mblk->b_next;
                                }
                        }
#endif
                        so->so_rcv_q_last_head->b_next = mp_head;
                        so->so_rcv_q_last_head = mp_last_head;
                }
        }
}

/*
 * Check flow control on a given sonode.  Must have so_lock held, and
 * this function will release the lock.  Return true if flow control
 * is cleared.
 */
boolean_t
so_check_flow_control(struct sonode *so)
{
        ASSERT(MUTEX_HELD(&so->so_lock));

        if (so->so_flowctrld && (so->so_rcv_queued < so->so_rcvlowat &&
            !(so->so_state & SS_FIL_RCV_FLOWCTRL))) {
                so->so_flowctrld = B_FALSE;
                mutex_exit(&so->so_lock);
                /*
                 * Open up flow control. SCTP does not have any downcalls,
                 * and it will clear flow control in sosctp_recvmsg().
                 */
                if (so->so_downcalls != NULL &&
                    so->so_downcalls->sd_clr_flowctrl != NULL) {
                        (*so->so_downcalls->sd_clr_flowctrl)
                            (so->so_proto_handle);
                }
                /* filters can start injecting data */
                sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0);
                return (B_TRUE);
        } else {
                mutex_exit(&so->so_lock);
                return (B_FALSE);
        }
}

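/*
 * Dequeue data from the socket's receive queue on behalf of a reader.
 * Moves newly arrived messages (so_rcv_head) onto the processing queue
 * (so_rcv_q_head), splits off any leading PROTO/PCPROTO blocks into
 * *mctlp, copies data out to the caller's uio, and blocks waiting for
 * data when none is queued (unless nonblocking). MOREDATA is returned
 * in rvalp->r_val1 when data remains queued after the read.
 */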
int
so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
    rval_t *rvalp, int flags)
{
        mblk_t  *mp, *nmp;
        mblk_t  *savemp, *savemptail;
        mblk_t  *new_msg_head;
        mblk_t  *new_msg_last_head;
        mblk_t  *last_tail;
        boolean_t partial_read;
        boolean_t reset_atmark = B_FALSE;
        int more = 0;
        int error;
        ssize_t oobmark;
        ssize_t copied = 0;
        sodirect_t *sodp = so->so_direct;
        xuio_t *xuio = NULL;

        partial_read = B_FALSE;
        *mctlp = NULL;
        if ((uiop->uio_extflg & UIO_XUIO) != 0) {
                xuio = (xuio_t *)uiop;
        }
again:
        mutex_enter(&so->so_lock);
again1:
#ifdef DEBUG
        if (so_debug_length) {
                ASSERT(so_check_length(so));
        }
#endif
        if (so->so_state & SS_RCVATMARK) {
                /* Check whether the caller is OK to read past the mark */
                if (flags & MSG_NOMARK) {
                        mutex_exit(&so->so_lock);
                        return (EWOULDBLOCK);
                }
                reset_atmark = B_TRUE;
        }
        /*
         * First move messages from the dump area to the processing area.
         */
        if (sodp != NULL) {
                if (sodp->sod_enabled) {
                        if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
                                /* nothing to uioamove */
                                sodp = NULL;
                        } else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
                                sodp->sod_uioa.uioa_state &= UIOA_CLR;
                                sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
                                /*
                                 * try to uioamove() the data that
                                 * has already been queued.
                                 */
                                sod_uioa_so_init(so, sodp, uiop);
                        }
                } else {
                        sodp = NULL;
                }
        }
        new_msg_head = so->so_rcv_head;
        new_msg_last_head = so->so_rcv_last_head;
        so->so_rcv_head = NULL;
        so->so_rcv_last_head = NULL;
        oobmark = so->so_oobmark;
        /*
         * We can release the lock as there can only be one reader.
         */
        mutex_exit(&so->so_lock);

        if (new_msg_head != NULL) {
                so_process_new_message(so, new_msg_head, new_msg_last_head);
        }
        savemp = savemptail = NULL;
        rvalp->r_vals = 0;
        error = 0;
        mp = so->so_rcv_q_head;

        if (mp != NULL &&
            (so->so_rcv_timer_tid == 0 ||
            so->so_rcv_queued >= so->so_rcv_thresh)) {
                partial_read = B_FALSE;

                if (flags & MSG_PEEK) {
                        if ((nmp = dupmsg(mp)) == NULL &&
                            (nmp = copymsg(mp)) == NULL) {
                                size_t size = msgsize(mp);

                                error = strwaitbuf(size, BPRI_HI);
                                if (error) {
                                        return (error);
                                }
                                goto again;
                        }
                        mp = nmp;
                } else {
                        ASSERT(mp->b_prev != NULL);
                        last_tail = mp->b_prev;
                        mp->b_prev = NULL;
                        so->so_rcv_q_head = mp->b_next;
                        if (so->so_rcv_q_head == NULL) {
                                so->so_rcv_q_last_head = NULL;
                        }
                        mp->b_next = NULL;
                }

                ASSERT(mctlp != NULL);
                /*
                 * First process PROTO or PCPROTO blocks, if any.
                 */
                if (DB_TYPE(mp) != M_DATA) {
                        *mctlp = mp;
                        savemp = mp;
                        savemptail = mp;
                        ASSERT(DB_TYPE(mp) == M_PROTO ||
                            DB_TYPE(mp) == M_PCPROTO);
                        while (mp->b_cont != NULL &&
                            DB_TYPE(mp->b_cont) != M_DATA) {
                                ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
                                    DB_TYPE(mp->b_cont) == M_PCPROTO);
                                mp = mp->b_cont;
                                savemptail = mp;
                        }
                        mp = savemptail->b_cont;
                        savemptail->b_cont = NULL;
                }

                ASSERT(DB_TYPE(mp) == M_DATA);
                /*
                 * Now process DATA blocks, if any. Note that for a
                 * sodirect-enabled socket, uio_resid can be 0.
                 */
                if (uiop->uio_resid >= 0) {
                        if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
                                mutex_enter(&so->so_lock);
                                ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
                                copied = sod_uioa_mblk(so, mp);
                                if (copied > 0)
                                        partial_read = B_TRUE;
                                mutex_exit(&so->so_lock);
                                /* mark this mblk as processed */
                                mp = NULL;
                        } else {
                                ssize_t oldresid = uiop->uio_resid;

                                if (MBLKL(mp) < so_mblk_pull_len) {
                                        if (pullupmsg(mp, -1) == 1) {
                                                last_tail = mp;
                                        }
                                }
                                /*
                                 * Cannot read beyond the oobmark.
                                 */
                                mp = socopyoutuio(mp, uiop,
                                    oobmark == 0 ? INFPSZ : oobmark, &error);
                                if (error != 0) {
                                        freemsg(*mctlp);
                                        *mctlp = NULL;
                                        more = 0;
                                        goto done;
                                }
                                ASSERT(oldresid >= uiop->uio_resid);
                                copied = oldresid - uiop->uio_resid;
                                if (oldresid > uiop->uio_resid)
                                        partial_read = B_TRUE;
                        }
                        ASSERT(copied >= 0);
                        if (copied > 0 && !(flags & MSG_PEEK)) {
                                mutex_enter(&so->so_lock);
                                so->so_rcv_queued -= copied;
                                ASSERT(so->so_oobmark >= 0);
                                if (so->so_oobmark > 0) {
                                        so->so_oobmark -= copied;
                                        ASSERT(so->so_oobmark >= 0);
                                        if (so->so_oobmark == 0) {
                                                ASSERT(so->so_state &
                                                    SS_OOBPEND);
                                                so->so_oobmark = 0;
                                                so->so_state |= SS_RCVATMARK;
                                        }
                                }
                                /*
                                 * so_check_flow_control() will drop
                                 * so->so_lock.
                                 */
                                rvalp->r_val2 = so_check_flow_control(so);
                        }
                }
                if (mp != NULL) { /* more data blocks in msg */
                        more |= MOREDATA;

                        /*
                         * If requested, tally up remaining data along with the
                         * amount already copied.
                         */
                        if (xuio != NULL &&
                            xuio->xu_type == UIOTYPE_PEEKSIZE) {
                                xuio->xu_ext.xu_ps.xu_ps_set = B_TRUE;
                                xuio->xu_ext.xu_ps.xu_ps_size =
                                    copied + msgdsize(mp);
                        }

                        if ((flags & (MSG_PEEK|MSG_TRUNC))) {
                                if (flags & MSG_PEEK) {
                                        freemsg(mp);
                                } else {
                                        unsigned int msize = msgdsize(mp);

                                        freemsg(mp);
                                        mutex_enter(&so->so_lock);
                                        so->so_rcv_queued -= msize;
                                        /*
                                         * so_check_flow_control() will drop
                                         * so->so_lock.
                                         */
                                        rvalp->r_val2 =
                                            so_check_flow_control(so);
                                }
                        } else if (partial_read && !somsghasdata(mp)) {
                                /*
                                 * Avoid queuing a zero-length tail part of
                                 * a message. partial_read being set indicates
                                 * that we read some of the message.
                                 */
                                freemsg(mp);
                                more &= ~MOREDATA;
                        } else {
                                if (savemp != NULL &&
                                    (flags & MSG_DUPCTRL)) {
                                        mblk_t *nmp;
                                        /*
                                         * There should only be non-data mblks.
                                         */
                                        ASSERT(DB_TYPE(savemp) != M_DATA &&
                                            DB_TYPE(savemptail) != M_DATA);
try_again:
                                        if ((nmp = dupmsg(savemp)) == NULL &&
                                            (nmp = copymsg(savemp)) == NULL) {

                                                size_t size = msgsize(savemp);

                                                error = strwaitbuf(size,
                                                    BPRI_HI);
                                                if (error != 0) {
                                                        /*
                                                         * If we cannot copy
                                                         * the control data,
                                                         * free the remaining
                                                         * data.
                                                         */
                                                        freemsg(mp);
                                                        goto done;
                                                }
                                                goto try_again;
                                        }

                                        ASSERT(nmp != NULL);
                                        ASSERT(DB_TYPE(nmp) != M_DATA);
                                        savemptail->b_cont = mp;
                                        *mctlp = nmp;
                                        mp = savemp;
                                }
                                /*
                                 * putback mp
                                 */
                                so_prepend_msg(so, mp, last_tail);
                        }
                }

                /* fast check so_rcv_head if there is more data */
                if (partial_read && !(so->so_state & SS_RCVATMARK) &&
                    *mctlp == NULL && uiop->uio_resid > 0 &&
                    !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
                        goto again;
                }
        } else if (!partial_read) {
                mutex_enter(&so->so_lock);
                if (so->so_error != 0) {
                        error = sogeterr(so, !(flags & MSG_PEEK));
                        mutex_exit(&so->so_lock);
                        return (error);
                }
                /*
                 * No pending data. Return right away for a nonblocking
                 * socket, otherwise sleep waiting for data.
                 */
                if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
                        if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
                            (flags & MSG_DONTWAIT)) {
                                error = EWOULDBLOCK;
                        } else {
                                if (so->so_state & (SS_CLOSING |
                                    SS_FALLBACK_PENDING)) {
                                        mutex_exit(&so->so_lock);
                                        error = EINTR;
                                        goto done;
                                }

                                if (so->so_rcv_head != NULL) {
                                        goto again1;
                                }
                                so->so_rcv_wakeup = B_TRUE;
                                so->so_rcv_wanted = uiop->uio_resid;
                                if (so->so_rcvtimeo == 0) {
                                        /*
                                         * Zero means disable timeout.
                                         */
                                        error = cv_wait_sig(&so->so_rcv_cv,
                                            &so->so_lock);
                                } else {
                                        error = cv_reltimedwait_sig(
                                            &so->so_rcv_cv, &so->so_lock,
                                            so->so_rcvtimeo, TR_CLOCK_TICK);
                                }
                                so->so_rcv_wakeup = B_FALSE;
                                so->so_rcv_wanted = 0;

                                if (error == 0) {
                                        error = EINTR;
                                } else if (error == -1) {
                                        error = EAGAIN;
                                } else {
                                        goto again1;
                                }
                        }
                }
                mutex_exit(&so->so_lock);
        }
        if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
                /*
                 * We have passed the mark; update the state.
                 * 4.3BSD and 4.4BSD clear the mark when peeking across it.
                 * The draft POSIX socket spec states that the mark should
                 * not be cleared when peeking. We follow the latter.
                 */
                mutex_enter(&so->so_lock);
                ASSERT(so_verify_oobstate(so));
                so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
                freemsg(so->so_oobmsg);
                so->so_oobmsg = NULL;
                ASSERT(so_verify_oobstate(so));
                mutex_exit(&so->so_lock);
        }
        ASSERT(so->so_rcv_wakeup == B_FALSE);
done:
        if (sodp != NULL) {
                mutex_enter(&so->so_lock);
                if (sodp->sod_enabled &&
                    (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
                        SOD_UIOAFINI(sodp);
                        if (sodp->sod_uioa.uioa_mbytes > 0) {
                                ASSERT(so->so_rcv_q_head != NULL ||
                                    so->so_rcv_head != NULL);
                                so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
                                if (error == EWOULDBLOCK)
                                        error = 0;
                        }
                }
                mutex_exit(&so->so_lock);
        }
#ifdef DEBUG
        if (so_debug_length) {
                mutex_enter(&so->so_lock);
                ASSERT(so_check_length(so));
                mutex_exit(&so->so_lock);
        }
#endif
        rvalp->r_val1 = more;
        ASSERT(MUTEX_NOT_HELD(&so->so_lock));
        return (error);
}

/*
 * Enqueue data from the protocol on the socket's rcv queue.
 *
 * We try to hook new M_DATA mblks onto an existing chain, however,
 * that cannot be done if the existing chain has already been
 * processed by I/OAT. Non-M_DATA mblks are just linked together via
 * b_next. In all cases the b_prev of the enqueued mblk is set to
 * point to the last mblk in its b_cont chain.
 */
void
so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
{
        ASSERT(MUTEX_HELD(&so->so_lock));

#ifdef DEBUG
        if (so_debug_length) {
                ASSERT(so_check_length(so));
        }
#endif
        so->so_rcv_queued += msg_size;

        if (so->so_rcv_head == NULL) {
                ASSERT(so->so_rcv_last_head == NULL);
                so->so_rcv_head = mp;
                so->so_rcv_last_head = mp;
        } else if ((DB_TYPE(mp) == M_DATA &&
            DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
            ((DB_FLAGS(mp) & DBLK_UIOA) ==
            (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
                /* Added to the end */
                ASSERT(so->so_rcv_last_head != NULL);
                ASSERT(so->so_rcv_last_head->b_prev != NULL);
                so->so_rcv_last_head->b_prev->b_cont = mp;
        } else {
                /* Start a new end */
                so->so_rcv_last_head->b_next = mp;
                so->so_rcv_last_head = mp;
        }
        while (mp->b_cont != NULL)
                mp = mp->b_cont;

        so->so_rcv_last_head->b_prev = mp;
#ifdef DEBUG
        if (so_debug_length) {
                ASSERT(so_check_length(so));
        }
#endif
}

/*
 * Return B_TRUE if there is data in the message, B_FALSE otherwise.
 */
boolean_t
somsghasdata(mblk_t *mp)
{
        for (; mp; mp = mp->b_cont)
                if (mp->b_datap->db_type == M_DATA) {
                        ASSERT(mp->b_wptr >= mp->b_rptr);
                        if (mp->b_wptr > mp->b_rptr)
                                return (B_TRUE);
                }
        return (B_FALSE);
}

/*
 * Flush the read side of sockfs.
 *
 * The caller must be sure that a reader is not already active when the
 * buffer is being flushed.
 */
void
so_rcv_flush(struct sonode *so)
{
        mblk_t  *mp;

        ASSERT(MUTEX_HELD(&so->so_lock));

        if (so->so_oobmsg != NULL) {
                freemsg(so->so_oobmsg);
                so->so_oobmsg = NULL;
                so->so_oobmark = 0;
                so->so_state &=
                    ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
        }

        /*
         * Free messages sitting in the recv queues
         */
        while (so->so_rcv_q_head != NULL) {
                mp = so->so_rcv_q_head;
                so->so_rcv_q_head = mp->b_next;
                mp->b_next = mp->b_prev = NULL;
                freemsg(mp);
        }
        while (so->so_rcv_head != NULL) {
                mp = so->so_rcv_head;
                so->so_rcv_head = mp->b_next;
                mp->b_next = mp->b_prev = NULL;
                freemsg(mp);
        }
        so->so_rcv_queued = 0;
        so->so_rcv_q_head = NULL;
        so->so_rcv_q_last_head = NULL;
        so->so_rcv_head = NULL;
        so->so_rcv_last_head = NULL;
}

/*
 * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
 */
int
sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
    boolean_t oob_inline)
{
        mblk_t          *mp, *nmp;
        int             error;

        dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
            flags));

        if (msg != NULL) {
                /*
                 * There is never any oob data with addresses or control since
                 * the T_EXDATA_IND does not carry any options.
                 */
                msg->msg_controllen = 0;
                msg->msg_namelen = 0;
                msg->msg_flags = 0;
        }

        mutex_enter(&so->so_lock);
        ASSERT(so_verify_oobstate(so));
        if (oob_inline ||
            (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
                dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
                mutex_exit(&so->so_lock);
                return (EINVAL);
        }
        if (!(so->so_state & SS_HAVEOOBDATA)) {
                dprintso(so, 1, ("sorecvoob: no data yet\n"));
                mutex_exit(&so->so_lock);
                return (EWOULDBLOCK);
        }
        ASSERT(so->so_oobmsg != NULL);
        mp = so->so_oobmsg;
        if (flags & MSG_PEEK) {
                /*
                 * Since recv* cannot return ENOBUFS we cannot use dupmsg.
                 * Instead we revert to the consolidation private
                 * allocb_wait plus bcopy.
                 */
                mblk_t *mp1;

                mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
                ASSERT(mp1);

                while (mp != NULL) {
                        ssize_t size;

                        size = MBLKL(mp);
                        bcopy(mp->b_rptr, mp1->b_wptr, size);
                        mp1->b_wptr += size;
                        ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
                        mp = mp->b_cont;
                }
                mp = mp1;
        } else {
                /*
                 * Update the state indicating that the data has been consumed.
                 * Keep SS_OOBPEND set until data is consumed past the mark.
                 */
                so->so_oobmsg = NULL;
                so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
        }
        ASSERT(so_verify_oobstate(so));
        mutex_exit(&so->so_lock);

        error = 0;
        nmp = mp;
        while (nmp != NULL && uiop->uio_resid > 0) {
                ssize_t n = MBLKL(nmp);

                n = MIN(n, uiop->uio_resid);
                if (n > 0)
                        error = uiomove(nmp->b_rptr, n,
                            UIO_READ, uiop);
                if (error)
                        break;
                nmp = nmp->b_cont;
        }
        ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
        freemsg(mp);
        return (error);
}

/*
 * Allocate and initialize a sonode.
 */
/* ARGSUSED */
struct sonode *
socket_sonode_create(struct sockparams *sp, int family, int type,
    int protocol, int version, int sflags, int *errorp, struct cred *cr)
{
        sonode_t *so;
        int     kmflags;

        /*
         * Choose the right set of sonodeops based on the upcall and
         * downcall version that the protocol has provided.
         */
        if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
            SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
                /*
                 * mismatch
                 */
#ifdef DEBUG
                cmn_err(CE_CONT, "protocol and socket module version mismatch");
#endif
                *errorp = EINVAL;
                return (NULL);
        }

        kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

        so = kmem_cache_alloc(socket_cache, kmflags);
        if (so == NULL) {
                *errorp = ENOMEM;
                return (NULL);
        }

        sonode_init(so, sp, family, type, protocol, &so_sonodeops);

        if (version == SOV_DEFAULT)
                version = so_default_version;

        so->so_version = (short)version;

        /*
         * Set the default values; if a protocol desires, it can change
         * them later.
         */
        so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
        so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
        so->so_proto_props.sopp_maxpsz = INFPSZ;
        so->so_proto_props.sopp_maxblk = INFPSZ;

        return (so);
}

1280 
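/*
 * Common socket initialization. For a passive open (pso != NULL) the new
 * socket inherits state, options, and filters from the listener. For an
 * active open, automatic filters are attached and the protocol's lower
 * handle is created and activated.
 */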
int
socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
{
        int error = 0;

        if (pso != NULL) {
                /*
                 * We have a passive open, so inherit basic state from
                 * the parent (listener).
                 *
                 * No need to grab the new sonode's lock, since there is no
                 * one that can have a reference to it.
                 */
                mutex_enter(&pso->so_lock);

                so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
                so->so_pgrp = pso->so_pgrp;
                so->so_rcvtimeo = pso->so_rcvtimeo;
                so->so_sndtimeo = pso->so_sndtimeo;
                so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
                /*
                 * Make note of the socket-level options. TCP and IP level
                 * options are already inherited. We could do all this after
                 * accept is successful, but doing it here simplifies the
                 * code and no harm is done in the error case.
                 */
                so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
                    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
                    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
                so->so_proto_props = pso->so_proto_props;
                so->so_mode = pso->so_mode;
                so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;

                mutex_exit(&pso->so_lock);

                /*
                 * If the parent has any filters, try to inherit them.
                 */
                if (pso->so_filter_active > 0 &&
                    (error = sof_sonode_inherit_filters(so, pso)) != 0)
                        return (error);

1323         } else {
1324                 struct sockparams *sp = so->so_sockparams;
1325                 sock_upcalls_t *upcalls_to_use;
1326 
1327                 /*
1328                  * Attach automatic filters, if there are any.
1329                  */
1330                 if (!list_is_empty(&sp->sp_auto_filters) &&
1331                     (error = sof_sonode_autoattach_filters(so, cr)) != 0)
1332                         return (error);
1333 
1334                 /* OK to attach filters */
1335                 so->so_state |= SS_FILOP_OK;
1336 
1337                 /*
1338                  * Based on the version number select the right upcalls to
1339                  * pass down. Currently we only have one version so choose
1340                  * default
1341                  */
1342                 upcalls_to_use = &so_upcalls;
1343 
1344                 /* active open, so create a lower handle */
1345                 so->so_proto_handle =
1346                     sp->sp_smod_info->smod_proto_create_func(so->so_family,
1347                     so->so_type, so->so_protocol, &so->so_downcalls,
1348                     &so->so_mode, &error, flags, cr);
1349 
1350                 if (so->so_proto_handle == NULL) {
1351                         ASSERT(error != 0);
1352                         /*
1353                          * To be safe; if a lower handle cannot be created, and
1354                          * the proto does not give a reason why, assume there
1355                          * was a lack of memory.
1356                          */
1357                         return ((error == 0) ? ENOMEM : error);
1358                 }
1359                 ASSERT(so->so_downcalls != NULL);
1360                 ASSERT(so->so_downcalls->sd_send != NULL ||
1361                     so->so_downcalls->sd_send_uio != NULL);
1362                 if (so->so_downcalls->sd_recv_uio != NULL) {
1363                         ASSERT(so->so_downcalls->sd_poll != NULL);
1364                         so->so_pollev |= SO_POLLEV_ALWAYS;
1365                 }
1366 
1367                 (*so->so_downcalls->sd_activate)(so->so_proto_handle,
1368                     (sock_upper_handle_t)so, upcalls_to_use, 0, cr);
1369 
1370                 /* Wildcard */
1371 
1372                 /*
1373                  * FIXME No need for this, the protocol can deal with it in
1374                  * sd_create(). Should update ICMP.
1375                  */
1376                 if (so->so_protocol != so->so_sockparams->sp_protocol) {
1377                         int protocol = so->so_protocol;
1378                         int error;
1379                         /*
1380                          * Issue SO_PROTOTYPE setsockopt.
1381                          */
1382                         error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
1383                             &protocol, (t_uscalar_t)sizeof (protocol), cr);
1384                         if (error) {
1385                                 (void) (*so->so_downcalls->sd_close)
1386                                     (so->so_proto_handle, 0, cr);
1387 
1388                                 mutex_enter(&so->so_lock);
1389                                 so_rcv_flush(so);
1390                                 mutex_exit(&so->so_lock);
1391                                 /*
1392                                  * Setsockopt often fails with ENOPROTOOPT but
1393                                  * socket() should fail with
1394                                  * EPROTONOSUPPORT/EPROTOTYPE.
1395                                  */
1396                                 return (EPROTONOSUPPORT);
1397                         }
1398                 }
1399         }
1400 
1401         if (uioasync.enabled)
1402                 sod_sock_init(so);
1403 
1404         /* put an extra reference on the socket for the protocol */
1405         VN_HOLD(SOTOV(so));
1406 
1407         return (0);
1408 }
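
     /*
      * For reference, a sketch (hypothetical; the "myproto_" names are
      * invented) of the general shape of the smod_proto_create_func
      * invoked above for an active open. The protocol either returns its
      * lower handle and fills in its downcalls and mode, or returns NULL
      * with *errorp set:
      *
      *      static sock_lower_handle_t
      *      myproto_create(int family, int type, int proto,
      *          sock_downcalls_t **sock_downcalls, uint_t *smodep,
      *          int *errorp, int flags, cred_t *credp)
      *      {
      *              myproto_state_t *mps = myproto_alloc(flags, errorp);
      *
      *              if (mps == NULL)
      *                      return (NULL);
      *              *sock_downcalls = &myproto_downcalls;
      *              *smodep = SM_CONNREQUIRED;
      *              return ((sock_lower_handle_t)mps);
      *      }
      */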
1409 
1410 /*
1411  * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1412  *         struct cred *cr, int32_t *rvalp)
1413  *
1414  * Handle ioctls that manipulate basic socket state: non-blocking,
1415  * async, etc.
1416  *
1417  * Returns:
1418  *   < 0  - ioctl was not handled
1419  *  >= 0  - ioctl was handled; if > 0, it is an errno
1420  *
1421  * Notes:
1422  *   Assumes the standard receive buffer is used to obtain info for
1423  *   NREAD.
1424  */
1425 /* ARGSUSED */
1426 int
1427 socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1428     struct cred *cr, int32_t *rvalp)
1429 {
1430         switch (cmd) {
1431         case SIOCSQPTR:
1432                 /*
1433                  * SIOCSQPTR is valid only when helper stream is created
1434                  * by the protocol.
1435                  */
1436 
1437                 return (EOPNOTSUPP);
1438         case FIONBIO: {
1439                 int32_t value;
1440 
1441                 if (so_copyin((void *)arg, &value, sizeof (int32_t),
1442                     (mode & (int)FKIOCTL)))
1443                         return (EFAULT);
1444 
1445                 mutex_enter(&so->so_lock);
1446                 if (value) {
1447                         so->so_state |= SS_NDELAY;
1448                 } else {
1449                         so->so_state &= ~SS_NDELAY;
1450                 }
1451                 mutex_exit(&so->so_lock);
1452                 return (0);
1453         }
1454         case FIOASYNC: {
1455                 int32_t value;
1456 
1457                 if (so_copyin((void *)arg, &value, sizeof (int32_t),
1458                     (mode & (int)FKIOCTL)))
1459                         return (EFAULT);
1460 
1461                 mutex_enter(&so->so_lock);
1462 
1463                 if (value) {
1464                         /* Turn on SIGIO */
1465                         so->so_state |= SS_ASYNC;
1466                 } else {
1467                         /* Turn off SIGIO */
1468                         so->so_state &= ~SS_ASYNC;
1469                 }
1470                 mutex_exit(&so->so_lock);
1471 
1472                 return (0);
1473         }
1474 
1475         case SIOCSPGRP:
1476         case FIOSETOWN: {
1477                 int error;
1478                 pid_t pid;
1479 
1480                 if (so_copyin((void *)arg, &pid, sizeof (pid_t),
1481                     (mode & (int)FKIOCTL)))
1482                         return (EFAULT);
1483 
1484                 mutex_enter(&so->so_lock);
1485                 error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
1486                 mutex_exit(&so->so_lock);
1487                 return (error);
1488         }
1489         case SIOCGPGRP:
1490         case FIOGETOWN:
1491                 if (so_copyout(&so->so_pgrp, (void *)arg,
1492                     sizeof (pid_t), (mode & (int)FKIOCTL)))
1493                         return (EFAULT);
1494 
1495                 return (0);
1496         case SIOCATMARK: {
1497                 int retval;
1498 
1499                 /*
1500                  * Only protocols that support urgent data can handle ATMARK.
1501                  */
1502                 if ((so->so_mode & SM_EXDATA) == 0)
1503                         return (EINVAL);
1504 
1505                 /*
1506                  * If the protocol is maintaining its own buffer, then the
1507                  * request must be passed down.
1508                  */
1509                 if (so->so_downcalls->sd_recv_uio != NULL)
1510                         return (-1);
1511 
1512                 retval = (so->so_state & SS_RCVATMARK) != 0;
1513 
1514                 if (so_copyout(&retval, (void *)arg, sizeof (int),
1515                     (mode & (int)FKIOCTL))) {
1516                         return (EFAULT);
1517                 }
1518                 return (0);
1519         }
1520 
1521         case FIONREAD: {
1522                 int retval;
1523 
1524                 /*
1525                  * If the protocol is maintaining its own buffer, then the
1526                  * request must be passed down.
1527                  */
1528                 if (so->so_downcalls->sd_recv_uio != NULL)
1529                         return (-1);
1530 
1531                 retval = MIN(so->so_rcv_queued, INT_MAX);
1532 
1533                 if (so_copyout(&retval, (void *)arg,
1534                     sizeof (retval), (mode & (int)FKIOCTL))) {
1535                         return (EFAULT);
1536                 }
1537                 return (0);
1538         }
1539 
1540         case _I_GETPEERCRED: {
1541                 int error = 0;
1542 
1543                 if ((mode & FKIOCTL) == 0)
1544                         return (EINVAL);
1545 
1546                 mutex_enter(&so->so_lock);
1547                 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
1548                         error = ENOTSUP;
1549                 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
1550                         error = ENOTCONN;
1551                 } else if (so->so_peercred != NULL) {
1552                         k_peercred_t *kp = (k_peercred_t *)arg;
1553                         kp->pc_cr = so->so_peercred;
1554                         kp->pc_cpid = so->so_cpid;
1555                         crhold(so->so_peercred);
1556                 } else {
1557                         error = EINVAL;
1558                 }
1559                 mutex_exit(&so->so_lock);
1560                 return (error);
1561         }
1562         default:
1563                 return (-1);
1564         }
1565 }
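
     /*
      * For illustration, the userland view of the ioctls handled above
      * (a sketch; "fd" is assumed to be an open socket descriptor):
      *
      *      int on = 1;
      *      pid_t owner = getpid();
      *
      *      (void) ioctl(fd, FIONBIO, &on);         -- mark non-blocking
      *      (void) ioctl(fd, FIOASYNC, &on);        -- enable SIGIO
      *      (void) ioctl(fd, FIOSETOWN, &owner);    -- SIGIO/SIGURG target
      */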
1566 
1567 /*
1568  * Handle the I_NREAD STREAM ioctl.
1569  */
1570 static int
1571 so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
1572 {
1573         size_t size = 0;
1574         int retval;
1575         int count = 0;
1576         mblk_t *mp;
1577         clock_t wakeup = drv_usectohz(10);
1578 
1579         if (so->so_downcalls == NULL ||
1580             so->so_downcalls->sd_recv_uio != NULL)
1581                 return (EINVAL);
1582 
1583         mutex_enter(&so->so_lock);
1584         /* Wait for reader to get out of the way. */
1585         while (so->so_flag & SOREADLOCKED) {
1586                 /*
1587                  * If the reader is waiting for data, then there should be
1588                  * nothing on the rcv queue.
1589                  */
1590                 if (so->so_rcv_wakeup)
1591                         goto out;
1592 
1593                 /* Do a timed sleep, in case the reader goes to sleep. */
1594                 (void) cv_reltimedwait(&so->so_read_cv, &so->so_lock, wakeup,
1595                     TR_CLOCK_TICK);
1596         }
1597 
1598         /*
1599          * Since we are holding so_lock, no new reader will come in, and the
1600          * protocol will not be able to enqueue data. So it's safe to walk
1601          * both rcv queues.
1602          */
1603         mp = so->so_rcv_q_head;
1604         if (mp != NULL) {
1605                 size = msgdsize(so->so_rcv_q_head);
1606                 for (; mp != NULL; mp = mp->b_next)
1607                         count++;
1608         } else {
1609                 /*
1610                  * In case the processing list was empty, get the size of the
1611                  * next msg in line.
1612                  */
1613                 size = msgdsize(so->so_rcv_head);
1614         }
1615 
1616         for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
1617                 count++;
1618 out:
1619         mutex_exit(&so->so_lock);
1620 
1621         /*
1622          * Drop down from size_t to the "int" required by the
1623          * interface.  Cap at INT_MAX.
1624          */
1625         retval = MIN(size, INT_MAX);
1626         if (so_copyout(&retval, (void *)arg, sizeof (retval),
1627             (mode & (int)FKIOCTL))) {
1628                 return (EFAULT);
1629         } else {
1630                 *rvalp = count;
1631                 return (0);
1632         }
1633 }
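
     /*
      * The userland view of the above (a sketch): I_NREAD yields the
      * number of queued messages as the ioctl return value, and the
      * number of data bytes in the first message through "arg":
      *
      *      int first_msg_bytes;
      *      int nmsgs = ioctl(fd, I_NREAD, &first_msg_bytes);
      */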
1634 
1635 /*
1636  * Process STREAM ioctls.
1637  *
1638  * Returns:
1639  *   < 0  - ioctl was not handled
1640  *  >= 0  - ioctl was handled; if > 0, it is an errno
1641  */
1642 int
1643 socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1644     struct cred *cr, int32_t *rvalp)
1645 {
1646         int retval;
1647 
1648         /* Only STREAM ioctls are handled here */
1649         if ((cmd & 0xffffff00U) != STR)
1650                 return (-1);
1651 
1652         switch (cmd) {
1653         case I_CANPUT:
1654                 /*
1655                  * We return an error for I_CANPUT so that isastream(3C) will
1656                  * not report the socket as being a STREAM.
1657                  */
1658                 return (EOPNOTSUPP);
1659         case I_NREAD:
1660                 /* Avoid doing a fallback for I_NREAD. */
1661                 return (so_strioc_nread(so, arg, mode, rvalp));
1662         case I_LOOK:
1663                 /* Avoid doing a fallback for I_LOOK. */
1664                 if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
1665                     (mode & (int)FKIOCTL))) {
1666                         return (EFAULT);
1667                 }
1668                 return (0);
1669         default:
1670                 break;
1671         }
1672 
1673         /*
1674          * Try to fall back to TPI, and if successful, reissue the ioctl.
1675          */
1676         if ((retval = so_tpi_fallback(so, cr)) == 0) {
1677                 /* Reissue the ioctl */
1678                 ASSERT(so->so_rcv_q_head == NULL);
1679                 return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
1680         } else {
1681                 return (retval);
1682         }
1683 }
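
     /*
      * Since I_CANPUT fails above, isastream(3C), which probes with
      * I_CANPUT, will report a socket that has not fallen back as not
      * being a STREAM:
      *
      *      if (isastream(fd) == 0)
      *              ... fd is a socket, not a STREAM ...
      */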
1684 
1685 /*
1686  * This is called for all socket types to verify that the buffer size is large
1687  * enough for the option and, where possible, to handle the request as well.
1688  * Most options will be forwarded to the protocol.
1689  */
1690 int
1691 socket_getopt_common(struct sonode *so, int level, int option_name,
1692     void *optval, socklen_t *optlenp, int flags)
1693 {
1694         if (level != SOL_SOCKET)
1695                 return (-1);
1696 
1697         switch (option_name) {
1698         case SO_ERROR:
1699         case SO_DOMAIN:
1700         case SO_TYPE:
1701         case SO_ACCEPTCONN: {
1702                 int32_t value;
1703                 socklen_t optlen = *optlenp;
1704 
1705                 if (optlen < (t_uscalar_t)sizeof (int32_t)) {
1706                         return (EINVAL);
1707                 }
1708 
1709                 switch (option_name) {
1710                 case SO_ERROR:
1711                         mutex_enter(&so->so_lock);
1712                         value = sogeterr(so, B_TRUE);
1713                         mutex_exit(&so->so_lock);
1714                         break;
1715                 case SO_DOMAIN:
1716                         value = so->so_family;
1717                         break;
1718                 case SO_TYPE:
1719                         value = so->so_type;
1720                         break;
1721                 case SO_ACCEPTCONN:
1722                         if (so->so_state & SS_ACCEPTCONN)
1723                                 value = SO_ACCEPTCONN;
1724                         else
1725                                 value = 0;
1726                         break;
1727                 }
1728 
1729                 bcopy(&value, optval, sizeof (value));
1730                 *optlenp = sizeof (value);
1731 
1732                 return (0);
1733         }
1734         case SO_SNDTIMEO:
1735         case SO_RCVTIMEO: {
1736                 clock_t value;
1737                 socklen_t optlen = *optlenp;
1738 
1739                 if (get_udatamodel() == DATAMODEL_NONE ||
1740                     get_udatamodel() == DATAMODEL_NATIVE) {
1741                         if (optlen < sizeof (struct timeval))
1742                                 return (EINVAL);
1743                 } else {
1744                         if (optlen < sizeof (struct timeval32))
1745                                 return (EINVAL);
1746                 }
1747                 if (option_name == SO_RCVTIMEO)
1748                         value = drv_hztousec(so->so_rcvtimeo);
1749                 else
1750                         value = drv_hztousec(so->so_sndtimeo);
1751 
1752                 if (get_udatamodel() == DATAMODEL_NONE ||
1753                     get_udatamodel() == DATAMODEL_NATIVE) {
1754                         ((struct timeval *)(optval))->tv_sec =
1755                             value / (1000 * 1000);
1756                         ((struct timeval *)(optval))->tv_usec =
1757                             value % (1000 * 1000);
1758                         *optlenp = sizeof (struct timeval);
1759                 } else {
1760                         ((struct timeval32 *)(optval))->tv_sec =
1761                             value / (1000 * 1000);
1762                         ((struct timeval32 *)(optval))->tv_usec =
1763                             value % (1000 * 1000);
1764                         *optlenp = sizeof (struct timeval32);
1765                 }
1766                 return (0);
1767         }
1768         case SO_DEBUG:
1769         case SO_REUSEADDR:
1770         case SO_KEEPALIVE:
1771         case SO_DONTROUTE:
1772         case SO_BROADCAST:
1773         case SO_USELOOPBACK:
1774         case SO_OOBINLINE:
1775         case SO_SNDBUF:
1776 #ifdef notyet
1777         case SO_SNDLOWAT:
1778         case SO_RCVLOWAT:
1779 #endif /* notyet */
1780         case SO_DGRAM_ERRIND: {
1781                 socklen_t optlen = *optlenp;
1782 
1783                 if (optlen < (t_uscalar_t)sizeof (int32_t))
1784                         return (EINVAL);
1785                 break;
1786         }
1787         case SO_RCVBUF: {
1788                 socklen_t optlen = *optlenp;
1789 
1790                 if (optlen < (t_uscalar_t)sizeof (int32_t))
1791                         return (EINVAL);
1792 
1793                 if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
1794                         /*
1795                          * XXX If SO_RCVBUF has been set and this is an
1796                          * XPG 4.2 application then do not ask the transport
1797                          * since the transport might adjust the value and not
1798                          * return exactly what was set by the application.
1799                          * For non-XPG 4.2 application we return the value
1800                          * that the transport is actually using.
1801                          */
1802                         *(int32_t *)optval = so->so_xpg_rcvbuf;
1803                         *optlenp = sizeof (so->so_xpg_rcvbuf);
1804                         return (0);
1805                 }
1806                 /*
1807                  * If the option has not been set then get a default
1808                  * value from the transport.
1809                  */
1810                 break;
1811         }
1812         case SO_LINGER: {
1813                 socklen_t optlen = *optlenp;
1814 
1815                 if (optlen < (t_uscalar_t)sizeof (struct linger))
1816                         return (EINVAL);
1817                 break;
1818         }
1819         case SO_SND_BUFINFO: {
1820                 socklen_t optlen = *optlenp;
1821 
1822                 if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
1823                         return (EINVAL);
1824                 ((struct so_snd_bufinfo *)(optval))->sbi_wroff =
1825                     (so->so_proto_props).sopp_wroff;
1826                 ((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
1827                     (so->so_proto_props).sopp_maxblk;
1828                 ((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
1829                     (so->so_proto_props).sopp_maxpsz;
1830                 ((struct so_snd_bufinfo *)(optval))->sbi_tail =
1831                     (so->so_proto_props).sopp_tail;
1832                 *optlenp = sizeof (struct so_snd_bufinfo);
1833                 return (0);
1834         }
1835         case SO_SND_COPYAVOID: {
1836                 sof_instance_t *inst;
1837 
1838                 /*
1839                  * Avoid zero-copy if there is a filter with a data_out
1840                  * callback. We could let the operation succeed, but then
1841                  * the filter would have to copy the data anyway.
1842                  */
1843                 for (inst = so->so_filter_top; inst != NULL;
1844                     inst = inst->sofi_next) {
1845                         if (SOF_INTERESTED(inst, data_out))
1846                                 return (EOPNOTSUPP);
1847                 }
1848                 break;
1849         }
1850 
1851         default:
1852                 break;
1853         }
1854 
1855         /* Unknown Option */
1856         return (-1);
1857 }
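
     /*
      * Userland sketch of the options that are satisfied entirely at
      * this level ("fd" is assumed to be an open socket descriptor):
      *
      *      int type, soerr;
      *      socklen_t len = sizeof (int);
      *
      *      (void) getsockopt(fd, SOL_SOCKET, SO_TYPE, &type, &len);
      *      (void) getsockopt(fd, SOL_SOCKET, SO_ERROR, &soerr, &len);
      */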
1858 
1859 void
1860 socket_sonode_destroy(struct sonode *so)
1861 {
1862         sonode_fini(so);
1863         kmem_cache_free(socket_cache, so);
1864 }
1865 
1866 int
1867 so_zcopy_wait(struct sonode *so)
1868 {
1869         int error = 0;
1870 
1871         mutex_enter(&so->so_lock);
1872         while (!(so->so_copyflag & STZCNOTIFY)) {
1873                 if (so->so_state & SS_CLOSING) {
1874                         mutex_exit(&so->so_lock);
1875                         return (EINTR);
1876                 }
1877                 if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
1878                         error = EINTR;
1879                         break;
1880                 }
1881         }
1882         so->so_copyflag &= ~STZCNOTIFY;
1883         mutex_exit(&so->so_lock);
1884         return (error);
1885 }
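
     /*
      * A sketch of what the notifying side must do to release a waiter
      * (an assumption derived from the wait loop above):
      *
      *      mutex_enter(&so->so_lock);
      *      so->so_copyflag |= STZCNOTIFY;
      *      cv_signal(&so->so_copy_cv);
      *      mutex_exit(&so->so_lock);
      */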
1886 
1887 void
1888 so_timer_callback(void *arg)
1889 {
1890         struct sonode *so = (struct sonode *)arg;
1891 
1892         mutex_enter(&so->so_lock);
1893 
1894         so->so_rcv_timer_tid = 0;
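             /*
              * Note that so_notify_data() is expected to drop so_lock;
              * hence only the else path below releases the lock
              * explicitly.
              */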
1895         if (so->so_rcv_queued > 0) {
1896                 so_notify_data(so, so->so_rcv_queued);
1897         } else {
1898                 mutex_exit(&so->so_lock);
1899         }
1900 }
1901 
1902 #ifdef DEBUG
1903 /*
1904  * Verify that the length stored in so_rcv_queued matches the length of
1905  * the data blocks queued.
1906  */
1907 static boolean_t
1908 so_check_length(sonode_t *so)
1909 {
1910         mblk_t *mp = so->so_rcv_q_head;
1911         int len = 0;
1912 
1913         ASSERT(MUTEX_HELD(&so->so_lock));
1914 
1915         if (mp != NULL) {
1916                 len = msgdsize(mp);
1917                 while ((mp = mp->b_next) != NULL)
1918                         len += msgdsize(mp);
1919         }
1920         mp = so->so_rcv_head;
1921         if (mp != NULL) {
1922                 len += msgdsize(mp);
1923                 while ((mp = mp->b_next) != NULL)
1924                         len += msgdsize(mp);
1925         }
1926         return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
1927 }
1928 #endif
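
     /*
      * A typical use of the check above (a sketch, gated by the
      * so_debug_length tunable and called with so_lock held):
      *
      *      ASSERT(!so_debug_length || so_check_length(so));
      */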
1929 
1930 int
1931 so_get_mod_version(struct sockparams *sp)
1932 {
1933         ASSERT(sp != NULL && sp->sp_smod_info != NULL);
1934         return (sp->sp_smod_info->smod_version);
1935 }
1936 
1937 /*
1938  * so_start_fallback()
1939  *
1940  * Block new socket operations from coming in, and wait for active operations
1941  * to complete. Threads that are sleeping will be woken up so they can get
1942  * out of the way.
1943  *
1944  * The caller must be a reader on so_fallback_rwlock.
1945  */
1946 static boolean_t
1947 so_start_fallback(struct sonode *so)
1948 {
1949         ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
1950 
1951         mutex_enter(&so->so_lock);
1952         if (so->so_state & SS_FALLBACK_PENDING) {
1953                 mutex_exit(&so->so_lock);
1954                 return (B_FALSE);
1955         }
1956         so->so_state |= SS_FALLBACK_PENDING;
1957         /*
1958          * Poke all threads that might be sleeping. Any operation that comes
1959          * in after the cv_broadcast will observe the fallback pending flag,
1960          * which causes the call to return where it would normally sleep.
1961          */
1962         cv_broadcast(&so->so_state_cv);          /* threads in connect() */
1963         cv_broadcast(&so->so_rcv_cv);            /* threads in recvmsg() */
1964         cv_broadcast(&so->so_snd_cv);            /* threads in sendmsg() */
1965         mutex_enter(&so->so_acceptq_lock);
1966         cv_broadcast(&so->so_acceptq_cv);        /* threads in accept() */
1967         mutex_exit(&so->so_acceptq_lock);
1968         mutex_exit(&so->so_lock);
1969 
1970         /*
1971          * The main reason for the rw_tryupgrade call is to provide
1972          * observability during the fallback process. We want to
1973          * be able to see if there are pending operations.
1974          */
1975         if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
1976                 /*
1977                  * It is safe to drop and reacquire the fallback lock, because
1978                  * we are guaranteed that another fallback cannot take place.
1979                  */
1980                 rw_exit(&so->so_fallback_rwlock);
1981                 DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
1982                 rw_enter(&so->so_fallback_rwlock, RW_WRITER);
1983                 DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
1984         }
1985 
1986         return (B_TRUE);
1987 }
1988 
1989 /*
1990  * so_end_fallback()
1991  *
1992  * Allow socket operations back in.
1993  *
1994  * The caller must be a writer on so_fallback_rwlock.
1995  */
1996 static void
1997 so_end_fallback(struct sonode *so)
1998 {
1999         ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
2000 
2001         mutex_enter(&so->so_lock);
2002         so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
2003         mutex_exit(&so->so_lock);
2004 
2005         rw_downgrade(&so->so_fallback_rwlock);
2006 }
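
     /*
      * Taken together, a fallback is bracketed roughly like this (a
      * sketch; the read lock is taken by the generic entry point that
      * ends up calling so_tpi_fallback()):
      *
      *      rw_enter(&so->so_fallback_rwlock, RW_READER);
      *      if (so_start_fallback(so)) {            -- upgraded to writer
      *              ... quiesce, sync state, move data ...
      *              so_end_fallback(so);            -- downgrades to reader
      *      }
      *      rw_exit(&so->so_fallback_rwlock);
      */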
2007 
2008 /*
2009  * so_quiesced_cb()
2010  *
2011  * Callback passed to the protocol during fallback. It is called once
2012  * the endpoint is quiescent.
2013  *
2014  * No requests from the user, no notifications from the protocol, so it
2015  * is safe to synchronize the state. Data can also be moved without
2016  * risk for reordering.
2017  *
2018  * We do not need to hold so_lock, since there can be only one thread
2019  * operating on the sonode.
2020  */
2021 static mblk_t *
2022 so_quiesced_cb(sock_upper_handle_t sock_handle, sock_quiesce_arg_t *arg,
2023     struct T_capability_ack *tcap,
2024     struct sockaddr *laddr, socklen_t laddrlen,
2025     struct sockaddr *faddr, socklen_t faddrlen, short opts)
2026 {
2027         struct sonode *so = (struct sonode *)sock_handle;
2028         boolean_t atmark;
2029         mblk_t *retmp = NULL, **tailmpp = &retmp;
2030 
2031         if (tcap != NULL)
2032                 sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen,
2033                     opts);
2034 
2035         /*
2036          * Some protocols do not quiesce the data path during fallback. Once
2037          * we set the SS_FALLBACK_DRAIN flag, any attempt to queue data will
2038          * fail and the protocol is responsible for saving the data for later
2039          * delivery (i.e., once the fallback has completed).
2040          */
2041         mutex_enter(&so->so_lock);
2042         so->so_state |= SS_FALLBACK_DRAIN;
2043         SOCKET_TIMER_CANCEL(so);
2044         mutex_exit(&so->so_lock);
2045 
2046         if (so->so_rcv_head != NULL) {
2047                 if (so->so_rcv_q_last_head == NULL)
2048                         so->so_rcv_q_head = so->so_rcv_head;
2049                 else
2050                         so->so_rcv_q_last_head->b_next = so->so_rcv_head;
2051                 so->so_rcv_q_last_head = so->so_rcv_last_head;
2052         }
2053 
2054         atmark = (so->so_state & SS_RCVATMARK) != 0;
2055         /*
2056          * Clear any OOB state having to do with pending data. The TPI
2057          * code path will set the appropriate oob state when we move the
2058          * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
2059          * data has already been consumed.
2060          */
2061         so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);
2062 
2063         ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);
2064 
2065         /*
2066          * Move data to the STREAM head.
2067          */
2068         while (so->so_rcv_q_head != NULL) {
2069                 mblk_t *mp = so->so_rcv_q_head;
2070                 size_t mlen = msgdsize(mp);
2071 
2072                 so->so_rcv_q_head = mp->b_next;
2073                 mp->b_next = NULL;
2074                 mp->b_prev = NULL;
2075 
2076                 /*
2077                  * Send T_EXDATA_IND if we are at the oob mark.
2078                  */
2079                 if (atmark) {
2080                         struct T_exdata_ind *tei;
2081                         mblk_t *mp1 = arg->soqa_exdata_mp;
2082 
2083                         arg->soqa_exdata_mp = NULL;
2084                         ASSERT(mp1 != NULL);
2085                         mp1->b_datap->db_type = M_PROTO;
2086                         tei = (struct T_exdata_ind *)mp1->b_rptr;
2087                         tei->PRIM_type = T_EXDATA_IND;
2088                         tei->MORE_flag = 0;
2089                         mp1->b_wptr = (uchar_t *)&tei[1];
2090 
2091                         if (IS_SO_OOB_INLINE(so)) {
2092                                 mp1->b_cont = mp;
2093                         } else {
2094                                 ASSERT(so->so_oobmsg != NULL);
2095                                 mp1->b_cont = so->so_oobmsg;
2096                                 so->so_oobmsg = NULL;
2097 
2098                                 /* process current mp next time around */
2099                                 mp->b_next = so->so_rcv_q_head;
2100                                 so->so_rcv_q_head = mp;
2101                                 mlen = 0;
2102                         }
2103                         mp = mp1;
2104 
2105                         /* we have consumed the oob mark */
2106                         atmark = B_FALSE;
2107                 } else if (so->so_oobmark > 0) {
2108                         /*
2109                          * Check if the OOB mark is within the current
2110                          * mblk chain. In that case we have to split it up.
2111                          */
2112                         if (so->so_oobmark < mlen) {
2113                                 mblk_t *urg_mp = mp;
2114 
2115                                 atmark = B_TRUE;
2116                                 mp = NULL;
2117                                 mlen = so->so_oobmark;
2118 
2119                                 /*
2120                                  * It is assumed that the OOB mark does
2121                                  * not land within an mblk.
2122                                  */
2123                                 do {
2124                                         so->so_oobmark -= MBLKL(urg_mp);
2125                                         mp = urg_mp;
2126                                         urg_mp = urg_mp->b_cont;
2127                                 } while (so->so_oobmark > 0);
2128                                 mp->b_cont = NULL;
2129                                 if (urg_mp != NULL) {
2130                                         urg_mp->b_next = so->so_rcv_q_head;
2131                                         so->so_rcv_q_head = urg_mp;
2132                                 }
2133                         } else {
2134                                 so->so_oobmark -= mlen;
2135                                 if (so->so_oobmark == 0)
2136                                         atmark = B_TRUE;
2137                         }
2138                 }
2139 
2140                 /*
2141                  * Queue data on the STREAM head.
2142                  */
2143                 so->so_rcv_queued -= mlen;
2144                 *tailmpp = mp;
2145                 tailmpp = &mp->b_next;
2146         }
2147         so->so_rcv_head = NULL;
2148         so->so_rcv_last_head = NULL;
2149         so->so_rcv_q_head = NULL;
2150         so->so_rcv_q_last_head = NULL;
2151 
2152         /*
2153          * Check if the oob byte is at the end of the data stream, or if the
2154          * oob byte has not yet arrived. In the latter case we have to send a
2155          * SIGURG and a mark indicator to the STREAM head. The mark indicator
2156          * is needed to guarantee correct behavior for SIOCATMARK. See block
2157          * comment in socktpi.h for more details.
2158          */
2159         if (atmark || so->so_oobmark > 0) {
2160                 mblk_t *mp;
2161 
2162                 if (atmark && so->so_oobmsg != NULL) {
2163                         struct T_exdata_ind *tei;
2164 
2165                         mp = arg->soqa_exdata_mp;
2166                         arg->soqa_exdata_mp = NULL;
2167                         ASSERT(mp != NULL);
2168                         mp->b_datap->db_type = M_PROTO;
2169                         tei = (struct T_exdata_ind *)mp->b_rptr;
2170                         tei->PRIM_type = T_EXDATA_IND;
2171                         tei->MORE_flag = 0;
2172                         mp->b_wptr = (uchar_t *)&tei[1];
2173 
2174                         mp->b_cont = so->so_oobmsg;
2175                         so->so_oobmsg = NULL;
2176 
2177                         *tailmpp = mp;
2178                         tailmpp = &mp->b_next;
2179                 } else {
2180                         /* Send up the signal */
2181                         mp = arg->soqa_exdata_mp;
2182                         arg->soqa_exdata_mp = NULL;
2183                         ASSERT(mp != NULL);
2184                         DB_TYPE(mp) = M_PCSIG;
2185                         *mp->b_wptr++ = (uchar_t)SIGURG;
2186                         *tailmpp = mp;
2187                         tailmpp = &mp->b_next;
2188 
2189                         /* Send up the mark indicator */
2190                         mp = arg->soqa_urgmark_mp;
2191                         arg->soqa_urgmark_mp = NULL;
2192                         mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
2193                         *tailmpp = mp;
2194                         tailmpp = &mp->b_next;
2195 
2196                         so->so_oobmark = 0;
2197                 }
2198         }
2199         ASSERT(so->so_oobmark == 0);
2200         ASSERT(so->so_rcv_queued == 0);
2201 
2202         return (retmp);
2203 }
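
     /*
      * The chain handed back to the caller therefore looks roughly like
      * this (a sketch of the worst case, with urgent data still
      * pending), linked via b_next and ready for the STREAM head:
      *
      *      data ... T_EXDATA_IND(+oob) ... data ... M_PCSIG(SIGURG),
      *          zero-length mark mblk (MSGMARKNEXT or MSGNOTMARKNEXT)
      */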
2204 
2205 #ifdef DEBUG
2206 /*
2207  * Do an integrity check of the sonode. This should be done if a
2208  * fallback fails after the sonode has initially been converted to use
2209  * TPI and subsequently has to be reverted.
2210  *
2211  * Failure to pass the integrity check will panic the system.
2212  */
2213 void
2214 so_integrity_check(struct sonode *cur, struct sonode *orig)
2215 {
2216         VERIFY(cur->so_vnode == orig->so_vnode);
2217         VERIFY(cur->so_ops == orig->so_ops);
2218         /*
2219          * For so_state we can only VERIFY the state flags in CHECK_STATE.
2220          * The other state flags might be affected by a notification from the
2221          * protocol.
2222          */
2223 #define CHECK_STATE     (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
2224         SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
2225         SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
2226         VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
2227             (orig->so_state & CHECK_STATE));
2228         VERIFY(cur->so_mode == orig->so_mode);
2229         VERIFY(cur->so_flag == orig->so_flag);
2230         VERIFY(cur->so_count == orig->so_count);
2231         /* Cannot VERIFY so_proto_connid; proto can update it */
2232         VERIFY(cur->so_sockparams == orig->so_sockparams);
2233         /* an error might have been recorded, but it cannot be lost */
2234         VERIFY(cur->so_error != 0 || orig->so_error == 0);
2235         VERIFY(cur->so_family == orig->so_family);
2236         VERIFY(cur->so_type == orig->so_type);
2237         VERIFY(cur->so_protocol == orig->so_protocol);
2238         VERIFY(cur->so_version == orig->so_version);
2239         /* New conns might have arrived, but none should have been lost */
2240         VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
2241         VERIFY(list_head(&cur->so_acceptq_list) ==
2242             list_head(&orig->so_acceptq_list));
2243         VERIFY(cur->so_backlog == orig->so_backlog);
2244         /* New OOB might have arrived, but the mark should not have been lost */
2245         VERIFY(cur->so_oobmark >= orig->so_oobmark);
2246         /* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
2247         VERIFY(cur->so_pgrp == orig->so_pgrp);
2248         VERIFY(cur->so_peercred == orig->so_peercred);
2249         VERIFY(cur->so_cpid == orig->so_cpid);
2250         VERIFY(cur->so_zoneid == orig->so_zoneid);
2251         /* New data might have arrived, but none should have been lost */
2252         VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
2253         VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
2254         VERIFY(cur->so_rcv_head == orig->so_rcv_head);
2255         VERIFY(cur->so_proto_handle == orig->so_proto_handle);
2256         VERIFY(cur->so_downcalls == orig->so_downcalls);
2257         /* Cannot VERIFY so_proto_props; they can be updated by proto */
2258 }
2259 #endif
2260 
2261 /*
2262  * so_tpi_fallback()
2263  *
2264  * This is the fallback initiation routine; things start here.
2265  *
2266  * Basic strategy:
2267  *   o Block new socket operations from coming in
2268  *   o Allocate/initialize info needed by TPI
2269  *   o Quiesce the connection, at which point we sync
2270  *     state and move data
2271  *   o Change operations (sonodeops) associated with the socket
2272  *   o Unblock threads waiting for the fallback to finish
2273  */
2274 int
2275 so_tpi_fallback(struct sonode *so, struct cred *cr)
2276 {
2277         int error;
2278         queue_t *q;
2279         struct sockparams *sp;
2280         struct sockparams *newsp = NULL;
2281         so_proto_fallback_func_t fbfunc;
2282         const char *devpath;
2283         boolean_t direct;
2284         struct sonode *nso;
2285         sock_quiesce_arg_t arg = { NULL, NULL };
2286 #ifdef DEBUG
2287         struct sonode origso;
2288 #endif
2289         error = 0;
2290         sp = so->so_sockparams;
2291         fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
2292 
2293         /*
2294          * Cannot fall back if the socket has active filters
2295          */
2296         if (so->so_filter_active > 0)
2297                 return (EINVAL);
2298 
2299         switch (so->so_family) {
2300         case AF_INET:
2301                 devpath = sp->sp_smod_info->smod_fallback_devpath_v4;
2302                 break;
2303         case AF_INET6:
2304                 devpath = sp->sp_smod_info->smod_fallback_devpath_v6;
2305                 break;
2306         default:
2307                 return (EINVAL);
2308         }
2309 
2310         /*
2311          * Fallback can only happen if the socket module has a TPI device
2312          * and fallback function.
2313          */
2314         if (devpath == NULL || fbfunc == NULL)
2315                 return (EINVAL);
2316 
2317         /*
2318          * Initiate fallback; upon success we know that no new requests
2319          * will come in from the user.
2320          */
2321         if (!so_start_fallback(so))
2322                 return (EAGAIN);
2323 #ifdef DEBUG
2324         /*
2325          * Make a copy of the sonode in case we need to make an integrity
2326          * check later on.
2327          */
2328         bcopy(so, &origso, sizeof (*so));
2329 #endif
2330 
2331         sp->sp_stats.sps_nfallback.value.ui64++;
2332 
2333         newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
2334             so->so_protocol, devpath, KM_SLEEP, &error);
2335         if (error != 0)
2336                 goto out;
2337 
2338         if (so->so_direct != NULL) {
2339                 sodirect_t *sodp = so->so_direct;
2340                 mutex_enter(&so->so_lock);
2341 
2342                 so->so_direct->sod_enabled = B_FALSE;
2343                 so->so_state &= ~SS_SODIRECT;
2344                 ASSERT(sodp->sod_uioafh == NULL);
2345                 mutex_exit(&so->so_lock);
2346         }
2347 
2348         /* Turn sonode into a TPI socket */
2349         error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
2350         if (error != 0)
2351                 goto out;
2352         /*
2353          * When it comes to urgent data we have two cases to deal with:
2354          * (1) the oob byte has already arrived, or (2) the protocol has
2355          * notified us that oob data is pending, but it has not yet arrived.
2356          *
2357          * For (1) all we need to do is send a T_EXDATA_IND to indicate where
2358          * in the byte stream the oob byte is. For (2) we have to send a
2359          * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
2360          * the oob byte will be the next byte from the protocol.
2361          *
2362          * So in the worst case we need two mblks, one for the signal, another
2363          * for mark indication. In that case we use the exdata_mp for the sig.
2364          */
2365         arg.soqa_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind),
2366             BPRI_MED, STR_NOSIG, NULL);
2367         arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
2368 
2369         /*
2370          * Now tell the protocol to start using TPI. so_quiesced_cb will be
2371          * called once it is safe to synchronize state.
2372          */
2373         DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
2374         error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb,
2375             &arg);
2376         DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
2377 
2378         if (error != 0) {
2379                 /* protocol was unable to do a fallback, revert the sonode */
2380                 sotpi_revert_sonode(so, cr);
2381                 goto out;
2382         }
2383 
2384         /*
2385          * Walk the accept queue, notifying each queued socket's proto to
2386          * fall back to TPI. The protocol will send up the T_CONN_IND.
2387          */
2388         nso = list_head(&so->so_acceptq_list);
2389         while (nso != NULL) {
2390                 int rval;
2391                 struct sonode *next;
2392 
2393                 if (arg.soqa_exdata_mp == NULL) {
2394                         arg.soqa_exdata_mp =
2395                             allocb_wait(sizeof (struct T_exdata_ind),
2396                             BPRI_MED, STR_NOSIG, NULL);
2397                 }
2398                 if (arg.soqa_urgmark_mp == NULL) {
2399                         arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED,
2400                             STR_NOSIG, NULL);
2401                 }
2402 
2403                 DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
2404                 rval = (*fbfunc)(nso->so_proto_handle, NULL, direct,
2405                     so_quiesced_cb, &arg);
2406                 DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
2407                 if (rval != 0) {
2408                         /* Abort the connection */
2409                         zcmn_err(getzoneid(), CE_WARN,
2410                             "Failed to convert socket in accept queue to TPI. "
2411                             "Pid = %d\n", curproc->p_pid);
2412                         next = list_next(&so->so_acceptq_list, nso);
2413                         list_remove(&so->so_acceptq_list, nso);
2414                         so->so_acceptq_len--;
2415 
2416                         (void) socket_close(nso, 0, CRED());
2417                         socket_destroy(nso);
2418                         nso = next;
2419                 } else {
2420                         nso = list_next(&so->so_acceptq_list, nso);
2421                 }
2422         }
2423 
2424         /*
2425          * Now flush the acceptq; this will destroy all sockets. They will
2426          * be recreated in sotpi_accept().
2427          */
2428         so_acceptq_flush(so, B_FALSE);
2429 
2430         mutex_enter(&so->so_lock);
2431         so->so_state |= SS_FALLBACK_COMP;
2432         mutex_exit(&so->so_lock);
2433 
2434         /*
2435          * Swap the sonode ops. Socket operations that come in once this
2436          * is done will proceed without blocking.
2437          */
2438         so->so_ops = &sotpi_sonodeops;
2439 
2440         /*
2441          * Wake up any threads stuck in poll. This is needed since the poll
2442          * head changes when the fallback happens (moves from the sonode to
2443          * the STREAM head).
2444          */
2445         pollwakeup(&so->so_poll_list, POLLERR);
2446 
2447         /*
2448          * When this non-STREAM socket was created we placed an extra ref on
2449          * the associated vnode to support asynchronous close. Drop that ref
2450          * here.
2451          */
2452         ASSERT(SOTOV(so)->v_count >= 2);
2453         VN_RELE(SOTOV(so));
2454 out:
2455         so_end_fallback(so);
2456 
2457         if (error != 0) {
2458 #ifdef DEBUG
2459                 so_integrity_check(so, &origso);
2460 #endif
2461                 zcmn_err(getzoneid(), CE_WARN,
2462                     "Failed to convert socket to TPI (err=%d). Pid = %d\n",
2463                     error, curproc->p_pid);
2464                 if (newsp != NULL)
2465                         SOCKPARAMS_DEC_REF(newsp);
2466         }
2467         if (arg.soqa_exdata_mp != NULL)
2468                 freemsg(arg.soqa_exdata_mp);
2469         if (arg.soqa_urgmark_mp != NULL)
2470                 freemsg(arg.soqa_urgmark_mp);
2471 
2472         return (error);
2473 }