1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /*
  26  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  27  */
  28 
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/cmn_err.h>

#include <sys/stropts.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/atomic.h>
#include <sys/tihdr.h>

#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/sodirect.h>
#include <sys/ddi.h>
#include <inet/ip.h>
#include <sys/time.h>
#include <sys/cmn_err.h>
  51 
#ifdef SOCK_TEST
/* Test-only knobs, defined elsewhere; declared only in SOCK_TEST builds. */
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

/*
 * so_mblk_pull_len is consulted by so_dequeue_msg(): when the first data
 * block of a message holds fewer bytes than this, pullupmsg() is called on
 * the message before it is copied out.  MBLK_PULL_LEN is the initial value.
 */
#define MBLK_PULL_LEN 64
uint32_t so_mblk_pull_len = MBLK_PULL_LEN;

#ifdef DEBUG
/*
 * When set to B_TRUE on a DEBUG kernel, the receive path asserts
 * so_check_length() at several points.  Off by default.
 */
boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
#endif
  64 
  65 static int
  66 so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
  67     struct sonode **nsop)
  68 {
  69         struct sonode *nso = NULL;
  70 
  71         *nsop = NULL;
  72         ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
  73         while ((nso = list_remove_head(&so->so_acceptq_list)) == NULL) {
  74                 /*
  75                  * No need to check so_error here, because it is not
  76                  * possible for a listening socket to be reset or otherwise
  77                  * disconnected.
  78                  *
  79                  * So now we just need check if it's ok to wait.
  80                  */
  81                 if (dontblock)
  82                         return (EWOULDBLOCK);
  83                 if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
  84                         return (EINTR);
  85 
  86                 if (cv_wait_sig_swap(&so->so_acceptq_cv,
  87                     &so->so_acceptq_lock) == 0)
  88                         return (EINTR);
  89         }
  90 
  91         ASSERT(nso != NULL);
  92         ASSERT(so->so_acceptq_len > 0);
  93         so->so_acceptq_len--;
  94         nso->so_listener = NULL;
  95 
  96         *nsop = nso;
  97 
  98         return (0);
  99 }
 100 
 101 /*
 102  * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 103  *
 104  * Pulls a connection off of the accept queue.
 105  *
 106  * Arguments:
 107  *   so        - listening socket
 108  *   dontblock - indicate whether it's ok to sleep if there are no
 109  *               connections on the queue
 110  *   nsop      - Value-return argument
 111  *
 112  * Return values:
 113  *   0 when a connection is successfully dequeued, in which case nsop
 114  *   is set to point to the new connection. Upon failure a non-zero
 115  *   value is returned, and the value of nsop is set to NULL.
 116  *
 117  * Note:
 118  *   so_acceptq_dequeue() may return prematurly if the socket is falling
 119  *   back to TPI.
 120  */
 121 int
 122 so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
 123     struct sonode **nsop)
 124 {
 125         int error;
 126 
 127         mutex_enter(&so->so_acceptq_lock);
 128         error = so_acceptq_dequeue_locked(so, dontblock, nsop);
 129         mutex_exit(&so->so_acceptq_lock);
 130 
 131         return (error);
 132 }
 133 
 134 static void
 135 so_acceptq_flush_impl(struct sonode *so, list_t *list, boolean_t doclose)
 136 {
 137         struct sonode *nso;
 138 
 139         while ((nso = list_remove_head(list)) != NULL) {
 140                 nso->so_listener = NULL;
 141                 if (doclose) {
 142                         (void) socket_close(nso, 0, CRED());
 143                 } else {
 144                         /*
 145                          * Only used for fallback - not possible when filters
 146                          * are present.
 147                          */
 148                         ASSERT(so->so_filter_active == 0);
 149                         /*
 150                          * Since the socket is on the accept queue, there can
 151                          * only be one reference. We drop the reference and
 152                          * just blow off the socket.
 153                          */
 154                         ASSERT(nso->so_count == 1);
 155                         nso->so_count--;
 156                         /* drop the proto ref */
 157                         VN_RELE(SOTOV(nso));
 158                 }
 159                 socket_destroy(nso);
 160         }
 161 }
/*
 * void so_acceptq_flush(struct sonode *so, boolean_t doclose)
 *
 * Removes all pending connections from a listening socket, and
 * frees the associated resources.  Both the regular accept queue
 * (so_acceptq_list) and the deferred list (so_acceptq_defer) are emptied.
 *
 * Arguments
 *   so      - listening socket
 *   doclose - make a close downcall for each socket on the accept queue
 *
 * Return values:
 *   None.
 *
 * Note:
 *   The caller has to ensure that no calls to so_acceptq_enqueue() or
 *   so_acceptq_dequeue() occur while the accept queue is being flushed.
 *   So either the socket needs to be in a state where no operations
 *   would come in, or so_lock needs to be obtained.
 */
void
so_acceptq_flush(struct sonode *so, boolean_t doclose)
{
        so_acceptq_flush_impl(so, &so->so_acceptq_list, doclose);
        so_acceptq_flush_impl(so, &so->so_acceptq_defer, doclose);

        /* Both lists are empty now, so reset the queue length. */
        so->so_acceptq_len = 0;
}
 189 
 190 int
 191 so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
 192     sock_connid_t id)
 193 {
 194         ASSERT(MUTEX_HELD(&so->so_lock));
 195 
 196         /*
 197          * The protocol has notified us that a connection attempt is being
 198          * made, so before we wait for a notification to arrive we must
 199          * clear out any errors associated with earlier connection attempts.
 200          */
 201         if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
 202                 so->so_error = 0;
 203 
 204         while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
 205                 if (nonblock)
 206                         return (EINPROGRESS);
 207 
 208                 if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
 209                         return (EINTR);
 210 
 211                 if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
 212                         return (EINTR);
 213         }
 214 
 215         if (so->so_error != 0)
 216                 return (sogeterr(so, B_TRUE));
 217         /*
 218          * Under normal circumstances, so_error should contain an error
 219          * in case the connect failed. However, it is possible for another
 220          * thread to come in a consume the error, so generate a sensible
 221          * error in that case.
 222          */
 223         if ((so->so_state & SS_ISCONNECTED) == 0)
 224                 return (ECONNREFUSED);
 225 
 226         return (0);
 227 }
 228 
 229 /*
 230  * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 231  *    sock_connid_t id)
 232  *
 233  * Wait until the socket is connected or an error has occured.
 234  *
 235  * Arguments:
 236  *   so       - socket
 237  *   nonblock - indicate whether it's ok to sleep if the connection has
 238  *              not yet been established
 239  *   gen      - generation number that was returned by the protocol
 240  *              when the operation was started
 241  *
 242  * Returns:
 243  *   0 if the connection attempt was successful, or an error indicating why
 244  *   the connection attempt failed.
 245  */
 246 int
 247 so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
 248 {
 249         int error;
 250 
 251         mutex_enter(&so->so_lock);
 252         error = so_wait_connected_locked(so, nonblock, id);
 253         mutex_exit(&so->so_lock);
 254 
 255         return (error);
 256 }
 257 
 258 int
 259 so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
 260 {
 261         int error;
 262 
 263         ASSERT(MUTEX_HELD(&so->so_lock));
 264         while (SO_SND_FLOWCTRLD(so)) {
 265                 if (so->so_state & SS_CANTSENDMORE)
 266                         return (EPIPE);
 267                 if (dontblock)
 268                         return (EWOULDBLOCK);
 269 
 270                 if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
 271                         return (EINTR);
 272 
 273                 if (so->so_sndtimeo == 0) {
 274                         /*
 275                          * Zero means disable timeout.
 276                          */
 277                         error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
 278                 } else {
 279                         error = cv_reltimedwait_sig(&so->so_snd_cv,
 280                             &so->so_lock, so->so_sndtimeo, TR_CLOCK_TICK);
 281                 }
 282                 if (error == 0)
 283                         return (EINTR);
 284                 else if (error == -1)
 285                         return (EAGAIN);
 286         }
 287         return (0);
 288 }
 289 
 290 /*
 291  * int so_wait_sendbuf(struct sonode *so, boolean_t dontblock)
 292  *
 293  * Wait for the transport to notify us about send buffers becoming
 294  * available.
 295  */
 296 int
 297 so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 298 {
 299         int error = 0;
 300 
 301         mutex_enter(&so->so_lock);
 302         so->so_snd_wakeup = B_TRUE;
 303         error = so_snd_wait_qnotfull_locked(so, dontblock);
 304         so->so_snd_wakeup = B_FALSE;
 305         mutex_exit(&so->so_lock);
 306 
 307         return (error);
 308 }
 309 
/*
 * Record, under so_lock, that the socket's send queue is full by setting
 * so_snd_qfull.
 */
void
so_snd_qfull(struct sonode *so)
{
        mutex_enter(&so->so_lock);
        so->so_snd_qfull = B_TRUE;
        mutex_exit(&so->so_lock);
}
 317 
/*
 * Clear so_snd_qfull under so_lock and wake every thread sleeping on
 * so_snd_cv, the condition variable so_snd_wait_qnotfull_locked() waits
 * on.
 */
void
so_snd_qnotfull(struct sonode *so)
{
        mutex_enter(&so->so_lock);
        so->so_snd_qfull = B_FALSE;
        /* wake up everyone waiting for buffers */
        cv_broadcast(&so->so_snd_cv);
        mutex_exit(&so->so_lock);
}
 327 
 328 /*
 329  * Change the process/process group to which SIGIO is sent.
 330  */
 331 int
 332 socket_chgpgrp(struct sonode *so, pid_t pid)
 333 {
 334         int error;
 335 
 336         ASSERT(MUTEX_HELD(&so->so_lock));
 337         if (pid != 0) {
 338                 /*
 339                  * Permissions check by sending signal 0.
 340                  * Note that when kill fails it does a
 341                  * set_errno causing the system call to fail.
 342                  */
 343                 error = kill(pid, 0);
 344                 if (error != 0) {
 345                         return (error);
 346                 }
 347         }
 348         so->so_pgrp = pid;
 349         return (0);
 350 }
 351 
 352 
 353 /*
 354  * Generate a SIGIO, for 'writable' events include siginfo structure,
 355  * for read events just send the signal.
 356  */
 357 /*ARGSUSED*/
 358 static void
 359 socket_sigproc(proc_t *proc, int event)
 360 {
 361         k_siginfo_t info;
 362 
 363         ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));
 364 
 365         if (event & SOCKETSIG_WRITE) {
 366                 info.si_signo = SIGPOLL;
 367                 info.si_code = POLL_OUT;
 368                 info.si_errno = 0;
 369                 info.si_fd = 0;
 370                 info.si_band = 0;
 371                 sigaddq(proc, NULL, &info, KM_NOSLEEP);
 372         }
 373         if (event & SOCKETSIG_READ) {
 374                 sigtoproc(proc, NULL, SIGPOLL);
 375         }
 376         if (event & SOCKETSIG_URG) {
 377                 sigtoproc(proc, NULL, SIGURG);
 378         }
 379 }
 380 
 381 void
 382 socket_sendsig(struct sonode *so, int event)
 383 {
 384         proc_t *proc;
 385 
 386         ASSERT(MUTEX_HELD(&so->so_lock));
 387 
 388         if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
 389             event != SOCKETSIG_URG)) {
 390                 return;
 391         }
 392 
 393         dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));
 394 
 395         if (so->so_pgrp > 0) {
 396                 /*
 397                  * XXX This unfortunately still generates
 398                  * a signal when a fd is closed but
 399                  * the proc is active.
 400                  */
 401                 mutex_enter(&pidlock);
 402                 /*
 403                  * Even if the thread started in another zone, we're receiving
 404                  * on behalf of this socket's zone, so find the proc using the
 405                  * socket's zone ID.
 406                  */
 407                 proc = prfind_zone(so->so_pgrp, so->so_zoneid);
 408                 if (proc == NULL) {
 409                         mutex_exit(&pidlock);
 410                         return;
 411                 }
 412                 mutex_enter(&proc->p_lock);
 413                 mutex_exit(&pidlock);
 414                 socket_sigproc(proc, event);
 415                 mutex_exit(&proc->p_lock);
 416         } else {
 417                 /*
 418                  * Send to process group. Hold pidlock across
 419                  * calls to socket_sigproc().
 420                  */
 421                 pid_t pgrp = -so->so_pgrp;
 422 
 423                 mutex_enter(&pidlock);
 424                 /*
 425                  * Even if the thread started in another zone, we're receiving
 426                  * on behalf of this socket's zone, so find the pgrp using the
 427                  * socket's zone ID.
 428                  */
 429                 proc = pgfind_zone(pgrp, so->so_zoneid);
 430                 while (proc != NULL) {
 431                         mutex_enter(&proc->p_lock);
 432                         socket_sigproc(proc, event);
 433                         mutex_exit(&proc->p_lock);
 434                         proc = proc->p_pglink;
 435                 }
 436                 mutex_exit(&pidlock);
 437         }
 438 }
 439 
 440 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 441 /* Copy userdata into a new mblk_t */
 442 mblk_t *
 443 socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
 444     size_t tail_len, int *errorp)
 445 {
 446         mblk_t  *head = NULL, **tail = &head;
 447 
 448         ASSERT(iosize == INFPSZ || iosize > 0);
 449 
 450         if (iosize == INFPSZ || iosize > uiop->uio_resid)
 451                 iosize = uiop->uio_resid;
 452 
 453         if (maxblk == INFPSZ)
 454                 maxblk = iosize;
 455 
 456         /* Nothing to do in these cases, so we're done */
 457         if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
 458                 goto done;
 459 
 460         /*
 461          * We will enter the loop below if iosize is 0; it will allocate an
 462          * empty message block and call uiomove(9F) which will just return.
 463          * We could avoid that with an extra check but would only slow
 464          * down the much more likely case where iosize is larger than 0.
 465          */
 466         do {
 467                 ssize_t blocksize;
 468                 mblk_t  *mp;
 469 
 470                 blocksize = MIN(iosize, maxblk);
 471                 ASSERT(blocksize >= 0);
 472                 mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
 473                 if (mp == NULL) {
 474                         *errorp = ENOMEM;
 475                         return (head);
 476                 }
 477                 mp->b_rptr += wroff;
 478                 mp->b_wptr = mp->b_rptr + blocksize;
 479 
 480                 *tail = mp;
 481                 tail = &mp->b_cont;
 482 
 483                 /* uiomove(9F) either returns 0 or EFAULT */
 484                 if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
 485                     UIO_WRITE, uiop)) != 0) {
 486                         ASSERT(*errorp != ENOMEM);
 487                         freemsg(head);
 488                         return (NULL);
 489                 }
 490 
 491                 iosize -= blocksize;
 492         } while (iosize > 0);
 493 
 494 done:
 495         *errorp = 0;
 496         return (head);
 497 }
 498 
 499 mblk_t *
 500 socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
 501 {
 502         int error;
 503         ptrdiff_t n;
 504         mblk_t *nmp;
 505 
 506         ASSERT(mp->b_wptr >= mp->b_rptr);
 507 
 508         /*
 509          * max_read is the offset of the oobmark and read can not go pass
 510          * the oobmark.
 511          */
 512         if (max_read == INFPSZ || max_read > uiop->uio_resid)
 513                 max_read = uiop->uio_resid;
 514 
 515         do {
 516                 if ((n = MIN(max_read, MBLKL(mp))) != 0) {
 517                         ASSERT(n > 0);
 518 
 519                         error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
 520                         if (error != 0) {
 521                                 freemsg(mp);
 522                                 *errorp = error;
 523                                 return (NULL);
 524                         }
 525                 }
 526 
 527                 mp->b_rptr += n;
 528                 max_read -= n;
 529                 while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
 530                         /*
 531                          * get rid of zero length mblks
 532                          */
 533                         nmp = mp;
 534                         mp = mp->b_cont;
 535                         freeb(nmp);
 536                 }
 537         } while (mp != NULL && max_read > 0);
 538 
 539         *errorp = 0;
 540         return (mp);
 541 }
 542 
 543 static void
 544 so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
 545 {
 546         ASSERT(last_tail != NULL);
 547         mp->b_next = so->so_rcv_q_head;
 548         mp->b_prev = last_tail;
 549         ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));
 550 
 551         if (so->so_rcv_q_head == NULL) {
 552                 ASSERT(so->so_rcv_q_last_head == NULL);
 553                 so->so_rcv_q_last_head = mp;
 554 #ifdef DEBUG
 555         } else {
 556                 ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
 557 #endif
 558         }
 559         so->so_rcv_q_head = mp;
 560 
 561 #ifdef DEBUG
 562         if (so_debug_length) {
 563                 mutex_enter(&so->so_lock);
 564                 ASSERT(so_check_length(so));
 565                 mutex_exit(&so->so_lock);
 566         }
 567 #endif
 568 }
 569 
/*
 * Move a mblk chain (mp_head, mp_last_head) to the sonode's rcv queue so it
 * can be processed by so_dequeue_msg().
 *
 * Arguments:
 *   so           - socket whose so_rcv_q_head / so_rcv_q_last_head list
 *                  receives the chain
 *   mp_head      - first message of the incoming chain; further messages
 *                  follow through b_next, and b_prev must be non-NULL
 *   mp_last_head - head of the last message in the incoming chain
 *
 * When socket filters are active the chain is first handed to
 * sof_filter_data_in_proc(); if that returns NULL nothing is queued.
 */
void
so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
{
        if (so->so_filter_active > 0 &&
            (mp_head = sof_filter_data_in_proc(so, mp_head,
            &mp_last_head)) == NULL)
                return;

        ASSERT(mp_head->b_prev != NULL);
        if (so->so_rcv_q_head == NULL) {
                /* Queue is empty: the incoming chain becomes the queue. */
                so->so_rcv_q_head = mp_head;
                so->so_rcv_q_last_head = mp_last_head;
                ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
        } else {
                /*
                 * B_TRUE when the incoming head and the queue's last head
                 * agree on whether DBLK_UIOA is set.
                 */
                boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
                    (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));

                if (mp_head->b_next == NULL &&
                    DB_TYPE(mp_head) == M_DATA &&
                    DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
                        /*
                         * A single M_DATA message arriving behind an M_DATA
                         * message of the same kind is merged into it: its
                         * blocks are linked through b_cont after the block
                         * the last head's b_prev refers to (presumably that
                         * message's final block), and the last head's
                         * b_prev is advanced to the incoming b_prev.
                         */
                        so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
                        so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
                        mp_head->b_prev = NULL;
                } else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
                        /*
                         * Append to last_head if more than one mblks, and both
                         * mp_head and last_head are I/OAT mblks.
                         */
                        ASSERT(mp_head->b_next != NULL);
                        so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
                        so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
                        mp_head->b_prev = NULL;

                        /*
                         * The rest of the incoming chain follows the merged
                         * message, and its last head becomes the queue's.
                         */
                        so->so_rcv_q_last_head->b_next = mp_head->b_next;
                        mp_head->b_next = NULL;
                        so->so_rcv_q_last_head = mp_last_head;
                } else {
                        /*
                         * No merge possible: link the chain behind the
                         * current last head as separate messages.
                         */
#ifdef DEBUG
                        {
                                /* Every incoming message must have b_prev set. */
                                mblk_t *tmp_mblk;
                                tmp_mblk = mp_head;
                                while (tmp_mblk != NULL) {
                                        ASSERT(tmp_mblk->b_prev != NULL);
                                        tmp_mblk = tmp_mblk->b_next;
                                }
                        }
#endif
                        so->so_rcv_q_last_head->b_next = mp_head;
                        so->so_rcv_q_last_head = mp_last_head;
                }
        }
}
 626 
 627 /*
 628  * Check flow control on a given sonode.  Must have so_lock held, and
 629  * this function will release the hold.  Return true if flow control
 630  * is cleared.
 631  */
 632 boolean_t
 633 so_check_flow_control(struct sonode *so)
 634 {
 635         ASSERT(MUTEX_HELD(&so->so_lock));
 636 
 637         if (so->so_flowctrld && (so->so_rcv_queued < so->so_rcvlowat &&
 638             !(so->so_state & SS_FIL_RCV_FLOWCTRL))) {
 639                 so->so_flowctrld = B_FALSE;
 640                 mutex_exit(&so->so_lock);
 641                 /*
 642                  * Open up flow control. SCTP does not have any downcalls, and
 643                  * it will clr flow ctrl in sosctp_recvmsg().
 644                  */
 645                 if (so->so_downcalls != NULL &&
 646                     so->so_downcalls->sd_clr_flowctrl != NULL) {
 647                         (*so->so_downcalls->sd_clr_flowctrl)
 648                             (so->so_proto_handle);
 649                 }
 650                 /* filters can start injecting data */
 651                 sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0);
 652                 return (B_TRUE);
 653         } else {
 654                 mutex_exit(&so->so_lock);
 655                 return (B_FALSE);
 656         }
 657 }
 658 
 659 int
 660 so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
 661     rval_t *rvalp, int flags)
 662 {
 663         mblk_t  *mp, *nmp;
 664         mblk_t  *savemp, *savemptail;
 665         mblk_t  *new_msg_head;
 666         mblk_t  *new_msg_last_head;
 667         mblk_t  *last_tail;
 668         boolean_t partial_read;
 669         boolean_t reset_atmark = B_FALSE;
 670         int more = 0;
 671         int error;
 672         ssize_t oobmark;
 673         sodirect_t *sodp = so->so_direct;
 674 
 675         partial_read = B_FALSE;
 676         *mctlp = NULL;
 677 again:
 678         mutex_enter(&so->so_lock);
 679 again1:
 680 #ifdef DEBUG
 681         if (so_debug_length) {
 682                 ASSERT(so_check_length(so));
 683         }
 684 #endif
 685         if (so->so_state & SS_RCVATMARK) {
 686                 /* Check whether the caller is OK to read past the mark */
 687                 if (flags & MSG_NOMARK) {
 688                         mutex_exit(&so->so_lock);
 689                         return (EWOULDBLOCK);
 690                 }
 691                 reset_atmark = B_TRUE;
 692         }
 693         /*
 694          * First move messages from the dump area to processing area
 695          */
 696         if (sodp != NULL) {
 697                 if (sodp->sod_enabled) {
 698                         if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
 699                                 /* nothing to uioamove */
 700                                 sodp = NULL;
 701                         } else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
 702                                 sodp->sod_uioa.uioa_state &= UIOA_CLR;
 703                                 sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
 704                                 /*
 705                                  * try to uioamove() the data that
 706                                  * has already queued.
 707                                  */
 708                                 sod_uioa_so_init(so, sodp, uiop);
 709                         }
 710                 } else {
 711                         sodp = NULL;
 712                 }
 713         }
 714         new_msg_head = so->so_rcv_head;
 715         new_msg_last_head = so->so_rcv_last_head;
 716         so->so_rcv_head = NULL;
 717         so->so_rcv_last_head = NULL;
 718         oobmark = so->so_oobmark;
 719         /*
 720          * We can release the lock as there can only be one reader
 721          */
 722         mutex_exit(&so->so_lock);
 723 
 724         if (new_msg_head != NULL) {
 725                 so_process_new_message(so, new_msg_head, new_msg_last_head);
 726         }
 727         savemp = savemptail = NULL;
 728         rvalp->r_vals = 0;
 729         error = 0;
 730         mp = so->so_rcv_q_head;
 731 
 732         if (mp != NULL &&
 733             (so->so_rcv_timer_tid == 0 ||
 734             so->so_rcv_queued >= so->so_rcv_thresh)) {
 735                 partial_read = B_FALSE;
 736 
 737                 if (flags & MSG_PEEK) {
 738                         if ((nmp = dupmsg(mp)) == NULL &&
 739                             (nmp = copymsg(mp)) == NULL) {
 740                                 size_t size = msgsize(mp);
 741 
 742                                 error = strwaitbuf(size, BPRI_HI);
 743                                 if (error) {
 744                                         return (error);
 745                                 }
 746                                 goto again;
 747                         }
 748                         mp = nmp;
 749                 } else {
 750                         ASSERT(mp->b_prev != NULL);
 751                         last_tail = mp->b_prev;
 752                         mp->b_prev = NULL;
 753                         so->so_rcv_q_head = mp->b_next;
 754                         if (so->so_rcv_q_head == NULL) {
 755                                 so->so_rcv_q_last_head = NULL;
 756                         }
 757                         mp->b_next = NULL;
 758                 }
 759 
 760                 ASSERT(mctlp != NULL);
 761                 /*
 762                  * First process PROTO or PCPROTO blocks, if any.
 763                  */
 764                 if (DB_TYPE(mp) != M_DATA) {
 765                         *mctlp = mp;
 766                         savemp = mp;
 767                         savemptail = mp;
 768                         ASSERT(DB_TYPE(mp) == M_PROTO ||
 769                             DB_TYPE(mp) == M_PCPROTO);
 770                         while (mp->b_cont != NULL &&
 771                             DB_TYPE(mp->b_cont) != M_DATA) {
 772                                 ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
 773                                     DB_TYPE(mp->b_cont) == M_PCPROTO);
 774                                 mp = mp->b_cont;
 775                                 savemptail = mp;
 776                         }
 777                         mp = savemptail->b_cont;
 778                         savemptail->b_cont = NULL;
 779                 }
 780 
 781                 ASSERT(DB_TYPE(mp) == M_DATA);
 782                 /*
 783                  * Now process DATA blocks, if any. Note that for sodirect
 784                  * enabled socket, uio_resid can be 0.
 785                  */
 786                 if (uiop->uio_resid >= 0) {
 787                         ssize_t copied = 0;
 788 
 789                         if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
 790                                 mutex_enter(&so->so_lock);
 791                                 ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
 792                                 copied = sod_uioa_mblk(so, mp);
 793                                 if (copied > 0)
 794                                         partial_read = B_TRUE;
 795                                 mutex_exit(&so->so_lock);
 796                                 /* mark this mblk as processed */
 797                                 mp = NULL;
 798                         } else {
 799                                 ssize_t oldresid = uiop->uio_resid;
 800 
 801                                 if (MBLKL(mp) < so_mblk_pull_len) {
 802                                         if (pullupmsg(mp, -1) == 1) {
 803                                                 last_tail = mp;
 804                                         }
 805                                 }
 806                                 /*
 807                                  * Can not read beyond the oobmark
 808                                  */
 809                                 mp = socopyoutuio(mp, uiop,
 810                                     oobmark == 0 ? INFPSZ : oobmark, &error);
 811                                 if (error != 0) {
 812                                         freemsg(*mctlp);
 813                                         *mctlp = NULL;
 814                                         more = 0;
 815                                         goto done;
 816                                 }
 817                                 ASSERT(oldresid >= uiop->uio_resid);
 818                                 copied = oldresid - uiop->uio_resid;
 819                                 if (oldresid > uiop->uio_resid)
 820                                         partial_read = B_TRUE;
 821                         }
 822                         ASSERT(copied >= 0);
 823                         if (copied > 0 && !(flags & MSG_PEEK)) {
 824                                 mutex_enter(&so->so_lock);
 825                                 so->so_rcv_queued -= copied;
 826                                 ASSERT(so->so_oobmark >= 0);
 827                                 if (so->so_oobmark > 0) {
 828                                         so->so_oobmark -= copied;
 829                                         ASSERT(so->so_oobmark >= 0);
 830                                         if (so->so_oobmark == 0) {
 831                                                 ASSERT(so->so_state &
 832                                                     SS_OOBPEND);
 833                                                 so->so_oobmark = 0;
 834                                                 so->so_state |= SS_RCVATMARK;
 835                                         }
 836                                 }
 837                                 /*
 838                                  * so_check_flow_control() will drop
 839                                  * so->so_lock.
 840                                  */
 841                                 rvalp->r_val2 = so_check_flow_control(so);
 842                         }
 843                 }
 844                 if (mp != NULL) { /* more data blocks in msg */
 845                         more |= MOREDATA;
 846                         if ((flags & (MSG_PEEK|MSG_TRUNC))) {
 847                                 if (flags & MSG_PEEK) {
 848                                         freemsg(mp);
 849                                 } else {
 850                                         unsigned int msize = msgdsize(mp);
 851 
 852                                         freemsg(mp);
 853                                         mutex_enter(&so->so_lock);
 854                                         so->so_rcv_queued -= msize;
 855                                         /*
 856                                          * so_check_flow_control() will drop
 857                                          * so->so_lock.
 858                                          */
 859                                         rvalp->r_val2 =
 860                                             so_check_flow_control(so);
 861                                 }
 862                         } else if (partial_read && !somsghasdata(mp)) {
 863                                 /*
 864                                  * Avoid queuing a zero-length tail part of
 865                                  * a message. partial_read == 1 indicates that
 866                                  * we read some of the message.
 867                                  */
 868                                 freemsg(mp);
 869                                 more &= ~MOREDATA;
 870                         } else {
 871                                 if (savemp != NULL &&
 872                                     (flags & MSG_DUPCTRL)) {
 873                                         mblk_t *nmp;
 874                                         /*
 875                                          * There should only be non data mblks
 876                                          */
 877                                         ASSERT(DB_TYPE(savemp) != M_DATA &&
 878                                             DB_TYPE(savemptail) != M_DATA);
 879 try_again:
 880                                         if ((nmp = dupmsg(savemp)) == NULL &&
 881                                             (nmp = copymsg(savemp)) == NULL) {
 882 
 883                                                 size_t size = msgsize(savemp);
 884 
 885                                                 error = strwaitbuf(size,
 886                                                     BPRI_HI);
 887                                                 if (error != 0) {
 888                                                         /*
 889                                                          * In case we
 890                                                          * cannot copy
 891                                                          * control data
 892                                                          * free the remaining
 893                                                          * data.
 894                                                          */
 895                                                         freemsg(mp);
 896                                                         goto done;
 897                                                 }
 898                                                 goto try_again;
 899                                         }
 900 
 901                                         ASSERT(nmp != NULL);
 902                                         ASSERT(DB_TYPE(nmp) != M_DATA);
 903                                         savemptail->b_cont = mp;
 904                                         *mctlp = nmp;
 905                                         mp = savemp;
 906                                 }
 907                                 /*
 908                                  * putback mp
 909                                  */
 910                                 so_prepend_msg(so, mp, last_tail);
 911                         }
 912                 }
 913 
 914                 /* fast check so_rcv_head if there is more data */
 915                 if (partial_read && !(so->so_state & SS_RCVATMARK) &&
 916                     *mctlp == NULL && uiop->uio_resid > 0 &&
 917                     !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
 918                         goto again;
 919                 }
 920         } else if (!partial_read) {
 921                 mutex_enter(&so->so_lock);
 922                 if (so->so_error != 0) {
 923                         error = sogeterr(so, !(flags & MSG_PEEK));
 924                         mutex_exit(&so->so_lock);
 925                         return (error);
 926                 }
 927                 /*
 928                  * No pending data. Return right away for nonblocking
 929                  * socket, otherwise sleep waiting for data.
 930                  */
 931                 if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
 932                         if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
 933                             (flags & MSG_DONTWAIT)) {
 934                                 error = EWOULDBLOCK;
 935                         } else {
 936                                 if (so->so_state & (SS_CLOSING |
 937                                     SS_FALLBACK_PENDING)) {
 938                                         mutex_exit(&so->so_lock);
 939                                         error = EINTR;
 940                                         goto done;
 941                                 }
 942 
 943                                 if (so->so_rcv_head != NULL) {
 944                                         goto again1;
 945                                 }
 946                                 so->so_rcv_wakeup = B_TRUE;
 947                                 so->so_rcv_wanted = uiop->uio_resid;
 948                                 if (so->so_rcvtimeo == 0) {
 949                                         /*
 950                                          * Zero means disable timeout.
 951                                          */
 952                                         error = cv_wait_sig(&so->so_rcv_cv,
 953                                             &so->so_lock);
 954                                 } else {
 955                                         error = cv_reltimedwait_sig(
 956                                             &so->so_rcv_cv, &so->so_lock,
 957                                             so->so_rcvtimeo, TR_CLOCK_TICK);
 958                                 }
 959                                 so->so_rcv_wakeup = B_FALSE;
 960                                 so->so_rcv_wanted = 0;
 961 
 962                                 if (error == 0) {
 963                                         error = EINTR;
 964                                 } else if (error == -1) {
 965                                         error = EAGAIN;
 966                                 } else {
 967                                         goto again1;
 968                                 }
 969                         }
 970                 }
 971                 mutex_exit(&so->so_lock);
 972         }
 973         if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
 974                 /*
 975                  * We are passed the mark, update state
 976                  * 4.3BSD and 4.4BSD clears the mark when peeking across it.
 977                  * The draft Posix socket spec states that the mark should
 978                  * not be cleared when peeking. We follow the latter.
 979                  */
 980                 mutex_enter(&so->so_lock);
 981                 ASSERT(so_verify_oobstate(so));
 982                 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
 983                 freemsg(so->so_oobmsg);
 984                 so->so_oobmsg = NULL;
 985                 ASSERT(so_verify_oobstate(so));
 986                 mutex_exit(&so->so_lock);
 987         }
 988         ASSERT(so->so_rcv_wakeup == B_FALSE);
 989 done:
 990         if (sodp != NULL) {
 991                 mutex_enter(&so->so_lock);
 992                 if (sodp->sod_enabled &&
 993                     (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
 994                         SOD_UIOAFINI(sodp);
 995                         if (sodp->sod_uioa.uioa_mbytes > 0) {
 996                                 ASSERT(so->so_rcv_q_head != NULL ||
 997                                     so->so_rcv_head != NULL);
 998                                 so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
 999                                 if (error == EWOULDBLOCK)
1000                                         error = 0;
1001                         }
1002                 }
1003                 mutex_exit(&so->so_lock);
1004         }
1005 #ifdef DEBUG
1006         if (so_debug_length) {
1007                 mutex_enter(&so->so_lock);
1008                 ASSERT(so_check_length(so));
1009                 mutex_exit(&so->so_lock);
1010         }
1011 #endif
1012         rvalp->r_val1 = more;
1013         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1014         return (error);
1015 }
1016 
1017 /*
1018  * Enqueue data from the protocol on the socket's rcv queue.
1019  *
1020  * We try to hook new M_DATA mblks onto an existing chain, however,
1021  * that cannot be done if the existing chain has already been
1022  * processed by I/OAT. Non-M_DATA mblks are just linked together via
1023  * b_next. In all cases the b_prev of the enqueued mblk is set to
1024  * point to the last mblk in its b_cont chain.
1025  */
1026 void
1027 so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
1028 {
1029         ASSERT(MUTEX_HELD(&so->so_lock));
1030 
1031 #ifdef DEBUG
1032         if (so_debug_length) {
1033                 ASSERT(so_check_length(so));
1034         }
1035 #endif
1036         so->so_rcv_queued += msg_size;
1037 
1038         if (so->so_rcv_head == NULL) {
1039                 ASSERT(so->so_rcv_last_head == NULL);
1040                 so->so_rcv_head = mp;
1041                 so->so_rcv_last_head = mp;
1042         } else if ((DB_TYPE(mp) == M_DATA &&
1043             DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
1044             ((DB_FLAGS(mp) & DBLK_UIOA) ==
1045             (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
1046                 /* Added to the end */
1047                 ASSERT(so->so_rcv_last_head != NULL);
1048                 ASSERT(so->so_rcv_last_head->b_prev != NULL);
1049                 so->so_rcv_last_head->b_prev->b_cont = mp;
1050         } else {
1051                 /* Start a new end */
1052                 so->so_rcv_last_head->b_next = mp;
1053                 so->so_rcv_last_head = mp;
1054         }
1055         while (mp->b_cont != NULL)
1056                 mp = mp->b_cont;
1057 
1058         so->so_rcv_last_head->b_prev = mp;
1059 #ifdef DEBUG
1060         if (so_debug_length) {
1061                 ASSERT(so_check_length(so));
1062         }
1063 #endif
1064 }
1065 
1066 /*
1067  * Return B_TRUE if there is data in the message, B_FALSE otherwise.
1068  */
1069 boolean_t
1070 somsghasdata(mblk_t *mp)
1071 {
1072         for (; mp; mp = mp->b_cont)
1073                 if (mp->b_datap->db_type == M_DATA) {
1074                         ASSERT(mp->b_wptr >= mp->b_rptr);
1075                         if (mp->b_wptr > mp->b_rptr)
1076                                 return (B_TRUE);
1077                 }
1078         return (B_FALSE);
1079 }
1080 
1081 /*
1082  * Flush the read side of sockfs.
1083  *
1084  * The caller must be sure that a reader is not already active when the
1085  * buffer is being flushed.
1086  */
1087 void
1088 so_rcv_flush(struct sonode *so)
1089 {
1090         mblk_t  *mp;
1091 
1092         ASSERT(MUTEX_HELD(&so->so_lock));
1093 
1094         if (so->so_oobmsg != NULL) {
1095                 freemsg(so->so_oobmsg);
1096                 so->so_oobmsg = NULL;
1097                 so->so_oobmark = 0;
1098                 so->so_state &=
1099                     ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
1100         }
1101 
1102         /*
1103          * Free messages sitting in the recv queues
1104          */
1105         while (so->so_rcv_q_head != NULL) {
1106                 mp = so->so_rcv_q_head;
1107                 so->so_rcv_q_head = mp->b_next;
1108                 mp->b_next = mp->b_prev = NULL;
1109                 freemsg(mp);
1110         }
1111         while (so->so_rcv_head != NULL) {
1112                 mp = so->so_rcv_head;
1113                 so->so_rcv_head = mp->b_next;
1114                 mp->b_next = mp->b_prev = NULL;
1115                 freemsg(mp);
1116         }
1117         so->so_rcv_queued = 0;
1118         so->so_rcv_q_head = NULL;
1119         so->so_rcv_q_last_head = NULL;
1120         so->so_rcv_head = NULL;
1121         so->so_rcv_last_head = NULL;
1122 }
1123 
1124 /*
1125  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
1126  */
1127 int
1128 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
1129     boolean_t oob_inline)
1130 {
1131         mblk_t          *mp, *nmp;
1132         int             error;
1133 
1134         dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
1135             flags));
1136 
1137         if (msg != NULL) {
1138                 /*
1139                  * There is never any oob data with addresses or control since
1140                  * the T_EXDATA_IND does not carry any options.
1141                  */
1142                 msg->msg_controllen = 0;
1143                 msg->msg_namelen = 0;
1144                 msg->msg_flags = 0;
1145         }
1146 
1147         mutex_enter(&so->so_lock);
1148         ASSERT(so_verify_oobstate(so));
1149         if (oob_inline ||
1150             (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
1151                 dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
1152                 mutex_exit(&so->so_lock);
1153                 return (EINVAL);
1154         }
1155         if (!(so->so_state & SS_HAVEOOBDATA)) {
1156                 dprintso(so, 1, ("sorecvoob: no data yet\n"));
1157                 mutex_exit(&so->so_lock);
1158                 return (EWOULDBLOCK);
1159         }
1160         ASSERT(so->so_oobmsg != NULL);
1161         mp = so->so_oobmsg;
1162         if (flags & MSG_PEEK) {
1163                 /*
1164                  * Since recv* can not return ENOBUFS we can not use dupmsg.
1165                  * Instead we revert to the consolidation private
1166                  * allocb_wait plus bcopy.
1167                  */
1168                 mblk_t *mp1;
1169 
1170                 mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
1171                 ASSERT(mp1);
1172 
1173                 while (mp != NULL) {
1174                         ssize_t size;
1175 
1176                         size = MBLKL(mp);
1177                         bcopy(mp->b_rptr, mp1->b_wptr, size);
1178                         mp1->b_wptr += size;
1179                         ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
1180                         mp = mp->b_cont;
1181                 }
1182                 mp = mp1;
1183         } else {
1184                 /*
1185                  * Update the state indicating that the data has been consumed.
1186                  * Keep SS_OOBPEND set until data is consumed past the mark.
1187                  */
1188                 so->so_oobmsg = NULL;
1189                 so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
1190         }
1191         ASSERT(so_verify_oobstate(so));
1192         mutex_exit(&so->so_lock);
1193 
1194         error = 0;
1195         nmp = mp;
1196         while (nmp != NULL && uiop->uio_resid > 0) {
1197                 ssize_t n = MBLKL(nmp);
1198 
1199                 n = MIN(n, uiop->uio_resid);
1200                 if (n > 0)
1201                         error = uiomove(nmp->b_rptr, n,
1202                             UIO_READ, uiop);
1203                 if (error)
1204                         break;
1205                 nmp = nmp->b_cont;
1206         }
1207         ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
1208         freemsg(mp);
1209         return (error);
1210 }
1211 
1212 /*
1213  * Allocate and initializ sonode
1214  */
1215 /* ARGSUSED */
1216 struct sonode *
1217 socket_sonode_create(struct sockparams *sp, int family, int type,
1218     int protocol, int version, int sflags, int *errorp, struct cred *cr)
1219 {
1220         sonode_t *so;
1221         int     kmflags;
1222 
1223         /*
1224          * Choose the right set of sonodeops based on the upcall and
1225          * down call version that the protocol has provided
1226          */
1227         if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
1228             SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
1229                 /*
1230                  * mismatch
1231                  */
1232 #ifdef DEBUG
1233                 cmn_err(CE_CONT, "protocol and socket module version mismatch");
1234 #endif
1235                 *errorp = EINVAL;
1236                 return (NULL);
1237         }
1238 
1239         kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1240 
1241         so = kmem_cache_alloc(socket_cache, kmflags);
1242         if (so == NULL) {
1243                 *errorp = ENOMEM;
1244                 return (NULL);
1245         }
1246 
1247         sonode_init(so, sp, family, type, protocol, &so_sonodeops);
1248 
1249         if (version == SOV_DEFAULT)
1250                 version = so_default_version;
1251 
1252         so->so_version = (short)version;
1253 
1254         /*
1255          * set the default values to be INFPSZ
1256          * if a protocol desires it can change the value later
1257          */
1258         so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
1259         so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
1260         so->so_proto_props.sopp_maxpsz = INFPSZ;
1261         so->so_proto_props.sopp_maxblk = INFPSZ;
1262 
1263         return (so);
1264 }
1265 
1266 int
1267 socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
1268 {
1269         int error = 0;
1270 
1271         if (pso != NULL) {
1272                 /*
1273                  * We have a passive open, so inherit basic state from
1274                  * the parent (listener).
1275                  *
1276                  * No need to grab the new sonode's lock, since there is no
1277                  * one that can have a reference to it.
1278                  */
1279                 mutex_enter(&pso->so_lock);
1280 
1281                 so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
1282                 so->so_pgrp = pso->so_pgrp;
1283                 so->so_rcvtimeo = pso->so_rcvtimeo;
1284                 so->so_sndtimeo = pso->so_sndtimeo;
1285                 so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
1286                 /*
1287                  * Make note of the socket level options. TCP and IP level
1288                  * options are already inherited. We could do all this after
1289                  * accept is successful but doing it here simplifies code and
1290                  * no harm done for error case.
1291                  */
1292                 so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
1293                     SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1294                     SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1295                 so->so_proto_props = pso->so_proto_props;
1296                 so->so_mode = pso->so_mode;
1297                 so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;
1298 
1299                 mutex_exit(&pso->so_lock);
1300 
1301                 /*
1302                  * If the parent has any filters, try to inherit them.
1303                  */
1304                 if (pso->so_filter_active > 0 &&
1305                     (error = sof_sonode_inherit_filters(so, pso)) != 0)
1306                         return (error);
1307 
1308         } else {
1309                 struct sockparams *sp = so->so_sockparams;
1310                 sock_upcalls_t *upcalls_to_use;
1311 
1312                 /*
1313                  * Attach automatic filters, if there are any.
1314                  */
1315                 if (!list_is_empty(&sp->sp_auto_filters) &&
1316                     (error = sof_sonode_autoattach_filters(so, cr)) != 0)
1317                         return (error);
1318 
1319                 /* OK to attach filters */
1320                 so->so_state |= SS_FILOP_OK;
1321 
1322                 /*
1323                  * Based on the version number select the right upcalls to
1324                  * pass down. Currently we only have one version so choose
1325                  * default
1326                  */
1327                 upcalls_to_use = &so_upcalls;
1328 
1329                 /* active open, so create a lower handle */
1330                 so->so_proto_handle =
1331                     sp->sp_smod_info->smod_proto_create_func(so->so_family,
1332                     so->so_type, so->so_protocol, &so->so_downcalls,
1333                     &so->so_mode, &error, flags, cr);
1334 
1335                 if (so->so_proto_handle == NULL) {
1336                         ASSERT(error != 0);
1337                         /*
1338                          * To be safe; if a lower handle cannot be created, and
1339                          * the proto does not give a reason why, assume there
1340                          * was a lack of memory.
1341                          */
1342                         return ((error == 0) ? ENOMEM : error);
1343                 }
1344                 ASSERT(so->so_downcalls != NULL);
1345                 ASSERT(so->so_downcalls->sd_send != NULL ||
1346                     so->so_downcalls->sd_send_uio != NULL);
1347                 if (so->so_downcalls->sd_recv_uio != NULL) {
1348                         ASSERT(so->so_downcalls->sd_poll != NULL);
1349                         so->so_pollev |= SO_POLLEV_ALWAYS;
1350                 }
1351 
1352                 (*so->so_downcalls->sd_activate)(so->so_proto_handle,
1353                     (sock_upper_handle_t)so, upcalls_to_use, 0, cr);
1354 
1355                 /* Wildcard */
1356 
1357                 /*
1358                  * FIXME No need for this, the protocol can deal with it in
1359                  * sd_create(). Should update ICMP.
1360                  */
1361                 if (so->so_protocol != so->so_sockparams->sp_protocol) {
1362                         int protocol = so->so_protocol;
1363                         int error;
1364                         /*
1365                          * Issue SO_PROTOTYPE setsockopt.
1366                          */
1367                         error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
1368                             &protocol, (t_uscalar_t)sizeof (protocol), cr);
1369                         if (error) {
1370                                 (void) (*so->so_downcalls->sd_close)
1371                                     (so->so_proto_handle, 0, cr);
1372 
1373                                 mutex_enter(&so->so_lock);
1374                                 so_rcv_flush(so);
1375                                 mutex_exit(&so->so_lock);
1376                                 /*
1377                                  * Setsockopt often fails with ENOPROTOOPT but
1378                                  * socket() should fail with
1379                                  * EPROTONOSUPPORT/EPROTOTYPE.
1380                                  */
1381                                 return (EPROTONOSUPPORT);
1382                         }
1383                 }
1384         }
1385 
1386         if (uioasync.enabled)
1387                 sod_sock_init(so);
1388 
1389         /* put an extra reference on the socket for the protocol */
1390         VN_HOLD(SOTOV(so));
1391 
1392         return (0);
1393 }
1394 
1395 /*
1396  * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1397  *         struct cred *cr, int32_t *rvalp)
1398  *
1399  * Handle ioctls that manipulate basic socket state; non-blocking,
1400  * async, etc.
1401  *
1402  * Returns:
1403  *   < 0  - ioctl was not handle
1404  *  >= 0  - ioctl was handled, if > 0, then it is an errno
1405  *
1406  * Notes:
1407  *   Assumes the standard receive buffer is used to obtain info for
1408  *   NREAD.
1409  */
1410 /* ARGSUSED */
1411 int
1412 socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1413     struct cred *cr, int32_t *rvalp)
1414 {
1415         switch (cmd) {
1416         case SIOCSQPTR:
1417                 /*
1418                  * SIOCSQPTR is valid only when helper stream is created
1419                  * by the protocol.
1420                  */
1421 
1422                 return (EOPNOTSUPP);
1423         case FIONBIO: {
1424                 int32_t value;
1425 
1426                 if (so_copyin((void *)arg, &value, sizeof (int32_t),
1427                     (mode & (int)FKIOCTL)))
1428                         return (EFAULT);
1429 
1430                 mutex_enter(&so->so_lock);
1431                 if (value) {
1432                         so->so_state |= SS_NDELAY;
1433                 } else {
1434                         so->so_state &= ~SS_NDELAY;
1435                 }
1436                 mutex_exit(&so->so_lock);
1437                 return (0);
1438         }
1439         case FIOASYNC: {
1440                 int32_t value;
1441 
1442                 if (so_copyin((void *)arg, &value, sizeof (int32_t),
1443                     (mode & (int)FKIOCTL)))
1444                         return (EFAULT);
1445 
1446                 mutex_enter(&so->so_lock);
1447 
1448                 if (value) {
1449                         /* Turn on SIGIO */
1450                         so->so_state |= SS_ASYNC;
1451                 } else {
1452                         /* Turn off SIGIO */
1453                         so->so_state &= ~SS_ASYNC;
1454                 }
1455                 mutex_exit(&so->so_lock);
1456 
1457                 return (0);
1458         }
1459 
1460         case SIOCSPGRP:
1461         case FIOSETOWN: {
1462                 int error;
1463                 pid_t pid;
1464 
1465                 if (so_copyin((void *)arg, &pid, sizeof (pid_t),
1466                     (mode & (int)FKIOCTL)))
1467                         return (EFAULT);
1468 
1469                 mutex_enter(&so->so_lock);
1470                 error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
1471                 mutex_exit(&so->so_lock);
1472                 return (error);
1473         }
1474         case SIOCGPGRP:
1475         case FIOGETOWN:
1476                 if (so_copyout(&so->so_pgrp, (void *)arg,
1477                     sizeof (pid_t), (mode & (int)FKIOCTL)))
1478                         return (EFAULT);
1479 
1480                 return (0);
1481         case SIOCATMARK: {
1482                 int retval;
1483 
1484                 /*
1485                  * Only protocols that support urgent data can handle ATMARK.
1486                  */
1487                 if ((so->so_mode & SM_EXDATA) == 0)
1488                         return (EINVAL);
1489 
1490                 /*
1491                  * If the protocol is maintaining its own buffer, then the
1492                  * request must be passed down.
1493                  */
1494                 if (so->so_downcalls->sd_recv_uio != NULL)
1495                         return (-1);
1496 
1497                 retval = (so->so_state & SS_RCVATMARK) != 0;
1498 
1499                 if (so_copyout(&retval, (void *)arg, sizeof (int),
1500                     (mode & (int)FKIOCTL))) {
1501                         return (EFAULT);
1502                 }
1503                 return (0);
1504         }
1505 
1506         case FIONREAD: {
1507                 int retval;
1508 
1509                 /*
1510                  * If the protocol is maintaining its own buffer, then the
1511                  * request must be passed down.
1512                  */
1513                 if (so->so_downcalls->sd_recv_uio != NULL)
1514                         return (-1);
1515 
1516                 retval = MIN(so->so_rcv_queued, INT_MAX);
1517 
1518                 if (so_copyout(&retval, (void *)arg,
1519                     sizeof (retval), (mode & (int)FKIOCTL))) {
1520                         return (EFAULT);
1521                 }
1522                 return (0);
1523         }
1524 
1525         case _I_GETPEERCRED: {
1526                 int error = 0;
1527 
1528                 if ((mode & FKIOCTL) == 0)
1529                         return (EINVAL);
1530 
1531                 mutex_enter(&so->so_lock);
1532                 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
1533                         error = ENOTSUP;
1534                 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
1535                         error = ENOTCONN;
1536                 } else if (so->so_peercred != NULL) {
1537                         k_peercred_t *kp = (k_peercred_t *)arg;
1538                         kp->pc_cr = so->so_peercred;
1539                         kp->pc_cpid = so->so_cpid;
1540                         crhold(so->so_peercred);
1541                 } else {
1542                         error = EINVAL;
1543                 }
1544                 mutex_exit(&so->so_lock);
1545                 return (error);
1546         }
1547         default:
1548                 return (-1);
1549         }
1550 }
1551 
1552 /*
1553  * Handle the I_NREAD STREAM ioctl.
1554  */
1555 static int
1556 so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
1557 {
1558         size_t size = 0;
1559         int retval;
1560         int count = 0;
1561         mblk_t *mp;
1562         clock_t wakeup = drv_usectohz(10);
1563 
1564         if (so->so_downcalls == NULL ||
1565             so->so_downcalls->sd_recv_uio != NULL)
1566                 return (EINVAL);
1567 
1568         mutex_enter(&so->so_lock);
1569         /* Wait for reader to get out of the way. */
1570         while (so->so_flag & SOREADLOCKED) {
1571                 /*
1572                  * If reader is waiting for data, then there should be nothing
1573                  * on the rcv queue.
1574                  */
1575                 if (so->so_rcv_wakeup)
1576                         goto out;
1577 
1578                 /* Do a timed sleep, in case the reader goes to sleep. */
1579                 (void) cv_reltimedwait(&so->so_read_cv, &so->so_lock, wakeup,
1580                     TR_CLOCK_TICK);
1581         }
1582 
1583         /*
1584          * Since we are holding so_lock no new reader will come in, and the
1585          * protocol will not be able to enqueue data. So it's safe to walk
1586          * both rcv queues.
1587          */
1588         mp = so->so_rcv_q_head;
1589         if (mp != NULL) {
1590                 size = msgdsize(so->so_rcv_q_head);
1591                 for (; mp != NULL; mp = mp->b_next)
1592                         count++;
1593         } else {
1594                 /*
1595                  * In case the processing list was empty, get the size of the
1596                  * next msg in line.
1597                  */
1598                 size = msgdsize(so->so_rcv_head);
1599         }
1600 
1601         for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
1602                 count++;
1603 out:
1604         mutex_exit(&so->so_lock);
1605 
1606         /*
1607          * Drop down from size_t to the "int" required by the
1608          * interface.  Cap at INT_MAX.
1609          */
1610         retval = MIN(size, INT_MAX);
1611         if (so_copyout(&retval, (void *)arg, sizeof (retval),
1612             (mode & (int)FKIOCTL))) {
1613                 return (EFAULT);
1614         } else {
1615                 *rvalp = count;
1616                 return (0);
1617         }
1618 }
1619 
1620 /*
1621  * Process STREAM ioctls.
1622  *
1623  * Returns:
1624  *   < 0  - ioctl was not handle
1625  *  >= 0  - ioctl was handled, if > 0, then it is an errno
1626  */
1627 int
1628 socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1629     struct cred *cr, int32_t *rvalp)
1630 {
1631         int retval;
1632 
1633         /* Only STREAM iotcls are handled here */
1634         if ((cmd & 0xffffff00U) != STR)
1635                 return (-1);
1636 
1637         switch (cmd) {
1638         case I_CANPUT:
1639                 /*
1640                  * We return an error for I_CANPUT so that isastream(3C) will
1641                  * not report the socket as being a STREAM.
1642                  */
1643                 return (EOPNOTSUPP);
1644         case I_NREAD:
1645                 /* Avoid doing a fallback for I_NREAD. */
1646                 return (so_strioc_nread(so, arg, mode, rvalp));
1647         case I_LOOK:
1648                 /* Avoid doing a fallback for I_LOOK. */
1649                 if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
1650                     (mode & (int)FKIOCTL))) {
1651                         return (EFAULT);
1652                 }
1653                 return (0);
1654         default:
1655                 break;
1656         }
1657 
1658         /*
1659          * Try to fall back to TPI, and if successful, reissue the ioctl.
1660          */
1661         if ((retval = so_tpi_fallback(so, cr)) == 0) {
1662                 /* Reissue the ioctl */
1663                 ASSERT(so->so_rcv_q_head == NULL);
1664                 return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
1665         } else {
1666                 return (retval);
1667         }
1668 }
1669 
1670 /*
1671  * This is called for all socket types to verify that the buffer size is large
1672  * enough for the option, and if we can, handle the request as well. Most
1673  * options will be forwarded to the protocol.
1674  */
1675 int
1676 socket_getopt_common(struct sonode *so, int level, int option_name,
1677     void *optval, socklen_t *optlenp, int flags)
1678 {
1679         if (level != SOL_SOCKET)
1680                 return (-1);
1681 
1682         switch (option_name) {
1683         case SO_ERROR:
1684         case SO_DOMAIN:
1685         case SO_TYPE:
1686         case SO_ACCEPTCONN: {
1687                 int32_t value;
1688                 socklen_t optlen = *optlenp;
1689 
1690                 if (optlen < (t_uscalar_t)sizeof (int32_t)) {
1691                         return (EINVAL);
1692                 }
1693 
1694                 switch (option_name) {
1695                 case SO_ERROR:
1696                         mutex_enter(&so->so_lock);
1697                         value = sogeterr(so, B_TRUE);
1698                         mutex_exit(&so->so_lock);
1699                         break;
1700                 case SO_DOMAIN:
1701                         value = so->so_family;
1702                         break;
1703                 case SO_TYPE:
1704                         value = so->so_type;
1705                         break;
1706                 case SO_ACCEPTCONN:
1707                         if (so->so_state & SS_ACCEPTCONN)
1708                                 value = SO_ACCEPTCONN;
1709                         else
1710                                 value = 0;
1711                         break;
1712                 }
1713 
1714                 bcopy(&value, optval, sizeof (value));
1715                 *optlenp = sizeof (value);
1716 
1717                 return (0);
1718         }
1719         case SO_SNDTIMEO:
1720         case SO_RCVTIMEO: {
1721                 clock_t value;
1722                 socklen_t optlen = *optlenp;
1723 
1724                 if (get_udatamodel() == DATAMODEL_NONE ||
1725                     get_udatamodel() == DATAMODEL_NATIVE) {
1726                         if (optlen < sizeof (struct timeval))
1727                                 return (EINVAL);
1728                 } else {
1729                         if (optlen < sizeof (struct timeval32))
1730                                 return (EINVAL);
1731                 }
1732                 if (option_name == SO_RCVTIMEO)
1733                         value = drv_hztousec(so->so_rcvtimeo);
1734                 else
1735                         value = drv_hztousec(so->so_sndtimeo);
1736 
1737                 if (get_udatamodel() == DATAMODEL_NONE ||
1738                     get_udatamodel() == DATAMODEL_NATIVE) {
1739                         ((struct timeval *)(optval))->tv_sec =
1740                             value / (1000 * 1000);
1741                         ((struct timeval *)(optval))->tv_usec =
1742                             value % (1000 * 1000);
1743                         *optlenp = sizeof (struct timeval);
1744                 } else {
1745                         ((struct timeval32 *)(optval))->tv_sec =
1746                             value / (1000 * 1000);
1747                         ((struct timeval32 *)(optval))->tv_usec =
1748                             value % (1000 * 1000);
1749                         *optlenp = sizeof (struct timeval32);
1750                 }
1751                 return (0);
1752         }
1753         case SO_DEBUG:
1754         case SO_REUSEADDR:
1755         case SO_KEEPALIVE:
1756         case SO_DONTROUTE:
1757         case SO_BROADCAST:
1758         case SO_USELOOPBACK:
1759         case SO_OOBINLINE:
1760         case SO_SNDBUF:
1761 #ifdef notyet
1762         case SO_SNDLOWAT:
1763         case SO_RCVLOWAT:
1764 #endif /* notyet */
1765         case SO_DGRAM_ERRIND: {
1766                 socklen_t optlen = *optlenp;
1767 
1768                 if (optlen < (t_uscalar_t)sizeof (int32_t))
1769                         return (EINVAL);
1770                 break;
1771         }
1772         case SO_RCVBUF: {
1773                 socklen_t optlen = *optlenp;
1774 
1775                 if (optlen < (t_uscalar_t)sizeof (int32_t))
1776                         return (EINVAL);
1777 
1778                 if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
1779                         /*
1780                          * XXX If SO_RCVBUF has been set and this is an
1781                          * XPG 4.2 application then do not ask the transport
1782                          * since the transport might adjust the value and not
1783                          * return exactly what was set by the application.
1784                          * For non-XPG 4.2 application we return the value
1785                          * that the transport is actually using.
1786                          */
1787                         *(int32_t *)optval = so->so_xpg_rcvbuf;
1788                         *optlenp = sizeof (so->so_xpg_rcvbuf);
1789                         return (0);
1790                 }
1791                 /*
1792                  * If the option has not been set then get a default
1793                  * value from the transport.
1794                  */
1795                 break;
1796         }
1797         case SO_LINGER: {
1798                 socklen_t optlen = *optlenp;
1799 
1800                 if (optlen < (t_uscalar_t)sizeof (struct linger))
1801                         return (EINVAL);
1802                 break;
1803         }
1804         case SO_SND_BUFINFO: {
1805                 socklen_t optlen = *optlenp;
1806 
1807                 if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
1808                         return (EINVAL);
1809                 ((struct so_snd_bufinfo *)(optval))->sbi_wroff =
1810                     (so->so_proto_props).sopp_wroff;
1811                 ((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
1812                     (so->so_proto_props).sopp_maxblk;
1813                 ((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
1814                     (so->so_proto_props).sopp_maxpsz;
1815                 ((struct so_snd_bufinfo *)(optval))->sbi_tail =
1816                     (so->so_proto_props).sopp_tail;
1817                 *optlenp = sizeof (struct so_snd_bufinfo);
1818                 return (0);
1819         }
1820         case SO_SND_COPYAVOID: {
1821                 sof_instance_t *inst;
1822 
1823                 /*
1824                  * Avoid zero-copy if there is a filter with a data_out
1825                  * callback. We could let the operation succeed, but then
1826                  * the filter would have to copy the data anyway.
1827                  */
1828                 for (inst = so->so_filter_top; inst != NULL;
1829                     inst = inst->sofi_next) {
1830                         if (SOF_INTERESTED(inst, data_out))
1831                                 return (EOPNOTSUPP);
1832                 }
1833                 break;
1834         }
1835 
1836         default:
1837                 break;
1838         }
1839 
1840         /* Unknown Option */
1841         return (-1);
1842 }
1843 
/*
 * Destroy a sonode: run sonode_fini() on it and then return it to
 * socket_cache with kmem_cache_free().
 */
1844 void
1845 socket_sonode_destroy(struct sonode *so)
1846 {
1847         sonode_fini(so);
1848         kmem_cache_free(socket_cache, so);
1849 }
1850 
1851 int
1852 so_zcopy_wait(struct sonode *so)
1853 {
1854         int error = 0;
1855 
1856         mutex_enter(&so->so_lock);
1857         while (!(so->so_copyflag & STZCNOTIFY)) {
1858                 if (so->so_state & SS_CLOSING) {
1859                         mutex_exit(&so->so_lock);
1860                         return (EINTR);
1861                 }
1862                 if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
1863                         error = EINTR;
1864                         break;
1865                 }
1866         }
1867         so->so_copyflag &= ~STZCNOTIFY;
1868         mutex_exit(&so->so_lock);
1869         return (error);
1870 }
1871 
1872 void
1873 so_timer_callback(void *arg)
1874 {
1875         struct sonode *so = (struct sonode *)arg;
1876 
1877         mutex_enter(&so->so_lock);
1878 
1879         so->so_rcv_timer_tid = 0;
1880         if (so->so_rcv_queued > 0) {
1881                 so_notify_data(so, so->so_rcv_queued);
1882         } else {
1883                 mutex_exit(&so->so_lock);
1884         }
1885 }
1886 
#ifdef DEBUG
/*
 * Verify that so_rcv_queued agrees with the amount of data, as reported by
 * msgdsize(), that is sitting on the two receive lists (so_rcv_q_head and
 * so_rcv_head).
 *
 * The caller must hold so_lock. Returns B_TRUE when the lengths match.
 */
static boolean_t
so_check_length(sonode_t *so)
{
        mblk_t *msg;
        int total = 0;

        ASSERT(MUTEX_HELD(&so->so_lock));

        /* Messages on the processing list */
        for (msg = so->so_rcv_q_head; msg != NULL; msg = msg->b_next)
                total += msgdsize(msg);

        /* Messages on the incoming list */
        for (msg = so->so_rcv_head; msg != NULL; msg = msg->b_next)
                total += msgdsize(msg);

        if (total == so->so_rcv_queued)
                return (B_TRUE);
        return (B_FALSE);
}
#endif
1914 
1915 int
1916 so_get_mod_version(struct sockparams *sp)
1917 {
1918         ASSERT(sp != NULL && sp->sp_smod_info != NULL);
1919         return (sp->sp_smod_info->smod_version);
1920 }
1921 
1922 /*
1923  * so_start_fallback()
1924  *
1925  * Block new socket operations from coming in, and wait for active operations
1926  * to complete. Threads that are sleeping will be woken up so they can get
1927  * out of the way.
1928  *
1929  * The caller must be a reader on so_fallback_rwlock.
1930  */
1931 static boolean_t
1932 so_start_fallback(struct sonode *so)
1933 {
1934         ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
1935 
1936         mutex_enter(&so->so_lock);
1937         if (so->so_state & SS_FALLBACK_PENDING) {
1938                 mutex_exit(&so->so_lock);
1939                 return (B_FALSE);
1940         }
1941         so->so_state |= SS_FALLBACK_PENDING;
1942         /*
1943          * Poke all threads that might be sleeping. Any operation that comes
1944          * in after the cv_broadcast will observe the fallback pending flag
1945          * which cause the call to return where it would normally sleep.
1946          */
1947         cv_broadcast(&so->so_state_cv);          /* threads in connect() */
1948         cv_broadcast(&so->so_rcv_cv);            /* threads in recvmsg() */
1949         cv_broadcast(&so->so_snd_cv);            /* threads in sendmsg() */
1950         mutex_enter(&so->so_acceptq_lock);
1951         cv_broadcast(&so->so_acceptq_cv);        /* threads in accept() */
1952         mutex_exit(&so->so_acceptq_lock);
1953         mutex_exit(&so->so_lock);
1954 
1955         /*
1956          * The main reason for the rw_tryupgrade call is to provide
1957          * observability during the fallback process. We want to
1958          * be able to see if there are pending operations.
1959          */
1960         if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
1961                 /*
1962                  * It is safe to drop and reaquire the fallback lock, because
1963                  * we are guaranteed that another fallback cannot take place.
1964                  */
1965                 rw_exit(&so->so_fallback_rwlock);
1966                 DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
1967                 rw_enter(&so->so_fallback_rwlock, RW_WRITER);
1968                 DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
1969         }
1970 
1971         return (B_TRUE);
1972 }
1973 
1974 /*
1975  * so_end_fallback()
1976  *
1977  * Allow socket opertions back in.
1978  *
1979  * The caller must be a writer on so_fallback_rwlock.
1980  */
1981 static void
1982 so_end_fallback(struct sonode *so)
1983 {
1984         ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
1985 
1986         mutex_enter(&so->so_lock);
1987         so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
1988         mutex_exit(&so->so_lock);
1989 
1990         rw_downgrade(&so->so_fallback_rwlock);
1991 }
1992 
1993 /*
1994  * so_quiesced_cb()
1995  *
1996  * Callback passed to the protocol during fallback. It is called once
1997  * the endpoint is quiescent.
1998  *
1999  * No requests from the user, no notifications from the protocol, so it
2000  * is safe to synchronize the state. Data can also be moved without
2001  * risk for reordering.
2002  *
2003  * We do not need to hold so_lock, since there can be only one thread
2004  * operating on the sonode.
2005  */
2006 static mblk_t *
2007 so_quiesced_cb(sock_upper_handle_t sock_handle, sock_quiesce_arg_t *arg,
2008     struct T_capability_ack *tcap,
2009     struct sockaddr *laddr, socklen_t laddrlen,
2010     struct sockaddr *faddr, socklen_t faddrlen, short opts)
2011 {
2012         struct sonode *so = (struct sonode *)sock_handle;
2013         boolean_t atmark;
2014         mblk_t *retmp = NULL, **tailmpp = &retmp;
2015 
2016         if (tcap != NULL)
2017                 sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen,
2018                     opts);
2019 
2020         /*
2021          * Some protocols do not quiece the data path during fallback. Once
2022          * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
2023          * fail and the protocol is responsible for saving the data for later
2024          * delivery (i.e., once the fallback has completed).
2025          */
2026         mutex_enter(&so->so_lock);
2027         so->so_state |= SS_FALLBACK_DRAIN;
2028         SOCKET_TIMER_CANCEL(so);
2029         mutex_exit(&so->so_lock);
2030 
2031         if (so->so_rcv_head != NULL) {
2032                 if (so->so_rcv_q_last_head == NULL)
2033                         so->so_rcv_q_head = so->so_rcv_head;
2034                 else
2035                         so->so_rcv_q_last_head->b_next = so->so_rcv_head;
2036                 so->so_rcv_q_last_head = so->so_rcv_last_head;
2037         }
2038 
2039         atmark = (so->so_state & SS_RCVATMARK) != 0;
2040         /*
2041          * Clear any OOB state having to do with pending data. The TPI
2042          * code path will set the appropriate oob state when we move the
2043          * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
2044          * data has already been consumed.
2045          */
2046         so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);
2047 
2048         ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);
2049 
2050         /*
2051          * Move data to the STREAM head.
2052          */
2053         while (so->so_rcv_q_head != NULL) {
2054                 mblk_t *mp = so->so_rcv_q_head;
2055                 size_t mlen = msgdsize(mp);
2056 
2057                 so->so_rcv_q_head = mp->b_next;
2058                 mp->b_next = NULL;
2059                 mp->b_prev = NULL;
2060 
2061                 /*
2062                  * Send T_EXDATA_IND if we are at the oob mark.
2063                  */
2064                 if (atmark) {
2065                         struct T_exdata_ind *tei;
2066                         mblk_t *mp1 = arg->soqa_exdata_mp;
2067 
2068                         arg->soqa_exdata_mp = NULL;
2069                         ASSERT(mp1 != NULL);
2070                         mp1->b_datap->db_type = M_PROTO;
2071                         tei = (struct T_exdata_ind *)mp1->b_rptr;
2072                         tei->PRIM_type = T_EXDATA_IND;
2073                         tei->MORE_flag = 0;
2074                         mp1->b_wptr = (uchar_t *)&tei[1];
2075 
2076                         if (IS_SO_OOB_INLINE(so)) {
2077                                 mp1->b_cont = mp;
2078                         } else {
2079                                 ASSERT(so->so_oobmsg != NULL);
2080                                 mp1->b_cont = so->so_oobmsg;
2081                                 so->so_oobmsg = NULL;
2082 
2083                                 /* process current mp next time around */
2084                                 mp->b_next = so->so_rcv_q_head;
2085                                 so->so_rcv_q_head = mp;
2086                                 mlen = 0;
2087                         }
2088                         mp = mp1;
2089 
2090                         /* we have consumed the oob mark */
2091                         atmark = B_FALSE;
2092                 } else if (so->so_oobmark > 0) {
2093                         /*
2094                          * Check if the OOB mark is within the current
2095                          * mblk chain. In that case we have to split it up.
2096                          */
2097                         if (so->so_oobmark < mlen) {
2098                                 mblk_t *urg_mp = mp;
2099 
2100                                 atmark = B_TRUE;
2101                                 mp = NULL;
2102                                 mlen = so->so_oobmark;
2103 
2104                                 /*
2105                                  * It is assumed that the OOB mark does
2106                                  * not land within a mblk.
2107                                  */
2108                                 do {
2109                                         so->so_oobmark -= MBLKL(urg_mp);
2110                                         mp = urg_mp;
2111                                         urg_mp = urg_mp->b_cont;
2112                                 } while (so->so_oobmark > 0);
2113                                 mp->b_cont = NULL;
2114                                 if (urg_mp != NULL) {
2115                                         urg_mp->b_next = so->so_rcv_q_head;
2116                                         so->so_rcv_q_head = urg_mp;
2117                                 }
2118                         } else {
2119                                 so->so_oobmark -= mlen;
2120                                 if (so->so_oobmark == 0)
2121                                         atmark = B_TRUE;
2122                         }
2123                 }
2124 
2125                 /*
2126                  * Queue data on the STREAM head.
2127                  */
2128                 so->so_rcv_queued -= mlen;
2129                 *tailmpp = mp;
2130                 tailmpp = &mp->b_next;
2131         }
2132         so->so_rcv_head = NULL;
2133         so->so_rcv_last_head = NULL;
2134         so->so_rcv_q_head = NULL;
2135         so->so_rcv_q_last_head = NULL;
2136 
2137         /*
2138          * Check if the oob byte is at the end of the data stream, or if the
2139          * oob byte has not yet arrived. In the latter case we have to send a
2140          * SIGURG and a mark indicator to the STREAM head. The mark indicator
2141          * is needed to guarantee correct behavior for SIOCATMARK. See block
2142          * comment in socktpi.h for more details.
2143          */
2144         if (atmark || so->so_oobmark > 0) {
2145                 mblk_t *mp;
2146 
2147                 if (atmark && so->so_oobmsg != NULL) {
2148                         struct T_exdata_ind *tei;
2149 
2150                         mp = arg->soqa_exdata_mp;
2151                         arg->soqa_exdata_mp = NULL;
2152                         ASSERT(mp != NULL);
2153                         mp->b_datap->db_type = M_PROTO;
2154                         tei = (struct T_exdata_ind *)mp->b_rptr;
2155                         tei->PRIM_type = T_EXDATA_IND;
2156                         tei->MORE_flag = 0;
2157                         mp->b_wptr = (uchar_t *)&tei[1];
2158 
2159                         mp->b_cont = so->so_oobmsg;
2160                         so->so_oobmsg = NULL;
2161 
2162                         *tailmpp = mp;
2163                         tailmpp = &mp->b_next;
2164                 } else {
2165                         /* Send up the signal */
2166                         mp = arg->soqa_exdata_mp;
2167                         arg->soqa_exdata_mp = NULL;
2168                         ASSERT(mp != NULL);
2169                         DB_TYPE(mp) = M_PCSIG;
2170                         *mp->b_wptr++ = (uchar_t)SIGURG;
2171                         *tailmpp = mp;
2172                         tailmpp = &mp->b_next;
2173 
2174                         /* Send up the mark indicator */
2175                         mp = arg->soqa_urgmark_mp;
2176                         arg->soqa_urgmark_mp = NULL;
2177                         mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
2178                         *tailmpp = mp;
2179                         tailmpp = &mp->b_next;
2180 
2181                         so->so_oobmark = 0;
2182                 }
2183         }
2184         ASSERT(so->so_oobmark == 0);
2185         ASSERT(so->so_rcv_queued == 0);
2186 
2187         return (retmp);
2188 }
2189 
2190 #ifdef DEBUG
2191 /*
2192  * Do an integrity check of the sonode. This should be done if a
2193  * fallback fails after sonode has initially been converted to use
2194  * TPI and subsequently have to be reverted.
2195  *
2196  * Failure to pass the integrity check will panic the system.
      *
      * cur  - the sonode in its current state
      * orig - the sonode contents to compare against
      *
      * Each field is compared with VERIFY(). Most fields must be equal; a
      * few are only required not to have gone backwards (so_acceptq_len,
      * so_oobmark, so_rcv_queued), and so_error may have been set but must
      * not have been cleared. Fields that the protocol is allowed to update
      * are not checked at all.
2197  */
2198 void
2199 so_integrity_check(struct sonode *cur, struct sonode *orig)
2200 {
2201         VERIFY(cur->so_vnode == orig->so_vnode);
2202         VERIFY(cur->so_ops == orig->so_ops);
2203         /*
2204          * For so_state we can only VERIFY the state flags in CHECK_STATE.
2205          * The other state flags might be affected by a notification from the
2206          * protocol.
              *
              * Note that the comparison below only requires that every
              * CHECK_STATE flag set in orig is still set in cur.
2207          */
2208 #define CHECK_STATE     (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
2209         SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
2210         SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
2211         VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
2212             (orig->so_state & CHECK_STATE));
2213         VERIFY(cur->so_mode == orig->so_mode);
2214         VERIFY(cur->so_flag == orig->so_flag);
2215         VERIFY(cur->so_count == orig->so_count);
2216         /* Cannot VERIFY so_proto_connid; proto can update it */
2217         VERIFY(cur->so_sockparams == orig->so_sockparams);
2218         /* an error might have been recorded, but it can not be lost */
2219         VERIFY(cur->so_error != 0 || orig->so_error == 0);
2220         VERIFY(cur->so_family == orig->so_family);
2221         VERIFY(cur->so_type == orig->so_type);
2222         VERIFY(cur->so_protocol == orig->so_protocol);
2223         VERIFY(cur->so_version == orig->so_version);
2224         /* New conns might have arrived, but none should have been lost */
2225         VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
2226         VERIFY(list_head(&cur->so_acceptq_list) ==
2227             list_head(&orig->so_acceptq_list));
2228         VERIFY(cur->so_backlog == orig->so_backlog);
2229         /* New OOB might have arrived, but mark should not have been lost */
2230         VERIFY(cur->so_oobmark >= orig->so_oobmark);
2231         /* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
2232         VERIFY(cur->so_pgrp == orig->so_pgrp);
2233         VERIFY(cur->so_peercred == orig->so_peercred);
2234         VERIFY(cur->so_cpid == orig->so_cpid);
2235         VERIFY(cur->so_zoneid == orig->so_zoneid);
2236         /* New data might have arrived, but none should have been lost */
2237         VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
2238         VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
2239         VERIFY(cur->so_rcv_head == orig->so_rcv_head);
2240         VERIFY(cur->so_proto_handle == orig->so_proto_handle);
2241         VERIFY(cur->so_downcalls == orig->so_downcalls);
2242         /* Cannot VERIFY so_proto_props; they can be updated by proto */
2243 }
2244 #endif
2245 
2246 /*
2247  * so_tpi_fallback()
2248  *
2249  * This is the fallback initiation routine; things start here.
2250  *
2251  * Basic strategy:
2252  *   o Block new socket operations from coming in
2253  *   o Allocate/initiate info needed by TPI
2254  *   o Quiesce the connection, at which point we sync
2255  *     state and move data
2256  *   o Change operations (sonodeops) associated with the socket
2257  *   o Unblock threads waiting for the fallback to finish
2258  */
2259 int
2260 so_tpi_fallback(struct sonode *so, struct cred *cr)
2261 {
2262         int error;
2263         queue_t *q;
2264         struct sockparams *sp;
2265         struct sockparams *newsp = NULL;
2266         so_proto_fallback_func_t fbfunc;
2267         const char *devpath;
2268         boolean_t direct;
2269         struct sonode *nso;
2270         sock_quiesce_arg_t arg = { NULL, NULL };
2271 #ifdef DEBUG
2272         struct sonode origso;
2273 #endif
2274         error = 0;
2275         sp = so->so_sockparams;
2276         fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
2277 
2278         /*
2279          * Cannot fallback if the socket has active filters
2280          */
2281         if (so->so_filter_active > 0)
2282                 return (EINVAL);
2283 
2284         switch (so->so_family) {
2285         case AF_INET:
2286                 devpath = sp->sp_smod_info->smod_fallback_devpath_v4;
2287                 break;
2288         case AF_INET6:
2289                 devpath = sp->sp_smod_info->smod_fallback_devpath_v6;
2290                 break;
2291         default:
2292                 return (EINVAL);
2293         }
2294 
2295         /*
2296          * Fallback can only happen if the socket module has a TPI device
2297          * and fallback function.
2298          */
2299         if (devpath == NULL || fbfunc == NULL)
2300                 return (EINVAL);
2301 
2302         /*
2303          * Initiate fallback; upon success we know that no new requests
2304          * will come in from the user.
2305          */
2306         if (!so_start_fallback(so))
2307                 return (EAGAIN);
2308 #ifdef DEBUG
2309         /*
2310          * Make a copy of the sonode in case we need to make an integrity
2311          * check later on.
2312          */
2313         bcopy(so, &origso, sizeof (*so));
2314 #endif
2315 
2316         sp->sp_stats.sps_nfallback.value.ui64++;
2317 
2318         newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
2319             so->so_protocol, devpath, KM_SLEEP, &error);
2320         if (error != 0)
2321                 goto out;
2322 
2323         if (so->so_direct != NULL) {
2324                 sodirect_t *sodp = so->so_direct;
2325                 mutex_enter(&so->so_lock);
2326 
2327                 so->so_direct->sod_enabled = B_FALSE;
2328                 so->so_state &= ~SS_SODIRECT;
2329                 ASSERT(sodp->sod_uioafh == NULL);
2330                 mutex_exit(&so->so_lock);
2331         }
2332 
2333         /* Turn sonode into a TPI socket */
2334         error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
2335         if (error != 0)
2336                 goto out;
2337         /*
2338          * When it comes to urgent data we have two cases to deal with;
2339          * (1) The oob byte has already arrived, or (2) the protocol has
2340          * notified that oob data is pending, but it has not yet arrived.
2341          *
2342          * For (1) all we need to do is send a T_EXDATA_IND to indicate were
2343          * in the byte stream the oob byte is. For (2) we have to send a
2344          * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
2345          * the oob byte will be the next byte from the protocol.
2346          *
2347          * So in the worst case we need two mblks, one for the signal, another
2348          * for mark indication. In that case we use the exdata_mp for the sig.
2349          */
2350         arg.soqa_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind),
2351             BPRI_MED, STR_NOSIG, NULL);
2352         arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
2353 
2354         /*
2355          * Now tell the protocol to start using TPI. so_quiesced_cb be
2356          * called once it's safe to synchronize state.
2357          */
2358         DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
2359         error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb,
2360             &arg);
2361         DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
2362 
2363         if (error != 0) {
2364                 /* protocol was unable to do a fallback, revert the sonode */
2365                 sotpi_revert_sonode(so, cr);
2366                 goto out;
2367         }
2368 
2369         /*
2370          * Walk the accept queue and notify the proto that they should
2371          * fall back to TPI. The protocol will send up the T_CONN_IND.
2372          */
2373         nso = list_head(&so->so_acceptq_list);
2374         while (nso != NULL) {
2375                 int rval;
2376                 struct sonode *next;
2377 
2378                 if (arg.soqa_exdata_mp == NULL) {
2379                         arg.soqa_exdata_mp =
2380                             allocb_wait(sizeof (struct T_exdata_ind),
2381                             BPRI_MED, STR_NOSIG, NULL);
2382                 }
2383                 if (arg.soqa_urgmark_mp == NULL) {
2384                         arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED,
2385                             STR_NOSIG, NULL);
2386                 }
2387 
2388                 DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
2389                 rval = (*fbfunc)(nso->so_proto_handle, NULL, direct,
2390                     so_quiesced_cb, &arg);
2391                 DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
2392                 if (rval != 0) {
2393                         /* Abort the connection */
2394                         zcmn_err(getzoneid(), CE_WARN,
2395                             "Failed to convert socket in accept queue to TPI. "
2396                             "Pid = %d\n", curproc->p_pid);
2397                         next = list_next(&so->so_acceptq_list, nso);
2398                         list_remove(&so->so_acceptq_list, nso);
2399                         so->so_acceptq_len--;
2400 
2401                         (void) socket_close(nso, 0, CRED());
2402                         socket_destroy(nso);
2403                         nso = next;
2404                 } else {
2405                         nso = list_next(&so->so_acceptq_list, nso);
2406                 }
2407         }
2408 
2409         /*
2410          * Now flush the acceptq, this will destroy all sockets. They will
2411          * be recreated in sotpi_accept().
2412          */
2413         so_acceptq_flush(so, B_FALSE);
2414 
2415         mutex_enter(&so->so_lock);
2416         so->so_state |= SS_FALLBACK_COMP;
2417         mutex_exit(&so->so_lock);
2418 
2419         /*
2420          * Swap the sonode ops. Socket operations that come in once this
2421          * is done will proceed without blocking.
2422          */
2423         so->so_ops = &sotpi_sonodeops;
2424 
2425         /*
2426          * Wake up any threads stuck in poll. This is needed since the poll
2427          * head changes when the fallback happens (moves from the sonode to
2428          * the STREAMS head).
2429          */
2430         pollwakeup(&so->so_poll_list, POLLERR);
2431 
2432         /*
2433          * When this non-STREAM socket was created we placed an extra ref on
2434          * the associated vnode to support asynchronous close. Drop that ref
2435          * here.
2436          */
2437         ASSERT(SOTOV(so)->v_count >= 2);
2438         VN_RELE(SOTOV(so));
2439 out:
2440         so_end_fallback(so);
2441 
2442         if (error != 0) {
2443 #ifdef DEBUG
2444                 so_integrity_check(so, &origso);
2445 #endif
2446                 zcmn_err(getzoneid(), CE_WARN,
2447                     "Failed to convert socket to TPI (err=%d). Pid = %d\n",
2448                     error, curproc->p_pid);
2449                 if (newsp != NULL)
2450                         SOCKPARAMS_DEC_REF(newsp);
2451         }
2452         if (arg.soqa_exdata_mp != NULL)
2453                 freemsg(arg.soqa_exdata_mp);
2454         if (arg.soqa_urgmark_mp != NULL)
2455                 freemsg(arg.soqa_urgmark_mp);
2456 
2457         return (error);
2458 }