1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /*
  26  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  27  * Copyright 2015 Joyent, Inc.
  28  */
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/signal.h>
  33 #include <sys/cmn_err.h>
  34 
  35 #include <sys/stropts.h>
  36 #include <sys/socket.h>
  37 #include <sys/socketvar.h>
  38 #include <sys/sockio.h>
  39 #include <sys/strsubr.h>
  40 #include <sys/strsun.h>
  41 #include <sys/atomic.h>
  42 #include <sys/tihdr.h>
  43 
  44 #include <fs/sockfs/sockcommon.h>
  45 #include <fs/sockfs/sockfilter_impl.h>
  46 #include <fs/sockfs/socktpi.h>
  47 #include <fs/sockfs/sodirect.h>
  48 #include <sys/ddi.h>
  49 #include <inet/ip.h>
  50 #include <sys/time.h>
  51 #include <sys/cmn_err.h>
  52 
/*
 * Test hooks, declared only when the file is built with SOCK_TEST.
 * NOTE(review): neither symbol is referenced in the portion of the file
 * examined for this comment -- confirm where they are consumed.
 */
#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

/*
 * so_mblk_pull_len is compared against the length of the first data mblk
 * in so_dequeue_msg(): when that mblk is shorter, the message is passed to
 * pullupmsg() before being copied out. MBLK_PULL_LEN is only the initial
 * value of the variable.
 */
#define MBLK_PULL_LEN 64
uint32_t so_mblk_pull_len = MBLK_PULL_LEN;

/*
 * DEBUG-only consistency checking. When so_debug_length is set, code in
 * this file ASSERTs the result of so_check_length() on the socket. The
 * checks are off (B_FALSE) by default.
 */
#ifdef DEBUG
boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
#endif
  65 
  66 static int
  67 so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
  68     struct sonode **nsop)
  69 {
  70         struct sonode *nso = NULL;
  71 
  72         *nsop = NULL;
  73         ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
  74         while ((nso = list_remove_head(&so->so_acceptq_list)) == NULL) {
  75                 /*
  76                  * No need to check so_error here, because it is not
  77                  * possible for a listening socket to be reset or otherwise
  78                  * disconnected.
  79                  *
  80                  * So now we just need check if it's ok to wait.
  81                  */
  82                 if (dontblock)
  83                         return (EWOULDBLOCK);
  84                 if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
  85                         return (EINTR);
  86 
  87                 if (cv_wait_sig_swap(&so->so_acceptq_cv,
  88                     &so->so_acceptq_lock) == 0)
  89                         return (EINTR);
  90         }
  91 
  92         ASSERT(nso != NULL);
  93         ASSERT(so->so_acceptq_len > 0);
  94         so->so_acceptq_len--;
  95         nso->so_listener = NULL;
  96 
  97         *nsop = nso;
  98 
  99         return (0);
 100 }
 101 
 102 /*
 103  * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 104  *
 105  * Pulls a connection off of the accept queue.
 106  *
 107  * Arguments:
 108  *   so        - listening socket
 109  *   dontblock - indicate whether it's ok to sleep if there are no
 110  *               connections on the queue
 111  *   nsop      - Value-return argument
 112  *
 113  * Return values:
 114  *   0 when a connection is successfully dequeued, in which case nsop
 115  *   is set to point to the new connection. Upon failure a non-zero
 116  *   value is returned, and the value of nsop is set to NULL.
 117  *
 118  * Note:
 119  *   so_acceptq_dequeue() may return prematurly if the socket is falling
 120  *   back to TPI.
 121  */
 122 int
 123 so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
 124     struct sonode **nsop)
 125 {
 126         int error;
 127 
 128         mutex_enter(&so->so_acceptq_lock);
 129         error = so_acceptq_dequeue_locked(so, dontblock, nsop);
 130         mutex_exit(&so->so_acceptq_lock);
 131 
 132         return (error);
 133 }
 134 
 135 static void
 136 so_acceptq_flush_impl(struct sonode *so, list_t *list, boolean_t doclose)
 137 {
 138         struct sonode *nso;
 139 
 140         while ((nso = list_remove_head(list)) != NULL) {
 141                 nso->so_listener = NULL;
 142                 if (doclose) {
 143                         (void) socket_close(nso, 0, CRED());
 144                 } else {
 145                         /*
 146                          * Only used for fallback - not possible when filters
 147                          * are present.
 148                          */
 149                         ASSERT(so->so_filter_active == 0);
 150                         /*
 151                          * Since the socket is on the accept queue, there can
 152                          * only be one reference. We drop the reference and
 153                          * just blow off the socket.
 154                          */
 155                         ASSERT(nso->so_count == 1);
 156                         nso->so_count--;
 157                         /* drop the proto ref */
 158                         VN_RELE(SOTOV(nso));
 159                 }
 160                 socket_destroy(nso);
 161         }
 162 }
 163 /*
 164  * void so_acceptq_flush(struct sonode *so)
 165  *
 166  * Removes all pending connections from a listening socket, and
 167  * frees the associated resources.
 168  *
 169  * Arguments
 170  *   so      - listening socket
 171  *   doclose - make a close downcall for each socket on the accept queue
 172  *
 173  * Return values:
 174  *   None.
 175  *
 176  * Note:
 177  *   The caller has to ensure that no calls to so_acceptq_enqueue() or
 178  *   so_acceptq_dequeue() occur while the accept queue is being flushed.
 179  *   So either the socket needs to be in a state where no operations
 180  *   would come in, or so_lock needs to be obtained.
 181  */
 182 void
 183 so_acceptq_flush(struct sonode *so, boolean_t doclose)
 184 {
 185         so_acceptq_flush_impl(so, &so->so_acceptq_list, doclose);
 186         so_acceptq_flush_impl(so, &so->so_acceptq_defer, doclose);
 187 
 188         so->so_acceptq_len = 0;
 189 }
 190 
 191 int
 192 so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
 193     sock_connid_t id)
 194 {
 195         ASSERT(MUTEX_HELD(&so->so_lock));
 196 
 197         /*
 198          * The protocol has notified us that a connection attempt is being
 199          * made, so before we wait for a notification to arrive we must
 200          * clear out any errors associated with earlier connection attempts.
 201          */
 202         if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
 203                 so->so_error = 0;
 204 
 205         while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
 206                 if (nonblock)
 207                         return (EINPROGRESS);
 208 
 209                 if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
 210                         return (EINTR);
 211 
 212                 if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
 213                         return (EINTR);
 214         }
 215 
 216         if (so->so_error != 0)
 217                 return (sogeterr(so, B_TRUE));
 218         /*
 219          * Under normal circumstances, so_error should contain an error
 220          * in case the connect failed. However, it is possible for another
 221          * thread to come in a consume the error, so generate a sensible
 222          * error in that case.
 223          */
 224         if ((so->so_state & SS_ISCONNECTED) == 0)
 225                 return (ECONNREFUSED);
 226 
 227         return (0);
 228 }
 229 
 230 /*
 231  * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 232  *    sock_connid_t id)
 233  *
 234  * Wait until the socket is connected or an error has occured.
 235  *
 236  * Arguments:
 237  *   so       - socket
 238  *   nonblock - indicate whether it's ok to sleep if the connection has
 239  *              not yet been established
 240  *   gen      - generation number that was returned by the protocol
 241  *              when the operation was started
 242  *
 243  * Returns:
 244  *   0 if the connection attempt was successful, or an error indicating why
 245  *   the connection attempt failed.
 246  */
 247 int
 248 so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
 249 {
 250         int error;
 251 
 252         mutex_enter(&so->so_lock);
 253         error = so_wait_connected_locked(so, nonblock, id);
 254         mutex_exit(&so->so_lock);
 255 
 256         return (error);
 257 }
 258 
 259 int
 260 so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
 261 {
 262         int error;
 263 
 264         ASSERT(MUTEX_HELD(&so->so_lock));
 265         while (SO_SND_FLOWCTRLD(so)) {
 266                 if (so->so_state & SS_CANTSENDMORE)
 267                         return (EPIPE);
 268                 if (dontblock)
 269                         return (EWOULDBLOCK);
 270 
 271                 if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
 272                         return (EINTR);
 273 
 274                 if (so->so_sndtimeo == 0) {
 275                         /*
 276                          * Zero means disable timeout.
 277                          */
 278                         error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
 279                 } else {
 280                         error = cv_reltimedwait_sig(&so->so_snd_cv,
 281                             &so->so_lock, so->so_sndtimeo, TR_CLOCK_TICK);
 282                 }
 283                 if (error == 0)
 284                         return (EINTR);
 285                 else if (error == -1)
 286                         return (EAGAIN);
 287         }
 288         return (0);
 289 }
 290 
 291 /*
 292  * int so_wait_sendbuf(struct sonode *so, boolean_t dontblock)
 293  *
 294  * Wait for the transport to notify us about send buffers becoming
 295  * available.
 296  */
 297 int
 298 so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 299 {
 300         int error = 0;
 301 
 302         mutex_enter(&so->so_lock);
 303         so->so_snd_wakeup = B_TRUE;
 304         error = so_snd_wait_qnotfull_locked(so, dontblock);
 305         so->so_snd_wakeup = B_FALSE;
 306         mutex_exit(&so->so_lock);
 307 
 308         return (error);
 309 }
 310 
 311 void
 312 so_snd_qfull(struct sonode *so)
 313 {
 314         mutex_enter(&so->so_lock);
 315         so->so_snd_qfull = B_TRUE;
 316         mutex_exit(&so->so_lock);
 317 }
 318 
 319 void
 320 so_snd_qnotfull(struct sonode *so)
 321 {
 322         mutex_enter(&so->so_lock);
 323         so->so_snd_qfull = B_FALSE;
 324         /* wake up everyone waiting for buffers */
 325         cv_broadcast(&so->so_snd_cv);
 326         mutex_exit(&so->so_lock);
 327 }
 328 
 329 /*
 330  * Change the process/process group to which SIGIO is sent.
 331  */
 332 int
 333 socket_chgpgrp(struct sonode *so, pid_t pid)
 334 {
 335         int error;
 336 
 337         ASSERT(MUTEX_HELD(&so->so_lock));
 338         if (pid != 0) {
 339                 /*
 340                  * Permissions check by sending signal 0.
 341                  * Note that when kill fails it does a
 342                  * set_errno causing the system call to fail.
 343                  */
 344                 error = kill(pid, 0);
 345                 if (error != 0) {
 346                         return (error);
 347                 }
 348         }
 349         so->so_pgrp = pid;
 350         return (0);
 351 }
 352 
 353 
 354 /*
 355  * Generate a SIGIO, for 'writable' events include siginfo structure,
 356  * for read events just send the signal.
 357  */
 358 /*ARGSUSED*/
 359 static void
 360 socket_sigproc(proc_t *proc, int event)
 361 {
 362         k_siginfo_t info;
 363 
 364         ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));
 365 
 366         if (event & SOCKETSIG_WRITE) {
 367                 info.si_signo = SIGPOLL;
 368                 info.si_code = POLL_OUT;
 369                 info.si_errno = 0;
 370                 info.si_fd = 0;
 371                 info.si_band = 0;
 372                 sigaddq(proc, NULL, &info, KM_NOSLEEP);
 373         }
 374         if (event & SOCKETSIG_READ) {
 375                 sigtoproc(proc, NULL, SIGPOLL);
 376         }
 377         if (event & SOCKETSIG_URG) {
 378                 sigtoproc(proc, NULL, SIGURG);
 379         }
 380 }
 381 
 382 void
 383 socket_sendsig(struct sonode *so, int event)
 384 {
 385         proc_t *proc;
 386 
 387         ASSERT(MUTEX_HELD(&so->so_lock));
 388 
 389         if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
 390             event != SOCKETSIG_URG)) {
 391                 return;
 392         }
 393 
 394         dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));
 395 
 396         if (so->so_pgrp > 0) {
 397                 /*
 398                  * XXX This unfortunately still generates
 399                  * a signal when a fd is closed but
 400                  * the proc is active.
 401                  */
 402                 mutex_enter(&pidlock);
 403                 /*
 404                  * Even if the thread started in another zone, we're receiving
 405                  * on behalf of this socket's zone, so find the proc using the
 406                  * socket's zone ID.
 407                  */
 408                 proc = prfind_zone(so->so_pgrp, so->so_zoneid);
 409                 if (proc == NULL) {
 410                         mutex_exit(&pidlock);
 411                         return;
 412                 }
 413                 mutex_enter(&proc->p_lock);
 414                 mutex_exit(&pidlock);
 415                 socket_sigproc(proc, event);
 416                 mutex_exit(&proc->p_lock);
 417         } else {
 418                 /*
 419                  * Send to process group. Hold pidlock across
 420                  * calls to socket_sigproc().
 421                  */
 422                 pid_t pgrp = -so->so_pgrp;
 423 
 424                 mutex_enter(&pidlock);
 425                 /*
 426                  * Even if the thread started in another zone, we're receiving
 427                  * on behalf of this socket's zone, so find the pgrp using the
 428                  * socket's zone ID.
 429                  */
 430                 proc = pgfind_zone(pgrp, so->so_zoneid);
 431                 while (proc != NULL) {
 432                         mutex_enter(&proc->p_lock);
 433                         socket_sigproc(proc, event);
 434                         mutex_exit(&proc->p_lock);
 435                         proc = proc->p_pglink;
 436                 }
 437                 mutex_exit(&pidlock);
 438         }
 439 }
 440 
 441 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 442 /* Copy userdata into a new mblk_t */
 443 mblk_t *
 444 socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
 445     size_t tail_len, int *errorp)
 446 {
 447         mblk_t  *head = NULL, **tail = &head;
 448 
 449         ASSERT(iosize == INFPSZ || iosize > 0);
 450 
 451         if (iosize == INFPSZ || iosize > uiop->uio_resid)
 452                 iosize = uiop->uio_resid;
 453 
 454         if (maxblk == INFPSZ)
 455                 maxblk = iosize;
 456 
 457         /* Nothing to do in these cases, so we're done */
 458         if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
 459                 goto done;
 460 
 461         /*
 462          * We will enter the loop below if iosize is 0; it will allocate an
 463          * empty message block and call uiomove(9F) which will just return.
 464          * We could avoid that with an extra check but would only slow
 465          * down the much more likely case where iosize is larger than 0.
 466          */
 467         do {
 468                 ssize_t blocksize;
 469                 mblk_t  *mp;
 470 
 471                 blocksize = MIN(iosize, maxblk);
 472                 ASSERT(blocksize >= 0);
 473                 mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
 474                 if (mp == NULL) {
 475                         *errorp = ENOMEM;
 476                         return (head);
 477                 }
 478                 mp->b_rptr += wroff;
 479                 mp->b_wptr = mp->b_rptr + blocksize;
 480 
 481                 *tail = mp;
 482                 tail = &mp->b_cont;
 483 
 484                 /* uiomove(9F) either returns 0 or EFAULT */
 485                 if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
 486                     UIO_WRITE, uiop)) != 0) {
 487                         ASSERT(*errorp != ENOMEM);
 488                         freemsg(head);
 489                         return (NULL);
 490                 }
 491 
 492                 iosize -= blocksize;
 493         } while (iosize > 0);
 494 
 495 done:
 496         *errorp = 0;
 497         return (head);
 498 }
 499 
 500 mblk_t *
 501 socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
 502 {
 503         int error;
 504         ptrdiff_t n;
 505         mblk_t *nmp;
 506 
 507         ASSERT(mp->b_wptr >= mp->b_rptr);
 508 
 509         /*
 510          * max_read is the offset of the oobmark and read can not go pass
 511          * the oobmark.
 512          */
 513         if (max_read == INFPSZ || max_read > uiop->uio_resid)
 514                 max_read = uiop->uio_resid;
 515 
 516         do {
 517                 if ((n = MIN(max_read, MBLKL(mp))) != 0) {
 518                         ASSERT(n > 0);
 519 
 520                         error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
 521                         if (error != 0) {
 522                                 freemsg(mp);
 523                                 *errorp = error;
 524                                 return (NULL);
 525                         }
 526                 }
 527 
 528                 mp->b_rptr += n;
 529                 max_read -= n;
 530                 while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
 531                         /*
 532                          * get rid of zero length mblks
 533                          */
 534                         nmp = mp;
 535                         mp = mp->b_cont;
 536                         freeb(nmp);
 537                 }
 538         } while (mp != NULL && max_read > 0);
 539 
 540         *errorp = 0;
 541         return (mp);
 542 }
 543 
 544 static void
 545 so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
 546 {
 547         ASSERT(last_tail != NULL);
 548         mp->b_next = so->so_rcv_q_head;
 549         mp->b_prev = last_tail;
 550         ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));
 551 
 552         if (so->so_rcv_q_head == NULL) {
 553                 ASSERT(so->so_rcv_q_last_head == NULL);
 554                 so->so_rcv_q_last_head = mp;
 555 #ifdef DEBUG
 556         } else {
 557                 ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
 558 #endif
 559         }
 560         so->so_rcv_q_head = mp;
 561 
 562 #ifdef DEBUG
 563         if (so_debug_length) {
 564                 mutex_enter(&so->so_lock);
 565                 ASSERT(so_check_length(so));
 566                 mutex_exit(&so->so_lock);
 567         }
 568 #endif
 569 }
 570 
 571 /*
 572  * Move a mblk chain (mp_head, mp_last_head) to the sonode's rcv queue so it
 573  * can be processed by so_dequeue_msg().
 574  */
 575 void
 576 so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
 577 {
 578         if (so->so_filter_active > 0 &&
 579             (mp_head = sof_filter_data_in_proc(so, mp_head,
 580             &mp_last_head)) == NULL)
 581                 return;
 582 
 583         ASSERT(mp_head->b_prev != NULL);
 584         if (so->so_rcv_q_head == NULL) {
 585                 so->so_rcv_q_head = mp_head;
 586                 so->so_rcv_q_last_head = mp_last_head;
 587                 ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
 588         } else {
 589                 boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
 590                     (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));
 591 
 592                 if (mp_head->b_next == NULL &&
 593                     DB_TYPE(mp_head) == M_DATA &&
 594                     DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
 595                         so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
 596                         so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
 597                         mp_head->b_prev = NULL;
 598                 } else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
 599                         /*
 600                          * Append to last_head if more than one mblks, and both
 601                          * mp_head and last_head are I/OAT mblks.
 602                          */
 603                         ASSERT(mp_head->b_next != NULL);
 604                         so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
 605                         so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
 606                         mp_head->b_prev = NULL;
 607 
 608                         so->so_rcv_q_last_head->b_next = mp_head->b_next;
 609                         mp_head->b_next = NULL;
 610                         so->so_rcv_q_last_head = mp_last_head;
 611                 } else {
 612 #ifdef DEBUG
 613                         {
 614                                 mblk_t *tmp_mblk;
 615                                 tmp_mblk = mp_head;
 616                                 while (tmp_mblk != NULL) {
 617                                         ASSERT(tmp_mblk->b_prev != NULL);
 618                                         tmp_mblk = tmp_mblk->b_next;
 619                                 }
 620                         }
 621 #endif
 622                         so->so_rcv_q_last_head->b_next = mp_head;
 623                         so->so_rcv_q_last_head = mp_last_head;
 624                 }
 625         }
 626 }
 627 
 628 /*
 629  * Check flow control on a given sonode.  Must have so_lock held, and
 630  * this function will release the hold.  Return true if flow control
 631  * is cleared.
 632  */
 633 boolean_t
 634 so_check_flow_control(struct sonode *so)
 635 {
 636         ASSERT(MUTEX_HELD(&so->so_lock));
 637 
 638         if (so->so_flowctrld && (so->so_rcv_queued < so->so_rcvlowat &&
 639             !(so->so_state & SS_FIL_RCV_FLOWCTRL))) {
 640                 so->so_flowctrld = B_FALSE;
 641                 mutex_exit(&so->so_lock);
 642                 /*
 643                  * Open up flow control. SCTP does not have any downcalls, and
 644                  * it will clr flow ctrl in sosctp_recvmsg().
 645                  */
 646                 if (so->so_downcalls != NULL &&
 647                     so->so_downcalls->sd_clr_flowctrl != NULL) {
 648                         (*so->so_downcalls->sd_clr_flowctrl)
 649                             (so->so_proto_handle);
 650                 }
 651                 /* filters can start injecting data */
 652                 sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0);
 653                 return (B_TRUE);
 654         } else {
 655                 mutex_exit(&so->so_lock);
 656                 return (B_FALSE);
 657         }
 658 }
 659 
 660 int
 661 so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
 662     rval_t *rvalp, int flags)
 663 {
 664         mblk_t  *mp, *nmp;
 665         mblk_t  *savemp, *savemptail;
 666         mblk_t  *new_msg_head;
 667         mblk_t  *new_msg_last_head;
 668         mblk_t  *last_tail;
 669         boolean_t partial_read;
 670         boolean_t reset_atmark = B_FALSE;
 671         int more = 0;
 672         int error;
 673         ssize_t oobmark;
 674         ssize_t copied = 0;
 675         sodirect_t *sodp = so->so_direct;
 676         xuio_t *xuio = NULL;
 677 
 678         partial_read = B_FALSE;
 679         *mctlp = NULL;
 680         if ((uiop->uio_extflg & UIO_XUIO) != 0) {
 681                 xuio = (xuio_t *)uiop;
 682         }
 683 again:
 684         mutex_enter(&so->so_lock);
 685 again1:
 686 #ifdef DEBUG
 687         if (so_debug_length) {
 688                 ASSERT(so_check_length(so));
 689         }
 690 #endif
 691         if (so->so_state & SS_RCVATMARK) {
 692                 /* Check whether the caller is OK to read past the mark */
 693                 if (flags & MSG_NOMARK) {
 694                         mutex_exit(&so->so_lock);
 695                         return (EWOULDBLOCK);
 696                 }
 697                 reset_atmark = B_TRUE;
 698         }
 699         /*
 700          * First move messages from the dump area to processing area
 701          */
 702         if (sodp != NULL) {
 703                 if (sodp->sod_enabled) {
 704                         if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
 705                                 /* nothing to uioamove */
 706                                 sodp = NULL;
 707                         } else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
 708                                 sodp->sod_uioa.uioa_state &= UIOA_CLR;
 709                                 sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
 710                                 /*
 711                                  * try to uioamove() the data that
 712                                  * has already queued.
 713                                  */
 714                                 sod_uioa_so_init(so, sodp, uiop);
 715                         }
 716                 } else {
 717                         sodp = NULL;
 718                 }
 719         }
 720         new_msg_head = so->so_rcv_head;
 721         new_msg_last_head = so->so_rcv_last_head;
 722         so->so_rcv_head = NULL;
 723         so->so_rcv_last_head = NULL;
 724         oobmark = so->so_oobmark;
 725         /*
 726          * We can release the lock as there can only be one reader
 727          */
 728         mutex_exit(&so->so_lock);
 729 
 730         if (new_msg_head != NULL) {
 731                 so_process_new_message(so, new_msg_head, new_msg_last_head);
 732         }
 733         savemp = savemptail = NULL;
 734         rvalp->r_vals = 0;
 735         error = 0;
 736         mp = so->so_rcv_q_head;
 737 
 738         if (mp != NULL &&
 739             (so->so_rcv_timer_tid == 0 ||
 740             so->so_rcv_queued >= so->so_rcv_thresh)) {
 741                 partial_read = B_FALSE;
 742 
 743                 if (flags & MSG_PEEK) {
 744                         if ((nmp = dupmsg(mp)) == NULL &&
 745                             (nmp = copymsg(mp)) == NULL) {
 746                                 size_t size = msgsize(mp);
 747 
 748                                 error = strwaitbuf(size, BPRI_HI);
 749                                 if (error) {
 750                                         return (error);
 751                                 }
 752                                 goto again;
 753                         }
 754                         mp = nmp;
 755                 } else {
 756                         ASSERT(mp->b_prev != NULL);
 757                         last_tail = mp->b_prev;
 758                         mp->b_prev = NULL;
 759                         so->so_rcv_q_head = mp->b_next;
 760                         if (so->so_rcv_q_head == NULL) {
 761                                 so->so_rcv_q_last_head = NULL;
 762                         }
 763                         mp->b_next = NULL;
 764                 }
 765 
 766                 ASSERT(mctlp != NULL);
 767                 /*
 768                  * First process PROTO or PCPROTO blocks, if any.
 769                  */
 770                 if (DB_TYPE(mp) != M_DATA) {
 771                         *mctlp = mp;
 772                         savemp = mp;
 773                         savemptail = mp;
 774                         ASSERT(DB_TYPE(mp) == M_PROTO ||
 775                             DB_TYPE(mp) == M_PCPROTO);
 776                         while (mp->b_cont != NULL &&
 777                             DB_TYPE(mp->b_cont) != M_DATA) {
 778                                 ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
 779                                     DB_TYPE(mp->b_cont) == M_PCPROTO);
 780                                 mp = mp->b_cont;
 781                                 savemptail = mp;
 782                         }
 783                         mp = savemptail->b_cont;
 784                         savemptail->b_cont = NULL;
 785                 }
 786 
 787                 ASSERT(DB_TYPE(mp) == M_DATA);
 788                 /*
 789                  * Now process DATA blocks, if any. Note that for sodirect
 790                  * enabled socket, uio_resid can be 0.
 791                  */
 792                 if (uiop->uio_resid >= 0) {
 793                         if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
 794                                 mutex_enter(&so->so_lock);
 795                                 ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
 796                                 copied = sod_uioa_mblk(so, mp);
 797                                 if (copied > 0)
 798                                         partial_read = B_TRUE;
 799                                 mutex_exit(&so->so_lock);
 800                                 /* mark this mblk as processed */
 801                                 mp = NULL;
 802                         } else {
 803                                 ssize_t oldresid = uiop->uio_resid;
 804 
 805                                 if (MBLKL(mp) < so_mblk_pull_len) {
 806                                         if (pullupmsg(mp, -1) == 1) {
 807                                                 last_tail = mp;
 808                                         }
 809                                 }
 810                                 /*
 811                                  * Can not read beyond the oobmark
 812                                  */
 813                                 mp = socopyoutuio(mp, uiop,
 814                                     oobmark == 0 ? INFPSZ : oobmark, &error);
 815                                 if (error != 0) {
 816                                         freemsg(*mctlp);
 817                                         *mctlp = NULL;
 818                                         more = 0;
 819                                         goto done;
 820                                 }
 821                                 ASSERT(oldresid >= uiop->uio_resid);
 822                                 copied = oldresid - uiop->uio_resid;
 823                                 if (oldresid > uiop->uio_resid)
 824                                         partial_read = B_TRUE;
 825                         }
 826                         ASSERT(copied >= 0);
 827                         if (copied > 0 && !(flags & MSG_PEEK)) {
 828                                 mutex_enter(&so->so_lock);
 829                                 so->so_rcv_queued -= copied;
 830                                 ASSERT(so->so_oobmark >= 0);
 831                                 if (so->so_oobmark > 0) {
 832                                         so->so_oobmark -= copied;
 833                                         ASSERT(so->so_oobmark >= 0);
 834                                         if (so->so_oobmark == 0) {
 835                                                 ASSERT(so->so_state &
 836                                                     SS_OOBPEND);
 837                                                 so->so_oobmark = 0;
 838                                                 so->so_state |= SS_RCVATMARK;
 839                                         }
 840                                 }
 841                                 /*
 842                                  * so_check_flow_control() will drop
 843                                  * so->so_lock.
 844                                  */
 845                                 rvalp->r_val2 = so_check_flow_control(so);
 846                         }
 847                 }
 848                 if (mp != NULL) { /* more data blocks in msg */
 849                         more |= MOREDATA;
 850 
 851                         /*
 852                          * If requested, tally up remaining data along with the
 853                          * amount already copied.
 854                          */
 855                         if (xuio != NULL &&
 856                             xuio->xu_type == UIOTYPE_PEEKSIZE) {
 857                                 xuio->xu_ext.xu_ps.xu_ps_set = B_TRUE;
 858                                 xuio->xu_ext.xu_ps.xu_ps_size =
 859                                     copied + msgdsize(mp);
 860                         }
 861 
 862                         if ((flags & (MSG_PEEK|MSG_TRUNC))) {
 863                                 if (flags & MSG_PEEK) {
 864                                         freemsg(mp);
 865                                 } else {
 866                                         unsigned int msize = msgdsize(mp);
 867 
 868                                         freemsg(mp);
 869                                         mutex_enter(&so->so_lock);
 870                                         so->so_rcv_queued -= msize;
 871                                         /*
 872                                          * so_check_flow_control() will drop
 873                                          * so->so_lock.
 874                                          */
 875                                         rvalp->r_val2 =
 876                                             so_check_flow_control(so);
 877                                 }
 878                         } else if (partial_read && !somsghasdata(mp)) {
 879                                 /*
 880                                  * Avoid queuing a zero-length tail part of
 881                                  * a message. partial_read == 1 indicates that
 882                                  * we read some of the message.
 883                                  */
 884                                 freemsg(mp);
 885                                 more &= ~MOREDATA;
 886                         } else {
 887                                 if (savemp != NULL &&
 888                                     (flags & MSG_DUPCTRL)) {
 889                                         mblk_t *nmp;
 890                                         /*
 891                                          * There should only be non data mblks
 892                                          */
 893                                         ASSERT(DB_TYPE(savemp) != M_DATA &&
 894                                             DB_TYPE(savemptail) != M_DATA);
 895 try_again:
 896                                         if ((nmp = dupmsg(savemp)) == NULL &&
 897                                             (nmp = copymsg(savemp)) == NULL) {
 898 
 899                                                 size_t size = msgsize(savemp);
 900 
 901                                                 error = strwaitbuf(size,
 902                                                     BPRI_HI);
 903                                                 if (error != 0) {
 904                                                         /*
 905                                                          * In case we
 906                                                          * cannot copy
 907                                                          * control data
 908                                                          * free the remaining
 909                                                          * data.
 910                                                          */
 911                                                         freemsg(mp);
 912                                                         goto done;
 913                                                 }
 914                                                 goto try_again;
 915                                         }
 916 
 917                                         ASSERT(nmp != NULL);
 918                                         ASSERT(DB_TYPE(nmp) != M_DATA);
 919                                         savemptail->b_cont = mp;
 920                                         *mctlp = nmp;
 921                                         mp = savemp;
 922                                 }
 923                                 /*
 924                                  * putback mp
 925                                  */
 926                                 so_prepend_msg(so, mp, last_tail);
 927                         }
 928                 }
 929 
 930                 /* fast check so_rcv_head if there is more data */
 931                 if (partial_read && !(so->so_state & SS_RCVATMARK) &&
 932                     *mctlp == NULL && uiop->uio_resid > 0 &&
 933                     !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
 934                         goto again;
 935                 }
 936         } else if (!partial_read) {
 937                 mutex_enter(&so->so_lock);
 938                 if (so->so_error != 0) {
 939                         error = sogeterr(so, !(flags & MSG_PEEK));
 940                         mutex_exit(&so->so_lock);
 941                         return (error);
 942                 }
 943                 /*
 944                  * No pending data. Return right away for nonblocking
 945                  * socket, otherwise sleep waiting for data.
 946                  */
 947                 if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
 948                         if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
 949                             (flags & MSG_DONTWAIT)) {
 950                                 error = EWOULDBLOCK;
 951                         } else {
 952                                 if (so->so_state & (SS_CLOSING |
 953                                     SS_FALLBACK_PENDING)) {
 954                                         mutex_exit(&so->so_lock);
 955                                         error = EINTR;
 956                                         goto done;
 957                                 }
 958 
 959                                 if (so->so_rcv_head != NULL) {
 960                                         goto again1;
 961                                 }
 962                                 so->so_rcv_wakeup = B_TRUE;
 963                                 so->so_rcv_wanted = uiop->uio_resid;
 964                                 if (so->so_rcvtimeo == 0) {
 965                                         /*
 966                                          * Zero means disable timeout.
 967                                          */
 968                                         error = cv_wait_sig(&so->so_rcv_cv,
 969                                             &so->so_lock);
 970                                 } else {
 971                                         error = cv_reltimedwait_sig(
 972                                             &so->so_rcv_cv, &so->so_lock,
 973                                             so->so_rcvtimeo, TR_CLOCK_TICK);
 974                                 }
 975                                 so->so_rcv_wakeup = B_FALSE;
 976                                 so->so_rcv_wanted = 0;
 977 
 978                                 if (error == 0) {
 979                                         error = EINTR;
 980                                 } else if (error == -1) {
 981                                         error = EAGAIN;
 982                                 } else {
 983                                         goto again1;
 984                                 }
 985                         }
 986                 }
 987                 mutex_exit(&so->so_lock);
 988         }
 989         if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
 990                 /*
 991                  * We are past the mark, update state
 992                  * 4.3BSD and 4.4BSD clears the mark when peeking across it.
 993                  * The draft Posix socket spec states that the mark should
 994                  * not be cleared when peeking. We follow the latter.
 995                  */
 996                 mutex_enter(&so->so_lock);
 997                 ASSERT(so_verify_oobstate(so));
 998                 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
 999                 freemsg(so->so_oobmsg);
1000                 so->so_oobmsg = NULL;
1001                 ASSERT(so_verify_oobstate(so));
1002                 mutex_exit(&so->so_lock);
1003         }
1004         ASSERT(so->so_rcv_wakeup == B_FALSE);
1005 done:
1006         if (sodp != NULL) {
1007                 mutex_enter(&so->so_lock);
1008                 if (sodp->sod_enabled &&
1009                     (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
1010                         SOD_UIOAFINI(sodp);
1011                         if (sodp->sod_uioa.uioa_mbytes > 0) {
1012                                 ASSERT(so->so_rcv_q_head != NULL ||
1013                                     so->so_rcv_head != NULL);
1014                                 so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
1015                                 if (error == EWOULDBLOCK)
1016                                         error = 0;
1017                         }
1018                 }
1019                 mutex_exit(&so->so_lock);
1020         }
1021 #ifdef DEBUG
1022         if (so_debug_length) {
1023                 mutex_enter(&so->so_lock);
1024                 ASSERT(so_check_length(so));
1025                 mutex_exit(&so->so_lock);
1026         }
1027 #endif
1028         rvalp->r_val1 = more;
1029         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1030         return (error);
1031 }
1032 
1033 /*
1034  * Append mp to the socket's receive queue and add msg_size to
1035  * so_rcv_queued. The caller must hold so_lock.
1036  *
1037  * An M_DATA mp is chained via b_cont onto the last queued message when
1038  * that message is also M_DATA and both have the same DBLK_UIOA setting
1039  * (i.e. I/OAT processing state); otherwise mp is linked in via b_next.
1040  * Either way b_prev of the last head points at its final b_cont mblk.
1041  */
1042 void
1043 so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
1044 {
1045         mblk_t *last_head, *tail;
1046
1047         ASSERT(MUTEX_HELD(&so->so_lock));
1048
1049 #ifdef DEBUG
1050         if (so_debug_length)
1051                 ASSERT(so_check_length(so));
1052 #endif
1053         so->so_rcv_queued += msg_size;
1054         last_head = so->so_rcv_last_head;
1055
1056         if (so->so_rcv_head == NULL) {
1057                 ASSERT(last_head == NULL);
1058                 so->so_rcv_head = mp;
1059                 so->so_rcv_last_head = mp;
1060         } else if (DB_TYPE(mp) == M_DATA && DB_TYPE(last_head) == M_DATA &&
1061             (DB_FLAGS(mp) & DBLK_UIOA) == (DB_FLAGS(last_head) & DBLK_UIOA)) {
1062                 /* Compatible data: extend the last head's b_cont chain */
1063                 ASSERT(last_head != NULL);
1064                 ASSERT(last_head->b_prev != NULL);
1065                 last_head->b_prev->b_cont = mp;
1066         } else {
1067                 /* Anything else starts a new message at the end */
1068                 last_head->b_next = mp;
1069                 so->so_rcv_last_head = mp;
1070         }
1071
1072         /* Remember the tail of the chain in the last head's b_prev */
1073         for (tail = mp; tail->b_cont != NULL; tail = tail->b_cont)
1074                 ;
1075         so->so_rcv_last_head->b_prev = tail;
1076 #ifdef DEBUG
1077         if (so_debug_length)
1078                 ASSERT(so_check_length(so));
1079 #endif
1080 }
1081 
1082 /* Return B_TRUE if any M_DATA mblk in mp's b_cont chain is non-empty. */
1083 boolean_t
1084 somsghasdata(mblk_t *mp)
1085 {
1086         for (; mp != NULL; mp = mp->b_cont) {
1087                 /* Non-data mblks never count as data */
1088                 if (DB_TYPE(mp) != M_DATA)
1089                         continue;
1090                 ASSERT(mp->b_wptr >= mp->b_rptr);
1091                 if (mp->b_wptr > mp->b_rptr)
1092                         return (B_TRUE);
1093         }
1094         return (B_FALSE);
1095 }
1096 
1097 /*
1098  * Discard everything buffered on the read side of the socket: the
1099  * pending OOB message (together with the mark and the OOB state bits)
1100  * and every message on both receive queues.
1101  *
1102  * The caller holds so_lock and guarantees that no reader is active.
1103  */
1104 void
1105 so_rcv_flush(struct sonode *so)
1106 {
1107         mblk_t  *msg;
1108
1109         ASSERT(MUTEX_HELD(&so->so_lock));
1110
1111         /* OOB state is reset only if an OOB message is actually held */
1112         if (so->so_oobmsg != NULL) {
1113                 freemsg(so->so_oobmsg);
1114                 so->so_oobmsg = NULL;
1115                 so->so_oobmark = 0;
1116                 so->so_state &=
1117                     ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
1118         }
1119
1120         /* Unlink and free each message on so_rcv_q_head, then so_rcv_head */
1121         while ((msg = so->so_rcv_q_head) != NULL) {
1122                 so->so_rcv_q_head = msg->b_next;
1123                 msg->b_next = NULL;
1124                 msg->b_prev = NULL;
1125                 freemsg(msg);
1126         }
1127         while ((msg = so->so_rcv_head) != NULL) {
1128                 so->so_rcv_head = msg->b_next;
1129                 msg->b_next = NULL;
1130                 msg->b_prev = NULL;
1131                 freemsg(msg);
1132         }
1133
1134         /* Both lists are now empty; clear the tail pointers and the count */
1135         so->so_rcv_q_head = so->so_rcv_head = NULL;
1136         so->so_rcv_q_last_head = so->so_rcv_last_head = NULL;
1137         so->so_rcv_queued = 0;
1138 }
1139 
1140 /*
1141  * Service recv* calls made with MSG_OOB, optionally combined with MSG_PEEK.
1142  *
1143  * Fails with EINVAL when OOB data is inline, not pending or already
1144  * consumed, and with EWOULDBLOCK when it has not arrived yet. Otherwise
1145  * the OOB message is copied to uiop and the uiomove() result returned.
1146  */
1147 int
1148 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
1149     boolean_t oob_inline)
1150 {
1151         mblk_t          *oobmp, *cur;
1152         int             error = 0;
1153
1154         dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
1155             flags));
1156
1157         /*
1158          * There is never any oob data with addresses or control since
1159          * the T_EXDATA_IND does not carry any options.
1160          */
1161         if (msg != NULL) {
1162                 msg->msg_namelen = 0;
1163                 msg->msg_controllen = 0;
1164                 msg->msg_flags = 0;
1165         }
1166
1167         mutex_enter(&so->so_lock);
1168         ASSERT(so_verify_oobstate(so));
1169         if (oob_inline ||
1170             (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
1171                 dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
1172                 mutex_exit(&so->so_lock);
1173                 return (EINVAL);
1174         }
1175         if ((so->so_state & SS_HAVEOOBDATA) == 0) {
1176                 dprintso(so, 1, ("sorecvoob: no data yet\n"));
1177                 mutex_exit(&so->so_lock);
1178                 return (EWOULDBLOCK);
1179         }
1180         ASSERT(so->so_oobmsg != NULL);
1181         oobmp = so->so_oobmsg;
1182         if ((flags & MSG_PEEK) == 0) {
1183                 /*
1184                  * Update the state indicating that the data has been consumed.
1185                  * Keep SS_OOBPEND set until data is consumed past the mark.
1186                  */
1187                 so->so_oobmsg = NULL;
1188                 so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
1189         } else {
1190                 /*
1191                  * Peeking must leave so_oobmsg in place, so hand out a copy.
1192                  * Since recv* can not return ENOBUFS we can not use dupmsg;
1193                  * use the consolidation private allocb_wait plus bcopy.
1194                  */
1195                 mblk_t *copy;
1196
1197                 copy = allocb_wait(msgdsize(oobmp), BPRI_MED, STR_NOSIG, NULL);
1198                 ASSERT(copy != NULL);
1199                 for (cur = oobmp; cur != NULL; cur = cur->b_cont) {
1200                         ssize_t len = MBLKL(cur);
1201
1202                         bcopy(cur->b_rptr, copy->b_wptr, len);
1203                         copy->b_wptr += len;
1204                         ASSERT(copy->b_wptr <= copy->b_datap->db_lim);
1205                 }
1206                 oobmp = copy;
1207         }
1208         ASSERT(so_verify_oobstate(so));
1209         mutex_exit(&so->so_lock);
1210
1211         /* Copy out as much as fits; oobmp is ours to free either way */
1212         for (cur = oobmp; cur != NULL && uiop->uio_resid > 0;
1213             cur = cur->b_cont) {
1214                 ssize_t n = MBLKL(cur);
1215
1216                 if (n > uiop->uio_resid)
1217                         n = uiop->uio_resid;
1218                 if (n > 0)
1219                         error = uiomove(cur->b_rptr, n, UIO_READ, uiop);
1220                 if (error != 0)
1221                         break;
1222         }
1223         ASSERT(oobmp->b_next == NULL && oobmp->b_prev == NULL);
1224         freemsg(oobmp);
1225         return (error);
1226 }
1227 
1228 /*
1229  * Allocate and initialize a sonode from socket_cache for a protocol whose
1230  * upcall/downcall versions match ours. On failure NULL is returned and
1231  * *errorp is set to EINVAL (version mismatch) or ENOMEM (no memory).
1232  */
1233 /* ARGSUSED */
1234 struct sonode *
1235 socket_sonode_create(struct sockparams *sp, int family, int type,
1236     int protocol, int version, int sflags, int *errorp, struct cred *cr)
1237 {
1238         sonode_t *so;
1239         int     kmflags;
1240
1241         /*
1242          * Only protocols whose upcall and downcall versions match this
1243          * socket module are supported; anything else is rejected.
1244          */
1245         if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
1246             SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
1247 #ifdef DEBUG
1248                 /* CE_CONT does not append a newline, so supply our own */
1249                 cmn_err(CE_CONT,
1250                     "protocol and socket module version mismatch\n");
1251 #endif
1252                 *errorp = EINVAL;
1253                 return (NULL);
1254         }
1255
1256         kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1257         so = kmem_cache_alloc(socket_cache, kmflags);
1258         if (so == NULL) {
1259                 *errorp = ENOMEM;
1260                 return (NULL);
1261         }
1262
1263         sonode_init(so, sp, family, type, protocol, &so_sonodeops);
1264
1265         if (version == SOV_DEFAULT)
1266                 version = so_default_version;
1267
1268         so->so_version = (short)version;
1269
1270         /*
1271          * Default protocol properties: the SOCKET_RECV{HI,LO}WATER marks and
1272          * INFPSZ for maxpsz/maxblk. A protocol may change the values later.
1273          */
1274         so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
1275         so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
1276         so->so_proto_props.sopp_maxpsz = INFPSZ;
1277         so->so_proto_props.sopp_maxblk = INFPSZ;
1278
1279         return (so);
1280 }
1281 
1282 /*
1283  * Common sonode initialization for both passive and active opens.
1284  *
1285  * With a parent (pso != NULL) the new socket inherits state, options,
1286  * timeouts, protocol properties and filters from the listener. Without
1287  * one, automatic filters are attached and the protocol's lower handle
1288  * is created and activated. Returns 0 after taking an extra vnode hold
1289  * for the protocol, or an errno on failure.
1290  */
1291 int
1292 socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
1293 {
1294         int error = 0;
1295
1296         if (pso != NULL) {
1297                 /*
1298                  * Passive open: copy the basic state from the parent
1299                  * (listener) while holding its lock. The new sonode needs
1300                  * no locking since nobody else can reference it yet.
1301                  */
1302                 mutex_enter(&pso->so_lock);
1303
1304                 so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
1305                 so->so_pgrp = pso->so_pgrp;
1306                 so->so_rcvtimeo = pso->so_rcvtimeo;
1307                 so->so_sndtimeo = pso->so_sndtimeo;
1308                 so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
1309                 so->so_proto_props = pso->so_proto_props;
1310                 so->so_mode = pso->so_mode;
1311                 so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;
1312                 /*
1313                  * Only socket level options are copied here; TCP and IP
1314                  * level options are already inherited. Doing it before
1315                  * accept succeeds is simpler and harmless on error.
1316                  */
1317                 so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
1318                     SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1319                     SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1320
1321                 mutex_exit(&pso->so_lock);
1322
1323                 /* If the parent has any filters, try to inherit them */
1324                 if (pso->so_filter_active > 0) {
1325                         error = sof_sonode_inherit_filters(so, pso);
1326                         if (error != 0)
1327                                 return (error);
1328                 }
1329         } else {
1330                 struct sockparams *params = so->so_sockparams;
1331                 sock_upcalls_t *upcalls;
1332
1333                 /* Attach automatic filters, if there are any */
1334                 if (!list_is_empty(&params->sp_auto_filters)) {
1335                         error = sof_sonode_autoattach_filters(so, cr);
1336                         if (error != 0)
1337                                 return (error);
1338                 }
1339
1340                 /* OK to attach filters */
1341                 so->so_state |= SS_FILOP_OK;
1342
1343                 /*
1344                  * Only one upcall version exists today, so the default
1345                  * set is always the one passed down.
1346                  */
1347                 upcalls = &so_upcalls;
1348
1349                 /* Active open, so create a lower handle */
1350                 so->so_proto_handle =
1351                     params->sp_smod_info->smod_proto_create_func(so->so_family,
1352                     so->so_type, so->so_protocol, &so->so_downcalls,
1353                     &so->so_mode, &error, flags, cr);
1354                 if (so->so_proto_handle == NULL) {
1355                         ASSERT(error != 0);
1356                         /*
1357                          * To be safe, assume a lack of memory if the
1358                          * protocol gave no reason for the failure.
1359                          */
1360                         if (error == 0)
1361                                 error = ENOMEM;
1362                         return (error);
1363                 }
1364                 ASSERT(so->so_downcalls != NULL);
1365                 ASSERT(so->so_downcalls->sd_send != NULL ||
1366                     so->so_downcalls->sd_send_uio != NULL);
1367                 if (so->so_downcalls->sd_recv_uio != NULL) {
1368                         ASSERT(so->so_downcalls->sd_poll != NULL);
1369                         so->so_pollev |= SO_POLLEV_ALWAYS;
1370                 }
1371
1372                 (*so->so_downcalls->sd_activate)(so->so_proto_handle,
1373                     (sock_upper_handle_t)so, upcalls, 0, cr);
1374
1375                 /*
1376                  * Wildcard: the requested protocol differs from the one in
1377                  * the sockparams, so issue a SO_PROTOTYPE setsockopt.
1378                  * FIXME No need for this, the protocol can deal with it in
1379                  * sd_create(). Should update ICMP.
1380                  */
1381                 if (so->so_protocol != so->so_sockparams->sp_protocol) {
1382                         int proto = so->so_protocol;
1383
1384                         error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
1385                             &proto, (t_uscalar_t)sizeof (proto), cr);
1386                         if (error != 0) {
1387                                 (void) (*so->so_downcalls->sd_close)
1388                                     (so->so_proto_handle, 0, cr);
1389                                 mutex_enter(&so->so_lock);
1390                                 so_rcv_flush(so);
1391                                 mutex_exit(&so->so_lock);
1392                                 /*
1393                                  * Setsockopt often fails with ENOPROTOOPT but
1394                                  * socket() should fail with
1395                                  * EPROTONOSUPPORT/EPROTOTYPE.
1396                                  */
1397                                 return (EPROTONOSUPPORT);
1398                         }
1399                 }
1400         }
1401
1402         if (uioasync.enabled)
1403                 sod_sock_init(so);
1404
1405         /* put an extra reference on the socket for the protocol */
1406         VN_HOLD(SOTOV(so));
1407
1408         return (0);
1409 }
1410 
1411 /*
1412  * Handle the ioctls that manipulate basic socket state: non-blocking
1413  * and async mode, process group, OOB mark, queued byte count and peer
1414  * credentials.
1415  *
1416  * Returns:
1417  *   < 0  - ioctl was not handled
1418  *  >= 0  - ioctl was handled, if > 0, then it is an errno
1419  *
1420  * Notes:
1421  *   Assumes the standard receive buffer is used to obtain info for
1422  *   NREAD. SIOCATMARK and FIONREAD are reported as not handled when
1423  *   the protocol maintains its own buffer (sd_recv_uio is set), so
1424  *   that the request can be passed down.
1425  */
1426 /* ARGSUSED */
1427 int
1428 socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1429     struct cred *cr, int32_t *rvalp)
1430 {
1431         /* The FKIOCTL bit of mode is what so_copyin()/so_copyout() take */
1432         int kioctl = mode & (int)FKIOCTL;
1433
1434         switch (cmd) {
1435         case SIOCSQPTR:
1436                 /*
1437                  * SIOCSQPTR is valid only when helper stream is created
1438                  * by the protocol.
1439                  */
1440                 return (EOPNOTSUPP);
1441
1442         case FIONBIO: {
1443                 int32_t on;
1444
1445                 if (so_copyin((void *)arg, &on, sizeof (on), kioctl))
1446                         return (EFAULT);
1447
1448                 /* Set or clear non-blocking mode */
1449                 mutex_enter(&so->so_lock);
1450                 if (on != 0)
1451                         so->so_state |= SS_NDELAY;
1452                 else
1453                         so->so_state &= ~SS_NDELAY;
1454                 mutex_exit(&so->so_lock);
1455                 return (0);
1456         }
1457
1458         case FIOASYNC: {
1459                 int32_t on;
1460
1461                 if (so_copyin((void *)arg, &on, sizeof (on), kioctl))
1462                         return (EFAULT);
1463
1464                 /* Turn SIGIO on or off */
1465                 mutex_enter(&so->so_lock);
1466                 if (on != 0)
1467                         so->so_state |= SS_ASYNC;
1468                 else
1469                         so->so_state &= ~SS_ASYNC;
1470                 mutex_exit(&so->so_lock);
1471                 return (0);
1472         }
1473
1474         case SIOCSPGRP:
1475         case FIOSETOWN: {
1476                 pid_t pid;
1477                 int error = 0;
1478
1479                 if (so_copyin((void *)arg, &pid, sizeof (pid), kioctl))
1480                         return (EFAULT);
1481
1482                 /* Nothing to do unless the process group changes */
1483                 mutex_enter(&so->so_lock);
1484                 if (pid != so->so_pgrp)
1485                         error = socket_chgpgrp(so, pid);
1486                 mutex_exit(&so->so_lock);
1487                 return (error);
1488         }
1489
1490         case SIOCGPGRP:
1491         case FIOGETOWN:
1492                 if (so_copyout(&so->so_pgrp, (void *)arg,
1493                     sizeof (pid_t), kioctl))
1494                         return (EFAULT);
1495                 return (0);
1496
1497         case SIOCATMARK: {
1498                 int atmark;
1499
1500                 /*
1501                  * Only protocols that support urgent data can handle ATMARK.
1502                  */
1503                 if ((so->so_mode & SM_EXDATA) == 0)
1504                         return (EINVAL);
1505
1506                 /*
1507                  * If the protocol is maintaining its own buffer, then the
1508                  * request must be passed down.
1509                  */
1510                 if (so->so_downcalls->sd_recv_uio != NULL)
1511                         return (-1);
1512
1513                 atmark = ((so->so_state & SS_RCVATMARK) != 0);
1514                 if (so_copyout(&atmark, (void *)arg, sizeof (int), kioctl))
1515                         return (EFAULT);
1516                 return (0);
1517         }
1518
1519         case FIONREAD: {
1520                 int nqueued;
1521
1522                 /*
1523                  * If the protocol is maintaining its own buffer, then the
1524                  * request must be passed down.
1525                  */
1526                 if (so->so_downcalls->sd_recv_uio != NULL)
1527                         return (-1);
1528
1529                 /* The int-sized result is capped at INT_MAX */
1530                 nqueued = MIN(so->so_rcv_queued, INT_MAX);
1531                 if (so_copyout(&nqueued, (void *)arg, sizeof (nqueued),
1532                     kioctl))
1533                         return (EFAULT);
1534                 return (0);
1535         }
1536
1537         case _I_GETPEERCRED: {
1538                 k_peercred_t *kp = (k_peercred_t *)arg;
1539                 int error = 0;
1540
1541                 /*
1542                  * arg is dereferenced directly below, so FKIOCTL must be set.
1543                  */
1544                 if ((mode & FKIOCTL) == 0)
1545                         return (EINVAL);
1546
1547                 mutex_enter(&so->so_lock);
1548                 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
1549                         error = ENOTSUP;
1550                 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
1551                         error = ENOTCONN;
1552                 } else if (so->so_peercred == NULL) {
1553                         error = EINVAL;
1554                 } else {
1555                         /* Hand out the peer cred with a hold of its own */
1556                         kp->pc_cr = so->so_peercred;
1557                         kp->pc_cpid = so->so_cpid;
1558                         crhold(so->so_peercred);
1559                 }
1560                 mutex_exit(&so->so_lock);
1561                 return (error);
1562         }
1563         default:
1564                 return (-1);
1565         }
1566 }
1567 
1568 /*
1569  * Handle the I_NREAD STREAM ioctl: copy out the data size of the first
1570  * queued message (capped at INT_MAX) and return the number of queued
1571  * messages through rvalp. Both are zero if a reader is waiting for data.
1572  *
1573  * Returns EINVAL if the socket has no downcalls or the protocol has
1574  * sd_recv_uio set, EFAULT if the copyout fails, and 0 otherwise.
1575  */
1576 static int
1577 so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
1578 {
1579         mblk_t *mp;
1580         size_t nbytes = 0;
1581         int nmsgs = 0;
1582         int retval;
1583         clock_t poll_ticks = drv_usectohz(10);
1584
1585         if (so->so_downcalls == NULL || so->so_downcalls->sd_recv_uio != NULL)
1586                 return (EINVAL);
1587
1588         mutex_enter(&so->so_lock);
1589         /* Wait for reader to get out of the way. */
1590         while (so->so_flag & SOREADLOCKED) {
1591                 /*
1592                  * If reader is waiting for data, then there should be nothing
1593                  * on the rcv queue.
1594                  */
1595                 if (so->so_rcv_wakeup)
1596                         goto out;
1597
1598                 /* Do a timed sleep, in case the reader goes to sleep. */
1599                 (void) cv_reltimedwait(&so->so_read_cv, &so->so_lock,
1600                     poll_ticks, TR_CLOCK_TICK);
1601         }
1602
1603         /*
1604          * Since we are holding so_lock no new reader will come in, and the
1605          * protocol will not be able to enqueue data. So it's safe to walk
1606          * both rcv queues.
1607          *
1608          * The size reported is that of the first message on the processing
1609          * list or, if that list is empty, of the next message in line.
1610          */
1611         if (so->so_rcv_q_head != NULL)
1612                 nbytes = msgdsize(so->so_rcv_q_head);
1613         else
1614                 nbytes = msgdsize(so->so_rcv_head);
1615
1616         for (mp = so->so_rcv_q_head; mp != NULL; mp = mp->b_next)
1617                 nmsgs++;
1618         for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
1619                 nmsgs++;
1620 out:
1621         mutex_exit(&so->so_lock);
1622
1623         /*
1624          * Drop down from size_t to the "int" required by the
1625          * interface.  Cap at INT_MAX.
1626          */
1627         retval = MIN(nbytes, INT_MAX);
1628         if (so_copyout(&retval, (void *)arg, sizeof (retval),
1629             (mode & (int)FKIOCTL)) != 0)
1630                 return (EFAULT);
1631
1632         *rvalp = nmsgs;
1633         return (0);
1634 }
1635 
1636 /*
1637  * Process STREAM ioctls.
1638  *
1639  * Returns:
1640  *   < 0  - ioctl was not handle
1641  *  >= 0  - ioctl was handled, if > 0, then it is an errno
1642  */
1643 int
1644 socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1645     struct cred *cr, int32_t *rvalp)
1646 {
1647         int retval;
1648 
1649         /* Only STREAM iotcls are handled here */
1650         if ((cmd & 0xffffff00U) != STR)
1651                 return (-1);
1652 
1653         switch (cmd) {
1654         case I_CANPUT:
1655                 /*
1656                  * We return an error for I_CANPUT so that isastream(3C) will
1657                  * not report the socket as being a STREAM.
1658                  */
1659                 return (EOPNOTSUPP);
1660         case I_NREAD:
1661                 /* Avoid doing a fallback for I_NREAD. */
1662                 return (so_strioc_nread(so, arg, mode, rvalp));
1663         case I_LOOK:
1664                 /* Avoid doing a fallback for I_LOOK. */
1665                 if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
1666                     (mode & (int)FKIOCTL))) {
1667                         return (EFAULT);
1668                 }
1669                 return (0);
1670         default:
1671                 break;
1672         }
1673 
1674         /*
1675          * Try to fall back to TPI, and if successful, reissue the ioctl.
1676          */
1677         if ((retval = so_tpi_fallback(so, cr)) == 0) {
1678                 /* Reissue the ioctl */
1679                 ASSERT(so->so_rcv_q_head == NULL);
1680                 return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
1681         } else {
1682                 return (retval);
1683         }
1684 }
1685 
1686 /*
1687  * This is called for all socket types to verify that the buffer size is large
1688  * enough for the option, and if we can, handle the request as well. Most
1689  * options will be forwarded to the protocol.
1690  */
1691 int
1692 socket_getopt_common(struct sonode *so, int level, int option_name,
1693     void *optval, socklen_t *optlenp, int flags)
1694 {
1695         if (level != SOL_SOCKET)
1696                 return (-1);
1697 
1698         switch (option_name) {
1699         case SO_ERROR:
1700         case SO_DOMAIN:
1701         case SO_TYPE:
1702         case SO_ACCEPTCONN: {
1703                 int32_t value;
1704                 socklen_t optlen = *optlenp;
1705 
1706                 if (optlen < (t_uscalar_t)sizeof (int32_t)) {
1707                         return (EINVAL);
1708                 }
1709 
1710                 switch (option_name) {
1711                 case SO_ERROR:
1712                         mutex_enter(&so->so_lock);
1713                         value = sogeterr(so, B_TRUE);
1714                         mutex_exit(&so->so_lock);
1715                         break;
1716                 case SO_DOMAIN:
1717                         value = so->so_family;
1718                         break;
1719                 case SO_TYPE:
1720                         value = so->so_type;
1721                         break;
1722                 case SO_ACCEPTCONN:
1723                         if (so->so_state & SS_ACCEPTCONN)
1724                                 value = SO_ACCEPTCONN;
1725                         else
1726                                 value = 0;
1727                         break;
1728                 }
1729 
1730                 bcopy(&value, optval, sizeof (value));
1731                 *optlenp = sizeof (value);
1732 
1733                 return (0);
1734         }
1735         case SO_SNDTIMEO:
1736         case SO_RCVTIMEO: {
1737                 clock_t value;
1738                 socklen_t optlen = *optlenp;
1739 
1740                 if (get_udatamodel() == DATAMODEL_NONE ||
1741                     get_udatamodel() == DATAMODEL_NATIVE) {
1742                         if (optlen < sizeof (struct timeval))
1743                                 return (EINVAL);
1744                 } else {
1745                         if (optlen < sizeof (struct timeval32))
1746                                 return (EINVAL);
1747                 }
1748                 if (option_name == SO_RCVTIMEO)
1749                         value = drv_hztousec(so->so_rcvtimeo);
1750                 else
1751                         value = drv_hztousec(so->so_sndtimeo);
1752 
1753                 if (get_udatamodel() == DATAMODEL_NONE ||
1754                     get_udatamodel() == DATAMODEL_NATIVE) {
1755                         ((struct timeval *)(optval))->tv_sec =
1756                             value / (1000 * 1000);
1757                         ((struct timeval *)(optval))->tv_usec =
1758                             value % (1000 * 1000);
1759                         *optlenp = sizeof (struct timeval);
1760                 } else {
1761                         ((struct timeval32 *)(optval))->tv_sec =
1762                             value / (1000 * 1000);
1763                         ((struct timeval32 *)(optval))->tv_usec =
1764                             value % (1000 * 1000);
1765                         *optlenp = sizeof (struct timeval32);
1766                 }
1767                 return (0);
1768         }
1769         case SO_DEBUG:
1770         case SO_REUSEADDR:
1771         case SO_KEEPALIVE:
1772         case SO_DONTROUTE:
1773         case SO_BROADCAST:
1774         case SO_USELOOPBACK:
1775         case SO_OOBINLINE:
1776         case SO_SNDBUF:
1777 #ifdef notyet
1778         case SO_SNDLOWAT:
1779         case SO_RCVLOWAT:
1780 #endif /* notyet */
1781         case SO_DGRAM_ERRIND: {
1782                 socklen_t optlen = *optlenp;
1783 
1784                 if (optlen < (t_uscalar_t)sizeof (int32_t))
1785                         return (EINVAL);
1786                 break;
1787         }
1788         case SO_RCVBUF: {
1789                 socklen_t optlen = *optlenp;
1790 
1791                 if (optlen < (t_uscalar_t)sizeof (int32_t))
1792                         return (EINVAL);
1793 
1794                 if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
1795                         /*
1796                          * XXX If SO_RCVBUF has been set and this is an
1797                          * XPG 4.2 application then do not ask the transport
1798                          * since the transport might adjust the value and not
1799                          * return exactly what was set by the application.
1800                          * For non-XPG 4.2 application we return the value
1801                          * that the transport is actually using.
1802                          */
1803                         *(int32_t *)optval = so->so_xpg_rcvbuf;
1804                         *optlenp = sizeof (so->so_xpg_rcvbuf);
1805                         return (0);
1806                 }
1807                 /*
1808                  * If the option has not been set then get a default
1809                  * value from the transport.
1810                  */
1811                 break;
1812         }
1813         case SO_LINGER: {
1814                 socklen_t optlen = *optlenp;
1815 
1816                 if (optlen < (t_uscalar_t)sizeof (struct linger))
1817                         return (EINVAL);
1818                 break;
1819         }
1820         case SO_SND_BUFINFO: {
1821                 socklen_t optlen = *optlenp;
1822 
1823                 if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
1824                         return (EINVAL);
1825                 ((struct so_snd_bufinfo *)(optval))->sbi_wroff =
1826                     (so->so_proto_props).sopp_wroff;
1827                 ((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
1828                     (so->so_proto_props).sopp_maxblk;
1829                 ((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
1830                     (so->so_proto_props).sopp_maxpsz;
1831                 ((struct so_snd_bufinfo *)(optval))->sbi_tail =
1832                     (so->so_proto_props).sopp_tail;
1833                 *optlenp = sizeof (struct so_snd_bufinfo);
1834                 return (0);
1835         }
1836         case SO_SND_COPYAVOID: {
1837                 sof_instance_t *inst;
1838 
1839                 /*
1840                  * Avoid zero-copy if there is a filter with a data_out
1841                  * callback. We could let the operation succeed, but then
1842                  * the filter would have to copy the data anyway.
1843                  */
1844                 for (inst = so->so_filter_top; inst != NULL;
1845                     inst = inst->sofi_next) {
1846                         if (SOF_INTERESTED(inst, data_out))
1847                                 return (EOPNOTSUPP);
1848                 }
1849                 break;
1850         }
1851 
1852         default:
1853                 break;
1854         }
1855 
1856         /* Unknown Option */
1857         return (-1);
1858 }
1859 
/*
 * Tear down a sonode: run sonode_fini() on it and then return it to
 * socket_cache.
 */
void
socket_sonode_destroy(struct sonode *so)
{
        sonode_fini(so);
        kmem_cache_free(socket_cache, so);
}
1866 
1867 int
1868 so_zcopy_wait(struct sonode *so)
1869 {
1870         int error = 0;
1871 
1872         mutex_enter(&so->so_lock);
1873         while (!(so->so_copyflag & STZCNOTIFY)) {
1874                 if (so->so_state & SS_CLOSING) {
1875                         mutex_exit(&so->so_lock);
1876                         return (EINTR);
1877                 }
1878                 if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
1879                         error = EINTR;
1880                         break;
1881                 }
1882         }
1883         so->so_copyflag &= ~STZCNOTIFY;
1884         mutex_exit(&so->so_lock);
1885         return (error);
1886 }
1887 
1888 void
1889 so_timer_callback(void *arg)
1890 {
1891         struct sonode *so = (struct sonode *)arg;
1892 
1893         mutex_enter(&so->so_lock);
1894 
1895         so->so_rcv_timer_tid = 0;
1896         if (so->so_rcv_queued > 0) {
1897                 so_notify_data(so, so->so_rcv_queued);
1898         } else {
1899                 mutex_exit(&so->so_lock);
1900         }
1901 }
1902 
#ifdef DEBUG
/*
 * Consistency check: sum the data sizes of every message on both receive
 * lists and compare the total against so_rcv_queued. The caller must hold
 * so_lock. Returns B_TRUE when the two agree.
 */
static boolean_t
so_check_length(sonode_t *so)
{
        mblk_t *msg;
        int total = 0;

        ASSERT(MUTEX_HELD(&so->so_lock));

        for (msg = so->so_rcv_q_head; msg != NULL; msg = msg->b_next)
                total += msgdsize(msg);

        for (msg = so->so_rcv_head; msg != NULL; msg = msg->b_next)
                total += msgdsize(msg);

        if (total == so->so_rcv_queued)
                return (B_TRUE);
        return (B_FALSE);
}
#endif
1930 
/*
 * Return the smod_version recorded in the socket module info attached to
 * the given sockparams. Both `sp' and its sp_smod_info must be non-NULL.
 */
int
so_get_mod_version(struct sockparams *sp)
{
        ASSERT(sp != NULL && sp->sp_smod_info != NULL);
        return (sp->sp_smod_info->smod_version);
}
1937 
1938 /*
1939  * so_start_fallback()
1940  *
1941  * Block new socket operations from coming in, and wait for active operations
1942  * to complete. Threads that are sleeping will be woken up so they can get
1943  * out of the way.
1944  *
1945  * The caller must be a reader on so_fallback_rwlock.
1946  */
1947 static boolean_t
1948 so_start_fallback(struct sonode *so)
1949 {
1950         ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
1951 
1952         mutex_enter(&so->so_lock);
1953         if (so->so_state & SS_FALLBACK_PENDING) {
1954                 mutex_exit(&so->so_lock);
1955                 return (B_FALSE);
1956         }
1957         so->so_state |= SS_FALLBACK_PENDING;
1958         /*
1959          * Poke all threads that might be sleeping. Any operation that comes
1960          * in after the cv_broadcast will observe the fallback pending flag
1961          * which cause the call to return where it would normally sleep.
1962          */
1963         cv_broadcast(&so->so_state_cv);          /* threads in connect() */
1964         cv_broadcast(&so->so_rcv_cv);            /* threads in recvmsg() */
1965         cv_broadcast(&so->so_snd_cv);            /* threads in sendmsg() */
1966         mutex_enter(&so->so_acceptq_lock);
1967         cv_broadcast(&so->so_acceptq_cv);        /* threads in accept() */
1968         mutex_exit(&so->so_acceptq_lock);
1969         mutex_exit(&so->so_lock);
1970 
1971         /*
1972          * The main reason for the rw_tryupgrade call is to provide
1973          * observability during the fallback process. We want to
1974          * be able to see if there are pending operations.
1975          */
1976         if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
1977                 /*
1978                  * It is safe to drop and reaquire the fallback lock, because
1979                  * we are guaranteed that another fallback cannot take place.
1980                  */
1981                 rw_exit(&so->so_fallback_rwlock);
1982                 DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
1983                 rw_enter(&so->so_fallback_rwlock, RW_WRITER);
1984                 DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
1985         }
1986 
1987         return (B_TRUE);
1988 }
1989 
/*
 * so_end_fallback()
 *
 * Allow socket operations back in: clear the fallback pending and drain
 * flags under so_lock, then downgrade so_fallback_rwlock.
 *
 * The caller must be a writer on so_fallback_rwlock.
 */
static void
so_end_fallback(struct sonode *so)
{
        ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));

        mutex_enter(&so->so_lock);
        so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
        mutex_exit(&so->so_lock);

        /* Downgrade rather than release the lock. */
        rw_downgrade(&so->so_fallback_rwlock);
}
2008 
/*
 * so_quiesced_cb()
 *
 * Callback passed to the protocol during fallback. It is called once
 * the endpoint is quiescent.
 *
 * No requests from the user, no notifications from the protocol, so it
 * is safe to synchronize the state. Data can also be moved without
 * risk for reordering.
 *
 * We do not need to hold so_lock, since there can be only one thread
 * operating on the sonode.
 *
 * Arguments:
 *   sock_handle  - the sonode being converted
 *   arg          - pre-allocated mblks (soqa_exdata_mp, soqa_urgmark_mp)
 *                  that are consumed here when urgent data state has to
 *                  be conveyed; a consumed pointer is set to NULL
 *   tcap, laddr, laddrlen, faddr, faddrlen, opts
 *                - handed to sotpi_update_state() when tcap is not NULL
 *
 * Returns the head of a b_next-linked list of messages (the queued data
 * plus any T_EXDATA_IND, M_PCSIG and mark indicator messages built here),
 * or NULL if there is nothing to hand over.
 */
static mblk_t *
so_quiesced_cb(sock_upper_handle_t sock_handle, sock_quiesce_arg_t *arg,
    struct T_capability_ack *tcap,
    struct sockaddr *laddr, socklen_t laddrlen,
    struct sockaddr *faddr, socklen_t faddrlen, short opts)
{
        struct sonode *so = (struct sonode *)sock_handle;
        boolean_t atmark;
        /* tailmpp always points at the b_next slot where the next msg goes */
        mblk_t *retmp = NULL, **tailmpp = &retmp;

        if (tcap != NULL)
                sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen,
                    opts);

        /*
         * Some protocols do not quiesce the data path during fallback. Once
         * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
         * fail and the protocol is responsible for saving the data for later
         * delivery (i.e., once the fallback has completed).
         */
        mutex_enter(&so->so_lock);
        so->so_state |= SS_FALLBACK_DRAIN;
        SOCKET_TIMER_CANCEL(so);
        mutex_exit(&so->so_lock);

        /*
         * Append the so_rcv_head list to the end of the so_rcv_q_head list
         * so that a single list has to be drained below.
         */
        if (so->so_rcv_head != NULL) {
                if (so->so_rcv_q_last_head == NULL)
                        so->so_rcv_q_head = so->so_rcv_head;
                else
                        so->so_rcv_q_last_head->b_next = so->so_rcv_head;
                so->so_rcv_q_last_head = so->so_rcv_last_head;
        }

        /* Remember whether we start out at the mark before clearing it. */
        atmark = (so->so_state & SS_RCVATMARK) != 0;
        /*
         * Clear any OOB state having to do with pending data. The TPI
         * code path will set the appropriate oob state when we move the
         * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
         * data has already been consumed.
         */
        so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);

        ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);

        /*
         * Move data to the STREAM head.
         */
        while (so->so_rcv_q_head != NULL) {
                mblk_t *mp = so->so_rcv_q_head;
                size_t mlen = msgdsize(mp);

                /* Unlink the message from the receive list. */
                so->so_rcv_q_head = mp->b_next;
                mp->b_next = NULL;
                mp->b_prev = NULL;

                /*
                 * Send T_EXDATA_IND if we are at the oob mark.
                 */
                if (atmark) {
                        struct T_exdata_ind *tei;
                        mblk_t *mp1 = arg->soqa_exdata_mp;

                        /* Turn the pre-allocated mblk into the header. */
                        arg->soqa_exdata_mp = NULL;
                        ASSERT(mp1 != NULL);
                        mp1->b_datap->db_type = M_PROTO;
                        tei = (struct T_exdata_ind *)mp1->b_rptr;
                        tei->PRIM_type = T_EXDATA_IND;
                        tei->MORE_flag = 0;
                        mp1->b_wptr = (uchar_t *)&tei[1];

                        if (IS_SO_OOB_INLINE(so)) {
                                /* Inline: the current message is the data. */
                                mp1->b_cont = mp;
                        } else {
                                /* Not inline: so_oobmsg carries the data. */
                                ASSERT(so->so_oobmsg != NULL);
                                mp1->b_cont = so->so_oobmsg;
                                so->so_oobmsg = NULL;

                                /* process current mp next time around */
                                mp->b_next = so->so_rcv_q_head;
                                so->so_rcv_q_head = mp;
                                /* mp was not consumed; count nothing now */
                                mlen = 0;
                        }
                        mp = mp1;

                        /* we have consumed the oob mark */
                        atmark = B_FALSE;
                } else if (so->so_oobmark > 0) {
                        /*
                         * Check if the OOB mark is within the current
                         * mblk chain. In that case we have to split it up.
                         */
                        if (so->so_oobmark < mlen) {
                                mblk_t *urg_mp = mp;

                                atmark = B_TRUE;
                                mp = NULL;
                                /* Only the bytes before the mark move now. */
                                mlen = so->so_oobmark;

                                /*
                                 * It is assumed that the OOB mark does
                                 * not land within a mblk.
                                 *
                                 * NOTE(review): after this loop mp is the
                                 * last mblk before the mark, and that is
                                 * what gets linked into the return list
                                 * below. If more than one mblk precedes
                                 * the mark, the earlier ones look like
                                 * they are no longer reachable from the
                                 * list -- confirm whether that can happen.
                                 */
                                do {
                                        so->so_oobmark -= MBLKL(urg_mp);
                                        mp = urg_mp;
                                        urg_mp = urg_mp->b_cont;
                                } while (so->so_oobmark > 0);
                                mp->b_cont = NULL;
                                /* Requeue what follows the mark, if any. */
                                if (urg_mp != NULL) {
                                        urg_mp->b_next = so->so_rcv_q_head;
                                        so->so_rcv_q_head = urg_mp;
                                }
                        } else {
                                /* The whole message precedes the mark. */
                                so->so_oobmark -= mlen;
                                if (so->so_oobmark == 0)
                                        atmark = B_TRUE;
                        }
                }

                /*
                 * Queue data on the STREAM head.
                 */
                so->so_rcv_queued -= mlen;
                *tailmpp = mp;
                tailmpp = &mp->b_next;
        }
        /* Everything has been moved; reset both receive lists. */
        so->so_rcv_head = NULL;
        so->so_rcv_last_head = NULL;
        so->so_rcv_q_head = NULL;
        so->so_rcv_q_last_head = NULL;

        /*
         * Check if the oob byte is at the end of the data stream, or if the
         * oob byte has not yet arrived. In the latter case we have to send a
         * SIGURG and a mark indicator to the STREAM head. The mark indicator
         * is needed to guarantee correct behavior for SIOCATMARK. See block
         * comment in socktpi.h for more details.
         */
        if (atmark || so->so_oobmark > 0) {
                mblk_t *mp;

                if (atmark && so->so_oobmsg != NULL) {
                        struct T_exdata_ind *tei;

                        /* The oob byte is here: send it as T_EXDATA_IND. */
                        mp = arg->soqa_exdata_mp;
                        arg->soqa_exdata_mp = NULL;
                        ASSERT(mp != NULL);
                        mp->b_datap->db_type = M_PROTO;
                        tei = (struct T_exdata_ind *)mp->b_rptr;
                        tei->PRIM_type = T_EXDATA_IND;
                        tei->MORE_flag = 0;
                        mp->b_wptr = (uchar_t *)&tei[1];

                        mp->b_cont = so->so_oobmsg;
                        so->so_oobmsg = NULL;

                        *tailmpp = mp;
                        tailmpp = &mp->b_next;
                } else {
                        /* Send up the signal */
                        mp = arg->soqa_exdata_mp;
                        arg->soqa_exdata_mp = NULL;
                        ASSERT(mp != NULL);
                        DB_TYPE(mp) = M_PCSIG;
                        *mp->b_wptr++ = (uchar_t)SIGURG;
                        *tailmpp = mp;
                        tailmpp = &mp->b_next;

                        /*
                         * Send up the mark indicator
                         *
                         * NOTE(review): unlike soqa_exdata_mp there is no
                         * ASSERT that soqa_urgmark_mp is non-NULL before it
                         * is dereferenced -- presumably the caller always
                         * supplies it; confirm.
                         */
                        mp = arg->soqa_urgmark_mp;
                        arg->soqa_urgmark_mp = NULL;
                        mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
                        *tailmpp = mp;
                        tailmpp = &mp->b_next;

                        so->so_oobmark = 0;
                }
        }
        ASSERT(so->so_oobmark == 0);
        ASSERT(so->so_rcv_queued == 0);

        return (retmp);
}
2205 
#ifdef DEBUG
/*
 * Do an integrity check of the sonode. This should be done if a
 * fallback fails after sonode has initially been converted to use
 * TPI and subsequently have to be reverted.
 *
 * `cur' is the sonode in its current state and `orig' is the copy to
 * compare it against. Fields that the protocol may legitimately change
 * are either skipped or only checked for not having lost anything.
 *
 * Failure to pass the integrity check will panic the system.
 */
void
so_integrity_check(struct sonode *cur, struct sonode *orig)
{
        VERIFY(cur->so_vnode == orig->so_vnode);
        VERIFY(cur->so_ops == orig->so_ops);
        /*
         * For so_state we can only VERIFY the state flags in CHECK_STATE.
         * The other state flags might be affected by a notification from the
         * protocol.
         */
#define CHECK_STATE     (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
        SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
        SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
        /* Every checked flag set in orig must still be set in cur. */
        VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
            (orig->so_state & CHECK_STATE));
        VERIFY(cur->so_mode == orig->so_mode);
        VERIFY(cur->so_flag == orig->so_flag);
        VERIFY(cur->so_count == orig->so_count);
        /* Cannot VERIFY so_proto_connid; proto can update it */
        VERIFY(cur->so_sockparams == orig->so_sockparams);
        /* an error might have been recorded, but it can not be lost */
        VERIFY(cur->so_error != 0 || orig->so_error == 0);
        VERIFY(cur->so_family == orig->so_family);
        VERIFY(cur->so_type == orig->so_type);
        VERIFY(cur->so_protocol == orig->so_protocol);
        VERIFY(cur->so_version == orig->so_version);
        /* New conns might have arrived, but none should have been lost */
        VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
        VERIFY(list_head(&cur->so_acceptq_list) ==
            list_head(&orig->so_acceptq_list));
        VERIFY(cur->so_backlog == orig->so_backlog);
        /* New OOB might have arrived, but mark should not have been lost */
        VERIFY(cur->so_oobmark >= orig->so_oobmark);
        /* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
        VERIFY(cur->so_pgrp == orig->so_pgrp);
        VERIFY(cur->so_peercred == orig->so_peercred);
        VERIFY(cur->so_cpid == orig->so_cpid);
        VERIFY(cur->so_zoneid == orig->so_zoneid);
        /* New data might have arrived, but none should have been lost */
        VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
        VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
        VERIFY(cur->so_rcv_head == orig->so_rcv_head);
        VERIFY(cur->so_proto_handle == orig->so_proto_handle);
        VERIFY(cur->so_downcalls == orig->so_downcalls);
        /* Cannot VERIFY so_proto_props; they can be updated by proto */
}
#endif
2261 
2262 /*
2263  * so_tpi_fallback()
2264  *
 * This is the fallback initiation routine; things start here.
2266  *
2267  * Basic strategy:
2268  *   o Block new socket operations from coming in
 *   o Allocate/initialize info needed by TPI
2270  *   o Quiesce the connection, at which point we sync
2271  *     state and move data
2272  *   o Change operations (sonodeops) associated with the socket
2273  *   o Unblock threads waiting for the fallback to finish
2274  */
2275 int
2276 so_tpi_fallback(struct sonode *so, struct cred *cr)
2277 {
2278         int error;
2279         queue_t *q;
2280         struct sockparams *sp;
2281         struct sockparams *newsp = NULL;
2282         so_proto_fallback_func_t fbfunc;
2283         const char *devpath;
2284         boolean_t direct;
2285         struct sonode *nso;
2286         sock_quiesce_arg_t arg = { NULL, NULL };
2287 #ifdef DEBUG
2288         struct sonode origso;
2289 #endif
2290         error = 0;
2291         sp = so->so_sockparams;
2292         fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
2293 
2294         /*
2295          * Cannot fallback if the socket has active filters or a krecv callback.
2296          */
2297         if (so->so_filter_active > 0 || so->so_krecv_cb != NULL)
2298                 return (EINVAL);
2299 
2300         switch (so->so_family) {
2301         case AF_INET:
2302                 devpath = sp->sp_smod_info->smod_fallback_devpath_v4;
2303                 break;
2304         case AF_INET6:
2305                 devpath = sp->sp_smod_info->smod_fallback_devpath_v6;
2306                 break;
2307         default:
2308                 return (EINVAL);
2309         }
2310 
2311         /*
2312          * Fallback can only happen if the socket module has a TPI device
2313          * and fallback function.
2314          */
2315         if (devpath == NULL || fbfunc == NULL)
2316                 return (EINVAL);
2317 
2318         /*
2319          * Initiate fallback; upon success we know that no new requests
2320          * will come in from the user.
2321          */
2322         if (!so_start_fallback(so))
2323                 return (EAGAIN);
2324 #ifdef DEBUG
2325         /*
2326          * Make a copy of the sonode in case we need to make an integrity
2327          * check later on.
2328          */
2329         bcopy(so, &origso, sizeof (*so));
2330 #endif
2331 
2332         sp->sp_stats.sps_nfallback.value.ui64++;
2333 
2334         newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
2335             so->so_protocol, devpath, KM_SLEEP, &error);
2336         if (error != 0)
2337                 goto out;
2338 
2339         if (so->so_direct != NULL) {
2340                 sodirect_t *sodp = so->so_direct;
2341                 mutex_enter(&so->so_lock);
2342 
2343                 so->so_direct->sod_enabled = B_FALSE;
2344                 so->so_state &= ~SS_SODIRECT;
2345                 ASSERT(sodp->sod_uioafh == NULL);
2346                 mutex_exit(&so->so_lock);
2347         }
2348 
2349         /* Turn sonode into a TPI socket */
2350         error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
2351         if (error != 0)
2352                 goto out;
2353         /*
2354          * When it comes to urgent data we have two cases to deal with;
2355          * (1) The oob byte has already arrived, or (2) the protocol has
2356          * notified that oob data is pending, but it has not yet arrived.
2357          *
2358          * For (1) all we need to do is send a T_EXDATA_IND to indicate were
2359          * in the byte stream the oob byte is. For (2) we have to send a
2360          * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
2361          * the oob byte will be the next byte from the protocol.
2362          *
2363          * So in the worst case we need two mblks, one for the signal, another
2364          * for mark indication. In that case we use the exdata_mp for the sig.
2365          */
2366         arg.soqa_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind),
2367             BPRI_MED, STR_NOSIG, NULL);
2368         arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
2369 
2370         /*
2371          * Now tell the protocol to start using TPI. so_quiesced_cb be
2372          * called once it's safe to synchronize state.
2373          */
2374         DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
2375         error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb,
2376             &arg);
2377         DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
2378 
2379         if (error != 0) {
2380                 /* protocol was unable to do a fallback, revert the sonode */
2381                 sotpi_revert_sonode(so, cr);
2382                 goto out;
2383         }
2384 
2385         /*
2386          * Walk the accept queue and notify the proto that they should
2387          * fall back to TPI. The protocol will send up the T_CONN_IND.
2388          */
2389         nso = list_head(&so->so_acceptq_list);
2390         while (nso != NULL) {
2391                 int rval;
2392                 struct sonode *next;
2393 
2394                 if (arg.soqa_exdata_mp == NULL) {
2395                         arg.soqa_exdata_mp =
2396                             allocb_wait(sizeof (struct T_exdata_ind),
2397                             BPRI_MED, STR_NOSIG, NULL);
2398                 }
2399                 if (arg.soqa_urgmark_mp == NULL) {
2400                         arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED,
2401                             STR_NOSIG, NULL);
2402                 }
2403 
2404                 DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
2405                 rval = (*fbfunc)(nso->so_proto_handle, NULL, direct,
2406                     so_quiesced_cb, &arg);
2407                 DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
2408                 if (rval != 0) {
2409                         /* Abort the connection */
2410                         zcmn_err(getzoneid(), CE_WARN,
2411                             "Failed to convert socket in accept queue to TPI. "
2412                             "Pid = %d\n", curproc->p_pid);
2413                         next = list_next(&so->so_acceptq_list, nso);
2414                         list_remove(&so->so_acceptq_list, nso);
2415                         so->so_acceptq_len--;
2416 
2417                         (void) socket_close(nso, 0, CRED());
2418                         socket_destroy(nso);
2419                         nso = next;
2420                 } else {
2421                         nso = list_next(&so->so_acceptq_list, nso);
2422                 }
2423         }
2424 
2425         /*
2426          * Now flush the acceptq, this will destroy all sockets. They will
2427          * be recreated in sotpi_accept().
2428          */
2429         so_acceptq_flush(so, B_FALSE);
2430 
2431         mutex_enter(&so->so_lock);
2432         so->so_state |= SS_FALLBACK_COMP;
2433         mutex_exit(&so->so_lock);
2434 
2435         /*
2436          * Swap the sonode ops. Socket operations that come in once this
2437          * is done will proceed without blocking.
2438          */
2439         so->so_ops = &sotpi_sonodeops;
2440 
2441         /*
2442          * Wake up any threads stuck in poll. This is needed since the poll
2443          * head changes when the fallback happens (moves from the sonode to
2444          * the STREAMS head).
2445          */
2446         pollwakeup(&so->so_poll_list, POLLERR);
2447 
2448         /*
2449          * When this non-STREAM socket was created we placed an extra ref on
2450          * the associated vnode to support asynchronous close. Drop that ref
2451          * here.
2452          */
2453         ASSERT(SOTOV(so)->v_count >= 2);
2454         VN_RELE(SOTOV(so));
2455 out:
2456         so_end_fallback(so);
2457 
2458         if (error != 0) {
2459 #ifdef DEBUG
2460                 so_integrity_check(so, &origso);
2461 #endif
2462                 zcmn_err(getzoneid(), CE_WARN,
2463                     "Failed to convert socket to TPI (err=%d). Pid = %d\n",
2464                     error, curproc->p_pid);
2465                 if (newsp != NULL)
2466                         SOCKPARAMS_DEC_REF(newsp);
2467         }
2468         if (arg.soqa_exdata_mp != NULL)
2469                 freemsg(arg.soqa_exdata_mp);
2470         if (arg.soqa_urgmark_mp != NULL)
2471                 freemsg(arg.soqa_urgmark_mp);
2472 
2473         return (error);
2474 }
2475 
2476 int
2477 so_krecv_set(sonode_t *so, so_krecv_f cb, void *arg)
2478 {
2479         int ret;
2480 
2481         if (cb == NULL && arg != NULL)
2482                 return (EINVAL);
2483 
2484         SO_BLOCK_FALLBACK(so, so_krecv_set(so, cb, arg));
2485 
2486         mutex_enter(&so->so_lock);
2487         if (so->so_state & SS_FALLBACK_COMP) {
2488                 mutex_exit(&so->so_lock);
2489                 SO_UNBLOCK_FALLBACK(so);
2490                 return (ENOTSUP);
2491         }
2492 
2493         ret = so_lock_read(so, 0);
2494         VERIFY(ret == 0);
2495         /*
2496          * Other consumers may actually care about getting extant data delivered
2497          * to them, when they come along, they should figure out the best API
2498          * for that.
2499          */
2500         so_rcv_flush(so);
2501 
2502         so->so_krecv_cb = cb;
2503         so->so_krecv_arg = arg;
2504 
2505         so_unlock_read(so);
2506         mutex_exit(&so->so_lock);
2507         SO_UNBLOCK_FALLBACK(so);
2508 
2509         return (0);
2510 }
2511 
2512 void
2513 so_krecv_unblock(sonode_t *so)
2514 {
2515         mutex_enter(&so->so_lock);
2516         VERIFY(so->so_krecv_cb != NULL);
2517 
2518         so->so_rcv_queued = 0;
2519         (void) so_check_flow_control(so);
2520         mutex_exit(&so->so_lock);
2521 }