1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/cmn_err.h>

#include <sys/stropts.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/atomic.h>
#include <sys/tihdr.h>

#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/sodirect.h>
#include <sys/ddi.h>
#include <inet/ip.h>
#include <sys/time.h>
#include <sys/cmn_err.h>
  48 
  49 #ifdef SOCK_TEST
  50 extern int do_useracc;
  51 extern clock_t sock_test_timelimit;
  52 #endif /* SOCK_TEST */
  53 
  54 #define MBLK_PULL_LEN 64
  55 uint32_t so_mblk_pull_len = MBLK_PULL_LEN;
  56 
  57 #ifdef DEBUG
  58 boolean_t so_debug_length = B_FALSE;
  59 static boolean_t so_check_length(sonode_t *so);
  60 #endif
  61 
  62 static int
  63 so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
  64     struct sonode **nsop)
  65 {
  66         struct sonode *nso = NULL;
  67 
  68         *nsop = NULL;
  69         ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
  70         while ((nso = list_remove_head(&so->so_acceptq_list)) == NULL) {
  71                 /*
  72                  * No need to check so_error here, because it is not
  73                  * possible for a listening socket to be reset or otherwise
  74                  * disconnected.
  75                  *
  76                  * So now we just need check if it's ok to wait.
  77                  */
  78                 if (dontblock)
  79                         return (EWOULDBLOCK);
  80                 if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
  81                         return (EINTR);
  82 
  83                 if (cv_wait_sig_swap(&so->so_acceptq_cv,
  84                     &so->so_acceptq_lock) == 0)
  85                         return (EINTR);
  86         }
  87 
  88         ASSERT(nso != NULL);
  89         ASSERT(so->so_acceptq_len > 0);
  90         so->so_acceptq_len--;
  91         nso->so_listener = NULL;
  92 
  93         *nsop = nso;
  94 
  95         return (0);
  96 }
  97 
  98 /*
  99  * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 100  *
 101  * Pulls a connection off of the accept queue.
 102  *
 103  * Arguments:
 104  *   so        - listening socket
 105  *   dontblock - indicate whether it's ok to sleep if there are no
 106  *               connections on the queue
 107  *   nsop      - Value-return argument
 108  *
 109  * Return values:
 110  *   0 when a connection is successfully dequeued, in which case nsop
 111  *   is set to point to the new connection. Upon failure a non-zero
 112  *   value is returned, and the value of nsop is set to NULL.
 113  *
 114  * Note:
 115  *   so_acceptq_dequeue() may return prematurly if the socket is falling
 116  *   back to TPI.
 117  */
 118 int
 119 so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
 120     struct sonode **nsop)
 121 {
 122         int error;
 123 
 124         mutex_enter(&so->so_acceptq_lock);
 125         error = so_acceptq_dequeue_locked(so, dontblock, nsop);
 126         mutex_exit(&so->so_acceptq_lock);
 127 
 128         return (error);
 129 }
 130 
 131 static void
 132 so_acceptq_flush_impl(struct sonode *so, list_t *list, boolean_t doclose)
 133 {
 134         struct sonode *nso;
 135 
 136         while ((nso = list_remove_head(list)) != NULL) {
 137                 nso->so_listener = NULL;
 138                 if (doclose) {
 139                         (void) socket_close(nso, 0, CRED());
 140                 } else {
 141                         /*
 142                          * Only used for fallback - not possible when filters
 143                          * are present.
 144                          */
 145                         ASSERT(so->so_filter_active == 0);
 146                         /*
 147                          * Since the socket is on the accept queue, there can
 148                          * only be one reference. We drop the reference and
 149                          * just blow off the socket.
 150                          */
 151                         ASSERT(nso->so_count == 1);
 152                         nso->so_count--;
 153                         /* drop the proto ref */
 154                         VN_RELE(SOTOV(nso));
 155                 }
 156                 socket_destroy(nso);
 157         }
 158 }
 159 /*
 160  * void so_acceptq_flush(struct sonode *so)
 161  *
 162  * Removes all pending connections from a listening socket, and
 163  * frees the associated resources.
 164  *
 165  * Arguments
 166  *   so      - listening socket
 167  *   doclose - make a close downcall for each socket on the accept queue
 168  *
 169  * Return values:
 170  *   None.
 171  *
 172  * Note:
 173  *   The caller has to ensure that no calls to so_acceptq_enqueue() or
 174  *   so_acceptq_dequeue() occur while the accept queue is being flushed.
 175  *   So either the socket needs to be in a state where no operations
 176  *   would come in, or so_lock needs to be obtained.
 177  */
 178 void
 179 so_acceptq_flush(struct sonode *so, boolean_t doclose)
 180 {
 181         so_acceptq_flush_impl(so, &so->so_acceptq_list, doclose);
 182         so_acceptq_flush_impl(so, &so->so_acceptq_defer, doclose);
 183 
 184         so->so_acceptq_len = 0;
 185 }
 186 
 187 int
 188 so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
 189     sock_connid_t id)
 190 {
 191         ASSERT(MUTEX_HELD(&so->so_lock));
 192 
 193         /*
 194          * The protocol has notified us that a connection attempt is being
 195          * made, so before we wait for a notification to arrive we must
 196          * clear out any errors associated with earlier connection attempts.
 197          */
 198         if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
 199                 so->so_error = 0;
 200 
 201         while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
 202                 if (nonblock)
 203                         return (EINPROGRESS);
 204 
 205                 if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
 206                         return (EINTR);
 207 
 208                 if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
 209                         return (EINTR);
 210         }
 211 
 212         if (so->so_error != 0)
 213                 return (sogeterr(so, B_TRUE));
 214         /*
 215          * Under normal circumstances, so_error should contain an error
 216          * in case the connect failed. However, it is possible for another
 217          * thread to come in a consume the error, so generate a sensible
 218          * error in that case.
 219          */
 220         if ((so->so_state & SS_ISCONNECTED) == 0)
 221                 return (ECONNREFUSED);
 222 
 223         return (0);
 224 }
 225 
 226 /*
 227  * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 228  *    sock_connid_t id)
 229  *
 230  * Wait until the socket is connected or an error has occured.
 231  *
 232  * Arguments:
 233  *   so       - socket
 234  *   nonblock - indicate whether it's ok to sleep if the connection has
 235  *              not yet been established
 236  *   gen      - generation number that was returned by the protocol
 237  *              when the operation was started
 238  *
 239  * Returns:
 240  *   0 if the connection attempt was successful, or an error indicating why
 241  *   the connection attempt failed.
 242  */
 243 int
 244 so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
 245 {
 246         int error;
 247 
 248         mutex_enter(&so->so_lock);
 249         error = so_wait_connected_locked(so, nonblock, id);
 250         mutex_exit(&so->so_lock);
 251 
 252         return (error);
 253 }
 254 
 255 int
 256 so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
 257 {
 258         int error;
 259 
 260         ASSERT(MUTEX_HELD(&so->so_lock));
 261         while (SO_SND_FLOWCTRLD(so)) {
 262                 if (so->so_state & SS_CANTSENDMORE)
 263                         return (EPIPE);
 264                 if (dontblock)
 265                         return (EWOULDBLOCK);
 266 
 267                 if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
 268                         return (EINTR);
 269 
 270                 if (so->so_sndtimeo == 0) {
 271                         /*
 272                          * Zero means disable timeout.
 273                          */
 274                         error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
 275                 } else {
 276                         error = cv_reltimedwait_sig(&so->so_snd_cv,
 277                             &so->so_lock, so->so_sndtimeo, TR_CLOCK_TICK);
 278                 }
 279                 if (error == 0)
 280                         return (EINTR);
 281                 else if (error == -1)
 282                         return (EAGAIN);
 283         }
 284         return (0);
 285 }
 286 
 287 /*
 288  * int so_wait_sendbuf(struct sonode *so, boolean_t dontblock)
 289  *
 290  * Wait for the transport to notify us about send buffers becoming
 291  * available.
 292  */
 293 int
 294 so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 295 {
 296         int error = 0;
 297 
 298         mutex_enter(&so->so_lock);
 299         so->so_snd_wakeup = B_TRUE;
 300         error = so_snd_wait_qnotfull_locked(so, dontblock);
 301         so->so_snd_wakeup = B_FALSE;
 302         mutex_exit(&so->so_lock);
 303 
 304         return (error);
 305 }
 306 
 307 void
 308 so_snd_qfull(struct sonode *so)
 309 {
 310         mutex_enter(&so->so_lock);
 311         so->so_snd_qfull = B_TRUE;
 312         mutex_exit(&so->so_lock);
 313 }
 314 
 315 void
 316 so_snd_qnotfull(struct sonode *so)
 317 {
 318         mutex_enter(&so->so_lock);
 319         so->so_snd_qfull = B_FALSE;
 320         /* wake up everyone waiting for buffers */
 321         cv_broadcast(&so->so_snd_cv);
 322         mutex_exit(&so->so_lock);
 323 }
 324 
 325 /*
 326  * Change the process/process group to which SIGIO is sent.
 327  */
 328 int
 329 socket_chgpgrp(struct sonode *so, pid_t pid)
 330 {
 331         int error;
 332 
 333         ASSERT(MUTEX_HELD(&so->so_lock));
 334         if (pid != 0) {
 335                 /*
 336                  * Permissions check by sending signal 0.
 337                  * Note that when kill fails it does a
 338                  * set_errno causing the system call to fail.
 339                  */
 340                 error = kill(pid, 0);
 341                 if (error != 0) {
 342                         return (error);
 343                 }
 344         }
 345         so->so_pgrp = pid;
 346         return (0);
 347 }
 348 
 349 
 350 /*
 351  * Generate a SIGIO, for 'writable' events include siginfo structure,
 352  * for read events just send the signal.
 353  */
 354 /*ARGSUSED*/
 355 static void
 356 socket_sigproc(proc_t *proc, int event)
 357 {
 358         k_siginfo_t info;
 359 
 360         ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));
 361 
 362         if (event & SOCKETSIG_WRITE) {
 363                 info.si_signo = SIGPOLL;
 364                 info.si_code = POLL_OUT;
 365                 info.si_errno = 0;
 366                 info.si_fd = 0;
 367                 info.si_band = 0;
 368                 sigaddq(proc, NULL, &info, KM_NOSLEEP);
 369         }
 370         if (event & SOCKETSIG_READ) {
 371                 sigtoproc(proc, NULL, SIGPOLL);
 372         }
 373         if (event & SOCKETSIG_URG) {
 374                 sigtoproc(proc, NULL, SIGURG);
 375         }
 376 }
 377 
 378 void
 379 socket_sendsig(struct sonode *so, int event)
 380 {
 381         proc_t *proc;
 382 
 383         ASSERT(MUTEX_HELD(&so->so_lock));
 384 
 385         if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
 386             event != SOCKETSIG_URG)) {
 387                 return;
 388         }
 389 
 390         dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));
 391 
 392         if (so->so_pgrp > 0) {
 393                 /*
 394                  * XXX This unfortunately still generates
 395                  * a signal when a fd is closed but
 396                  * the proc is active.
 397                  */
 398                 mutex_enter(&pidlock);
 399                 proc = prfind(so->so_pgrp);
 400                 if (proc == NULL) {
 401                         mutex_exit(&pidlock);
 402                         return;
 403                 }
 404                 mutex_enter(&proc->p_lock);
 405                 mutex_exit(&pidlock);
 406                 socket_sigproc(proc, event);
 407                 mutex_exit(&proc->p_lock);
 408         } else {
 409                 /*
 410                  * Send to process group. Hold pidlock across
 411                  * calls to socket_sigproc().
 412                  */
 413                 pid_t pgrp = -so->so_pgrp;
 414 
 415                 mutex_enter(&pidlock);
 416                 proc = pgfind(pgrp);
 417                 while (proc != NULL) {
 418                         mutex_enter(&proc->p_lock);
 419                         socket_sigproc(proc, event);
 420                         mutex_exit(&proc->p_lock);
 421                         proc = proc->p_pglink;
 422                 }
 423                 mutex_exit(&pidlock);
 424         }
 425 }
 426 
 427 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 428 /* Copy userdata into a new mblk_t */
 429 mblk_t *
 430 socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
 431     size_t tail_len, int *errorp)
 432 {
 433         mblk_t  *head = NULL, **tail = &head;
 434 
 435         ASSERT(iosize == INFPSZ || iosize > 0);
 436 
 437         if (iosize == INFPSZ || iosize > uiop->uio_resid)
 438                 iosize = uiop->uio_resid;
 439 
 440         if (maxblk == INFPSZ)
 441                 maxblk = iosize;
 442 
 443         /* Nothing to do in these cases, so we're done */
 444         if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
 445                 goto done;
 446 
 447         /*
 448          * We will enter the loop below if iosize is 0; it will allocate an
 449          * empty message block and call uiomove(9F) which will just return.
 450          * We could avoid that with an extra check but would only slow
 451          * down the much more likely case where iosize is larger than 0.
 452          */
 453         do {
 454                 ssize_t blocksize;
 455                 mblk_t  *mp;
 456 
 457                 blocksize = MIN(iosize, maxblk);
 458                 ASSERT(blocksize >= 0);
 459                 mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
 460                 if (mp == NULL) {
 461                         *errorp = ENOMEM;
 462                         return (head);
 463                 }
 464                 mp->b_rptr += wroff;
 465                 mp->b_wptr = mp->b_rptr + blocksize;
 466 
 467                 *tail = mp;
 468                 tail = &mp->b_cont;
 469 
 470                 /* uiomove(9F) either returns 0 or EFAULT */
 471                 if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
 472                     UIO_WRITE, uiop)) != 0) {
 473                         ASSERT(*errorp != ENOMEM);
 474                         freemsg(head);
 475                         return (NULL);
 476                 }
 477 
 478                 iosize -= blocksize;
 479         } while (iosize > 0);
 480 
 481 done:
 482         *errorp = 0;
 483         return (head);
 484 }
 485 
 486 mblk_t *
 487 socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
 488 {
 489         int error;
 490         ptrdiff_t n;
 491         mblk_t *nmp;
 492 
 493         ASSERT(mp->b_wptr >= mp->b_rptr);
 494 
 495         /*
 496          * max_read is the offset of the oobmark and read can not go pass
 497          * the oobmark.
 498          */
 499         if (max_read == INFPSZ || max_read > uiop->uio_resid)
 500                 max_read = uiop->uio_resid;
 501 
 502         do {
 503                 if ((n = MIN(max_read, MBLKL(mp))) != 0) {
 504                         ASSERT(n > 0);
 505 
 506                         error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
 507                         if (error != 0) {
 508                                 freemsg(mp);
 509                                 *errorp = error;
 510                                 return (NULL);
 511                         }
 512                 }
 513 
 514                 mp->b_rptr += n;
 515                 max_read -= n;
 516                 while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
 517                         /*
 518                          * get rid of zero length mblks
 519                          */
 520                         nmp = mp;
 521                         mp = mp->b_cont;
 522                         freeb(nmp);
 523                 }
 524         } while (mp != NULL && max_read > 0);
 525 
 526         *errorp = 0;
 527         return (mp);
 528 }
 529 
 530 static void
 531 so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
 532 {
 533         ASSERT(last_tail != NULL);
 534         mp->b_next = so->so_rcv_q_head;
 535         mp->b_prev = last_tail;
 536         ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));
 537 
 538         if (so->so_rcv_q_head == NULL) {
 539                 ASSERT(so->so_rcv_q_last_head == NULL);
 540                 so->so_rcv_q_last_head = mp;
 541 #ifdef DEBUG
 542         } else {
 543                 ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
 544 #endif
 545         }
 546         so->so_rcv_q_head = mp;
 547 
 548 #ifdef DEBUG
 549         if (so_debug_length) {
 550                 mutex_enter(&so->so_lock);
 551                 ASSERT(so_check_length(so));
 552                 mutex_exit(&so->so_lock);
 553         }
 554 #endif
 555 }
 556 
 557 /*
 558  * Move a mblk chain (mp_head, mp_last_head) to the sonode's rcv queue so it
 559  * can be processed by so_dequeue_msg().
 560  */
 561 void
 562 so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
 563 {
 564         if (so->so_filter_active > 0 &&
 565             (mp_head = sof_filter_data_in_proc(so, mp_head,
 566             &mp_last_head)) == NULL)
 567                 return;
 568 
 569         ASSERT(mp_head->b_prev != NULL);
 570         if (so->so_rcv_q_head == NULL) {
 571                 so->so_rcv_q_head = mp_head;
 572                 so->so_rcv_q_last_head = mp_last_head;
 573                 ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
 574         } else {
 575                 boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
 576                     (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));
 577 
 578                 if (mp_head->b_next == NULL &&
 579                     DB_TYPE(mp_head) == M_DATA &&
 580                     DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
 581                         so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
 582                         so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
 583                         mp_head->b_prev = NULL;
 584                 } else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
 585                         /*
 586                          * Append to last_head if more than one mblks, and both
 587                          * mp_head and last_head are I/OAT mblks.
 588                          */
 589                         ASSERT(mp_head->b_next != NULL);
 590                         so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
 591                         so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
 592                         mp_head->b_prev = NULL;
 593 
 594                         so->so_rcv_q_last_head->b_next = mp_head->b_next;
 595                         mp_head->b_next = NULL;
 596                         so->so_rcv_q_last_head = mp_last_head;
 597                 } else {
 598 #ifdef DEBUG
 599                         {
 600                                 mblk_t *tmp_mblk;
 601                                 tmp_mblk = mp_head;
 602                                 while (tmp_mblk != NULL) {
 603                                         ASSERT(tmp_mblk->b_prev != NULL);
 604                                         tmp_mblk = tmp_mblk->b_next;
 605                                 }
 606                         }
 607 #endif
 608                         so->so_rcv_q_last_head->b_next = mp_head;
 609                         so->so_rcv_q_last_head = mp_last_head;
 610                 }
 611         }
 612 }
 613 
 614 /*
 615  * Check flow control on a given sonode.  Must have so_lock held, and
 616  * this function will release the hold.  Return true if flow control
 617  * is cleared.
 618  */
 619 boolean_t
 620 so_check_flow_control(struct sonode *so)
 621 {
 622         ASSERT(MUTEX_HELD(&so->so_lock));
 623 
 624         if (so->so_flowctrld && (so->so_rcv_queued < so->so_rcvlowat &&
 625             !(so->so_state & SS_FIL_RCV_FLOWCTRL))) {
 626                 so->so_flowctrld = B_FALSE;
 627                 mutex_exit(&so->so_lock);
 628                 /*
 629                  * Open up flow control. SCTP does not have any downcalls, and
 630                  * it will clr flow ctrl in sosctp_recvmsg().
 631                  */
 632                 if (so->so_downcalls != NULL &&
 633                     so->so_downcalls->sd_clr_flowctrl != NULL) {
 634                         (*so->so_downcalls->sd_clr_flowctrl)
 635                             (so->so_proto_handle);
 636                 }
 637                 /* filters can start injecting data */
 638                 sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0);
 639                 return (B_TRUE);
 640         } else {
 641                 mutex_exit(&so->so_lock);
 642                 return (B_FALSE);
 643         }
 644 }
 645 
 646 int
 647 so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
 648     rval_t *rvalp, int flags)
 649 {
 650         mblk_t  *mp, *nmp;
 651         mblk_t  *savemp, *savemptail;
 652         mblk_t  *new_msg_head;
 653         mblk_t  *new_msg_last_head;
 654         mblk_t  *last_tail;
 655         boolean_t partial_read;
 656         boolean_t reset_atmark = B_FALSE;
 657         int more = 0;
 658         int error;
 659         ssize_t oobmark;
 660         sodirect_t *sodp = so->so_direct;
 661 
 662         partial_read = B_FALSE;
 663         *mctlp = NULL;
 664 again:
 665         mutex_enter(&so->so_lock);
 666 again1:
 667 #ifdef DEBUG
 668         if (so_debug_length) {
 669                 ASSERT(so_check_length(so));
 670         }
 671 #endif
 672         if (so->so_state & SS_RCVATMARK) {
 673                 /* Check whether the caller is OK to read past the mark */
 674                 if (flags & MSG_NOMARK) {
 675                         mutex_exit(&so->so_lock);
 676                         return (EWOULDBLOCK);
 677                 }
 678                 reset_atmark = B_TRUE;
 679         }
 680         /*
 681          * First move messages from the dump area to processing area
 682          */
 683         if (sodp != NULL) {
 684                 if (sodp->sod_enabled) {
 685                         if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
 686                                 /* nothing to uioamove */
 687                                 sodp = NULL;
 688                         } else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
 689                                 sodp->sod_uioa.uioa_state &= UIOA_CLR;
 690                                 sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
 691                                 /*
 692                                  * try to uioamove() the data that
 693                                  * has already queued.
 694                                  */
 695                                 sod_uioa_so_init(so, sodp, uiop);
 696                         }
 697                 } else {
 698                         sodp = NULL;
 699                 }
 700         }
 701         new_msg_head = so->so_rcv_head;
 702         new_msg_last_head = so->so_rcv_last_head;
 703         so->so_rcv_head = NULL;
 704         so->so_rcv_last_head = NULL;
 705         oobmark = so->so_oobmark;
 706         /*
 707          * We can release the lock as there can only be one reader
 708          */
 709         mutex_exit(&so->so_lock);
 710 
 711         if (new_msg_head != NULL) {
 712                 so_process_new_message(so, new_msg_head, new_msg_last_head);
 713         }
 714         savemp = savemptail = NULL;
 715         rvalp->r_vals = 0;
 716         error = 0;
 717         mp = so->so_rcv_q_head;
 718 
 719         if (mp != NULL &&
 720             (so->so_rcv_timer_tid == 0 ||
 721             so->so_rcv_queued >= so->so_rcv_thresh)) {
 722                 partial_read = B_FALSE;
 723 
 724                 if (flags & MSG_PEEK) {
 725                         if ((nmp = dupmsg(mp)) == NULL &&
 726                             (nmp = copymsg(mp)) == NULL) {
 727                                 size_t size = msgsize(mp);
 728 
 729                                 error = strwaitbuf(size, BPRI_HI);
 730                                 if (error) {
 731                                         return (error);
 732                                 }
 733                                 goto again;
 734                         }
 735                         mp = nmp;
 736                 } else {
 737                         ASSERT(mp->b_prev != NULL);
 738                         last_tail = mp->b_prev;
 739                         mp->b_prev = NULL;
 740                         so->so_rcv_q_head = mp->b_next;
 741                         if (so->so_rcv_q_head == NULL) {
 742                                 so->so_rcv_q_last_head = NULL;
 743                         }
 744                         mp->b_next = NULL;
 745                 }
 746 
 747                 ASSERT(mctlp != NULL);
 748                 /*
 749                  * First process PROTO or PCPROTO blocks, if any.
 750                  */
 751                 if (DB_TYPE(mp) != M_DATA) {
 752                         *mctlp = mp;
 753                         savemp = mp;
 754                         savemptail = mp;
 755                         ASSERT(DB_TYPE(mp) == M_PROTO ||
 756                             DB_TYPE(mp) == M_PCPROTO);
 757                         while (mp->b_cont != NULL &&
 758                             DB_TYPE(mp->b_cont) != M_DATA) {
 759                                 ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
 760                                     DB_TYPE(mp->b_cont) == M_PCPROTO);
 761                                 mp = mp->b_cont;
 762                                 savemptail = mp;
 763                         }
 764                         mp = savemptail->b_cont;
 765                         savemptail->b_cont = NULL;
 766                 }
 767 
 768                 ASSERT(DB_TYPE(mp) == M_DATA);
 769                 /*
 770                  * Now process DATA blocks, if any. Note that for sodirect
 771                  * enabled socket, uio_resid can be 0.
 772                  */
 773                 if (uiop->uio_resid >= 0) {
 774                         ssize_t copied = 0;
 775 
 776                         if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
 777                                 mutex_enter(&so->so_lock);
 778                                 ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
 779                                 copied = sod_uioa_mblk(so, mp);
 780                                 if (copied > 0)
 781                                         partial_read = B_TRUE;
 782                                 mutex_exit(&so->so_lock);
 783                                 /* mark this mblk as processed */
 784                                 mp = NULL;
 785                         } else {
 786                                 ssize_t oldresid = uiop->uio_resid;
 787 
 788                                 if (MBLKL(mp) < so_mblk_pull_len) {
 789                                         if (pullupmsg(mp, -1) == 1) {
 790                                                 last_tail = mp;
 791                                         }
 792                                 }
 793                                 /*
 794                                  * Can not read beyond the oobmark
 795                                  */
 796                                 mp = socopyoutuio(mp, uiop,
 797                                     oobmark == 0 ? INFPSZ : oobmark, &error);
 798                                 if (error != 0) {
 799                                         freemsg(*mctlp);
 800                                         *mctlp = NULL;
 801                                         more = 0;
 802                                         goto done;
 803                                 }
 804                                 ASSERT(oldresid >= uiop->uio_resid);
 805                                 copied = oldresid - uiop->uio_resid;
 806                                 if (oldresid > uiop->uio_resid)
 807                                         partial_read = B_TRUE;
 808                         }
 809                         ASSERT(copied >= 0);
 810                         if (copied > 0 && !(flags & MSG_PEEK)) {
 811                                 mutex_enter(&so->so_lock);
 812                                 so->so_rcv_queued -= copied;
 813                                 ASSERT(so->so_oobmark >= 0);
 814                                 if (so->so_oobmark > 0) {
 815                                         so->so_oobmark -= copied;
 816                                         ASSERT(so->so_oobmark >= 0);
 817                                         if (so->so_oobmark == 0) {
 818                                                 ASSERT(so->so_state &
 819                                                     SS_OOBPEND);
 820                                                 so->so_oobmark = 0;
 821                                                 so->so_state |= SS_RCVATMARK;
 822                                         }
 823                                 }
 824                                 /*
 825                                  * so_check_flow_control() will drop
 826                                  * so->so_lock.
 827                                  */
 828                                 rvalp->r_val2 = so_check_flow_control(so);
 829                         }
 830                 }
 831                 if (mp != NULL) { /* more data blocks in msg */
 832                         more |= MOREDATA;
 833                         if ((flags & (MSG_PEEK|MSG_TRUNC))) {
 834                                 if (flags & MSG_PEEK) {
 835                                         freemsg(mp);
 836                                 } else {
 837                                         unsigned int msize = msgdsize(mp);
 838 
 839                                         freemsg(mp);
 840                                         mutex_enter(&so->so_lock);
 841                                         so->so_rcv_queued -= msize;
 842                                         /*
 843                                          * so_check_flow_control() will drop
 844                                          * so->so_lock.
 845                                          */
 846                                         rvalp->r_val2 =
 847                                             so_check_flow_control(so);
 848                                 }
 849                         } else if (partial_read && !somsghasdata(mp)) {
 850                                 /*
 851                                  * Avoid queuing a zero-length tail part of
 852                                  * a message. partial_read == 1 indicates that
 853                                  * we read some of the message.
 854                                  */
 855                                 freemsg(mp);
 856                                 more &= ~MOREDATA;
 857                         } else {
 858                                 if (savemp != NULL &&
 859                                     (flags & MSG_DUPCTRL)) {
 860                                         mblk_t *nmp;
 861                                         /*
 862                                          * There should only be non data mblks
 863                                          */
 864                                         ASSERT(DB_TYPE(savemp) != M_DATA &&
 865                                             DB_TYPE(savemptail) != M_DATA);
 866 try_again:
 867                                         if ((nmp = dupmsg(savemp)) == NULL &&
 868                                             (nmp = copymsg(savemp)) == NULL) {
 869 
 870                                                 size_t size = msgsize(savemp);
 871 
 872                                                 error = strwaitbuf(size,
 873                                                     BPRI_HI);
 874                                                 if (error != 0) {
 875                                                         /*
 876                                                          * In case we
 877                                                          * cannot copy
 878                                                          * control data
 879                                                          * free the remaining
 880                                                          * data.
 881                                                          */
 882                                                         freemsg(mp);
 883                                                         goto done;
 884                                                 }
 885                                                 goto try_again;
 886                                         }
 887 
 888                                         ASSERT(nmp != NULL);
 889                                         ASSERT(DB_TYPE(nmp) != M_DATA);
 890                                         savemptail->b_cont = mp;
 891                                         *mctlp = nmp;
 892                                         mp = savemp;
 893                                 }
 894                                 /*
 895                                  * putback mp
 896                                  */
 897                                 so_prepend_msg(so, mp, last_tail);
 898                         }
 899                 }
 900 
 901                 /* fast check so_rcv_head if there is more data */
 902                 if (partial_read && !(so->so_state & SS_RCVATMARK) &&
 903                     *mctlp == NULL && uiop->uio_resid > 0 &&
 904                     !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
 905                         goto again;
 906                 }
 907         } else if (!partial_read) {
 908                 mutex_enter(&so->so_lock);
 909                 if (so->so_error != 0) {
 910                         error = sogeterr(so, !(flags & MSG_PEEK));
 911                         mutex_exit(&so->so_lock);
 912                         return (error);
 913                 }
 914                 /*
 915                  * No pending data. Return right away for nonblocking
 916                  * socket, otherwise sleep waiting for data.
 917                  */
 918                 if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
 919                         if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
 920                             (flags & MSG_DONTWAIT)) {
 921                                 error = EWOULDBLOCK;
 922                         } else {
 923                                 if (so->so_state & (SS_CLOSING |
 924                                     SS_FALLBACK_PENDING)) {
 925                                         mutex_exit(&so->so_lock);
 926                                         error = EINTR;
 927                                         goto done;
 928                                 }
 929 
 930                                 if (so->so_rcv_head != NULL) {
 931                                         goto again1;
 932                                 }
 933                                 so->so_rcv_wakeup = B_TRUE;
 934                                 so->so_rcv_wanted = uiop->uio_resid;
 935                                 if (so->so_rcvtimeo == 0) {
 936                                         /*
 937                                          * Zero means disable timeout.
 938                                          */
 939                                         error = cv_wait_sig(&so->so_rcv_cv,
 940                                             &so->so_lock);
 941                                 } else {
 942                                         error = cv_reltimedwait_sig(
 943                                             &so->so_rcv_cv, &so->so_lock,
 944                                             so->so_rcvtimeo, TR_CLOCK_TICK);
 945                                 }
 946                                 so->so_rcv_wakeup = B_FALSE;
 947                                 so->so_rcv_wanted = 0;
 948 
 949                                 if (error == 0) {
 950                                         error = EINTR;
 951                                 } else if (error == -1) {
 952                                         error = EAGAIN;
 953                                 } else {
 954                                         goto again1;
 955                                 }
 956                         }
 957                 }
 958                 mutex_exit(&so->so_lock);
 959         }
 960         if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
 961                 /*
                 * We are past the mark, update state
 963                  * 4.3BSD and 4.4BSD clears the mark when peeking across it.
 964                  * The draft Posix socket spec states that the mark should
 965                  * not be cleared when peeking. We follow the latter.
 966                  */
 967                 mutex_enter(&so->so_lock);
 968                 ASSERT(so_verify_oobstate(so));
 969                 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
 970                 freemsg(so->so_oobmsg);
 971                 so->so_oobmsg = NULL;
 972                 ASSERT(so_verify_oobstate(so));
 973                 mutex_exit(&so->so_lock);
 974         }
 975         ASSERT(so->so_rcv_wakeup == B_FALSE);
 976 done:
 977         if (sodp != NULL) {
 978                 mutex_enter(&so->so_lock);
 979                 if (sodp->sod_enabled &&
 980                     (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
 981                         SOD_UIOAFINI(sodp);
 982                         if (sodp->sod_uioa.uioa_mbytes > 0) {
 983                                 ASSERT(so->so_rcv_q_head != NULL ||
 984                                     so->so_rcv_head != NULL);
 985                                 so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
 986                                 if (error == EWOULDBLOCK)
 987                                         error = 0;
 988                         }
 989                 }
 990                 mutex_exit(&so->so_lock);
 991         }
 992 #ifdef DEBUG
 993         if (so_debug_length) {
 994                 mutex_enter(&so->so_lock);
 995                 ASSERT(so_check_length(so));
 996                 mutex_exit(&so->so_lock);
 997         }
 998 #endif
 999         rvalp->r_val1 = more;
1000         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1001         return (error);
1002 }
1003 
1004 /*
1005  * Enqueue data from the protocol on the socket's rcv queue.
1006  *
1007  * We try to hook new M_DATA mblks onto an existing chain, however,
1008  * that cannot be done if the existing chain has already been
1009  * processed by I/OAT. Non-M_DATA mblks are just linked together via
1010  * b_next. In all cases the b_prev of the enqueued mblk is set to
1011  * point to the last mblk in its b_cont chain.
1012  */
1013 void
1014 so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
1015 {
1016         ASSERT(MUTEX_HELD(&so->so_lock));
1017 
1018 #ifdef DEBUG
1019         if (so_debug_length) {
1020                 ASSERT(so_check_length(so));
1021         }
1022 #endif
1023         so->so_rcv_queued += msg_size;
1024 
1025         if (so->so_rcv_head == NULL) {
1026                 ASSERT(so->so_rcv_last_head == NULL);
1027                 so->so_rcv_head = mp;
1028                 so->so_rcv_last_head = mp;
1029         } else if ((DB_TYPE(mp) == M_DATA &&
1030             DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
1031             ((DB_FLAGS(mp) & DBLK_UIOA) ==
1032             (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
1033                 /* Added to the end */
1034                 ASSERT(so->so_rcv_last_head != NULL);
1035                 ASSERT(so->so_rcv_last_head->b_prev != NULL);
1036                 so->so_rcv_last_head->b_prev->b_cont = mp;
1037         } else {
1038                 /* Start a new end */
1039                 so->so_rcv_last_head->b_next = mp;
1040                 so->so_rcv_last_head = mp;
1041         }
1042         while (mp->b_cont != NULL)
1043                 mp = mp->b_cont;
1044 
1045         so->so_rcv_last_head->b_prev = mp;
1046 #ifdef DEBUG
1047         if (so_debug_length) {
1048                 ASSERT(so_check_length(so));
1049         }
1050 #endif
1051 }
1052 
1053 /*
1054  * Return B_TRUE if there is data in the message, B_FALSE otherwise.
1055  */
1056 boolean_t
1057 somsghasdata(mblk_t *mp)
1058 {
1059         for (; mp; mp = mp->b_cont)
1060                 if (mp->b_datap->db_type == M_DATA) {
1061                         ASSERT(mp->b_wptr >= mp->b_rptr);
1062                         if (mp->b_wptr > mp->b_rptr)
1063                                 return (B_TRUE);
1064                 }
1065         return (B_FALSE);
1066 }
1067 
1068 /*
1069  * Flush the read side of sockfs.
1070  *
1071  * The caller must be sure that a reader is not already active when the
1072  * buffer is being flushed.
1073  */
1074 void
1075 so_rcv_flush(struct sonode *so)
1076 {
1077         mblk_t  *mp;
1078 
1079         ASSERT(MUTEX_HELD(&so->so_lock));
1080 
1081         if (so->so_oobmsg != NULL) {
1082                 freemsg(so->so_oobmsg);
1083                 so->so_oobmsg = NULL;
1084                 so->so_oobmark = 0;
1085                 so->so_state &=
1086                     ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
1087         }
1088 
1089         /*
1090          * Free messages sitting in the recv queues
1091          */
1092         while (so->so_rcv_q_head != NULL) {
1093                 mp = so->so_rcv_q_head;
1094                 so->so_rcv_q_head = mp->b_next;
1095                 mp->b_next = mp->b_prev = NULL;
1096                 freemsg(mp);
1097         }
1098         while (so->so_rcv_head != NULL) {
1099                 mp = so->so_rcv_head;
1100                 so->so_rcv_head = mp->b_next;
1101                 mp->b_next = mp->b_prev = NULL;
1102                 freemsg(mp);
1103         }
1104         so->so_rcv_queued = 0;
1105         so->so_rcv_q_head = NULL;
1106         so->so_rcv_q_last_head = NULL;
1107         so->so_rcv_head = NULL;
1108         so->so_rcv_last_head = NULL;
1109 }
1110 
1111 /*
1112  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
1113  */
1114 int
1115 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
1116     boolean_t oob_inline)
1117 {
1118         mblk_t          *mp, *nmp;
1119         int             error;
1120 
1121         dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
1122             flags));
1123 
1124         if (msg != NULL) {
1125                 /*
1126                  * There is never any oob data with addresses or control since
1127                  * the T_EXDATA_IND does not carry any options.
1128                  */
1129                 msg->msg_controllen = 0;
1130                 msg->msg_namelen = 0;
1131                 msg->msg_flags = 0;
1132         }
1133 
1134         mutex_enter(&so->so_lock);
1135         ASSERT(so_verify_oobstate(so));
1136         if (oob_inline ||
1137             (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
1138                 dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
1139                 mutex_exit(&so->so_lock);
1140                 return (EINVAL);
1141         }
1142         if (!(so->so_state & SS_HAVEOOBDATA)) {
1143                 dprintso(so, 1, ("sorecvoob: no data yet\n"));
1144                 mutex_exit(&so->so_lock);
1145                 return (EWOULDBLOCK);
1146         }
1147         ASSERT(so->so_oobmsg != NULL);
1148         mp = so->so_oobmsg;
1149         if (flags & MSG_PEEK) {
1150                 /*
1151                  * Since recv* can not return ENOBUFS we can not use dupmsg.
1152                  * Instead we revert to the consolidation private
1153                  * allocb_wait plus bcopy.
1154                  */
1155                 mblk_t *mp1;
1156 
1157                 mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
1158                 ASSERT(mp1);
1159 
1160                 while (mp != NULL) {
1161                         ssize_t size;
1162 
1163                         size = MBLKL(mp);
1164                         bcopy(mp->b_rptr, mp1->b_wptr, size);
1165                         mp1->b_wptr += size;
1166                         ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
1167                         mp = mp->b_cont;
1168                 }
1169                 mp = mp1;
1170         } else {
1171                 /*
1172                  * Update the state indicating that the data has been consumed.
1173                  * Keep SS_OOBPEND set until data is consumed past the mark.
1174                  */
1175                 so->so_oobmsg = NULL;
1176                 so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
1177         }
1178         ASSERT(so_verify_oobstate(so));
1179         mutex_exit(&so->so_lock);
1180 
1181         error = 0;
1182         nmp = mp;
1183         while (nmp != NULL && uiop->uio_resid > 0) {
1184                 ssize_t n = MBLKL(nmp);
1185 
1186                 n = MIN(n, uiop->uio_resid);
1187                 if (n > 0)
1188                         error = uiomove(nmp->b_rptr, n,
1189                             UIO_READ, uiop);
1190                 if (error)
1191                         break;
1192                 nmp = nmp->b_cont;
1193         }
1194         ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
1195         freemsg(mp);
1196         return (error);
1197 }
1198 
1199 /*
1200  * Allocate and initializ sonode
1201  */
1202 /* ARGSUSED */
1203 struct sonode *
1204 socket_sonode_create(struct sockparams *sp, int family, int type,
1205     int protocol, int version, int sflags, int *errorp, struct cred *cr)
1206 {
1207         sonode_t *so;
1208         int     kmflags;
1209 
1210         /*
1211          * Choose the right set of sonodeops based on the upcall and
1212          * down call version that the protocol has provided
1213          */
1214         if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
1215             SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
1216                 /*
1217                  * mismatch
1218                  */
1219 #ifdef DEBUG
1220                 cmn_err(CE_CONT, "protocol and socket module version mismatch");
1221 #endif
1222                 *errorp = EINVAL;
1223                 return (NULL);
1224         }
1225 
1226         kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1227 
1228         so = kmem_cache_alloc(socket_cache, kmflags);
1229         if (so == NULL) {
1230                 *errorp = ENOMEM;
1231                 return (NULL);
1232         }
1233 
1234         sonode_init(so, sp, family, type, protocol, &so_sonodeops);
1235 
1236         if (version == SOV_DEFAULT)
1237                 version = so_default_version;
1238 
1239         so->so_version = (short)version;
1240 
1241         /*
1242          * set the default values to be INFPSZ
1243          * if a protocol desires it can change the value later
1244          */
1245         so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
1246         so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
1247         so->so_proto_props.sopp_maxpsz = INFPSZ;
1248         so->so_proto_props.sopp_maxblk = INFPSZ;
1249 
1250         return (so);
1251 }
1252 
/*
 * Common second-stage initialization of a sonode.
 *
 * Passive open (pso != NULL): copy the inheritable state from the
 * listener pso while holding pso->so_lock, then inherit the listener's
 * socket filters if it has any active ones.
 *
 * Active open (pso == NULL): attach automatic filters, create the
 * protocol's lower handle through smod_proto_create_func, activate it
 * with the so_upcalls vector and, when the requested protocol differs
 * from the sockparams entry's protocol, issue an SO_PROTOTYPE setsockopt.
 *
 * On success the socket is also initialized for sodirect when
 * uioasync.enabled is set, and an extra vnode hold is taken on behalf of
 * the protocol.
 *
 * Returns 0 on success, otherwise an errno value. No vnode hold is taken
 * on any failure path.
 */
int
socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
{
        int error = 0;

        if (pso != NULL) {
                /*
                 * We have a passive open, so inherit basic state from
                 * the parent (listener).
                 *
                 * No need to grab the new sonode's lock, since there is no
                 * one that can have a reference to it.
                 */
                mutex_enter(&pso->so_lock);

                /* Only SS_ASYNC is carried over from the listener's state. */
                so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
                so->so_pgrp = pso->so_pgrp;
                so->so_rcvtimeo = pso->so_rcvtimeo;
                so->so_sndtimeo = pso->so_sndtimeo;
                so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
                /*
                 * Make note of the socket level options. TCP and IP level
                 * options are already inherited. We could do all this after
                 * accept is successful but doing it here simplifies code and
                 * no harm done for error case.
                 */
                so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
                    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
                    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
                so->so_proto_props = pso->so_proto_props;
                so->so_mode = pso->so_mode;
                so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;

                mutex_exit(&pso->so_lock);

                /*
                 * If the parent has any filters, try to inherit them.
                 * Note that so_filter_active is read after pso->so_lock
                 * has been dropped.
                 */
                if (pso->so_filter_active > 0 &&
                    (error = sof_sonode_inherit_filters(so, pso)) != 0)
                        return (error);

        } else {
                struct sockparams *sp = so->so_sockparams;
                sock_upcalls_t *upcalls_to_use;

                /*
                 * Attach automatic filters, if there are any.
                 */
                if (!list_is_empty(&sp->sp_auto_filters) &&
                    (error = sof_sonode_autoattach_filters(so, cr)) != 0)
                        return (error);

                /* OK to attach filters */
                so->so_state |= SS_FILOP_OK;

                /*
                 * Based on the version number select the right upcalls to
                 * pass down. Currently we only have one version so choose
                 * default
                 */
                upcalls_to_use = &so_upcalls;

                /*
                 * active open, so create a lower handle; the protocol also
                 * fills in so_downcalls, so_mode and error.
                 */
                so->so_proto_handle =
                    sp->sp_smod_info->smod_proto_create_func(so->so_family,
                    so->so_type, so->so_protocol, &so->so_downcalls,
                    &so->so_mode, &error, flags, cr);

                if (so->so_proto_handle == NULL) {
                        ASSERT(error != 0);
                        /*
                         * To be safe; if a lower handle cannot be created, and
                         * the proto does not give a reason why, assume there
                         * was a lack of memory.
                         */
                        return ((error == 0) ? ENOMEM : error);
                }
                ASSERT(so->so_downcalls != NULL);
                ASSERT(so->so_downcalls->sd_send != NULL ||
                    so->so_downcalls->sd_send_uio != NULL);
                /*
                 * A protocol that supplies sd_recv_uio must also supply
                 * sd_poll; poll events are then always passed on
                 * (SO_POLLEV_ALWAYS).
                 */
                if (so->so_downcalls->sd_recv_uio != NULL) {
                        ASSERT(so->so_downcalls->sd_poll != NULL);
                        so->so_pollev |= SO_POLLEV_ALWAYS;
                }

                (*so->so_downcalls->sd_activate)(so->so_proto_handle,
                    (sock_upper_handle_t)so, upcalls_to_use, 0, cr);

                /* Wildcard */

                /*
                 * FIXME No need for this, the protocol can deal with it in
                 * sd_create(). Should update ICMP.
                 */
                if (so->so_protocol != so->so_sockparams->sp_protocol) {
                        int protocol = so->so_protocol;
                        /*
                         * NOTE(review): this declaration shadows the
                         * function-scope `error`. It is harmless today
                         * because the outer variable is not read after
                         * this point, but it is easy to trip over.
                         */
                        int error;
                        /*
                         * Issue SO_PROTOTYPE setsockopt.
                         */
                        error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
                            &protocol, (t_uscalar_t)sizeof (protocol), cr);
                        if (error) {
                                /*
                                 * Undo the activation: close the lower
                                 * handle and discard anything that may
                                 * already have been queued.
                                 */
                                (void) (*so->so_downcalls->sd_close)
                                    (so->so_proto_handle, 0, cr);

                                mutex_enter(&so->so_lock);
                                so_rcv_flush(so);
                                mutex_exit(&so->so_lock);
                                /*
                                 * Setsockopt often fails with ENOPROTOOPT but
                                 * socket() should fail with
                                 * EPROTONOSUPPORT/EPROTOTYPE.
                                 */
                                return (EPROTONOSUPPORT);
                        }
                }
        }

        if (uioasync.enabled)
                sod_sock_init(so);

        /* put an extra reference on the socket for the protocol */
        VN_HOLD(SOTOV(so));

        return (0);
}
1381 
1382 /*
1383  * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1384  *         struct cred *cr, int32_t *rvalp)
1385  *
1386  * Handle ioctls that manipulate basic socket state; non-blocking,
1387  * async, etc.
1388  *
1389  * Returns:
1390  *   < 0  - ioctl was not handle
1391  *  >= 0  - ioctl was handled, if > 0, then it is an errno
1392  *
1393  * Notes:
1394  *   Assumes the standard receive buffer is used to obtain info for
1395  *   NREAD.
1396  */
1397 /* ARGSUSED */
1398 int
1399 socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1400     struct cred *cr, int32_t *rvalp)
1401 {
1402         switch (cmd) {
1403         case SIOCSQPTR:
1404                 /*
1405                  * SIOCSQPTR is valid only when helper stream is created
1406                  * by the protocol.
1407                  */
1408 
1409                 return (EOPNOTSUPP);
1410         case FIONBIO: {
1411                 int32_t value;
1412 
1413                 if (so_copyin((void *)arg, &value, sizeof (int32_t),
1414                     (mode & (int)FKIOCTL)))
1415                         return (EFAULT);
1416 
1417                 mutex_enter(&so->so_lock);
1418                 if (value) {
1419                         so->so_state |= SS_NDELAY;
1420                 } else {
1421                         so->so_state &= ~SS_NDELAY;
1422                 }
1423                 mutex_exit(&so->so_lock);
1424                 return (0);
1425         }
1426         case FIOASYNC: {
1427                 int32_t value;
1428 
1429                 if (so_copyin((void *)arg, &value, sizeof (int32_t),
1430                     (mode & (int)FKIOCTL)))
1431                         return (EFAULT);
1432 
1433                 mutex_enter(&so->so_lock);
1434 
1435                 if (value) {
1436                         /* Turn on SIGIO */
1437                         so->so_state |= SS_ASYNC;
1438                 } else {
1439                         /* Turn off SIGIO */
1440                         so->so_state &= ~SS_ASYNC;
1441                 }
1442                 mutex_exit(&so->so_lock);
1443 
1444                 return (0);
1445         }
1446 
1447         case SIOCSPGRP:
1448         case FIOSETOWN: {
1449                 int error;
1450                 pid_t pid;
1451 
1452                 if (so_copyin((void *)arg, &pid, sizeof (pid_t),
1453                     (mode & (int)FKIOCTL)))
1454                         return (EFAULT);
1455 
1456                 mutex_enter(&so->so_lock);
1457                 error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
1458                 mutex_exit(&so->so_lock);
1459                 return (error);
1460         }
1461         case SIOCGPGRP:
1462         case FIOGETOWN:
1463                 if (so_copyout(&so->so_pgrp, (void *)arg,
1464                     sizeof (pid_t), (mode & (int)FKIOCTL)))
1465                         return (EFAULT);
1466 
1467                 return (0);
1468         case SIOCATMARK: {
1469                 int retval;
1470 
1471                 /*
1472                  * Only protocols that support urgent data can handle ATMARK.
1473                  */
1474                 if ((so->so_mode & SM_EXDATA) == 0)
1475                         return (EINVAL);
1476 
1477                 /*
1478                  * If the protocol is maintaining its own buffer, then the
1479                  * request must be passed down.
1480                  */
1481                 if (so->so_downcalls->sd_recv_uio != NULL)
1482                         return (-1);
1483 
1484                 retval = (so->so_state & SS_RCVATMARK) != 0;
1485 
1486                 if (so_copyout(&retval, (void *)arg, sizeof (int),
1487                     (mode & (int)FKIOCTL))) {
1488                         return (EFAULT);
1489                 }
1490                 return (0);
1491         }
1492 
1493         case FIONREAD: {
1494                 int retval;
1495 
1496                 /*
1497                  * If the protocol is maintaining its own buffer, then the
1498                  * request must be passed down.
1499                  */
1500                 if (so->so_downcalls->sd_recv_uio != NULL)
1501                         return (-1);
1502 
1503                 retval = MIN(so->so_rcv_queued, INT_MAX);
1504 
1505                 if (so_copyout(&retval, (void *)arg,
1506                     sizeof (retval), (mode & (int)FKIOCTL))) {
1507                         return (EFAULT);
1508                 }
1509                 return (0);
1510         }
1511 
1512         case _I_GETPEERCRED: {
1513                 int error = 0;
1514 
1515                 if ((mode & FKIOCTL) == 0)
1516                         return (EINVAL);
1517 
1518                 mutex_enter(&so->so_lock);
1519                 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
1520                         error = ENOTSUP;
1521                 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
1522                         error = ENOTCONN;
1523                 } else if (so->so_peercred != NULL) {
1524                         k_peercred_t *kp = (k_peercred_t *)arg;
1525                         kp->pc_cr = so->so_peercred;
1526                         kp->pc_cpid = so->so_cpid;
1527                         crhold(so->so_peercred);
1528                 } else {
1529                         error = EINVAL;
1530                 }
1531                 mutex_exit(&so->so_lock);
1532                 return (error);
1533         }
1534         default:
1535                 return (-1);
1536         }
1537 }
1538 
1539 /*
1540  * Handle the I_NREAD STREAM ioctl.
1541  */
1542 static int
1543 so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
1544 {
1545         size_t size = 0;
1546         int retval;
1547         int count = 0;
1548         mblk_t *mp;
1549         clock_t wakeup = drv_usectohz(10);
1550 
1551         if (so->so_downcalls == NULL ||
1552             so->so_downcalls->sd_recv_uio != NULL)
1553                 return (EINVAL);
1554 
1555         mutex_enter(&so->so_lock);
1556         /* Wait for reader to get out of the way. */
1557         while (so->so_flag & SOREADLOCKED) {
1558                 /*
1559                  * If reader is waiting for data, then there should be nothing
1560                  * on the rcv queue.
1561                  */
1562                 if (so->so_rcv_wakeup)
1563                         goto out;
1564 
1565                 /* Do a timed sleep, in case the reader goes to sleep. */
1566                 (void) cv_reltimedwait(&so->so_read_cv, &so->so_lock, wakeup,
1567                     TR_CLOCK_TICK);
1568         }
1569 
1570         /*
1571          * Since we are holding so_lock no new reader will come in, and the
1572          * protocol will not be able to enqueue data. So it's safe to walk
1573          * both rcv queues.
1574          */
1575         mp = so->so_rcv_q_head;
1576         if (mp != NULL) {
1577                 size = msgdsize(so->so_rcv_q_head);
1578                 for (; mp != NULL; mp = mp->b_next)
1579                         count++;
1580         } else {
1581                 /*
1582                  * In case the processing list was empty, get the size of the
1583                  * next msg in line.
1584                  */
1585                 size = msgdsize(so->so_rcv_head);
1586         }
1587 
1588         for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
1589                 count++;
1590 out:
1591         mutex_exit(&so->so_lock);
1592 
1593         /*
1594          * Drop down from size_t to the "int" required by the
1595          * interface.  Cap at INT_MAX.
1596          */
1597         retval = MIN(size, INT_MAX);
1598         if (so_copyout(&retval, (void *)arg, sizeof (retval),
1599             (mode & (int)FKIOCTL))) {
1600                 return (EFAULT);
1601         } else {
1602                 *rvalp = count;
1603                 return (0);
1604         }
1605 }
1606 
1607 /*
1608  * Process STREAM ioctls.
1609  *
1610  * Returns:
1611  *   < 0  - ioctl was not handle
1612  *  >= 0  - ioctl was handled, if > 0, then it is an errno
1613  */
1614 int
1615 socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1616     struct cred *cr, int32_t *rvalp)
1617 {
1618         int retval;
1619 
1620         /* Only STREAM iotcls are handled here */
1621         if ((cmd & 0xffffff00U) != STR)
1622                 return (-1);
1623 
1624         switch (cmd) {
1625         case I_CANPUT:
1626                 /*
1627                  * We return an error for I_CANPUT so that isastream(3C) will
1628                  * not report the socket as being a STREAM.
1629                  */
1630                 return (EOPNOTSUPP);
1631         case I_NREAD:
1632                 /* Avoid doing a fallback for I_NREAD. */
1633                 return (so_strioc_nread(so, arg, mode, rvalp));
1634         case I_LOOK:
1635                 /* Avoid doing a fallback for I_LOOK. */
1636                 if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
1637                     (mode & (int)FKIOCTL))) {
1638                         return (EFAULT);
1639                 }
1640                 return (0);
1641         default:
1642                 break;
1643         }
1644 
1645         /*
1646          * Try to fall back to TPI, and if successful, reissue the ioctl.
1647          */
1648         if ((retval = so_tpi_fallback(so, cr)) == 0) {
1649                 /* Reissue the ioctl */
1650                 ASSERT(so->so_rcv_q_head == NULL);
1651                 return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
1652         } else {
1653                 return (retval);
1654         }
1655 }
1656 
1657 /*
1658  * This is called for all socket types to verify that the buffer size is large
1659  * enough for the option, and if we can, handle the request as well. Most
1660  * options will be forwarded to the protocol.
1661  */
1662 int
1663 socket_getopt_common(struct sonode *so, int level, int option_name,
1664     void *optval, socklen_t *optlenp, int flags)
1665 {
1666         if (level != SOL_SOCKET)
1667                 return (-1);
1668 
1669         switch (option_name) {
1670         case SO_ERROR:
1671         case SO_DOMAIN:
1672         case SO_TYPE:
1673         case SO_ACCEPTCONN: {
1674                 int32_t value;
1675                 socklen_t optlen = *optlenp;
1676 
1677                 if (optlen < (t_uscalar_t)sizeof (int32_t)) {
1678                         return (EINVAL);
1679                 }
1680 
1681                 switch (option_name) {
1682                 case SO_ERROR:
1683                         mutex_enter(&so->so_lock);
1684                         value = sogeterr(so, B_TRUE);
1685                         mutex_exit(&so->so_lock);
1686                         break;
1687                 case SO_DOMAIN:
1688                         value = so->so_family;
1689                         break;
1690                 case SO_TYPE:
1691                         value = so->so_type;
1692                         break;
1693                 case SO_ACCEPTCONN:
1694                         if (so->so_state & SS_ACCEPTCONN)
1695                                 value = SO_ACCEPTCONN;
1696                         else
1697                                 value = 0;
1698                         break;
1699                 }
1700 
1701                 bcopy(&value, optval, sizeof (value));
1702                 *optlenp = sizeof (value);
1703 
1704                 return (0);
1705         }
1706         case SO_SNDTIMEO:
1707         case SO_RCVTIMEO: {
1708                 clock_t value;
1709                 socklen_t optlen = *optlenp;
1710 
1711                 if (get_udatamodel() == DATAMODEL_NONE ||
1712                     get_udatamodel() == DATAMODEL_NATIVE) {
1713                         if (optlen < sizeof (struct timeval))
1714                                 return (EINVAL);
1715                 } else {
1716                         if (optlen < sizeof (struct timeval32))
1717                                 return (EINVAL);
1718                 }
1719                 if (option_name == SO_RCVTIMEO)
1720                         value = drv_hztousec(so->so_rcvtimeo);
1721                 else
1722                         value = drv_hztousec(so->so_sndtimeo);
1723 
1724                 if (get_udatamodel() == DATAMODEL_NONE ||
1725                     get_udatamodel() == DATAMODEL_NATIVE) {
1726                         ((struct timeval *)(optval))->tv_sec =
1727                             value / (1000 * 1000);
1728                         ((struct timeval *)(optval))->tv_usec =
1729                             value % (1000 * 1000);
1730                         *optlenp = sizeof (struct timeval);
1731                 } else {
1732                         ((struct timeval32 *)(optval))->tv_sec =
1733                             value / (1000 * 1000);
1734                         ((struct timeval32 *)(optval))->tv_usec =
1735                             value % (1000 * 1000);
1736                         *optlenp = sizeof (struct timeval32);
1737                 }
1738                 return (0);
1739         }
1740         case SO_DEBUG:
1741         case SO_REUSEADDR:
1742         case SO_KEEPALIVE:
1743         case SO_DONTROUTE:
1744         case SO_BROADCAST:
1745         case SO_USELOOPBACK:
1746         case SO_OOBINLINE:
1747         case SO_SNDBUF:
1748 #ifdef notyet
1749         case SO_SNDLOWAT:
1750         case SO_RCVLOWAT:
1751 #endif /* notyet */
1752         case SO_DGRAM_ERRIND: {
1753                 socklen_t optlen = *optlenp;
1754 
1755                 if (optlen < (t_uscalar_t)sizeof (int32_t))
1756                         return (EINVAL);
1757                 break;
1758         }
1759         case SO_RCVBUF: {
1760                 socklen_t optlen = *optlenp;
1761 
1762                 if (optlen < (t_uscalar_t)sizeof (int32_t))
1763                         return (EINVAL);
1764 
1765                 if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
1766                         /*
1767                          * XXX If SO_RCVBUF has been set and this is an
1768                          * XPG 4.2 application then do not ask the transport
1769                          * since the transport might adjust the value and not
1770                          * return exactly what was set by the application.
1771                          * For non-XPG 4.2 application we return the value
1772                          * that the transport is actually using.
1773                          */
1774                         *(int32_t *)optval = so->so_xpg_rcvbuf;
1775                         *optlenp = sizeof (so->so_xpg_rcvbuf);
1776                         return (0);
1777                 }
1778                 /*
1779                  * If the option has not been set then get a default
1780                  * value from the transport.
1781                  */
1782                 break;
1783         }
1784         case SO_LINGER: {
1785                 socklen_t optlen = *optlenp;
1786 
1787                 if (optlen < (t_uscalar_t)sizeof (struct linger))
1788                         return (EINVAL);
1789                 break;
1790         }
1791         case SO_SND_BUFINFO: {
1792                 socklen_t optlen = *optlenp;
1793 
1794                 if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
1795                         return (EINVAL);
1796                 ((struct so_snd_bufinfo *)(optval))->sbi_wroff =
1797                     (so->so_proto_props).sopp_wroff;
1798                 ((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
1799                     (so->so_proto_props).sopp_maxblk;
1800                 ((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
1801                     (so->so_proto_props).sopp_maxpsz;
1802                 ((struct so_snd_bufinfo *)(optval))->sbi_tail =
1803                     (so->so_proto_props).sopp_tail;
1804                 *optlenp = sizeof (struct so_snd_bufinfo);
1805                 return (0);
1806         }
1807         case SO_SND_COPYAVOID: {
1808                 sof_instance_t *inst;
1809 
1810                 /*
1811                  * Avoid zero-copy if there is a filter with a data_out
1812                  * callback. We could let the operation succeed, but then
1813                  * the filter would have to copy the data anyway.
1814                  */
1815                 for (inst = so->so_filter_top; inst != NULL;
1816                     inst = inst->sofi_next) {
1817                         if (SOF_INTERESTED(inst, data_out))
1818                                 return (EOPNOTSUPP);
1819                 }
1820                 break;
1821         }
1822 
1823         default:
1824                 break;
1825         }
1826 
1827         /* Unknown Option */
1828         return (-1);
1829 }
1830 
1831 void
1832 socket_sonode_destroy(struct sonode *so)
1833 {
1834         sonode_fini(so);
1835         kmem_cache_free(socket_cache, so);
1836 }
1837 
1838 int
1839 so_zcopy_wait(struct sonode *so)
1840 {
1841         int error = 0;
1842 
1843         mutex_enter(&so->so_lock);
1844         while (!(so->so_copyflag & STZCNOTIFY)) {
1845                 if (so->so_state & SS_CLOSING) {
1846                         mutex_exit(&so->so_lock);
1847                         return (EINTR);
1848                 }
1849                 if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
1850                         error = EINTR;
1851                         break;
1852                 }
1853         }
1854         so->so_copyflag &= ~STZCNOTIFY;
1855         mutex_exit(&so->so_lock);
1856         return (error);
1857 }
1858 
1859 void
1860 so_timer_callback(void *arg)
1861 {
1862         struct sonode *so = (struct sonode *)arg;
1863 
1864         mutex_enter(&so->so_lock);
1865 
1866         so->so_rcv_timer_tid = 0;
1867         if (so->so_rcv_queued > 0) {
1868                 so_notify_data(so, so->so_rcv_queued);
1869         } else {
1870                 mutex_exit(&so->so_lock);
1871         }
1872 }
1873 
1874 #ifdef DEBUG
1875 /*
1876  * Verify that the length stored in so_rcv_queued and the length of data blocks
1877  * queued is same.
1878  */
1879 static boolean_t
1880 so_check_length(sonode_t *so)
1881 {
1882         mblk_t *mp = so->so_rcv_q_head;
1883         int len = 0;
1884 
1885         ASSERT(MUTEX_HELD(&so->so_lock));
1886 
1887         if (mp != NULL) {
1888                 len = msgdsize(mp);
1889                 while ((mp = mp->b_next) != NULL)
1890                         len += msgdsize(mp);
1891         }
1892         mp = so->so_rcv_head;
1893         if (mp != NULL) {
1894                 len += msgdsize(mp);
1895                 while ((mp = mp->b_next) != NULL)
1896                         len += msgdsize(mp);
1897         }
1898         return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
1899 }
1900 #endif
1901 
1902 int
1903 so_get_mod_version(struct sockparams *sp)
1904 {
1905         ASSERT(sp != NULL && sp->sp_smod_info != NULL);
1906         return (sp->sp_smod_info->smod_version);
1907 }
1908 
1909 /*
1910  * so_start_fallback()
1911  *
1912  * Block new socket operations from coming in, and wait for active operations
1913  * to complete. Threads that are sleeping will be woken up so they can get
1914  * out of the way.
1915  *
1916  * The caller must be a reader on so_fallback_rwlock.
1917  */
1918 static boolean_t
1919 so_start_fallback(struct sonode *so)
1920 {
1921         ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
1922 
1923         mutex_enter(&so->so_lock);
1924         if (so->so_state & SS_FALLBACK_PENDING) {
1925                 mutex_exit(&so->so_lock);
1926                 return (B_FALSE);
1927         }
1928         so->so_state |= SS_FALLBACK_PENDING;
1929         /*
1930          * Poke all threads that might be sleeping. Any operation that comes
1931          * in after the cv_broadcast will observe the fallback pending flag
1932          * which cause the call to return where it would normally sleep.
1933          */
1934         cv_broadcast(&so->so_state_cv);          /* threads in connect() */
1935         cv_broadcast(&so->so_rcv_cv);            /* threads in recvmsg() */
1936         cv_broadcast(&so->so_snd_cv);            /* threads in sendmsg() */
1937         mutex_enter(&so->so_acceptq_lock);
1938         cv_broadcast(&so->so_acceptq_cv);        /* threads in accept() */
1939         mutex_exit(&so->so_acceptq_lock);
1940         mutex_exit(&so->so_lock);
1941 
1942         /*
1943          * The main reason for the rw_tryupgrade call is to provide
1944          * observability during the fallback process. We want to
1945          * be able to see if there are pending operations.
1946          */
1947         if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
1948                 /*
1949                  * It is safe to drop and reaquire the fallback lock, because
1950                  * we are guaranteed that another fallback cannot take place.
1951                  */
1952                 rw_exit(&so->so_fallback_rwlock);
1953                 DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
1954                 rw_enter(&so->so_fallback_rwlock, RW_WRITER);
1955                 DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
1956         }
1957 
1958         return (B_TRUE);
1959 }
1960 
1961 /*
1962  * so_end_fallback()
1963  *
1964  * Allow socket opertions back in.
1965  *
1966  * The caller must be a writer on so_fallback_rwlock.
1967  */
1968 static void
1969 so_end_fallback(struct sonode *so)
1970 {
1971         ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
1972 
1973         mutex_enter(&so->so_lock);
1974         so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
1975         mutex_exit(&so->so_lock);
1976 
1977         rw_downgrade(&so->so_fallback_rwlock);
1978 }
1979 
1980 /*
1981  * so_quiesced_cb()
1982  *
1983  * Callback passed to the protocol during fallback. It is called once
1984  * the endpoint is quiescent.
1985  *
1986  * No requests from the user, no notifications from the protocol, so it
1987  * is safe to synchronize the state. Data can also be moved without
1988  * risk for reordering.
1989  *
1990  * We do not need to hold so_lock, since there can be only one thread
1991  * operating on the sonode.
1992  */
1993 static mblk_t *
1994 so_quiesced_cb(sock_upper_handle_t sock_handle, sock_quiesce_arg_t *arg,
1995     struct T_capability_ack *tcap,
1996     struct sockaddr *laddr, socklen_t laddrlen,
1997     struct sockaddr *faddr, socklen_t faddrlen, short opts)
1998 {
1999         struct sonode *so = (struct sonode *)sock_handle;
2000         boolean_t atmark;
2001         mblk_t *retmp = NULL, **tailmpp = &retmp;
2002 
2003         if (tcap != NULL)
2004                 sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen,
2005                     opts);
2006 
2007         /*
2008          * Some protocols do not quiece the data path during fallback. Once
2009          * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
2010          * fail and the protocol is responsible for saving the data for later
2011          * delivery (i.e., once the fallback has completed).
2012          */
2013         mutex_enter(&so->so_lock);
2014         so->so_state |= SS_FALLBACK_DRAIN;
2015         SOCKET_TIMER_CANCEL(so);
2016         mutex_exit(&so->so_lock);
2017 
2018         if (so->so_rcv_head != NULL) {
2019                 if (so->so_rcv_q_last_head == NULL)
2020                         so->so_rcv_q_head = so->so_rcv_head;
2021                 else
2022                         so->so_rcv_q_last_head->b_next = so->so_rcv_head;
2023                 so->so_rcv_q_last_head = so->so_rcv_last_head;
2024         }
2025 
2026         atmark = (so->so_state & SS_RCVATMARK) != 0;
2027         /*
2028          * Clear any OOB state having to do with pending data. The TPI
2029          * code path will set the appropriate oob state when we move the
2030          * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
2031          * data has already been consumed.
2032          */
2033         so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);
2034 
2035         ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);
2036 
2037         /*
2038          * Move data to the STREAM head.
2039          */
2040         while (so->so_rcv_q_head != NULL) {
2041                 mblk_t *mp = so->so_rcv_q_head;
2042                 size_t mlen = msgdsize(mp);
2043 
2044                 so->so_rcv_q_head = mp->b_next;
2045                 mp->b_next = NULL;
2046                 mp->b_prev = NULL;
2047 
2048                 /*
2049                  * Send T_EXDATA_IND if we are at the oob mark.
2050                  */
2051                 if (atmark) {
2052                         struct T_exdata_ind *tei;
2053                         mblk_t *mp1 = arg->soqa_exdata_mp;
2054 
2055                         arg->soqa_exdata_mp = NULL;
2056                         ASSERT(mp1 != NULL);
2057                         mp1->b_datap->db_type = M_PROTO;
2058                         tei = (struct T_exdata_ind *)mp1->b_rptr;
2059                         tei->PRIM_type = T_EXDATA_IND;
2060                         tei->MORE_flag = 0;
2061                         mp1->b_wptr = (uchar_t *)&tei[1];
2062 
2063                         if (IS_SO_OOB_INLINE(so)) {
2064                                 mp1->b_cont = mp;
2065                         } else {
2066                                 ASSERT(so->so_oobmsg != NULL);
2067                                 mp1->b_cont = so->so_oobmsg;
2068                                 so->so_oobmsg = NULL;
2069 
2070                                 /* process current mp next time around */
2071                                 mp->b_next = so->so_rcv_q_head;
2072                                 so->so_rcv_q_head = mp;
2073                                 mlen = 0;
2074                         }
2075                         mp = mp1;
2076 
2077                         /* we have consumed the oob mark */
2078                         atmark = B_FALSE;
2079                 } else if (so->so_oobmark > 0) {
2080                         /*
2081                          * Check if the OOB mark is within the current
2082                          * mblk chain. In that case we have to split it up.
2083                          */
2084                         if (so->so_oobmark < mlen) {
2085                                 mblk_t *urg_mp = mp;
2086 
2087                                 atmark = B_TRUE;
2088                                 mp = NULL;
2089                                 mlen = so->so_oobmark;
2090 
2091                                 /*
2092                                  * It is assumed that the OOB mark does
2093                                  * not land within a mblk.
2094                                  */
2095                                 do {
2096                                         so->so_oobmark -= MBLKL(urg_mp);
2097                                         mp = urg_mp;
2098                                         urg_mp = urg_mp->b_cont;
2099                                 } while (so->so_oobmark > 0);
2100                                 mp->b_cont = NULL;
2101                                 if (urg_mp != NULL) {
2102                                         urg_mp->b_next = so->so_rcv_q_head;
2103                                         so->so_rcv_q_head = urg_mp;
2104                                 }
2105                         } else {
2106                                 so->so_oobmark -= mlen;
2107                                 if (so->so_oobmark == 0)
2108                                         atmark = B_TRUE;
2109                         }
2110                 }
2111 
2112                 /*
2113                  * Queue data on the STREAM head.
2114                  */
2115                 so->so_rcv_queued -= mlen;
2116                 *tailmpp = mp;
2117                 tailmpp = &mp->b_next;
2118         }
2119         so->so_rcv_head = NULL;
2120         so->so_rcv_last_head = NULL;
2121         so->so_rcv_q_head = NULL;
2122         so->so_rcv_q_last_head = NULL;
2123 
2124         /*
2125          * Check if the oob byte is at the end of the data stream, or if the
2126          * oob byte has not yet arrived. In the latter case we have to send a
2127          * SIGURG and a mark indicator to the STREAM head. The mark indicator
2128          * is needed to guarantee correct behavior for SIOCATMARK. See block
2129          * comment in socktpi.h for more details.
2130          */
2131         if (atmark || so->so_oobmark > 0) {
2132                 mblk_t *mp;
2133 
2134                 if (atmark && so->so_oobmsg != NULL) {
2135                         struct T_exdata_ind *tei;
2136 
2137                         mp = arg->soqa_exdata_mp;
2138                         arg->soqa_exdata_mp = NULL;
2139                         ASSERT(mp != NULL);
2140                         mp->b_datap->db_type = M_PROTO;
2141                         tei = (struct T_exdata_ind *)mp->b_rptr;
2142                         tei->PRIM_type = T_EXDATA_IND;
2143                         tei->MORE_flag = 0;
2144                         mp->b_wptr = (uchar_t *)&tei[1];
2145 
2146                         mp->b_cont = so->so_oobmsg;
2147                         so->so_oobmsg = NULL;
2148 
2149                         *tailmpp = mp;
2150                         tailmpp = &mp->b_next;
2151                 } else {
2152                         /* Send up the signal */
2153                         mp = arg->soqa_exdata_mp;
2154                         arg->soqa_exdata_mp = NULL;
2155                         ASSERT(mp != NULL);
2156                         DB_TYPE(mp) = M_PCSIG;
2157                         *mp->b_wptr++ = (uchar_t)SIGURG;
2158                         *tailmpp = mp;
2159                         tailmpp = &mp->b_next;
2160 
2161                         /* Send up the mark indicator */
2162                         mp = arg->soqa_urgmark_mp;
2163                         arg->soqa_urgmark_mp = NULL;
2164                         mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
2165                         *tailmpp = mp;
2166                         tailmpp = &mp->b_next;
2167 
2168                         so->so_oobmark = 0;
2169                 }
2170         }
2171         ASSERT(so->so_oobmark == 0);
2172         ASSERT(so->so_rcv_queued == 0);
2173 
2174         return (retmp);
2175 }
2176 
2177 #ifdef DEBUG
/*
 * Do an integrity check of the sonode. This should be done if a
 * fallback fails after the sonode has initially been converted to use
 * TPI and subsequently has to be reverted.
 *
 * `cur' is the sonode as it looks now; `orig' is the copy to compare it
 * against (presumably the snapshot so_tpi_fallback() takes in DEBUG builds
 * before starting the conversion -- confirm against the caller).
 *
 * Fields the protocol may legitimately have changed in the meantime are
 * either skipped or checked with a relaxed comparison; everything else
 * must match exactly.
 *
 * Failure to pass the integrity check will panic the system.
 */
void
so_integrity_check(struct sonode *cur, struct sonode *orig)
{
        VERIFY(cur->so_vnode == orig->so_vnode);
        VERIFY(cur->so_ops == orig->so_ops);
        /*
         * For so_state we can only VERIFY the state flags in CHECK_STATE.
         * The other state flags might be affected by a notification from the
         * protocol. The check is one-directional: every CHECK_STATE flag
         * that was set in orig must still be set in cur.
         */
#define CHECK_STATE     (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
        SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
        SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
        VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
            (orig->so_state & CHECK_STATE));
        VERIFY(cur->so_mode == orig->so_mode);
        VERIFY(cur->so_flag == orig->so_flag);
        VERIFY(cur->so_count == orig->so_count);
        /* Cannot VERIFY so_proto_connid; proto can update it */
        VERIFY(cur->so_sockparams == orig->so_sockparams);
        /* an error might have been recorded, but it cannot be lost */
        VERIFY(cur->so_error != 0 || orig->so_error == 0);
        VERIFY(cur->so_family == orig->so_family);
        VERIFY(cur->so_type == orig->so_type);
        VERIFY(cur->so_protocol == orig->so_protocol);
        VERIFY(cur->so_version == orig->so_version);
        /* New conns might have arrived, but none should have been lost */
        VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
        VERIFY(list_head(&cur->so_acceptq_list) ==
            list_head(&orig->so_acceptq_list));
        VERIFY(cur->so_backlog == orig->so_backlog);
        /* New OOB might have arrived, but mark should not have been lost */
        VERIFY(cur->so_oobmark >= orig->so_oobmark);
        /* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
        VERIFY(cur->so_pgrp == orig->so_pgrp);
        VERIFY(cur->so_peercred == orig->so_peercred);
        VERIFY(cur->so_cpid == orig->so_cpid);
        VERIFY(cur->so_zoneid == orig->so_zoneid);
        /* New data might have arrived, but none should have been lost */
        VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
        VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
        VERIFY(cur->so_rcv_head == orig->so_rcv_head);
        VERIFY(cur->so_proto_handle == orig->so_proto_handle);
        VERIFY(cur->so_downcalls == orig->so_downcalls);
        /* Cannot VERIFY so_proto_props; they can be updated by proto */
}
2231 #endif
2232 
/*
 * so_tpi_fallback()
 *
 * This is the fallback initiation routine; things start here.
 *
 * Basic strategy:
 *   o Block new socket operations from coming in
 *   o Allocate/initialize info needed by TPI
 *   o Quiesce the connection, at which point we sync
 *     state and move data
 *   o Change operations (sonodeops) associated with the socket
 *   o Unblock threads waiting for the fallback to finish
 */
2246 int
2247 so_tpi_fallback(struct sonode *so, struct cred *cr)
2248 {
2249         int error;
2250         queue_t *q;
2251         struct sockparams *sp;
2252         struct sockparams *newsp = NULL;
2253         so_proto_fallback_func_t fbfunc;
2254         const char *devpath;
2255         boolean_t direct;
2256         struct sonode *nso;
2257         sock_quiesce_arg_t arg = { NULL, NULL };
2258 #ifdef DEBUG
2259         struct sonode origso;
2260 #endif
2261         error = 0;
2262         sp = so->so_sockparams;
2263         fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
2264 
2265         /*
2266          * Cannot fallback if the socket has active filters
2267          */
2268         if (so->so_filter_active > 0)
2269                 return (EINVAL);
2270 
2271         switch (so->so_family) {
2272         case AF_INET:
2273                 devpath = sp->sp_smod_info->smod_fallback_devpath_v4;
2274                 break;
2275         case AF_INET6:
2276                 devpath = sp->sp_smod_info->smod_fallback_devpath_v6;
2277                 break;
2278         default:
2279                 return (EINVAL);
2280         }
2281 
2282         /*
2283          * Fallback can only happen if the socket module has a TPI device
2284          * and fallback function.
2285          */
2286         if (devpath == NULL || fbfunc == NULL)
2287                 return (EINVAL);
2288 
2289         /*
2290          * Initiate fallback; upon success we know that no new requests
2291          * will come in from the user.
2292          */
2293         if (!so_start_fallback(so))
2294                 return (EAGAIN);
2295 #ifdef DEBUG
2296         /*
2297          * Make a copy of the sonode in case we need to make an integrity
2298          * check later on.
2299          */
2300         bcopy(so, &origso, sizeof (*so));
2301 #endif
2302 
2303         sp->sp_stats.sps_nfallback.value.ui64++;
2304 
2305         newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
2306             so->so_protocol, devpath, KM_SLEEP, &error);
2307         if (error != 0)
2308                 goto out;
2309 
2310         if (so->so_direct != NULL) {
2311                 sodirect_t *sodp = so->so_direct;
2312                 mutex_enter(&so->so_lock);
2313 
2314                 so->so_direct->sod_enabled = B_FALSE;
2315                 so->so_state &= ~SS_SODIRECT;
2316                 ASSERT(sodp->sod_uioafh == NULL);
2317                 mutex_exit(&so->so_lock);
2318         }
2319 
2320         /* Turn sonode into a TPI socket */
2321         error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
2322         if (error != 0)
2323                 goto out;
2324         /*
2325          * When it comes to urgent data we have two cases to deal with;
2326          * (1) The oob byte has already arrived, or (2) the protocol has
2327          * notified that oob data is pending, but it has not yet arrived.
2328          *
2329          * For (1) all we need to do is send a T_EXDATA_IND to indicate were
2330          * in the byte stream the oob byte is. For (2) we have to send a
2331          * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
2332          * the oob byte will be the next byte from the protocol.
2333          *
2334          * So in the worst case we need two mblks, one for the signal, another
2335          * for mark indication. In that case we use the exdata_mp for the sig.
2336          */
2337         arg.soqa_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind),
2338             BPRI_MED, STR_NOSIG, NULL);
2339         arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
2340 
2341         /*
2342          * Now tell the protocol to start using TPI. so_quiesced_cb be
2343          * called once it's safe to synchronize state.
2344          */
2345         DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
2346         error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb,
2347             &arg);
2348         DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
2349 
2350         if (error != 0) {
2351                 /* protocol was unable to do a fallback, revert the sonode */
2352                 sotpi_revert_sonode(so, cr);
2353                 goto out;
2354         }
2355 
2356         /*
2357          * Walk the accept queue and notify the proto that they should
2358          * fall back to TPI. The protocol will send up the T_CONN_IND.
2359          */
2360         nso = list_head(&so->so_acceptq_list);
2361         while (nso != NULL) {
2362                 int rval;
2363                 struct sonode *next;
2364 
2365                 if (arg.soqa_exdata_mp == NULL) {
2366                         arg.soqa_exdata_mp =
2367                             allocb_wait(sizeof (struct T_exdata_ind),
2368                             BPRI_MED, STR_NOSIG, NULL);
2369                 }
2370                 if (arg.soqa_urgmark_mp == NULL) {
2371                         arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED,
2372                             STR_NOSIG, NULL);
2373                 }
2374 
2375                 DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
2376                 rval = (*fbfunc)(nso->so_proto_handle, NULL, direct,
2377                     so_quiesced_cb, &arg);
2378                 DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
2379                 if (rval != 0) {
2380                         /* Abort the connection */
2381                         zcmn_err(getzoneid(), CE_WARN,
2382                             "Failed to convert socket in accept queue to TPI. "
2383                             "Pid = %d\n", curproc->p_pid);
2384                         next = list_next(&so->so_acceptq_list, nso);
2385                         list_remove(&so->so_acceptq_list, nso);
2386                         so->so_acceptq_len--;
2387 
2388                         (void) socket_close(nso, 0, CRED());
2389                         socket_destroy(nso);
2390                         nso = next;
2391                 } else {
2392                         nso = list_next(&so->so_acceptq_list, nso);
2393                 }
2394         }
2395 
2396         /*
2397          * Now flush the acceptq, this will destroy all sockets. They will
2398          * be recreated in sotpi_accept().
2399          */
2400         so_acceptq_flush(so, B_FALSE);
2401 
2402         mutex_enter(&so->so_lock);
2403         so->so_state |= SS_FALLBACK_COMP;
2404         mutex_exit(&so->so_lock);
2405 
2406         /*
2407          * Swap the sonode ops. Socket operations that come in once this
2408          * is done will proceed without blocking.
2409          */
2410         so->so_ops = &sotpi_sonodeops;
2411 
2412         /*
2413          * Wake up any threads stuck in poll. This is needed since the poll
2414          * head changes when the fallback happens (moves from the sonode to
2415          * the STREAMS head).
2416          */
2417         pollwakeup(&so->so_poll_list, POLLERR);
2418 
2419         /*
2420          * When this non-STREAM socket was created we placed an extra ref on
2421          * the associated vnode to support asynchronous close. Drop that ref
2422          * here.
2423          */
2424         ASSERT(SOTOV(so)->v_count >= 2);
2425         VN_RELE(SOTOV(so));
2426 out:
2427         so_end_fallback(so);
2428 
2429         if (error != 0) {
2430 #ifdef DEBUG
2431                 so_integrity_check(so, &origso);
2432 #endif
2433                 zcmn_err(getzoneid(), CE_WARN,
2434                     "Failed to convert socket to TPI (err=%d). Pid = %d\n",
2435                     error, curproc->p_pid);
2436                 if (newsp != NULL)
2437                         SOCKPARAMS_DEC_REF(newsp);
2438         }
2439         if (arg.soqa_exdata_mp != NULL)
2440                 freemsg(arg.soqa_exdata_mp);
2441         if (arg.soqa_urgmark_mp != NULL)
2442                 freemsg(arg.soqa_urgmark_mp);
2443 
2444         return (error);
2445 }