Print this page
    
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
          +++ new/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  
    | 
      ↓ open down ↓ | 
    16 lines elided | 
    
      ↑ open up ↑ | 
  
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   */
  25   25  /*
  26   26   * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  27      - * Copyright 2015 Joyent, Inc.
  28   27   */
  29   28  
  30   29  #include <sys/types.h>
  31   30  #include <sys/param.h>
  32   31  #include <sys/signal.h>
  33   32  #include <sys/cmn_err.h>
  34   33  
  35   34  #include <sys/stropts.h>
  36   35  #include <sys/socket.h>
  37   36  #include <sys/socketvar.h>
  38   37  #include <sys/sockio.h>
  39   38  #include <sys/strsubr.h>
  40   39  #include <sys/strsun.h>
  41   40  #include <sys/atomic.h>
  42   41  #include <sys/tihdr.h>
  43   42  
  44   43  #include <fs/sockfs/sockcommon.h>
  45   44  #include <fs/sockfs/sockfilter_impl.h>
  46   45  #include <fs/sockfs/socktpi.h>
  47   46  #include <fs/sockfs/sodirect.h>
  48   47  #include <sys/ddi.h>
  49   48  #include <inet/ip.h>
  50   49  #include <sys/time.h>
  51   50  #include <sys/cmn_err.h>
  52   51  
  53   52  #ifdef SOCK_TEST
  54   53  extern int do_useracc;
  55   54  extern clock_t sock_test_timelimit;
  56   55  #endif /* SOCK_TEST */
  57   56  
  58   57  #define MBLK_PULL_LEN 64
  59   58  uint32_t so_mblk_pull_len = MBLK_PULL_LEN;
  60   59  
  61   60  #ifdef DEBUG
  62   61  boolean_t so_debug_length = B_FALSE;
  63   62  static boolean_t so_check_length(sonode_t *so);
  64   63  #endif
  65   64  
  66   65  static int
  67   66  so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
  68   67      struct sonode **nsop)
  69   68  {
  70   69          struct sonode *nso = NULL;
  71   70  
  72   71          *nsop = NULL;
  73   72          ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
  74   73          while ((nso = list_remove_head(&so->so_acceptq_list)) == NULL) {
  75   74                  /*
  76   75                   * No need to check so_error here, because it is not
  77   76                   * possible for a listening socket to be reset or otherwise
  78   77                   * disconnected.
  79   78                   *
  80   79                   * So now we just need check if it's ok to wait.
  81   80                   */
  82   81                  if (dontblock)
  83   82                          return (EWOULDBLOCK);
  84   83                  if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
  85   84                          return (EINTR);
  86   85  
  87   86                  if (cv_wait_sig_swap(&so->so_acceptq_cv,
  88   87                      &so->so_acceptq_lock) == 0)
  89   88                          return (EINTR);
  90   89          }
  91   90  
  92   91          ASSERT(nso != NULL);
  93   92          ASSERT(so->so_acceptq_len > 0);
  94   93          so->so_acceptq_len--;
  95   94          nso->so_listener = NULL;
  96   95  
  97   96          *nsop = nso;
  98   97  
  99   98          return (0);
 100   99  }
 101  100  
 102  101  /*
 103  102   * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 104  103   *
 105  104   * Pulls a connection off of the accept queue.
 106  105   *
 107  106   * Arguments:
 108  107   *   so        - listening socket
 109  108   *   dontblock - indicate whether it's ok to sleep if there are no
 110  109   *               connections on the queue
 111  110   *   nsop      - Value-return argument
 112  111   *
 113  112   * Return values:
 114  113   *   0 when a connection is successfully dequeued, in which case nsop
 115  114   *   is set to point to the new connection. Upon failure a non-zero
 116  115   *   value is returned, and the value of nsop is set to NULL.
 117  116   *
 118  117   * Note:
 119  118   *   so_acceptq_dequeue() may return prematurly if the socket is falling
 120  119   *   back to TPI.
 121  120   */
 122  121  int
 123  122  so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
 124  123      struct sonode **nsop)
 125  124  {
 126  125          int error;
 127  126  
 128  127          mutex_enter(&so->so_acceptq_lock);
 129  128          error = so_acceptq_dequeue_locked(so, dontblock, nsop);
 130  129          mutex_exit(&so->so_acceptq_lock);
 131  130  
 132  131          return (error);
 133  132  }
 134  133  
 135  134  static void
 136  135  so_acceptq_flush_impl(struct sonode *so, list_t *list, boolean_t doclose)
 137  136  {
 138  137          struct sonode *nso;
 139  138  
 140  139          while ((nso = list_remove_head(list)) != NULL) {
 141  140                  nso->so_listener = NULL;
 142  141                  if (doclose) {
 143  142                          (void) socket_close(nso, 0, CRED());
 144  143                  } else {
 145  144                          /*
 146  145                           * Only used for fallback - not possible when filters
 147  146                           * are present.
 148  147                           */
 149  148                          ASSERT(so->so_filter_active == 0);
 150  149                          /*
 151  150                           * Since the socket is on the accept queue, there can
 152  151                           * only be one reference. We drop the reference and
 153  152                           * just blow off the socket.
 154  153                           */
 155  154                          ASSERT(nso->so_count == 1);
 156  155                          nso->so_count--;
 157  156                          /* drop the proto ref */
 158  157                          VN_RELE(SOTOV(nso));
 159  158                  }
 160  159                  socket_destroy(nso);
 161  160          }
 162  161  }
 163  162  /*
 164  163   * void so_acceptq_flush(struct sonode *so)
 165  164   *
 166  165   * Removes all pending connections from a listening socket, and
 167  166   * frees the associated resources.
 168  167   *
 169  168   * Arguments
 170  169   *   so      - listening socket
 171  170   *   doclose - make a close downcall for each socket on the accept queue
 172  171   *
 173  172   * Return values:
 174  173   *   None.
 175  174   *
 176  175   * Note:
 177  176   *   The caller has to ensure that no calls to so_acceptq_enqueue() or
 178  177   *   so_acceptq_dequeue() occur while the accept queue is being flushed.
 179  178   *   So either the socket needs to be in a state where no operations
 180  179   *   would come in, or so_lock needs to be obtained.
 181  180   */
 182  181  void
 183  182  so_acceptq_flush(struct sonode *so, boolean_t doclose)
 184  183  {
 185  184          so_acceptq_flush_impl(so, &so->so_acceptq_list, doclose);
 186  185          so_acceptq_flush_impl(so, &so->so_acceptq_defer, doclose);
 187  186  
 188  187          so->so_acceptq_len = 0;
 189  188  }
 190  189  
/*
 * Wait (with so_lock held) until the connection attempt identified by
 * 'id' has completed, i.e. until the protocol has advanced
 * so_proto_connid to (or past) the generation number returned when the
 * connect was initiated.  Returns 0 on success or an errno value;
 * EINPROGRESS is returned immediately for non-blocking sockets.
 */
int
so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
    sock_connid_t id)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	/*
	 * The protocol has notified us that a connection attempt is being
	 * made, so before we wait for a notification to arrive we must
	 * clear out any errors associated with earlier connection attempts.
	 */
	if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
		so->so_error = 0;

	while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
		if (nonblock)
			return (EINPROGRESS);

		/* bail out if the socket is closing or falling back to TPI */
		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		/* cv_wait_sig_swap() returns 0 when interrupted by a signal */
		if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
			return (EINTR);
	}

	if (so->so_error != 0)
		return (sogeterr(so, B_TRUE));
	/*
	 * Under normal circumstances, so_error should contain an error
	 * in case the connect failed. However, it is possible for another
	 * thread to come in a consume the error, so generate a sensible
	 * error in that case.
	 */
	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ECONNREFUSED);

	return (0);
}
 229  228  
 230  229  /*
 231  230   * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 232  231   *    sock_connid_t id)
 233  232   *
 234  233   * Wait until the socket is connected or an error has occured.
 235  234   *
 236  235   * Arguments:
 237  236   *   so       - socket
 238  237   *   nonblock - indicate whether it's ok to sleep if the connection has
 239  238   *              not yet been established
 240  239   *   gen      - generation number that was returned by the protocol
 241  240   *              when the operation was started
 242  241   *
 243  242   * Returns:
 244  243   *   0 if the connection attempt was successful, or an error indicating why
 245  244   *   the connection attempt failed.
 246  245   */
 247  246  int
 248  247  so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
 249  248  {
 250  249          int error;
 251  250  
 252  251          mutex_enter(&so->so_lock);
 253  252          error = so_wait_connected_locked(so, nonblock, id);
 254  253          mutex_exit(&so->so_lock);
 255  254  
 256  255          return (error);
 257  256  }
 258  257  
 259  258  int
 260  259  so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
 261  260  {
 262  261          int error;
 263  262  
 264  263          ASSERT(MUTEX_HELD(&so->so_lock));
 265  264          while (SO_SND_FLOWCTRLD(so)) {
 266  265                  if (so->so_state & SS_CANTSENDMORE)
 267  266                          return (EPIPE);
 268  267                  if (dontblock)
 269  268                          return (EWOULDBLOCK);
 270  269  
 271  270                  if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
 272  271                          return (EINTR);
 273  272  
 274  273                  if (so->so_sndtimeo == 0) {
 275  274                          /*
 276  275                           * Zero means disable timeout.
 277  276                           */
 278  277                          error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
 279  278                  } else {
 280  279                          error = cv_reltimedwait_sig(&so->so_snd_cv,
 281  280                              &so->so_lock, so->so_sndtimeo, TR_CLOCK_TICK);
 282  281                  }
 283  282                  if (error == 0)
 284  283                          return (EINTR);
 285  284                  else if (error == -1)
 286  285                          return (EAGAIN);
 287  286          }
 288  287          return (0);
 289  288  }
 290  289  
 291  290  /*
 292  291   * int so_wait_sendbuf(struct sonode *so, boolean_t dontblock)
 293  292   *
 294  293   * Wait for the transport to notify us about send buffers becoming
 295  294   * available.
 296  295   */
 297  296  int
 298  297  so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 299  298  {
 300  299          int error = 0;
 301  300  
 302  301          mutex_enter(&so->so_lock);
 303  302          so->so_snd_wakeup = B_TRUE;
 304  303          error = so_snd_wait_qnotfull_locked(so, dontblock);
 305  304          so->so_snd_wakeup = B_FALSE;
 306  305          mutex_exit(&so->so_lock);
 307  306  
 308  307          return (error);
 309  308  }
 310  309  
 311  310  void
 312  311  so_snd_qfull(struct sonode *so)
 313  312  {
 314  313          mutex_enter(&so->so_lock);
 315  314          so->so_snd_qfull = B_TRUE;
 316  315          mutex_exit(&so->so_lock);
 317  316  }
 318  317  
 319  318  void
 320  319  so_snd_qnotfull(struct sonode *so)
 321  320  {
 322  321          mutex_enter(&so->so_lock);
 323  322          so->so_snd_qfull = B_FALSE;
 324  323          /* wake up everyone waiting for buffers */
 325  324          cv_broadcast(&so->so_snd_cv);
 326  325          mutex_exit(&so->so_lock);
 327  326  }
 328  327  
 329  328  /*
 330  329   * Change the process/process group to which SIGIO is sent.
 331  330   */
 332  331  int
 333  332  socket_chgpgrp(struct sonode *so, pid_t pid)
 334  333  {
 335  334          int error;
 336  335  
 337  336          ASSERT(MUTEX_HELD(&so->so_lock));
 338  337          if (pid != 0) {
 339  338                  /*
 340  339                   * Permissions check by sending signal 0.
 341  340                   * Note that when kill fails it does a
 342  341                   * set_errno causing the system call to fail.
 343  342                   */
 344  343                  error = kill(pid, 0);
 345  344                  if (error != 0) {
 346  345                          return (error);
 347  346                  }
 348  347          }
 349  348          so->so_pgrp = pid;
 350  349          return (0);
 351  350  }
 352  351  
 353  352  
 354  353  /*
 355  354   * Generate a SIGIO, for 'writable' events include siginfo structure,
 356  355   * for read events just send the signal.
 357  356   */
 358  357  /*ARGSUSED*/
 359  358  static void
 360  359  socket_sigproc(proc_t *proc, int event)
 361  360  {
 362  361          k_siginfo_t info;
 363  362  
 364  363          ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));
 365  364  
 366  365          if (event & SOCKETSIG_WRITE) {
 367  366                  info.si_signo = SIGPOLL;
 368  367                  info.si_code = POLL_OUT;
 369  368                  info.si_errno = 0;
 370  369                  info.si_fd = 0;
 371  370                  info.si_band = 0;
 372  371                  sigaddq(proc, NULL, &info, KM_NOSLEEP);
 373  372          }
 374  373          if (event & SOCKETSIG_READ) {
 375  374                  sigtoproc(proc, NULL, SIGPOLL);
 376  375          }
 377  376          if (event & SOCKETSIG_URG) {
 378  377                  sigtoproc(proc, NULL, SIGURG);
 379  378          }
 380  379  }
 381  380  
/*
 * Deliver SIGIO-style notification for 'event' to the process (so_pgrp
 * > 0) or process group (so_pgrp < 0) registered on the socket.  Caller
 * holds so_lock.  Nothing is sent when no pgrp is registered, or when
 * the socket is not in async mode (urgent events are exempt from the
 * async check).
 */
void
socket_sendsig(struct sonode *so, int event)
{
	proc_t *proc;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
	    event != SOCKETSIG_URG)) {
		return;
	}

	dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));

	if (so->so_pgrp > 0) {
		/*
		 * XXX This unfortunately still generates
		 * a signal when a fd is closed but
		 * the proc is active.
		 */
		mutex_enter(&pidlock);
		/*
		 * Even if the thread started in another zone, we're receiving
		 * on behalf of this socket's zone, so find the proc using the
		 * socket's zone ID.
		 */
		proc = prfind_zone(so->so_pgrp, so->so_zoneid);
		if (proc == NULL) {
			mutex_exit(&pidlock);
			return;
		}
		/* take p_lock before dropping pidlock to pin the proc */
		mutex_enter(&proc->p_lock);
		mutex_exit(&pidlock);
		socket_sigproc(proc, event);
		mutex_exit(&proc->p_lock);
	} else {
		/*
		 * Send to process group. Hold pidlock across
		 * calls to socket_sigproc().
		 */
		pid_t pgrp = -so->so_pgrp;

		mutex_enter(&pidlock);
		/*
		 * Even if the thread started in another zone, we're receiving
		 * on behalf of this socket's zone, so find the pgrp using the
		 * socket's zone ID.
		 */
		proc = pgfind_zone(pgrp, so->so_zoneid);
		while (proc != NULL) {
			/* each member needs p_lock held while signalling it */
			mutex_enter(&proc->p_lock);
			socket_sigproc(proc, event);
			mutex_exit(&proc->p_lock);
			proc = proc->p_pglink;
		}
		mutex_exit(&pidlock);
	}
}
 440  439  
 441  440  #define MIN(a, b) ((a) < (b) ? (a) : (b))
 442  441  /* Copy userdata into a new mblk_t */
 443  442  mblk_t *
 444  443  socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
 445  444      size_t tail_len, int *errorp)
 446  445  {
 447  446          mblk_t  *head = NULL, **tail = &head;
 448  447  
 449  448          ASSERT(iosize == INFPSZ || iosize > 0);
 450  449  
 451  450          if (iosize == INFPSZ || iosize > uiop->uio_resid)
 452  451                  iosize = uiop->uio_resid;
 453  452  
 454  453          if (maxblk == INFPSZ)
 455  454                  maxblk = iosize;
 456  455  
 457  456          /* Nothing to do in these cases, so we're done */
 458  457          if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
 459  458                  goto done;
 460  459  
 461  460          /*
 462  461           * We will enter the loop below if iosize is 0; it will allocate an
 463  462           * empty message block and call uiomove(9F) which will just return.
 464  463           * We could avoid that with an extra check but would only slow
 465  464           * down the much more likely case where iosize is larger than 0.
 466  465           */
 467  466          do {
 468  467                  ssize_t blocksize;
 469  468                  mblk_t  *mp;
 470  469  
 471  470                  blocksize = MIN(iosize, maxblk);
 472  471                  ASSERT(blocksize >= 0);
 473  472                  mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
 474  473                  if (mp == NULL) {
 475  474                          *errorp = ENOMEM;
 476  475                          return (head);
 477  476                  }
 478  477                  mp->b_rptr += wroff;
 479  478                  mp->b_wptr = mp->b_rptr + blocksize;
 480  479  
 481  480                  *tail = mp;
 482  481                  tail = &mp->b_cont;
 483  482  
 484  483                  /* uiomove(9F) either returns 0 or EFAULT */
 485  484                  if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
 486  485                      UIO_WRITE, uiop)) != 0) {
 487  486                          ASSERT(*errorp != ENOMEM);
 488  487                          freemsg(head);
 489  488                          return (NULL);
 490  489                  }
 491  490  
 492  491                  iosize -= blocksize;
 493  492          } while (iosize > 0);
 494  493  
 495  494  done:
 496  495          *errorp = 0;
 497  496          return (head);
 498  497  }
 499  498  
 500  499  mblk_t *
 501  500  socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
 502  501  {
 503  502          int error;
 504  503          ptrdiff_t n;
 505  504          mblk_t *nmp;
 506  505  
 507  506          ASSERT(mp->b_wptr >= mp->b_rptr);
 508  507  
 509  508          /*
 510  509           * max_read is the offset of the oobmark and read can not go pass
 511  510           * the oobmark.
 512  511           */
 513  512          if (max_read == INFPSZ || max_read > uiop->uio_resid)
 514  513                  max_read = uiop->uio_resid;
 515  514  
 516  515          do {
 517  516                  if ((n = MIN(max_read, MBLKL(mp))) != 0) {
 518  517                          ASSERT(n > 0);
 519  518  
 520  519                          error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
 521  520                          if (error != 0) {
 522  521                                  freemsg(mp);
 523  522                                  *errorp = error;
 524  523                                  return (NULL);
 525  524                          }
 526  525                  }
 527  526  
 528  527                  mp->b_rptr += n;
 529  528                  max_read -= n;
 530  529                  while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
 531  530                          /*
 532  531                           * get rid of zero length mblks
 533  532                           */
 534  533                          nmp = mp;
 535  534                          mp = mp->b_cont;
 536  535                          freeb(nmp);
 537  536                  }
 538  537          } while (mp != NULL && max_read > 0);
 539  538  
 540  539          *errorp = 0;
 541  540          return (mp);
 542  541  }
 543  542  
/*
 * Put a message chain back at the head of the sonode's receive
 * processing queue (so_rcv_q_head), e.g. after a partial read.
 * 'last_tail' is recorded in b_prev as the chain's tail pointer, the
 * convention used by the rcv queue.  The chain must not be an I/OAT
 * (DBLK_UIOA) message.
 */
static void
so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
{
	ASSERT(last_tail != NULL);
	mp->b_next = so->so_rcv_q_head;
	mp->b_prev = last_tail;
	ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));

	if (so->so_rcv_q_head == NULL) {
		ASSERT(so->so_rcv_q_last_head == NULL);
		so->so_rcv_q_last_head = mp;
#ifdef DEBUG
	/* on DEBUG builds, also verify the old head was not an I/OAT mblk */
	} else {
		ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
#endif
	}
	so->so_rcv_q_head = mp;

#ifdef DEBUG
	/* optional queue-length consistency check, guarded by a tunable */
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
}
 570  569  
/*
 * Move a mblk chain (mp_head, mp_last_head) to the sonode's rcv queue so it
 * can be processed by so_dequeue_msg().
 *
 * Each message on the queue uses b_next to link to the next message and
 * b_prev (on the head) to point at its own tail, so appends can be done
 * without walking the chain.  Where possible, new data is coalesced onto
 * the last queued message via b_cont instead of being linked as a
 * separate message.
 */
void
so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
{
	/* give socket filters first crack; they may consume everything */
	if (so->so_filter_active > 0 &&
	    (mp_head = sof_filter_data_in_proc(so, mp_head,
	    &mp_last_head)) == NULL)
		return;

	ASSERT(mp_head->b_prev != NULL);
	if (so->so_rcv_q_head == NULL) {
		/* queue was empty: the new chain becomes the whole queue */
		so->so_rcv_q_head = mp_head;
		so->so_rcv_q_last_head = mp_last_head;
		ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
	} else {
		/* can only merge messages whose DBLK_UIOA flags agree */
		boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
		    (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));

		if (mp_head->b_next == NULL &&
		    DB_TYPE(mp_head) == M_DATA &&
		    DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
			/*
			 * Single M_DATA message: append it to the last
			 * queued message's b_cont chain and update that
			 * message's tail pointer (b_prev).
			 */
			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
			mp_head->b_prev = NULL;
		} else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
			/*
			 * Append to last_head if more than one mblks, and both
			 * mp_head and last_head are I/OAT mblks.
			 */
			ASSERT(mp_head->b_next != NULL);
			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
			mp_head->b_prev = NULL;

			so->so_rcv_q_last_head->b_next = mp_head->b_next;
			mp_head->b_next = NULL;
			so->so_rcv_q_last_head = mp_last_head;
		} else {
#ifdef DEBUG
			/* every message head must carry a tail pointer */
			{
				mblk_t *tmp_mblk;
				tmp_mblk = mp_head;
				while (tmp_mblk != NULL) {
					ASSERT(tmp_mblk->b_prev != NULL);
					tmp_mblk = tmp_mblk->b_next;
				}
			}
#endif
			/* cannot coalesce: link the chain as new messages */
			so->so_rcv_q_last_head->b_next = mp_head;
			so->so_rcv_q_last_head = mp_last_head;
		}
	}
}
 627  626  
 628  627  /*
 629  628   * Check flow control on a given sonode.  Must have so_lock held, and
 630  629   * this function will release the hold.  Return true if flow control
 631  630   * is cleared.
 632  631   */
 633  632  boolean_t
 634  633  so_check_flow_control(struct sonode *so)
 635  634  {
 636  635          ASSERT(MUTEX_HELD(&so->so_lock));
 637  636  
 638  637          if (so->so_flowctrld && (so->so_rcv_queued < so->so_rcvlowat &&
 639  638              !(so->so_state & SS_FIL_RCV_FLOWCTRL))) {
 640  639                  so->so_flowctrld = B_FALSE;
 641  640                  mutex_exit(&so->so_lock);
 642  641                  /*
 643  642                   * Open up flow control. SCTP does not have any downcalls, and
 644  643                   * it will clr flow ctrl in sosctp_recvmsg().
 645  644                   */
 646  645                  if (so->so_downcalls != NULL &&
 647  646                      so->so_downcalls->sd_clr_flowctrl != NULL) {
 648  647                          (*so->so_downcalls->sd_clr_flowctrl)
 649  648                              (so->so_proto_handle);
 650  649                  }
 651  650                  /* filters can start injecting data */
 652  651                  sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0);
 653  652                  return (B_TRUE);
 654  653          } else {
 655  654                  mutex_exit(&so->so_lock);
 656  655                  return (B_FALSE);
 657  656          }
 658  657  }
 659  658  
 660  659  int
 661  660  so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
 662  661      rval_t *rvalp, int flags)
 663  662  {
 664  663          mblk_t  *mp, *nmp;
 665  664          mblk_t  *savemp, *savemptail;
 666  665          mblk_t  *new_msg_head;
 667  666          mblk_t  *new_msg_last_head;
 668  667          mblk_t  *last_tail;
 669  668          boolean_t partial_read;
 670  669          boolean_t reset_atmark = B_FALSE;
 671  670          int more = 0;
 672  671          int error;
 673  672          ssize_t oobmark;
 674  673          ssize_t copied = 0;
 675  674          sodirect_t *sodp = so->so_direct;
 676  675          xuio_t *xuio = NULL;
 677  676  
 678  677          partial_read = B_FALSE;
 679  678          *mctlp = NULL;
 680  679          if ((uiop->uio_extflg & UIO_XUIO) != 0) {
 681  680                  xuio = (xuio_t *)uiop;
 682  681          }
 683  682  again:
 684  683          mutex_enter(&so->so_lock);
 685  684  again1:
 686  685  #ifdef DEBUG
 687  686          if (so_debug_length) {
 688  687                  ASSERT(so_check_length(so));
 689  688          }
 690  689  #endif
 691  690          if (so->so_state & SS_RCVATMARK) {
 692  691                  /* Check whether the caller is OK to read past the mark */
 693  692                  if (flags & MSG_NOMARK) {
 694  693                          mutex_exit(&so->so_lock);
 695  694                          return (EWOULDBLOCK);
 696  695                  }
 697  696                  reset_atmark = B_TRUE;
 698  697          }
 699  698          /*
 700  699           * First move messages from the dump area to processing area
 701  700           */
 702  701          if (sodp != NULL) {
 703  702                  if (sodp->sod_enabled) {
 704  703                          if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
 705  704                                  /* nothing to uioamove */
 706  705                                  sodp = NULL;
 707  706                          } else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
 708  707                                  sodp->sod_uioa.uioa_state &= UIOA_CLR;
 709  708                                  sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
 710  709                                  /*
 711  710                                   * try to uioamove() the data that
 712  711                                   * has already queued.
 713  712                                   */
 714  713                                  sod_uioa_so_init(so, sodp, uiop);
 715  714                          }
 716  715                  } else {
 717  716                          sodp = NULL;
 718  717                  }
 719  718          }
 720  719          new_msg_head = so->so_rcv_head;
 721  720          new_msg_last_head = so->so_rcv_last_head;
 722  721          so->so_rcv_head = NULL;
 723  722          so->so_rcv_last_head = NULL;
 724  723          oobmark = so->so_oobmark;
 725  724          /*
 726  725           * We can release the lock as there can only be one reader
 727  726           */
 728  727          mutex_exit(&so->so_lock);
 729  728  
 730  729          if (new_msg_head != NULL) {
 731  730                  so_process_new_message(so, new_msg_head, new_msg_last_head);
 732  731          }
 733  732          savemp = savemptail = NULL;
 734  733          rvalp->r_vals = 0;
 735  734          error = 0;
 736  735          mp = so->so_rcv_q_head;
 737  736  
 738  737          if (mp != NULL &&
 739  738              (so->so_rcv_timer_tid == 0 ||
 740  739              so->so_rcv_queued >= so->so_rcv_thresh)) {
 741  740                  partial_read = B_FALSE;
 742  741  
 743  742                  if (flags & MSG_PEEK) {
 744  743                          if ((nmp = dupmsg(mp)) == NULL &&
 745  744                              (nmp = copymsg(mp)) == NULL) {
 746  745                                  size_t size = msgsize(mp);
 747  746  
 748  747                                  error = strwaitbuf(size, BPRI_HI);
 749  748                                  if (error) {
 750  749                                          return (error);
 751  750                                  }
 752  751                                  goto again;
 753  752                          }
 754  753                          mp = nmp;
 755  754                  } else {
 756  755                          ASSERT(mp->b_prev != NULL);
 757  756                          last_tail = mp->b_prev;
 758  757                          mp->b_prev = NULL;
 759  758                          so->so_rcv_q_head = mp->b_next;
 760  759                          if (so->so_rcv_q_head == NULL) {
 761  760                                  so->so_rcv_q_last_head = NULL;
 762  761                          }
 763  762                          mp->b_next = NULL;
 764  763                  }
 765  764  
 766  765                  ASSERT(mctlp != NULL);
 767  766                  /*
 768  767                   * First process PROTO or PCPROTO blocks, if any.
 769  768                   */
 770  769                  if (DB_TYPE(mp) != M_DATA) {
 771  770                          *mctlp = mp;
 772  771                          savemp = mp;
 773  772                          savemptail = mp;
 774  773                          ASSERT(DB_TYPE(mp) == M_PROTO ||
 775  774                              DB_TYPE(mp) == M_PCPROTO);
 776  775                          while (mp->b_cont != NULL &&
 777  776                              DB_TYPE(mp->b_cont) != M_DATA) {
 778  777                                  ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
 779  778                                      DB_TYPE(mp->b_cont) == M_PCPROTO);
 780  779                                  mp = mp->b_cont;
 781  780                                  savemptail = mp;
 782  781                          }
 783  782                          mp = savemptail->b_cont;
 784  783                          savemptail->b_cont = NULL;
 785  784                  }
 786  785  
 787  786                  ASSERT(DB_TYPE(mp) == M_DATA);
 788  787                  /*
 789  788                   * Now process DATA blocks, if any. Note that for sodirect
 790  789                   * enabled socket, uio_resid can be 0.
 791  790                   */
 792  791                  if (uiop->uio_resid >= 0) {
 793  792                          if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
 794  793                                  mutex_enter(&so->so_lock);
 795  794                                  ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
 796  795                                  copied = sod_uioa_mblk(so, mp);
 797  796                                  if (copied > 0)
 798  797                                          partial_read = B_TRUE;
 799  798                                  mutex_exit(&so->so_lock);
 800  799                                  /* mark this mblk as processed */
 801  800                                  mp = NULL;
 802  801                          } else {
 803  802                                  ssize_t oldresid = uiop->uio_resid;
 804  803  
 805  804                                  if (MBLKL(mp) < so_mblk_pull_len) {
 806  805                                          if (pullupmsg(mp, -1) == 1) {
 807  806                                                  last_tail = mp;
 808  807                                          }
 809  808                                  }
 810  809                                  /*
 811  810                                   * Can not read beyond the oobmark
 812  811                                   */
 813  812                                  mp = socopyoutuio(mp, uiop,
 814  813                                      oobmark == 0 ? INFPSZ : oobmark, &error);
 815  814                                  if (error != 0) {
 816  815                                          freemsg(*mctlp);
 817  816                                          *mctlp = NULL;
 818  817                                          more = 0;
 819  818                                          goto done;
 820  819                                  }
 821  820                                  ASSERT(oldresid >= uiop->uio_resid);
 822  821                                  copied = oldresid - uiop->uio_resid;
 823  822                                  if (oldresid > uiop->uio_resid)
 824  823                                          partial_read = B_TRUE;
 825  824                          }
 826  825                          ASSERT(copied >= 0);
 827  826                          if (copied > 0 && !(flags & MSG_PEEK)) {
 828  827                                  mutex_enter(&so->so_lock);
 829  828                                  so->so_rcv_queued -= copied;
 830  829                                  ASSERT(so->so_oobmark >= 0);
 831  830                                  if (so->so_oobmark > 0) {
 832  831                                          so->so_oobmark -= copied;
 833  832                                          ASSERT(so->so_oobmark >= 0);
 834  833                                          if (so->so_oobmark == 0) {
 835  834                                                  ASSERT(so->so_state &
 836  835                                                      SS_OOBPEND);
 837  836                                                  so->so_oobmark = 0;
 838  837                                                  so->so_state |= SS_RCVATMARK;
 839  838                                          }
 840  839                                  }
 841  840                                  /*
 842  841                                   * so_check_flow_control() will drop
 843  842                                   * so->so_lock.
 844  843                                   */
 845  844                                  rvalp->r_val2 = so_check_flow_control(so);
 846  845                          }
 847  846                  }
 848  847                  if (mp != NULL) { /* more data blocks in msg */
 849  848                          more |= MOREDATA;
 850  849  
 851  850                          /*
 852  851                           * If requested, tally up remaining data along with the
 853  852                           * amount already copied.
 854  853                           */
 855  854                          if (xuio != NULL &&
 856  855                              xuio->xu_type == UIOTYPE_PEEKSIZE) {
 857  856                                  xuio->xu_ext.xu_ps.xu_ps_set = B_TRUE;
 858  857                                  xuio->xu_ext.xu_ps.xu_ps_size =
 859  858                                      copied + msgdsize(mp);
 860  859                          }
 861  860  
 862  861                          if ((flags & (MSG_PEEK|MSG_TRUNC))) {
 863  862                                  if (flags & MSG_PEEK) {
 864  863                                          freemsg(mp);
 865  864                                  } else {
 866  865                                          unsigned int msize = msgdsize(mp);
 867  866  
 868  867                                          freemsg(mp);
 869  868                                          mutex_enter(&so->so_lock);
 870  869                                          so->so_rcv_queued -= msize;
 871  870                                          /*
 872  871                                           * so_check_flow_control() will drop
 873  872                                           * so->so_lock.
 874  873                                           */
 875  874                                          rvalp->r_val2 =
 876  875                                              so_check_flow_control(so);
 877  876                                  }
 878  877                          } else if (partial_read && !somsghasdata(mp)) {
 879  878                                  /*
 880  879                                   * Avoid queuing a zero-length tail part of
 881  880                                   * a message. partial_read == 1 indicates that
 882  881                                   * we read some of the message.
 883  882                                   */
 884  883                                  freemsg(mp);
 885  884                                  more &= ~MOREDATA;
 886  885                          } else {
 887  886                                  if (savemp != NULL &&
 888  887                                      (flags & MSG_DUPCTRL)) {
 889  888                                          mblk_t *nmp;
 890  889                                          /*
 891  890                                           * There should only be non data mblks
 892  891                                           */
 893  892                                          ASSERT(DB_TYPE(savemp) != M_DATA &&
 894  893                                              DB_TYPE(savemptail) != M_DATA);
 895  894  try_again:
 896  895                                          if ((nmp = dupmsg(savemp)) == NULL &&
 897  896                                              (nmp = copymsg(savemp)) == NULL) {
 898  897  
 899  898                                                  size_t size = msgsize(savemp);
 900  899  
 901  900                                                  error = strwaitbuf(size,
 902  901                                                      BPRI_HI);
 903  902                                                  if (error != 0) {
 904  903                                                          /*
 905  904                                                           * In case we
 906  905                                                           * cannot copy
 907  906                                                           * control data
 908  907                                                           * free the remaining
 909  908                                                           * data.
 910  909                                                           */
 911  910                                                          freemsg(mp);
 912  911                                                          goto done;
 913  912                                                  }
 914  913                                                  goto try_again;
 915  914                                          }
 916  915  
 917  916                                          ASSERT(nmp != NULL);
 918  917                                          ASSERT(DB_TYPE(nmp) != M_DATA);
 919  918                                          savemptail->b_cont = mp;
 920  919                                          *mctlp = nmp;
 921  920                                          mp = savemp;
 922  921                                  }
 923  922                                  /*
 924  923                                   * putback mp
 925  924                                   */
 926  925                                  so_prepend_msg(so, mp, last_tail);
 927  926                          }
 928  927                  }
 929  928  
 930  929                  /* fast check so_rcv_head if there is more data */
 931  930                  if (partial_read && !(so->so_state & SS_RCVATMARK) &&
 932  931                      *mctlp == NULL && uiop->uio_resid > 0 &&
 933  932                      !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
 934  933                          goto again;
 935  934                  }
 936  935          } else if (!partial_read) {
 937  936                  mutex_enter(&so->so_lock);
 938  937                  if (so->so_error != 0) {
 939  938                          error = sogeterr(so, !(flags & MSG_PEEK));
 940  939                          mutex_exit(&so->so_lock);
 941  940                          return (error);
 942  941                  }
 943  942                  /*
 944  943                   * No pending data. Return right away for nonblocking
 945  944                   * socket, otherwise sleep waiting for data.
 946  945                   */
 947  946                  if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
 948  947                          if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
 949  948                              (flags & MSG_DONTWAIT)) {
 950  949                                  error = EWOULDBLOCK;
 951  950                          } else {
 952  951                                  if (so->so_state & (SS_CLOSING |
 953  952                                      SS_FALLBACK_PENDING)) {
 954  953                                          mutex_exit(&so->so_lock);
 955  954                                          error = EINTR;
 956  955                                          goto done;
 957  956                                  }
 958  957  
 959  958                                  if (so->so_rcv_head != NULL) {
 960  959                                          goto again1;
 961  960                                  }
 962  961                                  so->so_rcv_wakeup = B_TRUE;
 963  962                                  so->so_rcv_wanted = uiop->uio_resid;
 964  963                                  if (so->so_rcvtimeo == 0) {
 965  964                                          /*
 966  965                                           * Zero means disable timeout.
 967  966                                           */
 968  967                                          error = cv_wait_sig(&so->so_rcv_cv,
 969  968                                              &so->so_lock);
 970  969                                  } else {
 971  970                                          error = cv_reltimedwait_sig(
 972  971                                              &so->so_rcv_cv, &so->so_lock,
 973  972                                              so->so_rcvtimeo, TR_CLOCK_TICK);
 974  973                                  }
 975  974                                  so->so_rcv_wakeup = B_FALSE;
 976  975                                  so->so_rcv_wanted = 0;
 977  976  
 978  977                                  if (error == 0) {
 979  978                                          error = EINTR;
 980  979                                  } else if (error == -1) {
 981  980                                          error = EAGAIN;
 982  981                                  } else {
 983  982                                          goto again1;
 984  983                                  }
 985  984                          }
 986  985                  }
 987  986                  mutex_exit(&so->so_lock);
 988  987          }
 989  988          if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
 990  989                  /*
 991  990                   * We are passed the mark, update state
 992  991                   * 4.3BSD and 4.4BSD clears the mark when peeking across it.
 993  992                   * The draft Posix socket spec states that the mark should
 994  993                   * not be cleared when peeking. We follow the latter.
 995  994                   */
 996  995                  mutex_enter(&so->so_lock);
 997  996                  ASSERT(so_verify_oobstate(so));
 998  997                  so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
 999  998                  freemsg(so->so_oobmsg);
1000  999                  so->so_oobmsg = NULL;
1001 1000                  ASSERT(so_verify_oobstate(so));
1002 1001                  mutex_exit(&so->so_lock);
1003 1002          }
1004 1003          ASSERT(so->so_rcv_wakeup == B_FALSE);
1005 1004  done:
1006 1005          if (sodp != NULL) {
1007 1006                  mutex_enter(&so->so_lock);
1008 1007                  if (sodp->sod_enabled &&
1009 1008                      (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
1010 1009                          SOD_UIOAFINI(sodp);
1011 1010                          if (sodp->sod_uioa.uioa_mbytes > 0) {
1012 1011                                  ASSERT(so->so_rcv_q_head != NULL ||
1013 1012                                      so->so_rcv_head != NULL);
1014 1013                                  so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
1015 1014                                  if (error == EWOULDBLOCK)
1016 1015                                          error = 0;
1017 1016                          }
1018 1017                  }
1019 1018                  mutex_exit(&so->so_lock);
1020 1019          }
1021 1020  #ifdef DEBUG
1022 1021          if (so_debug_length) {
1023 1022                  mutex_enter(&so->so_lock);
1024 1023                  ASSERT(so_check_length(so));
1025 1024                  mutex_exit(&so->so_lock);
1026 1025          }
1027 1026  #endif
1028 1027          rvalp->r_val1 = more;
1029 1028          ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1030 1029          return (error);
1031 1030  }
1032 1031  
1033 1032  /*
1034 1033   * Enqueue data from the protocol on the socket's rcv queue.
1035 1034   *
1036 1035   * We try to hook new M_DATA mblks onto an existing chain, however,
1037 1036   * that cannot be done if the existing chain has already been
1038 1037   * processed by I/OAT. Non-M_DATA mblks are just linked together via
1039 1038   * b_next. In all cases the b_prev of the enqueued mblk is set to
1040 1039   * point to the last mblk in its b_cont chain.
1041 1040   */
void
so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
{
	/* Caller must hold so_lock; it serializes all rcv queue updates. */
	ASSERT(MUTEX_HELD(&so->so_lock));

#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
	/* Account for the new data before linking it into the queue. */
	so->so_rcv_queued += msg_size;

	if (so->so_rcv_head == NULL) {
		/* Queue was empty: mp becomes both head and last head. */
		ASSERT(so->so_rcv_last_head == NULL);
		so->so_rcv_head = mp;
		so->so_rcv_last_head = mp;
	} else if ((DB_TYPE(mp) == M_DATA &&
	    DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
	    ((DB_FLAGS(mp) & DBLK_UIOA) ==
	    (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
		/*
		 * Both the new mblk and the current last message are plain
		 * M_DATA with matching I/OAT (DBLK_UIOA) state, so mp can be
		 * hooked onto the existing chain via b_cont.  The last
		 * message's b_prev caches the tail of its b_cont chain, so
		 * the append is O(1).
		 */
		/* Added to the end */
		ASSERT(so->so_rcv_last_head != NULL);
		ASSERT(so->so_rcv_last_head->b_prev != NULL);
		so->so_rcv_last_head->b_prev->b_cont = mp;
	} else {
		/*
		 * Types or DBLK_UIOA state differ: link mp as a separate
		 * message via b_next instead of extending the b_cont chain.
		 */
		/* Start a new end */
		so->so_rcv_last_head->b_next = mp;
		so->so_rcv_last_head = mp;
	}
	/* Walk to the tail of mp's b_cont chain ... */
	while (mp->b_cont != NULL)
		mp = mp->b_cont;

	/* ... and cache it in b_prev so future appends stay O(1). */
	so->so_rcv_last_head->b_prev = mp;
#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
}
1081 1080  
1082 1081  /*
1083 1082   * Return B_TRUE if there is data in the message, B_FALSE otherwise.
1084 1083   */
1085 1084  boolean_t
1086 1085  somsghasdata(mblk_t *mp)
1087 1086  {
1088 1087          for (; mp; mp = mp->b_cont)
1089 1088                  if (mp->b_datap->db_type == M_DATA) {
1090 1089                          ASSERT(mp->b_wptr >= mp->b_rptr);
1091 1090                          if (mp->b_wptr > mp->b_rptr)
1092 1091                                  return (B_TRUE);
1093 1092                  }
1094 1093          return (B_FALSE);
1095 1094  }
1096 1095  
1097 1096  /*
1098 1097   * Flush the read side of sockfs.
1099 1098   *
1100 1099   * The caller must be sure that a reader is not already active when the
1101 1100   * buffer is being flushed.
1102 1101   */
1103 1102  void
1104 1103  so_rcv_flush(struct sonode *so)
1105 1104  {
1106 1105          mblk_t  *mp;
1107 1106  
1108 1107          ASSERT(MUTEX_HELD(&so->so_lock));
1109 1108  
1110 1109          if (so->so_oobmsg != NULL) {
1111 1110                  freemsg(so->so_oobmsg);
1112 1111                  so->so_oobmsg = NULL;
1113 1112                  so->so_oobmark = 0;
1114 1113                  so->so_state &=
1115 1114                      ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
1116 1115          }
1117 1116  
1118 1117          /*
1119 1118           * Free messages sitting in the recv queues
1120 1119           */
1121 1120          while (so->so_rcv_q_head != NULL) {
1122 1121                  mp = so->so_rcv_q_head;
1123 1122                  so->so_rcv_q_head = mp->b_next;
1124 1123                  mp->b_next = mp->b_prev = NULL;
1125 1124                  freemsg(mp);
1126 1125          }
1127 1126          while (so->so_rcv_head != NULL) {
1128 1127                  mp = so->so_rcv_head;
1129 1128                  so->so_rcv_head = mp->b_next;
1130 1129                  mp->b_next = mp->b_prev = NULL;
1131 1130                  freemsg(mp);
1132 1131          }
1133 1132          so->so_rcv_queued = 0;
1134 1133          so->so_rcv_q_head = NULL;
1135 1134          so->so_rcv_q_last_head = NULL;
1136 1135          so->so_rcv_head = NULL;
1137 1136          so->so_rcv_last_head = NULL;
1138 1137  }
1139 1138  
/*
 * Handle recv* calls that set MSG_OOB, either alone or together with MSG_PEEK.
 */
int
sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
    boolean_t oob_inline)
{
	mblk_t		*mp, *nmp;
	int		error;

	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
	    flags));

	if (msg != NULL) {
		/*
		 * There is never any oob data with addresses or control since
		 * the T_EXDATA_IND does not carry any options.
		 */
		msg->msg_controllen = 0;
		msg->msg_namelen = 0;
		msg->msg_flags = 0;
	}

	mutex_enter(&so->so_lock);
	ASSERT(so_verify_oobstate(so));
	/*
	 * MSG_OOB is invalid when OOB data is delivered inline, or when no
	 * urgent data is pending / it has already been consumed (SS_OOBPEND
	 * clear, or SS_HADOOBDATA set).
	 */
	if (oob_inline ||
	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
		mutex_exit(&so->so_lock);
		return (EINVAL);
	}
	/* Urgent-data notification arrived, but the data itself has not. */
	if (!(so->so_state & SS_HAVEOOBDATA)) {
		dprintso(so, 1, ("sorecvoob: no data yet\n"));
		mutex_exit(&so->so_lock);
		return (EWOULDBLOCK);
	}
	ASSERT(so->so_oobmsg != NULL);
	mp = so->so_oobmsg;
	if (flags & MSG_PEEK) {
		/*
		 * Since recv* can not return ENOBUFS we can not use dupmsg.
		 * Instead we revert to the consolidation private
		 * allocb_wait plus bcopy.
		 */
		mblk_t *mp1;

		/* STR_NOSIG: block for memory without signal interruption. */
		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
		ASSERT(mp1);

		/* Flatten the whole oob chain into the single new mblk. */
		while (mp != NULL) {
			ssize_t size;

			size = MBLKL(mp);
			bcopy(mp->b_rptr, mp1->b_wptr, size);
			mp1->b_wptr += size;
			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
			mp = mp->b_cont;
		}
		/* so_oobmsg stays queued; we copy out of mp1 below. */
		mp = mp1;
	} else {
		/*
		 * Update the state indicating that the data has been consumed.
		 * Keep SS_OOBPEND set until data is consumed past the mark.
		 */
		so->so_oobmsg = NULL;
		/* XOR clears SS_HAVEOOBDATA and sets SS_HADOOBDATA at once. */
		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
	}
	ASSERT(so_verify_oobstate(so));
	mutex_exit(&so->so_lock);

	/*
	 * Copy the (detached or duplicated) oob data out to the user
	 * without holding so_lock, since uiomove may fault/sleep.
	 */
	error = 0;
	nmp = mp;
	while (nmp != NULL && uiop->uio_resid > 0) {
		ssize_t n = MBLKL(nmp);

		n = MIN(n, uiop->uio_resid);
		if (n > 0)
			error = uiomove(nmp->b_rptr, n,
			    UIO_READ, uiop);
		if (error)
			break;
		nmp = nmp->b_cont;
	}
	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
	freemsg(mp);
	return (error);
}
1227 1226  
/*
 * Allocate and initialize a sonode
 */
1231 1230  /* ARGSUSED */
1232 1231  struct sonode *
1233 1232  socket_sonode_create(struct sockparams *sp, int family, int type,
1234 1233      int protocol, int version, int sflags, int *errorp, struct cred *cr)
1235 1234  {
1236 1235          sonode_t *so;
1237 1236          int     kmflags;
1238 1237  
1239 1238          /*
1240 1239           * Choose the right set of sonodeops based on the upcall and
1241 1240           * down call version that the protocol has provided
1242 1241           */
1243 1242          if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
1244 1243              SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
1245 1244                  /*
1246 1245                   * mismatch
1247 1246                   */
1248 1247  #ifdef DEBUG
1249 1248                  cmn_err(CE_CONT, "protocol and socket module version mismatch");
1250 1249  #endif
1251 1250                  *errorp = EINVAL;
1252 1251                  return (NULL);
1253 1252          }
1254 1253  
1255 1254          kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1256 1255  
1257 1256          so = kmem_cache_alloc(socket_cache, kmflags);
1258 1257          if (so == NULL) {
1259 1258                  *errorp = ENOMEM;
1260 1259                  return (NULL);
1261 1260          }
1262 1261  
1263 1262          sonode_init(so, sp, family, type, protocol, &so_sonodeops);
1264 1263  
1265 1264          if (version == SOV_DEFAULT)
1266 1265                  version = so_default_version;
1267 1266  
1268 1267          so->so_version = (short)version;
1269 1268  
1270 1269          /*
1271 1270           * set the default values to be INFPSZ
1272 1271           * if a protocol desires it can change the value later
1273 1272           */
1274 1273          so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
1275 1274          so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
1276 1275          so->so_proto_props.sopp_maxpsz = INFPSZ;
1277 1276          so->so_proto_props.sopp_maxblk = INFPSZ;
1278 1277  
1279 1278          return (so);
1280 1279  }
1281 1280  
1282 1281  int
1283 1282  socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
1284 1283  {
1285 1284          int error = 0;
1286 1285  
1287 1286          if (pso != NULL) {
1288 1287                  /*
1289 1288                   * We have a passive open, so inherit basic state from
1290 1289                   * the parent (listener).
1291 1290                   *
1292 1291                   * No need to grab the new sonode's lock, since there is no
1293 1292                   * one that can have a reference to it.
1294 1293                   */
1295 1294                  mutex_enter(&pso->so_lock);
1296 1295  
1297 1296                  so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
1298 1297                  so->so_pgrp = pso->so_pgrp;
1299 1298                  so->so_rcvtimeo = pso->so_rcvtimeo;
1300 1299                  so->so_sndtimeo = pso->so_sndtimeo;
1301 1300                  so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
1302 1301                  /*
1303 1302                   * Make note of the socket level options. TCP and IP level
1304 1303                   * options are already inherited. We could do all this after
1305 1304                   * accept is successful but doing it here simplifies code and
1306 1305                   * no harm done for error case.
1307 1306                   */
1308 1307                  so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
1309 1308                      SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1310 1309                      SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1311 1310                  so->so_proto_props = pso->so_proto_props;
1312 1311                  so->so_mode = pso->so_mode;
1313 1312                  so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;
1314 1313  
1315 1314                  mutex_exit(&pso->so_lock);
1316 1315  
1317 1316                  /*
1318 1317                   * If the parent has any filters, try to inherit them.
1319 1318                   */
1320 1319                  if (pso->so_filter_active > 0 &&
1321 1320                      (error = sof_sonode_inherit_filters(so, pso)) != 0)
1322 1321                          return (error);
1323 1322  
1324 1323          } else {
1325 1324                  struct sockparams *sp = so->so_sockparams;
1326 1325                  sock_upcalls_t *upcalls_to_use;
1327 1326  
1328 1327                  /*
1329 1328                   * Attach automatic filters, if there are any.
1330 1329                   */
1331 1330                  if (!list_is_empty(&sp->sp_auto_filters) &&
1332 1331                      (error = sof_sonode_autoattach_filters(so, cr)) != 0)
1333 1332                          return (error);
1334 1333  
1335 1334                  /* OK to attach filters */
1336 1335                  so->so_state |= SS_FILOP_OK;
1337 1336  
1338 1337                  /*
1339 1338                   * Based on the version number select the right upcalls to
1340 1339                   * pass down. Currently we only have one version so choose
1341 1340                   * default
1342 1341                   */
1343 1342                  upcalls_to_use = &so_upcalls;
1344 1343  
1345 1344                  /* active open, so create a lower handle */
1346 1345                  so->so_proto_handle =
1347 1346                      sp->sp_smod_info->smod_proto_create_func(so->so_family,
1348 1347                      so->so_type, so->so_protocol, &so->so_downcalls,
1349 1348                      &so->so_mode, &error, flags, cr);
1350 1349  
1351 1350                  if (so->so_proto_handle == NULL) {
1352 1351                          ASSERT(error != 0);
1353 1352                          /*
1354 1353                           * To be safe; if a lower handle cannot be created, and
1355 1354                           * the proto does not give a reason why, assume there
1356 1355                           * was a lack of memory.
1357 1356                           */
1358 1357                          return ((error == 0) ? ENOMEM : error);
1359 1358                  }
1360 1359                  ASSERT(so->so_downcalls != NULL);
1361 1360                  ASSERT(so->so_downcalls->sd_send != NULL ||
1362 1361                      so->so_downcalls->sd_send_uio != NULL);
1363 1362                  if (so->so_downcalls->sd_recv_uio != NULL) {
1364 1363                          ASSERT(so->so_downcalls->sd_poll != NULL);
1365 1364                          so->so_pollev |= SO_POLLEV_ALWAYS;
1366 1365                  }
1367 1366  
1368 1367                  (*so->so_downcalls->sd_activate)(so->so_proto_handle,
1369 1368                      (sock_upper_handle_t)so, upcalls_to_use, 0, cr);
1370 1369  
1371 1370                  /* Wildcard */
1372 1371  
1373 1372                  /*
1374 1373                   * FIXME No need for this, the protocol can deal with it in
1375 1374                   * sd_create(). Should update ICMP.
1376 1375                   */
1377 1376                  if (so->so_protocol != so->so_sockparams->sp_protocol) {
1378 1377                          int protocol = so->so_protocol;
1379 1378                          int error;
1380 1379                          /*
1381 1380                           * Issue SO_PROTOTYPE setsockopt.
1382 1381                           */
1383 1382                          error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
1384 1383                              &protocol, (t_uscalar_t)sizeof (protocol), cr);
1385 1384                          if (error) {
1386 1385                                  (void) (*so->so_downcalls->sd_close)
1387 1386                                      (so->so_proto_handle, 0, cr);
1388 1387  
1389 1388                                  mutex_enter(&so->so_lock);
1390 1389                                  so_rcv_flush(so);
1391 1390                                  mutex_exit(&so->so_lock);
1392 1391                                  /*
1393 1392                                   * Setsockopt often fails with ENOPROTOOPT but
1394 1393                                   * socket() should fail with
1395 1394                                   * EPROTONOSUPPORT/EPROTOTYPE.
1396 1395                                   */
1397 1396                                  return (EPROTONOSUPPORT);
1398 1397                          }
1399 1398                  }
1400 1399          }
1401 1400  
1402 1401          if (uioasync.enabled)
1403 1402                  sod_sock_init(so);
1404 1403  
1405 1404          /* put an extra reference on the socket for the protocol */
1406 1405          VN_HOLD(SOTOV(so));
1407 1406  
1408 1407          return (0);
1409 1408  }
1410 1409  
1411 1410  /*
1412 1411   * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1413 1412   *         struct cred *cr, int32_t *rvalp)
1414 1413   *
1415 1414   * Handle ioctls that manipulate basic socket state; non-blocking,
1416 1415   * async, etc.
1417 1416   *
1418 1417   * Returns:
1419 1418   *   < 0  - ioctl was not handle
1420 1419   *  >= 0  - ioctl was handled, if > 0, then it is an errno
1421 1420   *
1422 1421   * Notes:
1423 1422   *   Assumes the standard receive buffer is used to obtain info for
1424 1423   *   NREAD.
1425 1424   */
1426 1425  /* ARGSUSED */
1427 1426  int
1428 1427  socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1429 1428      struct cred *cr, int32_t *rvalp)
1430 1429  {
1431 1430          switch (cmd) {
1432 1431          case SIOCSQPTR:
1433 1432                  /*
1434 1433                   * SIOCSQPTR is valid only when helper stream is created
1435 1434                   * by the protocol.
1436 1435                   */
1437 1436  
1438 1437                  return (EOPNOTSUPP);
1439 1438          case FIONBIO: {
1440 1439                  int32_t value;
1441 1440  
1442 1441                  if (so_copyin((void *)arg, &value, sizeof (int32_t),
1443 1442                      (mode & (int)FKIOCTL)))
1444 1443                          return (EFAULT);
1445 1444  
1446 1445                  mutex_enter(&so->so_lock);
1447 1446                  if (value) {
1448 1447                          so->so_state |= SS_NDELAY;
1449 1448                  } else {
1450 1449                          so->so_state &= ~SS_NDELAY;
1451 1450                  }
1452 1451                  mutex_exit(&so->so_lock);
1453 1452                  return (0);
1454 1453          }
1455 1454          case FIOASYNC: {
1456 1455                  int32_t value;
1457 1456  
1458 1457                  if (so_copyin((void *)arg, &value, sizeof (int32_t),
1459 1458                      (mode & (int)FKIOCTL)))
1460 1459                          return (EFAULT);
1461 1460  
1462 1461                  mutex_enter(&so->so_lock);
1463 1462  
1464 1463                  if (value) {
1465 1464                          /* Turn on SIGIO */
1466 1465                          so->so_state |= SS_ASYNC;
1467 1466                  } else {
1468 1467                          /* Turn off SIGIO */
1469 1468                          so->so_state &= ~SS_ASYNC;
1470 1469                  }
1471 1470                  mutex_exit(&so->so_lock);
1472 1471  
1473 1472                  return (0);
1474 1473          }
1475 1474  
1476 1475          case SIOCSPGRP:
1477 1476          case FIOSETOWN: {
1478 1477                  int error;
1479 1478                  pid_t pid;
1480 1479  
1481 1480                  if (so_copyin((void *)arg, &pid, sizeof (pid_t),
1482 1481                      (mode & (int)FKIOCTL)))
1483 1482                          return (EFAULT);
1484 1483  
1485 1484                  mutex_enter(&so->so_lock);
1486 1485                  error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
1487 1486                  mutex_exit(&so->so_lock);
1488 1487                  return (error);
1489 1488          }
1490 1489          case SIOCGPGRP:
1491 1490          case FIOGETOWN:
1492 1491                  if (so_copyout(&so->so_pgrp, (void *)arg,
1493 1492                      sizeof (pid_t), (mode & (int)FKIOCTL)))
1494 1493                          return (EFAULT);
1495 1494  
1496 1495                  return (0);
1497 1496          case SIOCATMARK: {
1498 1497                  int retval;
1499 1498  
1500 1499                  /*
1501 1500                   * Only protocols that support urgent data can handle ATMARK.
1502 1501                   */
1503 1502                  if ((so->so_mode & SM_EXDATA) == 0)
1504 1503                          return (EINVAL);
1505 1504  
1506 1505                  /*
1507 1506                   * If the protocol is maintaining its own buffer, then the
1508 1507                   * request must be passed down.
1509 1508                   */
1510 1509                  if (so->so_downcalls->sd_recv_uio != NULL)
1511 1510                          return (-1);
1512 1511  
1513 1512                  retval = (so->so_state & SS_RCVATMARK) != 0;
1514 1513  
1515 1514                  if (so_copyout(&retval, (void *)arg, sizeof (int),
1516 1515                      (mode & (int)FKIOCTL))) {
1517 1516                          return (EFAULT);
1518 1517                  }
1519 1518                  return (0);
1520 1519          }
1521 1520  
1522 1521          case FIONREAD: {
1523 1522                  int retval;
1524 1523  
1525 1524                  /*
1526 1525                   * If the protocol is maintaining its own buffer, then the
1527 1526                   * request must be passed down.
1528 1527                   */
1529 1528                  if (so->so_downcalls->sd_recv_uio != NULL)
1530 1529                          return (-1);
1531 1530  
1532 1531                  retval = MIN(so->so_rcv_queued, INT_MAX);
1533 1532  
1534 1533                  if (so_copyout(&retval, (void *)arg,
1535 1534                      sizeof (retval), (mode & (int)FKIOCTL))) {
1536 1535                          return (EFAULT);
1537 1536                  }
1538 1537                  return (0);
1539 1538          }
1540 1539  
1541 1540          case _I_GETPEERCRED: {
1542 1541                  int error = 0;
1543 1542  
1544 1543                  if ((mode & FKIOCTL) == 0)
1545 1544                          return (EINVAL);
1546 1545  
1547 1546                  mutex_enter(&so->so_lock);
1548 1547                  if ((so->so_mode & SM_CONNREQUIRED) == 0) {
1549 1548                          error = ENOTSUP;
1550 1549                  } else if ((so->so_state & SS_ISCONNECTED) == 0) {
1551 1550                          error = ENOTCONN;
1552 1551                  } else if (so->so_peercred != NULL) {
1553 1552                          k_peercred_t *kp = (k_peercred_t *)arg;
1554 1553                          kp->pc_cr = so->so_peercred;
1555 1554                          kp->pc_cpid = so->so_cpid;
1556 1555                          crhold(so->so_peercred);
1557 1556                  } else {
1558 1557                          error = EINVAL;
1559 1558                  }
1560 1559                  mutex_exit(&so->so_lock);
1561 1560                  return (error);
1562 1561          }
1563 1562          default:
1564 1563                  return (-1);
1565 1564          }
1566 1565  }
1567 1566  
1568 1567  /*
1569 1568   * Handle the I_NREAD STREAM ioctl.
1570 1569   */
1571 1570  static int
1572 1571  so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
1573 1572  {
1574 1573          size_t size = 0;
1575 1574          int retval;
1576 1575          int count = 0;
1577 1576          mblk_t *mp;
1578 1577          clock_t wakeup = drv_usectohz(10);
1579 1578  
1580 1579          if (so->so_downcalls == NULL ||
1581 1580              so->so_downcalls->sd_recv_uio != NULL)
1582 1581                  return (EINVAL);
1583 1582  
1584 1583          mutex_enter(&so->so_lock);
1585 1584          /* Wait for reader to get out of the way. */
1586 1585          while (so->so_flag & SOREADLOCKED) {
1587 1586                  /*
1588 1587                   * If reader is waiting for data, then there should be nothing
1589 1588                   * on the rcv queue.
1590 1589                   */
1591 1590                  if (so->so_rcv_wakeup)
1592 1591                          goto out;
1593 1592  
1594 1593                  /* Do a timed sleep, in case the reader goes to sleep. */
1595 1594                  (void) cv_reltimedwait(&so->so_read_cv, &so->so_lock, wakeup,
1596 1595                      TR_CLOCK_TICK);
1597 1596          }
1598 1597  
1599 1598          /*
1600 1599           * Since we are holding so_lock no new reader will come in, and the
1601 1600           * protocol will not be able to enqueue data. So it's safe to walk
1602 1601           * both rcv queues.
1603 1602           */
1604 1603          mp = so->so_rcv_q_head;
1605 1604          if (mp != NULL) {
1606 1605                  size = msgdsize(so->so_rcv_q_head);
1607 1606                  for (; mp != NULL; mp = mp->b_next)
1608 1607                          count++;
1609 1608          } else {
1610 1609                  /*
1611 1610                   * In case the processing list was empty, get the size of the
1612 1611                   * next msg in line.
1613 1612                   */
1614 1613                  size = msgdsize(so->so_rcv_head);
1615 1614          }
1616 1615  
1617 1616          for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
1618 1617                  count++;
1619 1618  out:
1620 1619          mutex_exit(&so->so_lock);
1621 1620  
1622 1621          /*
1623 1622           * Drop down from size_t to the "int" required by the
1624 1623           * interface.  Cap at INT_MAX.
1625 1624           */
1626 1625          retval = MIN(size, INT_MAX);
1627 1626          if (so_copyout(&retval, (void *)arg, sizeof (retval),
1628 1627              (mode & (int)FKIOCTL))) {
1629 1628                  return (EFAULT);
1630 1629          } else {
1631 1630                  *rvalp = count;
1632 1631                  return (0);
1633 1632          }
1634 1633  }
1635 1634  
1636 1635  /*
1637 1636   * Process STREAM ioctls.
1638 1637   *
1639 1638   * Returns:
1640 1639   *   < 0  - ioctl was not handle
1641 1640   *  >= 0  - ioctl was handled, if > 0, then it is an errno
1642 1641   */
1643 1642  int
1644 1643  socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1645 1644      struct cred *cr, int32_t *rvalp)
1646 1645  {
1647 1646          int retval;
1648 1647  
1649 1648          /* Only STREAM iotcls are handled here */
1650 1649          if ((cmd & 0xffffff00U) != STR)
1651 1650                  return (-1);
1652 1651  
1653 1652          switch (cmd) {
1654 1653          case I_CANPUT:
1655 1654                  /*
1656 1655                   * We return an error for I_CANPUT so that isastream(3C) will
1657 1656                   * not report the socket as being a STREAM.
1658 1657                   */
1659 1658                  return (EOPNOTSUPP);
1660 1659          case I_NREAD:
1661 1660                  /* Avoid doing a fallback for I_NREAD. */
1662 1661                  return (so_strioc_nread(so, arg, mode, rvalp));
1663 1662          case I_LOOK:
1664 1663                  /* Avoid doing a fallback for I_LOOK. */
1665 1664                  if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
1666 1665                      (mode & (int)FKIOCTL))) {
1667 1666                          return (EFAULT);
1668 1667                  }
1669 1668                  return (0);
1670 1669          default:
1671 1670                  break;
1672 1671          }
1673 1672  
1674 1673          /*
1675 1674           * Try to fall back to TPI, and if successful, reissue the ioctl.
1676 1675           */
1677 1676          if ((retval = so_tpi_fallback(so, cr)) == 0) {
1678 1677                  /* Reissue the ioctl */
1679 1678                  ASSERT(so->so_rcv_q_head == NULL);
1680 1679                  return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
1681 1680          } else {
1682 1681                  return (retval);
1683 1682          }
1684 1683  }
1685 1684  
1686 1685  /*
1687 1686   * This is called for all socket types to verify that the buffer size is large
1688 1687   * enough for the option, and if we can, handle the request as well. Most
1689 1688   * options will be forwarded to the protocol.
1690 1689   */
1691 1690  int
1692 1691  socket_getopt_common(struct sonode *so, int level, int option_name,
1693 1692      void *optval, socklen_t *optlenp, int flags)
1694 1693  {
1695 1694          if (level != SOL_SOCKET)
1696 1695                  return (-1);
1697 1696  
1698 1697          switch (option_name) {
1699 1698          case SO_ERROR:
1700 1699          case SO_DOMAIN:
1701 1700          case SO_TYPE:
1702 1701          case SO_ACCEPTCONN: {
1703 1702                  int32_t value;
1704 1703                  socklen_t optlen = *optlenp;
1705 1704  
1706 1705                  if (optlen < (t_uscalar_t)sizeof (int32_t)) {
1707 1706                          return (EINVAL);
1708 1707                  }
1709 1708  
1710 1709                  switch (option_name) {
1711 1710                  case SO_ERROR:
1712 1711                          mutex_enter(&so->so_lock);
1713 1712                          value = sogeterr(so, B_TRUE);
1714 1713                          mutex_exit(&so->so_lock);
1715 1714                          break;
1716 1715                  case SO_DOMAIN:
1717 1716                          value = so->so_family;
1718 1717                          break;
1719 1718                  case SO_TYPE:
1720 1719                          value = so->so_type;
1721 1720                          break;
1722 1721                  case SO_ACCEPTCONN:
1723 1722                          if (so->so_state & SS_ACCEPTCONN)
1724 1723                                  value = SO_ACCEPTCONN;
1725 1724                          else
1726 1725                                  value = 0;
1727 1726                          break;
1728 1727                  }
1729 1728  
1730 1729                  bcopy(&value, optval, sizeof (value));
1731 1730                  *optlenp = sizeof (value);
1732 1731  
1733 1732                  return (0);
1734 1733          }
1735 1734          case SO_SNDTIMEO:
1736 1735          case SO_RCVTIMEO: {
1737 1736                  clock_t value;
1738 1737                  socklen_t optlen = *optlenp;
1739 1738  
1740 1739                  if (get_udatamodel() == DATAMODEL_NONE ||
1741 1740                      get_udatamodel() == DATAMODEL_NATIVE) {
1742 1741                          if (optlen < sizeof (struct timeval))
1743 1742                                  return (EINVAL);
1744 1743                  } else {
1745 1744                          if (optlen < sizeof (struct timeval32))
1746 1745                                  return (EINVAL);
1747 1746                  }
1748 1747                  if (option_name == SO_RCVTIMEO)
1749 1748                          value = drv_hztousec(so->so_rcvtimeo);
1750 1749                  else
1751 1750                          value = drv_hztousec(so->so_sndtimeo);
1752 1751  
1753 1752                  if (get_udatamodel() == DATAMODEL_NONE ||
1754 1753                      get_udatamodel() == DATAMODEL_NATIVE) {
1755 1754                          ((struct timeval *)(optval))->tv_sec =
1756 1755                              value / (1000 * 1000);
1757 1756                          ((struct timeval *)(optval))->tv_usec =
1758 1757                              value % (1000 * 1000);
1759 1758                          *optlenp = sizeof (struct timeval);
1760 1759                  } else {
1761 1760                          ((struct timeval32 *)(optval))->tv_sec =
1762 1761                              value / (1000 * 1000);
1763 1762                          ((struct timeval32 *)(optval))->tv_usec =
1764 1763                              value % (1000 * 1000);
1765 1764                          *optlenp = sizeof (struct timeval32);
1766 1765                  }
1767 1766                  return (0);
1768 1767          }
1769 1768          case SO_DEBUG:
1770 1769          case SO_REUSEADDR:
1771 1770          case SO_KEEPALIVE:
1772 1771          case SO_DONTROUTE:
1773 1772          case SO_BROADCAST:
1774 1773          case SO_USELOOPBACK:
1775 1774          case SO_OOBINLINE:
1776 1775          case SO_SNDBUF:
1777 1776  #ifdef notyet
1778 1777          case SO_SNDLOWAT:
1779 1778          case SO_RCVLOWAT:
1780 1779  #endif /* notyet */
1781 1780          case SO_DGRAM_ERRIND: {
1782 1781                  socklen_t optlen = *optlenp;
1783 1782  
1784 1783                  if (optlen < (t_uscalar_t)sizeof (int32_t))
1785 1784                          return (EINVAL);
1786 1785                  break;
1787 1786          }
1788 1787          case SO_RCVBUF: {
1789 1788                  socklen_t optlen = *optlenp;
1790 1789  
1791 1790                  if (optlen < (t_uscalar_t)sizeof (int32_t))
1792 1791                          return (EINVAL);
1793 1792  
1794 1793                  if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
1795 1794                          /*
1796 1795                           * XXX If SO_RCVBUF has been set and this is an
1797 1796                           * XPG 4.2 application then do not ask the transport
1798 1797                           * since the transport might adjust the value and not
1799 1798                           * return exactly what was set by the application.
1800 1799                           * For non-XPG 4.2 application we return the value
1801 1800                           * that the transport is actually using.
1802 1801                           */
1803 1802                          *(int32_t *)optval = so->so_xpg_rcvbuf;
1804 1803                          *optlenp = sizeof (so->so_xpg_rcvbuf);
1805 1804                          return (0);
1806 1805                  }
1807 1806                  /*
1808 1807                   * If the option has not been set then get a default
1809 1808                   * value from the transport.
1810 1809                   */
1811 1810                  break;
1812 1811          }
1813 1812          case SO_LINGER: {
1814 1813                  socklen_t optlen = *optlenp;
1815 1814  
1816 1815                  if (optlen < (t_uscalar_t)sizeof (struct linger))
1817 1816                          return (EINVAL);
1818 1817                  break;
1819 1818          }
1820 1819          case SO_SND_BUFINFO: {
1821 1820                  socklen_t optlen = *optlenp;
1822 1821  
1823 1822                  if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
1824 1823                          return (EINVAL);
1825 1824                  ((struct so_snd_bufinfo *)(optval))->sbi_wroff =
1826 1825                      (so->so_proto_props).sopp_wroff;
1827 1826                  ((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
1828 1827                      (so->so_proto_props).sopp_maxblk;
1829 1828                  ((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
1830 1829                      (so->so_proto_props).sopp_maxpsz;
1831 1830                  ((struct so_snd_bufinfo *)(optval))->sbi_tail =
1832 1831                      (so->so_proto_props).sopp_tail;
1833 1832                  *optlenp = sizeof (struct so_snd_bufinfo);
1834 1833                  return (0);
1835 1834          }
1836 1835          case SO_SND_COPYAVOID: {
1837 1836                  sof_instance_t *inst;
1838 1837  
1839 1838                  /*
1840 1839                   * Avoid zero-copy if there is a filter with a data_out
1841 1840                   * callback. We could let the operation succeed, but then
1842 1841                   * the filter would have to copy the data anyway.
1843 1842                   */
1844 1843                  for (inst = so->so_filter_top; inst != NULL;
1845 1844                      inst = inst->sofi_next) {
1846 1845                          if (SOF_INTERESTED(inst, data_out))
1847 1846                                  return (EOPNOTSUPP);
1848 1847                  }
1849 1848                  break;
1850 1849          }
1851 1850  
1852 1851          default:
1853 1852                  break;
1854 1853          }
1855 1854  
1856 1855          /* Unknown Option */
1857 1856          return (-1);
1858 1857  }
1859 1858  
/*
 * Tear down a sonode and return its memory to the socket kmem cache.
 */
void
socket_sonode_destroy(struct sonode *so)
{
	sonode_fini(so);
	kmem_cache_free(socket_cache, so);
}
1866 1865  
/*
 * Wait for the zero-copy notification flag (STZCNOTIFY) to be set on
 * the socket.  Returns 0 on success, or EINTR if the socket is closing
 * or the wait was interrupted by a signal.
 */
int
so_zcopy_wait(struct sonode *so)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	while (!(so->so_copyflag & STZCNOTIFY)) {
		/* Bail out if the socket is going away. */
		if (so->so_state & SS_CLOSING) {
			mutex_exit(&so->so_lock);
			return (EINTR);
		}
		/* cv_wait_sig() returns 0 when interrupted by a signal. */
		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
			error = EINTR;
			break;
		}
	}
	/* Consume the notification. */
	so->so_copyflag &= ~STZCNOTIFY;
	mutex_exit(&so->so_lock);
	return (error);
}
1887 1886  
/*
 * Receive-side timer callback: clears the pending timer id and, if
 * data is queued, notifies interested consumers.
 */
void
so_timer_callback(void *arg)
{
	struct sonode *so = (struct sonode *)arg;

	mutex_enter(&so->so_lock);

	so->so_rcv_timer_tid = 0;
	if (so->so_rcv_queued > 0) {
		/*
		 * NOTE(review): so_notify_data() appears to be responsible
		 * for dropping so_lock, which is why there is no
		 * mutex_exit() on this path -- confirm against its
		 * definition.
		 */
		so_notify_data(so, so->so_rcv_queued);
	} else {
		mutex_exit(&so->so_lock);
	}
}
1902 1901  
1903 1902  #ifdef DEBUG
1904 1903  /*
1905 1904   * Verify that the length stored in so_rcv_queued and the length of data blocks
1906 1905   * queued is same.
1907 1906   */
1908 1907  static boolean_t
1909 1908  so_check_length(sonode_t *so)
1910 1909  {
1911 1910          mblk_t *mp = so->so_rcv_q_head;
1912 1911          int len = 0;
1913 1912  
1914 1913          ASSERT(MUTEX_HELD(&so->so_lock));
1915 1914  
1916 1915          if (mp != NULL) {
1917 1916                  len = msgdsize(mp);
1918 1917                  while ((mp = mp->b_next) != NULL)
1919 1918                          len += msgdsize(mp);
1920 1919          }
1921 1920          mp = so->so_rcv_head;
1922 1921          if (mp != NULL) {
1923 1922                  len += msgdsize(mp);
1924 1923                  while ((mp = mp->b_next) != NULL)
1925 1924                          len += msgdsize(mp);
1926 1925          }
1927 1926          return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
1928 1927  }
1929 1928  #endif
1930 1929  
/*
 * Return the socket module version recorded in the sockparams entry.
 */
int
so_get_mod_version(struct sockparams *sp)
{
	ASSERT(sp != NULL && sp->sp_smod_info != NULL);
	return (sp->sp_smod_info->smod_version);
}
1937 1936  
1938 1937  /*
1939 1938   * so_start_fallback()
1940 1939   *
1941 1940   * Block new socket operations from coming in, and wait for active operations
1942 1941   * to complete. Threads that are sleeping will be woken up so they can get
1943 1942   * out of the way.
1944 1943   *
1945 1944   * The caller must be a reader on so_fallback_rwlock.
1946 1945   */
1947 1946  static boolean_t
1948 1947  so_start_fallback(struct sonode *so)
1949 1948  {
1950 1949          ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
1951 1950  
1952 1951          mutex_enter(&so->so_lock);
1953 1952          if (so->so_state & SS_FALLBACK_PENDING) {
1954 1953                  mutex_exit(&so->so_lock);
1955 1954                  return (B_FALSE);
1956 1955          }
1957 1956          so->so_state |= SS_FALLBACK_PENDING;
1958 1957          /*
1959 1958           * Poke all threads that might be sleeping. Any operation that comes
1960 1959           * in after the cv_broadcast will observe the fallback pending flag
1961 1960           * which cause the call to return where it would normally sleep.
1962 1961           */
1963 1962          cv_broadcast(&so->so_state_cv);         /* threads in connect() */
1964 1963          cv_broadcast(&so->so_rcv_cv);           /* threads in recvmsg() */
1965 1964          cv_broadcast(&so->so_snd_cv);           /* threads in sendmsg() */
1966 1965          mutex_enter(&so->so_acceptq_lock);
1967 1966          cv_broadcast(&so->so_acceptq_cv);       /* threads in accept() */
1968 1967          mutex_exit(&so->so_acceptq_lock);
1969 1968          mutex_exit(&so->so_lock);
1970 1969  
1971 1970          /*
1972 1971           * The main reason for the rw_tryupgrade call is to provide
1973 1972           * observability during the fallback process. We want to
1974 1973           * be able to see if there are pending operations.
1975 1974           */
1976 1975          if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
1977 1976                  /*
1978 1977                   * It is safe to drop and reaquire the fallback lock, because
1979 1978                   * we are guaranteed that another fallback cannot take place.
1980 1979                   */
1981 1980                  rw_exit(&so->so_fallback_rwlock);
1982 1981                  DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
1983 1982                  rw_enter(&so->so_fallback_rwlock, RW_WRITER);
1984 1983                  DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
1985 1984          }
1986 1985  
1987 1986          return (B_TRUE);
1988 1987  }
1989 1988  
1990 1989  /*
1991 1990   * so_end_fallback()
1992 1991   *
1993 1992   * Allow socket opertions back in.
1994 1993   *
1995 1994   * The caller must be a writer on so_fallback_rwlock.
1996 1995   */
1997 1996  static void
1998 1997  so_end_fallback(struct sonode *so)
1999 1998  {
2000 1999          ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
2001 2000  
2002 2001          mutex_enter(&so->so_lock);
2003 2002          so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
2004 2003          mutex_exit(&so->so_lock);
2005 2004  
2006 2005          rw_downgrade(&so->so_fallback_rwlock);
2007 2006  }
2008 2007  
2009 2008  /*
2010 2009   * so_quiesced_cb()
2011 2010   *
2012 2011   * Callback passed to the protocol during fallback. It is called once
2013 2012   * the endpoint is quiescent.
2014 2013   *
2015 2014   * No requests from the user, no notifications from the protocol, so it
2016 2015   * is safe to synchronize the state. Data can also be moved without
2017 2016   * risk for reordering.
2018 2017   *
2019 2018   * We do not need to hold so_lock, since there can be only one thread
2020 2019   * operating on the sonode.
2021 2020   */
2022 2021  static mblk_t *
2023 2022  so_quiesced_cb(sock_upper_handle_t sock_handle, sock_quiesce_arg_t *arg,
2024 2023      struct T_capability_ack *tcap,
2025 2024      struct sockaddr *laddr, socklen_t laddrlen,
2026 2025      struct sockaddr *faddr, socklen_t faddrlen, short opts)
2027 2026  {
2028 2027          struct sonode *so = (struct sonode *)sock_handle;
2029 2028          boolean_t atmark;
2030 2029          mblk_t *retmp = NULL, **tailmpp = &retmp;
2031 2030  
2032 2031          if (tcap != NULL)
2033 2032                  sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen,
2034 2033                      opts);
2035 2034  
2036 2035          /*
2037 2036           * Some protocols do not quiece the data path during fallback. Once
2038 2037           * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
2039 2038           * fail and the protocol is responsible for saving the data for later
2040 2039           * delivery (i.e., once the fallback has completed).
2041 2040           */
2042 2041          mutex_enter(&so->so_lock);
2043 2042          so->so_state |= SS_FALLBACK_DRAIN;
2044 2043          SOCKET_TIMER_CANCEL(so);
2045 2044          mutex_exit(&so->so_lock);
2046 2045  
2047 2046          if (so->so_rcv_head != NULL) {
2048 2047                  if (so->so_rcv_q_last_head == NULL)
2049 2048                          so->so_rcv_q_head = so->so_rcv_head;
2050 2049                  else
2051 2050                          so->so_rcv_q_last_head->b_next = so->so_rcv_head;
2052 2051                  so->so_rcv_q_last_head = so->so_rcv_last_head;
2053 2052          }
2054 2053  
2055 2054          atmark = (so->so_state & SS_RCVATMARK) != 0;
2056 2055          /*
2057 2056           * Clear any OOB state having to do with pending data. The TPI
2058 2057           * code path will set the appropriate oob state when we move the
2059 2058           * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
2060 2059           * data has already been consumed.
2061 2060           */
2062 2061          so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);
2063 2062  
2064 2063          ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);
2065 2064  
2066 2065          /*
2067 2066           * Move data to the STREAM head.
2068 2067           */
2069 2068          while (so->so_rcv_q_head != NULL) {
2070 2069                  mblk_t *mp = so->so_rcv_q_head;
2071 2070                  size_t mlen = msgdsize(mp);
2072 2071  
2073 2072                  so->so_rcv_q_head = mp->b_next;
2074 2073                  mp->b_next = NULL;
2075 2074                  mp->b_prev = NULL;
2076 2075  
2077 2076                  /*
2078 2077                   * Send T_EXDATA_IND if we are at the oob mark.
2079 2078                   */
2080 2079                  if (atmark) {
2081 2080                          struct T_exdata_ind *tei;
2082 2081                          mblk_t *mp1 = arg->soqa_exdata_mp;
2083 2082  
2084 2083                          arg->soqa_exdata_mp = NULL;
2085 2084                          ASSERT(mp1 != NULL);
2086 2085                          mp1->b_datap->db_type = M_PROTO;
2087 2086                          tei = (struct T_exdata_ind *)mp1->b_rptr;
2088 2087                          tei->PRIM_type = T_EXDATA_IND;
2089 2088                          tei->MORE_flag = 0;
2090 2089                          mp1->b_wptr = (uchar_t *)&tei[1];
2091 2090  
2092 2091                          if (IS_SO_OOB_INLINE(so)) {
2093 2092                                  mp1->b_cont = mp;
2094 2093                          } else {
2095 2094                                  ASSERT(so->so_oobmsg != NULL);
2096 2095                                  mp1->b_cont = so->so_oobmsg;
2097 2096                                  so->so_oobmsg = NULL;
2098 2097  
2099 2098                                  /* process current mp next time around */
2100 2099                                  mp->b_next = so->so_rcv_q_head;
2101 2100                                  so->so_rcv_q_head = mp;
2102 2101                                  mlen = 0;
2103 2102                          }
2104 2103                          mp = mp1;
2105 2104  
2106 2105                          /* we have consumed the oob mark */
2107 2106                          atmark = B_FALSE;
2108 2107                  } else if (so->so_oobmark > 0) {
2109 2108                          /*
2110 2109                           * Check if the OOB mark is within the current
2111 2110                           * mblk chain. In that case we have to split it up.
2112 2111                           */
2113 2112                          if (so->so_oobmark < mlen) {
2114 2113                                  mblk_t *urg_mp = mp;
2115 2114  
2116 2115                                  atmark = B_TRUE;
2117 2116                                  mp = NULL;
2118 2117                                  mlen = so->so_oobmark;
2119 2118  
2120 2119                                  /*
2121 2120                                   * It is assumed that the OOB mark does
2122 2121                                   * not land within a mblk.
2123 2122                                   */
2124 2123                                  do {
2125 2124                                          so->so_oobmark -= MBLKL(urg_mp);
2126 2125                                          mp = urg_mp;
2127 2126                                          urg_mp = urg_mp->b_cont;
2128 2127                                  } while (so->so_oobmark > 0);
2129 2128                                  mp->b_cont = NULL;
2130 2129                                  if (urg_mp != NULL) {
2131 2130                                          urg_mp->b_next = so->so_rcv_q_head;
2132 2131                                          so->so_rcv_q_head = urg_mp;
2133 2132                                  }
2134 2133                          } else {
2135 2134                                  so->so_oobmark -= mlen;
2136 2135                                  if (so->so_oobmark == 0)
2137 2136                                          atmark = B_TRUE;
2138 2137                          }
2139 2138                  }
2140 2139  
2141 2140                  /*
2142 2141                   * Queue data on the STREAM head.
2143 2142                   */
2144 2143                  so->so_rcv_queued -= mlen;
2145 2144                  *tailmpp = mp;
2146 2145                  tailmpp = &mp->b_next;
2147 2146          }
2148 2147          so->so_rcv_head = NULL;
2149 2148          so->so_rcv_last_head = NULL;
2150 2149          so->so_rcv_q_head = NULL;
2151 2150          so->so_rcv_q_last_head = NULL;
2152 2151  
2153 2152          /*
2154 2153           * Check if the oob byte is at the end of the data stream, or if the
2155 2154           * oob byte has not yet arrived. In the latter case we have to send a
2156 2155           * SIGURG and a mark indicator to the STREAM head. The mark indicator
2157 2156           * is needed to guarantee correct behavior for SIOCATMARK. See block
2158 2157           * comment in socktpi.h for more details.
2159 2158           */
2160 2159          if (atmark || so->so_oobmark > 0) {
2161 2160                  mblk_t *mp;
2162 2161  
2163 2162                  if (atmark && so->so_oobmsg != NULL) {
2164 2163                          struct T_exdata_ind *tei;
2165 2164  
2166 2165                          mp = arg->soqa_exdata_mp;
2167 2166                          arg->soqa_exdata_mp = NULL;
2168 2167                          ASSERT(mp != NULL);
2169 2168                          mp->b_datap->db_type = M_PROTO;
2170 2169                          tei = (struct T_exdata_ind *)mp->b_rptr;
2171 2170                          tei->PRIM_type = T_EXDATA_IND;
2172 2171                          tei->MORE_flag = 0;
2173 2172                          mp->b_wptr = (uchar_t *)&tei[1];
2174 2173  
2175 2174                          mp->b_cont = so->so_oobmsg;
2176 2175                          so->so_oobmsg = NULL;
2177 2176  
2178 2177                          *tailmpp = mp;
2179 2178                          tailmpp = &mp->b_next;
2180 2179                  } else {
2181 2180                          /* Send up the signal */
2182 2181                          mp = arg->soqa_exdata_mp;
2183 2182                          arg->soqa_exdata_mp = NULL;
2184 2183                          ASSERT(mp != NULL);
2185 2184                          DB_TYPE(mp) = M_PCSIG;
2186 2185                          *mp->b_wptr++ = (uchar_t)SIGURG;
2187 2186                          *tailmpp = mp;
2188 2187                          tailmpp = &mp->b_next;
2189 2188  
2190 2189                          /* Send up the mark indicator */
2191 2190                          mp = arg->soqa_urgmark_mp;
2192 2191                          arg->soqa_urgmark_mp = NULL;
2193 2192                          mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
2194 2193                          *tailmpp = mp;
2195 2194                          tailmpp = &mp->b_next;
2196 2195  
2197 2196                          so->so_oobmark = 0;
2198 2197                  }
2199 2198          }
2200 2199          ASSERT(so->so_oobmark == 0);
2201 2200          ASSERT(so->so_rcv_queued == 0);
2202 2201  
2203 2202          return (retmp);
2204 2203  }
2205 2204  
#ifdef DEBUG
/*
 * Do an integrity check of the sonode. This should be done if a
 * fallback fails after sonode has initially been converted to use
 * TPI and subsequently have to be reverted.
 *
 * Failure to pass the integrity check will panic the system.
 */
void
so_integrity_check(struct sonode *cur, struct sonode *orig)
{
	VERIFY(cur->so_vnode == orig->so_vnode);
	VERIFY(cur->so_ops == orig->so_ops);
	/*
	 * For so_state we can only VERIFY the state flags in CHECK_STATE.
	 * The other state flags might be affected by a notification from the
	 * protocol.
	 */
#define	CHECK_STATE	(SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
	SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
	SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
	VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
	    (orig->so_state & CHECK_STATE));
	VERIFY(cur->so_mode == orig->so_mode);
	VERIFY(cur->so_flag == orig->so_flag);
	VERIFY(cur->so_count == orig->so_count);
	/* Cannot VERIFY so_proto_connid; proto can update it */
	VERIFY(cur->so_sockparams == orig->so_sockparams);
	/* An error might have been recorded, but it cannot be lost */
	VERIFY(cur->so_error != 0 || orig->so_error == 0);
	VERIFY(cur->so_family == orig->so_family);
	VERIFY(cur->so_type == orig->so_type);
	VERIFY(cur->so_protocol == orig->so_protocol);
	VERIFY(cur->so_version == orig->so_version);
	/* New conns might have arrived, but none should have been lost */
	VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
	VERIFY(list_head(&cur->so_acceptq_list) ==
	    list_head(&orig->so_acceptq_list));
	VERIFY(cur->so_backlog == orig->so_backlog);
	/* New OOB might have arrived, but the mark should not have been lost */
	VERIFY(cur->so_oobmark >= orig->so_oobmark);
	/* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
	VERIFY(cur->so_pgrp == orig->so_pgrp);
	VERIFY(cur->so_peercred == orig->so_peercred);
	VERIFY(cur->so_cpid == orig->so_cpid);
	VERIFY(cur->so_zoneid == orig->so_zoneid);
	/* New data might have arrived, but none should have been lost */
	VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
	VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
	VERIFY(cur->so_rcv_head == orig->so_rcv_head);
	VERIFY(cur->so_proto_handle == orig->so_proto_handle);
	VERIFY(cur->so_downcalls == orig->so_downcalls);
	/* Cannot VERIFY so_proto_props; they can be updated by proto */
}
#endif
2261 2260  
/*
 * so_tpi_fallback()
 *
 * This is the fallback initiation routine; things start here.
 *
 * Basic strategy:
 *   o Block new socket operations from coming in
 *   o Allocate/initiate info needed by TPI
 *   o Quiesce the connection, at which point we sync
 *     state and move data
 *   o Change operations (sonodeops) associated with the socket
 *   o Unblock threads waiting for the fallback to finish
 *
 * Returns 0 on success; otherwise an errno value: EINVAL if the socket
 * cannot fall back (active filters, unsupported family, or no TPI device/
 * fallback function), EAGAIN if fallback could not be started, or the error
 * reported by sockparams_hold_ephemeral_bydev(), sotpi_convert_sonode(), or
 * the protocol's fallback function.
 */
int
so_tpi_fallback(struct sonode *so, struct cred *cr)
{
	int error;
	queue_t *q;
	struct sockparams *sp;
	struct sockparams *newsp = NULL;
	so_proto_fallback_func_t fbfunc;
	const char *devpath;
	boolean_t direct;
	struct sonode *nso;
	sock_quiesce_arg_t arg = { NULL, NULL };
#ifdef DEBUG
	struct sonode origso;
#endif
	error = 0;
	sp = so->so_sockparams;
	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;

	/*
	 * Cannot fallback if the socket has active filters.
	 */
	if (so->so_filter_active > 0)
		return (EINVAL);

	switch (so->so_family) {
	case AF_INET:
		devpath = sp->sp_smod_info->smod_fallback_devpath_v4;
		break;
	case AF_INET6:
		devpath = sp->sp_smod_info->smod_fallback_devpath_v6;
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Fallback can only happen if the socket module has a TPI device
	 * and fallback function.
	 */
	if (devpath == NULL || fbfunc == NULL)
		return (EINVAL);

	/*
	 * Initiate fallback; upon success we know that no new requests
	 * will come in from the user.
	 */
	if (!so_start_fallback(so))
		return (EAGAIN);
#ifdef DEBUG
	/*
	 * Make a copy of the sonode in case we need to make an integrity
	 * check later on.
	 */
	bcopy(so, &origso, sizeof (*so));
#endif

	sp->sp_stats.sps_nfallback.value.ui64++;

	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
	    so->so_protocol, devpath, KM_SLEEP, &error);
	if (error != 0)
		goto out;

	if (so->so_direct != NULL) {
		sodirect_t *sodp = so->so_direct;
		mutex_enter(&so->so_lock);

		/* Disable sodirect before handing the socket to TPI */
		so->so_direct->sod_enabled = B_FALSE;
		so->so_state &= ~SS_SODIRECT;
		ASSERT(sodp->sod_uioafh == NULL);
		mutex_exit(&so->so_lock);
	}

	/* Turn sonode into a TPI socket */
	error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
	if (error != 0)
		goto out;
	/*
	 * When it comes to urgent data we have two cases to deal with;
	 * (1) The oob byte has already arrived, or (2) the protocol has
	 * notified that oob data is pending, but it has not yet arrived.
	 *
	 * For (1) all we need to do is send a T_EXDATA_IND to indicate were
	 * in the byte stream the oob byte is. For (2) we have to send a
	 * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
	 * the oob byte will be the next byte from the protocol.
	 *
	 * So in the worst case we need two mblks, one for the signal, another
	 * for mark indication. In that case we use the exdata_mp for the sig.
	 */
	arg.soqa_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind),
	    BPRI_MED, STR_NOSIG, NULL);
	arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);

	/*
	 * Now tell the protocol to start using TPI. so_quiesced_cb will be
	 * called once it's safe to synchronize state.
	 */
	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
	error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb,
	    &arg);
	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);

	if (error != 0) {
		/* protocol was unable to do a fallback, revert the sonode */
		sotpi_revert_sonode(so, cr);
		goto out;
	}

	/*
	 * Walk the accept queue and notify the proto that they should
	 * fall back to TPI. The protocol will send up the T_CONN_IND.
	 */
	nso = list_head(&so->so_acceptq_list);
	while (nso != NULL) {
		int rval;
		struct sonode *next;

		/*
		 * Replenish the pre-allocated mblks; so_quiesced_cb may have
		 * consumed them on the previous iteration.
		 */
		if (arg.soqa_exdata_mp == NULL) {
			arg.soqa_exdata_mp =
			    allocb_wait(sizeof (struct T_exdata_ind),
			    BPRI_MED, STR_NOSIG, NULL);
		}
		if (arg.soqa_urgmark_mp == NULL) {
			arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED,
			    STR_NOSIG, NULL);
		}

		DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
		rval = (*fbfunc)(nso->so_proto_handle, NULL, direct,
		    so_quiesced_cb, &arg);
		DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
		if (rval != 0) {
			/* Abort the connection */
			zcmn_err(getzoneid(), CE_WARN,
			    "Failed to convert socket in accept queue to TPI. "
			    "Pid = %d\n", curproc->p_pid);
			next = list_next(&so->so_acceptq_list, nso);
			list_remove(&so->so_acceptq_list, nso);
			so->so_acceptq_len--;

			(void) socket_close(nso, 0, CRED());
			socket_destroy(nso);
			nso = next;
		} else {
			nso = list_next(&so->so_acceptq_list, nso);
		}
	}

	/*
	 * Now flush the acceptq, this will destroy all sockets. They will
	 * be recreated in sotpi_accept().
	 */
	so_acceptq_flush(so, B_FALSE);

	mutex_enter(&so->so_lock);
	so->so_state |= SS_FALLBACK_COMP;
	mutex_exit(&so->so_lock);

	/*
	 * Swap the sonode ops. Socket operations that come in once this
	 * is done will proceed without blocking.
	 */
	so->so_ops = &sotpi_sonodeops;

	/*
	 * Wake up any threads stuck in poll. This is needed since the poll
	 * head changes when the fallback happens (moves from the sonode to
	 * the STREAMS head).
	 */
	pollwakeup(&so->so_poll_list, POLLERR);

	/*
	 * When this non-STREAM socket was created we placed an extra ref on
	 * the associated vnode to support asynchronous close. Drop that ref
	 * here.
	 */
	ASSERT(SOTOV(so)->v_count >= 2);
	VN_RELE(SOTOV(so));
out:
	so_end_fallback(so);

	if (error != 0) {
#ifdef DEBUG
		so_integrity_check(so, &origso);
#endif
		zcmn_err(getzoneid(), CE_WARN,
		    "Failed to convert socket to TPI (err=%d). Pid = %d\n",
		    error, curproc->p_pid);
		if (newsp != NULL)
			SOCKPARAMS_DEC_REF(newsp);
	}
	/* Free any pre-allocated mblks that were not consumed */
	if (arg.soqa_exdata_mp != NULL)
		freemsg(arg.soqa_exdata_mp);
	if (arg.soqa_urgmark_mp != NULL)
		freemsg(arg.soqa_urgmark_mp);

	return (error);
}
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX