Print this page
    
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/tcp/tcp_socket.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_socket.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
       24 + * Copyright 2015 Joyent, Inc.
  24   25   */
  25   26  
  26   27  /* This file contains all TCP kernel socket related functions. */
  27   28  
  28   29  #include <sys/types.h>
  29   30  #include <sys/strlog.h>
  30   31  #include <sys/policy.h>
  31   32  #include <sys/sockio.h>
  32   33  #include <sys/strsubr.h>
  33   34  #include <sys/strsun.h>
  34   35  #include <sys/squeue_impl.h>
  35   36  #include <sys/squeue.h>
  36   37  #define _SUN_TPI_VERSION 2
  37   38  #include <sys/tihdr.h>
  38   39  #include <sys/timod.h>
  39   40  #include <sys/tpicommon.h>
  40   41  #include <sys/socketvar.h>
  41   42  
  42   43  #include <inet/common.h>
  43   44  #include <inet/proto_set.h>
  44   45  #include <inet/ip.h>
  45   46  #include <inet/tcp.h>
  46   47  #include <inet/tcp_impl.h>
  47   48  
           /*
            * Forward declarations of the file-local socket downcall handlers.
            * Each of them is installed in the sock_tcp_downcalls vector that
            * follows these prototypes.
            */
  48   49  static void     tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
  49   50                      sock_upcalls_t *, int, cred_t *);
  50   51  static int      tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
  51   52                      sock_upper_handle_t, cred_t *);
  52   53  static int      tcp_bind(sock_lower_handle_t, struct sockaddr *,
  53   54                      socklen_t, cred_t *);
  54   55  static int      tcp_listen(sock_lower_handle_t, int, cred_t *);
  55   56  static int      tcp_connect(sock_lower_handle_t, const struct sockaddr *,
  56   57                      socklen_t, sock_connid_t *, cred_t *);
  57   58  static int      tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
  58   59                      socklen_t *, cred_t *);
  59   60  static int      tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
  60   61                      socklen_t *, cred_t *);
  61   62  static int      tcp_getsockopt(sock_lower_handle_t, int, int, void *,
  62   63                      socklen_t *, cred_t *);
  63   64  static int      tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
  64   65                      socklen_t, cred_t *);
  65   66  static int      tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
  66   67                      cred_t *);
  67   68  static int      tcp_shutdown(sock_lower_handle_t, int, cred_t *);
  68   69  static void     tcp_clr_flowctrl(sock_lower_handle_t);
  69   70  static int      tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
  70   71                      cred_t *);
  71   72  static int      tcp_close(sock_lower_handle_t, int, cred_t *);
  72   73  
           /*
            * Downcall vector for TCP sockets.  The initializer is positional, so
            * the order of the entries must match the member order of
            * sock_downcalls_t (declared outside this file).  The three NULL slots
            * are downcalls for which this file provides no handler.
            */
  73   74  sock_downcalls_t sock_tcp_downcalls = {
  74   75          tcp_activate,
  75   76          tcp_accept,
  76   77          tcp_bind,
  77   78          tcp_listen,
  78   79          tcp_connect,
  79   80          tcp_getpeername,
  80   81          tcp_getsockname,
  81   82          tcp_getsockopt,
  82   83          tcp_setsockopt,
  83   84          tcp_sendmsg,
  84   85          NULL,
  85   86          NULL,
  86   87          NULL,
  87   88          tcp_shutdown,
  88   89          tcp_clr_flowctrl,
  89   90          tcp_ioctl,
  90   91          tcp_close,
  91   92  };
  92   93  
           /*
            * Activation downcall: record the upper (socket) handle and the upcall
            * vector on the conn, then report TCP's socket properties (receive
            * high/low water marks, packet-size limits, receive timer and
            * threshold, maximum address length) through su_set_proto_props.
            *
            * `flags' is not used; `cr' is only checked by ASSERT.
            */
  93   94  /* ARGSUSED */
  94   95  static void
  95   96  tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
  96   97      sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
  97   98  {
  98   99          conn_t *connp = (conn_t *)proto_handle;
  99  100          struct sock_proto_props sopp;
 100  101          extern struct module_info tcp_rinfo;
 101  102  
 102  103          ASSERT(connp->conn_upper_handle == NULL);
 103  104  
 104  105          /* All Solaris components should pass a cred for this operation. */
 105  106          ASSERT(cr != NULL);
 106  107  
                   /* sopp_flags names exactly the sopp fields filled in below. */
 107  108          sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
 108  109              SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
 109  110              SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
 110  111  
 111  112          sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
 112  113          sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
 113  114          sopp.sopp_maxpsz = INFPSZ;
 114  115          sopp.sopp_maxblk = INFPSZ;
 115  116          sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
 116  117          sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
 117  118          sopp.sopp_maxaddrlen = sizeof (sin6_t);
                   /* A module minimum packet size of 1 is reported as 0. */
 118  119          sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
 119  120              tcp_rinfo.mi_minpsz;
 120  121  
 121  122          connp->conn_upcalls = sock_upcalls;
 122  123          connp->conn_upper_handle = sock_handle;
 123  124  
 124  125          ASSERT(connp->conn_rcvbuf != 0 &&
 125  126              connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
 126  127          (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
 127  128  }
 128  129  
           /*
            * Accept downcall: attach the eager connection `eproto_handle' to the
            * socket `sock_handle', giving it the listener's upcall vector, unlink
            * it from the listener's eager queue and drop one reference on the
            * listener's conn.
            *
            * `lproto_handle' is not consulted; the listener is taken from the
            * eager itself (see the KSSL comment in the body).  `cr' is unused.
            *
            * Returns ECONNABORTED when the eager's state is below
            * TCPS_ESTABLISHED at the time of return, otherwise 0.
            */
 129  130  /*ARGSUSED*/
 130  131  static int
 131  132  tcp_accept(sock_lower_handle_t lproto_handle,
 132  133      sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
 133  134      cred_t *cr)
 134  135  {
 135  136          conn_t *lconnp, *econnp;
 136  137          tcp_t *listener, *eager;
 137  138  
 138  139          /*
 139  140           * KSSL can move a socket from one listener to another, in which
 140  141           * case `lproto_handle' points to the new listener. To ensure that
 141  142           * the original listener is used the information is obtained from
 142  143           * the eager.
 143  144           */
 144  145          econnp = (conn_t *)eproto_handle;
 145  146          eager = econnp->conn_tcp;
 146  147          ASSERT(IPCL_IS_NONSTR(econnp));
 147  148          ASSERT(eager->tcp_listener != NULL);
 148  149          listener = eager->tcp_listener;
 149  150          lconnp = (conn_t *)listener->tcp_connp;
 150  151          ASSERT(listener->tcp_state == TCPS_LISTEN);
 151  152          ASSERT(lconnp->conn_upper_handle != NULL);
 152  153  
 153  154          /*
 154  155           * It is possible for the accept thread to race with the thread that
 155  156           * made the su_newconn upcall in tcp_newconn_notify. Both
 156  157           * tcp_newconn_notify and tcp_accept require that conn_upper_handle
 157  158           * and conn_upcalls be set before returning, so they both write to
 158  159           * them. However, we're guaranteed that the value written is the same
 159  160           * for both threads.
 160  161           */
 161  162          ASSERT(econnp->conn_upper_handle == NULL ||
 162  163              econnp->conn_upper_handle == sock_handle);
 163  164          ASSERT(econnp->conn_upcalls == NULL ||
 164  165              econnp->conn_upcalls == lconnp->conn_upcalls);
 165  166          econnp->conn_upper_handle = sock_handle;
 166  167          econnp->conn_upcalls = lconnp->conn_upcalls;
 167  168  
 168  169          ASSERT(econnp->conn_netstack ==
 169  170              listener->tcp_connp->conn_netstack);
 170  171          ASSERT(eager->tcp_tcps == listener->tcp_tcps);
 171  172  
 172  173          /*
 173  174           * We should have a minimum of 2 references on the conn at this
 174  175           * point. One for TCP and one for the newconn notification
 175  176           * (which is now taken over by IP). In the normal case we would
 176  177           * also have another reference (making a total of 3) for the conn
 177  178           * being in the classifier hash list. However the eager could have
 178  179           * received an RST subsequently and tcp_closei_local could have
 179  180           * removed the eager from the classifier hash list, hence we can't
 180  181           * assert that reference.
 181  182           */
 182  183          ASSERT(econnp->conn_ref >= 2);
 183  184  
                   /* The eager queue is only modified under tcp_eager_lock. */
 184  185          mutex_enter(&listener->tcp_eager_lock);
 185  186          /*
 186  187           * Non-STREAMS listeners never defer the notification of new
 187  188           * connections.
 188  189           */
 189  190          ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
 190  191          tcp_eager_unlink(eager);
 191  192          mutex_exit(&listener->tcp_eager_lock);
 192  193          CONN_DEC_REF(listener->tcp_connp);
 193  194  
 194  195          return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
 195  196  }
 196  197  
           /*
            * Bind downcall.  A NULL `sa' requests an unbind, which is only
            * performed while the endpoint's state is below TCPS_LISTEN (EINVAL
            * otherwise); a non-NULL `sa' is passed to tcp_do_bind().  The work is
            * done between squeue_synch_enter() and squeue_synch_exit().
            *
            * Returns ENOSR if the squeue cannot be entered.  A negative result
            * from the helpers is translated after leaving the squeue:
            * -TOUTSTATE becomes EINVAL, any other negative value goes through
            * proto_tlitosyserr().  Non-negative results are returned unchanged.
            */
 197  198  static int
 198  199  tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
 199  200      socklen_t len, cred_t *cr)
 200  201  {
 201  202          int             error;
 202  203          conn_t          *connp = (conn_t *)proto_handle;
 203  204  
 204  205          /* All Solaris components should pass a cred for this operation. */
 205  206          ASSERT(cr != NULL);
 206  207          ASSERT(connp->conn_upper_handle != NULL);
 207  208  
 208  209          error = squeue_synch_enter(connp, NULL);
 209  210          if (error != 0) {
 210  211                  /* failed to enter */
 211  212                  return (ENOSR);
 212  213          }
 213  214  
 214  215          /* binding to a NULL address really means unbind */
 215  216          if (sa == NULL) {
 216  217                  if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
 217  218                          error = tcp_do_unbind(connp);
 218  219                  else
 219  220                          error = EINVAL;
 220  221          } else {
 221  222                  error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
 222  223          }
 223  224  
 224  225          squeue_synch_exit(connp);
 225  226  
                   /* Negative values are negated TLI errors; map them to errnos. */
 226  227          if (error < 0) {
 227  228                  if (error == -TOUTSTATE)
 228  229                          error = EINVAL;
 229  230                  else
 230  231                          error = proto_tlitosyserr(-error);
 231  232          }
 232  233  
 233  234          return (error);
 234  235  }
 235  236  
           /*
            * Listen downcall: call tcp_do_listen() with `backlog' inside the
            * squeue and, on success, tell the socket layer (su_opctl with
            * SOCK_OPCTL_ENAB_ACCEPT) the sum of tcp_conn_req_max and the stack's
            * tcps_conn_req_max_q0.
            *
            * Returns ENOBUFS if the squeue cannot be entered.  A negative result
            * from tcp_do_listen() is translated: -TOUTSTATE becomes EINVAL, any
            * other negative value goes through proto_tlitosyserr().
            *
            * NOTE(review): tcp_bind() and tcp_connect() in this file return ENOSR
            * for the same squeue_synch_enter() failure; confirm that ENOBUFS is
            * intended here.
            */
 236  237  /* ARGSUSED */
 237  238  static int
 238  239  tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
 239  240  {
 240  241          conn_t  *connp = (conn_t *)proto_handle;
 241  242          tcp_t   *tcp = connp->conn_tcp;
 242  243          int     error;
 243  244  
 244  245          ASSERT(connp->conn_upper_handle != NULL);
 245  246  
 246  247          /* All Solaris components should pass a cred for this operation. */
 247  248          ASSERT(cr != NULL);
 248  249  
 249  250          error = squeue_synch_enter(connp, NULL);
 250  251          if (error != 0) {
 251  252                  /* failed to enter */
 252  253                  return (ENOBUFS);
 253  254          }
 254  255  
 255  256          error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
 256  257          if (error == 0) {
 257  258                  /*
 258  259                   * sockfs needs to know the maximum number of sockets
 259  260                   * that can be queued on the listener.
 260  261                   */
 261  262                  (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 262  263                      SOCK_OPCTL_ENAB_ACCEPT,
 263  264                      (uintptr_t)(tcp->tcp_conn_req_max +
 264  265                      tcp->tcp_tcps->tcps_conn_req_max_q0));
 265  266          } else if (error < 0) {
 266  267                  if (error == -TOUTSTATE)
 267  268                          error = EINVAL;
 268  269                  else
 269  270                          error = proto_tlitosyserr(-error);
 270  271          }
 271  272          squeue_synch_exit(connp);
 272  273          return (error);
 273  274  }
 274  275  
           /*
            * Connect downcall.  The address is checked with
            * proto_verify_ip_addr() before the squeue is entered; the connect
            * itself is done by tcp_do_connect() inside the squeue.
            *
            * On success the connection id (tcp_connid) is stored in `*id' and
            * EINPROGRESS is returned: a zero result is never returned to the
            * caller.
            *
            * Errors:
            *   - whatever proto_verify_ip_addr() returns, if non-zero;
            *   - ENOSR if the squeue cannot be entered;
            *   - for -TOUTSTATE, an errno chosen from the current TCP state:
            *     EALREADY (SYN_SENT), EISCONN (ESTABLISHED), EOPNOTSUPP (LISTEN),
            *     EINVAL (anything else);
            *   - any other negative value translated by proto_tlitosyserr();
            *   - positive values returned unchanged.
            */
 275  276  static int
 276  277  tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
 277  278      socklen_t len, sock_connid_t *id, cred_t *cr)
 278  279  {
 279  280          conn_t          *connp = (conn_t *)proto_handle;
 280  281          int             error;
 281  282  
 282  283          ASSERT(connp->conn_upper_handle != NULL);
 283  284  
 284  285          /* All Solaris components should pass a cred for this operation. */
 285  286          ASSERT(cr != NULL);
 286  287  
 287  288          error = proto_verify_ip_addr(connp->conn_family, sa, len);
 288  289          if (error != 0) {
 289  290                  return (error);
 290  291          }
 291  292  
 292  293          error = squeue_synch_enter(connp, NULL);
 293  294          if (error != 0) {
 294  295                  /* failed to enter */
 295  296                  return (ENOSR);
 296  297          }
 297  298  
 298  299          /*
 299  300           * TCP supports quick connect, so no need to do an implicit bind
 300  301           */
 301  302          error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
 302  303          if (error == 0) {
 303  304                  *id = connp->conn_tcp->tcp_connid;
 304  305          } else if (error < 0) {
 305  306                  if (error == -TOUTSTATE) {
 306  307                          switch (connp->conn_tcp->tcp_state) {
 307  308                          case TCPS_SYN_SENT:
 308  309                                  error = EALREADY;
 309  310                                  break;
 310  311                          case TCPS_ESTABLISHED:
 311  312                                  error = EISCONN;
 312  313                                  break;
 313  314                          case TCPS_LISTEN:
 314  315                                  error = EOPNOTSUPP;
 315  316                                  break;
 316  317                          default:
 317  318                                  error = EINVAL;
 318  319                                  break;
 319  320                          }
 320  321                  } else {
 321  322                          error = proto_tlitosyserr(-error);
 322  323                  }
 323  324          }
 324  325  
                   /*
                    * Tell the socket layer when the connection is loopback.  This
                    * check is made whether or not tcp_do_connect() succeeded.
                    */
 325  326          if (connp->conn_tcp->tcp_loopback) {
 326  327                  struct sock_proto_props sopp;
 327  328  
 328  329                  sopp.sopp_flags = SOCKOPT_LOOPBACK;
 329  330                  sopp.sopp_loopback = B_TRUE;
 330  331  
 331  332                  (*connp->conn_upcalls->su_set_proto_props)(
 332  333                      connp->conn_upper_handle, &sopp);
 333  334          }
           /* NOTE(review): no goto in this function targets the label below. */
 334  335  done:
 335  336          squeue_synch_exit(connp);
 336  337  
 337  338          return ((error == 0) ? EINPROGRESS : error);
 338  339  }
 339  340  
           /*
            * getpeername downcall.  Returns ENOTCONN while the TCP state is below
            * TCPS_SYN_RCVD; otherwise returns the result of conn_getpeername(),
            * which receives `addr' and `addrlenp' unchanged.  `cr' is only checked
            * by ASSERT.
            */
 340  341  /* ARGSUSED3 */
 341  342  static int
 342  343  tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 343  344      socklen_t *addrlenp, cred_t *cr)
 344  345  {
 345  346          conn_t  *connp = (conn_t *)proto_handle;
 346  347          tcp_t   *tcp = connp->conn_tcp;
 347  348  
 348  349          /* All Solaris components should pass a cred for this operation. */
 349  350          ASSERT(cr != NULL);
 350  351  
 351  352          ASSERT(tcp != NULL);
 352  353          if (tcp->tcp_state < TCPS_SYN_RCVD)
 353  354                  return (ENOTCONN);
 354  355  
 355  356          return (conn_getpeername(connp, addr, addrlenp));
 356  357  }
 357  358  
           /*
            * getsockname downcall: returns the result of conn_getsockname(), which
            * receives `addr' and `addrlenp' unchanged.  No TCP state check is made
            * here.  `cr' is only checked by ASSERT.
            */
 358  359  /* ARGSUSED3 */
 359  360  static int
 360  361  tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 361  362      socklen_t *addrlenp, cred_t *cr)
 362  363  {
 363  364          conn_t  *connp = (conn_t *)proto_handle;
 364  365  
 365  366          /* All Solaris components should pass a cred for this operation. */
 366  367          ASSERT(cr != NULL);
 367  368  
 368  369          return (conn_getsockname(connp, addr, addrlenp));
 369  370  }
 370  371  
           /*
            * getsockopt downcall.
            *
            * The request is first checked by proto_opt_check() against the TCP
            * option table; that call also supplies max_optbuf_len, the size of
            * the temporary buffer allocated here.  The option value is fetched
            * into that buffer by tcp_opt_get() inside the squeue, and then
            * MIN(len, *optlen) bytes are copied to `optvalp' and that byte count
            * is stored back through `optlen'.
            *
            * Errors: the proto_opt_check() result (negative values translated
            * with proto_tlitosyserr()), ENOMEM if squeue_synch_enter() returns
            * ENOMEM, EINVAL if tcp_opt_get() returns -1.  The temporary buffer is
            * freed on every path after it has been allocated.
            *
            * NOTE(review): only ENOMEM from squeue_synch_enter() is treated as a
            * failure here, whereas tcp_bind(), tcp_listen() and tcp_connect()
            * test for any non-zero value -- confirm ENOMEM is the only possible
            * error.
            */
 371  372  /* returns UNIX error, the optlen is a value-result arg */
 372  373  static int
 373  374  tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 374  375      void *optvalp, socklen_t *optlen, cred_t *cr)
 375  376  {
 376  377          conn_t          *connp = (conn_t *)proto_handle;
 377  378          int             error;
 378  379          t_uscalar_t     max_optbuf_len;
 379  380          void            *optvalp_buf;
 380  381          int             len;
 381  382  
 382  383          ASSERT(connp->conn_upper_handle != NULL);
 383  384  
 384  385          error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
 385  386              tcp_opt_obj.odb_opt_des_arr,
 386  387              tcp_opt_obj.odb_opt_arr_cnt,
 387  388              B_FALSE, B_TRUE, cr);
 388  389          if (error != 0) {
 389  390                  if (error < 0) {
 390  391                          error = proto_tlitosyserr(-error);
 391  392                  }
 392  393                  return (error);
 393  394          }
 394  395  
 395  396          optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
 396  397  
 397  398          error = squeue_synch_enter(connp, NULL);
 398  399          if (error == ENOMEM) {
 399  400                  kmem_free(optvalp_buf, max_optbuf_len);
 400  401                  return (ENOMEM);
 401  402          }
 402  403  
 403  404          len = tcp_opt_get(connp, level, option_name, optvalp_buf);
 404  405          squeue_synch_exit(connp);
 405  406  
 406  407          if (len == -1) {
 407  408                  kmem_free(optvalp_buf, max_optbuf_len);
 408  409                  return (EINVAL);
 409  410          }
 410  411  
 411  412          /*
 412  413           * update optlen and copy option value
 413  414           */
 414  415          t_uscalar_t size = MIN(len, *optlen);
 415  416  
 416  417          bcopy(optvalp_buf, optvalp, size);
                   /*
                    * NOTE(review): this copies sizeof (t_uscalar_t) bytes into a
                    * socklen_t; presumably both types have the same size --
                    * confirm.
                    */
 417  418          bcopy(&size, optlen, sizeof (size));
 418  419  
 419  420          kmem_free(optvalp_buf, max_optbuf_len);
 420  421          return (0);
 421  422  }
 422  423  
           /*
            * setsockopt downcall.
            *
            * IPPROTO_TCP/TCP_NODELAY is handled on a fast path that does not enter
            * the squeue: with tcp_non_sq_lock held, tcp_naglim is set to 1 when
            * the option value is non-zero and to tcp_mss otherwise.  That path
            * requires optlen == sizeof (int32_t) (EINVAL otherwise) and returns
            * before proto_opt_check() is called.
            *
            * Every other option is checked by proto_opt_check() and applied by
            * tcp_opt_set() with SETFN_OPTCOM_NEGOTIATE, both inside the squeue.
            *
            * Returns ENOMEM if squeue_synch_enter() returns ENOMEM, the
            * proto_opt_check() result (negative values translated with
            * proto_tlitosyserr()), or the tcp_opt_set() result, which is asserted
            * to be non-negative.
            */
 423  424  static int
 424  425  tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 425  426      const void *optvalp, socklen_t optlen, cred_t *cr)
 426  427  {
 427  428          conn_t          *connp = (conn_t *)proto_handle;
 428  429          int             error;
 429  430  
 430  431          ASSERT(connp->conn_upper_handle != NULL);
 431  432          /*
 432  433           * Entering the squeue synchronously can result in a context switch,
 433  434           * which can cause a rather severe performance degradation. So we try to
 434  435           * handle whatever options we can without entering the squeue.
 435  436           */
 436  437          if (level == IPPROTO_TCP) {
 437  438                  switch (option_name) {
 438  439                  case TCP_NODELAY:
 439  440                          if (optlen != sizeof (int32_t))
 440  441                                  return (EINVAL);
 441  442                          mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
 442  443                          connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
 443  444                              connp->conn_tcp->tcp_mss;
 444  445                          mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
 445  446                          return (0);
 446  447                  default:
 447  448                          break;
 448  449                  }
 449  450          }
 450  451  
 451  452          error = squeue_synch_enter(connp, NULL);
 452  453          if (error == ENOMEM) {
 453  454                  return (ENOMEM);
 454  455          }
 455  456  
 456  457          error = proto_opt_check(level, option_name, optlen, NULL,
 457  458              tcp_opt_obj.odb_opt_des_arr,
 458  459              tcp_opt_obj.odb_opt_arr_cnt,
 459  460              B_TRUE, B_FALSE, cr);
 460  461  
 461  462          if (error != 0) {
 462  463                  if (error < 0) {
 463  464                          error = proto_tlitosyserr(-error);
 464  465                  }
 465  466                  squeue_synch_exit(connp);
 466  467                  return (error);
 467  468          }
 468  469  
                   /*
                    * The same buffer and length are passed as both the input and
                    * the output arguments of tcp_opt_set(); the const qualifier of
                    * `optvalp' is cast away to do so.
                    */
 469  470          error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
 470  471              optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
 471  472              NULL, cr);
 472  473          squeue_synch_exit(connp);
 473  474  
 474  475          ASSERT(error >= 0);
 475  476  
 476  477          return (error);
 477  478  }
 478  479  
           /*
            * sendmsg downcall: hand the message `mp' to the connection's squeue
            * for transmission.  `mp' is consumed on every path: it is either
            * freed here or passed to the squeue.
            *
            * Returns:
            *   EOPNOTSUPP  if msg_controllen is non-zero;
            *   ENOTCONN    if the state is below TCPS_ESTABLISHED and either the
            *               state is TCPS_SYN_SENT or tcp_connid is not positive;
            *   EPIPE       if the state is below TCPS_ESTABLISHED with a positive
            *               tcp_connid, or the state is above TCPS_CLOSE_WAIT;
            *   0           once the data has been queued to the squeue.
            *
            * Only M_DATA messages are expected; any other type trips ASSERT(0),
            * after which the message is freed and 0 is returned.
            */
 479  480  /* ARGSUSED */
 480  481  static int
 481  482  tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
 482  483      cred_t *cr)
 483  484  {
 484  485          tcp_t           *tcp;
 485  486          uint32_t        msize;
 486  487          conn_t *connp = (conn_t *)proto_handle;
 487  488          int32_t         tcpstate;
 488  489  
 489  490          /* All Solaris components should pass a cred for this operation. */
 490  491          ASSERT(cr != NULL);
 491  492  
 492  493          ASSERT(connp->conn_ref >= 2);
 493  494          ASSERT(connp->conn_upper_handle != NULL);
 494  495  
 495  496          if (msg->msg_controllen != 0) {
 496  497                  freemsg(mp);
 497  498                  return (EOPNOTSUPP);
 498  499          }
 499  500  
 500  501          switch (DB_TYPE(mp)) {
 501  502          case M_DATA:
 502  503                  tcp = connp->conn_tcp;
 503  504                  ASSERT(tcp != NULL);
 504  505  
                           /* The state is sampled once and used for all checks. */
 505  506                  tcpstate = tcp->tcp_state;
 506  507                  if (tcpstate < TCPS_ESTABLISHED) {
 507  508                          freemsg(mp);
 508  509                          /*
 509  510                           * We return ENOTCONN if the endpoint is trying to
 510  511                           * connect or has never been connected, and EPIPE if it
 511  512                           * has been disconnected. The connection id helps us
 512  513                           * distinguish between the last two cases.
 513  514                           */
 514  515                          return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
 515  516                              ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
 516  517                  } else if (tcpstate > TCPS_CLOSE_WAIT) {
 517  518                          freemsg(mp);
 518  519                          return (EPIPE);
 519  520                  }
 520  521  
 521  522                  msize = msgdsize(mp);
 522  523  
                           /* tcp_squeue_bytes is updated under tcp_non_sq_lock. */
 523  524                  mutex_enter(&tcp->tcp_non_sq_lock);
 524  525                  tcp->tcp_squeue_bytes += msize;
 525  526                  /*
 526  527                   * Squeue Flow Control
 527  528                   */
 528  529                  if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
 529  530                          tcp_setqfull(tcp);
 530  531                  }
 531  532                  mutex_exit(&tcp->tcp_non_sq_lock);
 532  533  
 533  534                  /*
 534  535                   * The application may pass in an address in the msghdr, but
 535  536                   * we ignore the address on connection-oriented sockets.
 536  537                   * Just like BSD this code does not generate an error for
 537  538                   * TCP (a CONNREQUIRED socket) when sending to an address
 538  539                   * passed in with sendto/sendmsg. Instead the data is
 539  540                   * delivered on the connection as if no address had been
 540  541                   * supplied.
 541  542                   */
                           /* A conn reference is taken before queueing to the squeue. */
 542  543                  CONN_INC_REF(connp);
 543  544  
                           /* MSG_OOB data is queued to tcp_output_urgent. */
 544  545                  if (msg->msg_flags & MSG_OOB) {
 545  546                          SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
 546  547                              connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 547  548                  } else {
 548  549                          SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
 549  550                              connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 550  551                  }
 551  552  
 552  553                  return (0);
 553  554  
 554  555          default:
 555  556                  ASSERT(0);
 556  557          }
 557  558  
                   /* Reached only for a non-M_DATA message: drop it. */
 558  559          freemsg(mp);
 559  560          return (0);
 560  561  }
 561  562  
           /*
            * shutdown downcall.
            *
            * Returns ENOTCONN when the TCP state is below TCPS_SYN_SENT, otherwise
            * 0.  Unless `how' is SHUT_RD, tcp_shutdown_output is queued on the
            * connection's squeue (with a conn reference taken first) and the
            * socket layer is notified with SOCK_OPCTL_SHUT_SEND.  Unless `how' is
            * SHUT_WR, the socket layer is notified with SOCK_OPCTL_SHUT_RECV.  Any
            * `how' other than SHUT_RD or SHUT_WR therefore triggers both
            * actions.
            */
 562  563  /* ARGSUSED */
 563  564  static int
 564  565  tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
 565  566  {
 566  567          conn_t  *connp = (conn_t *)proto_handle;
 567  568          tcp_t   *tcp = connp->conn_tcp;
 568  569  
 569  570          ASSERT(connp->conn_upper_handle != NULL);
 570  571  
 571  572          /* All Solaris components should pass a cred for this operation. */
 572  573          ASSERT(cr != NULL);
 573  574  
 574  575          /*
 575  576           * X/Open requires that we check the connected state.
 576  577           */
 577  578          if (tcp->tcp_state < TCPS_SYN_SENT)
 578  579                  return (ENOTCONN);
 579  580  
 580  581          /* shutdown the send side */
 581  582          if (how != SHUT_RD) {
 582  583                  mblk_t *bp;
 583  584  
                           /*
                            * A zero-length mblk is allocated as the message that is
                            * queued to the squeue.
                            * NOTE(review): bp is used without a NULL check;
                            * presumably allocb_wait() with STR_NOSIG cannot return
                            * NULL -- confirm.
                            */
 584  585                  bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
 585  586                  CONN_INC_REF(connp);
 586  587                  SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
 587  588                      connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
 588  589  
 589  590                  (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 590  591                      SOCK_OPCTL_SHUT_SEND, 0);
 591  592          }
 592  593  
 593  594          /* shutdown the recv side */
 594  595          if (how != SHUT_WR)
 595  596                  (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 596  597                      SOCK_OPCTL_SHUT_RECV, 0);
 597  598  
 598  599          return (0);
 599  600  }
 600  601  
           /*
            * Clear-flow-control downcall.
            *
            * The reserved mblk tcp_rsrv_mp is taken (under tcp_rsrv_mp_lock) and
            * passed to squeue_synch_enter(); if it is already NULL the function
            * returns without doing anything.  Once inside the squeue the mblk is
            * put back, and then:
            *   - for a fused connection, tcp_fuse_backenable() is called;
            *   - otherwise tcp_rwnd is reset to conn_rcvbuf and, when the state
            *     is at least TCPS_ESTABLISHED and tcp_rwnd_reopen() returns
            *     TH_ACK_NEEDED, an ACK is sent with tcp_xmit_ctl().
            */
 601  602  static void
 602  603  tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
 603  604  {
 604  605          conn_t  *connp = (conn_t *)proto_handle;
 605  606          tcp_t   *tcp = connp->conn_tcp;
 606  607          mblk_t *mp;
 607  608          int error;
 608  609  
 609  610          ASSERT(connp->conn_upper_handle != NULL);
 610  611  
 611  612          /*
 612  613           * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
 613  614           * is currently running.
 614  615           */
 615  616          mutex_enter(&tcp->tcp_rsrv_mp_lock);
 616  617          if ((mp = tcp->tcp_rsrv_mp) == NULL) {
 617  618                  mutex_exit(&tcp->tcp_rsrv_mp_lock);
 618  619                  return;
 619  620          }
 620  621          tcp->tcp_rsrv_mp = NULL;
 621  622          mutex_exit(&tcp->tcp_rsrv_mp_lock);
 622  623  
                   /* Entry with the reserved mblk is asserted to succeed. */
 623  624          error = squeue_synch_enter(connp, mp);
 624  625          ASSERT(error == 0);
 625  626  
                   /* Put the reserved mblk back so that a later call can run. */
 626  627          mutex_enter(&tcp->tcp_rsrv_mp_lock);
 627  628          tcp->tcp_rsrv_mp = mp;
 628  629          mutex_exit(&tcp->tcp_rsrv_mp_lock);
 629  630  
 630  631          if (tcp->tcp_fused) {
 631  632                  tcp_fuse_backenable(tcp);
 632  633          } else {
 633  634                  tcp->tcp_rwnd = connp->conn_rcvbuf;
 634  635                  /*
 635  636                   * Send back a window update immediately if TCP is above
 636  637                   * ESTABLISHED state and the increase of the rcv window
 637  638                   * that the other side knows is at least 1 MSS after flow
 638  639                   * control is lifted.
 639  640                   */
 640  641                  if (tcp->tcp_state >= TCPS_ESTABLISHED &&
 641  642                      tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
                                   /*
                                    * The sequence number is tcp_suna when the send
                                    * window is zero, tcp_snxt otherwise.
                                    */
 642  643                          tcp_xmit_ctl(NULL, tcp,
 643  644                              (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
 644  645                              tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
 645  646                  }
 646  647          }
 647  648  
 648  649          squeue_synch_exit(connp);
 649  650  }
 650  651  
           /*
            * ioctl downcall.
            *
            * A helper stream is created with ip_create_helper_stream() when the
            * conn does not have one yet; a failure there is returned to the
            * caller.  ND_SET, ND_GET, _SIOCSOCKFALLBACK, TCP_IOC_ABORT_CONN,
            * TI_GETPEERNAME and TI_GETMYNAME are rejected with EINVAL.  Every
            * other command is forwarded with ldi_ioctl() on the helper stream's
            * handle, with an ioctl reference held around the call, unless the conn
            * has CONN_CLOSING set, in which case EINVAL is returned.
            *
            * Returns 0 or an errno; for forwarded commands this is the
            * ldi_ioctl() result, and `rvalp' is passed through to it.
            */
 651  652  /* ARGSUSED */
 652  653  static int
 653  654  tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
 654  655      int mode, int32_t *rvalp, cred_t *cr)
 655  656  {
 656  657          conn_t          *connp = (conn_t *)proto_handle;
 657  658          int             error;
 658  659  
 659  660          ASSERT(connp->conn_upper_handle != NULL);
 660  661  
 661  662          /* All Solaris components should pass a cred for this operation. */
 662  663          ASSERT(cr != NULL);
 663  664  
 664  665          /*
 665  666           * If we don't have a helper stream then create one.
 666  667           * ip_create_helper_stream takes care of locking the conn_t,
 667  668           * so this check for NULL is just a performance optimization.
 668  669           */
 669  670          if (connp->conn_helper_info == NULL) {
 670  671                  tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
 671  672  
 672  673                  /*
 673  674                   * Create a helper stream for non-STREAMS socket.
 674  675                   */
 675  676                  error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
 676  677                  if (error != 0) {
 677  678                          ip0dbg(("tcp_ioctl: create of IP helper stream "
 678  679                              "failed %d\n", error));
 679  680                          return (error);
 680  681                  }
 681  682          }
 682  683  
 683  684          switch (cmd) {
                           /* Commands that are refused on a non-STREAMS socket. */
 684  685                  case ND_SET:
 685  686                  case ND_GET:
 686  687                  case _SIOCSOCKFALLBACK:
 687  688                  case TCP_IOC_ABORT_CONN:
 688  689                  case TI_GETPEERNAME:
 689  690                  case TI_GETMYNAME:
 690  691                          ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
 691  692                              cmd));
 692  693                          error = EINVAL;
 693  694                          break;
 694  695                  default:
 695  696                          /*
 696  697                           * If the conn is not closing, pass on to IP using
 697  698                           * helper stream. Bump the ioctlref to prevent tcp_close
 698  699                           * from closing the rq/wq out from underneath the ioctl
 699  700                           * if it ends up queued or aborted/interrupted.
 700  701                           */
 701  702                          mutex_enter(&connp->conn_lock);
 702  703                          if (connp->conn_state_flags & (CONN_CLOSING)) {
 703  704                                  mutex_exit(&connp->conn_lock);
 704  705                                  error = EINVAL;
 705  706                                  break;
 706  707                          }
                                   /*
                                    * NOTE(review): conn_lock is held here and no
                                    * mutex_exit() is visible on this path;
                                    * presumably CONN_INC_IOCTLREF_LOCKED() releases
                                    * it -- confirm against the macro definition.
                                    */
 707  708                          CONN_INC_IOCTLREF_LOCKED(connp);
 708  709                          error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
 709  710                              cmd, arg, mode, cr, rvalp);
 710  711                          CONN_DEC_IOCTLREF(connp);
 711  712                          break;
 712  713          }
 713  714          return (error);
 714  715  }
 715  716  
 716  717  /* ARGSUSED */
 717  718  static int
 718  719  tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
 719  720  {
 720  721          conn_t *connp = (conn_t *)proto_handle;
 721  722  
 722  723          ASSERT(connp->conn_upper_handle != NULL);
 723  724  
 724  725          /* All Solaris components should pass a cred for this operation. */
 725  726          ASSERT(cr != NULL);
 726  727  
 727  728          tcp_close_common(connp, flags);
 728  729  
 729  730          ip_free_helper_stream(connp);
 730  731  
 731  732          /*
 732  733           * Drop IP's reference on the conn. This is the last reference
 733  734           * on the connp if the state was less than established. If the
 734  735           * connection has gone into timewait state, then we will have
 735  736           * one ref for the TCP and one more ref (total of two) for the
 736  737           * classifier connected hash list (a timewait connection stays
 737  738           * in connected hash till closed).
 738  739           *
 739  740           * We can't assert the references because there might be other
 740  741           * transient reference places because of some walkers or queued
 741  742           * packets in squeue for the timewait state.
 742  743           */
 743  744          CONN_DEC_REF(connp);
 744  745  
 745  746          /*
 746  747           * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
 747  748           * freeing the socket.
 748  749           */
 749  750          return (EINPROGRESS);
 750  751  }
 751  752  
 752  753  /* ARGSUSED */
 753  754  sock_lower_handle_t
 754  755  tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
 755  756      uint_t *smodep, int *errorp, int flags, cred_t *credp)
 756  757  {
 757  758          conn_t          *connp;
 758  759          boolean_t       isv6 = family == AF_INET6;
 759  760  
 760  761          if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
 761  762              (proto != 0 && proto != IPPROTO_TCP)) {
 762  763                  *errorp = EPROTONOSUPPORT;
 763  764                  return (NULL);
 764  765          }
 765  766  
 766  767          connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
 767  768          if (connp == NULL) {
 768  769                  return (NULL);
 769  770          }
 770  771  
 771  772          /*
 772  773           * Put the ref for TCP. Ref for IP was already put
 773  774           * by ipcl_conn_create. Also make the conn_t globally
 774  775           * visible to walkers
 775  776           */
 776  777          mutex_enter(&connp->conn_lock);
 777  778          CONN_INC_REF_LOCKED(connp);
 778  779          ASSERT(connp->conn_ref == 2);
 779  780          connp->conn_state_flags &= ~CONN_INCIPIENT;
 780  781  
 781  782          connp->conn_flags |= IPCL_NONSTR;
 782  783          mutex_exit(&connp->conn_lock);
 783  784  
 784  785          ASSERT(errorp != NULL);
 785  786          *errorp = 0;
 786  787          *sock_downcalls = &sock_tcp_downcalls;
 787  788          *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
 788  789              SM_SENDFILESUPP;
 789  790  
 790  791          return ((sock_lower_handle_t)connp);
 791  792  }
 792  793  
 793  794  /*
 794  795   * tcp_fallback
 795  796   *
 796  797   * A direct socket is falling back to using STREAMS. The queue
 797  798   * that is being passed down was created using tcp_open() with
 798  799   * the SO_FALLBACK flag set. As a result, the queue is not
 799  800   * associated with a conn, and the q_ptrs instead contain the
 800  801   * dev and minor area that should be used.
 801  802   *
 802  803   * The 'issocket' flag indicates whether the FireEngine
 803  804   * optimizations should be used. The common case would be that
 804  805   * optimizations are enabled, and they might be subsequently
 805  806   * disabled using the _SIOCSOCKFALLBACK ioctl.
 806  807   */
 807  808  
 808  809  /*
 809  810   * An active connection is falling back to TPI. Gather all the information
 810  811   * required by the STREAM head and TPI sonode and send it up.
 811  812   */
 812  813  static void
 813  814  tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
 814  815      boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
 815  816      sock_quiesce_arg_t *arg)
 816  817  {
 817  818          conn_t                  *connp = tcp->tcp_connp;
 818  819          struct stroptions       *stropt;
 819  820          struct T_capability_ack tca;
 820  821          struct sockaddr_in6     laddr, faddr;
 821  822          socklen_t               laddrlen, faddrlen;
 822  823          short                   opts;
 823  824          int                     error;
 824  825          mblk_t                  *mp, *mpnext;
 825  826  
 826  827          connp->conn_dev = (dev_t)RD(q)->q_ptr;
 827  828          connp->conn_minor_arena = WR(q)->q_ptr;
 828  829  
 829  830          RD(q)->q_ptr = WR(q)->q_ptr = connp;
 830  831  
 831  832          connp->conn_rq = RD(q);
 832  833          connp->conn_wq = WR(q);
 833  834  
 834  835          WR(q)->q_qinfo = &tcp_sock_winit;
 835  836  
 836  837          if (!issocket)
 837  838                  tcp_use_pure_tpi(tcp);
 838  839  
 839  840          /*
 840  841           * free the helper stream
 841  842           */
 842  843          ip_free_helper_stream(connp);
 843  844  
 844  845          /*
 845  846           * Notify the STREAM head about options
 846  847           */
 847  848          DB_TYPE(stropt_mp) = M_SETOPTS;
 848  849          stropt = (struct stroptions *)stropt_mp->b_rptr;
 849  850          stropt_mp->b_wptr += sizeof (struct stroptions);
 850  851          stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
 851  852  
 852  853          stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
 853  854              tcp->tcp_tcps->tcps_wroff_xtra);
 854  855          if (tcp->tcp_snd_sack_ok)
 855  856                  stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
 856  857          stropt->so_hiwat = connp->conn_rcvbuf;
 857  858          stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
 858  859  
 859  860          putnext(RD(q), stropt_mp);
 860  861  
 861  862          /*
 862  863           * Collect the information needed to sync with the sonode
 863  864           */
 864  865          tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
 865  866  
 866  867          laddrlen = faddrlen = sizeof (sin6_t);
 867  868          (void) tcp_getsockname((sock_lower_handle_t)connp,
 868  869              (struct sockaddr *)&laddr, &laddrlen, CRED());
 869  870          error = tcp_getpeername((sock_lower_handle_t)connp,
 870  871              (struct sockaddr *)&faddr, &faddrlen, CRED());
 871  872          if (error != 0)
 872  873                  faddrlen = 0;
 873  874  
 874  875          opts = 0;
 875  876          if (connp->conn_oobinline)
 876  877                  opts |= SO_OOBINLINE;
 877  878          if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
 878  879                  opts |= SO_DONTROUTE;
 879  880  
 880  881          /*
 881  882           * Notify the socket that the protocol is now quiescent,
 882  883           * and it's therefore safe to move data from the socket
 883  884           * to the stream head.
 884  885           */
 885  886          mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
 886  887              (struct sockaddr *)&laddr, laddrlen,
 887  888              (struct sockaddr *)&faddr, faddrlen, opts);
 888  889  
 889  890          while (mp != NULL) {
 890  891                  mpnext = mp->b_next;
 891  892                  tcp->tcp_rcv_list = mp->b_next;
 892  893                  mp->b_next = NULL;
 893  894                  putnext(q, mp);
 894  895                  mp = mpnext;
 895  896          }
 896  897          ASSERT(tcp->tcp_rcv_last_head == NULL);
 897  898          ASSERT(tcp->tcp_rcv_last_tail == NULL);
 898  899          ASSERT(tcp->tcp_rcv_cnt == 0);
 899  900  
 900  901          /*
 901  902           * All eagers in q0 are marked as being non-STREAM, so they will
 902  903           * make su_newconn upcalls when the handshake completes, which
 903  904           * will fail (resulting in the conn being closed). So we just blow
 904  905           * off everything in q0 instead of waiting for the inevitable.
 905  906           */
 906  907          if (tcp->tcp_conn_req_cnt_q0 != 0)
 907  908                  tcp_eager_cleanup(tcp, B_TRUE);
 908  909  }
 909  910  
 910  911  /*
 911  912   * An eager is falling back to TPI. All we have to do is send
 912  913   * up a T_CONN_IND.
 913  914   */
 914  915  static void
 915  916  tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
 916  917      so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
 917  918  {
 918  919          conn_t *connp = eager->tcp_connp;
 919  920          tcp_t *listener = eager->tcp_listener;
 920  921          mblk_t *mp;
 921  922  
 922  923          ASSERT(listener != NULL);
 923  924  
 924  925          /*
 925  926           * Notify the socket that the protocol is now quiescent,
 926  927           * and it's therefore safe to move data from the socket
 927  928           * to tcp's rcv queue.
 928  929           */
 929  930          mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
 930  931              NULL, 0, 0);
 931  932  
 932  933          if (mp != NULL) {
 933  934                  ASSERT(eager->tcp_rcv_cnt == 0);
 934  935  
 935  936                  eager->tcp_rcv_list = mp;
 936  937                  eager->tcp_rcv_cnt = msgdsize(mp);
 937  938                  while (mp->b_next != NULL) {
 938  939                          mp = mp->b_next;
 939  940                          eager->tcp_rcv_cnt += msgdsize(mp);
 940  941                  }
 941  942                  eager->tcp_rcv_last_head = mp;
 942  943                  while (mp->b_cont)
 943  944                          mp = mp->b_cont;
 944  945                  eager->tcp_rcv_last_tail = mp;
 945  946                  if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
 946  947                          eager->tcp_rwnd = 0;
 947  948                  else
 948  949                          eager->tcp_rwnd -= eager->tcp_rcv_cnt;
 949  950          }
 950  951  
 951  952          if (!issocket)
 952  953                  eager->tcp_issocket = B_FALSE;
 953  954          /*
 954  955           * The stream for this eager does not yet exist, so mark it as
 955  956           * being detached.
 956  957           */
 957  958          eager->tcp_detached = B_TRUE;
 958  959          eager->tcp_hard_binding = B_TRUE;
 959  960          connp->conn_rq = listener->tcp_connp->conn_rq;
 960  961          connp->conn_wq = listener->tcp_connp->conn_wq;
 961  962  
 962  963          /* Send up the connection indication */
 963  964          mp = eager->tcp_conn.tcp_eager_conn_ind;
 964  965          ASSERT(mp != NULL);
 965  966          eager->tcp_conn.tcp_eager_conn_ind = NULL;
 966  967  
 967  968          /*
 968  969           * TLI/XTI applications will get confused by
 969  970           * sending eager as an option since it violates
 970  971           * the option semantics. So remove the eager as
 971  972           * option since TLI/XTI app doesn't need it anyway.
 972  973           */
 973  974          if (!issocket) {
 974  975                  struct T_conn_ind *conn_ind;
 975  976  
 976  977                  conn_ind = (struct T_conn_ind *)mp->b_rptr;
 977  978                  conn_ind->OPT_length = 0;
 978  979                  conn_ind->OPT_offset = 0;
 979  980          }
 980  981  
 981  982          /*
 982  983           * Sockfs guarantees that the listener will not be closed
 983  984           * during fallback. So we can safely use the listener's queue.
 984  985           */
 985  986          putnext(listener->tcp_connp->conn_rq, mp);
 986  987  }
 987  988  
 988  989  
 989  990  int
 990  991  tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
 991  992      boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
 992  993      sock_quiesce_arg_t *arg)
 993  994  {
 994  995          tcp_t                   *tcp;
 995  996          conn_t                  *connp = (conn_t *)proto_handle;
 996  997          int                     error;
 997  998          mblk_t                  *stropt_mp;
 998  999          mblk_t                  *ordrel_mp;
 999 1000  
1000 1001          tcp = connp->conn_tcp;
1001 1002  
1002 1003          stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1003 1004              NULL);
1004 1005  
1005 1006          /* Pre-allocate the T_ordrel_ind mblk. */
1006 1007          ASSERT(tcp->tcp_ordrel_mp == NULL);
1007 1008          ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1008 1009              STR_NOSIG, NULL);
1009 1010          ordrel_mp->b_datap->db_type = M_PROTO;
1010 1011          ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1011 1012          ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1012 1013  
1013 1014          /*
  
    | 
      ↓ open down ↓ | 
    980 lines elided | 
    
      ↑ open up ↑ | 
  
1014 1015           * Enter the squeue so that no new packets can come in
1015 1016           */
1016 1017          error = squeue_synch_enter(connp, NULL);
1017 1018          if (error != 0) {
1018 1019                  /* failed to enter, free all the pre-allocated messages. */
1019 1020                  freeb(stropt_mp);
1020 1021                  freeb(ordrel_mp);
1021 1022                  return (ENOMEM);
1022 1023          }
1023 1024  
     1025 +        /*
     1026 +         * Do not allow fallback on connections making use of SO_REUSEPORT.
     1027 +         */
     1028 +        if (tcp->tcp_rg_bind != NULL) {
     1029 +                freeb(stropt_mp);
     1030 +                freeb(ordrel_mp);
     1031 +                squeue_synch_exit(connp);
     1032 +                return (EINVAL);
     1033 +        }
     1034 +
1024 1035          /*
1025 1036           * Both endpoints must be of the same type (either STREAMS or
1026 1037           * non-STREAMS) for fusion to be enabled. So if we are fused,
1027 1038           * we have to unfuse.
1028 1039           */
1029 1040          if (tcp->tcp_fused)
1030 1041                  tcp_unfuse(tcp);
1031 1042  
1032 1043          if (tcp->tcp_listener != NULL) {
1033 1044                  /* The eager will deal with opts when accept() is called */
1034 1045                  freeb(stropt_mp);
1035 1046                  tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1036 1047          } else {
1037 1048                  tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1038 1049                      quiesced_cb, arg);
1039 1050          }
1040 1051  
1041 1052          /*
1042 1053           * No longer a direct socket
1043 1054           *
1044 1055           * Note that we intentionally leave the upper_handle and upcalls
1045 1056           * intact, since eagers may still be using them.
1046 1057           */
1047 1058          connp->conn_flags &= ~IPCL_NONSTR;
1048 1059          tcp->tcp_ordrel_mp = ordrel_mp;
1049 1060  
1050 1061          /*
1051 1062           * There should be at least two refs (IP + TCP)
1052 1063           */
1053 1064          ASSERT(connp->conn_ref >= 2);
1054 1065          squeue_synch_exit(connp);
1055 1066  
1056 1067          return (0);
1057 1068  }
1058 1069  
1059 1070  /*
1060 1071   * Notifies a non-STREAMS based listener about a new connection. This
1061 1072   * function is executed on the *eager*'s squeue once the 3 way handshake
1062 1073   * has completed. Note that the behavior differs from STREAMS, where the
1063 1074   * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1064 1075   * squeue.
1065 1076   *
1066 1077   * Returns B_TRUE if the notification succeeded and an upper handle was
1067 1078   * obtained. `tcp' should be closed on failure.
1068 1079   */
1069 1080  boolean_t
1070 1081  tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1071 1082  {
1072 1083          tcp_t *listener = tcp->tcp_listener;
1073 1084          conn_t *lconnp = listener->tcp_connp;
1074 1085          conn_t *econnp = tcp->tcp_connp;
1075 1086          tcp_t *tail;
1076 1087          ipaddr_t *addr_cache;
1077 1088          sock_upper_handle_t upper;
1078 1089          struct sock_proto_props sopp;
1079 1090  
1080 1091          mutex_enter(&listener->tcp_eager_lock);
1081 1092          /*
1082 1093           * Take the eager out, if it is in the list of droppable eagers
1083 1094           * as we are here because the 3W handshake is over.
1084 1095           */
1085 1096          MAKE_UNDROPPABLE(tcp);
1086 1097          /*
1087 1098           * The eager already has an extra ref put in tcp_input_data
1088 1099           * so that it stays till accept comes back even though it
1089 1100           * might get into TCPS_CLOSED as a result of a TH_RST etc.
1090 1101           */
1091 1102          ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1092 1103          listener->tcp_conn_req_cnt_q0--;
1093 1104          listener->tcp_conn_req_cnt_q++;
1094 1105  
1095 1106          /* Move from SYN_RCVD to ESTABLISHED list  */
1096 1107          tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1097 1108          tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1098 1109          tcp->tcp_eager_prev_q0 = NULL;
1099 1110          tcp->tcp_eager_next_q0 = NULL;
1100 1111  
1101 1112          /*
1102 1113           * Insert at end of the queue because connections are accepted
1103 1114           * in chronological order. Leaving the older connections at front
1104 1115           * of the queue helps reduce search time.
1105 1116           */
1106 1117          tail = listener->tcp_eager_last_q;
1107 1118          if (tail != NULL)
1108 1119                  tail->tcp_eager_next_q = tcp;
1109 1120          else
1110 1121                  listener->tcp_eager_next_q = tcp;
1111 1122          listener->tcp_eager_last_q = tcp;
1112 1123          tcp->tcp_eager_next_q = NULL;
1113 1124  
1114 1125          /* we have timed out before */
1115 1126          if (tcp->tcp_syn_rcvd_timeout != 0) {
1116 1127                  tcp->tcp_syn_rcvd_timeout = 0;
1117 1128                  listener->tcp_syn_rcvd_timeout--;
1118 1129                  if (listener->tcp_syn_defense &&
1119 1130                      listener->tcp_syn_rcvd_timeout <=
1120 1131                      (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1121 1132                      10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1122 1133                      listener->tcp_last_rcv_lbolt)) {
1123 1134                          /*
1124 1135                           * Turn off the defense mode if we
1125 1136                           * believe the SYN attack is over.
1126 1137                           */
1127 1138                          listener->tcp_syn_defense = B_FALSE;
1128 1139                          if (listener->tcp_ip_addr_cache) {
1129 1140                                  kmem_free((void *)listener->tcp_ip_addr_cache,
1130 1141                                      IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1131 1142                                  listener->tcp_ip_addr_cache = NULL;
1132 1143                          }
1133 1144                  }
1134 1145          }
1135 1146          addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1136 1147          if (addr_cache != NULL) {
1137 1148                  /*
1138 1149                   * We have finished a 3-way handshake with this
1139 1150                   * remote host. This proves the IP addr is good.
1140 1151                   * Cache it!
1141 1152                   */
1142 1153                  addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1143 1154                      tcp->tcp_connp->conn_faddr_v4;
1144 1155          }
1145 1156          mutex_exit(&listener->tcp_eager_lock);
1146 1157  
1147 1158          /*
1148 1159           * Notify the ULP about the newconn. It is guaranteed that no
1149 1160           * tcp_accept() call will be made for the eager if the
1150 1161           * notification fails.
1151 1162           */
1152 1163          if ((upper = (*lconnp->conn_upcalls->su_newconn)
1153 1164              (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1154 1165              &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1155 1166              &econnp->conn_upcalls)) == NULL) {
1156 1167                  return (B_FALSE);
1157 1168          }
1158 1169          econnp->conn_upper_handle = upper;
1159 1170  
1160 1171          tcp->tcp_detached = B_FALSE;
1161 1172          tcp->tcp_hard_binding = B_FALSE;
1162 1173          tcp->tcp_tconnind_started = B_TRUE;
1163 1174  
1164 1175          if (econnp->conn_keepalive) {
1165 1176                  tcp->tcp_ka_last_intrvl = 0;
1166 1177                  tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1167 1178                      tcp->tcp_ka_interval);
1168 1179          }
1169 1180  
1170 1181          /* Update the necessary parameters */
1171 1182          tcp_get_proto_props(tcp, &sopp);
1172 1183  
1173 1184          (*econnp->conn_upcalls->su_set_proto_props)
1174 1185              (econnp->conn_upper_handle, &sopp);
1175 1186  
1176 1187          return (B_TRUE);
1177 1188  }
  
    | 
      ↓ open down ↓ | 
    144 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX