1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015 Joyent, Inc.
  25  */
  26 
  27 /* This file contains all TCP kernel socket related functions. */
  28 
  29 #include <sys/types.h>
  30 #include <sys/strlog.h>
  31 #include <sys/policy.h>
  32 #include <sys/sockio.h>
  33 #include <sys/strsubr.h>
  34 #include <sys/strsun.h>
  35 #include <sys/squeue_impl.h>
  36 #include <sys/squeue.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/timod.h>
  40 #include <sys/tpicommon.h>
  41 #include <sys/socketvar.h>
  42 
  43 #include <inet/common.h>
  44 #include <inet/proto_set.h>
  45 #include <inet/ip.h>
  46 #include <inet/tcp.h>
  47 #include <inet/tcp_impl.h>
  48 
/*
 * Forward declarations of the file-local socket downcall entry points.
 * They are declared ahead of their definitions so that the
 * sock_tcp_downcalls table that follows can reference them.
 */
static void     tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
                    sock_upcalls_t *, int, cred_t *);
static int      tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
                    sock_upper_handle_t, cred_t *);
static int      tcp_bind(sock_lower_handle_t, struct sockaddr *,
                    socklen_t, cred_t *);
static int      tcp_listen(sock_lower_handle_t, int, cred_t *);
static int      tcp_connect(sock_lower_handle_t, const struct sockaddr *,
                    socklen_t, sock_connid_t *, cred_t *);
static int      tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
                    socklen_t *, cred_t *);
static int      tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
                    socklen_t *, cred_t *);
static int      tcp_getsockopt(sock_lower_handle_t, int, int, void *,
                    socklen_t *, cred_t *);
static int      tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
                    socklen_t, cred_t *);
static int      tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
                    cred_t *);
static int      tcp_shutdown(sock_lower_handle_t, int, cred_t *);
static void     tcp_clr_flowctrl(sock_lower_handle_t);
static int      tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
                    cred_t *);
static int      tcp_close(sock_lower_handle_t, int, cred_t *);
  73 
/*
 * Downcall vector for non-STREAMS TCP sockets; tcp_create() hands a pointer
 * to this table back to its caller. The initializer is positional, so the
 * order of the entries must match the member order of sock_downcalls_t.
 */
sock_downcalls_t sock_tcp_downcalls = {
        tcp_activate,
        tcp_accept,
        tcp_bind,
        tcp_listen,
        tcp_connect,
        tcp_getpeername,
        tcp_getsockname,
        tcp_getsockopt,
        tcp_setsockopt,
        tcp_sendmsg,
        /*
         * The next three slots are left unimplemented by TCP.
         * NOTE(review): presumably the uio-based send/receive and poll
         * downcalls -- confirm against the sock_downcalls_t definition.
         */
        NULL,
        NULL,
        NULL,
        tcp_shutdown,
        tcp_clr_flowctrl,
        tcp_ioctl,
        tcp_close,
};
  93 
  94 /* ARGSUSED */
  95 static void
  96 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
  97     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
  98 {
  99         conn_t *connp = (conn_t *)proto_handle;
 100         struct sock_proto_props sopp;
 101         extern struct module_info tcp_rinfo;
 102 
 103         ASSERT(connp->conn_upper_handle == NULL);
 104 
 105         /* All Solaris components should pass a cred for this operation. */
 106         ASSERT(cr != NULL);
 107 
 108         sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
 109             SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
 110             SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
 111 
 112         sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
 113         sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
 114         sopp.sopp_maxpsz = INFPSZ;
 115         sopp.sopp_maxblk = INFPSZ;
 116         sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
 117         sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
 118         sopp.sopp_maxaddrlen = sizeof (sin6_t);
 119         sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
 120             tcp_rinfo.mi_minpsz;
 121 
 122         connp->conn_upcalls = sock_upcalls;
 123         connp->conn_upper_handle = sock_handle;
 124 
 125         ASSERT(connp->conn_rcvbuf != 0 &&
 126             connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
 127         (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
 128 }
 129 
 130 /*ARGSUSED*/
 131 static int
 132 tcp_accept(sock_lower_handle_t lproto_handle,
 133     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
 134     cred_t *cr)
 135 {
 136         conn_t *lconnp, *econnp;
 137         tcp_t *listener, *eager;
 138 
 139         /*
 140          * KSSL can move a socket from one listener to another, in which
 141          * case `lproto_handle' points to the new listener. To ensure that
 142          * the original listener is used the information is obtained from
 143          * the eager.
 144          */
 145         econnp = (conn_t *)eproto_handle;
 146         eager = econnp->conn_tcp;
 147         ASSERT(IPCL_IS_NONSTR(econnp));
 148         ASSERT(eager->tcp_listener != NULL);
 149         listener = eager->tcp_listener;
 150         lconnp = (conn_t *)listener->tcp_connp;
 151         ASSERT(listener->tcp_state == TCPS_LISTEN);
 152         ASSERT(lconnp->conn_upper_handle != NULL);
 153 
 154         /*
 155          * It is possible for the accept thread to race with the thread that
 156          * made the su_newconn upcall in tcp_newconn_notify. Both
 157          * tcp_newconn_notify and tcp_accept require that conn_upper_handle
 158          * and conn_upcalls be set before returning, so they both write to
 159          * them. However, we're guaranteed that the value written is the same
 160          * for both threads.
 161          */
 162         ASSERT(econnp->conn_upper_handle == NULL ||
 163             econnp->conn_upper_handle == sock_handle);
 164         ASSERT(econnp->conn_upcalls == NULL ||
 165             econnp->conn_upcalls == lconnp->conn_upcalls);
 166         econnp->conn_upper_handle = sock_handle;
 167         econnp->conn_upcalls = lconnp->conn_upcalls;
 168 
 169         ASSERT(econnp->conn_netstack ==
 170             listener->tcp_connp->conn_netstack);
 171         ASSERT(eager->tcp_tcps == listener->tcp_tcps);
 172 
 173         /*
 174          * We should have a minimum of 2 references on the conn at this
 175          * point. One for TCP and one for the newconn notification
 176          * (which is now taken over by IP). In the normal case we would
 177          * also have another reference (making a total of 3) for the conn
 178          * being in the classifier hash list. However the eager could have
 179          * received an RST subsequently and tcp_closei_local could have
 180          * removed the eager from the classifier hash list, hence we can't
 181          * assert that reference.
 182          */
 183         ASSERT(econnp->conn_ref >= 2);
 184 
 185         mutex_enter(&listener->tcp_eager_lock);
 186         /*
 187          * Non-STREAMS listeners never defer the notification of new
 188          * connections.
 189          */
 190         ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
 191         tcp_eager_unlink(eager);
 192         mutex_exit(&listener->tcp_eager_lock);
 193         CONN_DEC_REF(listener->tcp_connp);
 194 
 195         return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
 196 }
 197 
 198 static int
 199 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
 200     socklen_t len, cred_t *cr)
 201 {
 202         int             error;
 203         conn_t          *connp = (conn_t *)proto_handle;
 204 
 205         /* All Solaris components should pass a cred for this operation. */
 206         ASSERT(cr != NULL);
 207         ASSERT(connp->conn_upper_handle != NULL);
 208 
 209         error = squeue_synch_enter(connp, NULL);
 210         if (error != 0) {
 211                 /* failed to enter */
 212                 return (ENOSR);
 213         }
 214 
 215         /* binding to a NULL address really means unbind */
 216         if (sa == NULL) {
 217                 if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
 218                         error = tcp_do_unbind(connp);
 219                 else
 220                         error = EINVAL;
 221         } else {
 222                 error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
 223         }
 224 
 225         squeue_synch_exit(connp);
 226 
 227         if (error < 0) {
 228                 if (error == -TOUTSTATE)
 229                         error = EINVAL;
 230                 else
 231                         error = proto_tlitosyserr(-error);
 232         }
 233 
 234         return (error);
 235 }
 236 
 237 /* ARGSUSED */
 238 static int
 239 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
 240 {
 241         conn_t  *connp = (conn_t *)proto_handle;
 242         tcp_t   *tcp = connp->conn_tcp;
 243         int     error;
 244 
 245         ASSERT(connp->conn_upper_handle != NULL);
 246 
 247         /* All Solaris components should pass a cred for this operation. */
 248         ASSERT(cr != NULL);
 249 
 250         error = squeue_synch_enter(connp, NULL);
 251         if (error != 0) {
 252                 /* failed to enter */
 253                 return (ENOBUFS);
 254         }
 255 
 256         error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
 257         if (error == 0) {
 258                 /*
 259                  * sockfs needs to know what's the maximum number of socket
 260                  * that can be queued on the listener.
 261                  */
 262                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 263                     SOCK_OPCTL_ENAB_ACCEPT,
 264                     (uintptr_t)(tcp->tcp_conn_req_max +
 265                     tcp->tcp_tcps->tcps_conn_req_max_q0));
 266         } else if (error < 0) {
 267                 if (error == -TOUTSTATE)
 268                         error = EINVAL;
 269                 else
 270                         error = proto_tlitosyserr(-error);
 271         }
 272         squeue_synch_exit(connp);
 273         return (error);
 274 }
 275 
 276 static int
 277 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
 278     socklen_t len, sock_connid_t *id, cred_t *cr)
 279 {
 280         conn_t          *connp = (conn_t *)proto_handle;
 281         int             error;
 282 
 283         ASSERT(connp->conn_upper_handle != NULL);
 284 
 285         /* All Solaris components should pass a cred for this operation. */
 286         ASSERT(cr != NULL);
 287 
 288         error = proto_verify_ip_addr(connp->conn_family, sa, len);
 289         if (error != 0) {
 290                 return (error);
 291         }
 292 
 293         error = squeue_synch_enter(connp, NULL);
 294         if (error != 0) {
 295                 /* failed to enter */
 296                 return (ENOSR);
 297         }
 298 
 299         /*
 300          * TCP supports quick connect, so no need to do an implicit bind
 301          */
 302         error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
 303         if (error == 0) {
 304                 *id = connp->conn_tcp->tcp_connid;
 305         } else if (error < 0) {
 306                 if (error == -TOUTSTATE) {
 307                         switch (connp->conn_tcp->tcp_state) {
 308                         case TCPS_SYN_SENT:
 309                                 error = EALREADY;
 310                                 break;
 311                         case TCPS_ESTABLISHED:
 312                                 error = EISCONN;
 313                                 break;
 314                         case TCPS_LISTEN:
 315                                 error = EOPNOTSUPP;
 316                                 break;
 317                         default:
 318                                 error = EINVAL;
 319                                 break;
 320                         }
 321                 } else {
 322                         error = proto_tlitosyserr(-error);
 323                 }
 324         }
 325 
 326         if (connp->conn_tcp->tcp_loopback) {
 327                 struct sock_proto_props sopp;
 328 
 329                 sopp.sopp_flags = SOCKOPT_LOOPBACK;
 330                 sopp.sopp_loopback = B_TRUE;
 331 
 332                 (*connp->conn_upcalls->su_set_proto_props)(
 333                     connp->conn_upper_handle, &sopp);
 334         }
 335 done:
 336         squeue_synch_exit(connp);
 337 
 338         return ((error == 0) ? EINPROGRESS : error);
 339 }
 340 
 341 /* ARGSUSED3 */
 342 static int
 343 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 344     socklen_t *addrlenp, cred_t *cr)
 345 {
 346         conn_t  *connp = (conn_t *)proto_handle;
 347         tcp_t   *tcp = connp->conn_tcp;
 348 
 349         /* All Solaris components should pass a cred for this operation. */
 350         ASSERT(cr != NULL);
 351 
 352         ASSERT(tcp != NULL);
 353         if (tcp->tcp_state < TCPS_SYN_RCVD)
 354                 return (ENOTCONN);
 355 
 356         return (conn_getpeername(connp, addr, addrlenp));
 357 }
 358 
 359 /* ARGSUSED3 */
 360 static int
 361 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 362     socklen_t *addrlenp, cred_t *cr)
 363 {
 364         conn_t  *connp = (conn_t *)proto_handle;
 365 
 366         /* All Solaris components should pass a cred for this operation. */
 367         ASSERT(cr != NULL);
 368 
 369         return (conn_getsockname(connp, addr, addrlenp));
 370 }
 371 
 372 /* returns UNIX error, the optlen is a value-result arg */
 373 static int
 374 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 375     void *optvalp, socklen_t *optlen, cred_t *cr)
 376 {
 377         conn_t          *connp = (conn_t *)proto_handle;
 378         int             error;
 379         t_uscalar_t     max_optbuf_len;
 380         void            *optvalp_buf;
 381         int             len;
 382 
 383         ASSERT(connp->conn_upper_handle != NULL);
 384 
 385         error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
 386             tcp_opt_obj.odb_opt_des_arr,
 387             tcp_opt_obj.odb_opt_arr_cnt,
 388             B_FALSE, B_TRUE, cr);
 389         if (error != 0) {
 390                 if (error < 0) {
 391                         error = proto_tlitosyserr(-error);
 392                 }
 393                 return (error);
 394         }
 395 
 396         optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
 397 
 398         error = squeue_synch_enter(connp, NULL);
 399         if (error == ENOMEM) {
 400                 kmem_free(optvalp_buf, max_optbuf_len);
 401                 return (ENOMEM);
 402         }
 403 
 404         len = tcp_opt_get(connp, level, option_name, optvalp_buf);
 405         squeue_synch_exit(connp);
 406 
 407         if (len == -1) {
 408                 kmem_free(optvalp_buf, max_optbuf_len);
 409                 return (EINVAL);
 410         }
 411 
 412         /*
 413          * update optlen and copy option value
 414          */
 415         t_uscalar_t size = MIN(len, *optlen);
 416 
 417         bcopy(optvalp_buf, optvalp, size);
 418         bcopy(&size, optlen, sizeof (size));
 419 
 420         kmem_free(optvalp_buf, max_optbuf_len);
 421         return (0);
 422 }
 423 
 424 static int
 425 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 426     const void *optvalp, socklen_t optlen, cred_t *cr)
 427 {
 428         conn_t          *connp = (conn_t *)proto_handle;
 429         int             error;
 430 
 431         ASSERT(connp->conn_upper_handle != NULL);
 432         /*
 433          * Entering the squeue synchronously can result in a context switch,
 434          * which can cause a rather sever performance degradation. So we try to
 435          * handle whatever options we can without entering the squeue.
 436          */
 437         if (level == IPPROTO_TCP) {
 438                 switch (option_name) {
 439                 case TCP_NODELAY:
 440                         if (optlen != sizeof (int32_t))
 441                                 return (EINVAL);
 442                         mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
 443                         connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
 444                             connp->conn_tcp->tcp_mss;
 445                         mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
 446                         return (0);
 447                 default:
 448                         break;
 449                 }
 450         }
 451 
 452         error = squeue_synch_enter(connp, NULL);
 453         if (error == ENOMEM) {
 454                 return (ENOMEM);
 455         }
 456 
 457         error = proto_opt_check(level, option_name, optlen, NULL,
 458             tcp_opt_obj.odb_opt_des_arr,
 459             tcp_opt_obj.odb_opt_arr_cnt,
 460             B_TRUE, B_FALSE, cr);
 461 
 462         if (error != 0) {
 463                 if (error < 0) {
 464                         error = proto_tlitosyserr(-error);
 465                 }
 466                 squeue_synch_exit(connp);
 467                 return (error);
 468         }
 469 
 470         error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
 471             optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
 472             NULL, cr);
 473         squeue_synch_exit(connp);
 474 
 475         ASSERT(error >= 0);
 476 
 477         return (error);
 478 }
 479 
 480 /* ARGSUSED */
 481 static int
 482 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
 483     cred_t *cr)
 484 {
 485         tcp_t           *tcp;
 486         uint32_t        msize;
 487         conn_t *connp = (conn_t *)proto_handle;
 488         int32_t         tcpstate;
 489 
 490         /* All Solaris components should pass a cred for this operation. */
 491         ASSERT(cr != NULL);
 492 
 493         ASSERT(connp->conn_ref >= 2);
 494         ASSERT(connp->conn_upper_handle != NULL);
 495 
 496         if (msg->msg_controllen != 0) {
 497                 freemsg(mp);
 498                 return (EOPNOTSUPP);
 499         }
 500 
 501         switch (DB_TYPE(mp)) {
 502         case M_DATA:
 503                 tcp = connp->conn_tcp;
 504                 ASSERT(tcp != NULL);
 505 
 506                 tcpstate = tcp->tcp_state;
 507                 if (tcpstate < TCPS_ESTABLISHED) {
 508                         freemsg(mp);
 509                         /*
 510                          * We return ENOTCONN if the endpoint is trying to
 511                          * connect or has never been connected, and EPIPE if it
 512                          * has been disconnected. The connection id helps us
 513                          * distinguish between the last two cases.
 514                          */
 515                         return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
 516                             ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
 517                 } else if (tcpstate > TCPS_CLOSE_WAIT) {
 518                         freemsg(mp);
 519                         return (EPIPE);
 520                 }
 521 
 522                 msize = msgdsize(mp);
 523 
 524                 mutex_enter(&tcp->tcp_non_sq_lock);
 525                 tcp->tcp_squeue_bytes += msize;
 526                 /*
 527                  * Squeue Flow Control
 528                  */
 529                 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
 530                         tcp_setqfull(tcp);
 531                 }
 532                 mutex_exit(&tcp->tcp_non_sq_lock);
 533 
 534                 /*
 535                  * The application may pass in an address in the msghdr, but
 536                  * we ignore the address on connection-oriented sockets.
 537                  * Just like BSD this code does not generate an error for
 538                  * TCP (a CONNREQUIRED socket) when sending to an address
 539                  * passed in with sendto/sendmsg. Instead the data is
 540                  * delivered on the connection as if no address had been
 541                  * supplied.
 542                  */
 543                 CONN_INC_REF(connp);
 544 
 545                 if (msg->msg_flags & MSG_OOB) {
 546                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
 547                             connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 548                 } else {
 549                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
 550                             connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 551                 }
 552 
 553                 return (0);
 554 
 555         default:
 556                 ASSERT(0);
 557         }
 558 
 559         freemsg(mp);
 560         return (0);
 561 }
 562 
 563 /* ARGSUSED */
 564 static int
 565 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
 566 {
 567         conn_t  *connp = (conn_t *)proto_handle;
 568         tcp_t   *tcp = connp->conn_tcp;
 569 
 570         ASSERT(connp->conn_upper_handle != NULL);
 571 
 572         /* All Solaris components should pass a cred for this operation. */
 573         ASSERT(cr != NULL);
 574 
 575         /*
 576          * X/Open requires that we check the connected state.
 577          */
 578         if (tcp->tcp_state < TCPS_SYN_SENT)
 579                 return (ENOTCONN);
 580 
 581         /* shutdown the send side */
 582         if (how != SHUT_RD) {
 583                 mblk_t *bp;
 584 
 585                 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
 586                 CONN_INC_REF(connp);
 587                 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
 588                     connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
 589 
 590                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 591                     SOCK_OPCTL_SHUT_SEND, 0);
 592         }
 593 
 594         /* shutdown the recv side */
 595         if (how != SHUT_WR)
 596                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 597                     SOCK_OPCTL_SHUT_RECV, 0);
 598 
 599         return (0);
 600 }
 601 
 602 static void
 603 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
 604 {
 605         conn_t  *connp = (conn_t *)proto_handle;
 606         tcp_t   *tcp = connp->conn_tcp;
 607         mblk_t *mp;
 608         int error;
 609 
 610         ASSERT(connp->conn_upper_handle != NULL);
 611 
 612         /*
 613          * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
 614          * is currently running.
 615          */
 616         mutex_enter(&tcp->tcp_rsrv_mp_lock);
 617         if ((mp = tcp->tcp_rsrv_mp) == NULL) {
 618                 mutex_exit(&tcp->tcp_rsrv_mp_lock);
 619                 return;
 620         }
 621         tcp->tcp_rsrv_mp = NULL;
 622         mutex_exit(&tcp->tcp_rsrv_mp_lock);
 623 
 624         error = squeue_synch_enter(connp, mp);
 625         ASSERT(error == 0);
 626 
 627         mutex_enter(&tcp->tcp_rsrv_mp_lock);
 628         tcp->tcp_rsrv_mp = mp;
 629         mutex_exit(&tcp->tcp_rsrv_mp_lock);
 630 
 631         if (tcp->tcp_fused) {
 632                 tcp_fuse_backenable(tcp);
 633         } else {
 634                 tcp->tcp_rwnd = connp->conn_rcvbuf;
 635                 /*
 636                  * Send back a window update immediately if TCP is above
 637                  * ESTABLISHED state and the increase of the rcv window
 638                  * that the other side knows is at least 1 MSS after flow
 639                  * control is lifted.
 640                  */
 641                 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
 642                     tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
 643                         tcp_xmit_ctl(NULL, tcp,
 644                             (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
 645                             tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
 646                 }
 647         }
 648 
 649         squeue_synch_exit(connp);
 650 }
 651 
 652 /* ARGSUSED */
 653 static int
 654 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
 655     int mode, int32_t *rvalp, cred_t *cr)
 656 {
 657         conn_t          *connp = (conn_t *)proto_handle;
 658         int             error;
 659 
 660         ASSERT(connp->conn_upper_handle != NULL);
 661 
 662         /* All Solaris components should pass a cred for this operation. */
 663         ASSERT(cr != NULL);
 664 
 665         /*
 666          * If we don't have a helper stream then create one.
 667          * ip_create_helper_stream takes care of locking the conn_t,
 668          * so this check for NULL is just a performance optimization.
 669          */
 670         if (connp->conn_helper_info == NULL) {
 671                 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
 672 
 673                 /*
 674                  * Create a helper stream for non-STREAMS socket.
 675                  */
 676                 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
 677                 if (error != 0) {
 678                         ip0dbg(("tcp_ioctl: create of IP helper stream "
 679                             "failed %d\n", error));
 680                         return (error);
 681                 }
 682         }
 683 
 684         switch (cmd) {
 685                 case ND_SET:
 686                 case ND_GET:
 687                 case _SIOCSOCKFALLBACK:
 688                 case TCP_IOC_ABORT_CONN:
 689                 case TI_GETPEERNAME:
 690                 case TI_GETMYNAME:
 691                         ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
 692                             cmd));
 693                         error = EINVAL;
 694                         break;
 695                 default:
 696                         /*
 697                          * If the conn is not closing, pass on to IP using
 698                          * helper stream. Bump the ioctlref to prevent tcp_close
 699                          * from closing the rq/wq out from underneath the ioctl
 700                          * if it ends up queued or aborted/interrupted.
 701                          */
 702                         mutex_enter(&connp->conn_lock);
 703                         if (connp->conn_state_flags & (CONN_CLOSING)) {
 704                                 mutex_exit(&connp->conn_lock);
 705                                 error = EINVAL;
 706                                 break;
 707                         }
 708                         CONN_INC_IOCTLREF_LOCKED(connp);
 709                         error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
 710                             cmd, arg, mode, cr, rvalp);
 711                         CONN_DEC_IOCTLREF(connp);
 712                         break;
 713         }
 714         return (error);
 715 }
 716 
 717 /* ARGSUSED */
 718 static int
 719 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
 720 {
 721         conn_t *connp = (conn_t *)proto_handle;
 722 
 723         ASSERT(connp->conn_upper_handle != NULL);
 724 
 725         /* All Solaris components should pass a cred for this operation. */
 726         ASSERT(cr != NULL);
 727 
 728         tcp_close_common(connp, flags);
 729 
 730         ip_free_helper_stream(connp);
 731 
 732         /*
 733          * Drop IP's reference on the conn. This is the last reference
 734          * on the connp if the state was less than established. If the
 735          * connection has gone into timewait state, then we will have
 736          * one ref for the TCP and one more ref (total of two) for the
 737          * classifier connected hash list (a timewait connections stays
 738          * in connected hash till closed).
 739          *
 740          * We can't assert the references because there might be other
 741          * transient reference places because of some walkers or queued
 742          * packets in squeue for the timewait state.
 743          */
 744         CONN_DEC_REF(connp);
 745 
 746         /*
 747          * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
 748          * freeing the socket.
 749          */
 750         return (EINPROGRESS);
 751 }
 752 
 753 /* ARGSUSED */
 754 sock_lower_handle_t
 755 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
 756     uint_t *smodep, int *errorp, int flags, cred_t *credp)
 757 {
 758         conn_t          *connp;
 759         boolean_t       isv6 = family == AF_INET6;
 760 
 761         if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
 762             (proto != 0 && proto != IPPROTO_TCP)) {
 763                 *errorp = EPROTONOSUPPORT;
 764                 return (NULL);
 765         }
 766 
 767         connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
 768         if (connp == NULL) {
 769                 return (NULL);
 770         }
 771 
 772         /*
 773          * Put the ref for TCP. Ref for IP was already put
 774          * by ipcl_conn_create. Also make the conn_t globally
 775          * visible to walkers
 776          */
 777         mutex_enter(&connp->conn_lock);
 778         CONN_INC_REF_LOCKED(connp);
 779         ASSERT(connp->conn_ref == 2);
 780         connp->conn_state_flags &= ~CONN_INCIPIENT;
 781 
 782         connp->conn_flags |= IPCL_NONSTR;
 783         mutex_exit(&connp->conn_lock);
 784 
 785         ASSERT(errorp != NULL);
 786         *errorp = 0;
 787         *sock_downcalls = &sock_tcp_downcalls;
 788         *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
 789             SM_SENDFILESUPP;
 790 
 791         return ((sock_lower_handle_t)connp);
 792 }
 793 
 794 /*
 795  * tcp_fallback
 796  *
 797  * A direct socket is falling back to using STREAMS. The queue
 798  * that is being passed down was created using tcp_open() with
 799  * the SO_FALLBACK flag set. As a result, the queue is not
 800  * associated with a conn, and the q_ptrs instead contain the
 801  * dev and minor area that should be used.
 802  *
 803  * The 'issocket' flag indicates whether the FireEngine
 804  * optimizations should be used. The common case would be that
 805  * optimizations are enabled, and they might be subsequently
 806  * disabled using the _SIOCSOCKFALLBACK ioctl.
 807  */
 808 
 809 /*
 810  * An active connection is falling back to TPI. Gather all the information
 811  * required by the STREAM head and TPI sonode and send it up.
 812  */
 813 static void
 814 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
 815     boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
 816     sock_quiesce_arg_t *arg)
 817 {
 818         conn_t                  *connp = tcp->tcp_connp;
 819         struct stroptions       *stropt;
 820         struct T_capability_ack tca;
 821         struct sockaddr_in6     laddr, faddr;
 822         socklen_t               laddrlen, faddrlen;
 823         short                   opts;
 824         int                     error;
 825         mblk_t                  *mp, *mpnext;
 826 
 827         connp->conn_dev = (dev_t)RD(q)->q_ptr;
 828         connp->conn_minor_arena = WR(q)->q_ptr;
 829 
 830         RD(q)->q_ptr = WR(q)->q_ptr = connp;
 831 
 832         connp->conn_rq = RD(q);
 833         connp->conn_wq = WR(q);
 834 
 835         WR(q)->q_qinfo = &tcp_sock_winit;
 836 
 837         if (!issocket)
 838                 tcp_use_pure_tpi(tcp);
 839 
 840         /*
 841          * free the helper stream
 842          */
 843         ip_free_helper_stream(connp);
 844 
 845         /*
 846          * Notify the STREAM head about options
 847          */
 848         DB_TYPE(stropt_mp) = M_SETOPTS;
 849         stropt = (struct stroptions *)stropt_mp->b_rptr;
 850         stropt_mp->b_wptr += sizeof (struct stroptions);
 851         stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
 852 
 853         stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
 854             tcp->tcp_tcps->tcps_wroff_xtra);
 855         if (tcp->tcp_snd_sack_ok)
 856                 stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
 857         stropt->so_hiwat = connp->conn_rcvbuf;
 858         stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
 859 
 860         putnext(RD(q), stropt_mp);
 861 
 862         /*
 863          * Collect the information needed to sync with the sonode
 864          */
 865         tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
 866 
 867         laddrlen = faddrlen = sizeof (sin6_t);
 868         (void) tcp_getsockname((sock_lower_handle_t)connp,
 869             (struct sockaddr *)&laddr, &laddrlen, CRED());
 870         error = tcp_getpeername((sock_lower_handle_t)connp,
 871             (struct sockaddr *)&faddr, &faddrlen, CRED());
 872         if (error != 0)
 873                 faddrlen = 0;
 874 
 875         opts = 0;
 876         if (connp->conn_oobinline)
 877                 opts |= SO_OOBINLINE;
 878         if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
 879                 opts |= SO_DONTROUTE;
 880 
 881         /*
 882          * Notify the socket that the protocol is now quiescent,
 883          * and it's therefore safe move data from the socket
 884          * to the stream head.
 885          */
 886         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
 887             (struct sockaddr *)&laddr, laddrlen,
 888             (struct sockaddr *)&faddr, faddrlen, opts);
 889 
 890         while (mp != NULL) {
 891                 mpnext = mp->b_next;
 892                 tcp->tcp_rcv_list = mp->b_next;
 893                 mp->b_next = NULL;
 894                 putnext(q, mp);
 895                 mp = mpnext;
 896         }
 897         ASSERT(tcp->tcp_rcv_last_head == NULL);
 898         ASSERT(tcp->tcp_rcv_last_tail == NULL);
 899         ASSERT(tcp->tcp_rcv_cnt == 0);
 900 
 901         /*
 902          * All eagers in q0 are marked as being non-STREAM, so they will
 903          * make su_newconn upcalls when the handshake completes, which
 904          * will fail (resulting in the conn being closed). So we just blow
 905          * off everything in q0 instead of waiting for the inevitable.
 906          */
 907         if (tcp->tcp_conn_req_cnt_q0 != 0)
 908                 tcp_eager_cleanup(tcp, B_TRUE);
 909 }
 910 
 911 /*
 912  * An eager is falling back to TPI. All we have to do is send
 913  * up a T_CONN_IND.
 914  */
 915 static void
 916 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
 917     so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
 918 {
 919         conn_t *connp = eager->tcp_connp;
 920         tcp_t *listener = eager->tcp_listener;
 921         mblk_t *mp;
 922 
 923         ASSERT(listener != NULL);
 924 
 925         /*
 926          * Notify the socket that the protocol is now quiescent,
 927          * and it's therefore safe move data from the socket
 928          * to tcp's rcv queue.
 929          */
 930         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
 931             NULL, 0, 0);
 932 
 933         if (mp != NULL) {
 934                 ASSERT(eager->tcp_rcv_cnt == 0);
 935 
 936                 eager->tcp_rcv_list = mp;
 937                 eager->tcp_rcv_cnt = msgdsize(mp);
 938                 while (mp->b_next != NULL) {
 939                         mp = mp->b_next;
 940                         eager->tcp_rcv_cnt += msgdsize(mp);
 941                 }
 942                 eager->tcp_rcv_last_head = mp;
 943                 while (mp->b_cont)
 944                         mp = mp->b_cont;
 945                 eager->tcp_rcv_last_tail = mp;
 946                 if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
 947                         eager->tcp_rwnd = 0;
 948                 else
 949                         eager->tcp_rwnd -= eager->tcp_rcv_cnt;
 950         }
 951 
 952         if (!issocket)
 953                 eager->tcp_issocket = B_FALSE;
 954         /*
 955          * The stream for this eager does not yet exist, so mark it as
 956          * being detached.
 957          */
 958         eager->tcp_detached = B_TRUE;
 959         eager->tcp_hard_binding = B_TRUE;
 960         connp->conn_rq = listener->tcp_connp->conn_rq;
 961         connp->conn_wq = listener->tcp_connp->conn_wq;
 962 
 963         /* Send up the connection indication */
 964         mp = eager->tcp_conn.tcp_eager_conn_ind;
 965         ASSERT(mp != NULL);
 966         eager->tcp_conn.tcp_eager_conn_ind = NULL;
 967 
 968         /*
 969          * TLI/XTI applications will get confused by
 970          * sending eager as an option since it violates
 971          * the option semantics. So remove the eager as
 972          * option since TLI/XTI app doesn't need it anyway.
 973          */
 974         if (!issocket) {
 975                 struct T_conn_ind *conn_ind;
 976 
 977                 conn_ind = (struct T_conn_ind *)mp->b_rptr;
 978                 conn_ind->OPT_length = 0;
 979                 conn_ind->OPT_offset = 0;
 980         }
 981 
 982         /*
 983          * Sockfs guarantees that the listener will not be closed
 984          * during fallback. So we can safely use the listener's queue.
 985          */
 986         putnext(listener->tcp_connp->conn_rq, mp);
 987 }
 988 
 989 
 990 int
 991 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
 992     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
 993     sock_quiesce_arg_t *arg)
 994 {
 995         tcp_t                   *tcp;
 996         conn_t                  *connp = (conn_t *)proto_handle;
 997         int                     error;
 998         mblk_t                  *stropt_mp;
 999         mblk_t                  *ordrel_mp;
1000 
1001         tcp = connp->conn_tcp;
1002 
1003         stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1004             NULL);
1005 
1006         /* Pre-allocate the T_ordrel_ind mblk. */
1007         ASSERT(tcp->tcp_ordrel_mp == NULL);
1008         ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1009             STR_NOSIG, NULL);
1010         ordrel_mp->b_datap->db_type = M_PROTO;
1011         ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1012         ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1013 
1014         /*
1015          * Enter the squeue so that no new packets can come in
1016          */
1017         error = squeue_synch_enter(connp, NULL);
1018         if (error != 0) {
1019                 /* failed to enter, free all the pre-allocated messages. */
1020                 freeb(stropt_mp);
1021                 freeb(ordrel_mp);
1022                 return (ENOMEM);
1023         }
1024 
1025         /*
1026          * Do not allow fallback on connections making use of SO_REUSEPORT.
1027          */
1028         if (tcp->tcp_rg_bind != NULL) {
1029                 freeb(stropt_mp);
1030                 freeb(ordrel_mp);
1031                 squeue_synch_exit(connp);
1032                 return (EINVAL);
1033         }
1034 
1035         /*
1036          * Both endpoints must be of the same type (either STREAMS or
1037          * non-STREAMS) for fusion to be enabled. So if we are fused,
1038          * we have to unfuse.
1039          */
1040         if (tcp->tcp_fused)
1041                 tcp_unfuse(tcp);
1042 
1043         if (tcp->tcp_listener != NULL) {
1044                 /* The eager will deal with opts when accept() is called */
1045                 freeb(stropt_mp);
1046                 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1047         } else {
1048                 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1049                     quiesced_cb, arg);
1050         }
1051 
1052         /*
1053          * No longer a direct socket
1054          *
1055          * Note that we intentionally leave the upper_handle and upcalls
1056          * intact, since eagers may still be using them.
1057          */
1058         connp->conn_flags &= ~IPCL_NONSTR;
1059         tcp->tcp_ordrel_mp = ordrel_mp;
1060 
1061         /*
1062          * There should be atleast two ref's (IP + TCP)
1063          */
1064         ASSERT(connp->conn_ref >= 2);
1065         squeue_synch_exit(connp);
1066 
1067         return (0);
1068 }
1069 
1070 /*
1071  * Notifies a non-STREAMS based listener about a new connection. This
1072  * function is executed on the *eager*'s squeue once the 3 way handshake
1073  * has completed. Note that the behavior differs from STREAMS, where the
1074  * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1075  * squeue.
1076  *
1077  * Returns B_TRUE if the notification succeeded and an upper handle was
1078  * obtained. `tcp' should be closed on failure.
1079  */
1080 boolean_t
1081 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1082 {
1083         tcp_t *listener = tcp->tcp_listener;
1084         conn_t *lconnp = listener->tcp_connp;
1085         conn_t *econnp = tcp->tcp_connp;
1086         tcp_t *tail;
1087         ipaddr_t *addr_cache;
1088         sock_upper_handle_t upper;
1089         struct sock_proto_props sopp;
1090 
1091         mutex_enter(&listener->tcp_eager_lock);
1092         /*
1093          * Take the eager out, if it is in the list of droppable eagers
1094          * as we are here because the 3W handshake is over.
1095          */
1096         MAKE_UNDROPPABLE(tcp);
1097         /*
1098          * The eager already has an extra ref put in tcp_input_data
1099          * so that it stays till accept comes back even though it
1100          * might get into TCPS_CLOSED as a result of a TH_RST etc.
1101          */
1102         ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1103         listener->tcp_conn_req_cnt_q0--;
1104         listener->tcp_conn_req_cnt_q++;
1105 
1106         /* Move from SYN_RCVD to ESTABLISHED list  */
1107         tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1108         tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1109         tcp->tcp_eager_prev_q0 = NULL;
1110         tcp->tcp_eager_next_q0 = NULL;
1111 
1112         /*
1113          * Insert at end of the queue because connections are accepted
1114          * in chronological order. Leaving the older connections at front
1115          * of the queue helps reducing search time.
1116          */
1117         tail = listener->tcp_eager_last_q;
1118         if (tail != NULL)
1119                 tail->tcp_eager_next_q = tcp;
1120         else
1121                 listener->tcp_eager_next_q = tcp;
1122         listener->tcp_eager_last_q = tcp;
1123         tcp->tcp_eager_next_q = NULL;
1124 
1125         /* we have timed out before */
1126         if (tcp->tcp_syn_rcvd_timeout != 0) {
1127                 tcp->tcp_syn_rcvd_timeout = 0;
1128                 listener->tcp_syn_rcvd_timeout--;
1129                 if (listener->tcp_syn_defense &&
1130                     listener->tcp_syn_rcvd_timeout <=
1131                     (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1132                     10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1133                     listener->tcp_last_rcv_lbolt)) {
1134                         /*
1135                          * Turn off the defense mode if we
1136                          * believe the SYN attack is over.
1137                          */
1138                         listener->tcp_syn_defense = B_FALSE;
1139                         if (listener->tcp_ip_addr_cache) {
1140                                 kmem_free((void *)listener->tcp_ip_addr_cache,
1141                                     IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1142                                 listener->tcp_ip_addr_cache = NULL;
1143                         }
1144                 }
1145         }
1146         addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1147         if (addr_cache != NULL) {
1148                 /*
1149                  * We have finished a 3-way handshake with this
1150                  * remote host. This proves the IP addr is good.
1151                  * Cache it!
1152                  */
1153                 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1154                     tcp->tcp_connp->conn_faddr_v4;
1155         }
1156         mutex_exit(&listener->tcp_eager_lock);
1157 
1158         /*
1159          * Notify the ULP about the newconn. It is guaranteed that no
1160          * tcp_accept() call will be made for the eager if the
1161          * notification fails.
1162          */
1163         if ((upper = (*lconnp->conn_upcalls->su_newconn)
1164             (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1165             &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1166             &econnp->conn_upcalls)) == NULL) {
1167                 return (B_FALSE);
1168         }
1169         econnp->conn_upper_handle = upper;
1170 
1171         tcp->tcp_detached = B_FALSE;
1172         tcp->tcp_hard_binding = B_FALSE;
1173         tcp->tcp_tconnind_started = B_TRUE;
1174 
1175         if (econnp->conn_keepalive) {
1176                 tcp->tcp_ka_last_intrvl = 0;
1177                 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1178                     tcp->tcp_ka_interval);
1179         }
1180 
1181         /* Update the necessary parameters */
1182         tcp_get_proto_props(tcp, &sopp);
1183 
1184         (*econnp->conn_upcalls->su_set_proto_props)
1185             (econnp->conn_upper_handle, &sopp);
1186 
1187         return (B_TRUE);
1188 }