Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation


   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2017 Joyent, Inc.
  24  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  25  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  26  */
  27 
  28 #ifndef _INET_TCP_IMPL_H
  29 #define _INET_TCP_IMPL_H
  30 
  31 /*
  32  * TCP implementation private declarations.  These interfaces are
  33  * used to build the IP module and are not meant to be accessed
  34  * by any modules except IP itself.  They are undocumented and are
  35  * subject to change without notice.
  36  */
  37 
  38 #ifdef  __cplusplus
  39 extern "C" {
  40 #endif
  41 
  42 #ifdef _KERNEL
  43 
  44 #include <sys/cpuvar.h>
  45 #include <sys/clock_impl.h>       /* For LBOLT_FASTPATH{,64} */


 283 }
 284 
 285 /*
 286  * Set ECN capable transport (ECT) code point in IP header.
 287  *
 288  * Note that there are 2 ECT code points '01' and '10', which are called
 289  * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 290  * point ECT(0) for TCP as described in RFC 2481.
 291  */
 292 #define TCP_SET_ECT(tcp, iph) \
 293         if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
 294                 /* We need to clear the code point first. */ \
 295                 ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
 296                 ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
 297         } else { \
 298                 ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
 299                 ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
 300         }
 301 
 302 /*
 303  * Set tcp_rto with boundary checking.
 304  */
 305 #define TCP_SET_RTO(tcp, rto) \
 306         if ((rto) < (tcp)->tcp_rto_min)                   \
 307                 (tcp)->tcp_rto = (tcp)->tcp_rto_min;      \
 308         else if ((rto) > (tcp)->tcp_rto_max)              \
 309                 (tcp)->tcp_rto = (tcp)->tcp_rto_max;      \
 310         else                                            \
 311                 (tcp)->tcp_rto = (rto);
 312 
 313 /*
 314  * TCP options struct returned from tcp_parse_options.
 315  */
 316 typedef struct tcp_opt_s {
 317         uint32_t        tcp_opt_mss;    /* presumably the MSS option */
 318         uint32_t        tcp_opt_wscale; /* presumably the window scale */
 319         uint32_t        tcp_opt_ts_val; /* presumably timestamp value */
 320         uint32_t        tcp_opt_ts_ecr; /* presumably timestamp echo reply */
 321         tcp_t           *tcp;           /* TODO confirm: related connection */
 322 } tcp_opt_t;
 323 
 324 /*
 325  * Flags returned from tcp_parse_options.  Each is a distinct single bit.
 326  */
 327 #define TCP_OPT_MSS_PRESENT     1
 328 #define TCP_OPT_WSCALE_PRESENT  2
 329 #define TCP_OPT_TSTAMP_PRESENT  4
 330 #define TCP_OPT_SACK_OK_PRESENT 8
 331 #define TCP_OPT_SACK_PRESENT    16
 332 
 333 /*


 572 #define tcps_mss_def_ipv6               tcps_propinfo_tbl[46].prop_cur_uval
 573 #define tcps_mss_max_ipv6               tcps_propinfo_tbl[47].prop_cur_uval
 574 #define tcps_rev_src_routes             tcps_propinfo_tbl[48].prop_cur_bval
 575 #define tcps_local_dack_interval        tcps_propinfo_tbl[49].prop_cur_uval
 576 #define tcps_local_dacks_max            tcps_propinfo_tbl[50].prop_cur_uval
 577 #define tcps_ecn_permitted              tcps_propinfo_tbl[51].prop_cur_uval
 578 #define tcps_rst_sent_rate_enabled      tcps_propinfo_tbl[52].prop_cur_bval
 579 #define tcps_rst_sent_rate              tcps_propinfo_tbl[53].prop_cur_uval
 580 #define tcps_push_timer_interval        tcps_propinfo_tbl[54].prop_cur_uval
 581 #define tcps_use_smss_as_mss_opt        tcps_propinfo_tbl[55].prop_cur_bval
 582 #define tcps_keepalive_abort_interval_high \
 583                                         tcps_propinfo_tbl[56].prop_max_uval
 584 #define tcps_keepalive_abort_interval \
 585                                         tcps_propinfo_tbl[56].prop_cur_uval
 586 #define tcps_keepalive_abort_interval_low \
 587                                         tcps_propinfo_tbl[56].prop_min_uval
 588 #define tcps_wroff_xtra                 tcps_propinfo_tbl[57].prop_cur_uval
 589 #define tcps_dev_flow_ctl               tcps_propinfo_tbl[58].prop_cur_bval
 590 #define tcps_reass_timeout              tcps_propinfo_tbl[59].prop_cur_uval
 591 #define tcps_iss_incr                   tcps_propinfo_tbl[65].prop_cur_uval


 592 
 593 extern struct qinit tcp_rinitv4, tcp_rinitv6;
 594 extern boolean_t do_tcp_fusion;
 595 
 596 /*
 597  * Object to represent database of options to search passed to
 598  * {sock,tpi}optcom_req() interface routine to take care of option
 599  * management and associated methods.
 600  */
 601 extern optdb_obj_t      tcp_opt_obj;
 602 extern uint_t           tcp_max_optsize;
 603 
 604 extern int tcp_squeue_flag;
 605 
 606 extern uint_t tcp_free_list_max_cnt;
 607 
 608 /*
 609  * Functions in tcp.c.
 610  */
 611 extern void     tcp_acceptor_hash_insert(t_uscalar_t, tcp_t *);
 612 extern tcp_t    *tcp_acceptor_hash_lookup(t_uscalar_t, tcp_stack_t *);
 613 extern void     tcp_acceptor_hash_remove(tcp_t *);
 614 extern mblk_t   *tcp_ack_mp(tcp_t *);
 615 extern int      tcp_build_hdrs(tcp_t *);

 616 extern void     tcp_cleanup(tcp_t *);
 617 extern int      tcp_clean_death(tcp_t *, int);
 618 extern void     tcp_clean_death_wrapper(void *, mblk_t *, void *,
 619                     ip_recv_attr_t *);
 620 extern void     tcp_close_common(conn_t *, int);
 621 extern void     tcp_close_detached(tcp_t *);
 622 extern void     tcp_close_mpp(mblk_t **);
 623 extern void     tcp_closei_local(tcp_t *);
 624 extern sock_lower_handle_t tcp_create(int, int, int, sock_downcalls_t **,
 625                     uint_t *, int *, int, cred_t *);
 626 extern conn_t   *tcp_create_common(cred_t *, boolean_t, boolean_t, int *);
 627 extern void     tcp_disconnect(tcp_t *, mblk_t *);
 628 extern char     *tcp_display(tcp_t *, char *, char);
 629 extern int      tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
 630                     boolean_t);
 631 extern int      tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
 632                     cred_t *, pid_t);
 633 extern int      tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int,
 634                     cred_t *, boolean_t);
 635 extern int      tcp_do_unbind(conn_t *);


 692 extern void     tcp_rexmit_after_error(tcp_t *);
 693 extern void     tcp_sack_rexmit(tcp_t *, uint_t *);
 694 extern void     tcp_send_data(tcp_t *, mblk_t *);
 695 extern void     tcp_send_synack(void *, mblk_t *, void *, ip_recv_attr_t *);
 696 extern void     tcp_shutdown_output(void *, mblk_t *, void *, ip_recv_attr_t *);
 697 extern void     tcp_ss_rexmit(tcp_t *);
 698 extern void     tcp_update_xmit_tail(tcp_t *, uint32_t);
 699 extern void     tcp_wput(queue_t *, mblk_t *);
 700 extern void     tcp_wput_data(tcp_t *, mblk_t *, boolean_t);
 701 extern void     tcp_wput_sock(queue_t *, mblk_t *);
 702 extern void     tcp_wput_fallback(queue_t *, mblk_t *);
 703 extern void     tcp_xmit_ctl(char *, tcp_t *, uint32_t, uint32_t, int);
 704 extern void     tcp_xmit_listeners_reset(mblk_t *, ip_recv_attr_t *,
 705                     ip_stack_t *i, conn_t *);
 706 extern mblk_t   *tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *,
 707                     mblk_t **, uint32_t, boolean_t, uint32_t *, boolean_t);
 708 
 709 /*
 710  * Input related functions in tcp_input.c.
 711  */

 712 extern void     tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 713 extern void     tcp_input_data(void *, mblk_t *, void *, ip_recv_attr_t *);
 714 extern void     tcp_input_listener_unbound(void *, mblk_t *, void *,
 715                     ip_recv_attr_t *);
 716 extern boolean_t        tcp_paws_check(tcp_t *, const tcp_opt_t *);
 717 extern int      tcp_parse_options(tcpha_t *, tcp_opt_t *);
 718 extern uint_t   tcp_rcv_drain(tcp_t *);
 719 extern void     tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t, cred_t *);
 720 extern boolean_t        tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
 721                             ip_recv_attr_t *);
 722 
 723 /*
 724  * Kernel socket related functions in tcp_socket.c.
 725  */
 726 extern int      tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
 727                     so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
 728 extern boolean_t tcp_newconn_notify(tcp_t *, ip_recv_attr_t *);
 729 
 730 /*
 731  * Timer related functions in tcp_timers.c.




   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2017 Joyent, Inc.
  24  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  25  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  26  */
  27 
  28 #ifndef _INET_TCP_IMPL_H
  29 #define _INET_TCP_IMPL_H
  30 
  31 /*
  32  * TCP implementation private declarations.  These interfaces are
  33  * used to build the IP module and are not meant to be accessed
  34  * by any modules except IP itself.  They are undocumented and are
  35  * subject to change without notice.
  36  */
  37 
  38 #ifdef  __cplusplus
  39 extern "C" {
  40 #endif
  41 
  42 #ifdef _KERNEL
  43 
  44 #include <sys/cpuvar.h>
  45 #include <sys/clock_impl.h>       /* For LBOLT_FASTPATH{,64} */


 283 }
 284 
 285 /*
 286  * Set ECN capable transport (ECT) code point in IP header.
 287  *
 288  * Note that there are 2 ECT code points '01' and '10', which are called
 289  * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 290  * point ECT(0) for TCP as described in RFC 2481.
 291  */
 292 #define TCP_SET_ECT(tcp, iph) \
 293         if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
 294                 /* We need to clear the code point first. */ \
 295                 ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
 296                 ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
 297         } else { \
 298                 ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
 299                 ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
 300         }
 301 
 302 /*











 303  * TCP options struct returned from tcp_parse_options.
 304  */
 305 typedef struct tcp_opt_s {
 306         uint32_t        tcp_opt_mss;    /* presumably the MSS option */
 307         uint32_t        tcp_opt_wscale; /* presumably the window scale */
 308         uint32_t        tcp_opt_ts_val; /* presumably timestamp value */
 309         uint32_t        tcp_opt_ts_ecr; /* presumably timestamp echo reply */
 310         tcp_t           *tcp;           /* TODO confirm: related connection */
 311 } tcp_opt_t;
 312 
 313 /*
 314  * Flags returned from tcp_parse_options.  Each is a distinct single bit.
 315  */
 316 #define TCP_OPT_MSS_PRESENT     1
 317 #define TCP_OPT_WSCALE_PRESENT  2
 318 #define TCP_OPT_TSTAMP_PRESENT  4
 319 #define TCP_OPT_SACK_OK_PRESENT 8
 320 #define TCP_OPT_SACK_PRESENT    16
 321 
 322 /*


 561 #define tcps_mss_def_ipv6               tcps_propinfo_tbl[46].prop_cur_uval
 562 #define tcps_mss_max_ipv6               tcps_propinfo_tbl[47].prop_cur_uval
 563 #define tcps_rev_src_routes             tcps_propinfo_tbl[48].prop_cur_bval
 564 #define tcps_local_dack_interval        tcps_propinfo_tbl[49].prop_cur_uval
 565 #define tcps_local_dacks_max            tcps_propinfo_tbl[50].prop_cur_uval
 566 #define tcps_ecn_permitted              tcps_propinfo_tbl[51].prop_cur_uval
 567 #define tcps_rst_sent_rate_enabled      tcps_propinfo_tbl[52].prop_cur_bval
 568 #define tcps_rst_sent_rate              tcps_propinfo_tbl[53].prop_cur_uval
 569 #define tcps_push_timer_interval        tcps_propinfo_tbl[54].prop_cur_uval
 570 #define tcps_use_smss_as_mss_opt        tcps_propinfo_tbl[55].prop_cur_bval
 571 #define tcps_keepalive_abort_interval_high \
 572                                         tcps_propinfo_tbl[56].prop_max_uval
 573 #define tcps_keepalive_abort_interval \
 574                                         tcps_propinfo_tbl[56].prop_cur_uval
 575 #define tcps_keepalive_abort_interval_low \
 576                                         tcps_propinfo_tbl[56].prop_min_uval
 577 #define tcps_wroff_xtra                 tcps_propinfo_tbl[57].prop_cur_uval
 578 #define tcps_dev_flow_ctl               tcps_propinfo_tbl[58].prop_cur_bval
 579 #define tcps_reass_timeout              tcps_propinfo_tbl[59].prop_cur_uval
 580 #define tcps_iss_incr                   tcps_propinfo_tbl[65].prop_cur_uval
 581 #define tcps_abc                        tcps_propinfo_tbl[67].prop_cur_bval
 582 #define tcps_abc_l_var                  tcps_propinfo_tbl[68].prop_cur_uval
 583 
 584 extern struct qinit tcp_rinitv4, tcp_rinitv6;
 585 extern boolean_t do_tcp_fusion;
 586 
 587 /*
 588  * Object to represent database of options to search passed to
 589  * {sock,tpi}optcom_req() interface routine to take care of option
 590  * management and associated methods.
 591  */
 592 extern optdb_obj_t      tcp_opt_obj;
 593 extern uint_t           tcp_max_optsize;
 594 
 595 extern int tcp_squeue_flag;
 596 
 597 extern uint_t tcp_free_list_max_cnt;
 598 
 599 /*
 600  * Functions in tcp.c.
 601  */
 602 extern void     tcp_acceptor_hash_insert(t_uscalar_t, tcp_t *);
 603 extern tcp_t    *tcp_acceptor_hash_lookup(t_uscalar_t, tcp_stack_t *);
 604 extern void     tcp_acceptor_hash_remove(tcp_t *);
 605 extern mblk_t   *tcp_ack_mp(tcp_t *);
 606 extern int      tcp_build_hdrs(tcp_t *);
 607 extern clock_t  tcp_calculate_rto(tcp_t *, tcp_stack_t *);
 608 extern void     tcp_cleanup(tcp_t *);
 609 extern int      tcp_clean_death(tcp_t *, int);
 610 extern void     tcp_clean_death_wrapper(void *, mblk_t *, void *,
 611                     ip_recv_attr_t *);
 612 extern void     tcp_close_common(conn_t *, int);
 613 extern void     tcp_close_detached(tcp_t *);
 614 extern void     tcp_close_mpp(mblk_t **);
 615 extern void     tcp_closei_local(tcp_t *);
 616 extern sock_lower_handle_t tcp_create(int, int, int, sock_downcalls_t **,
 617                     uint_t *, int *, int, cred_t *);
 618 extern conn_t   *tcp_create_common(cred_t *, boolean_t, boolean_t, int *);
 619 extern void     tcp_disconnect(tcp_t *, mblk_t *);
 620 extern char     *tcp_display(tcp_t *, char *, char);
 621 extern int      tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
 622                     boolean_t);
 623 extern int      tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
 624                     cred_t *, pid_t);
 625 extern int      tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int,
 626                     cred_t *, boolean_t);
 627 extern int      tcp_do_unbind(conn_t *);


 684 extern void     tcp_rexmit_after_error(tcp_t *);
 685 extern void     tcp_sack_rexmit(tcp_t *, uint_t *);
 686 extern void     tcp_send_data(tcp_t *, mblk_t *);
 687 extern void     tcp_send_synack(void *, mblk_t *, void *, ip_recv_attr_t *);
 688 extern void     tcp_shutdown_output(void *, mblk_t *, void *, ip_recv_attr_t *);
 689 extern void     tcp_ss_rexmit(tcp_t *);
 690 extern void     tcp_update_xmit_tail(tcp_t *, uint32_t);
 691 extern void     tcp_wput(queue_t *, mblk_t *);
 692 extern void     tcp_wput_data(tcp_t *, mblk_t *, boolean_t);
 693 extern void     tcp_wput_sock(queue_t *, mblk_t *);
 694 extern void     tcp_wput_fallback(queue_t *, mblk_t *);
 695 extern void     tcp_xmit_ctl(char *, tcp_t *, uint32_t, uint32_t, int);
 696 extern void     tcp_xmit_listeners_reset(mblk_t *, ip_recv_attr_t *,
 697                     ip_stack_t *i, conn_t *);
 698 extern mblk_t   *tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *,
 699                     mblk_t **, uint32_t, boolean_t, uint32_t *, boolean_t);
 700 
 701 /*
 702  * Input related functions in tcp_input.c.
 703  */
 704 extern void     cc_cong_signal(tcp_t *, uint32_t, uint32_t);
 705 extern void     tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 706 extern void     tcp_input_data(void *, mblk_t *, void *, ip_recv_attr_t *);
 707 extern void     tcp_input_listener_unbound(void *, mblk_t *, void *,
 708                     ip_recv_attr_t *);
 709 extern boolean_t        tcp_paws_check(tcp_t *, const tcp_opt_t *);
 710 extern int      tcp_parse_options(tcpha_t *, tcp_opt_t *);
 711 extern uint_t   tcp_rcv_drain(tcp_t *);
 712 extern void     tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t, cred_t *);
 713 extern boolean_t        tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
 714                             ip_recv_attr_t *);
 715 
 716 /*
 717  * Kernel socket related functions in tcp_socket.c.
 718  */
 719 extern int      tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
 720                     so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
 721 extern boolean_t tcp_newconn_notify(tcp_t *, ip_recv_attr_t *);
 722 
 723 /*
 724  * Timer related functions in tcp_timers.c.