Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-45697 Adding Avg. RTT to connstat
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-37540 TCP per-connection kernel statistics DLPX-37544 connstat command to display per-connection TCP statistics

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/inet/tcp.h
          +++ new/usr/src/uts/common/inet/tcp.h
↓ open down ↓ 14 lines elided ↑ open up ↑
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2015 Joyent, Inc.
  24   24   * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25      - * Copyright (c) 2014 by Delphix. All rights reserved.
       25 + * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  26   26   */
  27   27  /* Copyright (c) 1990 Mentat Inc. */
  28   28  
  29   29  #ifndef _INET_TCP_H
  30   30  #define _INET_TCP_H
  31   31  
  32   32  #ifdef  __cplusplus
  33   33  extern "C" {
  34   34  #endif
  35   35  
↓ open down ↓ 3 lines elided ↑ open up ↑
  39   39  #include <sys/socket.h>
  40   40  #include <sys/socket_proto.h>
  41   41  #include <sys/md5.h>
  42   42  #include <inet/common.h>
  43   43  #include <inet/ip.h>
  44   44  #include <inet/ip6.h>
  45   45  #include <inet/mi.h>
  46   46  #include <inet/mib2.h>
  47   47  #include <inet/tcp_stack.h>
  48   48  #include <inet/tcp_sack.h>
       49 +#include <inet/cc.h>
  49   50  
  50   51  /* TCP states */
  51   52  #define TCPS_CLOSED             -6
  52   53  #define TCPS_IDLE               -5      /* idle (opened, but not bound) */
  53   54  #define TCPS_BOUND              -4      /* bound, ready to connect or accept */
  54   55  #define TCPS_LISTEN             -3      /* listening for connection */
  55   56  #define TCPS_SYN_SENT           -2      /* active, have sent syn */
  56   57  #define TCPS_SYN_RCVD           -1      /* have received syn (and sent ours) */
  57   58  /* states < TCPS_ESTABLISHED are those where connections not established */
  58   59  #define TCPS_ESTABLISHED        0       /* established */
↓ open down ↓ 87 lines elided ↑ open up ↑
 146  147  typedef struct tcp_s {
 147  148          struct tcp_s    *tcp_time_wait_next;
 148  149                                  /* Pointer to next T/W block */
 149  150          struct tcp_s    *tcp_time_wait_prev;
 150  151                                  /* Pointer to previous T/W block */
 151  152          int64_t         tcp_time_wait_expire;
 152  153  
 153  154          struct conn_s   *tcp_connp;     /* back pointer to conn_t */
 154  155          tcp_stack_t     *tcp_tcps;      /* back pointer to tcp_stack_t */
 155  156  
      157 +        struct cc_algo  *tcp_cc_algo;   /* congestion control algorithm */
      158 +        struct cc_var   tcp_ccv;        /* congestion control specific vars */
      159 +
 156  160          int32_t tcp_state;
 157  161          int32_t tcp_rcv_ws;             /* My window scale power */
 158  162          int32_t tcp_snd_ws;             /* Sender's window scale power */
 159  163          uint32_t tcp_ts_recent;         /* Timestamp of earliest unacked */
 160  164                                          /*  data segment */
 161  165          clock_t tcp_rto;                /* Round trip timeout */
 162  166          int64_t tcp_last_rcv_lbolt;
 163  167                                  /* lbolt on last packet, used for PAWS */
 164  168          uint32_t tcp_rto_initial;       /* Initial RTO */
 165  169          uint32_t tcp_rto_min;           /* Minimum RTO */
↓ open down ↓ 5 lines elided ↑ open up ↑
 171  175          uint32_t tcp_iss;               /* Initial send seq num */
 172  176          uint32_t tcp_rnxt;              /* Seq we expect to recv next */
 173  177          uint32_t tcp_rwnd;
 174  178  
 175  179          /* Fields arranged in approximate access order along main paths */
 176  180          mblk_t  *tcp_xmit_head;         /* Head of xmit/rexmit list */
 177  181          mblk_t  *tcp_xmit_last;         /* Last valid data seen by tcp_wput */
 178  182          mblk_t  *tcp_xmit_tail;         /* Last data sent */
 179  183          uint32_t tcp_unsent;            /* # of bytes in hand that are unsent */
 180  184          uint32_t tcp_xmit_tail_unsent;  /* # of unsent bytes in xmit_tail */
 181      -
 182  185          uint32_t tcp_suna;              /* Sender unacknowledged */
 183  186          uint32_t tcp_rexmit_nxt;        /* Next rexmit seq num */
 184  187          uint32_t tcp_rexmit_max;        /* Max retran seq num */
 185  188          uint32_t tcp_cwnd;              /* Congestion window */
 186  189          int32_t tcp_cwnd_cnt;           /* cwnd cnt in congestion avoidance */
 187      -
 188      -        uint32_t tcp_ibsegs;            /* Inbound segments on this stream */
 189      -        uint32_t tcp_obsegs;            /* Outbound segments on this stream */
 190      -
 191  190          uint32_t tcp_naglim;            /* Tunable nagle limit */
 192  191          uint32_t        tcp_valid_bits;
 193  192  #define TCP_ISS_VALID   0x1     /* Is the tcp_iss seq num active? */
 194  193  #define TCP_FSS_VALID   0x2     /* Is the tcp_fss seq num active? */
 195  194  #define TCP_URG_VALID   0x4     /* Is the tcp_urg seq num active? */
 196  195  #define TCP_OFO_FIN_VALID 0x8   /* Has TCP received an out of order FIN? */
 197  196  
 198      -
 199      -
 200  197          timeout_id_t    tcp_timer_tid;  /* Control block for timer service */
 201  198          uchar_t tcp_timer_backoff;      /* Backoff shift count. */
 202  199          int64_t tcp_last_recv_time;     /* Last time we received a segment. */
 203  200          uint32_t tcp_init_cwnd;         /* Initial cwnd (start/restart) */
 204  201  
 205  202          /* Following manipulated by TCP under squeue protection */
 206  203          uint32_t
 207  204                  tcp_urp_last_valid : 1, /* Is tcp_urp_last valid? */
 208  205                  tcp_hard_binding : 1,   /* TCP_DETACHED_NONEAGER */
 209  206                  tcp_fin_acked : 1,      /* Has our FIN been acked? */
↓ open down ↓ 66 lines elided ↑ open up ↑
 276  273  
 277  274          mblk_t  *tcp_rcv_list;          /* Queued until push, urgent data, */
 278  275          mblk_t  *tcp_rcv_last_head;     /* optdata, or the count exceeds */
 279  276          mblk_t  *tcp_rcv_last_tail;     /* tcp_rcv_push_wait. */
 280  277          uint32_t tcp_rcv_cnt;           /* tcp_rcv_list is b_next chain. */
 281  278  
 282  279          uint32_t tcp_cwnd_ssthresh;     /* Slow start threshold */
 283  280          uint32_t tcp_cwnd_max;
 284  281          uint32_t tcp_csuna;             /* Clear (no rexmits in window) suna */
 285  282  
 286      -        clock_t tcp_rtt_sa;             /* Round trip smoothed average */
 287      -        clock_t tcp_rtt_sd;             /* Round trip smoothed deviation */
 288      -        clock_t tcp_rtt_update;         /* Round trip update(s) */
      283 +        hrtime_t tcp_rtt_sum;           /* Round trip sum */
      284 +        uint32_t tcp_rtt_cnt;           /* Round trip count (non_dup ACKs) */
      285 +        hrtime_t tcp_rtt_sa;            /* Round trip smoothed average */
      286 +        hrtime_t tcp_rtt_sd;            /* Round trip smoothed deviation */
      287 +        uint32_t tcp_rtt_update;        /* Round trip update(s) */
 289  288          clock_t tcp_ms_we_have_waited;  /* Total retrans time */
 290  289  
 291  290          uint32_t tcp_swl1;              /* These help us avoid using stale */
 292  291          uint32_t tcp_swl2;              /*  packets to update state */
 293  292  
 294  293          uint32_t tcp_rack;              /* Seq # we have acked */
 295  294          uint32_t tcp_rack_cnt;          /* # of segs we have deferred ack */
 296  295          uint32_t tcp_rack_cur_max;      /* # of segs we may defer ack for now */
 297  296          uint32_t tcp_rack_abs_max;      /* # of segs we may defer ack ever */
 298  297          timeout_id_t    tcp_ack_tid;    /* Delayed ACK timer ID */
↓ open down ↓ 195 lines elided ↑ open up ↑
 494  493  
 495  494          /* For connection counting. */
 496  495          struct tcp_listen_cnt_s *tcp_listen_cnt;
 497  496  
 498  497          /* Segment reassembly timer. */
 499  498          timeout_id_t            tcp_reass_tid;
 500  499  
 501  500          /* FIN-WAIT-2 flush timeout */
 502  501          uint32_t                tcp_fin_wait_2_flush_interval;
 503  502  
      503 +        tcp_conn_stats_t        tcp_cs;
      504 +
 504  505  #ifdef DEBUG
 505  506          pc_t                    tcmp_stk[15];
 506  507  #endif
 507  508  } tcp_t;
 508  509  
 509  510  #ifdef DEBUG
 510  511  #define TCP_DEBUG_GETPCSTACK(buffer, depth)     ((void) getpcstack(buffer, \
 511  512                                                      depth))
 512  513  #else
 513  514  #define TCP_DEBUG_GETPCSTACK(buffer, depth)
 514  515  #endif
 515  516  
 516  517  extern void     tcp_conn_reclaim(void *);
 517      -extern void     tcp_free(tcp_t *tcp);
      518 +extern void     tcp_free(tcp_t *tcp);
 518  519  extern void     tcp_ddi_g_init(void);
 519  520  extern void     tcp_ddi_g_destroy(void);
 520      -extern void     *tcp_get_conn(void *arg, tcp_stack_t *);
      521 +extern conn_t   *tcp_get_conn(void *arg, tcp_stack_t *);
 521  522  extern mblk_t   *tcp_snmp_get(queue_t *, mblk_t *, boolean_t);
 522  523  extern int      tcp_snmp_set(queue_t *, int, int, uchar_t *, int len);
 523  524  
 524  525  /* Pad for the tf_t structure to avoid false cache line sharing. */
 525  526  #define TF_CACHEL_PAD   64
 526  527  
 527  528  /*
 528  529   * The TCP Fanout structure for bind and acceptor hashes.
 529  530   * The hash tables and their linkage (tcp_*_hash, tcp_ptp*hn) are
 530  531   * protected by the per-bucket tf_lock.  Each tcp_t
↓ open down ↓ 76 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX