Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics DLPX-37544 connstat command to display per-connection TCP statistics

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/inet/tcp/tcp_timers.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_timers.c
↓ open down ↓ 15 lines elided ↑ open up ↑
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25   25   * Copyright 2011 Joyent, Inc.  All rights reserved.
  26      - * Copyright (c) 2014 by Delphix. All rights reserved.
       26 + * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  27   27   */
  28   28  
  29   29  #include <sys/types.h>
  30   30  #include <sys/strlog.h>
  31   31  #include <sys/strsun.h>
  32   32  #include <sys/squeue_impl.h>
  33   33  #include <sys/squeue.h>
  34   34  #include <sys/callo.h>
  35   35  #include <sys/strsubr.h>
  36   36  
↓ open down ↓ 550 lines elided ↑ open up ↑
 587  587                   * permanently back in the direction of
 588  588                   * ACK-every-other-packet as suggested in RFC 1122.
 589  589                   */
 590  590                  if (tcp->tcp_rack_abs_max > 2)
 591  591                          tcp->tcp_rack_abs_max--;
 592  592                  tcp->tcp_rack_cur_max = 2;
 593  593          }
 594  594          mp = tcp_ack_mp(tcp);
 595  595  
 596  596          if (mp != NULL) {
 597      -                BUMP_LOCAL(tcp->tcp_obsegs);
      597 +                TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
 598  598                  TCPS_BUMP_MIB(tcps, tcpOutAck);
 599  599                  TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
 600  600                  tcp_send_data(tcp, mp);
 601  601          }
 602  602  }
 603  603  
 604  604  /*
 605  605   * Notify IP that we are having trouble with this connection.  IP should
 606  606   * make note so it can potentially use a different IRE.
 607  607   */
↓ open down ↓ 136 lines elided ↑ open up ↑
 744  744                  if (second_threshold == 0) {
 745  745                          second_threshold = tcps->tcps_ip_abort_linterval;
 746  746                          dont_timeout = B_TRUE;
 747  747                  }
 748  748                  /* FALLTHRU */
 749  749          case TCPS_FIN_WAIT_1:
 750  750          case TCPS_CLOSING:
 751  751          case TCPS_LAST_ACK:
 752  752                  /* If we have data to rexmit */
 753  753                  if (tcp->tcp_suna != tcp->tcp_snxt) {
 754      -                        clock_t time_to_wait;
      754 +                        clock_t time_to_wait;
 755  755  
 756  756                          TCPS_BUMP_MIB(tcps, tcpTimRetrans);
 757  757                          if (!tcp->tcp_xmit_head)
 758  758                                  break;
 759      -                        time_to_wait = ddi_get_lbolt() -
 760      -                            (clock_t)tcp->tcp_xmit_head->b_prev;
 761      -                        time_to_wait = tcp->tcp_rto -
 762      -                            TICK_TO_MSEC(time_to_wait);
      759 +#ifdef KERNEL_32
      760 +                        time_to_wait = TICK_TO_MSEC(ddi_get_lbolt() -
      761 +                            (clock_t)tcp->tcp_xmit_head->b_prev);
      762 +#else
      763 +                        time_to_wait = NSEC2MSEC(gethrtime() -
      764 +                            (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev);
      765 +#endif
      766 +                        time_to_wait = tcp->tcp_rto - time_to_wait;
 763  767                          /*
 764  768                           * If the timer fires too early, 1 clock tick earlier,
 765  769                           * restart the timer.
 766  770                           */
 767  771                          if (time_to_wait > msec_per_tick) {
 768  772                                  TCP_STAT(tcps, tcp_timer_fire_early);
 769  773                                  TCP_TIMER_RESTART(tcp, time_to_wait);
 770  774                                  return;
 771  775                          }
 772  776                          /*
↓ open down ↓ 5 lines elided ↑ open up ↑
 778  782                           * first and second interval actions.  NOTE: the timer
 779  783                           * interval is allowed to continue its exponential
 780  784                           * backoff.
 781  785                           */
 782  786                          if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
 783  787                                  if (connp->conn_debug) {
 784  788                                          (void) strlog(TCP_MOD_ID, 0, 1,
 785  789                                              SL_TRACE, "tcp_timer: zero win");
 786  790                                  }
 787  791                          } else {
 788      -                                /*
 789      -                                 * After retransmission, we need to do
 790      -                                 * slow start.  Set the ssthresh to one
 791      -                                 * half of current effective window and
 792      -                                 * cwnd to one MSS.  Also reset
 793      -                                 * tcp_cwnd_cnt.
 794      -                                 *
 795      -                                 * Note that if tcp_ssthresh is reduced because
 796      -                                 * of ECN, do not reduce it again unless it is
 797      -                                 * already one window of data away (tcp_cwr
 798      -                                 * should then be cleared) or this is a
 799      -                                 * timeout for a retransmitted segment.
 800      -                                 */
 801      -                                uint32_t npkt;
 802      -
 803      -                                if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
 804      -                                        npkt = ((tcp->tcp_timer_backoff ?
 805      -                                            tcp->tcp_cwnd_ssthresh :
 806      -                                            tcp->tcp_snxt -
 807      -                                            tcp->tcp_suna) >> 1) / tcp->tcp_mss;
 808      -                                        tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
 809      -                                            tcp->tcp_mss;
 810      -                                }
 811      -                                tcp->tcp_cwnd = tcp->tcp_mss;
 812      -                                tcp->tcp_cwnd_cnt = 0;
 813      -                                if (tcp->tcp_ecn_ok) {
 814      -                                        tcp->tcp_cwr = B_TRUE;
 815      -                                        tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
 816      -                                        tcp->tcp_ecn_cwr_sent = B_FALSE;
 817      -                                }
      792 +                                cc_cong_signal(tcp, NULL, CC_RTO);
 818  793                          }
 819  794                          break;
 820  795                  }
 821  796                  /*
 822  797                   * We have something to send yet we cannot send.  The
 823  798                   * reason can be:
 824  799                   *
 825  800                   * 1. Zero send window: we need to do zero window probe.
 826  801                   * 2. Zero cwnd: because of ECN, we need to "clock out
 827  802                   * segments.
↓ open down ↓ 19 lines elided ↑ open up ↑
 847  822                                   * the network is really congested.
 848  823                                   */
 849  824                                  ASSERT(tcp->tcp_ecn_ok);
 850  825                                  tcp->tcp_cwnd = tcp->tcp_mss;
 851  826                          }
 852  827                          if (tcp->tcp_swnd == 0) {
 853  828                                  /* Extend window for zero window probe */
 854  829                                  tcp->tcp_swnd++;
 855  830                                  tcp->tcp_zero_win_probe = B_TRUE;
 856  831                                  TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
      832 +                                tcp->tcp_cs.tcp_out_zwnd_probes++;
 857  833                          } else {
 858  834                                  /*
 859  835                                   * Handle timeout from sender SWS avoidance.
 860  836                                   * Reset our knowledge of the max send window
 861  837                                   * since the receiver might have reduced its
 862  838                                   * receive buffer.  Avoid setting tcp_max_swnd
 863  839                                   * to one since that will essentially disable
 864  840                                   * the SWS checks.
 865  841                                   *
 866  842                                   * Note that since we don't have a SWS
↓ open down ↓ 138 lines elided ↑ open up ↑
1005  981  
1006  982                  /*
1007  983                   * We have been retransmitting for too long...  The RTT
1008  984                   * we calculated is probably incorrect.  Reinitialize it.
1009  985                   * Need to compensate for 0 tcp_rtt_sa.  Reset
1010  986                   * tcp_rtt_update so that we won't accidentally cache a
1011  987                   * bad value.  But only do this if this is not a zero
1012  988                   * window probe.
1013  989                   */
1014  990                  if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
1015      -                        tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
1016      -                            (tcp->tcp_rtt_sa >> 5);
      991 +                        tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
      992 +                            (tcp->tcp_rtt_sa >> 5);
1017  993                          tcp->tcp_rtt_sa = 0;
1018  994                          tcp_ip_notify(tcp);
1019  995                          tcp->tcp_rtt_update = 0;
1020  996                  }
1021  997          }
1022  998  
1023  999  timer_rexmit:
1024 1000          tcp->tcp_timer_backoff++;
1025      -        if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
1026      -            tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
1027      -            tcp->tcp_rto_min) {
     1001 +        if ((ms = tcp_calculate_rto(tcp, tcps)) < tcp->tcp_rto_min) {
1028 1002                  /*
1029 1003                   * This means the original RTO is tcp_rexmit_interval_min.
1030 1004                   * So we will use tcp_rexmit_interval_min as the RTO value
1031 1005                   * and do the backoff.
1032 1006                   */
1033 1007                  ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
1034 1008          } else {
1035 1009                  ms <<= tcp->tcp_timer_backoff;
1036 1010          }
1037 1011          if (ms > tcp->tcp_rto_max) {
↓ open down ↓ 14 lines elided ↑ open up ↑
1052 1026           * tcp_set_timer to 1 so that next time RTO is updated, we will
1053 1027           * restart the timer with a correct value.
1054 1028           */
1055 1029          tcp->tcp_set_timer = 1;
1056 1030          mss = tcp->tcp_snxt - tcp->tcp_suna;
1057 1031          if (mss > tcp->tcp_mss)
1058 1032                  mss = tcp->tcp_mss;
1059 1033          if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1060 1034                  mss = tcp->tcp_swnd;
1061 1035  
1062      -        if ((mp = tcp->tcp_xmit_head) != NULL)
     1036 +        if ((mp = tcp->tcp_xmit_head) != NULL) {
     1037 +#ifdef KERNEL_32
1063 1038                  mp->b_prev = (mblk_t *)ddi_get_lbolt();
     1039 +#else
     1040 +                mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
     1041 +#endif
     1042 +        }
1064 1043          mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1065 1044              B_TRUE);
1066 1045  
1067 1046          /*
1068 1047           * When slow start after retransmission begins, start with
1069 1048           * this seq no.  tcp_rexmit_max marks the end of special slow
1070 1049           * start phase.
1071 1050           */
1072 1051          tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1073 1052          if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
↓ open down ↓ 10 lines elided ↑ open up ↑
1084 1063           */
1085 1064          if (tcp->tcp_snd_sack_ok)
1086 1065                  TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
1087 1066          if (mp == NULL) {
1088 1067                  return;
1089 1068          }
1090 1069  
1091 1070          tcp->tcp_csuna = tcp->tcp_snxt;
1092 1071          TCPS_BUMP_MIB(tcps, tcpRetransSegs);
1093 1072          TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
     1073 +        tcp->tcp_cs.tcp_out_retrans_segs++;
     1074 +        tcp->tcp_cs.tcp_out_retrans_bytes += mss;
1094 1075          tcp_send_data(tcp, mp);
1095 1076  
1096 1077  }
1097 1078  
1098 1079  /*
1099 1080   * Handle lingering timeouts. This function is called when the SO_LINGER timeout
1100 1081   * expires.
1101 1082   */
1102 1083  void
1103 1084  tcp_close_linger_timeout(void *arg)
1104 1085  {
1105 1086          conn_t  *connp = (conn_t *)arg;
1106 1087          tcp_t   *tcp = connp->conn_tcp;
1107 1088  
1108 1089          tcp->tcp_client_errno = ETIMEDOUT;
1109 1090          tcp_stop_lingering(tcp);
1110 1091  }
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX