Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-45697 Adding Avg. RTT to connstat
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics DLPX-37544 connstat command to display per-connection TCP statistics

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/inet/tcp/tcp_input.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_input.c
↓ open down ↓ 15 lines elided ↑ open up ↑
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  25   25   * Copyright 2017 Joyent, Inc.
  26      - * Copyright (c) 2014 by Delphix. All rights reserved.
       26 + * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  27   27   */
  28   28  
  29   29  /* This file contains all TCP input processing functions. */
  30   30  
  31   31  #include <sys/types.h>
  32   32  #include <sys/stream.h>
  33   33  #include <sys/strsun.h>
  34   34  #include <sys/strsubr.h>
  35   35  #include <sys/stropts.h>
  36   36  #include <sys/strlog.h>
↓ open down ↓ 122 lines elided ↑ open up ↑
 159  159                      ip_recv_attr_t *);
 160  160  static boolean_t        tcp_drop_q0(tcp_t *);
 161  161  static void     tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
 162  162  static mblk_t   *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
 163  163                      ip_recv_attr_t *);
 164  164  static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 165  165  static void     tcp_process_options(tcp_t *, tcpha_t *);
 166  166  static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 167  167  static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 168  168  static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 169      -static void     tcp_set_rto(tcp_t *, time_t);
      169 +static void     tcp_set_rto(tcp_t *, hrtime_t);
 170  170  static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 171  171  
 172  172  /*
      173 + * CC wrapper hook functions
      174 + */
      175 +static void
      176 +cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
      177 +    uint16_t type)
      178 +{
      179 +        uint32_t old_cwnd = tcp->tcp_cwnd;
      180 +
      181 +        tcp->tcp_ccv.bytes_this_ack = bytes_acked;
      182 +        if (tcp->tcp_cwnd <= tcp->tcp_swnd)
      183 +                tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
      184 +        else
      185 +                tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
      186 +
      187 +        if (type == CC_ACK) {
      188 +                if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
      189 +                        if (tcp->tcp_ccv.flags & CCF_RTO)
      190 +                                tcp->tcp_ccv.flags &= ~CCF_RTO;
      191 +
      192 +                        tcp->tcp_ccv.t_bytes_acked +=
      193 +                            min(tcp->tcp_ccv.bytes_this_ack,
      194 +                            tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
      195 +                        if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
      196 +                                tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
      197 +                                tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
      198 +                        }
      199 +                } else {
      200 +                        tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
      201 +                        tcp->tcp_ccv.t_bytes_acked = 0;
      202 +                }
      203 +        }
      204 +
      205 +        if (CC_ALGO(tcp)->ack_received != NULL) {
      206 +                /*
      207 +                 * The FreeBSD code where this originated had a comment "Find
      208 +                 * a way to live without this" in several places where curack
       209 +                 * got set.  If they eventually dump curack from the cc
      210 +                 * variables, we'll need to adapt our code.
      211 +                 */
      212 +                tcp->tcp_ccv.curack = seg_ack;
      213 +                CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
      214 +        }
      215 +
      216 +        DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
      217 +            uint32_t, tcp->tcp_cwnd);
      218 +}
      219 +
      220 +void
      221 +cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
      222 +{
      223 +        uint32_t old_cwnd = tcp->tcp_cwnd;
      224 +        uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
      225 +        switch (type) {
      226 +        case CC_NDUPACK:
      227 +                if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
      228 +                        tcp->tcp_rexmit_max = tcp->tcp_snxt;
      229 +                        if (tcp->tcp_ecn_ok) {
      230 +                                tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
      231 +                                tcp->tcp_cwr = B_TRUE;
      232 +                                tcp->tcp_ecn_cwr_sent = B_FALSE;
      233 +                        }
      234 +                }
      235 +                break;
      236 +        case CC_ECN:
      237 +                if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
      238 +                        tcp->tcp_rexmit_max = tcp->tcp_snxt;
      239 +                        if (tcp->tcp_ecn_ok) {
      240 +                                tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
      241 +                                tcp->tcp_cwr = B_TRUE;
      242 +                                tcp->tcp_ecn_cwr_sent = B_FALSE;
      243 +                        }
      244 +                }
      245 +                break;
      246 +        case CC_RTO:
      247 +                tcp->tcp_ccv.flags |= CCF_RTO;
      248 +                tcp->tcp_dupack_cnt = 0;
      249 +                tcp->tcp_ccv.t_bytes_acked = 0;
      250 +                /*
      251 +                 * Give up on fast recovery and congestion recovery if we were
      252 +                 * attempting either.
      253 +                 */
      254 +                EXIT_RECOVERY(tcp->tcp_ccv.flags);
      255 +                if (CC_ALGO(tcp)->cong_signal == NULL) {
      256 +                        /*
      257 +                         * RFC5681 Section 3.1
      258 +                         * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
      259 +                         */
      260 +                        tcp->tcp_cwnd_ssthresh = max(
      261 +                            (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
      262 +                            2) * tcp->tcp_mss;
      263 +                        tcp->tcp_cwnd = tcp->tcp_mss;
      264 +                }
      265 +
      266 +                if (tcp->tcp_ecn_ok) {
      267 +                        tcp->tcp_cwr = B_TRUE;
      268 +                        tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
      269 +                        tcp->tcp_ecn_cwr_sent = B_FALSE;
      270 +                }
      271 +                break;
      272 +        }
      273 +
      274 +        if (CC_ALGO(tcp)->cong_signal != NULL) {
      275 +                tcp->tcp_ccv.curack = seg_ack;
      276 +                CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
      277 +        }
      278 +
      279 +        DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
      280 +            uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
      281 +            uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
      282 +}
      283 +
      284 +static void
      285 +cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
      286 +{
      287 +        uint32_t old_cwnd = tcp->tcp_cwnd;
      288 +
      289 +        if (CC_ALGO(tcp)->post_recovery != NULL) {
      290 +                tcp->tcp_ccv.curack = seg_ack;
      291 +                CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
      292 +        }
      293 +        tcp->tcp_ccv.t_bytes_acked = 0;
      294 +
      295 +        DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
      296 +            uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
      297 +}
      298 +
      299 +/*
 173  300   * Set the MSS associated with a particular tcp based on its current value,
 174  301   * and a new one passed in. Observe minimums and maximums, and reset other
 175  302   * state variables that we want to view as multiples of MSS.
 176  303   *
  177  304   * The value of MSS could be either increased or decreased.
 178  305   */
 179  306  void
 180  307  tcp_mss_set(tcp_t *tcp, uint32_t mss)
 181  308  {
 182  309          uint32_t        mss_max;
↓ open down ↓ 358 lines elided ↑ open up ↑
 541  668           * didn't want to do window scale, tcp_rwnd_set() will take
 542  669           * care of that.
 543  670           */
 544  671          tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
 545  672  
 546  673          /*
 547  674           * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
 548  675           * updated properly.
 549  676           */
 550  677          TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
      678 +
      679 +        if (tcp->tcp_cc_algo->conn_init != NULL)
      680 +                tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
 551  681  }
 552  682  
 553  683  /*
 554  684   * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
 555  685   * is filled, return as much as we can.  The message passed in may be
 556  686   * multi-part, chained using b_cont.  "start" is the starting sequence
 557  687   * number for this piece.
 558  688   */
 559  689  static mblk_t *
 560  690  tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
 561  691  {
 562      -        uint32_t        end;
      692 +        uint32_t        end, bytes;
 563  693          mblk_t          *mp1;
 564  694          mblk_t          *mp2;
 565  695          mblk_t          *next_mp;
 566  696          uint32_t        u1;
 567  697          tcp_stack_t     *tcps = tcp->tcp_tcps;
 568  698  
 569  699  
 570  700          /* Walk through all the new pieces. */
 571  701          do {
 572  702                  ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
 573  703                      (uintptr_t)INT_MAX);
 574  704                  end = start + (int)(mp->b_wptr - mp->b_rptr);
 575  705                  next_mp = mp->b_cont;
 576  706                  if (start == end) {
 577  707                          /* Empty.  Blast it. */
 578  708                          freeb(mp);
 579  709                          continue;
 580  710                  }
      711 +                bytes = end - start;
 581  712                  mp->b_cont = NULL;
 582  713                  TCP_REASS_SET_SEQ(mp, start);
 583  714                  TCP_REASS_SET_END(mp, end);
 584  715                  mp1 = tcp->tcp_reass_tail;
 585      -                if (!mp1) {
      716 +                if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
      717 +                        if (mp1 != NULL) {
      718 +                                /*
      719 +                                 * New stuff is beyond the tail; link it on the
      720 +                                 * end.
      721 +                                 */
      722 +                                mp1->b_cont = mp;
      723 +                        } else {
      724 +                                tcp->tcp_reass_head = mp;
      725 +                        }
 586  726                          tcp->tcp_reass_tail = mp;
 587      -                        tcp->tcp_reass_head = mp;
 588  727                          TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
 589      -                        TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
 590      -                            end - start);
      728 +                        TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
      729 +                        tcp->tcp_cs.tcp_in_data_unorder_segs++;
      730 +                        tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
 591  731                          continue;
 592  732                  }
 593      -                /* New stuff completely beyond tail? */
 594      -                if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
 595      -                        /* Link it on end. */
 596      -                        mp1->b_cont = mp;
 597      -                        tcp->tcp_reass_tail = mp;
 598      -                        TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
 599      -                        TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
 600      -                            end - start);
 601      -                        continue;
 602      -                }
 603  733                  mp1 = tcp->tcp_reass_head;
 604  734                  u1 = TCP_REASS_SEQ(mp1);
 605  735                  /* New stuff at the front? */
 606  736                  if (SEQ_LT(start, u1)) {
 607  737                          /* Yes... Check for overlap. */
 608  738                          mp->b_cont = mp1;
 609  739                          tcp->tcp_reass_head = mp;
 610  740                          tcp_reass_elim_overlap(tcp, mp);
 611  741                          continue;
 612  742                  }
↓ open down ↓ 1704 lines elided ↑ open up ↑
2317 2447          uint32_t        seg_ack;
2318 2448          int             seg_len;
2319 2449          uint_t          ip_hdr_len;
2320 2450          uint32_t        seg_seq;
2321 2451          tcpha_t         *tcpha;
2322 2452          int             urp;
2323 2453          tcp_opt_t       tcpopt;
2324 2454          ip_pkt_t        ipp;
2325 2455          boolean_t       ofo_seg = B_FALSE; /* Out of order segment */
2326 2456          uint32_t        cwnd;
2327      -        uint32_t        add;
2328      -        int             npkt;
2329 2457          int             mss;
2330 2458          conn_t          *connp = (conn_t *)arg;
2331 2459          squeue_t        *sqp = (squeue_t *)arg2;
2332 2460          tcp_t           *tcp = connp->conn_tcp;
2333 2461          tcp_stack_t     *tcps = tcp->tcp_tcps;
2334 2462          sock_upcalls_t  *sockupcalls;
2335 2463  
2336 2464          /*
2337 2465           * RST from fused tcp loopback peer should trigger an unfuse.
2338 2466           */
↓ open down ↓ 68 lines elided ↑ open up ↑
2407 2535                   * that it is also updated for tcp structure that belongs to
2408 2536                   * global and listener queues which do not really need updating.
2409 2537                   * But that should not cause any harm.  And it is updated for
2410 2538                   * all kinds of incoming segments, not only for data segments.
2411 2539                   */
2412 2540                  tcp->tcp_last_recv_time = LBOLT_FASTPATH;
2413 2541          }
2414 2542  
2415 2543          flags = (unsigned int)tcpha->tha_flags & 0xFF;
2416 2544  
2417      -        BUMP_LOCAL(tcp->tcp_ibsegs);
     2545 +        TCPS_BUMP_MIB(tcps, tcpHCInSegs);
2418 2546          DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2419 2547  
2420 2548          if ((flags & TH_URG) && sqp != NULL) {
2421 2549                  /*
2422 2550                   * TCP can't handle urgent pointers that arrive before
2423 2551                   * the connection has been accept()ed since it can't
2424 2552                   * buffer OOB data.  Discard segment if this happens.
2425 2553                   *
2426 2554                   * We can't just rely on a non-null tcp_listener to indicate
2427 2555                   * that the accept() has completed since unlinking of the
↓ open down ↓ 166 lines elided ↑ open up ↑
2594 2722                                  tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2595 2723                                  tcp->tcp_rexmit_max = tcp->tcp_snxt;
2596 2724                                  tcp->tcp_ms_we_have_waited = 0;
2597 2725  
2598 2726                                  /*
2599 2727                                   * Set tcp_cwnd back to 1 MSS, per
2600 2728                                   * recommendation from
2601 2729                                   * draft-floyd-incr-init-win-01.txt,
2602 2730                                   * Increasing TCP's Initial Window.
2603 2731                                   */
     2732 +                                DTRACE_PROBE3(cwnd__retransmitted__syn,
     2733 +                                    tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
     2734 +                                    uint32_t, tcp->tcp_mss);
2604 2735                                  tcp->tcp_cwnd = tcp->tcp_mss;
2605 2736                          }
2606 2737  
2607 2738                          tcp->tcp_swl1 = seg_seq;
2608 2739                          tcp->tcp_swl2 = seg_ack;
2609 2740  
2610 2741                          new_swnd = ntohs(tcpha->tha_win);
2611 2742                          tcp->tcp_swnd = new_swnd;
2612 2743                          if (new_swnd > tcp->tcp_max_swnd)
2613 2744                                  tcp->tcp_max_swnd = new_swnd;
↓ open down ↓ 38 lines elided ↑ open up ↑
2652 2783                                   * final ACK triggers the passive side to
2653 2784                                   * perform fusion in ESTABLISHED state.
2654 2785                                   */
2655 2786                                  if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2656 2787                                          if (tcp->tcp_ack_tid != 0) {
2657 2788                                                  (void) TCP_TIMER_CANCEL(tcp,
2658 2789                                                      tcp->tcp_ack_tid);
2659 2790                                                  tcp->tcp_ack_tid = 0;
2660 2791                                          }
2661 2792                                          tcp_send_data(tcp, ack_mp);
2662      -                                        BUMP_LOCAL(tcp->tcp_obsegs);
     2793 +                                        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2663 2794                                          TCPS_BUMP_MIB(tcps, tcpOutAck);
2664 2795  
2665 2796                                          if (!IPCL_IS_NONSTR(connp)) {
2666 2797                                                  /* Send up T_CONN_CON */
2667 2798                                                  if (ira->ira_cred != NULL) {
2668 2799                                                          mblk_setcred(mp1,
2669 2800                                                              ira->ira_cred,
2670 2801                                                              ira->ira_cpid);
2671 2802                                                  }
2672 2803                                                  putnext(connp->conn_rq, mp1);
↓ open down ↓ 368 lines elided ↑ open up ↑
3041 3172          }
3042 3173          /*
3043 3174           * rgap is the amount of stuff received out of window.  A negative
3044 3175           * value is the amount out of window.
3045 3176           */
3046 3177          if (rgap < 0) {
3047 3178                  mblk_t  *mp2;
3048 3179  
3049 3180                  if (tcp->tcp_rwnd == 0) {
3050 3181                          TCPS_BUMP_MIB(tcps, tcpInWinProbe);
     3182 +                        tcp->tcp_cs.tcp_in_zwnd_probes++;
3051 3183                  } else {
3052 3184                          TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
3053 3185                          TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
3054 3186                  }
3055 3187  
3056 3188                  /*
3057 3189                   * seg_len does not include the FIN, so if more than
3058 3190                   * just the FIN is out of window, we act like we don't
3059 3191                   * see it.  (If just the FIN is out of window, rgap
3060 3192                   * will be zero and we will go ahead and acknowledge
↓ open down ↓ 229 lines elided ↑ open up ↑
3290 3422                                      tcp->tcp_reass_tid == 0) {
3291 3423                                          tcp->tcp_reass_tid = TCP_TIMER(tcp,
3292 3424                                              tcp_reass_timer,
3293 3425                                              tcps->tcps_reass_timeout);
3294 3426                                  }
3295 3427                          }
3296 3428                  }
3297 3429          } else if (seg_len > 0) {
3298 3430                  TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
3299 3431                  TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
     3432 +                tcp->tcp_cs.tcp_in_data_inorder_segs++;
     3433 +                tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
     3434 +
3300 3435                  /*
3301 3436                   * If an out of order FIN was received before, and the seq
3302 3437                   * num and len of the new segment match that of the FIN,
3303 3438                   * put the FIN flag back in.
3304 3439                   */
3305 3440                  if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3306 3441                      seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3307 3442                          flags |= TH_FIN;
3308 3443                          tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
3309 3444                  }
↓ open down ↓ 45 lines elided ↑ open up ↑
3355 3490                      TH_RST|TH_ACK);
3356 3491                  ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3357 3492                  (void) tcp_clean_death(tcp, ECONNRESET);
3358 3493                  return;
3359 3494          }
3360 3495          /*
3361 3496           * urp could be -1 when the urp field in the packet is 0
3362 3497           * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3363 3498           * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3364 3499           */
3365      -        if (flags & TH_URG && urp >= 0) {
     3500 +        if ((flags & TH_URG) && urp >= 0) {
3366 3501                  if (!tcp->tcp_urp_last_valid ||
3367 3502                      SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3368 3503                          /*
 3369 3504                           * Non-STREAMS sockets handle the urgent data a little
3370 3505                           * differently from STREAMS based sockets. There is no
3371 3506                           * need to mark any mblks with the MSG{NOT,}MARKNEXT
3372 3507                           * flags to keep SIOCATMARK happy. Instead a
3373 3508                           * su_signal_oob upcall is made to update the mark.
3374 3509                           * Neither is a T_EXDATA_IND mblk needed to be
3375 3510                           * prepended to the urgent data. The urgent data is
↓ open down ↓ 436 lines elided ↑ open up ↑
3812 3947                  /*
3813 3948                   * If SYN was retransmitted, need to reset all
3814 3949                   * retransmission info as this segment will be
3815 3950                   * treated as a dup ACK.
3816 3951                   */
3817 3952                  if (tcp->tcp_rexmit) {
3818 3953                          tcp->tcp_rexmit = B_FALSE;
3819 3954                          tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3820 3955                          tcp->tcp_rexmit_max = tcp->tcp_snxt;
3821 3956                          tcp->tcp_ms_we_have_waited = 0;
     3957 +                        DTRACE_PROBE3(cwnd__retransmitted__syn,
     3958 +                            tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
     3959 +                            uint32_t, tcp->tcp_mss);
3822 3960                          tcp->tcp_cwnd = mss;
3823 3961                  }
3824 3962  
3825 3963                  /*
3826 3964                   * We set the send window to zero here.
3827 3965                   * This is needed if there is data to be
3828 3966                   * processed already on the queue.
3829 3967                   * Later (at swnd_update label), the
3830 3968                   * "new_swnd > tcp_swnd" condition is satisfied
3831 3969                   * the XMIT_NEEDED flag is set in the current
↓ open down ↓ 23 lines elided ↑ open up ↑
3855 3993          if (bytes_acked < 0)
3856 3994                  goto est;
3857 3995  
3858 3996          /*
3859 3997           * If TCP is ECN capable and the congestion experience bit is
3860 3998           * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
3861 3999           * done once per window (or more loosely, per RTT).
3862 4000           */
3863 4001          if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
3864 4002                  tcp->tcp_cwr = B_FALSE;
3865      -        if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
3866      -                if (!tcp->tcp_cwr) {
3867      -                        npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
3868      -                        tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
3869      -                        tcp->tcp_cwnd = npkt * mss;
3870      -                        /*
3871      -                         * If the cwnd is 0, use the timer to clock out
3872      -                         * new segments.  This is required by the ECN spec.
3873      -                         */
3874      -                        if (npkt == 0) {
3875      -                                TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3876      -                                /*
3877      -                                 * This makes sure that when the ACK comes
3878      -                                 * back, we will increase tcp_cwnd by 1 MSS.
3879      -                                 */
3880      -                                tcp->tcp_cwnd_cnt = 0;
3881      -                        }
3882      -                        tcp->tcp_cwr = B_TRUE;
3883      -                        /*
3884      -                         * This marks the end of the current window of in
3885      -                         * flight data.  That is why we don't use
3886      -                         * tcp_suna + tcp_swnd.  Only data in flight can
3887      -                         * provide ECN info.
3888      -                         */
3889      -                        tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3890      -                        tcp->tcp_ecn_cwr_sent = B_FALSE;
3891      -                }
     4003 +        if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
     4004 +                cc_cong_signal(tcp, seg_ack, CC_ECN);
     4005 +                /*
     4006 +                 * If the cwnd is 0, use the timer to clock out
     4007 +                 * new segments.  This is required by the ECN spec.
     4008 +                 */
     4009 +                if (tcp->tcp_cwnd == 0)
     4010 +                        TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
     4011 +                tcp->tcp_cwr = B_TRUE;
     4012 +                /*
     4013 +                 * This marks the end of the current window of in
     4014 +                 * flight data.  That is why we don't use
     4015 +                 * tcp_suna + tcp_swnd.  Only data in flight can
     4016 +                 * provide ECN info.
     4017 +                 */
     4018 +                tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3892 4019          }
3893 4020  
3894 4021          mp1 = tcp->tcp_xmit_head;
3895 4022          if (bytes_acked == 0) {
3896 4023                  if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
3897 4024                          int dupack_cnt;
3898 4025  
3899 4026                          TCPS_BUMP_MIB(tcps, tcpInDupAck);
3900 4027                          /*
3901 4028                           * Fast retransmit.  When we have seen exactly three
↓ open down ↓ 1 lines elided ↑ open up ↑
3903 4030                           * outstanding we take it as a hint that our peer
3904 4031                           * dropped something.
3905 4032                           *
3906 4033                           * If TCP is retransmitting, don't do fast retransmit.
3907 4034                           */
3908 4035                          if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
3909 4036                              ! tcp->tcp_rexmit) {
3910 4037                                  /* Do Limited Transmit */
3911 4038                                  if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
3912 4039                                      tcps->tcps_dupack_fast_retransmit) {
     4040 +                                        cc_ack_received(tcp, seg_ack,
     4041 +                                            bytes_acked, CC_DUPACK);
3913 4042                                          /*
3914 4043                                           * RFC 3042
3915 4044                                           *
3916 4045                                           * What we need to do is temporarily
3917 4046                                           * increase tcp_cwnd so that new
3918 4047                                           * data can be sent if it is allowed
3919 4048                                           * by the receive window (tcp_rwnd).
3920 4049                                           * tcp_wput_data() will take care of
3921 4050                                           * the rest.
3922 4051                                           *
↓ open down ↓ 26 lines elided ↑ open up ↑
3949 4078                                   * away.  After one window of data, tcp_cwr
3950 4079                                   * should then be cleared.  Note that
3951 4080                                   * for non ECN capable connection, tcp_cwr
3952 4081                                   * should always be false.
3953 4082                                   *
3954 4083                                   * Adjust cwnd since the duplicate
3955 4084                                   * ack indicates that a packet was
3956 4085                                   * dropped (due to congestion.)
3957 4086                                   */
3958 4087                                  if (!tcp->tcp_cwr) {
3959      -                                        npkt = ((tcp->tcp_snxt -
3960      -                                            tcp->tcp_suna) >> 1) / mss;
3961      -                                        tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
3962      -                                            mss;
3963      -                                        tcp->tcp_cwnd = (npkt +
3964      -                                            tcp->tcp_dupack_cnt) * mss;
     4088 +                                        cc_cong_signal(tcp, seg_ack,
     4089 +                                            CC_NDUPACK);
     4090 +                                        cc_ack_received(tcp, seg_ack,
     4091 +                                            bytes_acked, CC_DUPACK);
3965 4092                                  }
3966 4093                                  if (tcp->tcp_ecn_ok) {
3967 4094                                          tcp->tcp_cwr = B_TRUE;
3968 4095                                          tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3969 4096                                          tcp->tcp_ecn_cwr_sent = B_FALSE;
3970 4097                                  }
3971 4098  
3972 4099                                  /*
3973 4100                                   * We do Hoe's algorithm.  Refer to her
3974 4101                                   * paper "Improving the Start-up Behavior
↓ open down ↓ 41 lines elided ↑ open up ↑
4016 4143                                                   * funny things will happen.
4017 4144                                                   */
4018 4145                                                  tcp->tcp_pipe =
4019 4146                                                      tcp->tcp_cwnd_ssthresh;
4020 4147                                          }
4021 4148                                  } else {
4022 4149                                          flags |= TH_REXMIT_NEEDED;
4023 4150                                  } /* tcp_snd_sack_ok */
4024 4151  
4025 4152                                  } else {
     4153 +                                        cc_ack_received(tcp, seg_ack,
     4154 +                                            bytes_acked, CC_DUPACK);
4026 4155                                          /*
4027 4156                                           * Here we perform congestion
4028 4157                                           * avoidance, but NOT slow start.
4029 4158                                           * This is known as the Fast
4030 4159                                           * Recovery Algorithm.
4031 4160                                           */
4032 4161                                          if (tcp->tcp_snd_sack_ok &&
4033 4162                                              tcp->tcp_notsack_list != NULL) {
4034 4163                                                  flags |= TH_NEED_SACK_REXMIT;
4035 4164                                                  tcp->tcp_pipe -= mss;
↓ open down ↓ 1 lines elided ↑ open up ↑
4037 4166                                                          tcp->tcp_pipe = 0;
4038 4167                                          } else {
4039 4168                                          /*
4040 4169                                           * We know that one more packet has
4041 4170                                           * left the pipe thus we can update
4042 4171                                           * cwnd.
4043 4172                                           */
4044 4173                                          cwnd = tcp->tcp_cwnd + mss;
4045 4174                                          if (cwnd > tcp->tcp_cwnd_max)
4046 4175                                                  cwnd = tcp->tcp_cwnd_max;
     4176 +                                        DTRACE_PROBE3(cwnd__fast__recovery,
     4177 +                                            tcp_t *, tcp,
     4178 +                                            uint32_t, tcp->tcp_cwnd,
     4179 +                                            uint32_t, cwnd);
4047 4180                                          tcp->tcp_cwnd = cwnd;
4048 4181                                          if (tcp->tcp_unsent > 0)
4049 4182                                                  flags |= TH_XMIT_NEEDED;
4050 4183                                          }
4051 4184                                  }
4052 4185                          }
4053 4186                  } else if (tcp->tcp_zero_win_probe) {
4054 4187                          /*
4055 4188                           * If the window has opened, need to arrange
4056 4189                           * to send additional data.
↓ open down ↓ 82 lines elided ↑ open up ↑
4139 4272                              tcp_drop_ack_unsent_cnt) {
4140 4273                                  TCP_STAT(tcps, tcp_in_ack_unsent_drop);
4141 4274                                  if (tcp->tcp_in_ack_unsent > 2 *
4142 4275                                      tcp_drop_ack_unsent_cnt) {
4143 4276                                          (void) tcp_clean_death(tcp, EPROTO);
4144 4277                                  }
4145 4278                                  return;
4146 4279                          }
4147 4280                          mp = tcp_ack_mp(tcp);
4148 4281                          if (mp != NULL) {
4149      -                                BUMP_LOCAL(tcp->tcp_obsegs);
     4282 +                                TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4150 4283                                  TCPS_BUMP_MIB(tcps, tcpOutAck);
4151 4284                                  tcp_send_data(tcp, mp);
4152 4285                          }
4153 4286                          return;
4154 4287                  }
4155 4288          } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
4156 4289              tcp->tcp_snxt_shrunk)) {
4157 4290                          tcp->tcp_is_wnd_shrnk = B_FALSE;
4158 4291          }
4159 4292  
↓ open down ↓ 9 lines elided ↑ open up ↑
4169 4302          /*
4170 4303           * If we got an ACK after fast retransmit, check to see
4171 4304           * if it is a partial ACK.  If it is not and the congestion
4172 4305           * window was inflated to account for the other side's
4173 4306           * cached packets, retract it.  If it is, do Hoe's algorithm.
4174 4307           */
4175 4308          if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4176 4309                  ASSERT(tcp->tcp_rexmit == B_FALSE);
4177 4310                  if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4178 4311                          tcp->tcp_dupack_cnt = 0;
4179      -                        /*
4180      -                         * Restore the orig tcp_cwnd_ssthresh after
4181      -                         * fast retransmit phase.
4182      -                         */
4183      -                        if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
4184      -                                tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
4185      -                        }
     4312 +
     4313 +                        cc_post_recovery(tcp, seg_ack);
     4314 +
4186 4315                          tcp->tcp_rexmit_max = seg_ack;
4187      -                        tcp->tcp_cwnd_cnt = 0;
4188 4316  
4189 4317                          /*
4190 4318                           * Remove all notsack info to avoid confusion with
 4191 4319                           * the next fast retransmit/recovery phase.
4192 4320                           */
4193 4321                          if (tcp->tcp_snd_sack_ok) {
4194 4322                                  TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4195 4323                                      tcp);
4196 4324                          }
4197 4325                  } else {
↓ open down ↓ 8 lines elided ↑ open up ↑
4206 4334                                   * Hoe's algorithm:
4207 4335                                   *
4208 4336                                   * Retransmit the unack'ed segment and
4209 4337                                   * restart fast recovery.  Note that we
4210 4338                                   * need to scale back tcp_cwnd to the
4211 4339                                   * original value when we started fast
4212 4340                                   * recovery.  This is to prevent overly
4213 4341                                   * aggressive behaviour in sending new
4214 4342                                   * segments.
4215 4343                                   */
4216      -                                tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
     4344 +                                cwnd = tcp->tcp_cwnd_ssthresh +
4217 4345                                      tcps->tcps_dupack_fast_retransmit * mss;
     4346 +                                DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
     4347 +                                    tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
     4348 +                                    uint32_t, cwnd);
     4349 +                                tcp->tcp_cwnd = cwnd;
4218 4350                                  tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4219 4351                                  flags |= TH_REXMIT_NEEDED;
4220 4352                          }
4221 4353                  }
4222 4354          } else {
4223 4355                  tcp->tcp_dupack_cnt = 0;
4224 4356                  if (tcp->tcp_rexmit) {
4225 4357                          /*
 4226 4358                           * TCP is retransmitting.  If the ACK ack's all
4227 4359                           * outstanding data, update tcp_rexmit_max and
↓ open down ↓ 40 lines elided ↑ open up ↑
4268 4400          }
4269 4401  
4270 4402          /*
4271 4403           * Update the congestion window.
4272 4404           *
4273 4405           * If TCP is not ECN capable or TCP is ECN capable but the
4274 4406           * congestion experience bit is not set, increase the tcp_cwnd as
4275 4407           * usual.
4276 4408           */
4277 4409          if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4278      -                cwnd = tcp->tcp_cwnd;
4279      -                add = mss;
4280      -
4281      -                if (cwnd >= tcp->tcp_cwnd_ssthresh) {
4282      -                        /*
4283      -                         * This is to prevent an increase of less than 1 MSS of
4284      -                         * tcp_cwnd.  With partial increase, tcp_wput_data()
4285      -                         * may send out tinygrams in order to preserve mblk
4286      -                         * boundaries.
4287      -                         *
4288      -                         * By initializing tcp_cwnd_cnt to new tcp_cwnd and
4289      -                         * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
4290      -                         * increased by 1 MSS for every RTTs.
4291      -                         */
4292      -                        if (tcp->tcp_cwnd_cnt <= 0) {
4293      -                                tcp->tcp_cwnd_cnt = cwnd + add;
4294      -                        } else {
4295      -                                tcp->tcp_cwnd_cnt -= add;
4296      -                                add = 0;
4297      -                        }
     4410 +                if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
     4411 +                        EXIT_RECOVERY(tcp->tcp_ccv.flags);
4298 4412                  }
4299      -                tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
     4413 +                cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
4300 4414          }
4301 4415  
4302 4416          /* See if the latest urgent data has been acknowledged */
4303 4417          if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4304 4418              SEQ_GT(seg_ack, tcp->tcp_urg))
4305 4419                  tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4306 4420  
4307      -        /* Can we update the RTT estimates? */
4308      -        if (tcp->tcp_snd_ts_ok) {
4309      -                /* Ignore zero timestamp echo-reply. */
4310      -                if (tcpopt.tcp_opt_ts_ecr != 0) {
4311      -                        tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4312      -                            (int32_t)tcpopt.tcp_opt_ts_ecr);
4313      -                }
4314      -
4315      -                /* If needed, restart the timer. */
4316      -                if (tcp->tcp_set_timer == 1) {
4317      -                        TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4318      -                        tcp->tcp_set_timer = 0;
4319      -                }
     4421 +        /*
     4422 +         * Update the RTT estimates. Note that we don't use the TCP
     4423 +         * timestamp option to calculate RTT even if one is present. This is
     4424 +         * because the timestamp option's resolution (CPU tick) is
     4425 +         * too coarse to measure modern datacenter networks' microsecond
     4426 +         * latencies. The timestamp field's resolution is limited by its
     4427 +         * 4-byte width (see RFC1323), and since we always store a
      4428 +         * high-resolution nanosecond precision timestamp along with the data,
     4429 +         * there is no point to ever using the timestamp option.
     4430 +         */
     4431 +        if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4320 4432                  /*
4321      -                 * Update tcp_csuna in case the other side stops sending
4322      -                 * us timestamps.
4323      -                 */
4324      -                tcp->tcp_csuna = tcp->tcp_snxt;
4325      -        } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4326      -                /*
4327 4433                   * An ACK sequence we haven't seen before, so get the RTT
4328 4434                   * and update the RTO. But first check if the timestamp is
4329 4435                   * valid to use.
4330 4436                   */
4331 4437                  if ((mp1->b_next != NULL) &&
4332      -                    SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
     4438 +                    SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
     4439 +#ifdef KERNEL_32
4333 4440                          tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4334 4441                              (int32_t)(intptr_t)mp1->b_prev);
4335      -                else
     4442 +#else
     4443 +                        tcp_set_rto(tcp, gethrtime() -
     4444 +                            (hrtime_t)(intptr_t)mp1->b_prev);
     4445 +#endif
     4446 +                } else {
4336 4447                          TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
     4448 +                }
4337 4449  
 4338 4450                  /* Remember the last sequence to be ACKed */
4339 4451                  tcp->tcp_csuna = seg_ack;
4340 4452                  if (tcp->tcp_set_timer == 1) {
4341 4453                          TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4342 4454                          tcp->tcp_set_timer = 0;
4343 4455                  }
4344 4456          } else {
4345 4457                  TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4346 4458          }
↓ open down ↓ 7 lines elided ↑ open up ↑
4354 4466                  ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
4355 4467                  bytes_acked -= (int)(wptr - mp1->b_rptr);
4356 4468                  if (bytes_acked < 0) {
4357 4469                          mp1->b_rptr = wptr + bytes_acked;
4358 4470                          /*
4359 4471                           * Set a new timestamp if all the bytes timed by the
4360 4472                           * old timestamp have been ack'ed.
4361 4473                           */
4362 4474                          if (SEQ_GT(seg_ack,
4363 4475                              (uint32_t)(uintptr_t)(mp1->b_next))) {
     4476 +#ifdef KERNEL_32
4364 4477                                  mp1->b_prev =
4365 4478                                      (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
     4479 +#else
     4480 +                                mp1->b_prev =
     4481 +                                    (mblk_t *)(intptr_t)gethrtime();
     4482 +#endif
4366 4483                                  mp1->b_next = NULL;
4367 4484                          }
4368 4485                          break;
4369 4486                  }
4370 4487                  mp1->b_next = NULL;
4371 4488                  mp1->b_prev = NULL;
4372 4489                  mp2 = mp1;
4373 4490                  mp1 = mp1->b_cont;
4374 4491  
4375 4492                  /*
↓ open down ↓ 455 lines elided ↑ open up ↑
4831 4948                          TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4832 4949                          if (snd_size > mss)
4833 4950                                  snd_size = mss;
4834 4951                          if (snd_size > tcp->tcp_swnd)
4835 4952                                  snd_size = tcp->tcp_swnd;
4836 4953                          mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4837 4954                              NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4838 4955                              B_TRUE);
4839 4956  
4840 4957                          if (mp1 != NULL) {
     4958 +#ifdef KERNEL_32
4841 4959                                  tcp->tcp_xmit_head->b_prev =
4842 4960                                      (mblk_t *)LBOLT_FASTPATH;
     4961 +#else
     4962 +                                tcp->tcp_xmit_head->b_prev =
     4963 +                                    (mblk_t *)(intptr_t)gethrtime();
     4964 +#endif
4843 4965                                  tcp->tcp_csuna = tcp->tcp_snxt;
4844 4966                                  TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4845 4967                                  TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4846 4968                                      snd_size);
     4969 +                                tcp->tcp_cs.tcp_out_retrans_segs++;
     4970 +                                tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
4847 4971                                  tcp_send_data(tcp, mp1);
4848 4972                          }
4849 4973                  }
4850 4974                  if (flags & TH_NEED_SACK_REXMIT) {
4851 4975                          tcp_sack_rexmit(tcp, &flags);
4852 4976                  }
4853 4977                  /*
4854 4978                   * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4855 4979                   * out new segment.  Note that tcp_rexmit should not be
4856 4980                   * set, otherwise TH_LIMIT_XMIT should not be set.
↓ open down ↓ 9 lines elided ↑ open up ↑
4866 4990                   * Adjust tcp_cwnd back to normal value after sending
4867 4991                   * new data segments.
4868 4992                   */
4869 4993                  if (flags & TH_LIMIT_XMIT) {
4870 4994                          tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4871 4995                          /*
4872 4996                           * This will restart the timer.  Restarting the
4873 4997                           * timer is used to avoid a timeout before the
4874 4998                           * limited transmitted segment's ACK gets back.
4875 4999                           */
4876      -                        if (tcp->tcp_xmit_head != NULL)
     5000 +                        if (tcp->tcp_xmit_head != NULL) {
     5001 +#ifdef KERNEL_32
4877 5002                                  tcp->tcp_xmit_head->b_prev =
4878 5003                                      (mblk_t *)LBOLT_FASTPATH;
     5004 +#else
     5005 +                                tcp->tcp_xmit_head->b_prev =
     5006 +                                    (mblk_t *)(intptr_t)gethrtime();
     5007 +#endif
     5008 +                        }
4879 5009                  }
4880 5010  
4881 5011                  /* Anything more to do? */
4882 5012                  if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
4883 5013                      TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4884 5014                          goto done;
4885 5015          }
4886 5016  ack_check:
4887 5017          if (flags & TH_SEND_URP_MARK) {
4888 5018                  ASSERT(tcp->tcp_urp_mark_mp);
↓ open down ↓ 22 lines elided ↑ open up ↑
4911 5041                  flags &= ~TH_SEND_URP_MARK;
4912 5042          }
4913 5043          if (flags & TH_ACK_NEEDED) {
4914 5044                  /*
4915 5045                   * Time to send an ack for some reason.
4916 5046                   */
4917 5047                  mp1 = tcp_ack_mp(tcp);
4918 5048  
4919 5049                  if (mp1 != NULL) {
4920 5050                          tcp_send_data(tcp, mp1);
4921      -                        BUMP_LOCAL(tcp->tcp_obsegs);
     5051 +                        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4922 5052                          TCPS_BUMP_MIB(tcps, tcpOutAck);
4923 5053                  }
4924 5054                  if (tcp->tcp_ack_tid != 0) {
4925 5055                          (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
4926 5056                          tcp->tcp_ack_tid = 0;
4927 5057                  }
4928 5058          }
4929 5059          if (flags & TH_ACK_TIMER_NEEDED) {
4930 5060                  /*
4931 5061                   * Arrange for deferred ACK or push wait timeout.
↓ open down ↓ 272 lines elided ↑ open up ↑
5204 5334                  ASSERT(OK_32PTR(optptr));
5205 5335                  /* Save as last value */
5206 5336                  ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5207 5337                      (ipp->ipp_fields & IPPF_DSTOPTS),
5208 5338                      ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5209 5339          }
5210 5340          ASSERT(optptr == mp->b_wptr);
5211 5341          return (mp);
5212 5342  }
5213 5343  
5214      -/* The minimum of smoothed mean deviation in RTO calculation. */
5215      -#define TCP_SD_MIN      400
     5344 +/* The minimum of smoothed mean deviation in RTO calculation (nsec). */
     5345 +#define TCP_SD_MIN      400000000
5216 5346  
5217 5347  /*
5218      - * Set RTO for this connection.  The formula is from Jacobson and Karels'
5219      - * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
5220      - * are the same as those in Appendix A.2 of that paper.
     5348 + * Set RTO for this connection based on a new round-trip time measurement.
     5349 + * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
     5350 + * in SIGCOMM '88.  The variable names are the same as those in Appendix A.2
     5351 + * of that paper.
5221 5352   *
5222 5353   * m = new measurement
5223 5354   * sa = smoothed RTT average (8 * average estimates).
5224 5355   * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5225 5356   */
5226 5357  static void
5227      -tcp_set_rto(tcp_t *tcp, clock_t rtt)
     5358 +tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
5228 5359  {
5229      -        long m = TICK_TO_MSEC(rtt);
5230      -        clock_t sa = tcp->tcp_rtt_sa;
5231      -        clock_t sv = tcp->tcp_rtt_sd;
5232      -        clock_t rto;
5233      -        tcp_stack_t     *tcps = tcp->tcp_tcps;
     5360 +        hrtime_t m = rtt;
     5361 +        hrtime_t sa = tcp->tcp_rtt_sa;
     5362 +        hrtime_t sv = tcp->tcp_rtt_sd;
     5363 +        tcp_stack_t *tcps = tcp->tcp_tcps;
5234 5364  
5235 5365          TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5236 5366          tcp->tcp_rtt_update++;
     5367 +        tcp->tcp_rtt_sum += m;
     5368 +        tcp->tcp_rtt_cnt++;
5237 5369  
5238 5370          /* tcp_rtt_sa is not 0 means this is a new sample. */
5239 5371          if (sa != 0) {
5240 5372                  /*
5241      -                 * Update average estimator:
5242      -                 *      new rtt = 7/8 old rtt + 1/8 Error
     5373 +                 * Update average estimator (see section 2.3 of RFC6298):
     5374 +                 *      SRTT = 7/8 SRTT + 1/8 rtt
     5375 +                 *
     5376 +                 * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
     5377 +                 *      tcp_rtt_sa = 7 SRTT + rtt
     5378 +                 *      tcp_rtt_sa = tcp_rtt_sa - 1/8 tcp_rtt_sa + rtt
     5379 +                 *      tcp_rtt_sa = tcp_rtt_sa + (rtt - 1/8 tcp_rtt_sa)
     5380 +                 *
     5381 +                 * (rtt - 1/8 tcp_rtt_sa) is simply the difference
     5382 +                 * between the new rtt measurement and the existing smoothed
     5383 +                 * RTT average. This is referred to as "Error" in subsequent
     5384 +                 * calculations.
5243 5385                   */
5244 5386  
5245      -                /* m is now Error in estimate. */
     5387 +                /* m is now Error. */
5246 5388                  m -= sa >> 3;
5247 5389                  if ((sa += m) <= 0) {
5248 5390                          /*
5249 5391                           * Don't allow the smoothed average to be negative.
5250 5392                           * We use 0 to denote reinitialization of the
5251 5393                           * variables.
5252 5394                           */
5253 5395                          sa = 1;
5254 5396                  }
5255 5397  
5256 5398                  /*
5257 5399                   * Update deviation estimator:
5258      -                 *      new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
     5400 +                 *      mdev = 3/4 mdev + 1/4 abs(Error)
     5401 +                 *
     5402 +                 * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
     5403 +                 *      tcp_rtt_sd = 3 mdev + abs(Error)
     5404 +                 *      tcp_rtt_sd = tcp_rtt_sd - 1/4 tcp_rtt_sd + abs(Error)
5259 5405                   */
5260 5406                  if (m < 0)
5261 5407                          m = -m;
5262 5408                  m -= sv >> 2;
5263 5409                  sv += m;
5264 5410          } else {
5265 5411                  /*
5266 5412                   * This follows BSD's implementation.  So the reinitialized
5267 5413                   * RTO is 3 * m.  We cannot go less than 2 because if the
5268 5414                   * link is bandwidth dominated, doubling the window size
↓ open down ↓ 5 lines elided ↑ open up ↑
5274 5420                  sv = m << 1;
5275 5421          }
5276 5422          if (sv < TCP_SD_MIN) {
5277 5423                  /*
5278 5424                   * We do not know that if sa captures the delay ACK
5279 5425                   * effect as in a long train of segments, a receiver
5280 5426                   * does not delay its ACKs.  So set the minimum of sv
5281 5427                   * to be TCP_SD_MIN, which is default to 400 ms, twice
5282 5428                   * of BSD DATO.  That means the minimum of mean
5283 5429                   * deviation is 100 ms.
5284      -                 *
5285 5430                   */
5286 5431                  sv = TCP_SD_MIN;
5287 5432          }
5288 5433          tcp->tcp_rtt_sa = sa;
5289 5434          tcp->tcp_rtt_sd = sv;
5290      -        /*
5291      -         * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
5292      -         *
5293      -         * Add tcp_rexmit_interval extra in case of extreme environment
5294      -         * where the algorithm fails to work.  The default value of
5295      -         * tcp_rexmit_interval_extra should be 0.
5296      -         *
5297      -         * As we use a finer grained clock than BSD and update
5298      -         * RTO for every ACKs, add in another .25 of RTT to the
5299      -         * deviation of RTO to accomodate burstiness of 1/4 of
5300      -         * window size.
5301      -         */
5302      -        rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
5303 5435  
5304      -        TCP_SET_RTO(tcp, rto);
     5436 +        tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
5305 5437  
5306 5438          /* Now, we can reset tcp_timer_backoff to use the new RTO... */
5307 5439          tcp->tcp_timer_backoff = 0;
5308 5440  }
5309 5441  
5310 5442  /*
5311 5443   * On a labeled system we have some protocols above TCP, such as RPC, which
5312 5444   * appear to assume that every mblk in a chain has a db_credp.
5313 5445   */
5314 5446  static void
↓ open down ↓ 304 lines elided ↑ open up ↑
5619 5751                  if (tcp_icmp_source_quench) {
5620 5752                          /*
5621 5753                           * Reduce the sending rate as if we got a
5622 5754                           * retransmit timeout
5623 5755                           */
5624 5756                          uint32_t npkt;
5625 5757  
5626 5758                          npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
5627 5759                              tcp->tcp_mss;
5628 5760                          tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
     5761 +
     5762 +                        DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
     5763 +                            uint32_t, tcp->tcp_cwnd,
     5764 +                            uint32_t, tcp->tcp_mss);
5629 5765                          tcp->tcp_cwnd = tcp->tcp_mss;
5630 5766                          tcp->tcp_cwnd_cnt = 0;
5631 5767                  }
5632 5768                  break;
5633 5769          }
5634 5770          freemsg(mp);
5635 5771  }
5636 5772  
5637 5773  /*
5638 5774   * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
↓ open down ↓ 150 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX