5295 Old usr/src/uts/common/inet/tcp/tcp

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2011 Joyent, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/strlog.h>
  30 #include <sys/strsun.h>
  31 #include <sys/squeue_impl.h>
  32 #include <sys/squeue.h>
  33 #include <sys/callo.h>
  34 #include <sys/strsubr.h>
  35 
  36 #include <inet/common.h>
  37 #include <inet/ip.h>
  38 #include <inet/ip_ire.h>
  39 #include <inet/ip_rts.h>
  40 #include <inet/tcp.h>
  41 #include <inet/tcp_impl.h>
  42 
  43 /*
  44  * Implementation of TCP Timers.
  45  * =============================
  46  *
  47  * INTERFACE:
  48  *
  49  * There are two basic functions dealing with tcp timers:
  50  *
  51  *      timeout_id_t    tcp_timeout(connp, func, time)
  52  *      clock_t         tcp_timeout_cancel(connp, timeout_id)
  53  *      TCP_TIMER_RESTART(tcp, intvl)
  54  *
  55  * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
  56  * after 'time' milliseconds have passed. The function called must adhere to
  57  * the same restrictions as a driver soft interrupt handler - it must not sleep
  58  * or call other functions that might sleep. The value returned is the opaque
  59  * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
  60  * cancel the request. The call to tcp_timeout() may fail in which case it
  61  * returns zero. This is different from the timeout(9F) function which never
  62  * fails.
  63  *
  64  * The call-back function 'func' always receives 'connp' as its single
  65  * argument. It is always executed in the squeue corresponding to the tcp
  66  * structure. The tcp structure is guaranteed to be present at the time the
  67  * call-back is called.
  68  *
  69  * NOTE: The call-back function 'func' is never called if tcp is in
  70  *      the TCPS_CLOSED state.
  71  *
  72  * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
  73  * request. locks acquired by the call-back routine should not be held across
  74  * the call to tcp_timeout_cancel() or a deadlock may result.
  75  *
  76  * tcp_timeout_cancel() returns -1 if the timeout request is invalid.
  77  * Otherwise, it returns an integer value greater than or equal to 0.
  78  *
  79  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
  80  *      within squeue context corresponding to the tcp instance. Since the
  81  *      call-back is also called via the same squeue, there are no race
  82  *      conditions described in untimeout(9F) manual page since all calls are
  83  *      strictly serialized.
  84  *
  85  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
  86  *      stored in tcp_timer_tid and starts a new one using
  87  *      MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
  88  *      and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
  89  *      field.
  90  *
  91  * IMPLEMENTATION:
  92  *
  93  * TCP timers are implemented using a three-stage process. The call to
  94  * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
  95  * when the timer expires. The tcp_timer_callback() arranges the call of the
  96  * tcp_timer_handler() function via squeue corresponding to the tcp
  97  * instance. The tcp_timer_handler() calls actual requested timeout call-back
  98  * and passes tcp instance as an argument to it. Information is passed between
  99  * stages using the tcp_timer_t structure which contains the connp pointer, the
 100  * tcp call-back to call and the timeout id returned by the timeout(9F).
 101  *
 102  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
 103  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
 104  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
 105  * returns the pointer to this mblk.
 106  *
 107  * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
 108  * looks like a normal mblk without actual dblk attached to it.
 109  *
 110  * To optimize performance each tcp instance holds a small cache of timer
 111  * mblocks. In the current implementation it caches up to two timer mblocks per
 112  * tcp instance. The cache is preserved over tcp frees and is only freed when
 113  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
 114  * timer processing happens on a corresponding squeue, the cache manipulation
 115  * does not require any locks. Experiments show that majority of timer mblocks
 116  * allocations are satisfied from the tcp cache and do not involve kmem calls.
 117  *
 118  * tcp_timeout() places a refhold on the connp instance which guarantees
 119  * that it will be present at the time the call-back function fires. The
 120  * reference is released once the handler has run (or by tcp_timeout_cancel()
 121  * on a successful cancel), so call-backs need not manipulate references.
 122  */
 123 
 124 kmem_cache_t *tcp_timercache;
 125 
 126 static void     tcp_ip_notify(tcp_t *);
 127 static void     tcp_timer_callback(void *);
 128 static void     tcp_timer_free(tcp_t *, mblk_t *);
 129 static void     tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *);
 130 
 131 /*
 132  * tim is in millisec.
 133  */
 134 timeout_id_t
 135 tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim)
 136 {
 137         mblk_t *mp;
 138         tcp_timer_t *tcpt;
 139         tcp_t *tcp = connp->conn_tcp;
 140 
 141         ASSERT(connp->conn_sqp != NULL);
 142 
 143         TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls);
 144 
 145         if (tcp->tcp_timercache == NULL) {
 146                 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
 147         } else {
 148                 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc);
 149                 mp = tcp->tcp_timercache;
 150                 tcp->tcp_timercache = mp->b_next;
 151                 mp->b_next = NULL;
 152                 ASSERT(mp->b_wptr == NULL);
 153         }
 154 
 155         CONN_INC_REF(connp);
 156         tcpt = (tcp_timer_t *)mp->b_rptr;
 157         tcpt->connp = connp;
 158         tcpt->tcpt_proc = f;
 159         /*
 160          * TCP timers are normal timeouts. Plus, they do not require more than
 161          * a 10 millisecond resolution. By choosing a coarser resolution and by
 162          * rounding up the expiration to the next resolution boundary, we can
 163          * batch timers in the callout subsystem to make TCP timers more
 164          * efficient. The roundup also protects short timers from expiring too
 165          * early before they have a chance to be cancelled.
 166          */
 167         tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp,
 168             tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
 169         VERIFY(!(tcpt->tcpt_tid & CALLOUT_ID_FREE));
 170 
 171         return ((timeout_id_t)mp);
 172 }
 173 
 174 static void
 175 tcp_timer_callback(void *arg)
 176 {
 177         mblk_t *mp = (mblk_t *)arg;
 178         tcp_timer_t *tcpt;
 179         conn_t  *connp;
 180 
 181         tcpt = (tcp_timer_t *)mp->b_rptr;
 182         connp = tcpt->connp;
 183         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
 184             NULL, SQ_FILL, SQTAG_TCP_TIMER);
 185 }
 186 
 187 /* ARGSUSED */
 188 static void
 189 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 190 {
 191         tcp_timer_t *tcpt;
 192         conn_t *connp = (conn_t *)arg;
 193         tcp_t *tcp = connp->conn_tcp;
 194 
 195         tcpt = (tcp_timer_t *)mp->b_rptr;
 196         ASSERT(connp == tcpt->connp);
 197         ASSERT((squeue_t *)arg2 == connp->conn_sqp);
 198 
 199         if (tcpt->tcpt_tid & CALLOUT_ID_FREE) {
 200                 /*
 201                  * This timeout was cancelled after it was enqueued to the
 202                  * squeue; free the timer and return.
 203                  */
 204                 tcp_timer_free(connp->conn_tcp, mp);
 205                 return;
 206         }
 207 
 208         /*
 209          * If the TCP has reached the closed state, don't proceed any
 210          * further. This TCP logically does not exist on the system.
 211          * tcpt_proc could for example access queues, that have already
 212          * been qprocoff'ed off.
 213          */
 214         if (tcp->tcp_state != TCPS_CLOSED) {
 215                 (*tcpt->tcpt_proc)(connp);
 216         } else {
 217                 tcp->tcp_timer_tid = 0;
 218         }
 219 
 220         tcp_timer_free(connp->conn_tcp, mp);
 221 }
 222 
 223 /*
 224  * There is potential race with untimeout and the handler firing at the same
 225  * time. The mblock may be freed by the handler while we are trying to use
 226  * it. But since both should execute on the same squeue, this race should not
 227  * occur.
 228  */
 229 clock_t
 230 tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
 231 {
 232         mblk_t  *mp = (mblk_t *)id;
 233         tcp_timer_t *tcpt;
 234         clock_t delta;
 235 
 236         TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs);
 237 
 238         if (mp == NULL)
 239                 return (-1);
 240 
 241         tcpt = (tcp_timer_t *)mp->b_rptr;
 242         ASSERT(tcpt->connp == connp);
 243 
 244         delta = untimeout_default(tcpt->tcpt_tid, 0);
 245 
 246         if (delta >= 0) {
 247                 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled);
 248                 tcp_timer_free(connp->conn_tcp, mp);
 249                 CONN_DEC_REF(connp);
 250         } else {
 251                 /*
 252                  * If we were unable to untimeout successfully, it has already
 253                  * been enqueued on the squeue; mark the ID with the free
 254                  * bit.  This bit can never be set in a valid identifier, and
 255                  * we'll use it to prevent the timeout from being executed.
 256                  * And note that we're within the squeue perimeter here, so
 257                  * we don't need to worry about racing with timer handling
 258                  * (which also executes within the perimeter).
 259                  */
 260                 tcpt->tcpt_tid |= CALLOUT_ID_FREE;
 261                 delta = 0;
 262         }
 263 
 264         return (TICK_TO_MSEC(delta));
 265 }
 266 
 267 /*
 268  * Allocate space for the timer event. The allocation looks like mblk, but it is
 269  * not a proper mblk. To avoid confusion we set b_wptr to NULL.
 270  *
 271  * Dealing with failures: If we can't allocate from the timer cache we try
 272  * allocating from dblock caches using allocb_tryhard(). In this case b_wptr
 273  * points to b_rptr.
 274  * If we can't allocate anything using allocb_tryhard(), we perform a last
 275  * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
 276  * save the actual allocation size in b_datap.
 277  */
 278 mblk_t *
 279 tcp_timermp_alloc(int kmflags)
 280 {
 281         mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache,
 282             kmflags & ~KM_PANIC);
 283 
 284         if (mp != NULL) {
 285                 mp->b_next = mp->b_prev = NULL;
 286                 mp->b_rptr = (uchar_t *)(&mp[1]);
 287                 mp->b_wptr = NULL;
 288                 mp->b_datap = NULL;
 289                 mp->b_queue = NULL;
 290                 mp->b_cont = NULL;
 291         } else if (kmflags & KM_PANIC) {
 292                 /*
 293                  * Failed to allocate memory for the timer. Try allocating from
 294                  * dblock caches.
 295                  */
 296                 /* ipclassifier calls this from a constructor - hence no tcps */
 297                 TCP_G_STAT(tcp_timermp_allocfail);
 298                 mp = allocb_tryhard(sizeof (tcp_timer_t));
 299                 if (mp == NULL) {
 300                         size_t size = 0;
 301                         /*
 302                          * Memory is really low. Try tryhard allocation.
 303                          *
 304                          * ipclassifier calls this from a constructor -
 305                          * hence no tcps
 306                          */
 307                         TCP_G_STAT(tcp_timermp_allocdblfail);
 308                         mp = kmem_alloc_tryhard(sizeof (mblk_t) +
 309                             sizeof (tcp_timer_t), &size, kmflags);
 310                         mp->b_rptr = (uchar_t *)(&mp[1]);
 311                         mp->b_next = mp->b_prev = NULL;
 312                         mp->b_wptr = (uchar_t *)-1;
 313                         mp->b_datap = (dblk_t *)size;
 314                         mp->b_queue = NULL;
 315                         mp->b_cont = NULL;
 316                 }
 317                 ASSERT(mp->b_wptr != NULL);
 318         }
 319         /* ipclassifier calls this from a constructor - hence no tcps */
 320         TCP_G_DBGSTAT(tcp_timermp_alloced);
 321 
 322         return (mp);
 323 }
 324 
 325 /*
 326  * Free per-tcp timer cache.
 327  * It can only contain entries from tcp_timercache.
 328  */
 329 void
 330 tcp_timermp_free(tcp_t *tcp)
 331 {
 332         mblk_t *mp;
 333 
 334         while ((mp = tcp->tcp_timercache) != NULL) {
 335                 ASSERT(mp->b_wptr == NULL);
 336                 tcp->tcp_timercache = tcp->tcp_timercache->b_next;
 337                 kmem_cache_free(tcp_timercache, mp);
 338         }
 339 }
 340 
 341 /*
 342  * Free timer event. Put it on the per-tcp timer cache if there is not too many
 343  * events there already (currently at most two events are cached).
 344  * If the event is not allocated from the timer cache, free it right away.
 345  */
 346 static void
 347 tcp_timer_free(tcp_t *tcp, mblk_t *mp)
 348 {
 349         mblk_t *mp1 = tcp->tcp_timercache;
 350 
 351         if (mp->b_wptr != NULL) {
 352                 /*
 353                  * This allocation is not from a timer cache, free it right
 354                  * away.
 355                  */
 356                 if (mp->b_wptr != (uchar_t *)-1)
 357                         freeb(mp);
 358                 else
 359                         kmem_free(mp, (size_t)mp->b_datap);
 360         } else if (mp1 == NULL || mp1->b_next == NULL) {
 361                 /* Cache this timer block for future allocations */
 362                 mp->b_rptr = (uchar_t *)(&mp[1]);
 363                 mp->b_next = mp1;
 364                 tcp->tcp_timercache = mp;
 365         } else {
 366                 kmem_cache_free(tcp_timercache, mp);
 367                 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed);
 368         }
 369 }
 370 
 371 /*
 372  * Stop all TCP timers.
 373  */
 374 void
 375 tcp_timers_stop(tcp_t *tcp)
 376 {
 377         if (tcp->tcp_timer_tid != 0) {
 378                 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
 379                 tcp->tcp_timer_tid = 0;
 380         }
 381         if (tcp->tcp_ka_tid != 0) {
 382                 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
 383                 tcp->tcp_ka_tid = 0;
 384         }
 385         if (tcp->tcp_ack_tid != 0) {
 386                 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
 387                 tcp->tcp_ack_tid = 0;
 388         }
 389         if (tcp->tcp_push_tid != 0) {
 390                 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
 391                 tcp->tcp_push_tid = 0;
 392         }
 393         if (tcp->tcp_reass_tid != 0) {
 394                 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
 395                 tcp->tcp_reass_tid = 0;
 396         }
 397 }
 398 
 399 /*
 400  * Timer callback routine for keepalive probe.  We do a fake resend of
 401  * last ACKed byte.  Then set a timer using RTO.  When the timer expires,
 402  * check to see if we have heard anything from the other end for the last
 403  * RTO period.  If we have, set the timer to expire for another
 404  * tcp_keepalive_intrvl and check again.  If we have not, set a timer using
 405  * RTO << 1 and check again when it expires.  Keep exponentially increasing
 406  * the timeout if we have not heard from the other side.  If for more than
 407  * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
 408  * kill the connection unless the keepalive abort threshold is 0.  In
 409  * that case, we will probe "forever."
 410  * If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow
 411  * the exponential backoff, but send probes tcp_ka_cnt times in regular
 412  * intervals of tcp_ka_rinterval milliseconds until we hear back from peer.
 413  * Kill the connection if we don't hear back from peer after tcp_ka_cnt
 414  * probes are sent.
 415  */
 416 void
 417 tcp_keepalive_timer(void *arg)
 418 {
 419         mblk_t  *mp;
 420         conn_t  *connp = (conn_t *)arg;
 421         tcp_t   *tcp = connp->conn_tcp;
 422         int32_t firetime;
 423         int32_t idletime;
 424         int32_t ka_intrvl;
 425         tcp_stack_t     *tcps = tcp->tcp_tcps;
 426 
 427         tcp->tcp_ka_tid = 0;
 428 
 429         if (tcp->tcp_fused)
 430                 return;
 431 
 432         TCPS_BUMP_MIB(tcps, tcpTimKeepalive);
 433         ka_intrvl = tcp->tcp_ka_interval;
 434 
 435         /*
 436          * Keepalive probe should only be sent if the application has not
 437          * done a close on the connection.
 438          */
 439         if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
 440                 return;
 441         }
 442         /* Timer fired too early, restart it. */
 443         if (tcp->tcp_state < TCPS_ESTABLISHED) {
 444                 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
 445                     ka_intrvl);
 446                 return;
 447         }
 448 
 449         idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
 450         /*
 451          * If we have not heard from the other side for a long
 452          * time, kill the connection unless the keepalive abort
 453          * threshold is 0.  In that case, we will probe "forever."
 454          */
 455         if (tcp->tcp_ka_abort_thres != 0 &&
 456             idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
 457                 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop);
 458                 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
 459                     tcp->tcp_client_errno : ETIMEDOUT);
 460                 return;
 461         }
 462 
 463         if (tcp->tcp_snxt == tcp->tcp_suna &&
 464             idletime >= ka_intrvl) {
 465                 /* Fake resend of last ACKed byte. */
 466                 mblk_t  *mp1 = allocb(1, BPRI_LO);
 467 
 468                 if (mp1 != NULL) {
 469                         *mp1->b_wptr++ = '\0';
 470                         mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
 471                             tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
 472                         freeb(mp1);
 473                         /*
 474                          * if allocation failed, fall through to start the
 475                          * timer back.
 476                          */
 477                         if (mp != NULL) {
 478                                 tcp_send_data(tcp, mp);
 479                                 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe);
 480                                 if (tcp->tcp_ka_rinterval) {
 481                                         firetime = tcp->tcp_ka_rinterval;
 482                                 } else if (tcp->tcp_ka_last_intrvl != 0) {
 483                                         int max;
 484                                         /*
 485                                          * We should probe again at least
 486                                          * in ka_intrvl, but not more than
 487                                          * tcp_rto_max.
 488                                          */
 489                                         max = tcp->tcp_rto_max;
 490                                         firetime = MIN(ka_intrvl - 1,
 491                                             tcp->tcp_ka_last_intrvl << 1);
 492                                         if (firetime > max)
 493                                                 firetime = max;
 494                                 } else {
 495                                         firetime = tcp->tcp_rto;
 496                                 }
 497                                 tcp->tcp_ka_tid = TCP_TIMER(tcp,
 498                                     tcp_keepalive_timer, firetime);
 499                                 tcp->tcp_ka_last_intrvl = firetime;
 500                                 return;
 501                         }
 502                 }
 503         } else {
 504                 tcp->tcp_ka_last_intrvl = 0;
 505         }
 506 
 507         /* firetime can be negative if (mp1 == NULL || mp == NULL) */
 508         if ((firetime = ka_intrvl - idletime) < 0) {
 509                 firetime = ka_intrvl;
 510         }
 511         tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime);
 512 }
 513 
 514 void
 515 tcp_reass_timer(void *arg)
 516 {
 517         conn_t *connp = (conn_t *)arg;
 518         tcp_t *tcp = connp->conn_tcp;
 519 
 520         tcp->tcp_reass_tid = 0;
 521         if (tcp->tcp_reass_head == NULL)
 522                 return;
 523         ASSERT(tcp->tcp_reass_tail != NULL);
 524         if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
 525                 tcp_sack_remove(tcp->tcp_sack_list,
 526                     TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk);
 527         }
 528         tcp_close_mpp(&tcp->tcp_reass_head);
 529         tcp->tcp_reass_tail = NULL;
 530         TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout);
 531 }
 532 
 533 /* This function handles the push timeout. */
 534 void
 535 tcp_push_timer(void *arg)
 536 {
 537         conn_t  *connp = (conn_t *)arg;
 538         tcp_t *tcp = connp->conn_tcp;
 539 
 540         TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt);
 541 
 542         ASSERT(tcp->tcp_listener == NULL);
 543 
 544         ASSERT(!IPCL_IS_NONSTR(connp));
 545 
 546         tcp->tcp_push_tid = 0;
 547 
 548         if (tcp->tcp_rcv_list != NULL &&
 549             tcp_rcv_drain(tcp) == TH_ACK_NEEDED)
 550                 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
 551 }
 552 
 553 /*
 554  * This function handles delayed ACK timeout.
 555  */
 556 void
 557 tcp_ack_timer(void *arg)
 558 {
 559         conn_t  *connp = (conn_t *)arg;
 560         tcp_t *tcp = connp->conn_tcp;
 561         mblk_t *mp;
 562         tcp_stack_t     *tcps = tcp->tcp_tcps;
 563 
 564         TCP_DBGSTAT(tcps, tcp_ack_timer_cnt);
 565 
 566         tcp->tcp_ack_tid = 0;
 567 
 568         if (tcp->tcp_fused)
 569                 return;
 570 
 571         /*
 572          * Do not send ACK if there is no outstanding unack'ed data.
 573          */
 574         if (tcp->tcp_rnxt == tcp->tcp_rack) {
 575                 return;
 576         }
 577 
 578         if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
 579                 /*
 580                  * Make sure we don't allow deferred ACKs to result in
 581                  * timer-based ACKing.  If we have held off an ACK
 582                  * when there was more than an mss here, and the timer
 583                  * goes off, we have to worry about the possibility
 584                  * that the sender isn't doing slow-start, or is out
 585                  * of step with us for some other reason.  We fall
 586                  * permanently back in the direction of
 587                  * ACK-every-other-packet as suggested in RFC 1122.
 588                  */
 589                 if (tcp->tcp_rack_abs_max > 2)
 590                         tcp->tcp_rack_abs_max--;
 591                 tcp->tcp_rack_cur_max = 2;
 592         }
 593         mp = tcp_ack_mp(tcp);
 594 
 595         if (mp != NULL) {
 596                 BUMP_LOCAL(tcp->tcp_obsegs);
 597                 TCPS_BUMP_MIB(tcps, tcpOutAck);
 598                 TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
 599                 tcp_send_data(tcp, mp);
 600         }
 601 }
 602 
 603 /*
 604  * Notify IP that we are having trouble with this connection.  IP should
 605  * make note so it can potentially use a different IRE.
 606  */
 607 static void
 608 tcp_ip_notify(tcp_t *tcp)
 609 {
 610         conn_t          *connp = tcp->tcp_connp;
 611         ire_t           *ire;
 612 
 613         /*
 614          * Note: in the case of source routing we want to blow away the
 615          * route to the first source route hop.
 616          */
 617         ire = connp->conn_ixa->ixa_ire;
 618         if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
 619                 if (ire->ire_ipversion == IPV4_VERSION) {
 620                         /*
 621                          * As per RFC 1122, we send an RTM_LOSING to inform
 622                          * routing protocols.
 623                          */
 624                         ip_rts_change(RTM_LOSING, ire->ire_addr,
 625                             ire->ire_gateway_addr, ire->ire_mask,
 626                             connp->conn_laddr_v4,  0, 0, 0,
 627                             (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
 628                             ire->ire_ipst);
 629                 }
 630                 (void) ire_no_good(ire);
 631         }
 632 }
 633 
 634 /*
 635  * tcp_timer is the timer service routine.  It handles the retransmission,
 636  * FIN_WAIT_2 flush, and zero window probe timeout events.  It figures out
 637  * from the state of the tcp instance what kind of action needs to be done
 638  * at the time it is called.
 639  */
 640 void
 641 tcp_timer(void *arg)
 642 {
 643         mblk_t          *mp;
 644         clock_t         first_threshold;
 645         clock_t         second_threshold;
 646         clock_t         ms;
 647         uint32_t        mss;
 648         conn_t          *connp = (conn_t *)arg;
 649         tcp_t           *tcp = connp->conn_tcp;
 650         tcp_stack_t     *tcps = tcp->tcp_tcps;
 651         boolean_t       dont_timeout = B_FALSE;
 652 
 653         tcp->tcp_timer_tid = 0;
 654 
 655         if (tcp->tcp_fused)
 656                 return;
 657 
 658         first_threshold =  tcp->tcp_first_timer_threshold;
 659         second_threshold = tcp->tcp_second_timer_threshold;
 660         switch (tcp->tcp_state) {
 661         case TCPS_IDLE:
 662         case TCPS_BOUND:
 663         case TCPS_LISTEN:
 664                 return;
 665         case TCPS_SYN_RCVD: {
 666                 tcp_t   *listener = tcp->tcp_listener;
 667 
 668                 if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
 669                         /* it's our first timeout */
 670                         tcp->tcp_syn_rcvd_timeout = 1;
 671                         mutex_enter(&listener->tcp_eager_lock);
 672                         listener->tcp_syn_rcvd_timeout++;
 673                         if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
 674                                 /*
 675                                  * Make this eager available for drop if we
 676                                  * need to drop one to accomodate a new
 677                                  * incoming SYN request.
 678                                  */
 679                                 MAKE_DROPPABLE(listener, tcp);
 680                         }
 681                         if (!listener->tcp_syn_defense &&
 682                             (listener->tcp_syn_rcvd_timeout >
 683                             (tcps->tcps_conn_req_max_q0 >> 2)) &&
 684                             (tcps->tcps_conn_req_max_q0 > 200)) {
 685                                 /* We may be under attack. Put on a defense. */
 686                                 listener->tcp_syn_defense = B_TRUE;
 687                                 cmn_err(CE_WARN, "High TCP connect timeout "
 688                                     "rate! System (port %d) may be under a "
 689                                     "SYN flood attack!",
 690                                     ntohs(listener->tcp_connp->conn_lport));
 691 
 692                                 listener->tcp_ip_addr_cache = kmem_zalloc(
 693                                     IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
 694                                     KM_NOSLEEP);
 695                         }
 696                         mutex_exit(&listener->tcp_eager_lock);
 697                 } else if (listener != NULL) {
 698                         mutex_enter(&listener->tcp_eager_lock);
 699                         tcp->tcp_syn_rcvd_timeout++;
 700                         if (tcp->tcp_syn_rcvd_timeout > 1 &&
 701                             !tcp->tcp_closemp_used) {
 702                                 /*
 703                                  * This is our second timeout. Put the tcp in
 704                                  * the list of droppable eagers to allow it to
 705                                  * be dropped, if needed. We don't check
 706                                  * whether tcp_dontdrop is set or not to
 707                                  * protect ourselve from a SYN attack where a
 708                                  * remote host can spoof itself as one of the
 709                                  * good IP source and continue to hold
 710                                  * resources too long.
 711                                  */
 712                                 MAKE_DROPPABLE(listener, tcp);
 713                         }
 714                         mutex_exit(&listener->tcp_eager_lock);
 715                 }
 716         }
 717                 /* FALLTHRU */
 718         case TCPS_SYN_SENT:
 719                 first_threshold =  tcp->tcp_first_ctimer_threshold;
 720                 second_threshold = tcp->tcp_second_ctimer_threshold;
 721 
 722                 /*
 723                  * If an app has set the second_threshold to 0, it means that
 724                  * we need to retransmit forever, unless this is a passive
 725                  * open.  We need to set second_threshold back to a normal
 726                  * value such that later comparison with it still makes
 727                  * sense.  But we set dont_timeout to B_TRUE so that we will
 728                  * never time out.
 729                  */
 730                 if (second_threshold == 0) {
 731                         second_threshold = tcps->tcps_ip_abort_linterval;
 732                         if (tcp->tcp_active_open)
 733                                 dont_timeout = B_TRUE;
 734                 }
 735                 break;
 736         case TCPS_ESTABLISHED:
 737         case TCPS_CLOSE_WAIT:
 738                 /*
 739                  * If the end point has not been closed, TCP can retransmit
 740                  * forever.  But if the end point is closed, the normal
 741                  * timeout applies.
 742                  */
 743                 if (second_threshold == 0) {
 744                         second_threshold = tcps->tcps_ip_abort_linterval;
 745                         dont_timeout = B_TRUE;
 746                 }
 747                 /* FALLTHRU */
 748         case TCPS_FIN_WAIT_1:
 749         case TCPS_CLOSING:
 750         case TCPS_LAST_ACK:
 751                 /* If we have data to rexmit */
 752                 if (tcp->tcp_suna != tcp->tcp_snxt) {
 753                         clock_t time_to_wait;
 754 
 755                         TCPS_BUMP_MIB(tcps, tcpTimRetrans);
 756                         if (!tcp->tcp_xmit_head)
 757                                 break;
 758                         time_to_wait = ddi_get_lbolt() -
 759                             (clock_t)tcp->tcp_xmit_head->b_prev;
 760                         time_to_wait = tcp->tcp_rto -
 761                             TICK_TO_MSEC(time_to_wait);
 762                         /*
 763                          * If the timer fires too early, 1 clock tick earlier,
 764                          * restart the timer.
 765                          */
 766                         if (time_to_wait > msec_per_tick) {
 767                                 TCP_STAT(tcps, tcp_timer_fire_early);
 768                                 TCP_TIMER_RESTART(tcp, time_to_wait);
 769                                 return;
 770                         }
 771                         /*
 772                          * When we probe zero windows, we force the swnd open.
 773                          * If our peer acks with a closed window swnd will be
 774                          * set to zero by tcp_rput(). As long as we are
 775                          * receiving acks tcp_rput will
 776                          * reset 'tcp_ms_we_have_waited' so as not to trip the
 777                          * first and second interval actions.  NOTE: the timer
 778                          * interval is allowed to continue its exponential
 779                          * backoff.
 780                          */
 781                         if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
 782                                 if (connp->conn_debug) {
 783                                         (void) strlog(TCP_MOD_ID, 0, 1,
 784                                             SL_TRACE, "tcp_timer: zero win");
 785                                 }
 786                         } else {
 787                                 /*
 788                                  * After retransmission, we need to do
 789                                  * slow start.  Set the ssthresh to one
 790                                  * half of current effective window and
 791                                  * cwnd to one MSS.  Also reset
 792                                  * tcp_cwnd_cnt.
 793                                  *
 794                                  * Note that if tcp_ssthresh is reduced because
 795                                  * of ECN, do not reduce it again unless it is
 796                                  * already one window of data away (tcp_cwr
 797                                  * should then be cleared) or this is a
 798                                  * timeout for a retransmitted segment.
 799                                  */
 800                                 uint32_t npkt;
 801 
 802                                 if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
 803                                         npkt = ((tcp->tcp_timer_backoff ?
 804                                             tcp->tcp_cwnd_ssthresh :
 805                                             tcp->tcp_snxt -
 806                                             tcp->tcp_suna) >> 1) / tcp->tcp_mss;
 807                                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
 808                                             tcp->tcp_mss;
 809                                 }
 810                                 tcp->tcp_cwnd = tcp->tcp_mss;
 811                                 tcp->tcp_cwnd_cnt = 0;
 812                                 if (tcp->tcp_ecn_ok) {
 813                                         tcp->tcp_cwr = B_TRUE;
 814                                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
 815                                         tcp->tcp_ecn_cwr_sent = B_FALSE;
 816                                 }
 817                         }
 818                         break;
 819                 }
 820                 /*
 821                  * We have something to send yet we cannot send.  The
 822                  * reason can be:
 823                  *
 824                  * 1. Zero send window: we need to do zero window probe.
 825                  * 2. Zero cwnd: because of ECN, we need to "clock out"
 826                  * segments.
 827                  * 3. SWS avoidance: receiver may have shrunk window,
 828                  * reset our knowledge.
 829                  *
 830                  * Note that condition 2 can happen with either 1 or
 831                  * 3.  But 1 and 3 are exclusive.
 832                  */
 833                 if (tcp->tcp_unsent != 0) {
 834                         /*
 835                          * Should not hold the zero-copy messages for too long.
 836                          */
 837                         if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
 838                                 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
 839                                     tcp->tcp_xmit_head, B_TRUE);
 840 
 841                         if (tcp->tcp_cwnd == 0) {
 842                                 /*
 843                                  * Set tcp_cwnd to 1 MSS so that a
 844                                  * new segment can be sent out.  We
 845                                  * are "clocking out" new data when
 846                                  * the network is really congested.
 847                                  */
 848                                 ASSERT(tcp->tcp_ecn_ok);
 849                                 tcp->tcp_cwnd = tcp->tcp_mss;
 850                         }
 851                         if (tcp->tcp_swnd == 0) {
 852                                 /* Extend window for zero window probe */
 853                                 tcp->tcp_swnd++;
 854                                 tcp->tcp_zero_win_probe = B_TRUE;
 855                                 TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
 856                         } else {
 857                                 /*
 858                                  * Handle timeout from sender SWS avoidance.
 859                                  * Reset our knowledge of the max send window
 860                                  * since the receiver might have reduced its
 861                                  * receive buffer.  Avoid setting tcp_max_swnd
 862                                  * to one since that will essentially disable
 863                                  * the SWS checks.
 864                                  *
 865                                  * Note that since we don't have a SWS
 866                                  * state variable, if the timeout is set
 867                                  * for ECN but not for SWS, this
 868                                  * code will also be executed.  This is
 869                                  * fine as tcp_max_swnd is updated
 870                                  * constantly and it will not affect
 871                                  * anything.
 872                                  */
 873                                 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
 874                         }
 875                         tcp_wput_data(tcp, NULL, B_FALSE);
 876                         return;
 877                 }
 878                 /* Is there a FIN that needs to be retransmitted? */
 879                 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
 880                     !tcp->tcp_fin_acked)
 881                         break;
 882                 /* Nothing to do, return without restarting timer. */
 883                 TCP_STAT(tcps, tcp_timer_fire_miss);
 884                 return;
 885         case TCPS_FIN_WAIT_2:
 886                 /*
 887                  * User closed the TCP endpoint and peer ACK'ed our FIN.
 888                  * We waited some time for the peer's FIN, but it hasn't
 889                  * arrived.  We flush the connection now to avoid the
 890                  * case where the peer has rebooted.
 891                  */
 892                 if (TCP_IS_DETACHED(tcp)) {
 893                         (void) tcp_clean_death(tcp, 0);
 894                 } else {
 895                         TCP_TIMER_RESTART(tcp,
 896                             tcp->tcp_fin_wait_2_flush_interval);
 897                 }
 898                 return;
 899         case TCPS_TIME_WAIT:
 900                 (void) tcp_clean_death(tcp, 0);
 901                 return;
 902         default:
 903                 if (connp->conn_debug) {
 904                         (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
 905                             "tcp_timer: strange state (%d) %s",
 906                             tcp->tcp_state, tcp_display(tcp, NULL,
 907                             DISP_PORT_ONLY));
 908                 }
 909                 return;
 910         }
 911 
 912         /*
 913          * If the system is under memory pressure or the max number of
 914          * connections have been established for the listener, be more
 915          * aggressive in aborting connections.
 916          */
 917         if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
 918             tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
 919                 second_threshold = tcp_early_abort * SECONDS;
 920 
 921                 /* We will ignore the never timeout promise in this case... */
 922                 dont_timeout = B_FALSE;
 923         }
 924 
 925         ASSERT(second_threshold != 0);
 926 
 927         if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
 928                 /*
 929                  * Should not hold the zero-copy messages for too long.
 930                  */
 931                 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
 932                         tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
 933                             tcp->tcp_xmit_head, B_TRUE);
 934 
 935                 if (dont_timeout) {
 936                         /*
 937                          * Reset tcp_ms_we_have_waited to avoid overflow since
 938                          * we are going to retransmit forever.
 939                          */
 940                         tcp->tcp_ms_we_have_waited = second_threshold;
 941                         goto timer_rexmit;
 942                 }
 943 
 944                 /*
 945                  * For zero window probe, we need to send indefinitely,
 946                  * unless we have not heard from the other side for some
 947                  * time...
 948                  */
 949                 if ((tcp->tcp_zero_win_probe == 0) ||
 950                     (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
 951                     second_threshold)) {
 952                         TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
 953                         /*
 954                          * If TCP is in SYN_RCVD state, send back a
 955                          * RST|ACK as BSD does.  Note that tcp_zero_win_probe
 956                          * should be zero in TCPS_SYN_RCVD state.
 957                          */
 958                         if (tcp->tcp_state == TCPS_SYN_RCVD) {
 959                                 tcp_xmit_ctl("tcp_timer: RST sent on timeout "
 960                                     "in SYN_RCVD",
 961                                     tcp, tcp->tcp_snxt,
 962                                     tcp->tcp_rnxt, TH_RST | TH_ACK);
 963                         }
 964                         (void) tcp_clean_death(tcp,
 965                             tcp->tcp_client_errno ?
 966                             tcp->tcp_client_errno : ETIMEDOUT);
 967                         return;
 968                 } else {
 969                         /*
 970                          * If the system is under memory pressure, we also
 971                          * abort connection in zero window probing.
 972                          */
 973                         if (tcps->tcps_reclaim) {
 974                                 (void) tcp_clean_death(tcp,
 975                                     tcp->tcp_client_errno ?
 976                                     tcp->tcp_client_errno : ETIMEDOUT);
 977                                 TCP_STAT(tcps, tcp_zwin_mem_drop);
 978                                 return;
 979                         }
 980                         /*
 981                          * Set tcp_ms_we_have_waited to second_threshold
 982                          * so that in next timeout, we will do the above
 983                          * check (ddi_get_lbolt() - tcp_last_recv_time).
 984                          * This is also to avoid overflow.
 985                          *
 986                          * We don't need to decrement tcp_timer_backoff
 987                          * to avoid overflow because it will be decremented
 988                          * later if new timeout value is greater than
 989                          * tcp_rto_max.  In the case when tcp_rto_max is
 990                          * greater than second_threshold, it means that we
 991                          * will wait longer than second_threshold to send
 992                          * the next
 993                          * window probe.
 994                          */
 995                         tcp->tcp_ms_we_have_waited = second_threshold;
 996                 }
 997         } else if (ms > first_threshold) {
 998                 /*
 999                  * Should not hold the zero-copy messages for too long.
1000                  */
1001                 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
1002                         tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
1003                             tcp->tcp_xmit_head, B_TRUE);
1004 
1005                 /*
1006                  * We have been retransmitting for too long...  The RTT
1007                  * we calculated is probably incorrect.  Reinitialize it.
1008                  * Need to compensate for 0 tcp_rtt_sa.  Reset
1009                  * tcp_rtt_update so that we won't accidentally cache a
1010                  * bad value.  But only do this if this is not a zero
1011                  * window probe.
1012                  */
1013                 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
1014                         tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
1015                             (tcp->tcp_rtt_sa >> 5);
1016                         tcp->tcp_rtt_sa = 0;
1017                         tcp_ip_notify(tcp);
1018                         tcp->tcp_rtt_update = 0;
1019                 }
1020         }
1021 
1022 timer_rexmit:
1023         tcp->tcp_timer_backoff++;
1024         if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
1025             tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
1026             tcp->tcp_rto_min) {
1027                 /*
1028                  * This means the original RTO is tcp_rto_min.
1029                  * So we will use tcp_rto_min as the RTO value
1030                  * and do the backoff.
1031                  */
1032                 ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
1033         } else {
1034                 ms <<= tcp->tcp_timer_backoff;
1035         }
1036         if (ms > tcp->tcp_rto_max) {
1037                 ms = tcp->tcp_rto_max;
1038                 /*
1039                  * ms is at max, decrement tcp_timer_backoff to avoid
1040                  * overflow.
1041                  */
1042                 tcp->tcp_timer_backoff--;
1043         }
1044         tcp->tcp_ms_we_have_waited += ms;
1045         if (tcp->tcp_zero_win_probe == 0) {
1046                 tcp->tcp_rto = ms;
1047         }
1048         TCP_TIMER_RESTART(tcp, ms);
1049         /*
1050          * This is after a timeout and tcp_rto is backed off.  Set
1051          * tcp_set_timer to 1 so that next time RTO is updated, we will
1052          * restart the timer with a correct value.
1053          */
1054         tcp->tcp_set_timer = 1;
1055         mss = tcp->tcp_snxt - tcp->tcp_suna;
1056         if (mss > tcp->tcp_mss)
1057                 mss = tcp->tcp_mss;
1058         if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1059                 mss = tcp->tcp_swnd;
1060 
1061         if ((mp = tcp->tcp_xmit_head) != NULL)
1062                 mp->b_prev = (mblk_t *)ddi_get_lbolt();
1063         mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1064             B_TRUE);
1065 
1066         /*
1067          * When slow start after retransmission begins, start with
1068          * this seq no.  tcp_rexmit_max marks the end of special slow
1069          * start phase.  tcp_snd_burst controls how many segments
1070          * can be sent because of an ack.
1071          */
1072         tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1073         tcp->tcp_snd_burst = TCP_CWND_SS;
1074         if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
1075             (tcp->tcp_unsent == 0)) {
1076                 tcp->tcp_rexmit_max = tcp->tcp_fss;
1077         } else {
1078                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
1079         }
1080         tcp->tcp_rexmit = B_TRUE;
1081         tcp->tcp_dupack_cnt = 0;
1082 
1083         /*
1084          * Remove all rexmit SACK blk to start from fresh.
1085          */
1086         if (tcp->tcp_snd_sack_ok)
1087                 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
1088         if (mp == NULL) {
1089                 return;
1090         }
1091 
1092         tcp->tcp_csuna = tcp->tcp_snxt;
1093         TCPS_BUMP_MIB(tcps, tcpRetransSegs);
1094         TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
1095         tcp_send_data(tcp, mp);
1096 
1097 }
1098 
1099 /*
1100  * Handle lingering timeouts. This function is called when the SO_LINGER timeout
1101  * expires.
1102  */
1103 void
1104 tcp_close_linger_timeout(void *arg)
1105 {
1106         conn_t  *connp = (conn_t *)arg;
1107         tcp_t   *tcp = connp->conn_tcp;
1108 
1109         tcp->tcp_client_errno = ETIMEDOUT;
1110         tcp_stop_lingering(tcp);
1111 }