5295 Wdiff usr/src/uts/common/inet/tcp/tcp_output.c

Print this page

5295 remove maxburst logic from TCP's send algorithm Reviewed by: Dan McDonald <danmcd@omniti.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/inet/tcp/tcp_output.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_output.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each

↓ open down ↓

13 lines elided

↑ open up ↑

  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
       24 + * Copyright (c) 2014 by Delphix. All rights reserved.
  24   25   */
  25   26  
  26   27  /* This file contains all TCP output processing functions. */
  27   28  
  28   29  #include <sys/types.h>
  29   30  #include <sys/stream.h>
  30   31  #include <sys/strsun.h>
  31   32  #include <sys/strsubr.h>
  32   33  #include <sys/stropts.h>
  33   34  #include <sys/strlog.h>

  34   35  #define _SUN_TPI_VERSION 2
  35   36  #include <sys/tihdr.h>
  36   37  #include <sys/suntpi.h>
  37   38  #include <sys/xti_inet.h>
  38   39  #include <sys/timod.h>
  39   40  #include <sys/pattr.h>
  40   41  #include <sys/squeue_impl.h>
  41   42  #include <sys/squeue.h>
  42   43  #include <sys/sockio.h>
  43   44  #include <sys/tsol/tnet.h>
  44   45  
  45   46  #include <inet/common.h>
  46   47  #include <inet/ip.h>
  47   48  #include <inet/tcp.h>
  48   49  #include <inet/tcp_impl.h>
  49   50  #include <inet/snmpcom.h>
  50   51  #include <inet/proto_set.h>
  51   52  #include <inet/ipsec_impl.h>
  52   53  #include <inet/ip_ndp.h>
  53   54  
  54   55  static mblk_t   *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
  55   56  static void     tcp_wput_cmdblk(queue_t *, mblk_t *);
  56   57  static void     tcp_wput_flush(tcp_t *, mblk_t *);
  57   58  static void     tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
  58   59  static int      tcp_xmit_end(tcp_t *);
  59   60  static int      tcp_send(tcp_t *, const int, const int, const int,
  60   61                      const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
  61   62  static void     tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
  62   63                      int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
  63   64  static boolean_t        tcp_send_rst_chk(tcp_stack_t *);
  64   65  static void     tcp_process_shrunk_swnd(tcp_t *, uint32_t);
  65   66  static void     tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
  66   67  
  67   68  /*
  68   69   * Functions called directly via squeue having a prototype of edesc_t.
  69   70   */
  70   71  static void     tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
  71   72  static void     tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
  72   73  static void     tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
  73   74  
  74   75  /*
  75   76   * A write shorter than this many bytes is copied into the mblk on
  76   77   * the tail of the transmit queue when that mblk has room.  Not much
  77   78   * speedup is observed for values larger than sixteen.  Zero will
  78   79   * disable the optimisation.
  79   80   */
  80   81  static int tcp_tx_pull_len = 16;
  81   82  
  82   83  void
  83   84  tcp_wput(queue_t *q, mblk_t *mp)
  84   85  {
  85   86          conn_t  *connp = Q_TO_CONN(q);
  86   87          tcp_t   *tcp;
  87   88          void (*output_proc)();
  88   89          t_scalar_t type;
  89   90          uchar_t *rptr;
  90   91          struct iocblk   *iocp;
  91   92          size_t size;
  92   93  
  93   94          ASSERT(connp->conn_ref >= 2);
  94   95  
                   /*
                    * Dispatch a write-side message by type: M_DATA is queued to
                    * tcp_output() on the squeue, a few message types are handled
                    * inline, and the rest go to a per-type handler on the squeue.
                    */
  95   96          switch (DB_TYPE(mp)) {
  96   97          case M_DATA:
  97   98                  tcp = connp->conn_tcp;
  98   99                  ASSERT(tcp != NULL);
  99  100  
 100  101                  size = msgdsize(mp);
 101  102  
 102  103                  mutex_enter(&tcp->tcp_non_sq_lock);
 103  104                  tcp->tcp_squeue_bytes += size;
 104  105                  if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
 105  106                          tcp_setqfull(tcp);
 106  107                  }
 107  108                  mutex_exit(&tcp->tcp_non_sq_lock);
 108  109  
 109  110                  CONN_INC_REF(connp);
 110  111                  SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
 111  112                      NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 112  113                  return;
 113  114  
 114  115          case M_CMD:
 115  116                  tcp_wput_cmdblk(q, mp);
 116  117                  return;
 117  118  
 118  119          case M_PROTO:
 119  120          case M_PCPROTO:
 120  121                  /*
 121  122                   * If it is an SNMP message, don't get behind the squeue.
 122  123                   */
 123  124                  tcp = connp->conn_tcp;
 124  125                  rptr = mp->b_rptr;
 125  126                  if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
 126  127                          type = ((union T_primitives *)rptr)->type;
 127  128                  } else {
 128  129                          if (connp->conn_debug) {
 129  130                                  (void) strlog(TCP_MOD_ID, 0, 1,
 130  131                                      SL_ERROR|SL_TRACE,
 131  132                                      "tcp_wput_proto, dropping one...");
 132  133                          }
 133  134                          freemsg(mp);
 134  135                          return;
 135  136                  }
 136  137                  if (type == T_SVR4_OPTMGMT_REQ) {
 137  138                          /*
 138  139                           * All Solaris components should pass a db_credp
 139  140                           * for this TPI message, hence we ASSERT.
 140  141                           * But in case there is some other M_PROTO that looks
 141  142                           * like a TPI message sent by some other kernel
 142  143                           * component, we check and return an error.
 143  144                           */
 144  145                          cred_t  *cr = msg_getcred(mp, NULL);
 145  146  
 146  147                          ASSERT(cr != NULL);
 147  148                          if (cr == NULL) {
 148  149                                  tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
 149  150                                  return;
 150  151                          }
 151  152                          if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get,
 152  153                              cr)) {
 153  154                                  /*
 154  155                                   * This was an SNMP request
 155  156                                   */
 156  157                                  return;
 157  158                          } else {
 158  159                                  output_proc = tcp_wput_proto;
 159  160                          }
 160  161                  } else {
 161  162                          output_proc = tcp_wput_proto;
 162  163                  }
 163  164                  break;
 164  165          case M_IOCTL:
 165  166                  /*
 166  167                   * Most ioctls can be processed right away without going via
 167  168                   * squeues - process them right here. Those that do require
 168  169                   * squeue (currently _SIOCSOCKFALLBACK)
 169  170                   * are processed by tcp_wput_ioctl().
 170  171                   */
 171  172                  iocp = (struct iocblk *)mp->b_rptr;
 172  173                  tcp = connp->conn_tcp;
 173  174  
 174  175                  switch (iocp->ioc_cmd) {
 175  176                  case TCP_IOC_ABORT_CONN:
 176  177                          tcp_ioctl_abort_conn(q, mp);
 177  178                          return;
 178  179                  case TI_GETPEERNAME:
 179  180                  case TI_GETMYNAME:
 180  181                          mi_copyin(q, mp, NULL,
 181  182                              SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
 182  183                          return;
 183  184  
 184  185                  default:
 185  186                          output_proc = tcp_wput_ioctl;
 186  187                          break;
 187  188                  }
 188  189                  break;
 189  190          default:
 190  191                  output_proc = tcp_wput_nondata;
 191  192                  break;
 192  193          }
 193  194  
 194  195          CONN_INC_REF(connp);
 195  196          SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp,
 196  197              NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
 197  198  }
 198  199  
 199  200  /*
 200  201   * The TCP normal data output path.
 201  202   * NOTE: the logic of the fast path is duplicated from this function.
 202  203   */
 203  204  void
 204  205  tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
 205  206  {
 206  207          int             len;
 207  208          mblk_t          *local_time;
 208  209          mblk_t          *mp1;
 209  210          uint32_t        snxt;
 210  211          int             tail_unsent;
 211  212          int             tcpstate;
 212  213          int             usable = 0;
 213  214          mblk_t          *xmit_tail;
 214  215          int32_t         mss;
 215  216          int32_t         num_sack_blk = 0;
 216  217          int32_t         total_hdr_len;
 217  218          int32_t         tcp_hdr_len;
 218  219          int             rc;
 219  220          tcp_stack_t     *tcps = tcp->tcp_tcps;
 220  221          conn_t          *connp = tcp->tcp_connp;
 221  222          clock_t         now = LBOLT_FASTPATH;
 222  223  
 223  224          tcpstate = tcp->tcp_state;
 224  225          if (mp == NULL) {
 225  226                  /*
 226  227                   * tcp_wput_data() with NULL mp should only be called when
 227  228                   * there is unsent data.
 228  229                   */
 229  230                  ASSERT(tcp->tcp_unsent > 0);
 230  231                  /* Really tacky... but we need this for detached closes. */
 231  232                  len = tcp->tcp_unsent;
 232  233                  goto data_null;
 233  234          }
 234  235  
 235  236          ASSERT(mp->b_datap->db_type == M_DATA);
 236  237          /*
 237  238           * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
 238  239           * or before a connection attempt has begun.
 239  240           */
 240  241          if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT ||
 241  242              (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
 242  243                  if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
 243  244  #ifdef DEBUG
 244  245                          cmn_err(CE_WARN,
 245  246                              "tcp_wput_data: data after ordrel, %s",
 246  247                              tcp_display(tcp, NULL,
 247  248                              DISP_ADDR_AND_PORT));
 248  249  #else
 249  250                          if (connp->conn_debug) {
 250  251                                  (void) strlog(TCP_MOD_ID, 0, 1,
 251  252                                      SL_TRACE|SL_ERROR,
 252  253                                      "tcp_wput_data: data after ordrel, %s\n",
 253  254                                      tcp_display(tcp, NULL,
 254  255                                      DISP_ADDR_AND_PORT));
 255  256                          }
 256  257  #endif /* DEBUG */
 257  258                  }
 258  259                  if (tcp->tcp_snd_zcopy_aware &&
 259  260                      (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
 260  261                          tcp_zcopy_notify(tcp);
 261  262                  freemsg(mp);
 262  263                  mutex_enter(&tcp->tcp_non_sq_lock);
 263  264                  if (tcp->tcp_flow_stopped &&
 264  265                      TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
 265  266                          tcp_clrqfull(tcp);
 266  267                  }
 267  268                  mutex_exit(&tcp->tcp_non_sq_lock);
 268  269                  return;
 269  270          }
 270  271  
 271  272          /* Strip empties */
 272  273          for (;;) {
 273  274                  ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
 274  275                      (uintptr_t)INT_MAX);
 275  276                  len = (int)(mp->b_wptr - mp->b_rptr);
 276  277                  if (len > 0)
 277  278                          break;
 278  279                  mp1 = mp;
 279  280                  mp = mp->b_cont;
 280  281                  freeb(mp1);
 281  282                  if (mp == NULL) {
 282  283                          return;
 283  284                  }
 284  285          }
 285  286  
 286  287          /* If we are the first on the list ... */
 287  288          if (tcp->tcp_xmit_head == NULL) {
 288  289                  tcp->tcp_xmit_head = mp;
 289  290                  tcp->tcp_xmit_tail = mp;
 290  291                  tcp->tcp_xmit_tail_unsent = len;
 291  292          } else {
 292  293                  /* If tiny tx and room in txq tail, pullup to save mblks. */
 293  294                  struct datab *dp;
 294  295  
 295  296                  mp1 = tcp->tcp_xmit_last;
 296  297                  if (len < tcp_tx_pull_len &&
 297  298                      (dp = mp1->b_datap)->db_ref == 1 &&
 298  299                      dp->db_lim - mp1->b_wptr >= len) {
 299  300                          ASSERT(len > 0);
 300  301                          ASSERT(!mp1->b_cont);
 301  302                          if (len == 1) {
 302  303                                  *mp1->b_wptr++ = *mp->b_rptr;
 303  304                          } else {
 304  305                                  bcopy(mp->b_rptr, mp1->b_wptr, len);
 305  306                                  mp1->b_wptr += len;
 306  307                          }
 307  308                          if (mp1 == tcp->tcp_xmit_tail)
 308  309                                  tcp->tcp_xmit_tail_unsent += len;
 309  310                          mp1->b_cont = mp->b_cont;
 310  311                          if (tcp->tcp_snd_zcopy_aware &&
 311  312                              (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
 312  313                                  mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
 313  314                          freeb(mp);
 314  315                          mp = mp1;
 315  316                  } else {
 316  317                          tcp->tcp_xmit_last->b_cont = mp;
 317  318                  }
 318  319                  len += tcp->tcp_unsent;
 319  320          }
 320  321  
 321  322          /* Tack on however many more positive length mblks we have */
 322  323          if ((mp1 = mp->b_cont) != NULL) {
 323  324                  do {
 324  325                          int tlen;
 325  326                          ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
 326  327                              (uintptr_t)INT_MAX);
 327  328                          tlen = (int)(mp1->b_wptr - mp1->b_rptr);
 328  329                          if (tlen <= 0) {
 329  330                                  mp->b_cont = mp1->b_cont;
 330  331                                  freeb(mp1);
 331  332                          } else {
 332  333                                  len += tlen;
 333  334                                  mp = mp1;
 334  335                          }
 335  336                  } while ((mp1 = mp->b_cont) != NULL);
 336  337          }
 337  338          tcp->tcp_xmit_last = mp;
 338  339          tcp->tcp_unsent = len;
 339  340  
 340  341          if (urgent)
 341  342                  usable = 1;
 342  343  
 343  344  data_null:
 344  345          snxt = tcp->tcp_snxt;
 345  346          xmit_tail = tcp->tcp_xmit_tail;
 346  347          tail_unsent = tcp->tcp_xmit_tail_unsent;
 347  348  
 348  349          /*
 349  350           * Note that tcp_mss has been adjusted to take into account the
 350  351           * timestamp option if applicable.  Because SACK options do not
 351  352           * appear in every TCP segment and they are of variable length,
 352  353           * they cannot be included in tcp_mss.  Thus we need to calculate
 353  354           * the actual segment length when we need to send a segment which
 354  355           * includes SACK options.
 355  356           */
 356  357          if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
 357  358                  int32_t opt_len;
 358  359  
 359  360                  num_sack_blk = MIN(tcp->tcp_max_sack_blk,
 360  361                      tcp->tcp_num_sack_blk);
 361  362                  opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
 362  363                      2 + TCPOPT_HEADER_LEN;
 363  364                  mss = tcp->tcp_mss - opt_len;
 364  365                  total_hdr_len = connp->conn_ht_iphc_len + opt_len;
 365  366                  tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
 366  367          } else {
 367  368                  mss = tcp->tcp_mss;
 368  369                  total_hdr_len = connp->conn_ht_iphc_len;
 369  370                  tcp_hdr_len = connp->conn_ht_ulp_len;
 370  371          }
 371  372  
 372  373          if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
 373  374              (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
 374  375                  TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
 375  376          }
 376  377          if (tcpstate == TCPS_SYN_RCVD) {
 377  378                  /*
 378  379                   * The three-way connection establishment handshake is not
 379  380                   * complete yet. We want to queue the data for transmission
 380  381                   * after entering ESTABLISHED state (RFC793). A jump to
 381  382                   * "done" label effectively leaves data on the queue.
 382  383                   */
 383  384                  goto done;
 384  385          } else {
 385  386                  int usable_r;
 386  387  
 387  388                  /*
 388  389                   * In the special case when cwnd is zero, which can only
 389  390                   * happen if the connection is ECN capable, return now.
 390  391                   * New segments are sent using tcp_timer().  The timer
 391  392                   * is set in tcp_input_data().
 392  393                   */
 393  394                  if (tcp->tcp_cwnd == 0) {
 394  395                          /*
 395  396                           * Note that tcp_cwnd is 0 before 3-way handshake is
 396  397                           * finished.
 397  398                           */
 398  399                          ASSERT(tcp->tcp_ecn_ok ||
 399  400                              tcp->tcp_state < TCPS_ESTABLISHED);
 400  401                          return;
 401  402                  }
 402  403  
 403  404                  /* NOTE: trouble if xmitting while SYN not acked? */
 404  405                  usable_r = snxt - tcp->tcp_suna;
 405  406                  usable_r = tcp->tcp_swnd - usable_r;
 406  407  
 407  408                  /*
 408  409                   * Check if the receiver has shrunk the window.  If
 409  410                   * tcp_wput_data() with NULL mp is called, tcp_fin_sent
 410  411                   * cannot be set as there is unsent data, so FIN cannot
 411  412                   * be sent out.  Otherwise, we need to take FIN into
 412  413                   * account as it consumes an "invisible" sequence number.
 413  414                   */
 414  415                  ASSERT(tcp->tcp_fin_sent == 0);
 415  416                  if (usable_r < 0) {
 416  417                          /*
 417  418                           * The receiver has shrunk the window and we have sent
 418  419                           * -usable_r bytes of data beyond the window, re-adjust.
 419  420                           *
 420  421                           * If TCP window scaling is enabled, there can be
 421  422                           * round down error as the advertised receive window
 422  423                           * is actually right shifted n bits.  This means that
 423  424                           * the lower n bits info is wiped out.  It will look
 424  425                           * like the window is shrunk.  Do a check here to
 425  426                           * see if the shrunk amount is actually within the
 426  427                           * error in window calculation.  If it is, just
 427  428                           * return.  Note that this check is inside the
 428  429                           * shrunk window check.  This makes sure that even
 429  430                           * though tcp_process_shrunk_swnd() is not called,
 430  431                           * we will stop further processing.
 431  432                           */
 432  433                          if ((-usable_r >> tcp->tcp_snd_ws) > 0) {
 433  434                                  tcp_process_shrunk_swnd(tcp, -usable_r);
 434  435                          }
 435  436                          return;
 436  437                  }
 437  438  
 438  439                  /* usable = MIN(swnd, cwnd) - unacked_bytes */
 439  440                  if (tcp->tcp_swnd > tcp->tcp_cwnd)
 440  441                          usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
 441  442  
 442  443                  /* usable = MIN(usable, unsent) */
 443  444                  if (usable_r > len)
 444  445                          usable_r = len;
 445  446  
 446  447                  /* usable = MAX(usable, {1 for urgent, 0 for data}) */
 447  448                  if (usable_r > 0) {
 448  449                          usable = usable_r;
 449  450                  } else {
 450  451                          /* Bypass all other unnecessary processing. */
 451  452                          goto done;
 452  453                  }
 453  454          }
 454  455  
 455  456          local_time = (mblk_t *)now;
 456  457  
 457  458          /*
 458  459           * "Our" Nagle Algorithm.  This is not the same as in the old
 459  460           * BSD.  This is more in line with the true intent of Nagle.
 460  461           *
 461  462           * The conditions are:
 462  463           * 1. The amount of unsent data (or amount of data which can be
 463  464           *    sent, whichever is smaller) is less than Nagle limit.
 464  465           * 2. The last sent size is also less than Nagle limit.
 465  466           * 3. There is unack'ed data.
 466  467           * 4. Urgent pointer is not set.  Send urgent data ignoring the
 467  468           *    Nagle algorithm.  This reduces the probability that urgent
 468  469           *    bytes get "merged" together.
 469  470           * 5. The app has not closed the connection.  This eliminates the
 470  471           *    wait time of the receiving side waiting for the last piece of
 471  472           *    (small) data.
 472  473           *
 473  474           * If all are satisfied, exit without sending anything.  Note
 474  475           * that Nagle limit can be smaller than 1 MSS.  Nagle limit is
 475  476           * the smaller of 1 MSS and global tcp_naglim_def (defaults to
 476  477           * 4095).
 477  478           */
 478  479          if (usable < (int)tcp->tcp_naglim &&
 479  480              tcp->tcp_naglim > tcp->tcp_last_sent_len &&
 480  481              snxt != tcp->tcp_suna &&
 481  482              !(tcp->tcp_valid_bits & TCP_URG_VALID) &&
 482  483              !(tcp->tcp_valid_bits & TCP_FSS_VALID)) {
 483  484                  goto done;
 484  485          }
 485  486  
 486  487          /*
 487  488           * If tcp_zero_win_probe is not set and the tcp->tcp_cork option
 488  489           * is set, then we have to force TCP not to send a partial segment
 489  490           * (smaller than MSS bytes). We are calculating the usable now
 490  491           * based on full MSS and will save the rest of the remaining data for
 491  492           * later. When tcp_zero_win_probe is set, TCP needs to send out
 492  493           * something to do a zero window probe.
 493  494           */
 494  495          if (tcp->tcp_cork && !tcp->tcp_zero_win_probe) {
 495  496                  if (usable < mss)
 496  497                          goto done;
 497  498                  usable = (usable / mss) * mss;
 498  499          }
 499  500  
 500  501          /* Update the latest receive window size in TCP header. */
 501  502          tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
 502  503  
 503  504          /* Send the packet. */
 504  505          rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len,
 505  506              num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
 506  507              local_time);
 507  508  
 508  509          /* Pretend that all we were trying to send really got sent */
 509  510          if (rc < 0 && tail_unsent < 0) {
 510  511                  do {
 511  512                          xmit_tail = xmit_tail->b_cont;
 512  513                          xmit_tail->b_prev = local_time;
 513  514                          ASSERT((uintptr_t)(xmit_tail->b_wptr -
 514  515                              xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
 515  516                          tail_unsent += (int)(xmit_tail->b_wptr -
 516  517                              xmit_tail->b_rptr);
 517  518                  } while (tail_unsent < 0);
 518  519          }
 519  520  done:;
 520  521          tcp->tcp_xmit_tail = xmit_tail;
 521  522          tcp->tcp_xmit_tail_unsent = tail_unsent;
 522  523          len = tcp->tcp_snxt - snxt;
 523  524          if (len) {
 524  525                  /*
 525  526                   * If new data was sent, need to update the notsack
 526  527                   * list, which is, after all, data blocks that have
 527  528                   * not been sack'ed by the receiver.  New data is
 528  529                   * not sack'ed.
 529  530                   */
 530  531                  if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
 531  532                          /* len is a negative value. */
 532  533                          tcp->tcp_pipe -= len;
 533  534                          tcp_notsack_update(&(tcp->tcp_notsack_list),
 534  535                              tcp->tcp_snxt, snxt,
 535  536                              &(tcp->tcp_num_notsack_blk),
 536  537                              &(tcp->tcp_cnt_notsack_list));
 537  538                  }
 538  539                  tcp->tcp_snxt = snxt + tcp->tcp_fin_sent;
 539  540                  tcp->tcp_rack = tcp->tcp_rnxt;
 540  541                  tcp->tcp_rack_cnt = 0;
 541  542                  if ((snxt + len) == tcp->tcp_suna) {
 542  543                          TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
 543  544                  }
 544  545          } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) {
 545  546                  /*
 546  547                   * Didn't send anything. Make sure the timer is running
 547  548                   * so that we will probe a zero window.
 548  549                   */
 549  550                  TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
 550  551          }
 551  552          /* Note that len is the amount we just sent but with a negative sign */
 552  553          tcp->tcp_unsent += len;
 553  554          mutex_enter(&tcp->tcp_non_sq_lock);
 554  555          if (tcp->tcp_flow_stopped) {
 555  556                  if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
 556  557                          tcp_clrqfull(tcp);
 557  558                  }
 558  559          } else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) {
 559  560                  if (!(tcp->tcp_detached))
 560  561                          tcp_setqfull(tcp);
 561  562          }
 562  563          mutex_exit(&tcp->tcp_non_sq_lock);
 563  564  }
 564  565  
 565  566  /*
 566  567   * Initial STREAMS write side put() procedure for sockets. It handles the
 567  568   * T_CAPABILITY_REQ which sockfs sends down while setting up the socket
 568  569   * directly, without using the squeue. Non T_CAPABILITY_REQ messages
 569  570   * are handled by tcp_wput() as usual.
 570  571   *
 571  572   * The first call switches q_qinfo to tcp_winit; all further messages are
 572  573   * handled by tcp_wput() because the above short cut may not be safe later.
 573  574   */
 574  575  void
 575  576  tcp_wput_sock(queue_t *wq, mblk_t *mp)
 576  577  {
 577  578          conn_t                  *connp = Q_TO_CONN(wq);
 578  579          tcp_t                   *tcp = connp->conn_tcp;
 579  580          struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr;
 580  581  
 581  582          ASSERT(wq->q_qinfo == &tcp_sock_winit);
 582  583          wq->q_qinfo = &tcp_winit;
 583  584  
 584  585          ASSERT(IPCL_IS_TCP(connp));
 585  586          ASSERT(TCP_IS_SOCKET(tcp));
 586  587  
 587  588          if (DB_TYPE(mp) == M_PCPROTO &&
 588  589              MBLKL(mp) == sizeof (struct T_capability_req) &&
 589  590              car->PRIM_type == T_CAPABILITY_REQ) {
 590  591                  tcp_capability_req(tcp, mp);
 591  592                  return;
 592  593          }
 593  594  
 594  595          tcp_wput(wq, mp);
 595  596  }
 596  597  
 597  598  /* ARGSUSED */
 598  599  void
 599  600  tcp_wput_fallback(queue_t *wq, mblk_t *mp)
 600  601  {
 601  602  #ifdef DEBUG
 602  603          cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n");
 603  604  #endif
                   /* The message is dropped; DEBUG builds also log its arrival. */
 604  605          freemsg(mp);
 605  606  }
 606  607  
 607  608  /*
 608  609   * Called by tcp_wput() to handle misc non M_DATA messages.
 609  610   */
 610  611  /* ARGSUSED */
 611  612  static void
 612  613  tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 613  614  {
 614  615          conn_t  *connp = (conn_t *)arg;
 615  616          tcp_t   *tcp = connp->conn_tcp;
 616  617  
 617  618          ASSERT(DB_TYPE(mp) != M_IOCTL);
 618  619          /*
 619  620           * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close.
 620  621           * Once the close starts, streamhead and sockfs will not let any data
 621  622           * packets come down (close ensures that there are no threads using the
 622  623           * queue and no new threads will come down) but since qprocsoff()
 623  624           * hasn't happened yet, a M_FLUSH or some non data message might
 624  625           * get reflected back (in response to our own FLUSHRW) and get
 625  626           * processed after tcp_close() is done. The conn would still be valid
 626  627           * because a ref would have been added but we need to check the state
 627  628           * before actually processing the packet.
 628  629           */
 629  630          if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) {
 630  631                  freemsg(mp);
 631  632                  return;
 632  633          }
 633  634  
 634  635          switch (DB_TYPE(mp)) {
 635  636          case M_IOCDATA:
 636  637                  tcp_wput_iocdata(tcp, mp);
 637  638                  break;
 638  639          case M_FLUSH:
 639  640                  tcp_wput_flush(tcp, mp);
 640  641                  break;
 641  642          default:
 642  643                  ip_wput_nondata(connp->conn_wq, mp);
 643  644                  break;
 644  645          }
 645  646  }
 646  647  
 647  648  /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */
 648  649  static void
 649  650  tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
 650  651  {
 651  652          uchar_t fval = *mp->b_rptr;
 652  653          mblk_t  *tail;
 653  654          conn_t  *connp = tcp->tcp_connp;
 654  655          queue_t *q = connp->conn_wq;
 655  656  
 656  657          /* TODO: How should flush interact with urgent data? */
 657  658          if ((fval & FLUSHW) && tcp->tcp_xmit_head != NULL &&
 658  659              !(tcp->tcp_valid_bits & TCP_URG_VALID)) {
 659  660                  /*
 660  661                   * Flush only data that has not yet been put on the wire.  If
 661  662                   * we flush data that we have already transmitted, life, as we
 662  663                   * know it, may come to an end.
 663  664                   */
 664  665                  tail = tcp->tcp_xmit_tail;
 665  666                  tail->b_wptr -= tcp->tcp_xmit_tail_unsent;
 666  667                  tcp->tcp_xmit_tail_unsent = 0;
 667  668                  tcp->tcp_unsent = 0;
 668  669                  if (tail->b_wptr != tail->b_rptr)
 669  670                          tail = tail->b_cont;
 670  671                  if (tail) {
 671  672                          mblk_t **excess = &tcp->tcp_xmit_head;
 672  673                          for (;;) {
 673  674                                  mblk_t *mp1 = *excess;
 674  675                                  if (mp1 == tail)
 675  676                                          break;
 676  677                                  tcp->tcp_xmit_tail = mp1;
 677  678                                  tcp->tcp_xmit_last = mp1;
 678  679                                  excess = &mp1->b_cont;
 679  680                          }
 680  681                          *excess = NULL;
 681  682                          tcp_close_mpp(&tail);
 682  683                          if (tcp->tcp_snd_zcopy_aware)
 683  684                                  tcp_zcopy_notify(tcp);
 684  685                  }
 685  686                  /*
 686  687                   * We have no unsent data, so unsent must be less than
 687  688                   * conn_sndlowat, so re-enable flow.
 688  689                   */
 689  690                  mutex_enter(&tcp->tcp_non_sq_lock);
 690  691                  if (tcp->tcp_flow_stopped) {
 691  692                          tcp_clrqfull(tcp);
 692  693                  }
 693  694                  mutex_exit(&tcp->tcp_non_sq_lock);
 694  695          }
 695  696          /*
 696  697           * TODO: you can't just flush these, you have to increase rwnd for one
 697  698           * thing.  For another, how should urgent data interact?
 698  699           */
 699  700          if (fval & FLUSHR) {
 700  701                  *mp->b_rptr = fval & ~FLUSHW;
 701  702                  /* XXX */
 702  703                  qreply(q, mp);
 703  704                  return;
 704  705          }
 705  706          freemsg(mp);
 706  707  }
 707  708  
 708  709  /*
 709  710   * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA
 710  711   * messages.
 711  712   */
 712  713  static void
 713  714  tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
 714  715  {
 715  716          mblk_t          *mp1;
 716  717          struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
 717  718          STRUCT_HANDLE(strbuf, sb);
 718  719          uint_t          addrlen;
 719  720          conn_t          *connp = tcp->tcp_connp;
 720  721          queue_t         *q = connp->conn_wq;
 721  722  
 722  723          /* Make sure it is one of ours. */
 723  724          switch (iocp->ioc_cmd) {
 724  725          case TI_GETMYNAME:
 725  726          case TI_GETPEERNAME:
 726  727                  break;
 727  728          default:
 728  729                  /*
 729  730                   * If the conn is closing, then error the ioctl here. Otherwise
 730  731                   * use the CONN_IOCTLREF_* macros to hold off tcp_close until
 731  732                   * we're done here.
 732  733                   */
 733  734                  mutex_enter(&connp->conn_lock);
 734  735                  if (connp->conn_state_flags & CONN_CLOSING) {
 735  736                          mutex_exit(&connp->conn_lock);
 736  737                          iocp->ioc_error = EINVAL;
 737  738                          mp->b_datap->db_type = M_IOCNAK;
 738  739                          iocp->ioc_count = 0;
 739  740                          qreply(q, mp);
 740  741                          return;
 741  742                  }
 742  743  
 743  744                  CONN_INC_IOCTLREF_LOCKED(connp);
 744  745                  ip_wput_nondata(q, mp);
 745  746                  CONN_DEC_IOCTLREF(connp);
 746  747                  return;
 747  748          }
 748  749          switch (mi_copy_state(q, mp, &mp1)) {
 749  750          case -1:
 750  751                  return;
 751  752          case MI_COPY_CASE(MI_COPY_IN, 1):
 752  753                  break;
 753  754          case MI_COPY_CASE(MI_COPY_OUT, 1):
 754  755                  /* Copy out the strbuf. */
 755  756                  mi_copyout(q, mp);
 756  757                  return;
 757  758          case MI_COPY_CASE(MI_COPY_OUT, 2):
 758  759                  /* All done. */
 759  760                  mi_copy_done(q, mp, 0);
 760  761                  return;
 761  762          default:
 762  763                  mi_copy_done(q, mp, EPROTO);
 763  764                  return;
 764  765          }
 765  766          /* Check alignment of the strbuf */
 766  767          if (!OK_32PTR(mp1->b_rptr)) {
 767  768                  mi_copy_done(q, mp, EINVAL);
 768  769                  return;
 769  770          }
 770  771  
 771  772          STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
 772  773  
 773  774          if (connp->conn_family == AF_INET)
 774  775                  addrlen = sizeof (sin_t);
 775  776          else
 776  777                  addrlen = sizeof (sin6_t);
 777  778  
 778  779          if (STRUCT_FGET(sb, maxlen) < addrlen) {
 779  780                  mi_copy_done(q, mp, EINVAL);
 780  781                  return;
 781  782          }
 782  783  
 783  784          switch (iocp->ioc_cmd) {
 784  785          case TI_GETMYNAME:
 785  786                  break;
 786  787          case TI_GETPEERNAME:
 787  788                  if (tcp->tcp_state < TCPS_SYN_RCVD) {
 788  789                          mi_copy_done(q, mp, ENOTCONN);
 789  790                          return;
 790  791                  }
 791  792                  break;
 792  793          }
 793  794          mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
 794  795          if (!mp1)
 795  796                  return;
 796  797  
 797  798          STRUCT_FSET(sb, len, addrlen);
 798  799          switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
 799  800          case TI_GETMYNAME:
 800  801                  (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
 801  802                      &addrlen);
 802  803                  break;
 803  804          case TI_GETPEERNAME:
 804  805                  (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
 805  806                      &addrlen);
 806  807                  break;
 807  808          }
 808  809          mp1->b_wptr += addrlen;
 809  810          /* Copy out the address */
 810  811          mi_copyout(q, mp);
 811  812  }
 812  813  
 813  814  /*
 814  815   * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL
 815  816   * messages.
 816  817   */
 817  818  /* ARGSUSED */
 818  819  static void
 819  820  tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 820  821  {
 821  822          conn_t          *connp = (conn_t *)arg;
 822  823          tcp_t           *tcp = connp->conn_tcp;
 823  824          queue_t         *q = connp->conn_wq;
 824  825          struct iocblk   *iocp;
 825  826  
 826  827          ASSERT(DB_TYPE(mp) == M_IOCTL);
 827  828          /*
 828  829           * Try and ASSERT the minimum possible references on the
 829  830           * conn early enough. Since we are executing on write side,
 830  831           * the connection is obviously not detached and that means
 831  832           * there is a ref each for TCP and IP. Since we are behind
 832  833           * the squeue, the minimum references needed are 3. If the
 833  834           * conn is in classifier hash list, there should be an
 834  835           * extra ref for that (we check both the possibilities).
 835  836           */
 836  837          ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
 837  838              (connp->conn_fanout == NULL && connp->conn_ref >= 3));
 838  839  
 839  840          iocp = (struct iocblk *)mp->b_rptr;
 840  841          switch (iocp->ioc_cmd) {
 841  842          case _SIOCSOCKFALLBACK:
 842  843                  /*
 843  844                   * Either sockmod is about to be popped and the socket
 844  845                   * would now be treated as a plain stream, or a module
 845  846                   * is about to be pushed so we could no longer use read-
 846  847                   * side synchronous streams for fused loopback tcp.
 847  848                   * Drain any queued data and disable direct sockfs
 848  849                   * interface from now on.
 849  850                   */
 850  851                  if (!tcp->tcp_issocket) {
 851  852                          DB_TYPE(mp) = M_IOCNAK;
 852  853                          iocp->ioc_error = EINVAL;
 853  854                  } else {
 854  855                          tcp_use_pure_tpi(tcp);
 855  856                          DB_TYPE(mp) = M_IOCACK;
 856  857                          iocp->ioc_error = 0;
 857  858                  }
 858  859                  iocp->ioc_count = 0;
 859  860                  iocp->ioc_rval = 0;
 860  861                  qreply(q, mp);
 861  862                  return;
 862  863          }
 863  864  
 864  865          /*
 865  866           * If the conn is closing, then error the ioctl here. Otherwise bump the
 866  867           * conn_ioctlref to hold off tcp_close until we're done here.
 867  868           */
 868  869          mutex_enter(&(connp)->conn_lock);
 869  870          if ((connp)->conn_state_flags & CONN_CLOSING) {
 870  871                  mutex_exit(&(connp)->conn_lock);
 871  872                  iocp->ioc_error = EINVAL;
 872  873                  mp->b_datap->db_type = M_IOCNAK;
 873  874                  iocp->ioc_count = 0;
 874  875                  qreply(q, mp);
 875  876                  return;
 876  877          }
 877  878  
 878  879          CONN_INC_IOCTLREF_LOCKED(connp);
 879  880          ip_wput_nondata(q, mp);
 880  881          CONN_DEC_IOCTLREF(connp);
 881  882  }
 882  883  
 883  884  /*
 884  885   * This routine is called by tcp_wput() to handle all TPI requests.
 885  886   */
 886  887  /* ARGSUSED */
 887  888  static void
 888  889  tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 889  890  {
 890  891          conn_t          *connp = (conn_t *)arg;
 891  892          tcp_t           *tcp = connp->conn_tcp;
 892  893          union T_primitives *tprim = (union T_primitives *)mp->b_rptr;
 893  894          uchar_t         *rptr;
 894  895          t_scalar_t      type;
 895  896          cred_t          *cr;
 896  897  
 897  898          /*
 898  899           * Try and ASSERT the minimum possible references on the
 899  900           * conn early enough. Since we are executing on write side,
 900  901           * the connection is obviously not detached and that means
 901  902           * there is a ref each for TCP and IP. Since we are behind
 902  903           * the squeue, the minimum references needed are 3. If the
 903  904           * conn is in classifier hash list, there should be an
 904  905           * extra ref for that (we check both the possibilities).
 905  906           */
 906  907          ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
 907  908              (connp->conn_fanout == NULL && connp->conn_ref >= 3));
 908  909  
 909  910          rptr = mp->b_rptr;
 910  911          ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
 911  912          if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
 912  913                  type = ((union T_primitives *)rptr)->type;
 913  914                  if (type == T_EXDATA_REQ) {
 914  915                          tcp_output_urgent(connp, mp, arg2, NULL);
 915  916                  } else if (type != T_DATA_REQ) {
 916  917                          goto non_urgent_data;
 917  918                  } else {
 918  919                          /* TODO: options, flags, ... from user */
 919  920                          /* Set length to zero for reclamation below */
 920  921                          tcp_wput_data(tcp, mp->b_cont, B_TRUE);
 921  922                          freeb(mp);
 922  923                  }
 923  924                  return;
 924  925          } else {
 925  926                  if (connp->conn_debug) {
 926  927                          (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 927  928                              "tcp_wput_proto, dropping one...");
 928  929                  }
 929  930                  freemsg(mp);
 930  931                  return;
 931  932          }
 932  933  
 933  934  non_urgent_data:
 934  935  
 935  936          switch ((int)tprim->type) {
 936  937          case O_T_BIND_REQ:      /* bind request */
 937  938          case T_BIND_REQ:        /* new semantics bind request */
 938  939                  tcp_tpi_bind(tcp, mp);
 939  940                  break;
 940  941          case T_UNBIND_REQ:      /* unbind request */
 941  942                  tcp_tpi_unbind(tcp, mp);
 942  943                  break;
 943  944          case O_T_CONN_RES:      /* old connection response XXX */
 944  945          case T_CONN_RES:        /* connection response */
 945  946                  tcp_tli_accept(tcp, mp);
 946  947                  break;
 947  948          case T_CONN_REQ:        /* connection request */
 948  949                  tcp_tpi_connect(tcp, mp);
 949  950                  break;
 950  951          case T_DISCON_REQ:      /* disconnect request */
 951  952                  tcp_disconnect(tcp, mp);
 952  953                  break;
 953  954          case T_CAPABILITY_REQ:
 954  955                  tcp_capability_req(tcp, mp);    /* capability request */
 955  956                  break;
 956  957          case T_INFO_REQ:        /* information request */
 957  958                  tcp_info_req(tcp, mp);
 958  959                  break;
 959  960          case T_SVR4_OPTMGMT_REQ:        /* manage options req */
 960  961          case T_OPTMGMT_REQ:
 961  962                  /*
 962  963                   * Note:  no support for snmpcom_req() through new
 963  964                   * T_OPTMGMT_REQ. See comments in ip.c
 964  965                   */
 965  966  
 966  967                  /*
 967  968                   * All Solaris components should pass a db_credp
 968  969                   * for this TPI message, hence we ASSERT.
 969  970                   * But in case there is some other M_PROTO that looks
 970  971                   * like a TPI message sent by some other kernel
 971  972                   * component, we check and return an error.
 972  973                   */
 973  974                  cr = msg_getcred(mp, NULL);
 974  975                  ASSERT(cr != NULL);
 975  976                  if (cr == NULL) {
 976  977                          tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
 977  978                          return;
 978  979                  }
 979  980                  /*
 980  981                   * If EINPROGRESS is returned, the request has been queued
 981  982                   * for subsequent processing by ip_restart_optmgmt(), which
 982  983                   * will do the CONN_DEC_REF().
 983  984                   */
 984  985                  if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) {
 985  986                          svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
 986  987                  } else {
 987  988                          tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
 988  989                  }
 989  990                  break;
 990  991  
 991  992          case T_UNITDATA_REQ:    /* unitdata request */
 992  993                  tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
 993  994                  break;
 994  995          case T_ORDREL_REQ:      /* orderly release req */
 995  996                  freemsg(mp);
 996  997  
 997  998                  if (tcp->tcp_fused)
 998  999                          tcp_unfuse(tcp);
 999 1000  
1000 1001                  if (tcp_xmit_end(tcp) != 0) {
1001 1002                          /*
1002 1003                           * We were crossing FINs and got a reset from
1003 1004                           * the other side. Just ignore it.
1004 1005                           */
1005 1006                          if (connp->conn_debug) {
1006 1007                                  (void) strlog(TCP_MOD_ID, 0, 1,
1007 1008                                      SL_ERROR|SL_TRACE,
1008 1009                                      "tcp_wput_proto, T_ORDREL_REQ out of "
1009 1010                                      "state %s",
1010 1011                                      tcp_display(tcp, NULL,
1011 1012                                      DISP_ADDR_AND_PORT));
1012 1013                          }
1013 1014                  }
1014 1015                  break;
1015 1016          case T_ADDR_REQ:
1016 1017                  tcp_addr_req(tcp, mp);
1017 1018                  break;
1018 1019          default:
1019 1020                  if (connp->conn_debug) {
1020 1021                          (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
1021 1022                              "tcp_wput_proto, bogus TPI msg, type %d",
1022 1023                              tprim->type);
1023 1024                  }
1024 1025                  /*
1025 1026                   * We used to M_ERROR.  Sending TNOTSUPPORT gives the user
1026 1027                   * to recover.
1027 1028                   */
1028 1029                  tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
1029 1030                  break;
1030 1031          }
1031 1032  }
1032 1033  
1033 1034  /*
1034 1035   * Handle special out-of-band ioctl requests (see PSARC/2008/265).
1035 1036   */
1036 1037  static void
1037 1038  tcp_wput_cmdblk(queue_t *q, mblk_t *mp)
1038 1039  {
1039 1040          void    *data;
1040 1041          mblk_t  *datamp = mp->b_cont;
1041 1042          conn_t  *connp = Q_TO_CONN(q);
1042 1043          tcp_t   *tcp = connp->conn_tcp;
1043 1044          cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr;
1044 1045  
1045 1046          if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) {
1046 1047                  cmdp->cb_error = EPROTO;
1047 1048                  qreply(q, mp);
1048 1049                  return;
1049 1050          }
1050 1051  
1051 1052          data = datamp->b_rptr;
1052 1053  
1053 1054          switch (cmdp->cb_cmd) {
1054 1055          case TI_GETPEERNAME:
1055 1056                  if (tcp->tcp_state < TCPS_SYN_RCVD)
1056 1057                          cmdp->cb_error = ENOTCONN;
1057 1058                  else
1058 1059                          cmdp->cb_error = conn_getpeername(connp, data,
1059 1060                              &cmdp->cb_len);
1060 1061                  break;
1061 1062          case TI_GETMYNAME:
1062 1063                  cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len);
1063 1064                  break;
1064 1065          default:
1065 1066                  cmdp->cb_error = EINVAL;
1066 1067                  break;
1067 1068          }
1068 1069  
1069 1070          qreply(q, mp);
1070 1071  }
1071 1072  
1072 1073  /*
1073 1074   * The TCP fast path write put procedure.
1074 1075   * NOTE: the logic of the fast path is duplicated from tcp_wput_data()
1075 1076   */
1076 1077  /* ARGSUSED */
1077 1078  void
1078 1079  tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1079 1080  {
1080 1081          int             len;
1081 1082          int             hdrlen;
1082 1083          int             plen;
1083 1084          mblk_t          *mp1;
1084 1085          uchar_t         *rptr;
1085 1086          uint32_t        snxt;
1086 1087          tcpha_t         *tcpha;
1087 1088          struct datab    *db;
1088 1089          uint32_t        suna;
1089 1090          uint32_t        mss;
1090 1091          ipaddr_t        *dst;
1091 1092          ipaddr_t        *src;
1092 1093          uint32_t        sum;
1093 1094          int             usable;
1094 1095          conn_t          *connp = (conn_t *)arg;
1095 1096          tcp_t           *tcp = connp->conn_tcp;
1096 1097          uint32_t        msize;
1097 1098          tcp_stack_t     *tcps = tcp->tcp_tcps;
1098 1099          ip_xmit_attr_t  *ixa;
1099 1100          clock_t         now;
1100 1101  
1101 1102          /*
1102 1103           * Try and ASSERT the minimum possible references on the
1103 1104           * conn early enough. Since we are executing on write side,
1104 1105           * the connection is obviously not detached and that means
1105 1106           * there is a ref each for TCP and IP. Since we are behind
1106 1107           * the squeue, the minimum references needed are 3. If the
1107 1108           * conn is in classifier hash list, there should be an
1108 1109           * extra ref for that (we check both the possibilities).
1109 1110           */
1110 1111          ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1111 1112              (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1112 1113  
1113 1114          ASSERT(DB_TYPE(mp) == M_DATA);
1114 1115          msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1115 1116  
1116 1117          mutex_enter(&tcp->tcp_non_sq_lock);
1117 1118          tcp->tcp_squeue_bytes -= msize;
1118 1119          mutex_exit(&tcp->tcp_non_sq_lock);
1119 1120  
1120 1121          /* Bypass tcp protocol for fused tcp loopback */
1121 1122          if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
1122 1123                  return;
1123 1124  
1124 1125          mss = tcp->tcp_mss;
1125 1126          /*
1126 1127           * If ZEROCOPY has turned off, try not to send any zero-copy message
1127 1128           * down. Do backoff, now.
1128 1129           */
1129 1130          if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on)
1130 1131                  mp = tcp_zcopy_backoff(tcp, mp, B_FALSE);
1131 1132  
1132 1133  
1133 1134          ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1134 1135          len = (int)(mp->b_wptr - mp->b_rptr);
1135 1136  
1136 1137          /*
1137 1138           * Criteria for fast path:
1138 1139           *
1139 1140           *   1. no unsent data
1140 1141           *   2. single mblk in request
1141 1142           *   3. connection established
1142 1143           *   4. data in mblk
1143 1144           *   5. len <= mss
1144 1145           *   6. no tcp_valid bits
1145 1146           */
1146 1147          if ((tcp->tcp_unsent != 0) ||
1147 1148              (tcp->tcp_cork) ||
1148 1149              (mp->b_cont != NULL) ||
1149 1150              (tcp->tcp_state != TCPS_ESTABLISHED) ||
1150 1151              (len == 0) ||
1151 1152              (len > mss) ||
1152 1153              (tcp->tcp_valid_bits != 0)) {
1153 1154                  tcp_wput_data(tcp, mp, B_FALSE);
1154 1155                  return;
1155 1156          }
1156 1157  
1157 1158          ASSERT(tcp->tcp_xmit_tail_unsent == 0);
1158 1159          ASSERT(tcp->tcp_fin_sent == 0);
1159 1160  
1160 1161          /* queue new packet onto retransmission queue */
1161 1162          if (tcp->tcp_xmit_head == NULL) {
1162 1163                  tcp->tcp_xmit_head = mp;
1163 1164          } else {
1164 1165                  tcp->tcp_xmit_last->b_cont = mp;
1165 1166          }
1166 1167          tcp->tcp_xmit_last = mp;
1167 1168          tcp->tcp_xmit_tail = mp;
1168 1169  
1169 1170          /* find out how much we can send */
1170 1171          /* BEGIN CSTYLED */
1171 1172          /*
1172 1173           *    un-acked     usable
1173 1174           *  |--------------|-----------------|
1174 1175           *  tcp_suna       tcp_snxt       tcp_suna+tcp_swnd
1175 1176           */
1176 1177          /* END CSTYLED */
1177 1178  
1178 1179          /* start sending from tcp_snxt */
1179 1180          snxt = tcp->tcp_snxt;
1180 1181  
1181 1182          /*
1182 1183           * Check to see if this connection has been idled for some
1183 1184           * time and no ACK is expected.  If it is, we need to slow
1184 1185           * start again to get back the connection's "self-clock" as
1185 1186           * described in VJ's paper.
1186 1187           *
1187 1188           * Reinitialize tcp_cwnd after idle.
1188 1189           */
1189 1190          now = LBOLT_FASTPATH;
1190 1191          if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1191 1192              (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1192 1193                  TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1193 1194          }
1194 1195  
1195 1196          usable = tcp->tcp_swnd;         /* tcp window size */
1196 1197          if (usable > tcp->tcp_cwnd)
1197 1198                  usable = tcp->tcp_cwnd; /* congestion window smaller */
1198 1199          usable -= snxt;         /* subtract stuff already sent */
1199 1200          suna = tcp->tcp_suna;
1200 1201          usable += suna;
1201 1202          /* usable can be < 0 if the congestion window is smaller */
1202 1203          if (len > usable) {
1203 1204                  /* Can't send complete M_DATA in one shot */
1204 1205                  goto slow;
1205 1206          }
1206 1207  
1207 1208          mutex_enter(&tcp->tcp_non_sq_lock);
1208 1209          if (tcp->tcp_flow_stopped &&
1209 1210              TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1210 1211                  tcp_clrqfull(tcp);
1211 1212          }
1212 1213          mutex_exit(&tcp->tcp_non_sq_lock);
1213 1214  
1214 1215          /*
1215 1216           * determine if anything to send (Nagle).
1216 1217           *
1217 1218           *   1. len < tcp_mss (i.e. small)
1218 1219           *   2. unacknowledged data present
1219 1220           *   3. len < nagle limit
1220 1221           *   4. last packet sent < nagle limit (previous packet sent)
1221 1222           */
1222 1223          if ((len < mss) && (snxt != suna) &&
1223 1224              (len < (int)tcp->tcp_naglim) &&
1224 1225              (tcp->tcp_last_sent_len < tcp->tcp_naglim)) {
1225 1226                  /*
1226 1227                   * This was the first unsent packet and normally
1227 1228                   * mss < xmit_hiwater so there is no need to worry
1228 1229                   * about flow control. The next packet will go
1229 1230                   * through the flow control check in tcp_wput_data().
1230 1231                   */
1231 1232                  /* leftover work from above */
1232 1233                  tcp->tcp_unsent = len;
1233 1234                  tcp->tcp_xmit_tail_unsent = len;
1234 1235  
1235 1236                  return;
1236 1237          }
1237 1238  
1238 1239          /*
1239 1240           * len <= tcp->tcp_mss && len == unsent so no sender silly window.  Can
1240 1241           * send now.
1241 1242           */
1242 1243  
1243 1244          if (snxt == suna) {
1244 1245                  TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1245 1246          }
1246 1247  
1247 1248          /* we have always sent something */
1248 1249          tcp->tcp_rack_cnt = 0;
1249 1250  
1250 1251          tcp->tcp_snxt = snxt + len;
1251 1252          tcp->tcp_rack = tcp->tcp_rnxt;
1252 1253  
1253 1254          if ((mp1 = dupb(mp)) == 0)
1254 1255                  goto no_memory;
1255 1256          mp->b_prev = (mblk_t *)(uintptr_t)now;
1256 1257          mp->b_next = (mblk_t *)(uintptr_t)snxt;
1257 1258  
1258 1259          /* adjust tcp header information */
1259 1260          tcpha = tcp->tcp_tcpha;
1260 1261          tcpha->tha_flags = (TH_ACK|TH_PUSH);
1261 1262  
1262 1263          sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1263 1264          sum = (sum >> 16) + (sum & 0xFFFF);
1264 1265          tcpha->tha_sum = htons(sum);
1265 1266  
1266 1267          tcpha->tha_seq = htonl(snxt);
1267 1268  
1268 1269          TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1269 1270          TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1270 1271          BUMP_LOCAL(tcp->tcp_obsegs);
1271 1272  
1272 1273          /* Update the latest receive window size in TCP header. */
1273 1274          tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1274 1275  
1275 1276          tcp->tcp_last_sent_len = (ushort_t)len;
1276 1277  
1277 1278          plen = len + connp->conn_ht_iphc_len;
1278 1279  
1279 1280          ixa = connp->conn_ixa;
1280 1281          ixa->ixa_pktlen = plen;
1281 1282  
1282 1283          if (ixa->ixa_flags & IXAF_IS_IPV4) {
1283 1284                  tcp->tcp_ipha->ipha_length = htons(plen);
1284 1285          } else {
1285 1286                  tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN);
1286 1287          }
1287 1288  
1288 1289          /* see if we need to allocate a mblk for the headers */
1289 1290          hdrlen = connp->conn_ht_iphc_len;
1290 1291          rptr = mp1->b_rptr - hdrlen;
1291 1292          db = mp1->b_datap;
1292 1293          if ((db->db_ref != 2) || rptr < db->db_base ||
1293 1294              (!OK_32PTR(rptr))) {
1294 1295                  /* NOTE: we assume allocb returns an OK_32PTR */
1295 1296                  mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1296 1297                  if (!mp) {
1297 1298                          freemsg(mp1);
1298 1299                          goto no_memory;
1299 1300                  }
1300 1301                  mp->b_cont = mp1;
1301 1302                  mp1 = mp;
1302 1303                  /* Leave room for Link Level header */
1303 1304                  rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1304 1305                  mp1->b_wptr = &rptr[hdrlen];
1305 1306          }
1306 1307          mp1->b_rptr = rptr;
1307 1308  
1308 1309          /* Fill in the timestamp option. */
1309 1310          if (tcp->tcp_snd_ts_ok) {
1310 1311                  uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
1311 1312  
1312 1313                  U32_TO_BE32(llbolt,
1313 1314                      (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
1314 1315                  U32_TO_BE32(tcp->tcp_ts_recent,
1315 1316                      (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
1316 1317          } else {
1317 1318                  ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1318 1319          }
1319 1320  
1320 1321          /* copy header into outgoing packet */
1321 1322          dst = (ipaddr_t *)rptr;
1322 1323          src = (ipaddr_t *)connp->conn_ht_iphc;
1323 1324          dst[0] = src[0];
1324 1325          dst[1] = src[1];
1325 1326          dst[2] = src[2];
1326 1327          dst[3] = src[3];
1327 1328          dst[4] = src[4];
1328 1329          dst[5] = src[5];
1329 1330          dst[6] = src[6];
1330 1331          dst[7] = src[7];
1331 1332          dst[8] = src[8];
1332 1333          dst[9] = src[9];
1333 1334          if (hdrlen -= 40) {
1334 1335                  hdrlen >>= 2;
1335 1336                  dst += 10;
1336 1337                  src += 10;
1337 1338                  do {
1338 1339                          *dst++ = *src++;
1339 1340                  } while (--hdrlen);
1340 1341          }
1341 1342  
1342 1343          /*
1343 1344           * Set the ECN info in the TCP header.  Note that this
1344 1345           * is not the template header.
1345 1346           */
1346 1347          if (tcp->tcp_ecn_ok) {
1347 1348                  TCP_SET_ECT(tcp, rptr);
1348 1349  
1349 1350                  tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length);
1350 1351                  if (tcp->tcp_ecn_echo_on)
1351 1352                          tcpha->tha_flags |= TH_ECE;
1352 1353                  if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
1353 1354                          tcpha->tha_flags |= TH_CWR;
1354 1355                          tcp->tcp_ecn_cwr_sent = B_TRUE;
1355 1356                  }
1356 1357          }
1357 1358  
1358 1359          if (tcp->tcp_ip_forward_progress) {
1359 1360                  tcp->tcp_ip_forward_progress = B_FALSE;
1360 1361                  connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
1361 1362          } else {
1362 1363                  connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
1363 1364          }
1364 1365          tcp_send_data(tcp, mp1);
1365 1366          return;
1366 1367  
1367 1368          /*
1368 1369           * If we ran out of memory, we pretend to have sent the packet
1369 1370           * and that it was lost on the wire.
1370 1371           */
1371 1372  no_memory:
1372 1373          return;
1373 1374  
1374 1375  slow:
1375 1376          /* leftover work from above */
1376 1377          tcp->tcp_unsent = len;
1377 1378          tcp->tcp_xmit_tail_unsent = len;
1378 1379          tcp_wput_data(tcp, NULL, B_FALSE);
1379 1380  }
1380 1381  
/*
 * Handle a send request that carries urgent data: record the urgent pointer
 * in the tcp_t, mark it valid, and pass the message on to tcp_wput_data().
 *
 * arg is the conn_t for the connection.  mp is the message to send; it may
 * be headed by a non-M_DATA block, which is stripped before the data is
 * handed on.  arg2 and dummy are not used.
 *
 * A message with no data bytes is freed and otherwise ignored.
 */
1381 1382  /* ARGSUSED2 */
1382 1383  void
1383 1384  tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1384 1385  {
1385 1386          int len;
1386 1387          uint32_t msize;
1387 1388          conn_t *connp = (conn_t *)arg;
1388 1389          tcp_t *tcp = connp->conn_tcp;
1389 1390  
                   /* Data byte count of the whole message, as reported by msgdsize(). */
1390 1391          msize = msgdsize(mp);
1391 1392  
                   /* msize - 1 goes negative only when the message has no data at all. */
1392 1393          len = msize - 1;
1393 1394          if (len < 0) {
1394 1395                  freemsg(mp);
1395 1396                  return;
1396 1397          }
1397 1398  
1398 1399          /*
1399 1400           * Try to force urgent data out on the wire. Even if we have unsent
1400 1401           * data this will at least send the urgent flag.
1401 1402           * XXX does not handle more flag correctly.
1402 1403           */
1403 1404          len += tcp->tcp_unsent;
1404 1405          len += tcp->tcp_snxt;
                   /* Net effect: tcp_urg = tcp_snxt + tcp_unsent + (msize - 1). */
1405 1406          tcp->tcp_urg = len;
1406 1407          tcp->tcp_valid_bits |= TCP_URG_VALID;
1407 1408  
1408 1409          /* Bypass tcp protocol for fused tcp loopback */
                   /* A non-zero return from tcp_fuse_output() ends processing here. */
1409 1410          if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
1410 1411                  return;
1411 1412  
1412 1413          /* Strip off the T_EXDATA_REQ if the data is from TPI */
1413 1414          if (DB_TYPE(mp) != M_DATA) {
1414 1415                  mblk_t *mp1 = mp;
1415 1416                  ASSERT(!IPCL_IS_NONSTR(connp));
                           /* Free only the leading block; continue with its b_cont chain. */
1416 1417                  mp = mp->b_cont;
1417 1418                  freeb(mp1);
1418 1419          }
                   /*
                    * NOTE(review): the B_TRUE argument presumably marks the data as
                    * urgent -- confirm against the definition of tcp_wput_data().
                    */
1419 1420          tcp_wput_data(tcp, mp, B_TRUE);
1420 1421  }
1421 1422  
1422 1423  /*
1423 1424   * Called by the streams close routine via squeues when our client closes its
1424 1425   * descriptor. We take this to mean: "close the stream state NOW, close the tcp
1425 1426   * connection politely". When SO_LINGER is set (with a non-zero linger time and
1426 1427   * it is not a nonblocking socket) then this routine sleeps until the FIN is
1427 1428   * acked.
1428 1429   *
1429 1430   * NOTE: tcp_close potentially returns error when lingering.
1430 1431   * However, the stream head currently does not pass these errors
1431 1432   * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
1432 1433   * errors to the application (from tsleep()) and not errors
1433 1434   * like ECONNRESET caused by receiving a reset packet.
1434 1435   */
1435 1436  
/*
 * Parameters: arg is the conn_t being closed; mp, arg2 and dummy are not
 * referenced.
 *
 * Outline of the control flow below:
 *   - The switch on tcp_state ends in one of three ways: a break with
 *     msg == NULL (nothing to send), a break with msg set (a RST is sent
 *     after the switch), or, from the default case, a FIN is sent and the
 *     tcp_t is detached, leaving via "goto finish".
 *   - Every break falls through to tcp_closei_local() and CONN_DEC_REF();
 *     the "goto finish" paths skip both.
 *   - If a linger timer is started successfully the function returns early
 *     without signalling tcp_closecv.  On all other paths it ends by setting
 *     tcp_closed and signalling tcp_closecv under tcp_closelock.
 */
1436 1437  /* ARGSUSED */
1437 1438  void
1438 1439  tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1439 1440  {
1440 1441          char    *msg;
1441 1442          conn_t  *connp = (conn_t *)arg;
1442 1443          tcp_t   *tcp = connp->conn_tcp;
1443 1444          clock_t delta = 0;
1444 1445          tcp_stack_t     *tcps = tcp->tcp_tcps;
1445 1446  
1446 1447          /*
1447 1448           * When a non-STREAMS socket is being closed, it does not always
1448 1449           * stick around waiting for tcp_close_output to run and can therefore
1449 1450           * have dropped a reference already. So adjust the asserts accordingly.
1450 1451           */
1451 1452          ASSERT((connp->conn_fanout != NULL &&
1452 1453              connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 3 : 4)) ||
1453 1454              (connp->conn_fanout == NULL &&
1454 1455              connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3)));
1455 1456  
1456 1457          mutex_enter(&tcp->tcp_eager_lock);
1457 1458          if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
1458 1459                  /*
1459 1460                   * Cleanup for listener. For non-STREAM sockets sockfs will
1460 1461                   * close all the eagers on 'q', so in that case only deal
1461 1462                   * with 'q0'.
1462 1463                   */
1463 1464                  tcp_eager_cleanup(tcp, IPCL_IS_NONSTR(connp) ? 1 : 0);
                           /* Checked at "finish" to decide whether the queues may be cleared. */
1464 1465                  tcp->tcp_wait_for_eagers = 1;
1465 1466          }
1466 1467          mutex_exit(&tcp->tcp_eager_lock);
1467 1468  
1468 1469          tcp->tcp_lso = B_FALSE;
1469 1470  
                   /*
                    * msg doubles as the "abort" indicator: when it is non-NULL after the
                    * switch, a RST is sent and msg is passed to tcp_xmit_ctl().
                    */
1470 1471          msg = NULL;
1471 1472          switch (tcp->tcp_state) {
1472 1473          case TCPS_CLOSED:
1473 1474          case TCPS_IDLE:
1474 1475                  break;
1475 1476          case TCPS_BOUND:
1476 1477                  if (tcp->tcp_listener != NULL) {
1477 1478                          ASSERT(IPCL_IS_NONSTR(connp));
1478 1479                          /*
1479 1480                           * Unlink from the listener and drop the reference
1480 1481                           * put on it by the eager. tcp_closei_local will not
1481 1482                           * do it because tcp_tconnind_started is TRUE.
1482 1483                           */
1483 1484                          mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock);
1484 1485                          tcp_eager_unlink(tcp);
1485 1486                          mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock);
1486 1487                          CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1487 1488                  }
1488 1489                  break;
1489 1490          case TCPS_LISTEN:
1490 1491                  break;
1491 1492          case TCPS_SYN_SENT:
1492 1493                  msg = "tcp_close, during connect";
1493 1494                  break;
1494 1495          case TCPS_SYN_RCVD:
1495 1496                  /*
1496 1497                   * Close during the connect 3-way handshake
1497 1498                   * but here there may or may not be pending data
1498 1499                   * already on queue. Process almost same as in
1499 1500                   * the ESTABLISHED state.
1500 1501                   */
1501 1502                  /* FALLTHRU */
1502 1503          default:
1503 1504                  if (tcp->tcp_fused)
1504 1505                          tcp_unfuse(tcp);
1505 1506  
1506 1507                  /*
1507 1508                   * If SO_LINGER has set a zero linger time, abort the
1508 1509                   * connection with a reset.
1509 1510                   */
1510 1511                  if (connp->conn_linger && connp->conn_lingertime == 0) {
1511 1512                          msg = "tcp_close, zero lingertime";
1512 1513                          break;
1513 1514                  }
1514 1515  
1515 1516                  /*
1516 1517                   * Abort connection if there is unread data queued.
1517 1518                   */
1518 1519                  if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
1519 1520                          msg = "tcp_close, unread data";
1520 1521                          break;
1521 1522                  }
1522 1523  
1523 1524                  /*
1524 1525                   * Abort connection if it is being closed without first
1525 1526                   * being accepted. This can happen if a listening non-STREAM
1526 1527                   * socket wants to get rid of the socket, for example, if the
1527 1528                   * listener is closing.
1528 1529                   */
1529 1530                  if (tcp->tcp_listener != NULL) {
1530 1531                          ASSERT(IPCL_IS_NONSTR(connp));
1531 1532                          msg = "tcp_close, close before accept";
1532 1533  
1533 1534                          /*
1534 1535                           * Unlink from the listener and drop the reference
1535 1536                           * put on it by the eager. tcp_closei_local will not
1536 1537                           * do it because tcp_tconnind_started is TRUE.
1537 1538                           */
1538 1539                          mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock);
1539 1540                          tcp_eager_unlink(tcp);
1540 1541                          mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock);
1541 1542                          CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1542 1543                          break;
1543 1544                  }
1544 1545  
1545 1546                  /*
1546 1547                   * Transmit the FIN before detaching the tcp_t.
1547 1548                   * After tcp_detach returns this queue/perimeter
1548 1549                   * no longer owns the tcp_t thus others can modify it.
1549 1550                   */
1550 1551                  (void) tcp_xmit_end(tcp);
1551 1552  
1552 1553                  /*
1553 1554                   * If lingering on close then wait until the fin is acked,
1554 1555                   * the SO_LINGER time passes, or a reset is sent/received.
1555 1556                   */
1556 1557                  if (connp->conn_linger && connp->conn_lingertime > 0 &&
1557 1558                      !(tcp->tcp_fin_acked) &&
1558 1559                      tcp->tcp_state >= TCPS_ESTABLISHED) {
                                   /*
                                    * With FNDELAY/FNONBLOCK set no linger timer is started;
                                    * EWOULDBLOCK is recorded and the close continues below.
                                    */
1559 1560                          if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
1560 1561                                  tcp->tcp_client_errno = EWOULDBLOCK;
1561 1562                          } else if (tcp->tcp_client_errno == 0) {
1562 1563  
1563 1564                                  ASSERT(tcp->tcp_linger_tid == 0);
1564 1565  
1565 1566                                  /* conn_lingertime is in sec. */
1566 1567                                  tcp->tcp_linger_tid = TCP_TIMER(tcp,
1567 1568                                      tcp_close_linger_timeout,
1568 1569                                      connp->conn_lingertime * MILLISEC);
1569 1570  
1570 1571                                  /* tcp_close_linger_timeout will finish close */
                                           /*
                                            * A zero timer id is treated as failure: ENOSR is
                                            * recorded and the close continues immediately.
                                            * Otherwise return now, without setting tcp_closed
                                            * or signalling tcp_closecv.
                                            */
1571 1572                                  if (tcp->tcp_linger_tid == 0)
1572 1573                                          tcp->tcp_client_errno = ENOSR;
1573 1574                                  else
1574 1575                                          return;
1575 1576                          }
1576 1577  
1577 1578                          /*
1578 1579                           * Check if we need to detach or just close
1579 1580                           * the instance.
1580 1581                           */
                                   /*
                                    * NOTE(review): the enclosing condition already required
                                    * tcp_state >= TCPS_ESTABLISHED and nothing visible here
                                    * changes tcp_state in between.  Assuming TCPS_LISTEN
                                    * orders below TCPS_ESTABLISHED, this break looks
                                    * unreachable -- confirm against the state definitions.
                                    */
1581 1582                          if (tcp->tcp_state <= TCPS_LISTEN)
1582 1583                                  break;
1583 1584                  }
1584 1585  
1585 1586                  /*
1586 1587                   * Make sure that no other thread will access the conn_rq of
1587 1588                   * this instance (through lookups etc.) as conn_rq will go
1588 1589                   * away shortly.
1589 1590                   */
1590 1591                  tcp_acceptor_hash_remove(tcp);
1591 1592  
1592 1593                  mutex_enter(&tcp->tcp_non_sq_lock);
1593 1594                  if (tcp->tcp_flow_stopped) {
1594 1595                          tcp_clrqfull(tcp);
1595 1596                  }
1596 1597                  mutex_exit(&tcp->tcp_non_sq_lock);
1597 1598  
                           /*
                            * Keep the value returned by TCP_TIMER_CANCEL() in delta; it
                            * is used further down to re-arm tcp_timer.
                            */
1598 1599                  if (tcp->tcp_timer_tid != 0) {
1599 1600                          delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
1600 1601                          tcp->tcp_timer_tid = 0;
1601 1602                  }
1602 1603                  /*
1603 1604                   * Need to cancel those timers which will not be used when
1604 1605                   * TCP is detached.  This has to be done before the conn_wq
1605 1606                   * is set to NULL.
1606 1607                   */
1607 1608                  tcp_timers_stop(tcp);
1608 1609  
1609 1610                  tcp->tcp_detached = B_TRUE;
1610 1611                  if (tcp->tcp_state == TCPS_TIME_WAIT) {
1611 1612                          tcp_time_wait_append(tcp);
1612 1613                          TCP_DBGSTAT(tcps, tcp_detach_time_wait);
1613 1614                          ASSERT(connp->conn_ref >=
1614 1615                              (IPCL_IS_NONSTR(connp) ? 2 : 3));
1615 1616                          goto finish;
1616 1617                  }
1617 1618  
1618 1619                  /*
1619 1620                   * If delta is zero the timer event wasn't executed and was
1620 1621                   * successfully canceled. In this case we need to restart it
1621 1622                   * with the minimal delta possible.
1622 1623                   */
                           /*
                            * delta is also still 0 (its initial value) when no timer was
                            * pending above, so a timer of 1 is started in that case too.
                            * A negative delta skips the restart.
                            * NOTE(review): what a negative TCP_TIMER_CANCEL() result
                            * means is not visible here -- confirm.
                            */
1623 1624                  if (delta >= 0)
1624 1625                          tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
1625 1626                              delta ? delta : 1);
1626 1627  
1627 1628                  ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3));
1628 1629                  goto finish;
1629 1630          }
1630 1631  
1631 1632          /* Detach did not complete. Still need to remove q from stream. */
                   /*
                    * Abort path: bump the MIB counter that matches the current state,
                    * then send a RST at tcp_snxt.
                    */
1632 1633          if (msg) {
1633 1634                  if (tcp->tcp_state == TCPS_ESTABLISHED ||
1634 1635                      tcp->tcp_state == TCPS_CLOSE_WAIT)
1635 1636                          TCPS_BUMP_MIB(tcps, tcpEstabResets);
1636 1637                  if (tcp->tcp_state == TCPS_SYN_SENT ||
1637 1638                      tcp->tcp_state == TCPS_SYN_RCVD)
1638 1639                          TCPS_BUMP_MIB(tcps, tcpAttemptFails);
1639 1640                  tcp_xmit_ctl(msg, tcp,  tcp->tcp_snxt, 0, TH_RST);
1640 1641          }
1641 1642  
1642 1643          tcp_closei_local(tcp);
1643 1644          CONN_DEC_REF(connp);
1644 1645          ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 1 : 2));
1645 1646  
1646 1647  finish:
1647 1648          /*
1648 1649           * Don't change the queues in the case of a listener that has
1649 1650           * eagers in its q or q0. It could surprise the eagers.
1650 1651           * Instead wait for the eagers outside the squeue.
1651 1652           *
1652 1653           * For non-STREAMS sockets tcp_wait_for_eagers implies that
1653 1654           * we should delay the su_closed upcall until all eagers have
1654 1655           * dropped their references.
1655 1656           */
1656 1657          if (!tcp->tcp_wait_for_eagers) {
                           /* Already B_TRUE when we arrived here via "goto finish". */
1657 1658                  tcp->tcp_detached = B_TRUE;
1658 1659                  connp->conn_rq = NULL;
1659 1660                  connp->conn_wq = NULL;
1660 1661  
1661 1662                  /* non-STREAM socket, release the upper handle */
1662 1663                  if (IPCL_IS_NONSTR(connp)) {
1663 1664                          ASSERT(connp->conn_upper_handle != NULL);
1664 1665                          (*connp->conn_upcalls->su_closed)
1665 1666                              (connp->conn_upper_handle);
1666 1667                          connp->conn_upper_handle = NULL;
1667 1668                          connp->conn_upcalls = NULL;
1668 1669                  }
1669 1670          }
1670 1671  
1671 1672          /* Signal tcp_close() to finish closing. */
1672 1673          mutex_enter(&tcp->tcp_closelock);
1673 1674          tcp->tcp_closed = 1;
1674 1675          cv_signal(&tcp->tcp_closecv);
1675 1676          mutex_exit(&tcp->tcp_closelock);
1676 1677  }
1677 1678  
/*
 * Shut down the sending side of a connection by calling tcp_xmit_end().
 *
 * arg is the conn_t.  mp is freed immediately and never examined; arg2 and
 * dummy are not used.  A fused connection is unfused before tcp_xmit_end()
 * is called.  A non-zero return from tcp_xmit_end() is not propagated: it
 * is only logged through strlog(), and only when conn_debug is set.
 */
1678 1679  /* ARGSUSED */
1679 1680  void
1680 1681  tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1681 1682  {
1682 1683          conn_t  *connp = (conn_t *)arg;
1683 1684          tcp_t   *tcp = connp->conn_tcp;
1684 1685  
                   /* The message only served to get us here; its contents are not used. */
1685 1686          freemsg(mp);
1686 1687  
1687 1688          if (tcp->tcp_fused)
1688 1689                  tcp_unfuse(tcp);
1689 1690  
1690 1691          if (tcp_xmit_end(tcp) != 0) {
1691 1692                  /*
1692 1693                   * We were crossing FINs and got a reset from
1693 1694                   * the other side. Just ignore it.
1694 1695                   */
1695 1696                  if (connp->conn_debug) {
1696 1697                          (void) strlog(TCP_MOD_ID, 0, 1,
1697 1698                              SL_ERROR|SL_TRACE,
1698 1699                              "tcp_shutdown_output() out of state %s",
1699 1700                              tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));
1700 1701                  }
1701 1702          }
1702 1703  }
1703 1704  
1704 1705  #pragma inline(tcp_send_data)
1705 1706  
/*
 * Hand an outbound packet (mp) for connection tcp to IP through
 * conn_ip_output(), using the connection's conn_ixa transmit attributes.
 *
 * The caller gives up mp on every path: it is either dropped and freed here
 * by the zero-copy check, or passed to conn_ip_output(), after which this
 * function does not touch it again.  The return value of conn_ip_output()
 * is discarded, so callers get no indication of whether the packet was
 * actually sent.
 */
1706 1707  void
1707 1708  tcp_send_data(tcp_t *tcp, mblk_t *mp)
1708 1709  {
1709 1710          conn_t          *connp = tcp->tcp_connp;
1710 1711  
1711 1712          /*
1712 1713           * Check here to avoid sending zero-copy message down to IP when
1713 1714           * ZEROCOPY capability has turned off. We only need to deal with
1714 1715           * the race condition between sockfs and the notification here.
1715 1716           * Since we have tried to backoff the tcp_xmit_head when turning
1716 1717           * zero-copy off and new messages in tcp_output(), we simply drop
1717 1718           * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean
1718 1719           * is not true.
1719 1720           */
1720 1721          if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on &&
1721 1722              !tcp->tcp_xmit_zc_clean) {
1722 1723                  ip_drop_output("TCP ZC was disabled but not clean", mp, NULL);
1723 1724                  freemsg(mp);
1724 1725                  return;
1725 1726          }
1726 1727  
                   /*
                    * Fire the "send" probe.  The first (mblk_t *) argument is passed as
                    * NULL; the IP header is taken from b_rptr and the TCP header from
                    * b_rptr plus ixa_ip_hdr_length.
                    */
1727 1728          DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa,
1728 1729              __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, tcp,
1729 1730              __dtrace_tcp_tcph_t *,
1730 1731              &mp->b_rptr[connp->conn_ixa->ixa_ip_hdr_length]);
1731 1732  
                   /* The ixa notify cookie must refer to this connection's conn_tcp. */
1732 1733          ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp);
1733 1734          (void) conn_ip_output(mp, connp->conn_ixa);
1734 1735  }
1735 1736  
/*
 * Transmit a queued SYN|ACK (mp) for the eager connection passed in arg.
 *
 * arg is the eager's conn_t; arg2 and dummy are not used.  If the eager's
 * tcp_state is already TCPS_CLOSED the message is freed and nothing is
 * sent.  Otherwise ixa_pktlen is refreshed from the message and mp is
 * passed to conn_ip_output(), whose return value is discarded.
 */
1736 1737  /* ARGSUSED2 */
1737 1738  void
1738 1739  tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1739 1740  {
1740 1741          conn_t  *econnp = (conn_t *)arg;
1741 1742          tcp_t   *tcp = econnp->conn_tcp;
1742 1743          ip_xmit_attr_t *ixa = econnp->conn_ixa;
1743 1744  
1744 1745          /* Guard against a RST having blown it away while on the squeue */
1745 1746          if (tcp->tcp_state == TCPS_CLOSED) {
1746 1747                  freemsg(mp);
1747 1748                  return;
1748 1749          }
1749 1750  
1750 1751          /*
1751 1752           * In the off-chance that the eager received and responded to
1752 1753           * some other packet while the SYN|ACK was queued, we recalculate
1753 1754           * the ixa_pktlen. It would be better to fix the SYN/accept

↓ open down ↓

1720 lines elided

↑ open up ↑

1754 1755           * multithreading scheme to avoid this complexity.
1755 1756           */
                   /* The packet length is taken from msgdsize() of the message being sent. */
1756 1757          ixa->ixa_pktlen = msgdsize(mp);
1757 1758          (void) conn_ip_output(mp, ixa);
1758 1759  }
1759 1760  
1760 1761  /*
1761 1762   * tcp_send() is called by tcp_wput_data() and returns one of the following:
1762 1763   *
1763 1764   * -1 = failed allocation.
1764      - *  0 = success; burst count reached, or usable send window is too small,
1765      - *      and that we'd rather wait until later before sending again.
     1765 + *  0 = We've either successfully sent data, or our usable send window is too
     1766 + *      small and we'd rather wait until later before sending again.
1766 1767   */
1767 1768  static int
1768 1769  tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1769 1770      const int tcp_hdr_len, const int num_sack_blk, int *usable,
1770 1771      uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1771 1772  {
1772      -        int             num_burst_seg = tcp->tcp_snd_burst;
1773 1773          int             num_lso_seg = 1;
1774 1774          uint_t          lso_usable;
1775 1775          boolean_t       do_lso_send = B_FALSE;
1776 1776          tcp_stack_t     *tcps = tcp->tcp_tcps;
1777 1777          conn_t          *connp = tcp->tcp_connp;
1778 1778          ip_xmit_attr_t  *ixa = connp->conn_ixa;
1779 1779  
1780 1780          /*
1781 1781           * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1782 1782           * the underlying connection is LSO capable. Will check whether having

1783 1783           * enough available data to initiate LSO transmission in the for(){}
1784 1784           * loops.
1785 1785           */
1786 1786          if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1787 1787                  do_lso_send = B_TRUE;

↓ open down ↓

5 lines elided

↑ open up ↑

1788 1788  
1789 1789          for (;;) {
1790 1790                  struct datab    *db;
1791 1791                  tcpha_t         *tcpha;
1792 1792                  uint32_t        sum;
1793 1793                  mblk_t          *mp, *mp1;
1794 1794                  uchar_t         *rptr;
1795 1795                  int             len;
1796 1796  
1797 1797                  /*
1798      -                 * Burst count reached, return successfully.
1799      -                 */
1800      -                if (num_burst_seg == 0)
1801      -                        break;
1802      -
1803      -                /*
1804 1798                   * Calculate the maximum payload length we can send at one
1805 1799                   * time.
1806 1800                   */
1807 1801                  if (do_lso_send) {
1808 1802                          /*
1809      -                         * Check whether be able to to do LSO for the current
1810      -                         * available data.
     1803 +                         * Determine whether or not it's possible to do LSO,
     1804 +                         * and if so, how much data we can send.
1811 1805                           */
1812      -                        if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) {
     1806 +                        if ((*usable - 1) / mss >= 1) {
1813 1807                                  lso_usable = MIN(tcp->tcp_lso_max, *usable);
1814      -                                lso_usable = MIN(lso_usable,
1815      -                                    num_burst_seg * mss);
1816      -
1817 1808                                  num_lso_seg = lso_usable / mss;
1818 1809                                  if (lso_usable % mss) {
1819 1810                                          num_lso_seg++;
1820 1811                                          tcp->tcp_last_sent_len = (ushort_t)
1821 1812                                              (lso_usable % mss);
1822 1813                                  } else {
1823 1814                                          tcp->tcp_last_sent_len = (ushort_t)mss;
1824 1815                                  }
1825 1816                          } else {
1826 1817                                  do_lso_send = B_FALSE;
1827 1818                                  num_lso_seg = 1;
1828 1819                                  lso_usable = mss;
1829 1820                          }
1830 1821                  }
1831 1822  
1832 1823                  ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);
1833      -#ifdef DEBUG
1834      -                DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t,
1835      -                    do_lso_send);
1836      -#endif
1837      -                /*
1838      -                 * Adjust num_burst_seg here.
1839      -                 */
1840      -                num_burst_seg -= num_lso_seg;
1841 1824  
1842 1825                  len = mss;
1843 1826                  if (len > *usable) {
1844 1827                          ASSERT(do_lso_send == B_FALSE);
1845 1828  
1846 1829                          len = *usable;
1847 1830                          if (len <= 0) {
1848 1831                                  /* Terminate the loop */
1849 1832                                  break;  /* success; too small */
1850 1833                          }

1851 1834                          /*
1852 1835                           * Sender silly-window avoidance.
1853 1836                           * Ignore this if we are going to send a
1854 1837                           * zero window probe out.
1855 1838                           *
1856 1839                           * TODO: force data into microscopic window?
1857 1840                           *      ==> (!pushed || (unsent > usable))
1858 1841                           */
1859 1842                          if (len < (tcp->tcp_max_swnd >> 1) &&
1860 1843                              (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&
1861 1844                              !((tcp->tcp_valid_bits & TCP_URG_VALID) &&
1862 1845                              len == 1) && (! tcp->tcp_zero_win_probe)) {
1863 1846                                  /*
1864 1847                                   * If the retransmit timer is not running
1865 1848                                   * we start it so that we will retransmit
1866 1849                                   * in the case when the receiver has
1867 1850                                   * decremented the window.
1868 1851                                   */
1869 1852                                  if (*snxt == tcp->tcp_snxt &&
1870 1853                                      *snxt == tcp->tcp_suna) {
1871 1854                                          /*
1872 1855                                           * We are not supposed to send
1873 1856                                           * anything.  So let's wait a little
1874 1857                                           * bit longer before breaking SWS
1875 1858                                           * avoidance.
1876 1859                                           *
1877 1860                                           * What should the value be?
1878 1861                                           * Suggestion: MAX(init rexmit time,
1879 1862                                           * tcp->tcp_rto)
1880 1863                                           */
1881 1864                                          TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1882 1865                                  }
1883 1866                                  break;  /* success; too small */
1884 1867                          }
1885 1868                  }
1886 1869  
1887 1870                  tcpha = tcp->tcp_tcpha;
1888 1871  
1889 1872                  /*
1890 1873                   * The reason to adjust len here is that we need to set flags
1891 1874                   * and calculate checksum.
1892 1875                   */
1893 1876                  if (do_lso_send)
1894 1877                          len = lso_usable;
1895 1878  
1896 1879                  *usable -= len; /* Approximate - can be adjusted later */
1897 1880                  if (*usable > 0)
1898 1881                          tcpha->tha_flags = TH_ACK;
1899 1882                  else
1900 1883                          tcpha->tha_flags = (TH_ACK | TH_PUSH);
1901 1884  
1902 1885                  /*
1903 1886                   * Prime pump for IP's checksumming on our behalf.
1904 1887                   * Include the adjustment for a source route if any.
1905 1888                   * In case of LSO, the partial pseudo-header checksum should
1906 1889                   * exclusive TCP length, so zero tha_sum before IP calculate
1907 1890                   * pseudo-header checksum for partial checksum offload.
1908 1891                   */
1909 1892                  if (do_lso_send) {
1910 1893                          sum = 0;
1911 1894                  } else {
1912 1895                          sum = len + tcp_hdr_len + connp->conn_sum;
1913 1896                          sum = (sum >> 16) + (sum & 0xFFFF);
1914 1897                  }
1915 1898                  tcpha->tha_sum = htons(sum);
1916 1899                  tcpha->tha_seq = htonl(*snxt);
1917 1900  
1918 1901                  /*
1919 1902                   * Branch off to tcp_xmit_mp() if any of the VALID bits is
1920 1903                   * set.  For the case when TCP_FSS_VALID is the only valid
1921 1904                   * bit (normal active close), branch off only when we think
1922 1905                   * that the FIN flag needs to be set.  Note for this case,
1923 1906                   * that (snxt + len) may not reflect the actual seg_len,
1924 1907                   * as len may be further reduced in tcp_xmit_mp().  If len
1925 1908                   * gets modified, we will end up here again.
1926 1909                   */
1927 1910                  if (tcp->tcp_valid_bits != 0 &&
1928 1911                      (tcp->tcp_valid_bits != TCP_FSS_VALID ||
1929 1912                      ((*snxt + len) == tcp->tcp_fss))) {
1930 1913                          uchar_t         *prev_rptr;
1931 1914                          uint32_t        prev_snxt = tcp->tcp_snxt;
1932 1915  
1933 1916                          if (*tail_unsent == 0) {
1934 1917                                  ASSERT((*xmit_tail)->b_cont != NULL);
1935 1918                                  *xmit_tail = (*xmit_tail)->b_cont;
1936 1919                                  prev_rptr = (*xmit_tail)->b_rptr;
1937 1920                                  *tail_unsent = (int)((*xmit_tail)->b_wptr -
1938 1921                                      (*xmit_tail)->b_rptr);
1939 1922                          } else {
1940 1923                                  prev_rptr = (*xmit_tail)->b_rptr;
1941 1924                                  (*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr -
1942 1925                                      *tail_unsent;
1943 1926                          }
1944 1927                          mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL,
1945 1928                              *snxt, B_FALSE, (uint32_t *)&len, B_FALSE);
1946 1929                          /* Restore tcp_snxt so we get amount sent right. */
1947 1930                          tcp->tcp_snxt = prev_snxt;
1948 1931                          if (prev_rptr == (*xmit_tail)->b_rptr) {
1949 1932                                  /*
1950 1933                                   * If the previous timestamp is still in use,
1951 1934                                   * don't stomp on it.
1952 1935                                   */
1953 1936                                  if ((*xmit_tail)->b_next == NULL) {
1954 1937                                          (*xmit_tail)->b_prev = local_time;
1955 1938                                          (*xmit_tail)->b_next =
1956 1939                                              (mblk_t *)(uintptr_t)(*snxt);
1957 1940                                  }
1958 1941                          } else
1959 1942                                  (*xmit_tail)->b_rptr = prev_rptr;
1960 1943  
1961 1944                          if (mp == NULL) {
1962 1945                                  return (-1);
1963 1946                          }
1964 1947                          mp1 = mp->b_cont;
1965 1948  
1966 1949                          if (len <= mss) /* LSO is unusable (!do_lso_send) */
1967 1950                                  tcp->tcp_last_sent_len = (ushort_t)len;
1968 1951                          while (mp1->b_cont) {
1969 1952                                  *xmit_tail = (*xmit_tail)->b_cont;
1970 1953                                  (*xmit_tail)->b_prev = local_time;
1971 1954                                  (*xmit_tail)->b_next =
1972 1955                                      (mblk_t *)(uintptr_t)(*snxt);
1973 1956                                  mp1 = mp1->b_cont;
1974 1957                          }
1975 1958                          *snxt += len;
1976 1959                          *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
1977 1960                          BUMP_LOCAL(tcp->tcp_obsegs);
1978 1961                          TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1979 1962                          TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1980 1963                          tcp_send_data(tcp, mp);
1981 1964                          continue;
1982 1965                  }
1983 1966  
1984 1967                  *snxt += len;   /* Adjust later if we don't send all of len */
1985 1968                  TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1986 1969                  TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1987 1970  
1988 1971                  if (*tail_unsent) {
1989 1972                          /* Are the bytes above us in flight? */
1990 1973                          rptr = (*xmit_tail)->b_wptr - *tail_unsent;
1991 1974                          if (rptr != (*xmit_tail)->b_rptr) {
1992 1975                                  *tail_unsent -= len;
1993 1976                                  if (len <= mss) /* LSO is unusable */
1994 1977                                          tcp->tcp_last_sent_len = (ushort_t)len;
1995 1978                                  len += total_hdr_len;
1996 1979                                  ixa->ixa_pktlen = len;
1997 1980  
1998 1981                                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
1999 1982                                          tcp->tcp_ipha->ipha_length = htons(len);
2000 1983                                  } else {
2001 1984                                          tcp->tcp_ip6h->ip6_plen =
2002 1985                                              htons(len - IPV6_HDR_LEN);
2003 1986                                  }
2004 1987  
2005 1988                                  mp = dupb(*xmit_tail);
2006 1989                                  if (mp == NULL) {
2007 1990                                          return (-1);    /* out_of_mem */
2008 1991                                  }
2009 1992                                  mp->b_rptr = rptr;
2010 1993                                  /*
2011 1994                                   * If the old timestamp is no longer in use,
2012 1995                                   * sample a new timestamp now.
2013 1996                                   */
2014 1997                                  if ((*xmit_tail)->b_next == NULL) {
2015 1998                                          (*xmit_tail)->b_prev = local_time;
2016 1999                                          (*xmit_tail)->b_next =
2017 2000                                              (mblk_t *)(uintptr_t)(*snxt-len);
2018 2001                                  }
2019 2002                                  goto must_alloc;
2020 2003                          }
2021 2004                  } else {
2022 2005                          *xmit_tail = (*xmit_tail)->b_cont;
2023 2006                          ASSERT((uintptr_t)((*xmit_tail)->b_wptr -
2024 2007                              (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX);
2025 2008                          *tail_unsent = (int)((*xmit_tail)->b_wptr -
2026 2009                              (*xmit_tail)->b_rptr);
2027 2010                  }
2028 2011  
2029 2012                  (*xmit_tail)->b_prev = local_time;
2030 2013                  (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len);
2031 2014  
2032 2015                  *tail_unsent -= len;
2033 2016                  if (len <= mss) /* LSO is unusable (!do_lso_send) */
2034 2017                          tcp->tcp_last_sent_len = (ushort_t)len;
2035 2018  
2036 2019                  len += total_hdr_len;
2037 2020                  ixa->ixa_pktlen = len;
2038 2021  
2039 2022                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
2040 2023                          tcp->tcp_ipha->ipha_length = htons(len);
2041 2024                  } else {
2042 2025                          tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2043 2026                  }
2044 2027  
2045 2028                  mp = dupb(*xmit_tail);
2046 2029                  if (mp == NULL) {
2047 2030                          return (-1);    /* out_of_mem */
2048 2031                  }
2049 2032  
2050 2033                  len = total_hdr_len;
2051 2034                  /*
2052 2035                   * There are four reasons to allocate a new hdr mblk:
2053 2036                   *  1) The bytes above us are in use by another packet
2054 2037                   *  2) We don't have good alignment
2055 2038                   *  3) The mblk is being shared
2056 2039                   *  4) We don't have enough room for a header
2057 2040                   */
2058 2041                  rptr = mp->b_rptr - len;
2059 2042                  if (!OK_32PTR(rptr) ||
2060 2043                      ((db = mp->b_datap), db->db_ref != 2) ||
2061 2044                      rptr < db->db_base) {
2062 2045                          /* NOTE: we assume allocb returns an OK_32PTR */
2063 2046  
2064 2047                  must_alloc:;
2065 2048                          mp1 = allocb(connp->conn_ht_iphc_allocated +
2066 2049                              tcps->tcps_wroff_xtra, BPRI_MED);
2067 2050                          if (mp1 == NULL) {
2068 2051                                  freemsg(mp);
2069 2052                                  return (-1);    /* out_of_mem */
2070 2053                          }
2071 2054                          mp1->b_cont = mp;
2072 2055                          mp = mp1;
2073 2056                          /* Leave room for Link Level header */
2074 2057                          len = total_hdr_len;
2075 2058                          rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2076 2059                          mp->b_wptr = &rptr[len];
2077 2060                  }
2078 2061  
2079 2062                  /*
2080 2063                   * Fill in the header using the template header, and add
2081 2064                   * options such as time-stamp, ECN and/or SACK, as needed.
2082 2065                   */
2083 2066                  tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
2084 2067  
2085 2068                  mp->b_rptr = rptr;
2086 2069  
2087 2070                  if (*tail_unsent) {
2088 2071                          int spill = *tail_unsent;
2089 2072  
2090 2073                          mp1 = mp->b_cont;
2091 2074                          if (mp1 == NULL)
2092 2075                                  mp1 = mp;
2093 2076  
2094 2077                          /*
2095 2078                           * If we're a little short, tack on more mblks until
2096 2079                           * there is no more spillover.
2097 2080                           */
2098 2081                          while (spill < 0) {
2099 2082                                  mblk_t *nmp;
2100 2083                                  int nmpsz;
2101 2084  
2102 2085                                  nmp = (*xmit_tail)->b_cont;
2103 2086                                  nmpsz = MBLKL(nmp);
2104 2087  
2105 2088                                  /*
2106 2089                                   * Excess data in mblk; can we split it?
2107 2090                                   * If LSO is enabled for the connection,
2108 2091                                   * keep on splitting as this is a transient
2109 2092                                   * send path.
2110 2093                                   */
2111 2094                                  if (!do_lso_send && (spill + nmpsz > 0)) {
2112 2095                                          /*
2113 2096                                           * Don't split if stream head was
2114 2097                                           * told to break up larger writes
2115 2098                                           * into smaller ones.
2116 2099                                           */
2117 2100                                          if (tcp->tcp_maxpsz_multiplier > 0)
2118 2101                                                  break;
2119 2102  
2120 2103                                          /*
2121 2104                                           * Next mblk is less than SMSS/2
2122 2105                                           * rounded up to nearest 64-byte;
2123 2106                                           * let it get sent as part of the
2124 2107                                           * next segment.
2125 2108                                           */
2126 2109                                          if (tcp->tcp_localnet &&
2127 2110                                              !tcp->tcp_cork &&
2128 2111                                              (nmpsz < roundup((mss >> 1), 64)))
2129 2112                                                  break;
2130 2113                                  }
2131 2114  
2132 2115                                  *xmit_tail = nmp;
2133 2116                                  ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX);
2134 2117                                  /* Stash for rtt use later */
2135 2118                                  (*xmit_tail)->b_prev = local_time;
2136 2119                                  (*xmit_tail)->b_next =
2137 2120                                      (mblk_t *)(uintptr_t)(*snxt - len);
2138 2121                                  mp1->b_cont = dupb(*xmit_tail);
2139 2122                                  mp1 = mp1->b_cont;
2140 2123  
2141 2124                                  spill += nmpsz;
2142 2125                                  if (mp1 == NULL) {
2143 2126                                          *tail_unsent = spill;
2144 2127                                          freemsg(mp);
2145 2128                                          return (-1);    /* out_of_mem */
2146 2129                                  }
2147 2130                          }
2148 2131  
2149 2132                          /* Trim back any surplus on the last mblk */
2150 2133                          if (spill >= 0) {
2151 2134                                  mp1->b_wptr -= spill;
2152 2135                                  *tail_unsent = spill;
2153 2136                          } else {
2154 2137                                  /*
2155 2138                                   * We did not send everything we could in
2156 2139                                   * order to remain within the b_cont limit.
2157 2140                                   */
2158 2141                                  *usable -= spill;
2159 2142                                  *snxt += spill;
2160 2143                                  tcp->tcp_last_sent_len += spill;
2161 2144                                  TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);
2162 2145                                  /*
2163 2146                                   * Adjust the checksum
2164 2147                                   */
2165 2148                                  tcpha = (tcpha_t *)(rptr +
2166 2149                                      ixa->ixa_ip_hdr_length);
2167 2150                                  sum += spill;
2168 2151                                  sum = (sum >> 16) + (sum & 0xFFFF);
2169 2152                                  tcpha->tha_sum = htons(sum);
2170 2153                                  if (connp->conn_ipversion == IPV4_VERSION) {
2171 2154                                          sum = ntohs(
2172 2155                                              ((ipha_t *)rptr)->ipha_length) +
2173 2156                                              spill;
2174 2157                                          ((ipha_t *)rptr)->ipha_length =
2175 2158                                              htons(sum);
2176 2159                                  } else {
2177 2160                                          sum = ntohs(
2178 2161                                              ((ip6_t *)rptr)->ip6_plen) +
2179 2162                                              spill;
2180 2163                                          ((ip6_t *)rptr)->ip6_plen =
2181 2164                                              htons(sum);
2182 2165                                  }
2183 2166                                  ixa->ixa_pktlen += spill;
2184 2167                                  *tail_unsent = 0;
2185 2168                          }
2186 2169                  }
2187 2170                  if (tcp->tcp_ip_forward_progress) {
2188 2171                          tcp->tcp_ip_forward_progress = B_FALSE;
2189 2172                          ixa->ixa_flags |= IXAF_REACH_CONF;
2190 2173                  } else {
2191 2174                          ixa->ixa_flags &= ~IXAF_REACH_CONF;
2192 2175                  }
2193 2176  
2194 2177                  if (do_lso_send) {
2195 2178                          /* Append LSO information to the mp. */
2196 2179                          lso_info_set(mp, mss, HW_LSO);
2197 2180                          ixa->ixa_fragsize = IP_MAXPACKET;
2198 2181                          ixa->ixa_extra_ident = num_lso_seg - 1;
2199 2182  
2200 2183                          DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg,
2201 2184                              boolean_t, B_TRUE);
2202 2185  
2203 2186                          tcp_send_data(tcp, mp);
2204 2187  
2205 2188                          /*
2206 2189                           * Restore values of ixa_fragsize and ixa_extra_ident.
2207 2190                           */
2208 2191                          ixa->ixa_fragsize = ixa->ixa_pmtu;
2209 2192                          ixa->ixa_extra_ident = 0;
2210 2193                          tcp->tcp_obsegs += num_lso_seg;
2211 2194                          TCP_STAT(tcps, tcp_lso_times);
2212 2195                          TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
2213 2196                  } else {
2214 2197                          /*
2215 2198                           * Make sure to clean up LSO information. Wherever a
2216 2199                           * new mp uses the prepended header room after dupb(),
2217 2200                           * lso_info_cleanup() should be called.
2218 2201                           */
2219 2202                          lso_info_cleanup(mp);
2220 2203                          tcp_send_data(tcp, mp);
2221 2204                          BUMP_LOCAL(tcp->tcp_obsegs);
2222 2205                  }
2223 2206          }
2224 2207  
2225 2208          return (0);
2226 2209  }
2227 2210  
2228 2211  /*
2229 2212   * Initiate closedown sequence on an active connection.  (May be called as
2230 2213   * writer.)  Return value zero for OK return, non-zero for error return.
2231 2214   */
2232 2215  static int
2233 2216  tcp_xmit_end(tcp_t *tcp)
2234 2217  {
2235 2218          mblk_t          *mp;
2236 2219          tcp_stack_t     *tcps = tcp->tcp_tcps;
2237 2220          iulp_t          uinfo;
2238 2221          ip_stack_t      *ipst = tcps->tcps_netstack->netstack_ip;
2239 2222          conn_t          *connp = tcp->tcp_connp;
2240 2223  
2241 2224          if (tcp->tcp_state < TCPS_SYN_RCVD ||
2242 2225              tcp->tcp_state > TCPS_CLOSE_WAIT) {
2243 2226                  /*
2244 2227                   * Invalid state, only states TCPS_SYN_RCVD,
2245 2228                   * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid
2246 2229                   */
2247 2230                  return (-1);
2248 2231          }
2249 2232  
2250 2233          tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent;
2251 2234          tcp->tcp_valid_bits |= TCP_FSS_VALID;
2252 2235          /*
2253 2236           * If there is nothing more unsent, send the FIN now.
2254 2237           * Otherwise, it will go out with the last segment.
2255 2238           */
2256 2239          if (tcp->tcp_unsent == 0) {
2257 2240                  mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
2258 2241                      tcp->tcp_fss, B_FALSE, NULL, B_FALSE);
2259 2242  
2260 2243                  if (mp) {
2261 2244                          tcp_send_data(tcp, mp);
2262 2245                  } else {
2263 2246                          /*
2264 2247                           * Couldn't allocate msg.  Pretend we got it out.
2265 2248                           * Wait for rexmit timeout.
2266 2249                           */
2267 2250                          tcp->tcp_snxt = tcp->tcp_fss + 1;
2268 2251                          TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
2269 2252                  }
2270 2253  
2271 2254                  /*
2272 2255                   * If needed, update tcp_rexmit_snxt as tcp_snxt is
2273 2256                   * changed.
2274 2257                   */
2275 2258                  if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) {
2276 2259                          tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2277 2260                  }
2278 2261          } else {
2279 2262                  /*
2280 2263                   * If tcp->tcp_cork is set, then the data will not get sent,
2281 2264                   * so we have to check that and unset it first.
2282 2265                   */
2283 2266                  if (tcp->tcp_cork)
2284 2267                          tcp->tcp_cork = B_FALSE;
2285 2268                  tcp_wput_data(tcp, NULL, B_FALSE);
2286 2269          }
2287 2270  
2288 2271          /*
2289 2272           * If TCP does not get enough samples of RTT or tcp_rtt_updates
2290 2273           * is 0, don't update the cache.
2291 2274           */
2292 2275          if (tcps->tcps_rtt_updates == 0 ||
2293 2276              tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2294 2277                  return (0);
2295 2278  
2296 2279          /*
2297 2280           * We do not have a good algorithm to update ssthresh at this time.
2298 2281           * So don't do any update.
2299 2282           */
2300 2283          bzero(&uinfo, sizeof (uinfo));
2301 2284          uinfo.iulp_rtt = tcp->tcp_rtt_sa;
2302 2285          uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
2303 2286  
2304 2287          /*
2305 2288           * Note that uinfo is kept for conn_faddr in the DCE. Could update even
2306 2289           * if source routed but we don't.
2307 2290           */
2308 2291          if (connp->conn_ipversion == IPV4_VERSION) {
2309 2292                  if (connp->conn_faddr_v4 !=  tcp->tcp_ipha->ipha_dst) {
2310 2293                          return (0);
2311 2294                  }
2312 2295                  (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2313 2296          } else {
2314 2297                  uint_t ifindex;
2315 2298  
2316 2299                  if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2317 2300                      &tcp->tcp_ip6h->ip6_dst))) {
2318 2301                          return (0);
2319 2302                  }
2320 2303                  ifindex = 0;
2321 2304                  if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2322 2305                          ip_xmit_attr_t *ixa = connp->conn_ixa;
2323 2306  
2324 2307                          /*
2325 2308                           * If we are going to create a DCE we'd better have
2326 2309                           * an ifindex
2327 2310                           */
2328 2311                          if (ixa->ixa_nce != NULL) {
2329 2312                                  ifindex = ixa->ixa_nce->nce_common->ncec_ill->
2330 2313                                      ill_phyint->phyint_ifindex;
2331 2314                          } else {
2332 2315                                  return (0);
2333 2316                          }
2334 2317                  }
2335 2318  
2336 2319                  (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo,
2337 2320                      ipst);
2338 2321          }
2339 2322          return (0);
2340 2323  }
2341 2324  
2342 2325  /*
2343 2326   * Send out a control packet on the tcp connection specified.  This routine
2344 2327   * is typically called where we need a simple ACK or RST generated.
2345 2328   */
2346 2329  void
2347 2330  tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl)
2348 2331  {
2349 2332          uchar_t         *rptr;
2350 2333          tcpha_t         *tcpha;
2351 2334          ipha_t          *ipha = NULL;
2352 2335          ip6_t           *ip6h = NULL;
2353 2336          uint32_t        sum;
2354 2337          int             total_hdr_len;
2355 2338          int             ip_hdr_len;
2356 2339          mblk_t          *mp;
2357 2340          tcp_stack_t     *tcps = tcp->tcp_tcps;
2358 2341          conn_t          *connp = tcp->tcp_connp;
2359 2342          ip_xmit_attr_t  *ixa = connp->conn_ixa;
2360 2343  
2361 2344          /*
2362 2345           * Save sum for use in source route later.
2363 2346           */
2364 2347          sum = connp->conn_ht_ulp_len + connp->conn_sum;
2365 2348          total_hdr_len = connp->conn_ht_iphc_len;
2366 2349          ip_hdr_len = ixa->ixa_ip_hdr_length;
2367 2350  
2368 2351          /* If a text string is passed in with the request, pass it to strlog. */
2369 2352          if (str != NULL && connp->conn_debug) {
2370 2353                  (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
2371 2354                      "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
2372 2355                      str, seq, ack, ctl);
2373 2356          }
2374 2357          mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
2375 2358              BPRI_MED);
2376 2359          if (mp == NULL) {
2377 2360                  return;
2378 2361          }
2379 2362          rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2380 2363          mp->b_rptr = rptr;
2381 2364          mp->b_wptr = &rptr[total_hdr_len];
2382 2365          bcopy(connp->conn_ht_iphc, rptr, total_hdr_len);
2383 2366  
2384 2367          ixa->ixa_pktlen = total_hdr_len;
2385 2368  
2386 2369          if (ixa->ixa_flags & IXAF_IS_IPV4) {
2387 2370                  ipha = (ipha_t *)rptr;
2388 2371                  ipha->ipha_length = htons(total_hdr_len);
2389 2372          } else {
2390 2373                  ip6h = (ip6_t *)rptr;
2391 2374                  ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN);
2392 2375          }
2393 2376          tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2394 2377          tcpha->tha_flags = (uint8_t)ctl;
2395 2378          if (ctl & TH_RST) {
2396 2379                  TCPS_BUMP_MIB(tcps, tcpOutRsts);
2397 2380                  TCPS_BUMP_MIB(tcps, tcpOutControl);
2398 2381                  /*
2399 2382                   * Don't send TSopt w/ TH_RST packets per RFC 1323.
2400 2383                   */
2401 2384                  if (tcp->tcp_snd_ts_ok &&
2402 2385                      tcp->tcp_state > TCPS_SYN_SENT) {
2403 2386                          mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN];
2404 2387                          *(mp->b_wptr) = TCPOPT_EOL;
2405 2388  
2406 2389                          ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN;
2407 2390  
2408 2391                          if (connp->conn_ipversion == IPV4_VERSION) {
2409 2392                                  ipha->ipha_length = htons(total_hdr_len -
2410 2393                                      TCPOPT_REAL_TS_LEN);
2411 2394                          } else {
2412 2395                                  ip6h->ip6_plen = htons(total_hdr_len -
2413 2396                                      IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN);
2414 2397                          }
2415 2398                          tcpha->tha_offset_and_reserved -= (3 << 4);
2416 2399                          sum -= TCPOPT_REAL_TS_LEN;
2417 2400                  }
2418 2401          }
2419 2402          if (ctl & TH_ACK) {
2420 2403                  if (tcp->tcp_snd_ts_ok) {
2421 2404                          uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2422 2405  
2423 2406                          U32_TO_BE32(llbolt,
2424 2407                              (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
2425 2408                          U32_TO_BE32(tcp->tcp_ts_recent,
2426 2409                              (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
2427 2410                  }
2428 2411  
2429 2412                  /* Update the latest receive window size in TCP header. */
2430 2413                  tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
2431 2414                  /* Track what we sent to the peer */
2432 2415                  tcp->tcp_tcpha->tha_win = tcpha->tha_win;
2433 2416                  tcp->tcp_rack = ack;
2434 2417                  tcp->tcp_rack_cnt = 0;
2435 2418                  TCPS_BUMP_MIB(tcps, tcpOutAck);
2436 2419          }
2437 2420          BUMP_LOCAL(tcp->tcp_obsegs);
2438 2421          tcpha->tha_seq = htonl(seq);
2439 2422          tcpha->tha_ack = htonl(ack);
2440 2423          /*
2441 2424           * Include the adjustment for a source route if any.
2442 2425           */
2443 2426          sum = (sum >> 16) + (sum & 0xFFFF);
2444 2427          tcpha->tha_sum = htons(sum);
2445 2428          tcp_send_data(tcp, mp);
2446 2429  }
2447 2430  
2448 2431  /*
2449 2432   * Generate a reset based on an inbound packet, connp is set by caller
2450 2433   * when RST is in response to an unexpected inbound packet for which
2451 2434   * there is active tcp state in the system.
2452 2435   *
2453 2436   * IPSEC NOTE : Try to send the reply with the same protection as it came
2454 2437   * in.  We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t.
2455 2438   * That way the packet will go out at the same level of protection as it
2456 2439   * came in with.
2457 2440   */
2458 2441  static void
2459 2442  tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl,
2460 2443      ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp)
2461 2444  {
2462 2445          ipha_t          *ipha = NULL;
2463 2446          ip6_t           *ip6h = NULL;
2464 2447          ushort_t        len;
2465 2448          tcpha_t         *tcpha;
2466 2449          int             i;
2467 2450          ipaddr_t        v4addr;
2468 2451          in6_addr_t      v6addr;
2469 2452          netstack_t      *ns = ipst->ips_netstack;
2470 2453          tcp_stack_t     *tcps = ns->netstack_tcp;
2471 2454          ip_xmit_attr_t  ixas, *ixa;
2472 2455          uint_t          ip_hdr_len = ira->ira_ip_hdr_length;
2473 2456          boolean_t       need_refrele = B_FALSE;         /* ixa_refrele(ixa) */
2474 2457          ushort_t        port;
2475 2458  
2476 2459          if (!tcp_send_rst_chk(tcps)) {
2477 2460                  TCP_STAT(tcps, tcp_rst_unsent);
2478 2461                  freemsg(mp);
2479 2462                  return;
2480 2463          }
2481 2464  
2482 2465          /*
2483 2466           * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other
2484 2467           * options from the listener. In that case the caller must ensure that
2485 2468           * we are running on the listener = connp squeue.
2486 2469           *
2487 2470           * We get a safe copy of conn_ixa so we don't need to restore anything
2488 2471           * we or ip_output_simple might change in the ixa.
2489 2472           */
2490 2473          if (connp != NULL) {
2491 2474                  ASSERT(connp->conn_on_sqp);
2492 2475  
2493 2476                  ixa = conn_get_ixa_exclusive(connp);
2494 2477                  if (ixa == NULL) {
2495 2478                          TCP_STAT(tcps, tcp_rst_unsent);
2496 2479                          freemsg(mp);
2497 2480                          return;
2498 2481                  }
2499 2482                  need_refrele = B_TRUE;
2500 2483          } else {
2501 2484                  bzero(&ixas, sizeof (ixas));
2502 2485                  ixa = &ixas;
2503 2486                  /*
2504 2487                   * IXAF_VERIFY_SOURCE is overkill since we know the
2505 2488                   * packet was for us.
2506 2489                   */
2507 2490                  ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE;
2508 2491                  ixa->ixa_protocol = IPPROTO_TCP;
2509 2492                  ixa->ixa_zoneid = ira->ira_zoneid;
2510 2493                  ixa->ixa_ifindex = 0;
2511 2494                  ixa->ixa_ipst = ipst;
2512 2495                  ixa->ixa_cred = kcred;
2513 2496                  ixa->ixa_cpid = NOPID;
2514 2497          }
2515 2498  
2516 2499          if (str && tcps->tcps_dbg) {
2517 2500                  (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
2518 2501                      "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
2519 2502                      "flags 0x%x",
2520 2503                      str, seq, ack, ctl);
2521 2504          }
2522 2505          if (mp->b_datap->db_ref != 1) {
2523 2506                  mblk_t *mp1 = copyb(mp);
2524 2507                  freemsg(mp);
2525 2508                  mp = mp1;
2526 2509                  if (mp == NULL)
2527 2510                          goto done;
2528 2511          } else if (mp->b_cont) {
2529 2512                  freemsg(mp->b_cont);
2530 2513                  mp->b_cont = NULL;
2531 2514                  DB_CKSUMFLAGS(mp) = 0;
2532 2515          }
2533 2516          /*
2534 2517           * We skip reversing source route here.
2535 2518           * (for now we replace all IP options with EOL)
2536 2519           */
2537 2520          if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2538 2521                  ipha = (ipha_t *)mp->b_rptr;
2539 2522                  for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
2540 2523                          mp->b_rptr[i] = IPOPT_EOL;
2541 2524                  /*
2542 2525                   * Make sure that src address isn't flagrantly invalid.
2543 2526                   * Not all broadcast address checking for the src address
2544 2527                   * is possible, since we don't know the netmask of the src
2545 2528                   * addr.  No check for destination address is done, since
2546 2529                   * IP will not pass up a packet with a broadcast dest
2547 2530                   * address to TCP.  Similar checks are done below for IPv6.
2548 2531                   */
2549 2532                  if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST ||
2550 2533                      CLASSD(ipha->ipha_src)) {
2551 2534                          BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
2552 2535                          ip_drop_input("ipIfStatsInDiscards", mp, NULL);
2553 2536                          freemsg(mp);
2554 2537                          goto done;
2555 2538                  }
2556 2539          } else {
2557 2540                  ip6h = (ip6_t *)mp->b_rptr;
2558 2541  
2559 2542                  if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) ||
2560 2543                      IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
2561 2544                          BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards);
2562 2545                          ip_drop_input("ipIfStatsInDiscards", mp, NULL);
2563 2546                          freemsg(mp);
2564 2547                          goto done;
2565 2548                  }
2566 2549  
2567 2550                  /* Remove any extension headers assuming partial overlay */
2568 2551                  if (ip_hdr_len > IPV6_HDR_LEN) {
2569 2552                          uint8_t *to;
2570 2553  
2571 2554                          to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN;
2572 2555                          ovbcopy(ip6h, to, IPV6_HDR_LEN);
2573 2556                          mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN;
2574 2557                          ip_hdr_len = IPV6_HDR_LEN;
2575 2558                          ip6h = (ip6_t *)mp->b_rptr;
2576 2559                          ip6h->ip6_nxt = IPPROTO_TCP;
2577 2560                  }
2578 2561          }
2579 2562          tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
2580 2563          if (tcpha->tha_flags & TH_RST) {
2581 2564                  freemsg(mp);
2582 2565                  goto done;
2583 2566          }
2584 2567          tcpha->tha_offset_and_reserved = (5 << 4);
2585 2568          len = ip_hdr_len + sizeof (tcpha_t);
2586 2569          mp->b_wptr = &mp->b_rptr[len];
2587 2570          if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2588 2571                  ipha->ipha_length = htons(len);
2589 2572                  /* Swap addresses */
2590 2573                  v4addr = ipha->ipha_src;
2591 2574                  ipha->ipha_src = ipha->ipha_dst;
2592 2575                  ipha->ipha_dst = v4addr;
2593 2576                  ipha->ipha_ident = 0;
2594 2577                  ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
2595 2578                  ixa->ixa_flags |= IXAF_IS_IPV4;
2596 2579                  ixa->ixa_ip_hdr_length = ip_hdr_len;
2597 2580          } else {
2598 2581                  ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2599 2582                  /* Swap addresses */
2600 2583                  v6addr = ip6h->ip6_src;
2601 2584                  ip6h->ip6_src = ip6h->ip6_dst;
2602 2585                  ip6h->ip6_dst = v6addr;
2603 2586                  ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit;
2604 2587                  ixa->ixa_flags &= ~IXAF_IS_IPV4;
2605 2588  
2606 2589                  if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) {
2607 2590                          ixa->ixa_flags |= IXAF_SCOPEID_SET;
2608 2591                          ixa->ixa_scopeid = ira->ira_ruifindex;
2609 2592                  }
2610 2593                  ixa->ixa_ip_hdr_length = IPV6_HDR_LEN;
2611 2594          }
2612 2595          ixa->ixa_pktlen = len;
2613 2596  
2614 2597          /* Swap the ports */
2615 2598          port = tcpha->tha_fport;
2616 2599          tcpha->tha_fport = tcpha->tha_lport;
2617 2600          tcpha->tha_lport = port;
2618 2601  
2619 2602          tcpha->tha_ack = htonl(ack);
2620 2603          tcpha->tha_seq = htonl(seq);
2621 2604          tcpha->tha_win = 0;
2622 2605          tcpha->tha_sum = htons(sizeof (tcpha_t));
2623 2606          tcpha->tha_flags = (uint8_t)ctl;
2624 2607          if (ctl & TH_RST) {
2625 2608                  if (ctl & TH_ACK) {
2626 2609                          /*
2627 2610                           * Probe connection rejection here.
2628 2611                           * tcp_xmit_listeners_reset() drops non-SYN segments
2629 2612                           * that do not specify TH_ACK in their flags without
2630 2613                           * calling this function.  As a consequence, if this
2631 2614                           * function is called with a TH_RST|TH_ACK ctl argument,
2632 2615                           * it is being called in response to a SYN segment
2633 2616                           * and thus the tcp:::accept-refused probe point
2634 2617                           * is valid here.
2635 2618                           */
2636 2619                          DTRACE_TCP5(accept__refused, mblk_t *, NULL,
2637 2620                              void, NULL, void_ip_t *, mp->b_rptr, tcp_t *, NULL,
2638 2621                              tcph_t *, tcpha);
2639 2622                  }
2640 2623                  TCPS_BUMP_MIB(tcps, tcpOutRsts);
2641 2624                  TCPS_BUMP_MIB(tcps, tcpOutControl);
2642 2625          }
2643 2626  
2644 2627          /* Discard any old label */
2645 2628          if (ixa->ixa_free_flags & IXA_FREE_TSL) {
2646 2629                  ASSERT(ixa->ixa_tsl != NULL);
2647 2630                  label_rele(ixa->ixa_tsl);
2648 2631                  ixa->ixa_free_flags &= ~IXA_FREE_TSL;
2649 2632          }
2650 2633          ixa->ixa_tsl = ira->ira_tsl;    /* Behave as a multi-level responder */
2651 2634  
2652 2635          if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2653 2636                  /*
2654 2637                   * Apply IPsec based on how IPsec was applied to
2655 2638                   * the packet that caused the RST.
2656 2639                   */
2657 2640                  if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) {
2658 2641                          BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2659 2642                          /* Note: mp already consumed and ip_drop_packet done */
2660 2643                          goto done;
2661 2644                  }
2662 2645          } else {
2663 2646                  /*
2664 2647                   * This is in clear. The RST message we are building
2665 2648                   * here should go out in clear, independent of our policy.
2666 2649                   */
2667 2650                  ixa->ixa_flags |= IXAF_NO_IPSEC;
2668 2651          }
2669 2652  
2670 2653          DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, ixa,
2671 2654              __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, NULL,
2672 2655              __dtrace_tcp_tcph_t *, tcpha);
2673 2656  
2674 2657          /*
2675 2658           * NOTE:  one might consider tracing a TCP packet here, but
2676 2659           * this function has no active TCP state and no tcp structure
2677 2660           * that has a trace buffer.  If we traced here, we would have
2678 2661           * to keep a local trace buffer in tcp_record_trace().
2679 2662           */
2680 2663  
2681 2664          (void) ip_output_simple(mp, ixa);
2682 2665  done:
2683 2666          ixa_cleanup(ixa);
2684 2667          if (need_refrele) {
2685 2668                  ASSERT(ixa != &ixas);
2686 2669                  ixa_refrele(ixa);
2687 2670          }
2688 2671  }
2689 2672  
2690 2673  /*
2691 2674   * Generate a "no listener here" RST in response to an "unknown" segment.
2692 2675   * connp is set by caller when RST is in response to an unexpected
2693 2676   * inbound packet for which there is active tcp state in the system.
2694 2677   * Note that we are reusing the incoming mp to construct the outgoing RST.
2695 2678   */
2696 2679  void
2697 2680  tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst,
2698 2681      conn_t *connp)
2699 2682  {
2700 2683          uchar_t         *rptr;
2701 2684          uint32_t        seg_len;
2702 2685          tcpha_t         *tcpha;
2703 2686          uint32_t        seg_seq;
2704 2687          uint32_t        seg_ack;
2705 2688          uint_t          flags;
2706 2689          ipha_t          *ipha;
2707 2690          ip6_t           *ip6h;
2708 2691          boolean_t       policy_present;
2709 2692          netstack_t      *ns = ipst->ips_netstack;
2710 2693          tcp_stack_t     *tcps = ns->netstack_tcp;
2711 2694          ipsec_stack_t   *ipss = tcps->tcps_netstack->netstack_ipsec;
2712 2695          uint_t          ip_hdr_len = ira->ira_ip_hdr_length;
2713 2696  
2714 2697          TCP_STAT(tcps, tcp_no_listener);
2715 2698  
2716 2699          /*
2717 2700           * DTrace this "unknown" segment as a tcp:::receive, as we did
2718 2701           * just receive something that was TCP.
2719 2702           */
2720 2703          DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, NULL,
2721 2704              __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, NULL,
2722 2705              __dtrace_tcp_tcph_t *, &mp->b_rptr[ip_hdr_len]);
2723 2706  
2724 2707          if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2725 2708                  policy_present = ipss->ipsec_inbound_v4_policy_present;
2726 2709                  ipha = (ipha_t *)mp->b_rptr;
2727 2710                  ip6h = NULL;
2728 2711          } else {
2729 2712                  policy_present = ipss->ipsec_inbound_v6_policy_present;
2730 2713                  ipha = NULL;
2731 2714                  ip6h = (ip6_t *)mp->b_rptr;
2732 2715          }
2733 2716  
2734 2717          if (policy_present) {
2735 2718                  /*
2736 2719                   * The conn_t parameter is NULL because we already know
2737 2720                   * nobody's home.
2738 2721                   */
2739 2722                  mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h,
2740 2723                      ira, ns);
2741 2724                  if (mp == NULL)
2742 2725                          return;
2743 2726          }
2744 2727          if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
2745 2728                  DTRACE_PROBE2(
2746 2729                      tx__ip__log__error__nolistener__tcp,
2747 2730                      char *, "Could not reply with RST to mp(1)",
2748 2731                      mblk_t *, mp);
2749 2732                  ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
2750 2733                  freemsg(mp);
2751 2734                  return;
2752 2735          }
2753 2736  
2754 2737          rptr = mp->b_rptr;
2755 2738  
2756 2739          tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2757 2740          seg_seq = ntohl(tcpha->tha_seq);
2758 2741          seg_ack = ntohl(tcpha->tha_ack);
2759 2742          flags = tcpha->tha_flags;
2760 2743  
2761 2744          seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len);
2762 2745          if (flags & TH_RST) {
2763 2746                  freemsg(mp);
2764 2747          } else if (flags & TH_ACK) {
2765 2748                  tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST,
2766 2749                      ira, ipst, connp);
2767 2750          } else {
2768 2751                  if (flags & TH_SYN) {
2769 2752                          seg_len++;
2770 2753                  } else {
2771 2754                          /*
2772 2755                           * Here we violate the RFC.  Note that a normal
2773 2756                           * TCP will never send a segment without the ACK
2774 2757                           * flag, except for RST or SYN segment.  This
2775 2758                           * segment is neither.  Just drop it on the
2776 2759                           * floor.
2777 2760                           */
2778 2761                          freemsg(mp);
2779 2762                          TCP_STAT(tcps, tcp_rst_unsent);
2780 2763                          return;
2781 2764                  }
2782 2765  
2783 2766                  tcp_xmit_early_reset("no tcp, reset/ack", mp, 0,
2784 2767                      seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp);
2785 2768          }
2786 2769  }
2787 2770  
2788 2771  /*
2789 2772   * Helper function for tcp_xmit_mp() in handling connection set up flag
2790 2773   * options setting.
2791 2774   */
2792 2775  static void
2793 2776  tcp_xmit_mp_aux_iss(tcp_t *tcp, conn_t *connp, tcpha_t *tcpha, mblk_t *mp,
2794 2777      uint_t *flags)
2795 2778  {
2796 2779          uint32_t u1;
2797 2780          uint8_t *wptr = mp->b_wptr;
2798 2781          tcp_stack_t *tcps = tcp->tcp_tcps;
2799 2782          boolean_t add_sack = B_FALSE;
2800 2783  
2801 2784          /*
2802 2785           * If TCP_ISS_VALID and the seq number is tcp_iss,
2803 2786           * TCP can only be in SYN-SENT, SYN-RCVD or
2804 2787           * FIN-WAIT-1 state.  It can be FIN-WAIT-1 if
2805 2788           * our SYN is not ack'ed but the app closes this
2806 2789           * TCP connection.
2807 2790           */
2808 2791          ASSERT(tcp->tcp_state == TCPS_SYN_SENT ||
2809 2792              tcp->tcp_state == TCPS_SYN_RCVD ||
2810 2793              tcp->tcp_state == TCPS_FIN_WAIT_1);
2811 2794  
2812 2795          /*
2813 2796           * Tack on the MSS option.  It is always needed
2814 2797           * for both active and passive open.
2815 2798           *
2816 2799           * MSS option value should be interface MTU - MIN
2817 2800           * TCP/IP header according to RFC 793 as it means
2818 2801           * the maximum segment size TCP can receive.  But
2819 2802           * to get around some broken middle boxes/end hosts
2820 2803           * out there, we allow the option value to be the
2821 2804           * same as the MSS option size on the peer side.
2822 2805           * In this way, the other side will not send
2823 2806           * anything larger than they can receive.
2824 2807           *
2825 2808           * Note that for SYN_SENT state, the ndd param
2826 2809           * tcp_use_smss_as_mss_opt has no effect as we
2827 2810           * don't know the peer's MSS option value. So
2828 2811           * the only case we need to take care of is in
2829 2812           * SYN_RCVD state, which is done later.
2830 2813           */
2831 2814          wptr[0] = TCPOPT_MAXSEG;
2832 2815          wptr[1] = TCPOPT_MAXSEG_LEN;
2833 2816          wptr += 2;
2834 2817          u1 = tcp->tcp_initial_pmtu - (connp->conn_ipversion == IPV4_VERSION ?
2835 2818              IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - TCP_MIN_HEADER_LENGTH;
2836 2819          U16_TO_BE16(u1, wptr);
2837 2820          wptr += 2;
2838 2821  
2839 2822          /* Update the offset to cover the additional word */
2840 2823          tcpha->tha_offset_and_reserved += (1 << 4);
2841 2824  
2842 2825          switch (tcp->tcp_state) {
2843 2826          case TCPS_SYN_SENT:
2844 2827                  *flags = TH_SYN;
2845 2828  
2846 2829                  if (tcp->tcp_snd_sack_ok)
2847 2830                          add_sack = B_TRUE;
2848 2831  
2849 2832                  if (tcp->tcp_snd_ts_ok) {
2850 2833                          uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2851 2834  
2852 2835                          if (add_sack) {
2853 2836                                  wptr[0] = TCPOPT_SACK_PERMITTED;
2854 2837                                  wptr[1] = TCPOPT_SACK_OK_LEN;
2855 2838                                  add_sack = B_FALSE;
2856 2839                          } else {
2857 2840                                  wptr[0] = TCPOPT_NOP;
2858 2841                                  wptr[1] = TCPOPT_NOP;
2859 2842                          }
2860 2843                          wptr[2] = TCPOPT_TSTAMP;
2861 2844                          wptr[3] = TCPOPT_TSTAMP_LEN;
2862 2845                          wptr += 4;
2863 2846                          U32_TO_BE32(llbolt, wptr);
2864 2847                          wptr += 4;
2865 2848                          ASSERT(tcp->tcp_ts_recent == 0);
2866 2849                          U32_TO_BE32(0L, wptr);
2867 2850                          wptr += 4;
2868 2851                          tcpha->tha_offset_and_reserved += (3 << 4);
2869 2852                  }
2870 2853  
2871 2854                  /*
2872 2855                   * Set up all the bits to tell other side
2873 2856                   * we are ECN capable.
2874 2857                   */
2875 2858                  if (tcp->tcp_ecn_ok)
2876 2859                          *flags |= (TH_ECE | TH_CWR);
2877 2860  
2878 2861                  break;
2879 2862  
2880 2863          case TCPS_SYN_RCVD:
2881 2864                  *flags |= TH_SYN;
2882 2865  
2883 2866                  /*
2884 2867                   * Reset the MSS option value to be SMSS
2885 2868                   * We should probably add back the bytes
2886 2869                   * for timestamp option and IPsec.  We
2887 2870                   * don't do that as this is a workaround
2888 2871                   * for broken middle boxes/end hosts, it
2889 2872                   * is better for us to be more cautious.
2890 2873                   * They may not take these things into
2891 2874                   * account in their SMSS calculation.  Thus
2892 2875                   * the peer's calculated SMSS may be smaller
2893 2876                   * than what it can be.  This should be OK.
2894 2877                   */
2895 2878                  if (tcps->tcps_use_smss_as_mss_opt) {
2896 2879                          u1 = tcp->tcp_mss;
2897 2880                          /*
2898 2881                           * Note that wptr points just past the MSS
2899 2882                           * option value.
2900 2883                           */
2901 2884                          U16_TO_BE16(u1, wptr - 2);
2902 2885                  }
2903 2886  
2904 2887                  /*
2905 2888                   * tcp_snd_ts_ok can only be set in TCPS_SYN_RCVD
2906 2889                   * when the peer also uses timestamps option.  And
2907 2890                   * the TCP header template must have already been
2908 2891                   * updated to include the timestamps option.
2909 2892                   */
2910 2893                  if (tcp->tcp_snd_sack_ok) {
2911 2894                          if (tcp->tcp_snd_ts_ok) {
2912 2895                                  uint8_t *tmp_wptr;
2913 2896  
2914 2897                                  /*
2915 2898                                   * Use the NOP in the header just
2916 2899                                   * before timestamps opton.
2917 2900                                   */
2918 2901                                  tmp_wptr = (uint8_t *)tcpha +
2919 2902                                      TCP_MIN_HEADER_LENGTH;
2920 2903                                  ASSERT(tmp_wptr[0] == TCPOPT_NOP &&
2921 2904                                      tmp_wptr[1] == TCPOPT_NOP);
2922 2905                                  tmp_wptr[0] = TCPOPT_SACK_PERMITTED;
2923 2906                                  tmp_wptr[1] = TCPOPT_SACK_OK_LEN;
2924 2907                          } else {
2925 2908                                  add_sack = B_TRUE;
2926 2909                          }
2927 2910                  }
2928 2911  
2929 2912  
2930 2913                  /*
2931 2914                   * If the other side is ECN capable, reply
2932 2915                   * that we are also ECN capable.
2933 2916                   */
2934 2917                  if (tcp->tcp_ecn_ok)
2935 2918                          *flags |= TH_ECE;
2936 2919                  break;
2937 2920  
2938 2921          default:
2939 2922                  /*
2940 2923                   * The above ASSERT() makes sure that this
2941 2924                   * must be FIN-WAIT-1 state.  Our SYN has
2942 2925                   * not been ack'ed so retransmit it.
2943 2926                   */
2944 2927                  *flags |= TH_SYN;
2945 2928                  break;
2946 2929          }
2947 2930  
2948 2931          if (add_sack) {
2949 2932                  wptr[0] = TCPOPT_NOP;
2950 2933                  wptr[1] = TCPOPT_NOP;
2951 2934                  wptr[2] = TCPOPT_SACK_PERMITTED;
2952 2935                  wptr[3] = TCPOPT_SACK_OK_LEN;
2953 2936                  wptr += TCPOPT_REAL_SACK_OK_LEN;
2954 2937                  tcpha->tha_offset_and_reserved += (1 << 4);
2955 2938          }
2956 2939  
2957 2940          if (tcp->tcp_snd_ws_ok) {
2958 2941                  wptr[0] =  TCPOPT_NOP;
2959 2942                  wptr[1] =  TCPOPT_WSCALE;
2960 2943                  wptr[2] =  TCPOPT_WS_LEN;
2961 2944                  wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
2962 2945                  wptr += TCPOPT_REAL_WS_LEN;
2963 2946                  tcpha->tha_offset_and_reserved += (1 << 4);
2964 2947          }
2965 2948  
2966 2949          mp->b_wptr = wptr;
2967 2950          u1 = (int)(mp->b_wptr - mp->b_rptr);
2968 2951          /*
2969 2952           * Get IP set to checksum on our behalf
2970 2953           * Include the adjustment for a source route if any.
2971 2954           */
2972 2955          u1 += connp->conn_sum;
2973 2956          u1 = (u1 >> 16) + (u1 & 0xFFFF);
2974 2957          tcpha->tha_sum = htons(u1);
2975 2958          TCPS_BUMP_MIB(tcps, tcpOutControl);
2976 2959  }
2977 2960  
2978 2961  /*
2979 2962   * Helper function for tcp_xmit_mp() in handling connection tear down
2980 2963   * flag setting and state changes.
2981 2964   */
2982 2965  static void
2983 2966  tcp_xmit_mp_aux_fss(tcp_t *tcp, ip_xmit_attr_t *ixa, uint_t *flags)
2984 2967  {
2985 2968          if (!tcp->tcp_fin_acked) {
2986 2969                  *flags |= TH_FIN;
2987 2970                  TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutControl);
2988 2971          }
2989 2972          if (!tcp->tcp_fin_sent) {
2990 2973                  tcp->tcp_fin_sent = B_TRUE;
2991 2974                  switch (tcp->tcp_state) {
2992 2975                  case TCPS_SYN_RCVD:
2993 2976                          tcp->tcp_state = TCPS_FIN_WAIT_1;
2994 2977                          DTRACE_TCP6(state__change, void, NULL,
2995 2978                              ip_xmit_attr_t *, ixa, void, NULL,
2996 2979                              tcp_t *, tcp, void, NULL,
2997 2980                              int32_t, TCPS_SYN_RCVD);
2998 2981                          break;
2999 2982                  case TCPS_ESTABLISHED:
3000 2983                          tcp->tcp_state = TCPS_FIN_WAIT_1;
3001 2984                          DTRACE_TCP6(state__change, void, NULL,
3002 2985                              ip_xmit_attr_t *, ixa, void, NULL,
3003 2986                              tcp_t *, tcp, void, NULL,
3004 2987                              int32_t, TCPS_ESTABLISHED);
3005 2988                          break;
3006 2989                  case TCPS_CLOSE_WAIT:
3007 2990                          tcp->tcp_state = TCPS_LAST_ACK;
3008 2991                          DTRACE_TCP6(state__change, void, NULL,
3009 2992                              ip_xmit_attr_t *, ixa, void, NULL,
3010 2993                              tcp_t *, tcp, void, NULL,
3011 2994                              int32_t, TCPS_CLOSE_WAIT);
3012 2995                          break;
3013 2996                  }
3014 2997                  if (tcp->tcp_suna == tcp->tcp_snxt)
3015 2998                          TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3016 2999                  tcp->tcp_snxt = tcp->tcp_fss + 1;
3017 3000          }
3018 3001  }
3019 3002  
3020 3003  /*
3021 3004   * tcp_xmit_mp is called to return a pointer to an mblk chain complete with
3022 3005   * ip and tcp header ready to pass down to IP.  If the mp passed in is
3023 3006   * non-NULL, then up to max_to_send bytes of data will be dup'ed off that
3024 3007   * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary
3025 3008   * otherwise it will dup partial mblks.)
3026 3009   * Otherwise, an appropriate ACK packet will be generated.  This
3027 3010   * routine is not usually called to send new data for the first time.  It
3028 3011   * is mostly called out of the timer for retransmits, and to generate ACKs.
3029 3012   *
3030 3013   * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will
3031 3014   * be adjusted by *offset.  And after dupb(), the offset and the ending mblk
3032 3015   * of the original mblk chain will be returned in *offset and *end_mp.
3033 3016   */
3034 3017  mblk_t *
3035 3018  tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
3036 3019      mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len,
3037 3020      boolean_t rexmit)
3038 3021  {
3039 3022          int     data_length;
3040 3023          int32_t off = 0;
3041 3024          uint_t  flags;
3042 3025          mblk_t  *mp1;
3043 3026          mblk_t  *mp2;
3044 3027          uchar_t *rptr;
3045 3028          tcpha_t *tcpha;
3046 3029          int32_t num_sack_blk = 0;
3047 3030          int32_t sack_opt_len = 0;
3048 3031          tcp_stack_t     *tcps = tcp->tcp_tcps;
3049 3032          conn_t          *connp = tcp->tcp_connp;
3050 3033          ip_xmit_attr_t  *ixa = connp->conn_ixa;
3051 3034  
3052 3035          /* Allocate for our maximum TCP header + link-level */
3053 3036          mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
3054 3037              BPRI_MED);
3055 3038          if (mp1 == NULL)
3056 3039                  return (NULL);
3057 3040          data_length = 0;
3058 3041  
3059 3042          /*
3060 3043           * Note that tcp_mss has been adjusted to take into account the
3061 3044           * timestamp option if applicable.  Because SACK options do not
3062 3045           * appear in every TCP segments and they are of variable lengths,
3063 3046           * they cannot be included in tcp_mss.  Thus we need to calculate
3064 3047           * the actual segment length when we need to send a segment which
3065 3048           * includes SACK options.
3066 3049           */
3067 3050          if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
3068 3051                  num_sack_blk = MIN(tcp->tcp_max_sack_blk,
3069 3052                      tcp->tcp_num_sack_blk);
3070 3053                  sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
3071 3054                      TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
3072 3055                  if (max_to_send + sack_opt_len > tcp->tcp_mss)
3073 3056                          max_to_send -= sack_opt_len;
3074 3057          }
3075 3058  
3076 3059          if (offset != NULL) {
3077 3060                  off = *offset;
3078 3061                  /* We use offset as an indicator that end_mp is not NULL. */
3079 3062                  *end_mp = NULL;
3080 3063          }
3081 3064          for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) {
3082 3065                  /* This could be faster with cooperation from downstream */
3083 3066                  if (mp2 != mp1 && !sendall &&
3084 3067                      data_length + (int)(mp->b_wptr - mp->b_rptr) >
3085 3068                      max_to_send)
3086 3069                          /*
3087 3070                           * Don't send the next mblk since the whole mblk
3088 3071                           * does not fit.
3089 3072                           */
3090 3073                          break;
3091 3074                  mp2->b_cont = dupb(mp);
3092 3075                  mp2 = mp2->b_cont;
3093 3076                  if (!mp2) {
3094 3077                          freemsg(mp1);
3095 3078                          return (NULL);
3096 3079                  }
3097 3080                  mp2->b_rptr += off;
3098 3081                  ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
3099 3082                      (uintptr_t)INT_MAX);
3100 3083  
3101 3084                  data_length += (int)(mp2->b_wptr - mp2->b_rptr);
3102 3085                  if (data_length > max_to_send) {
3103 3086                          mp2->b_wptr -= data_length - max_to_send;
3104 3087                          data_length = max_to_send;
3105 3088                          off = mp2->b_wptr - mp->b_rptr;
3106 3089                          break;
3107 3090                  } else {
3108 3091                          off = 0;
3109 3092                  }
3110 3093          }
3111 3094          if (offset != NULL) {
3112 3095                  *offset = off;
3113 3096                  *end_mp = mp;
3114 3097          }
3115 3098          if (seg_len != NULL) {
3116 3099                  *seg_len = data_length;
3117 3100          }
3118 3101  
3119 3102          /* Update the latest receive window size in TCP header. */
3120 3103          tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
3121 3104  
3122 3105          rptr = mp1->b_rptr + tcps->tcps_wroff_xtra;
3123 3106          mp1->b_rptr = rptr;
3124 3107          mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len;
3125 3108          bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len);
3126 3109          tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length];
3127 3110          tcpha->tha_seq = htonl(seq);
3128 3111  
3129 3112          /*
3130 3113           * Use tcp_unsent to determine if the PUSH bit should be used assumes
3131 3114           * that this function was called from tcp_wput_data. Thus, when called
3132 3115           * to retransmit data the setting of the PUSH bit may appear some
3133 3116           * what random in that it might get set when it should not. This
3134 3117           * should not pose any performance issues.
3135 3118           */
3136 3119          if (data_length != 0 && (tcp->tcp_unsent == 0 ||
3137 3120              tcp->tcp_unsent == data_length)) {
3138 3121                  flags = TH_ACK | TH_PUSH;
3139 3122          } else {
3140 3123                  flags = TH_ACK;
3141 3124          }
3142 3125  
3143 3126          if (tcp->tcp_ecn_ok) {
3144 3127                  if (tcp->tcp_ecn_echo_on)
3145 3128                          flags |= TH_ECE;
3146 3129  
3147 3130                  /*
3148 3131                   * Only set ECT bit and ECN_CWR if a segment contains new data.
3149 3132                   * There is no TCP flow control for non-data segments, and
3150 3133                   * only data segment is transmitted reliably.
3151 3134                   */
3152 3135                  if (data_length > 0 && !rexmit) {
3153 3136                          TCP_SET_ECT(tcp, rptr);
3154 3137                          if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
3155 3138                                  flags |= TH_CWR;
3156 3139                                  tcp->tcp_ecn_cwr_sent = B_TRUE;
3157 3140                          }
3158 3141                  }
3159 3142          }
3160 3143  
3161 3144          /* Check if there is any special processing needs to be done. */
3162 3145          if (tcp->tcp_valid_bits) {
3163 3146                  uint32_t u1;
3164 3147  
3165 3148                  /* We don't allow having SYN and FIN in the same segment... */
3166 3149                  if ((tcp->tcp_valid_bits & TCP_ISS_VALID) &&
3167 3150                      seq == tcp->tcp_iss) {
3168 3151                          /* Need to do connection set up processing. */
3169 3152                          tcp_xmit_mp_aux_iss(tcp, connp, tcpha, mp1, &flags);
3170 3153                  } else if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
3171 3154                      (seq + data_length) == tcp->tcp_fss) {
3172 3155                          /* Need to do connection tear down processing. */
3173 3156                          tcp_xmit_mp_aux_fss(tcp, ixa, &flags);
3174 3157                  }
3175 3158  
3176 3159                  /*
3177 3160                   * Need to do urgent pointer processing.
3178 3161                   *
3179 3162                   * Note the trick here.  u1 is unsigned.  When tcp_urg
3180 3163                   * is smaller than seq, u1 will become a very huge value.
3181 3164                   * So the comparison will fail.  Also note that tcp_urp
3182 3165                   * should be positive, see RFC 793 page 17.
3183 3166                   */
3184 3167                  u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION;
3185 3168                  if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 &&
3186 3169                      u1 < (uint32_t)(64 * 1024)) {
3187 3170                          flags |= TH_URG;
3188 3171                          TCPS_BUMP_MIB(tcps, tcpOutUrg);
3189 3172                          tcpha->tha_urp = htons(u1);
3190 3173                  }
3191 3174          }
3192 3175          tcpha->tha_flags = (uchar_t)flags;
3193 3176          tcp->tcp_rack = tcp->tcp_rnxt;
3194 3177          tcp->tcp_rack_cnt = 0;
3195 3178  
3196 3179          /* Fill in the current value of timestamps option. */
3197 3180          if (tcp->tcp_snd_ts_ok) {
3198 3181                  if (tcp->tcp_state != TCPS_SYN_SENT) {
3199 3182                          uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
3200 3183  
3201 3184                          U32_TO_BE32(llbolt,
3202 3185                              (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
3203 3186                          U32_TO_BE32(tcp->tcp_ts_recent,
3204 3187                              (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
3205 3188                  }
3206 3189          }
3207 3190  
3208 3191          /* Fill in the SACK blocks. */
3209 3192          if (num_sack_blk > 0) {
3210 3193                  uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len;
3211 3194                  sack_blk_t *tmp;
3212 3195                  int32_t i;
3213 3196  
3214 3197                  wptr[0] = TCPOPT_NOP;
3215 3198                  wptr[1] = TCPOPT_NOP;
3216 3199                  wptr[2] = TCPOPT_SACK;
3217 3200                  wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
3218 3201                      sizeof (sack_blk_t);
3219 3202                  wptr += TCPOPT_REAL_SACK_LEN;
3220 3203  
3221 3204                  tmp = tcp->tcp_sack_list;
3222 3205                  for (i = 0; i < num_sack_blk; i++) {
3223 3206                          U32_TO_BE32(tmp[i].begin, wptr);
3224 3207                          wptr += sizeof (tcp_seq);
3225 3208                          U32_TO_BE32(tmp[i].end, wptr);
3226 3209                          wptr += sizeof (tcp_seq);
3227 3210                  }
3228 3211                  tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4);
3229 3212          }
3230 3213          ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX);
3231 3214          data_length += (int)(mp1->b_wptr - rptr);
3232 3215  
3233 3216          ixa->ixa_pktlen = data_length;
3234 3217  
3235 3218          if (ixa->ixa_flags & IXAF_IS_IPV4) {
3236 3219                  ((ipha_t *)rptr)->ipha_length = htons(data_length);
3237 3220          } else {
3238 3221                  ip6_t *ip6 = (ip6_t *)rptr;
3239 3222  
3240 3223                  ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN);
3241 3224          }
3242 3225  
3243 3226          /*
3244 3227           * Prime pump for IP
3245 3228           * Include the adjustment for a source route if any.
3246 3229           */
3247 3230          data_length -= ixa->ixa_ip_hdr_length;
3248 3231          data_length += connp->conn_sum;
3249 3232          data_length = (data_length >> 16) + (data_length & 0xFFFF);
3250 3233          tcpha->tha_sum = htons(data_length);
3251 3234          if (tcp->tcp_ip_forward_progress) {
3252 3235                  tcp->tcp_ip_forward_progress = B_FALSE;
3253 3236                  connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
3254 3237          } else {
3255 3238                  connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
3256 3239          }
3257 3240          return (mp1);
3258 3241  }
3259 3242  
3260 3243  /*
3261 3244   * If this routine returns B_TRUE, TCP can generate a RST in response
3262 3245   * to a segment.  If it returns B_FALSE, TCP should not respond.
3263 3246   */
3264 3247  static boolean_t
3265 3248  tcp_send_rst_chk(tcp_stack_t *tcps)
3266 3249  {
3267 3250          int64_t now;
3268 3251  
3269 3252          /*
3270 3253           * TCP needs to protect itself from generating too many RSTs.
3271 3254           * An attacker can mount a DoS attack by sending us random
3272 3255           * segments soliciting RSTs.
3273 3256           *
3274 3257           * What we do here is to have a limit of tcps_rst_sent_rate RSTs
3275 3258           * in each 1 second interval.  In this way, TCP still generates
3276 3259           * RSTs in normal cases but when under attack, the impact is
3277 3260           * limited.  No limit applies if tcps_rst_sent_rate_enabled is 0.
3278 3261           */
3279 3262          if (tcps->tcps_rst_sent_rate_enabled != 0) {
3280 3263                  now = ddi_get_lbolt64();
3281 3264                  if (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) >
3282 3265                      1*SECONDS) {
3283 3266                          tcps->tcps_last_rst_intrvl = now; /* new interval */
3284 3267                          tcps->tcps_rst_cnt = 1; /* counts this RST */
3285 3268                  } else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) {
3286 3269                          return (B_FALSE); /* over the limit for this interval */
3287 3270                  }
3288 3271          }
3289 3272          return (B_TRUE);
3290 3273  }
3291 3274  
3292 3275  /*
3293 3276   * This function handles all retransmissions if SACK is enabled for this
3294 3277   * connection.  First it calculates how much data can be retransmitted
3295 3278   * based on tcp_pipe.  Then it goes through the notsack list to find eligible
3296 3279   * segments.  A segment is eligible if sack_cnt for that segment is greater
3297 3280   * than or equal to tcps_dupack_fast_retransmit.  After it has retransmitted
3298 3281   * all eligible segments, it checks to see if TCP can send some new segments
3299 3282   * (fast recovery).  If it can, set the appropriate flag for tcp_input_data().
3300 3283   *
3301 3284   * Parameters:
3302 3285   *      tcp_t *tcp: the tcp structure of the connection.
3303 3286   *      uint_t *flags: on return, TH_XMIT_NEEDED is or'ed in if new data
3304 3287   *      can be sent; meant for tcp_input_data().
3305 3288   */
3306 3289  void
3307 3290  tcp_sack_rexmit(tcp_t *tcp, uint_t *flags)
3308 3291  {
3309 3292          notsack_blk_t   *notsack_blk;
3310 3293          int32_t         usable_swnd;
3311 3294          int32_t         mss;
3312 3295          uint32_t        seg_len;
3313 3296          mblk_t          *xmit_mp;
3314 3297          tcp_stack_t     *tcps = tcp->tcp_tcps;
3315 3298  
3316 3299          ASSERT(tcp->tcp_notsack_list != NULL);
3317 3300          ASSERT(tcp->tcp_rexmit == B_FALSE);
3318 3301  
3319 3302          /* Defensive coding in case there is a bug... */
3320 3303          if (tcp->tcp_notsack_list == NULL) {
3321 3304                  return;
3322 3305          }
3323 3306          notsack_blk = tcp->tcp_notsack_list;
3324 3307          mss = tcp->tcp_mss;
3325 3308  
3326 3309          /*
3327 3310           * Limit the amount of outstanding data in the network to be
3328 3311           * tcp_cwnd_ssthresh, which is half of the original congestion wnd.
3329 3312           */
3330 3313          usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
3331 3314  
3332 3315          /* At least retransmit 1 MSS of data. */
3333 3316          if (usable_swnd <= 0) {
3334 3317                  usable_swnd = mss;
3335 3318          }
3336 3319  
3337 3320          /* Make sure no new RTT samples will be taken. */
3338 3321          tcp->tcp_csuna = tcp->tcp_snxt;
3339 3322  
3340 3323          notsack_blk = tcp->tcp_notsack_list; /* same value as set above */
3341 3324          while (usable_swnd > 0) {
3342 3325                  mblk_t          *snxt_mp, *tmp_mp;
3343 3326                  tcp_seq         begin = tcp->tcp_sack_snxt;
3344 3327                  tcp_seq         end;
3345 3328                  int32_t         off;
3346 3329  
3347 3330                  for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) {
3348 3331                          if (SEQ_GT(notsack_blk->end, begin) &&
3349 3332                              (notsack_blk->sack_cnt >=
3350 3333                              tcps->tcps_dupack_fast_retransmit)) {
3351 3334                                  end = notsack_blk->end;
3352 3335                                  if (SEQ_LT(begin, notsack_blk->begin)) {
3353 3336                                          begin = notsack_blk->begin;
3354 3337                                  }
3355 3338                                  break;
3356 3339                          }
3357 3340                  }
3358 3341                  /*
3359 3342                   * All holes are filled.  Manipulate tcp_cwnd to send more
3360 3343                   * if we can.  Note that after the SACK recovery, tcp_cwnd is
3361 3344                   * set to tcp_cwnd_ssthresh.
3362 3345                   */
3363 3346                  if (notsack_blk == NULL) {
3364 3347                          usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
3365 3348                          if (usable_swnd <= 0 || tcp->tcp_unsent == 0) {
3366 3349                                  tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna;
3367 3350                                  ASSERT(tcp->tcp_cwnd > 0);
3368 3351                                  return;
3369 3352                          } else {
3370 3353                                  usable_swnd = usable_swnd / mss;
3371 3354                                  tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna +
3372 3355                                      MAX(usable_swnd * mss, mss);
3373 3356                                  *flags |= TH_XMIT_NEEDED;
3374 3357                                  return;
3375 3358                          }
3376 3359                  }
3377 3360  
3378 3361                  /*
3379 3362                   * Note that we may send more than usable_swnd allows here
3380 3363                   * because of round off, but no more than 1 MSS of data.
3381 3364                   */
3382 3365                  seg_len = end - begin;
3383 3366                  if (seg_len > mss)
3384 3367                          seg_len = mss;
3385 3368                  snxt_mp = tcp_get_seg_mp(tcp, begin, &off);
3386 3369                  ASSERT(snxt_mp != NULL);
3387 3370                  /* This should not happen.  Defensive coding again... */
3388 3371                  if (snxt_mp == NULL) {
3389 3372                          return;
3390 3373                  }
3391 3374  
3392 3375                  xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3393 3376                      &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3394 3377                  if (xmit_mp == NULL)
3395 3378                          return;
3396 3379  
3397 3380                  usable_swnd -= seg_len;
3398 3381                  tcp->tcp_pipe += seg_len;
3399 3382                  tcp->tcp_sack_snxt = begin + seg_len;
3400 3383  
3401 3384                  tcp_send_data(tcp, xmit_mp);
3402 3385  
3403 3386                  /*
3404 3387                   * Update the send timestamp to avoid false retransmission.
3405 3388                   */
3406 3389                  snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3407 3390  
3408 3391                  TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3409 3392                  TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3410 3393                  TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3411 3394                  /*
3412 3395                   * Update tcp_rexmit_max to extend this SACK recovery phase.
3413 3396                   * This happens when new data sent during fast recovery is
3414 3397                   * also lost.  If TCP retransmits those new data, it needs
3415 3398                   * to extend SACK recovery phase to avoid starting another
3416 3399                   * fast retransmit/recovery unnecessarily.

↓ open down ↓

1566 lines elided

↑ open up ↑

3417 3400                   */
3418 3401                  if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3419 3402                          tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3420 3403                  }
3421 3404          }
3422 3405  }
3423 3406  
3424 3407  /*
3425 3408   * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3426 3409   * or ICMP errors.
3427      - *
3428      - * To limit the number of duplicate segments, we limit the number of segment
3429      - * to be sent in one time to tcp_snd_burst, the burst variable.
3430 3410   */
3431 3411  void
3432 3412  tcp_ss_rexmit(tcp_t *tcp)
3433 3413  {
3434 3414          uint32_t        snxt;
3435 3415          uint32_t        smax;
3436 3416          int32_t         win;
3437 3417          int32_t         mss;
3438 3418          int32_t         off;
3439      -        int32_t         burst = tcp->tcp_snd_burst;
3440 3419          mblk_t          *snxt_mp;
3441 3420          tcp_stack_t     *tcps = tcp->tcp_tcps;
3442 3421  
3443 3422          /*
3444 3423           * Note that tcp_rexmit can be set even though TCP has retransmitted
3445 3424           * all unack'ed segments.
3446 3425           */
3447 3426          if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
3448 3427                  smax = tcp->tcp_rexmit_max;
3449 3428                  snxt = tcp->tcp_rexmit_nxt;
3450 3429                  if (SEQ_LT(snxt, tcp->tcp_suna)) {
3451 3430                          snxt = tcp->tcp_suna;
3452 3431                  }
3453 3432                  win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
3454 3433                  win -= snxt - tcp->tcp_suna;
3455 3434                  mss = tcp->tcp_mss;
3456 3435                  snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
3457 3436  
3458      -                while (SEQ_LT(snxt, smax) && (win > 0) &&
3459      -                    (burst > 0) && (snxt_mp != NULL)) {
     3437 +                while (SEQ_LT(snxt, smax) && (win > 0) && (snxt_mp != NULL)) {
3460 3438                          mblk_t  *xmit_mp;
3461 3439                          mblk_t  *old_snxt_mp = snxt_mp;
3462 3440                          uint32_t cnt = mss;
3463 3441  
3464 3442                          if (win < cnt) {
3465 3443                                  cnt = win;
3466 3444                          }
3467 3445                          if (SEQ_GT(snxt + cnt, smax)) {
3468 3446                                  cnt = smax - snxt;
3469 3447                          }

3470 3448                          xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3471 3449                              &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3472 3450                          if (xmit_mp == NULL)
3473 3451                                  return;
3474 3452  
3475 3453                          tcp_send_data(tcp, xmit_mp);
3476 3454  
3477 3455                          snxt += cnt;

↓ open down ↓

8 lines elided

↑ open up ↑

3478 3456                          win -= cnt;
3479 3457                          /*
3480 3458                           * Update the send timestamp to avoid false
3481 3459                           * retransmission.
3482 3460                           */
3483 3461                          old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3484 3462                          TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3485 3463                          TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3486 3464  
3487 3465                          tcp->tcp_rexmit_nxt = snxt;
3488      -                        burst--;
3489 3466                  }
3490 3467                  /*
3491 3468                   * If we have transmitted all we have at the time
3492 3469                   * we started the retransmission, we can leave
3493 3470                   * the rest of the job to tcp_wput_data().  But we
3494 3471                   * need to check the send window first.  If the
3495 3472                   * win is not 0, go on with tcp_wput_data().
3496 3473                   */
3497 3474                  if (SEQ_LT(snxt, smax) || win == 0) {
3498 3475                          return;

3499 3476                  }
3500 3477          }
3501 3478          /* Only call tcp_wput_data() if there is data to be sent. */
3502 3479          if (tcp->tcp_unsent) {
3503 3480                  tcp_wput_data(tcp, NULL, B_FALSE);
3504 3481          }
3505 3482  }
3506 3483  
3507 3484  /*
3508 3485   * Do slow start retransmission after ICMP errors of PMTU changes.
3509 3486   */
3510 3487  void
3511 3488  tcp_rexmit_after_error(tcp_t *tcp)
3512 3489  {
3513 3490          /*
3514 3491           * All sent data has been acknowledged or no data left to send, so
3515 3492           * just return.
3516 3493           */
3517 3494          if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
3518 3495              (tcp->tcp_xmit_head == NULL))

↓ open down ↓

20 lines elided

↑ open up ↑

3519 3496                  return;
3520 3497  
3521 3498          if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
3522 3499                  tcp->tcp_rexmit_max = tcp->tcp_fss;
3523 3500          else
3524 3501                  tcp->tcp_rexmit_max = tcp->tcp_snxt;
3525 3502  
3526 3503          tcp->tcp_rexmit_nxt = tcp->tcp_suna; /* restart at oldest unacked seq */
3527 3504          tcp->tcp_rexmit = B_TRUE;
3528 3505          tcp->tcp_dupack_cnt = 0;
3529      -        tcp->tcp_snd_burst = TCP_CWND_SS;
3530 3506          tcp_ss_rexmit(tcp); /* retransmits from tcp_rexmit_nxt */
3531 3507  }
3532 3508  
3533 3509  /*
3534 3510   * tcp_get_seg_mp() is called to get the pointer to a segment in the
3535 3511   * send queue which starts at the given sequence number. If the given
3536 3512   * sequence number is equal to last valid sequence number (tcp_snxt), the
3537 3513   * returned mblk is the last valid mblk, and off is set to the length of
3538 3514   * that mblk.
3539 3515   *

3540 3516   * NULL is returned if seq is before tcp_suna or after tcp_snxt.
3541 3517   *
3542 3518   * Parameters:
3543 3519   *      tcp_t *tcp: the tcp instance pointer.
3544 3520   *      uint32_t seq: the starting seq. no of the requested segment.
3545 3521   *      int32_t *off: after the execution, *off will be the offset to
3546 3522   *              the returned mblk which points to the requested seq no.
3547 3523   *              It is the caller's responsibility to send in a non-null off.
3548 3524   *
3549 3525   * Return:
3550 3526   *      A mblk_t pointer pointing to the requested segment in send queue.
3551 3527   */
3552 3528  static mblk_t *
3553 3529  tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off)
3554 3530  {
3555 3531          int32_t cnt;
3556 3532          mblk_t  *mp;
3557 3533  
3558 3534          /* Defensive coding.  Make sure we don't send incorrect data. */
3559 3535          if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GT(seq, tcp->tcp_snxt))
3560 3536                  return (NULL);
3561 3537  
3562 3538          cnt = seq - tcp->tcp_suna; /* bytes to skip from tcp_xmit_head */
3563 3539          mp = tcp->tcp_xmit_head;
3564 3540          while (cnt > 0 && mp != NULL) {
3565 3541                  cnt -= mp->b_wptr - mp->b_rptr;
3566 3542                  if (cnt <= 0) {
3567 3543                          cnt += mp->b_wptr - mp->b_rptr; /* offset in this mblk */
3568 3544                          break;
3569 3545                  }
3570 3546                  mp = mp->b_cont;
3571 3547          }
3572 3548          ASSERT(mp != NULL);
3573 3549          *off = cnt;
3574 3550          return (mp);
3575 3551  }
3576 3552  
3577 3553  /*
3578 3554   * This routine adjusts next-to-send sequence number variables, in the
3579 3555   * case where the receiver has shrunk its window.
3580 3556   */
3581 3557  void
3582 3558  tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt)
3583 3559  {
3584 3560          mblk_t *xmit_tail;
3585 3561          int32_t offset;
3586 3562  
3587 3563          tcp->tcp_snxt = snxt;
3588 3564  
3589 3565          /* Get the mblk, and the offset in it, as per the shrunk window */
3590 3566          xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset);
3591 3567          ASSERT(xmit_tail != NULL);
3592 3568          tcp->tcp_xmit_tail = xmit_tail;
3593 3569          tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr -
3594 3570              xmit_tail->b_rptr - offset; /* bytes in this mblk past offset */
3595 3571  }
3596 3572  
3597 3573  /*
3598 3574   * This handles the case when the receiver has shrunk its win. Per RFC 1122
3599 3575   * if the receiver shrinks the window, i.e. moves the right window to the
3600 3576   * left, then we should not send new data, but should retransmit normally the
3601 3577   * old unacked data between suna and suna + swnd. We might have sent data
3602 3578   * that is now outside the new window, pretend that we didn't send it.
3603 3579   */
3604 3580  static void
3605 3581  tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count)
3606 3582  {
3607 3583          uint32_t        snxt = tcp->tcp_snxt;
3608 3584  
3609 3585          ASSERT(shrunk_count > 0);
3610 3586  
3611 3587          if (!tcp->tcp_is_wnd_shrnk) {
3612 3588                  tcp->tcp_snxt_shrunk = snxt;
3613 3589                  tcp->tcp_is_wnd_shrnk = B_TRUE;
3614 3590          } else if (SEQ_GT(snxt, tcp->tcp_snxt_shrunk)) {
3615 3591                  tcp->tcp_snxt_shrunk = snxt;
3616 3592          }
3617 3593  
3618 3594          /* Pretend we didn't send the data outside the window */
3619 3595          snxt -= shrunk_count;
3620 3596  
3621 3597          /* Reset all the values per the now shrunk window */
3622 3598          tcp_update_xmit_tail(tcp, snxt);
3623 3599          tcp->tcp_unsent += shrunk_count;
3624 3600  
3625 3601          /*
3626 3602           * If the SACK option is set, delete the entire list of
3627 3603           * notsack'ed blocks.
3628 3604           */
3629 3605          TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3630 3606  
3631 3607          if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3632 3608                  /*
3633 3609                   * Make sure the timer is running so that we will probe a zero
3634 3610                   * window.
3635 3611                   */
3636 3612                  TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3637 3613  }
3638 3614  
3639 3615  /*
3640 3616   * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3641 3617   * with the template header, as well as other options such as time-stamp,
3642 3618   * ECN and/or SACK.
3643 3619   */
3644 3620  static void
3645 3621  tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
3646 3622  {
3647 3623          tcpha_t *tcp_tmpl, *tcpha;
3648 3624          uint32_t *dst, *src;
3649 3625          int hdrlen;
3650 3626          conn_t *connp = tcp->tcp_connp;
3651 3627  
3652 3628          ASSERT(OK_32PTR(rptr));
3653 3629  
3654 3630          /* Template header */
3655 3631          tcp_tmpl = tcp->tcp_tcpha;
3656 3632  
3657 3633          /* Header of outgoing packet */
3658 3634          tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3659 3635  
3660 3636          /* dst and src are opaque 32-bit fields, used for copying */
3661 3637          dst = (uint32_t *)rptr;
3662 3638          src = (uint32_t *)connp->conn_ht_iphc;
3663 3639          hdrlen = connp->conn_ht_iphc_len;
3664 3640  
3665 3641          /* Fill time-stamp option if needed */
3666 3642          if (tcp->tcp_snd_ts_ok) {
3667 3643                  U32_TO_BE32((uint32_t)now,
3668 3644                      (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3669 3645                  U32_TO_BE32(tcp->tcp_ts_recent,
3670 3646                      (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3671 3647          } else {
3672 3648                  ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3673 3649          }
3674 3650  
3675 3651          /*
3676 3652           * Copy the template header; is this really more efficient than
3677 3653           * calling bcopy()?  For simple IPv4/TCP, it may be the case,
3678 3654           * but perhaps not for other scenarios.
3679 3655           */
3680 3656          dst[0] = src[0];
3681 3657          dst[1] = src[1];
3682 3658          dst[2] = src[2];
3683 3659          dst[3] = src[3];
3684 3660          dst[4] = src[4];
3685 3661          dst[5] = src[5];
3686 3662          dst[6] = src[6];
3687 3663          dst[7] = src[7];
3688 3664          dst[8] = src[8];
3689 3665          dst[9] = src[9];
3690 3666          if (hdrlen -= 40) { /* 10 words (40 bytes) were copied above */
3691 3667                  hdrlen >>= 2; /* remaining bytes -> 32-bit words */
3692 3668                  dst += 10;
3693 3669                  src += 10;
3694 3670                  do {
3695 3671                          *dst++ = *src++;
3696 3672                  } while (--hdrlen);
3697 3673          }
3698 3674  
3699 3675          /*
3700 3676           * Set the ECN info in the TCP header if it is not a zero
3701 3677           * window probe.  Zero window probe is only sent in
3702 3678           * tcp_wput_data() and tcp_timer().
3703 3679           */
3704 3680          if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) {
3705 3681                  TCP_SET_ECT(tcp, rptr);
3706 3682  
3707 3683                  if (tcp->tcp_ecn_echo_on)
3708 3684                          tcpha->tha_flags |= TH_ECE;
3709 3685                  if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
3710 3686                          tcpha->tha_flags |= TH_CWR;
3711 3687                          tcp->tcp_ecn_cwr_sent = B_TRUE;
3712 3688                  }
3713 3689          }
3714 3690  
3715 3691          /* Fill in SACK options */
3716 3692          if (num_sack_blk > 0) {
3717 3693                  uchar_t *wptr = rptr + connp->conn_ht_iphc_len;
3718 3694                  sack_blk_t *tmp;
3719 3695                  int32_t i;
3720 3696  
3721 3697                  wptr[0] = TCPOPT_NOP;
3722 3698                  wptr[1] = TCPOPT_NOP;
3723 3699                  wptr[2] = TCPOPT_SACK;
3724 3700                  wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
3725 3701                      sizeof (sack_blk_t);
3726 3702                  wptr += TCPOPT_REAL_SACK_LEN;
3727 3703  
3728 3704                  tmp = tcp->tcp_sack_list;
3729 3705                  for (i = 0; i < num_sack_blk; i++) {
3730 3706                          U32_TO_BE32(tmp[i].begin, wptr);
3731 3707                          wptr += sizeof (tcp_seq);
3732 3708                          U32_TO_BE32(tmp[i].end, wptr);
3733 3709                          wptr += sizeof (tcp_seq);
3734 3710                  }
3735 3711                  tcpha->tha_offset_and_reserved +=
3736 3712                      ((num_sack_blk * 2 + 1) << 4);
3737 3713          }
3738 3714  }

↓ open down ↓

199 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX