Print this page
1631 kernel panic in tcp_input_data


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.

  24  */
  25 
  26 /* This file contains all TCP input processing functions. */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #include <sys/strsun.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/stropts.h>
  33 #include <sys/strlog.h>
  34 #define _SUN_TPI_VERSION 2
  35 #include <sys/tihdr.h>
  36 #include <sys/suntpi.h>
  37 #include <sys/xti_inet.h>
  38 #include <sys/squeue_impl.h>
  39 #include <sys/squeue.h>
  40 #include <sys/tsol/tnet.h>
  41 
  42 #include <inet/common.h>
  43 #include <inet/ip.h>


2213 
2214                 /*
2215                  * Prime pump for checksum calculation in IP.  Include the
2216                  * adjustment for a source route if any.
2217                  */
2218                 data_length = tcp_hdr_len + connp->conn_sum;
2219                 data_length = (data_length >> 16) + (data_length & 0xFFFF);
2220                 tcpha->tha_sum = htons(data_length);
2221 
2222                 if (tcp->tcp_ip_forward_progress) {
2223                         tcp->tcp_ip_forward_progress = B_FALSE;
2224                         connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
2225                 } else {
2226                         connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
2227                 }
2228                 return (mp1);
2229         }
2230 }
2231 
2232 /*












































































































2233  * Handle M_DATA messages from IP. It is called directly from IP via
2234  * squeue for received IP packets.
2235  *
2236  * The first argument is always the connp/tcp to which the mp belongs.
2237  * There are no exceptions to this rule. The caller has already put
2238  * a reference on this connp/tcp and once tcp_input_data() returns,
2239  * the squeue will do the refrele.
2240  *
2241  * TH_SYN segments for the listener go directly to tcp_input_listener via
2242  * squeue. ICMP errors go directly to tcp_icmp_input().
2243  *
2244  * sqp: NULL = recursive, sqp != NULL means called from squeue
2245  */
2246 void
2247 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2248 {
2249         int32_t         bytes_acked;
2250         int32_t         gap;
2251         mblk_t          *mp1;
2252         uint_t          flags;


2254         uchar_t         *iphdr;
2255         uchar_t         *rptr;
2256         int32_t         rgap;
2257         uint32_t        seg_ack;
2258         int             seg_len;
2259         uint_t          ip_hdr_len;
2260         uint32_t        seg_seq;
2261         tcpha_t         *tcpha;
2262         int             urp;
2263         tcp_opt_t       tcpopt;
2264         ip_pkt_t        ipp;
2265         boolean_t       ofo_seg = B_FALSE; /* Out of order segment */
2266         uint32_t        cwnd;
2267         uint32_t        add;
2268         int             npkt;
2269         int             mss;
2270         conn_t          *connp = (conn_t *)arg;
2271         squeue_t        *sqp = (squeue_t *)arg2;
2272         tcp_t           *tcp = connp->conn_tcp;
2273         tcp_stack_t     *tcps = tcp->tcp_tcps;

2274 
2275         /*
2276          * RST from fused tcp loopback peer should trigger an unfuse.
2277          */
2278         if (tcp->tcp_fused) {
2279                 TCP_STAT(tcps, tcp_fusion_aborted);
2280                 tcp_unfuse(tcp);
2281         }
2282 
2283         iphdr = mp->b_rptr;
2284         rptr = mp->b_rptr;
2285         ASSERT(OK_32PTR(rptr));
2286 
2287         ip_hdr_len = ira->ira_ip_hdr_length;
2288         if (connp->conn_recv_ancillary.crb_all != 0) {
2289                 /*
2290                  * Record packet information in the ip_pkt_t
2291                  */
2292                 ipp.ipp_fields = 0;
2293                 if (ira->ira_flags & IRAF_IS_IPV4) {


2379                 if (tcp->tcp_detached || !pullupmsg(mp, -1)) {
2380                         freemsg(mp);
2381                         return;
2382                 }
2383                 /* Update pointers into message */
2384                 iphdr = rptr = mp->b_rptr;
2385                 tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2386                 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) {
2387                         /*
2388                          * Since we can't handle any data with this urgent
2389                          * pointer that is out of sequence, we expunge
2390                          * the data.  This allows us to still register
2391                          * the urgent mark and generate the M_PCSIG,
2392                          * which we can do.
2393                          */
2394                         mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha);
2395                         seg_len = 0;
2396                 }
2397         }
2398 





2399         switch (tcp->tcp_state) {
2400         case TCPS_SYN_SENT:
2401                 if (connp->conn_final_sqp == NULL &&
2402                     tcp_outbound_squeue_switch && sqp != NULL) {
2403                         ASSERT(connp->conn_initial_sqp == connp->conn_sqp);
2404                         connp->conn_final_sqp = sqp;
2405                         if (connp->conn_final_sqp != connp->conn_sqp) {
2406                                 DTRACE_PROBE1(conn__final__sqp__switch,
2407                                     conn_t *, connp);
2408                                 CONN_INC_REF(connp);
2409                                 SQUEUE_SWITCH(connp, connp->conn_final_sqp);
2410                                 SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2411                                     tcp_input_data, connp, ira, ip_squeue_flag,
2412                                     SQTAG_CONNECT_FINISH);
2413                                 return;
2414                         }
2415                         DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp);
2416                 }
2417                 if (flags & TH_ACK) {
2418                         /*


2590                                  */
2591                                 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2592                                         if (tcp->tcp_ack_tid != 0) {
2593                                                 (void) TCP_TIMER_CANCEL(tcp,
2594                                                     tcp->tcp_ack_tid);
2595                                                 tcp->tcp_ack_tid = 0;
2596                                         }
2597                                         tcp_send_data(tcp, ack_mp);
2598                                         BUMP_LOCAL(tcp->tcp_obsegs);
2599                                         TCPS_BUMP_MIB(tcps, tcpOutAck);
2600 
2601                                         if (!IPCL_IS_NONSTR(connp)) {
2602                                                 /* Send up T_CONN_CON */
2603                                                 if (ira->ira_cred != NULL) {
2604                                                         mblk_setcred(mp1,
2605                                                             ira->ira_cred,
2606                                                             ira->ira_cpid);
2607                                                 }
2608                                                 putnext(connp->conn_rq, mp1);
2609                                         } else {
2610                                                 (*connp->conn_upcalls->
2611                                                     su_connected)
2612                                                     (connp->conn_upper_handle,
2613                                                     tcp->tcp_connid,
2614                                                     ira->ira_cred,
2615                                                     ira->ira_cpid);
2616                                                 freemsg(mp1);
2617                                         }
2618 
2619                                         freemsg(mp);
2620                                         return;
2621                                 }
2622                                 /*
2623                                  * Forget fusion; we need to handle more
2624                                  * complex cases below.  Send the deferred
2625                                  * T_CONN_CON message upstream and proceed
2626                                  * as usual.  Mark this tcp as not capable
2627                                  * of fusion.
2628                                  */
2629                                 TCP_STAT(tcps, tcp_fusion_unfusable);
2630                                 tcp->tcp_unfusable = B_TRUE;
2631                                 if (!IPCL_IS_NONSTR(connp)) {
2632                                         if (ira->ira_cred != NULL) {
2633                                                 mblk_setcred(mp1, ira->ira_cred,
2634                                                     ira->ira_cpid);
2635                                         }
2636                                         putnext(connp->conn_rq, mp1);
2637                                 } else {
2638                                         (*connp->conn_upcalls->su_connected)
2639                                             (connp->conn_upper_handle,
2640                                             tcp->tcp_connid, ira->ira_cred,
2641                                             ira->ira_cpid);
2642                                         freemsg(mp1);
2643                                 }
2644                         }
2645 
2646                         /*
2647                          * Check to see if there is data to be sent.  If
2648                          * yes, set the transmit flag.  Then check to see
2649                          * if received data processing needs to be done.
2650                          * If not, go straight to xmit_check.  This short
2651                          * cut is OK as we don't support T/TCP.
2652                          */
2653                         if (tcp->tcp_unsent)
2654                                 flags |= TH_XMIT_NEEDED;
2655 
2656                         if (seg_len == 0 && !(flags & TH_URG)) {
2657                                 freemsg(mp);
2658                                 goto xmit_check;


2992                          * for this connection or if this is a new urgent
2993                          * byte. Also send a zero-length "unmarked" message
2994                          * to inform SIOCATMARK that this is not the mark.
2995                          *
2996                          * tcp_urp_last_valid is cleared when the T_exdata_ind
2997                          * is sent up. This plus the check for old data
2998                          * (gap >= 0) handles the wraparound of the sequence
2999                          * number space without having to always track the
3000                          * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks
3001                          * this max in its rcv_up variable).
3002                          *
3003                          * This prevents duplicate SIGURGS due to a "late"
3004                          * zero-window probe when the T_EXDATA_IND has already
3005                          * been sent up.
3006                          */
3007                         if ((flags & TH_URG) &&
3008                             (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq,
3009                             tcp->tcp_urp_last))) {
3010                                 if (IPCL_IS_NONSTR(connp)) {
3011                                         if (!TCP_IS_DETACHED(tcp)) {
3012                                                 (*connp->conn_upcalls->
3013                                                     su_signal_oob)
3014                                                     (connp->conn_upper_handle,
3015                                                     urp);
3016                                         }
3017                                 } else {
3018                                         mp1 = allocb(0, BPRI_MED);
3019                                         if (mp1 == NULL) {
3020                                                 freemsg(mp);
3021                                                 return;
3022                                         }
3023                                         if (!TCP_IS_DETACHED(tcp) &&
3024                                             !putnextctl1(connp->conn_rq,
3025                                             M_PCSIG, SIGURG)) {
3026                                                 /* Try again on the rexmit. */
3027                                                 freemsg(mp1);
3028                                                 freemsg(mp);
3029                                                 return;
3030                                         }
3031                                         /*
3032                                          * If the next byte would be the mark
3033                                          * then mark with MARKNEXT else mark


3271          */
3272         if (flags & TH_URG && urp >= 0) {
3273                 if (!tcp->tcp_urp_last_valid ||
3274                     SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3275                         /*
3276                          * Non-STREAMS sockets handle the urgent data a little
3277                          * differently from STREAMS based sockets. There is no
3278                          * need to mark any mblks with the MSG{NOT,}MARKNEXT
3279                          * flags to keep SIOCATMARK happy. Instead a
3280                          * su_signal_oob upcall is made to update the mark.
3281                          * Neither is a T_EXDATA_IND mblk needed to be
3282                          * prepended to the urgent data. The urgent data is
3283                          * delivered using the su_recv upcall, where we set
3284                          * the MSG_OOB flag to indicate that it is urg data.
3285                          *
3286                          * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3287                          * are used by non-STREAMS sockets.
3288                          */
3289                         if (IPCL_IS_NONSTR(connp)) {
3290                                 if (!TCP_IS_DETACHED(tcp)) {
3291                                         (*connp->conn_upcalls->su_signal_oob)
3292                                             (connp->conn_upper_handle, urp);
3293                                 }
3294                         } else {
3295                                 /*
3296                                  * If we haven't generated the signal yet for
3297                                  * this urgent pointer value, do it now.  Also,
3298                                  * send up a zero-length M_DATA indicating
3299                                  * whether or not this is the mark. The latter
3300                                  * is not needed when a T_EXDATA_IND is sent up.
3301                                  * However, if there are allocation failures
3302                                  * this code relies on the sender retransmitting
3303                                  * and the socket code for determining the mark
3304                                  * should not block waiting for the peer to
3305                                  * transmit. Thus, for simplicity we always
3306                                  * send up the mark indication.
3307                                  */
3308                                 mp1 = allocb(0, BPRI_MED);
3309                                 if (mp1 == NULL) {
3310                                         freemsg(mp);
3311                                         return;


3430                                          * the remainder back in will cause a
3431                                          * loop. In this case, drop the
3432                                          * packet and let the sender try
3433                                          * sending a good packet.
3434                                          */
3435                                         if (tmp_rnxt == tcp->tcp_rnxt) {
3436                                                 freemsg(mp);
3437                                                 return;
3438                                         }
3439                                 }
3440                                 tcp_input_data(connp, mp, NULL, ira);
3441                                 return;
3442                         }
3443                         /*
3444                          * This segment contains only the urgent byte.  We
3445                          * have to allocate the T_exdata_ind, if we can.
3446                          */
3447                         if (IPCL_IS_NONSTR(connp)) {
3448                                 int error;
3449 
3450                                 (*connp->conn_upcalls->su_recv)
3451                                     (connp->conn_upper_handle, mp, seg_len,
3452                                     MSG_OOB, &error, NULL);
3453                                 /*
3454                                  * We should never be in middle of a
3455                                  * fallback, the squeue guarantees that.
3456                                  */
3457                                 ASSERT(error != EOPNOTSUPP);
3458                                 mp = NULL;
3459                                 goto update_ack;
3460                         } else if (!tcp->tcp_urp_mp) {
3461                                 struct T_exdata_ind *tei;
3462                                 mp1 = allocb(sizeof (struct T_exdata_ind),
3463                                     BPRI_MED);
3464                                 if (!mp1) {
3465                                         /*
3466                                          * Sigh... It'll be back.
3467                                          * Generate any MSG*MARK message now.
3468                                          */
3469                                         freemsg(mp);
3470                                         seg_len = 0;


4609                     tcp_display(tcp, NULL, DISP_PORT_ONLY));
4610 #endif /* DEBUG */
4611         }
4612 
4613         /*
4614          * Check for ancillary data changes compared to last segment.
4615          */
4616         if (connp->conn_recv_ancillary.crb_all != 0) {
4617                 mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira);
4618                 if (mp == NULL)
4619                         return;
4620         }
4621 
4622         if (IPCL_IS_NONSTR(connp)) {
4623                 /*
4624                  * Non-STREAMS socket
4625                  */
4626                 boolean_t push = flags & (TH_PUSH|TH_FIN);
4627                 int error;
4628 
4629                 if ((*connp->conn_upcalls->su_recv)(
4630                     connp->conn_upper_handle,
4631                     mp, seg_len, 0, &error, &push) <= 0) {
4632                         /*
4633                          * We should never be in middle of a
4634                          * fallback, the squeue guarantees that.
4635                          */
4636                         ASSERT(error != EOPNOTSUPP);
4637                         if (error == ENOSPC)
4638                                 tcp->tcp_rwnd -= seg_len;
4639                 } else if (push) {
4640                         /* PUSH bit set and sockfs is not flow controlled */
4641                         flags |= tcp_rwnd_reopen(tcp);
4642                 }
4643         } else if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) {
4644                 /*
4645                  * Side queue inbound data until the accept happens.
4646                  * tcp_accept/tcp_rput drains this when the accept happens.
4647                  * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or
4648                  * T_EXDATA_IND) it is queued on b_next.
4649                  * XXX Make urgent data use this. Requires:
4650                  *      Removing tcp_listener check for TH_URG


4852                             tcp->tcp_localnet ?
4853                             tcps->tcps_local_dack_interval :
4854                             tcps->tcps_deferred_ack_interval);
4855                 }
4856         }
4857         if (flags & TH_ORDREL_NEEDED) {
4858                 /*
4859                  * Notify upper layer about an orderly release. If this is
4860                  * a non-STREAMS socket, then just make an upcall. For STREAMS
4861                  * we send up an ordrel_ind, unless this is an eager, in which
4862                  * case the ordrel will be sent when tcp_accept_finish runs.
4863                  * Note that for non-STREAMS we make an upcall even if it is an
4864                  * eager, because we have an upper handle to send it to.
4865                  */
4866                 ASSERT(IPCL_IS_NONSTR(connp) || tcp->tcp_listener == NULL);
4867                 ASSERT(!tcp->tcp_detached);
4868 
4869                 if (IPCL_IS_NONSTR(connp)) {
4870                         ASSERT(tcp->tcp_ordrel_mp == NULL);
4871                         tcp->tcp_ordrel_done = B_TRUE;
4872                         (*connp->conn_upcalls->su_opctl)
4873                             (connp->conn_upper_handle, SOCK_OPCTL_SHUT_RECV, 0);
4874                         goto done;
4875                 }
4876 
4877                 if (tcp->tcp_rcv_list != NULL) {
4878                         /*
4879                          * Push any mblk(s) enqueued from co processing.
4880                          */
4881                         flags |= tcp_rcv_drain(tcp);
4882                 }
4883                 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
4884 
4885                 mp1 = tcp->tcp_ordrel_mp;
4886                 tcp->tcp_ordrel_mp = NULL;
4887                 tcp->tcp_ordrel_done = B_TRUE;
4888                 putnext(connp->conn_rq, mp1);
4889         }
4890 done:
4891         ASSERT(!(flags & TH_MARKNEXT_NEEDED));
4892 }
4893 




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  25  */
  26 
  27 /* This file contains all TCP input processing functions. */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/strsun.h>
  32 #include <sys/strsubr.h>
  33 #include <sys/stropts.h>
  34 #include <sys/strlog.h>
  35 #define _SUN_TPI_VERSION 2
  36 #include <sys/tihdr.h>
  37 #include <sys/suntpi.h>
  38 #include <sys/xti_inet.h>
  39 #include <sys/squeue_impl.h>
  40 #include <sys/squeue.h>
  41 #include <sys/tsol/tnet.h>
  42 
  43 #include <inet/common.h>
  44 #include <inet/ip.h>


2214 
2215                 /*
2216                  * Prime pump for checksum calculation in IP.  Include the
2217                  * adjustment for a source route if any.
2218                  */
2219                 data_length = tcp_hdr_len + connp->conn_sum;
2220                 data_length = (data_length >> 16) + (data_length & 0xFFFF);
2221                 tcpha->tha_sum = htons(data_length);
2222 
2223                 if (tcp->tcp_ip_forward_progress) {
2224                         tcp->tcp_ip_forward_progress = B_FALSE;
2225                         connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
2226                 } else {
2227                         connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
2228                 }
2229                 return (mp1);
2230         }
2231 }
2232 
2233 /*
2234  * Dummy socket upcalls for if/when the conn_t gets detached from a
2235  * direct-callback sonode via a user-driven close().  Easy to catch with
2236  * DTrace FBT, and should be mostly harmless.
2237  */
2238 
2239 /* ARGSUSED */
2240 static sock_upper_handle_t
2241 tcp_dummy_newconn(sock_upper_handle_t x, sock_lower_handle_t y,
2242     sock_downcalls_t *z, cred_t *cr, pid_t pid, sock_upcalls_t **ignored)
2243 {
             /*
              * Placeholder "new connection" upcall; it is the first entry of
              * tcp_dummy_upcalls, the table tcp_input_data() falls back to
              * when connp->conn_upcalls is NULL.  Every argument is ignored
              * and the upper handle handed back is always NULL.
              * NOTE(review): the unconditional ASSERT suggests this entry is
              * never expected to be reached -- confirm against the callers.
              */
2244         ASSERT(0);      /* Panic in debug, otherwise ignore. */
2245         return (NULL);
2246 }
2247 
2248 /* ARGSUSED */
2249 static void
2250 tcp_dummy_connected(sock_upper_handle_t x, sock_connid_t y, cred_t *cr,
2251     pid_t pid)
2252 {
             /*
              * Placeholder "connected" upcall (second entry of
              * tcp_dummy_upcalls).  Its parameters line up with the
              * su_connected calls made from tcp_input_data(): upper handle,
              * connection id, cred and pid.  It does nothing except assert
              * that the upper handle is NULL; the other arguments are unused.
              * NOTE(review): presumably a conn_t with no upcall table also
              * has no upper handle -- confirm.
              */
2253         ASSERT(x == NULL);
2254         /* Normally we'd crhold(cr) and attach it to socket state. */
2255         /* LINTED */
2256 }
2257 
2258 /* ARGSUSED */
2259 static int
2260 tcp_dummy_disconnected(sock_upper_handle_t x, sock_connid_t y, int blah)
2261 {
             /*
              * Placeholder "disconnected" upcall (third entry of
              * tcp_dummy_upcalls).  All arguments are ignored and the result
              * is always -1.
              * NOTE(review): the unconditional ASSERT suggests this entry is
              * never expected to be reached -- confirm against the callers.
              */
2262         ASSERT(0);      /* Panic in debug, otherwise ignore. */
2263         return (-1);
2264 }
2265 
2266 /* ARGSUSED */
2267 static void
2268 tcp_dummy_opctl(sock_upper_handle_t x, sock_opctl_action_t y, uintptr_t blah)
2269 {
             /*
              * Placeholder "opctl" upcall (fourth entry of
              * tcp_dummy_upcalls).  Its parameters line up with the su_opctl
              * call tcp_input_data() makes for SOCK_OPCTL_SHUT_RECV: upper
              * handle, action and an argument.  The action and argument are
              * ignored; only the upper handle is checked, and it must be
              * NULL.
              */
2270         ASSERT(x == NULL);
2271         /* We really want this one to be a harmless NOP for now. */
2272         /* LINTED */
2273 }
2274 
2275 /* ARGSUSED */
2276 static ssize_t
2277 tcp_dummy_recv(sock_upper_handle_t x, mblk_t *mp, size_t len, int flags,
2278     int *error, boolean_t *push)
2279 {
             /*
              * Placeholder "receive" upcall (fifth entry of
              * tcp_dummy_upcalls).  Its parameters line up with the su_recv
              * calls made from tcp_input_data().  len, flags and push are
              * ignored.  error is written unconditionally, so the caller
              * must pass a valid pointer.  The message is released here, so
              * the caller must not touch mp after this returns.
              */
2280         ASSERT(x == NULL);
2281 
2282         /*
2283          * Consume the message, set ESHUTDOWN, and return an error.
2284          * Nobody's home!
2285          */
2286         freemsg(mp);
2287         *error = ESHUTDOWN;
2288         return (-1);
2289 }
2290 
2291 /* ARGSUSED */
2292 static void
2293 tcp_dummy_set_proto_props(sock_upper_handle_t x, struct sock_proto_props *y)
2294 {
             /*
              * Placeholder "set protocol properties" upcall (sixth entry of
              * tcp_dummy_upcalls).  Both arguments are ignored; nothing is
              * recorded.
              * NOTE(review): the unconditional ASSERT suggests this entry is
              * never expected to be reached -- confirm against the callers.
              */
2295         ASSERT(0);      /* Panic in debug, otherwise ignore. */
2296 }
2297 
2298 /* ARGSUSED */
2299 static void
2300 tcp_dummy_txq_full(sock_upper_handle_t x, boolean_t y)
2301 {
             /*
              * Placeholder "transmit queue full" upcall (seventh entry of
              * tcp_dummy_upcalls).  Both arguments are ignored.
              * NOTE(review): the unconditional ASSERT suggests this entry is
              * never expected to be reached -- confirm against the callers.
              */
2302         ASSERT(0);      /* Panic in debug, otherwise ignore. */
2303 }
2304 
2305 /* ARGSUSED */
2306 static void
2307 tcp_dummy_signal_oob(sock_upper_handle_t x, ssize_t len)
2308 {
             /*
              * Placeholder "signal out-of-band data" upcall (eighth entry of
              * tcp_dummy_upcalls).  Its parameters line up with the
              * su_signal_oob calls in tcp_input_data(), which pass the upper
              * handle and the urgent offset.  len is ignored; only the upper
              * handle is checked, and it must be NULL.
              */
2309         ASSERT(x == NULL);
2310         /* Otherwise, this would signal socket state about OOB data. */
2311 }
2312 
2313 /* ARGSUSED */
2314 static void
2315 tcp_dummy_set_error(sock_upper_handle_t x, int err)
2316 {
             /*
              * Placeholder "set error" upcall (tenth entry of
              * tcp_dummy_upcalls).  The error value is dropped; nothing is
              * recorded.
              * NOTE(review): the unconditional ASSERT suggests this entry is
              * never expected to be reached -- confirm against the callers.
              */
2317         ASSERT(0);      /* Panic in debug, otherwise ignore. */
2318 }
2319 
2320 /* ARGSUSED */
2321 static void
2322 tcp_dummy_onearg(sock_upper_handle_t x)
2323 {
             /*
              * Shared placeholder for upcalls that take only the upper
              * handle; it fills both the ninth and the eleventh entries of
              * tcp_dummy_upcalls.  The handle is ignored.
              * NOTE(review): which sock_upcalls_t members those two slots
              * are is not visible from this file -- confirm against the
              * struct declaration.
              */
2324         ASSERT(0);      /* Panic in debug, otherwise ignore. */
2325 }
2326 
2327 static sock_upcalls_t tcp_dummy_upcalls = {
             /*
              * Fallback upcall table: tcp_input_data() points its local
              * sockupcalls at this when connp->conn_upcalls is NULL.
              *
              * This is a positional initializer, so each entry lands in the
              * sock_upcalls_t member at the same position.  The struct is
              * declared outside this file; re-check its member order before
              * adding, removing or reordering entries.  tcp_dummy_onearg is
              * deliberately listed twice.
              */
2328         tcp_dummy_newconn,
2329         tcp_dummy_connected,
2330         tcp_dummy_disconnected,
2331         tcp_dummy_opctl,
2332         tcp_dummy_recv,
2333         tcp_dummy_set_proto_props,
2334         tcp_dummy_txq_full,
2335         tcp_dummy_signal_oob,
2336         tcp_dummy_onearg,
2337         tcp_dummy_set_error,
2338         tcp_dummy_onearg
2339 };
2340 
2341 /*
2342  * Handle M_DATA messages from IP. It is called directly from IP via
2343  * squeue for received IP packets.
2344  *
2345  * The first argument is always the connp/tcp to which the mp belongs.
2346  * There are no exceptions to this rule. The caller has already put
2347  * a reference on this connp/tcp and once tcp_input_data() returns,
2348  * the squeue will do the refrele.
2349  *
2350  * TH_SYN segments for the listener go directly to tcp_input_listener via
2351  * squeue. ICMP errors go directly to tcp_icmp_input().
2352  *
2353  * sqp: NULL = recursive, sqp != NULL means called from squeue
2354  */
2355 void
2356 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2357 {
2358         int32_t         bytes_acked;
2359         int32_t         gap;
2360         mblk_t          *mp1;
2361         uint_t          flags;


2363         uchar_t         *iphdr;
2364         uchar_t         *rptr;
2365         int32_t         rgap;
2366         uint32_t        seg_ack;
2367         int             seg_len;
2368         uint_t          ip_hdr_len;
2369         uint32_t        seg_seq;
2370         tcpha_t         *tcpha;
2371         int             urp;
2372         tcp_opt_t       tcpopt;
2373         ip_pkt_t        ipp;
2374         boolean_t       ofo_seg = B_FALSE; /* Out of order segment */
2375         uint32_t        cwnd;
2376         uint32_t        add;
2377         int             npkt;
2378         int             mss;
2379         conn_t          *connp = (conn_t *)arg;
2380         squeue_t        *sqp = (squeue_t *)arg2;
2381         tcp_t           *tcp = connp->conn_tcp;
2382         tcp_stack_t     *tcps = tcp->tcp_tcps;
2383         sock_upcalls_t  *sockupcalls;
2384 
2385         /*
2386          * RST from fused tcp loopback peer should trigger an unfuse.
2387          */
2388         if (tcp->tcp_fused) {
2389                 TCP_STAT(tcps, tcp_fusion_aborted);
2390                 tcp_unfuse(tcp);
2391         }
2392 
2393         iphdr = mp->b_rptr;
2394         rptr = mp->b_rptr;
2395         ASSERT(OK_32PTR(rptr));
2396 
2397         ip_hdr_len = ira->ira_ip_hdr_length;
2398         if (connp->conn_recv_ancillary.crb_all != 0) {
2399                 /*
2400                  * Record packet information in the ip_pkt_t
2401                  */
2402                 ipp.ipp_fields = 0;
2403                 if (ira->ira_flags & IRAF_IS_IPV4) {


2489                 if (tcp->tcp_detached || !pullupmsg(mp, -1)) {
2490                         freemsg(mp);
2491                         return;
2492                 }
2493                 /* Update pointers into message */
2494                 iphdr = rptr = mp->b_rptr;
2495                 tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2496                 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) {
2497                         /*
2498                          * Since we can't handle any data with this urgent
2499                          * pointer that is out of sequence, we expunge
2500                          * the data.  This allows us to still register
2501                          * the urgent mark and generate the M_PCSIG,
2502                          * which we can do.
2503                          */
2504                         mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha);
2505                         seg_len = 0;
2506                 }
2507         }
2508 
2509         sockupcalls = connp->conn_upcalls;
2510         /* A conn_t may have belonged to a now-closed socket.  Be careful. */
2511         if (sockupcalls == NULL)
2512                 sockupcalls = &tcp_dummy_upcalls;
2513 
2514         switch (tcp->tcp_state) {
2515         case TCPS_SYN_SENT:
2516                 if (connp->conn_final_sqp == NULL &&
2517                     tcp_outbound_squeue_switch && sqp != NULL) {
2518                         ASSERT(connp->conn_initial_sqp == connp->conn_sqp);
2519                         connp->conn_final_sqp = sqp;
2520                         if (connp->conn_final_sqp != connp->conn_sqp) {
2521                                 DTRACE_PROBE1(conn__final__sqp__switch,
2522                                     conn_t *, connp);
2523                                 CONN_INC_REF(connp);
2524                                 SQUEUE_SWITCH(connp, connp->conn_final_sqp);
2525                                 SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2526                                     tcp_input_data, connp, ira, ip_squeue_flag,
2527                                     SQTAG_CONNECT_FINISH);
2528                                 return;
2529                         }
2530                         DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp);
2531                 }
2532                 if (flags & TH_ACK) {
2533                         /*


2705                                  */
2706                                 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2707                                         if (tcp->tcp_ack_tid != 0) {
2708                                                 (void) TCP_TIMER_CANCEL(tcp,
2709                                                     tcp->tcp_ack_tid);
2710                                                 tcp->tcp_ack_tid = 0;
2711                                         }
2712                                         tcp_send_data(tcp, ack_mp);
2713                                         BUMP_LOCAL(tcp->tcp_obsegs);
2714                                         TCPS_BUMP_MIB(tcps, tcpOutAck);
2715 
2716                                         if (!IPCL_IS_NONSTR(connp)) {
2717                                                 /* Send up T_CONN_CON */
2718                                                 if (ira->ira_cred != NULL) {
2719                                                         mblk_setcred(mp1,
2720                                                             ira->ira_cred,
2721                                                             ira->ira_cpid);
2722                                                 }
2723                                                 putnext(connp->conn_rq, mp1);
2724                                         } else {
2725                                                 (*sockupcalls->su_connected)

2726                                                     (connp->conn_upper_handle,
2727                                                     tcp->tcp_connid,
2728                                                     ira->ira_cred,
2729                                                     ira->ira_cpid);
2730                                                 freemsg(mp1);
2731                                         }
2732 
2733                                         freemsg(mp);
2734                                         return;
2735                                 }
2736                                 /*
2737                                  * Forget fusion; we need to handle more
2738                                  * complex cases below.  Send the deferred
2739                                  * T_CONN_CON message upstream and proceed
2740                                  * as usual.  Mark this tcp as not capable
2741                                  * of fusion.
2742                                  */
2743                                 TCP_STAT(tcps, tcp_fusion_unfusable);
2744                                 tcp->tcp_unfusable = B_TRUE;
2745                                 if (!IPCL_IS_NONSTR(connp)) {
2746                                         if (ira->ira_cred != NULL) {
2747                                                 mblk_setcred(mp1, ira->ira_cred,
2748                                                     ira->ira_cpid);
2749                                         }
2750                                         putnext(connp->conn_rq, mp1);
2751                                 } else {
2752                                         (*sockupcalls->su_connected)
2753                                             (connp->conn_upper_handle,
2754                                             tcp->tcp_connid, ira->ira_cred,
2755                                             ira->ira_cpid);
2756                                         freemsg(mp1);
2757                                 }
2758                         }
2759 
2760                         /*
2761                          * Check to see if there is data to be sent.  If
2762                          * yes, set the transmit flag.  Then check to see
2763                          * if received data processing needs to be done.
2764                          * If not, go straight to xmit_check.  This short
2765                          * cut is OK as we don't support T/TCP.
2766                          */
2767                         if (tcp->tcp_unsent)
2768                                 flags |= TH_XMIT_NEEDED;
2769 
2770                         if (seg_len == 0 && !(flags & TH_URG)) {
2771                                 freemsg(mp);
2772                                 goto xmit_check;


3106                          * for this connection or if this is a new urgent
3107                          * byte. Also send a zero-length "unmarked" message
3108                          * to inform SIOCATMARK that this is not the mark.
3109                          *
3110                          * tcp_urp_last_valid is cleared when the T_exdata_ind
3111                          * is sent up. This plus the check for old data
3112                          * (gap >= 0) handles the wraparound of the sequence
3113                          * number space without having to always track the
3114                          * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks
3115                          * this max in its rcv_up variable).
3116                          *
3117                          * This prevents duplicate SIGURGS due to a "late"
3118                          * zero-window probe when the T_EXDATA_IND has already
3119                          * been sent up.
3120                          */
3121                         if ((flags & TH_URG) &&
3122                             (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq,
3123                             tcp->tcp_urp_last))) {
3124                                 if (IPCL_IS_NONSTR(connp)) {
3125                                         if (!TCP_IS_DETACHED(tcp)) {
3126                                                 (*sockupcalls->su_signal_oob)

3127                                                     (connp->conn_upper_handle,
3128                                                     urp);
3129                                         }
3130                                 } else {
3131                                         mp1 = allocb(0, BPRI_MED);
3132                                         if (mp1 == NULL) {
3133                                                 freemsg(mp);
3134                                                 return;
3135                                         }
3136                                         if (!TCP_IS_DETACHED(tcp) &&
3137                                             !putnextctl1(connp->conn_rq,
3138                                             M_PCSIG, SIGURG)) {
3139                                                 /* Try again on the rexmit. */
3140                                                 freemsg(mp1);
3141                                                 freemsg(mp);
3142                                                 return;
3143                                         }
3144                                         /*
3145                                          * If the next byte would be the mark
3146                                          * then mark with MARKNEXT else mark


3384          */
3385         if (flags & TH_URG && urp >= 0) {
3386                 if (!tcp->tcp_urp_last_valid ||
3387                     SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3388                         /*
3389                          * Non-STREAMS sockets handle the urgent data a little
3390                          * differently from STREAMS based sockets. There is no
3391                          * need to mark any mblks with the MSG{NOT,}MARKNEXT
3392                          * flags to keep SIOCATMARK happy. Instead a
3393                          * su_signal_oob upcall is made to update the mark.
3394                          * Neither is a T_EXDATA_IND mblk needed to be
3395                          * prepended to the urgent data. The urgent data is
3396                          * delivered using the su_recv upcall, where we set
3397                          * the MSG_OOB flag to indicate that it is urg data.
3398                          *
3399                          * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3400                          * are used by non-STREAMS sockets.
3401                          */
3402                         if (IPCL_IS_NONSTR(connp)) {
3403                                 if (!TCP_IS_DETACHED(tcp)) {
3404                                         (*sockupcalls->su_signal_oob)
3405                                             (connp->conn_upper_handle, urp);
3406                                 }
3407                         } else {
3408                                 /*
3409                                  * If we haven't generated the signal yet for
3410                                  * this urgent pointer value, do it now.  Also,
3411                                  * send up a zero-length M_DATA indicating
3412                                  * whether or not this is the mark. The latter
3413                                  * is not needed when a T_EXDATA_IND is sent up.
3414                                  * However, if there are allocation failures
3415                                  * this code relies on the sender retransmitting
3416                                  * and the socket code for determining the mark
3417                                  * should not block waiting for the peer to
3418                                  * transmit. Thus, for simplicity we always
3419                                  * send up the mark indication.
3420                                  */
3421                                 mp1 = allocb(0, BPRI_MED);
3422                                 if (mp1 == NULL) {
3423                                         freemsg(mp);
3424                                         return;


3543                                          * the remainder back in will cause a
3544                                          * loop. In this case, drop the
3545                                          * packet and let the sender try
3546                                          * sending a good packet.
3547                                          */
3548                                         if (tmp_rnxt == tcp->tcp_rnxt) {
3549                                                 freemsg(mp);
3550                                                 return;
3551                                         }
3552                                 }
3553                                 tcp_input_data(connp, mp, NULL, ira);
3554                                 return;
3555                         }
3556                         /*
3557                          * This segment contains only the urgent byte.  We
3558                          * have to allocate the T_exdata_ind, if we can.
3559                          */
3560                         if (IPCL_IS_NONSTR(connp)) {
3561                                 int error;
3562 
3563                                 (*sockupcalls->su_recv)
3564                                     (connp->conn_upper_handle, mp, seg_len,
3565                                     MSG_OOB, &error, NULL);
3566                                 /*
3567                                  * We should never be in the middle of a
3568                                  * fallback, the squeue guarantees that.
3569                                  */
3570                                 ASSERT(error != EOPNOTSUPP);
3571                                 mp = NULL;
3572                                 goto update_ack;
3573                         } else if (!tcp->tcp_urp_mp) {
3574                                 struct T_exdata_ind *tei;
3575                                 mp1 = allocb(sizeof (struct T_exdata_ind),
3576                                     BPRI_MED);
3577                                 if (!mp1) {
3578                                         /*
3579                                          * Sigh... It'll be back.
3580                                          * Generate any MSG*MARK message now.
3581                                          */
3582                                         freemsg(mp);
3583                                         seg_len = 0;


4722                     tcp_display(tcp, NULL, DISP_PORT_ONLY));
4723 #endif /* DEBUG */
4724         }
4725 
4726         /*
4727          * Check for ancillary data changes compared to last segment.
4728          */
4729         if (connp->conn_recv_ancillary.crb_all != 0) {
4730                 mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira);
4731                 if (mp == NULL)
4732                         return;
4733         }
4734 
4735         if (IPCL_IS_NONSTR(connp)) {
4736                 /*
4737                  * Non-STREAMS socket
4738                  */
4739                 boolean_t push = flags & (TH_PUSH|TH_FIN);
4740                 int error;
4741 
4742                 if ((*sockupcalls->su_recv)(connp->conn_upper_handle,

4743                     mp, seg_len, 0, &error, &push) <= 0) {
4744                         /*
4745                          * We should never be in the middle of a
4746                          * fallback, the squeue guarantees that.
4747                          */
4748                         ASSERT(error != EOPNOTSUPP);
4749                         if (error == ENOSPC)
4750                                 tcp->tcp_rwnd -= seg_len;
4751                 } else if (push) {
4752                         /* PUSH bit set and sockfs is not flow controlled */
4753                         flags |= tcp_rwnd_reopen(tcp);
4754                 }
4755         } else if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) {
4756                 /*
4757                  * Side queue inbound data until the accept happens.
4758                  * tcp_accept/tcp_rput drains this when the accept happens.
4759                  * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or
4760                  * T_EXDATA_IND) it is queued on b_next.
4761                  * XXX Make urgent data use this. Requires:
4762                  *      Removing tcp_listener check for TH_URG


4964                             tcp->tcp_localnet ?
4965                             tcps->tcps_local_dack_interval :
4966                             tcps->tcps_deferred_ack_interval);
4967                 }
4968         }
4969         if (flags & TH_ORDREL_NEEDED) {
4970                 /*
4971                  * Notify upper layer about an orderly release. If this is
4972                  * a non-STREAMS socket, then just make an upcall. For STREAMS
4973                  * we send up an ordrel_ind, unless this is an eager, in which
4974                  * case the ordrel will be sent when tcp_accept_finish runs.
4975                  * Note that for non-STREAMS we make an upcall even if it is an
4976                  * eager, because we have an upper handle to send it to.
4977                  */
4978                 ASSERT(IPCL_IS_NONSTR(connp) || tcp->tcp_listener == NULL);
4979                 ASSERT(!tcp->tcp_detached);
4980 
4981                 if (IPCL_IS_NONSTR(connp)) {
4982                         ASSERT(tcp->tcp_ordrel_mp == NULL);
4983                         tcp->tcp_ordrel_done = B_TRUE;
4984                         (*sockupcalls->su_opctl)(connp->conn_upper_handle,
4985                             SOCK_OPCTL_SHUT_RECV, 0);
4986                         goto done;
4987                 }
4988 
4989                 if (tcp->tcp_rcv_list != NULL) {
4990                         /*
4991                          * Push any mblk(s) enqueued from co processing.
4992                          */
4993                         flags |= tcp_rcv_drain(tcp);
4994                 }
4995                 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
4996 
4997                 mp1 = tcp->tcp_ordrel_mp;
4998                 tcp->tcp_ordrel_mp = NULL;
4999                 tcp->tcp_ordrel_done = B_TRUE;
5000                 putnext(connp->conn_rq, mp1);
5001         }
5002 done:
5003         ASSERT(!(flags & TH_MARKNEXT_NEEDED));
5004 }
5005