Print this page
    
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/ip/icmp.c
          +++ new/usr/src/uts/common/inet/ip/icmp.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
  24   24   * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  25   25   * Copyright 2016 Joyent, Inc.
  26   26   */
  27   27  /* Copyright (c) 1990 Mentat Inc. */
  28   28  
  29   29  #include <sys/types.h>
  30   30  #include <sys/stream.h>
  31   31  #include <sys/stropts.h>
  32   32  #include <sys/strlog.h>
  33   33  #include <sys/strsun.h>
  34   34  #define _SUN_TPI_VERSION 2
  35   35  #include <sys/tihdr.h>
  36   36  #include <sys/timod.h>
  37   37  #include <sys/ddi.h>
  38   38  #include <sys/sunddi.h>
  39   39  #include <sys/strsubr.h>
  40   40  #include <sys/suntpi.h>
  41   41  #include <sys/xti_inet.h>
  42   42  #include <sys/cmn_err.h>
  43   43  #include <sys/kmem.h>
  44   44  #include <sys/cred.h>
  45   45  #include <sys/policy.h>
  46   46  #include <sys/priv.h>
  47   47  #include <sys/ucred.h>
  48   48  #include <sys/zone.h>
  49   49  
  50   50  #include <sys/sockio.h>
  51   51  #include <sys/socket.h>
  52   52  #include <sys/socketvar.h>
  53   53  #include <sys/vtrace.h>
  54   54  #include <sys/sdt.h>
  55   55  #include <sys/debug.h>
  56   56  #include <sys/isa_defs.h>
  57   57  #include <sys/random.h>
  58   58  #include <netinet/in.h>
  59   59  #include <netinet/ip6.h>
  60   60  #include <netinet/icmp6.h>
  61   61  #include <netinet/udp.h>
  62   62  
  63   63  #include <inet/common.h>
  64   64  #include <inet/ip.h>
  65   65  #include <inet/ip_impl.h>
  66   66  #include <inet/ipsec_impl.h>
  67   67  #include <inet/ip6.h>
  68   68  #include <inet/ip_ire.h>
  69   69  #include <inet/ip_if.h>
  70   70  #include <inet/ip_multi.h>
  71   71  #include <inet/ip_ndp.h>
  72   72  #include <inet/proto_set.h>
  73   73  #include <inet/mib2.h>
  74   74  #include <inet/nd.h>
  75   75  #include <inet/optcom.h>
  76   76  #include <inet/snmpcom.h>
  77   77  #include <inet/kstatcom.h>
  78   78  #include <inet/ipclassifier.h>
  79   79  
  80   80  #include <sys/tsol/label.h>
  81   81  #include <sys/tsol/tnet.h>
  82   82  
  83   83  #include <inet/rawip_impl.h>
  84   84  #include <net/bpf.h>
  85   85  
  86   86  #include <sys/disp.h>
  87   87  
  88   88  /*
  89   89   * Synchronization notes:
  90   90   *
  91   91   * RAWIP is MT and uses the usual kernel synchronization primitives. We use
  92   92   * conn_lock to protect the icmp_t.
  93   93   *
  94   94   * Plumbing notes:
  95   95   * ICMP is always a device driver. For compatibility with mibopen() code
  96   96   * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
  97   97   * dummy module.
  98   98   */
  99   99  static void     icmp_addr_req(queue_t *q, mblk_t *mp);
           /*
            * File-local (static) forward declarations for the ICMP/raw IP
            * routines.  Prototypes in this list that omit "static" have
            * external linkage and can be called from other files.
            */
 100  100  static void     icmp_tpi_bind(queue_t *q, mblk_t *mp);
 101  101  static void     icmp_bind_proto(icmp_t *icmp);
 102  102  static int      icmp_build_hdr_template(conn_t *, const in6_addr_t *,
 103  103      const in6_addr_t *, uint32_t);
 104  104  static void     icmp_capability_req(queue_t *q, mblk_t *mp);
 105  105  static int      icmp_close(queue_t *q, int flags);
 106  106  static void     icmp_close_free(conn_t *);
 107  107  static void     icmp_tpi_connect(queue_t *q, mblk_t *mp);
 108  108  static void     icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
 109  109  static void     icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
 110  110      int sys_error);
 111  111  static void     icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
 112  112      t_scalar_t tlierr, int sys_error);
 113  113  static void     icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
 114  114      ip_recv_attr_t *);
 115  115  static void     icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
 116  116      ip_recv_attr_t *);
 117  117  static void     icmp_info_req(queue_t *q, mblk_t *mp);
 118  118  static void     icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 119  119  static conn_t   *icmp_open(int family, cred_t *credp, int *err, int flags);
 120  120  static int      icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
 121  121                      cred_t *credp);
 122  122  static int      icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
 123  123                      cred_t *credp);
 124  124  static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
           /* Not static: icmp_opt_set() and icmp_opt_get() have external linkage. */
 125  125  int             icmp_opt_set(conn_t *connp, uint_t optset_context,
 126  126                      int level, int name, uint_t inlen,
 127  127                      uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
 128  128                      void *thisdg_attrs, cred_t *cr);
 129  129  int             icmp_opt_get(conn_t *connp, int level, int name,
 130  130                      uchar_t *ptr);
 131  131  static int      icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
 132  132                      sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
 133  133  static mblk_t   *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
 134  134      const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
 135  135  static mblk_t   *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
 136  136      mblk_t *, const in6_addr_t *, uint32_t, int *);
 137  137  static int      icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
 138  138                      uchar_t *ptr, int len);
 139  139  static void     icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
 140  140  static void     icmp_tpi_unbind(queue_t *q, mblk_t *mp);
 141  141  static void     icmp_wput(queue_t *q, mblk_t *mp);
 142  142  static void     icmp_wput_fallback(queue_t *q, mblk_t *mp);
 143  143  static void     icmp_wput_other(queue_t *q, mblk_t *mp);
 144  144  static void     icmp_wput_iocdata(queue_t *q, mblk_t *mp);
 145  145  static void     icmp_wput_restricted(queue_t *q, mblk_t *mp);
 146  146  static void     icmp_ulp_recv(conn_t *, mblk_t *, uint_t);
 147  147  
           /* Per-netstack (netstackid_t) init/fini/shutdown and kstat callbacks. */
 148  148  static void     *rawip_stack_init(netstackid_t stackid, netstack_t *ns);
 149  149  static void     rawip_stack_fini(netstackid_t stackid, void *arg);
 150  150  
 151  151  static void     *rawip_kstat_init(netstackid_t stackid);
 152  152  static void     rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
 153  153  static int      rawip_kstat_update(kstat_t *kp, int rw);
 154  154  static void     rawip_stack_shutdown(netstackid_t stackid, void *arg);
 155  155  
 156  156  /* Common routines for TPI and socket module */
 157  157  static conn_t   *rawip_do_open(int, cred_t *, int *, int);
 158  158  static void     rawip_do_close(conn_t *);
 159  159  static int      rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
 160  160  static int      rawip_do_unbind(conn_t *);
 161  161  static int      rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
 162  162      cred_t *, pid_t);
 163  163  
           /*
            * Not static: these two take a sock_lower_handle_t and have external
            * linkage.
            */
 164  164  int             rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
 165  165                      socklen_t *, cred_t *);
 166  166  int             rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
 167  167                      socklen_t *, cred_t *);
 168  168  
 169  169  static struct module_info icmp_mod_info =  {
                   /*
                    * Positional initializer shared by every qinit in this file.
                    * Field order per module_info(9S): module id, module name,
                    * minimum packet size, maximum packet size, high water mark,
                    * low water mark.
                    */
 170  170          5707, "icmp", 1, INFPSZ, 512, 128
 171  171  };
 172  172  
 173  173  /*
 174  174   * Entry points for ICMP as a device.
 175  175   * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
            *
            * The qinit initializers below are positional.  Per qinit(9S) the field
            * order is: put, service, open, close, admin, module_info.  The two
            * read-side tables supply only open/close and differ only in the open
            * routine; the write-side table supplies only put and service.
 176  176   */
 177  177  static struct qinit icmprinitv4 = {
 178  178          NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
 179  179  };
 180  180  
 181  181  static struct qinit icmprinitv6 = {
 182  182          NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
 183  183  };
 184  184  
 185  185  static struct qinit icmpwinit = {
 186  186          (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
 187  187  };
 188  188  
 189  189  /* ICMP entry point during fallback */
           /* Only a put routine (icmp_wput_fallback) is supplied; no service routine. */
 190  190  static struct qinit icmp_fallback_sock_winit = {
 191  191          (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
 192  192  };
 193  193  
 194  194  /* For AF_INET aka /dev/icmp */
           /*
            * Both streamtabs share the write side (icmpwinit) and differ only in
            * the read-side qinit, i.e. in which open routine is used.  Neither is
            * static, so both symbols are visible outside this file.
            */
 195  195  struct streamtab icmpinfov4 = {
 196  196          &icmprinitv4, &icmpwinit
 197  197  };
 198  198  
 199  199  /* For AF_INET6 aka /dev/icmp6 */
 200  200  struct streamtab icmpinfov6 = {
 201  201          &icmprinitv6, &icmpwinit
 202  202  };
 203  203  
 204  204  /* Default structure copied into T_INFO_ACK messages */
           /*
            * Positional initializer for struct T_info_ack.  As the field comments
            * say, ADDR_size and OPT_size are left 0 here and CURRENT_state is
            * replaced from icmp_state when an ack is built.
            */
 205  205  static struct T_info_ack icmp_g_t_info_ack = {
 206  206          T_INFO_ACK,     /* PRIM_type */
 207  207          IP_MAXPACKET,    /* TSDU_size.  icmp allows maximum size messages. */
 208  208          T_INVALID,      /* ETSDU_size.  icmp does not support expedited data. */
 209  209          T_INVALID,      /* CDATA_size. icmp does not support connect data. */
 210  210          T_INVALID,      /* DDATA_size. icmp does not support disconnect data. */
 211  211          0,              /* ADDR_size - filled in later. */
 212  212          0,              /* OPT_size - not initialized here */
 213  213          IP_MAXPACKET,   /* TIDU_size.  icmp allows maximum size messages. */
 214  214          T_CLTS,         /* SERV_type.  icmp supports connection-less. */
 215  215          TS_UNBND,       /* CURRENT_state.  This is set from icmp_state. */
 216  216          (XPG4_1|SENDZERO) /* PROVIDER_flag */
 217  217  };
 218  218  
 219  219  static int
 220  220  icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo,
 221  221      const char *ifname, const void *pval, uint_t flags)
 222  222  {
 223  223          return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl,
 224  224              stack, cr, pinfo, ifname, pval, flags));
 225  225  }
 226  226  
 227  227  static int
 228  228  icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname,
 229  229      void *val, uint_t psize, uint_t flags)
 230  230  {
 231  231          return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack,
 232  232              pinfo, ifname, val, psize, flags));
 233  233  }
 234  234  
 235  235  /*
 236  236   * All of these are alterable, within the min/max values given, at run time.
 237  237   *
 238  238   * Note: All those tunables which do not start with "icmp_" are Committed and
 239  239   * therefore are public. See PSARC 2010/080.
            *
            * NOTE(review): no name in this table starts with "icmp_"; the
            * non-public entries use a leading "_" instead.  The prefix named above
            * looks stale -- confirm against PSARC 2010/080.
            *
            * The position of each entry matters: the is_* accessor macros that
            * follow this table index it by number, so entries must not be
            * reordered or inserted without updating those macros.
            *
            * Each entry is { name, protocol, set routine, get routine, values,
            * default }.  The numeric triples are presumably { min, max, current }
            * -- confirm against the mod_prop_info_t definition.
 240  240   */
 241  241  static mod_prop_info_t icmp_propinfo_tbl[] = {
 242  242          /* tunable - 0: is_wroff_extra */
 243  243          { "_wroff_extra", MOD_PROTO_RAWIP,
 244  244              mod_set_uint32, mod_get_uint32,
 245  245              {0, 128, 32}, {32} },
 246  246  
                   /* tunable - 1: is_ipv4_ttl */
 247  247          { "_ipv4_ttl", MOD_PROTO_RAWIP,
 248  248              mod_set_uint32, mod_get_uint32,
 249  249              {1, 255, 255}, {255} },
 250  250  
                   /* tunable - 2: is_ipv6_hoplimit */
 251  251          { "_ipv6_hoplimit", MOD_PROTO_RAWIP,
 252  252              mod_set_uint32, mod_get_uint32,
 253  253              {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
 254  254              {IPV6_DEFAULT_HOPS} },
 255  255  
                   /* tunable - 3: is_bsd_compat */
 256  256          { "_bsd_compat", MOD_PROTO_RAWIP,
 257  257              mod_set_boolean, mod_get_boolean,
 258  258              {B_TRUE}, {B_TRUE} },
 259  259  
                   /* tunable - 4: is_xmit_hiwat */
 260  260          { "send_buf", MOD_PROTO_RAWIP,
 261  261              icmp_set_buf_prop, icmp_get_buf_prop,
 262  262              {4096, 65536, 8192}, {8192} },
 263  263  
                   /* tunable - 5: is_xmit_lowat */
 264  264          { "_xmit_lowat", MOD_PROTO_RAWIP,
 265  265              mod_set_uint32, mod_get_uint32,
 266  266              {0, 65536, 1024}, {1024} },
 267  267  
                   /* tunable - 6: is_recv_hiwat */
 268  268          { "recv_buf", MOD_PROTO_RAWIP,
 269  269              icmp_set_buf_prop, icmp_get_buf_prop,
 270  270              {4096, 65536, 8192}, {8192} },
 271  271  
                   /* tunable - 7: is_max_buf */
 272  272          { "max_buf", MOD_PROTO_RAWIP,
 273  273              mod_set_uint32, mod_get_uint32,
 274  274              {65536, ULP_MAX_BUF, 256*1024}, {256*1024} },
 275  275  
                   /* tunable - 8: is_pmtu_discovery */
 276  276          { "_pmtu_discovery", MOD_PROTO_RAWIP,
 277  277              mod_set_boolean, mod_get_boolean,
 278  278              {B_FALSE}, {B_FALSE} },
 279  279  
                   /* tunable - 9: is_sendto_ignerr */
 280  280          { "_sendto_ignerr", MOD_PROTO_RAWIP,
 281  281              mod_set_boolean, mod_get_boolean,
 282  282              {B_FALSE}, {B_FALSE} },
 283  283  
                   /* "?" has no set routine; its get routine is mod_get_allprop. */
 284  284          { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },
 285  285  
                   /* All-NULL entry; presumably the end-of-table marker. */
 286  286          { NULL, 0, NULL, NULL, {0}, {0} }
 287  287  };
 288  288  
           /*
            * Shorthand for the current value of each tunable.  These expand to a
            * member expression, so they are used as "<ptr>->is_xxx" on whatever
            * structure holds is_propinfo_tbl (reached in this file as
            * stack->netstack_icmp->is_propinfo_tbl).  The indices follow the
            * entry order of icmp_propinfo_tbl: [4] is "send_buf", [6] is
            * "recv_buf", and so on.  _uval is used for the uint32 entries and
            * _bval for the boolean ones.
            */
 289  289  #define is_wroff_extra                  is_propinfo_tbl[0].prop_cur_uval
 290  290  #define is_ipv4_ttl                     is_propinfo_tbl[1].prop_cur_uval
 291  291  #define is_ipv6_hoplimit                is_propinfo_tbl[2].prop_cur_uval
 292  292  #define is_bsd_compat                   is_propinfo_tbl[3].prop_cur_bval
 293  293  #define is_xmit_hiwat                   is_propinfo_tbl[4].prop_cur_uval
 294  294  #define is_xmit_lowat                   is_propinfo_tbl[5].prop_cur_uval
 295  295  #define is_recv_hiwat                   is_propinfo_tbl[6].prop_cur_uval
 296  296  #define is_max_buf                      is_propinfo_tbl[7].prop_cur_uval
 297  297  #define is_pmtu_discovery               is_propinfo_tbl[8].prop_cur_bval
 298  298  #define is_sendto_ignerr                is_propinfo_tbl[9].prop_cur_bval
 299  299  
 300  300  typedef union T_primitives *t_primp_t;  /* pointer to a union T_primitives */
 301  301  
 302  302  /*
 303  303   * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
 304  304   * passed to icmp_wput.
 305  305   * It calls IP to verify the local IP address, and calls IP to insert
 306  306   * the conn_t in the fanout table.
 307  307   * If everything is ok it then sends the T_BIND_ACK back up.
            *
            * q is the queue the request arrived on and mp is the request message.
            * Nothing is returned: every failure is answered with icmp_err_ack(),
            * and on success the request mblk itself (possibly reallocated) is
            * rewritten into the T_BIND_ACK and sent back with qreply().  The
            * actual bind work is done by rawip_do_bind().
 308  308   */
 309  309  static void
 310  310  icmp_tpi_bind(queue_t *q, mblk_t *mp)
 311  311  {
 312  312          int     error;
 313  313          struct sockaddr *sa;
 314  314          struct T_bind_req *tbr;
 315  315          socklen_t       len;
 316  316          sin_t   *sin;
 317  317          sin6_t  *sin6;
 318  318          icmp_t          *icmp;
 319  319          conn_t  *connp = Q_TO_CONN(q);
 320  320          mblk_t *mp1;
 321  321          cred_t *cr;
 322  322  
 323  323          /*
 324  324           * All Solaris components should pass a db_credp
 325  325           * for this TPI message, hence we ASSERT.
 326  326           * But in case there is some other M_PROTO that looks
 327  327           * like a TPI message sent by some other kernel
 328  328           * component, we check and return an error.
 329  329           */
 330  330          cr = msg_getcred(mp, NULL);
 331  331          ASSERT(cr != NULL);
 332  332          if (cr == NULL) {
 333  333                  icmp_err_ack(q, mp, TSYSERR, EINVAL);
 334  334                  return;
 335  335          }
 336  336  
 337  337          icmp = connp->conn_icmp;
                   /* The message must hold at least a complete T_bind_req. */
 338  338          if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
 339  339                  (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 340  340                      "icmp_bind: bad req, len %u",
 341  341                      (uint_t)(mp->b_wptr - mp->b_rptr));
 342  342                  icmp_err_ack(q, mp, TPROTO, 0);
 343  343                  return;
 344  344          }
 345  345  
                   /*
                    * Early state check made without conn_lock; rawip_do_bind()
                    * repeats the TS_UNBND check with the lock held.
                    */
 346  346          if (icmp->icmp_state != TS_UNBND) {
 347  347                  (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 348  348                      "icmp_bind: bad state, %u", icmp->icmp_state);
 349  349                  icmp_err_ack(q, mp, TOUTSTATE, 0);
 350  350                  return;
 351  351          }
 352  352  
 353  353          /*
 354  354           * Reallocate the message to make sure we have enough room for an
 355  355           * address.
                    * The size requested covers a T_bind_ack plus the larger of the
                    * two address forms (sin6_t).  On failure the original mp is
                    * still used for the error ack.
 356  356           */
 357  357          mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
 358  358          if (mp1 == NULL) {
 359  359                  icmp_err_ack(q, mp, TSYSERR, ENOMEM);
 360  360                  return;
 361  361          }
 362  362          mp = mp1;
 363  363  
 364  364          /* Reset the message type in preparation for shipping it back. */
 365  365          DB_TYPE(mp) = M_PCPROTO;
 366  366          tbr = (struct T_bind_req *)mp->b_rptr;
 367  367          len = tbr->ADDR_length;
 368  368          switch (len) {
 369  369          case 0: /* request for a generic port */
                           /*
                            * No address was supplied: build one for the conn's
                            * address family, initialized from sin_null/sin6_null,
                            * directly behind the T_bind_req header and extend
                            * b_wptr to cover it so it goes back in the ack.
                            */
 370  370                  tbr->ADDR_offset = sizeof (struct T_bind_req);
 371  371                  if (connp->conn_family == AF_INET) {
 372  372                          tbr->ADDR_length = sizeof (sin_t);
 373  373                          sin = (sin_t *)&tbr[1];
 374  374                          *sin = sin_null;
 375  375                          sin->sin_family = AF_INET;
 376  376                          mp->b_wptr = (uchar_t *)&sin[1];
 377  377                          sa = (struct sockaddr *)sin;
 378  378                          len = sizeof (sin_t);
 379  379                  } else {
 380  380                          ASSERT(connp->conn_family == AF_INET6);
 381  381                          tbr->ADDR_length = sizeof (sin6_t);
 382  382                          sin6 = (sin6_t *)&tbr[1];
 383  383                          *sin6 = sin6_null;
 384  384                          sin6->sin6_family = AF_INET6;
 385  385                          mp->b_wptr = (uchar_t *)&sin6[1];
 386  386                          sa = (struct sockaddr *)sin6;
 387  387                          len = sizeof (sin6_t);
 388  388                  }
 389  389                  break;
 390  390  
 391  391          case sizeof (sin_t):    /* Complete IPv4 address */
                           /*
                            * The mi_offset_param() result is not checked here;
                            * rawip_do_bind() rejects a NULL sa with EINVAL.
                            */
 392  392                  sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
 393  393                      sizeof (sin_t));
 394  394                  break;
 395  395  
 396  396          case sizeof (sin6_t):   /* Complete IPv6 address */
 397  397                  sa = (struct sockaddr *)mi_offset_param(mp,
 398  398                      tbr->ADDR_offset, sizeof (sin6_t));
 399  399                  break;
 400  400  
 401  401          default:
 402  402                  (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 403  403                      "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
 404  404                  icmp_err_ack(q, mp, TBADADDR, 0);
 405  405                  return;
 406  406          }
 407  407  
                   /*
                    * rawip_do_bind() returns 0 on success, a positive errno value,
                    * or a negated TLI error code (e.g. -TOUTSTATE).  Positive
                    * values are reported as TSYSERR with the errno; negative ones
                    * are negated back into the TLI error.
                    */
 408  408          error = rawip_do_bind(connp, sa, len);
 409  409          if (error != 0) {
 410  410                  if (error > 0) {
 411  411                          icmp_err_ack(q, mp, TSYSERR, error);
 412  412                  } else {
 413  413                          icmp_err_ack(q, mp, -error, 0);
 414  414                  }
 415  415          } else {
                           /* Reuse the request, with its address, as the ack. */
 416  416                  tbr->PRIM_type = T_BIND_ACK;
 417  417                  qreply(q, mp);
 418  418          }
 419  419  }
 420  420  
           /*
            * Bind connp to the local address described by sa/len.  Called from
            * icmp_tpi_bind() in this file.
            *
            * len must be sizeof (sin_t) or sizeof (sin6_t), and the sockaddr family
            * must match connp->conn_family.  A non-wildcard address is checked
            * with ip_laddr_verify_v4()/ip_laddr_verify_v6().  With conn_lock held
            * the address, port and scope id are recorded in connp, the state moves
            * to TS_IDLE and an initial header template is built.  The lock is then
            * dropped and the conn is inserted with ip_laddr_fanout_insert().
            *
            * Returns 0 on success; a positive errno value (EINVAL, EAFNOSUPPORT,
            * EADDRNOTAVAIL, or whatever icmp_build_hdr_template() or
            * ip_laddr_fanout_insert() returned); or the negated TLI error
            * -TOUTSTATE when the endpoint is not in TS_UNBND.  Failures after the
            * state change are unwound at late_error, which puts the endpoint back
            * into TS_UNBND.
            */
 421  421  static int
 422  422  rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
 423  423  {
 424  424          sin_t           *sin;
 425  425          sin6_t          *sin6;
 426  426          icmp_t          *icmp = connp->conn_icmp;
 427  427          int             error = 0;
                   /* Kept as IPVL_UNICAST_UP when the address is the wildcard. */
 428  428          ip_laddr_t      laddr_type = IPVL_UNICAST_UP;   /* INADDR_ANY */
 429  429          in_port_t       lport;          /* Network byte order */
 430  430          ipaddr_t        v4src;          /* Set if AF_INET */
 431  431          in6_addr_t      v6src;
 432  432          uint_t          scopeid = 0;
 433  433          zoneid_t        zoneid = IPCL_ZONEID(connp);
 434  434          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 435  435  
 436  436          if (sa == NULL || !OK_32PTR((char *)sa)) {
 437  437                  return (EINVAL);
 438  438          }
 439  439  
 440  440          switch (len) {
 441  441          case sizeof (sin_t):    /* Complete IPv4 address */
 442  442                  sin = (sin_t *)sa;
 443  443                  if (sin->sin_family != AF_INET ||
 444  444                      connp->conn_family != AF_INET) {
 445  445                          /* TSYSERR, EAFNOSUPPORT */
 446  446                          return (EAFNOSUPPORT);
 447  447                  }
 448  448                  v4src = sin->sin_addr.s_addr;
                           /* The address is kept in v4-mapped form in v6src. */
 449  449                  IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
 450  450                  if (v4src != INADDR_ANY) {
 451  451                          laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
 452  452                              B_TRUE);
 453  453                  }
 454  454                  lport = sin->sin_port;
 455  455                  break;
 456  456          case sizeof (sin6_t): /* Complete IPv6 address */
 457  457                  sin6 = (sin6_t *)sa;
 458  458                  if (sin6->sin6_family != AF_INET6 ||
 459  459                      connp->conn_family != AF_INET6) {
 460  460                          /* TSYSERR, EAFNOSUPPORT */
 461  461                          return (EAFNOSUPPORT);
 462  462                  }
 463  463                  /* No support for mapped addresses on raw sockets */
 464  464                  if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 465  465                          /* TSYSERR, EADDRNOTAVAIL */
 466  466                          return (EADDRNOTAVAIL);
 467  467                  }
 468  468                  v6src = sin6->sin6_addr;
 469  469                  if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
                                   /* sin6_scope_id is honoured only for link-scope. */
 470  470                          if (IN6_IS_ADDR_LINKSCOPE(&v6src))
 471  471                                  scopeid = sin6->sin6_scope_id;
 472  472                          laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
 473  473                              B_TRUE, scopeid);
 474  474                  }
 475  475                  lport = sin6->sin6_port;
 476  476                  break;
 477  477  
 478  478          default:
 479  479                  /* TBADADDR */
 480  480                  return (EADDRNOTAVAIL);
 481  481          }
 482  482  
 483  483          /* Is the local address a valid unicast, multicast, or broadcast? */
 484  484          if (laddr_type == IPVL_BAD)
 485  485                  return (EADDRNOTAVAIL);
 486  486  
 487  487          /*
 488  488           * The state must be TS_UNBND.
 489  489           */
 490  490          mutex_enter(&connp->conn_lock);
 491  491          if (icmp->icmp_state != TS_UNBND) {
 492  492                  mutex_exit(&connp->conn_lock);
 493  493                  return (-TOUTSTATE);
 494  494          }
 495  495  
 496  496          /*
 497  497           * Copy the source address into our icmp structure.  This address
 498  498           * may still be zero; if so, ip will fill in the correct address
 499  499           * each time an outbound packet is passed to it.
 500  500           * If we are binding to a broadcast or multicast address then
 501  501           * we just set the conn_bound_addr since we don't want to use
 502  502           * that as the source address when sending.
 503  503           */
 504  504          connp->conn_bound_addr_v6 = v6src;
 505  505          connp->conn_laddr_v6 = v6src;
 506  506          if (scopeid != 0) {
 507  507                  connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
 508  508                  connp->conn_ixa->ixa_scopeid = scopeid;
 509  509                  connp->conn_incoming_ifindex = scopeid;
 510  510          } else {
 511  511                  connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 512  512                  connp->conn_incoming_ifindex = connp->conn_bound_if;
 513  513          }
 514  514  
                   /*
                    * IPVL_BAD was rejected above.  There is no default case, so any
                    * other value leaves conn_saddr_v6 and conn_mcbc_bind untouched.
                    */
 515  515          switch (laddr_type) {
 516  516          case IPVL_UNICAST_UP:
 517  517          case IPVL_UNICAST_DOWN:
 518  518                  connp->conn_saddr_v6 = v6src;
 519  519                  connp->conn_mcbc_bind = B_FALSE;
 520  520                  break;
 521  521          case IPVL_MCAST:
 522  522          case IPVL_BCAST:
 523  523                  /* ip_set_destination will pick a source address later */
 524  524                  connp->conn_saddr_v6 = ipv6_all_zeros;
 525  525                  connp->conn_mcbc_bind = B_TRUE;
 526  526                  break;
 527  527          }
 528  528  
 529  529          /* Any errors after this point should use late_error */
 530  530  
 531  531          /*
 532  532           * Use sin_port/sin6_port since applications like psh use SOCK_RAW
 533  533           * with IPPROTO_TCP.
 534  534           */
 535  535          connp->conn_lport = lport;
 536  536          connp->conn_fport = 0;
 537  537  
 538  538          if (connp->conn_family == AF_INET) {
 539  539                  ASSERT(connp->conn_ipversion == IPV4_VERSION);
 540  540          } else {
 541  541                  ASSERT(connp->conn_ipversion == IPV6_VERSION);
 542  542          }
 543  543  
 544  544          icmp->icmp_state = TS_IDLE;
 545  545  
 546  546          /*
 547  547           * We create an initial header template here to make a subsequent
 548  548           * sendto have a starting point. Since conn_last_dst is zero the
 549  549           * first sendto will always follow the 'dst changed' code path.
 550  550           * Note that we defer massaging options and the related checksum
 551  551           * adjustment until we have a destination address.
 552  552           */
 553  553          error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 554  554              &connp->conn_faddr_v6, connp->conn_flowinfo);
 555  555          if (error != 0) {
                           /* late_error takes conn_lock itself, so drop it first. */
 556  556                  mutex_exit(&connp->conn_lock);
 557  557                  goto late_error;
 558  558          }
 559  559          /* Just in case */
 560  560          connp->conn_faddr_v6 = ipv6_all_zeros;
 561  561          connp->conn_v6lastdst = ipv6_all_zeros;
 562  562          mutex_exit(&connp->conn_lock);
 563  563  
                   /* The fanout insert is done with conn_lock released. */
 564  564          error = ip_laddr_fanout_insert(connp);
 565  565          if (error != 0)
 566  566                  goto late_error;
 567  567  
 568  568          /* Bind succeeded */
 569  569          return (0);
 570  570  
 571  571  late_error:
                   /*
                    * Entered without conn_lock held.  Undo the address, scope id,
                    * port and state changes made above and return the saved error.
                    */
 572  572          mutex_enter(&connp->conn_lock);
 573  573          connp->conn_saddr_v6 = ipv6_all_zeros;
 574  574          connp->conn_bound_addr_v6 = ipv6_all_zeros;
 575  575          connp->conn_laddr_v6 = ipv6_all_zeros;
 576  576          if (scopeid != 0) {
 577  577                  connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 578  578                  connp->conn_incoming_ifindex = connp->conn_bound_if;
 579  579          }
 580  580          icmp->icmp_state = TS_UNBND;
 581  581          connp->conn_v6lastdst = ipv6_all_zeros;
 582  582          connp->conn_lport = 0;
 583  583  
 584  584          /* Restore the header that was built above - different source address */
                   /* A failure of this rebuild is ignored; error keeps the first cause. */
 585  585          (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 586  586              &connp->conn_faddr_v6, connp->conn_flowinfo);
 587  587          mutex_exit(&connp->conn_lock);
 588  588          return (error);
 589  589  }
 590  590  
 591  591  /*
 592  592   * Tell IP to just bind to the protocol.
 593  593   */
 594  594  static void
 595  595  icmp_bind_proto(icmp_t *icmp)
 596  596  {
 597  597          conn_t  *connp = icmp->icmp_connp;
 598  598  
 599  599          mutex_enter(&connp->conn_lock);
 600  600          connp->conn_saddr_v6 = ipv6_all_zeros;
 601  601          connp->conn_laddr_v6 = ipv6_all_zeros;
 602  602          connp->conn_faddr_v6 = ipv6_all_zeros;
 603  603          connp->conn_v6lastdst = ipv6_all_zeros;
 604  604          mutex_exit(&connp->conn_lock);
 605  605  
 606  606          (void) ip_laddr_fanout_insert(connp);
 607  607  }
 608  608  
 609  609  /*
 610  610   * This routine handles each T_CONN_REQ message passed to icmp.  It
 611  611   * associates a default destination address with the stream.
 612  612   *
 613  613   * After various error checks are completed, icmp_connect() lays
 614  614   * the target address and port into the composite header template.
 615  615   * Then we ask IP for information, including a source address if we didn't
 616  616   * already have one. Finally we send up the T_OK_ACK reply message.
 617  617   */
 618  618  static void
 619  619  icmp_tpi_connect(queue_t *q, mblk_t *mp)
 620  620  {
 621  621          conn_t  *connp = Q_TO_CONN(q);
 622  622          struct T_conn_req       *tcr;
 623  623          struct sockaddr *sa;
 624  624          socklen_t len;
 625  625          int error;
 626  626          cred_t *cr;
 627  627          pid_t pid;
 628  628          /*
 629  629           * All Solaris components should pass a db_credp
 630  630           * for this TPI message, hence we ASSERT.
 631  631           * But in case there is some other M_PROTO that looks
 632  632           * like a TPI message sent by some other kernel
 633  633           * component, we check and return an error.
 634  634           */
 635  635          cr = msg_getcred(mp, &pid);
 636  636          ASSERT(cr != NULL);
 637  637          if (cr == NULL) {
 638  638                  icmp_err_ack(q, mp, TSYSERR, EINVAL);
 639  639                  return;
 640  640          }
 641  641  
 642  642          tcr = (struct T_conn_req *)mp->b_rptr;
 643  643          /* Sanity checks */
 644  644          if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
 645  645                  icmp_err_ack(q, mp, TPROTO, 0);
 646  646                  return;
 647  647          }
 648  648  
 649  649          if (tcr->OPT_length != 0) {
 650  650                  icmp_err_ack(q, mp, TBADOPT, 0);
 651  651                  return;
 652  652          }
 653  653  
 654  654          len = tcr->DEST_length;
 655  655  
 656  656          switch (len) {
 657  657          default:
 658  658                  icmp_err_ack(q, mp, TBADADDR, 0);
 659  659                  return;
 660  660          case sizeof (sin_t):
 661  661                  sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
 662  662                      sizeof (sin_t));
 663  663                  break;
 664  664          case sizeof (sin6_t):
 665  665                  sa = (struct sockaddr *)mi_offset_param(mp,
 666  666                      tcr->DEST_offset, sizeof (sin6_t));
 667  667                  break;
 668  668          }
 669  669  
 670  670          error = proto_verify_ip_addr(connp->conn_family, sa, len);
 671  671          if (error != 0) {
 672  672                  icmp_err_ack(q, mp, TSYSERR, error);
 673  673                  return;
 674  674          }
 675  675  
 676  676          error = rawip_do_connect(connp, sa, len, cr, pid);
 677  677          if (error != 0) {
 678  678                  if (error < 0) {
 679  679                          icmp_err_ack(q, mp, -error, 0);
 680  680                  } else {
 681  681                          icmp_err_ack(q, mp, 0, error);
 682  682                  }
 683  683          } else {
 684  684                  mblk_t *mp1;
 685  685  
 686  686                  /*
 687  687                   * We have to send a connection confirmation to
 688  688                   * keep TLI happy.
 689  689                   */
 690  690                  if (connp->conn_family == AF_INET) {
 691  691                          mp1 = mi_tpi_conn_con(NULL, (char *)sa,
 692  692                              sizeof (sin_t), NULL, 0);
 693  693                  } else {
 694  694                          ASSERT(connp->conn_family == AF_INET6);
 695  695                          mp1 = mi_tpi_conn_con(NULL, (char *)sa,
 696  696                              sizeof (sin6_t), NULL, 0);
 697  697                  }
 698  698                  if (mp1 == NULL) {
 699  699                          icmp_err_ack(q, mp, TSYSERR, ENOMEM);
 700  700                          return;
 701  701                  }
 702  702  
 703  703                  /*
 704  704                   * Send ok_ack for T_CONN_REQ
 705  705                   */
 706  706                  mp = mi_tpi_ok_ack_alloc(mp);
 707  707                  if (mp == NULL) {
 708  708                          /* Unable to reuse the T_CONN_REQ for the ack. */
 709  709                          icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
 710  710                          return;
 711  711                  }
 712  712                  putnext(connp->conn_rq, mp);
 713  713                  putnext(connp->conn_rq, mp1);
 714  714          }
 715  715  }
 716  716  
 717  717  static int
 718  718  rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
 719  719      cred_t *cr, pid_t pid)
 720  720  {
 721  721          icmp_t          *icmp;
 722  722          sin_t           *sin;
 723  723          sin6_t          *sin6;
 724  724          int             error;
 725  725          uint16_t        dstport;
 726  726          ipaddr_t        v4dst;
 727  727          in6_addr_t      v6dst;
 728  728          uint32_t        flowinfo;
 729  729          ip_xmit_attr_t  *ixa;
 730  730          ip_xmit_attr_t  *oldixa;
 731  731          uint_t          scopeid = 0;
 732  732          uint_t          srcid = 0;
 733  733          in6_addr_t      v6src = connp->conn_saddr_v6;
 734  734  
 735  735          icmp = connp->conn_icmp;
 736  736  
 737  737          if (sa == NULL || !OK_32PTR((char *)sa)) {
 738  738                  return (EINVAL);
 739  739          }
 740  740  
 741  741          ASSERT(sa != NULL && len != 0);
 742  742  
 743  743          /*
 744  744           * Determine packet type based on type of address passed in
 745  745           * the request should contain an IPv4 or IPv6 address.
 746  746           * Make sure that address family matches the type of
 747  747           * family of the address passed down.
 748  748           */
 749  749          switch (len) {
 750  750          case sizeof (sin_t):
 751  751                  sin = (sin_t *)sa;
 752  752  
 753  753                  v4dst = sin->sin_addr.s_addr;
 754  754                  dstport = sin->sin_port;
 755  755                  IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 756  756                  ASSERT(connp->conn_ipversion == IPV4_VERSION);
 757  757                  break;
 758  758  
 759  759          case sizeof (sin6_t):
 760  760                  sin6 = (sin6_t *)sa;
 761  761  
 762  762                  /* No support for mapped addresses on raw sockets */
 763  763                  if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 764  764                          return (EADDRNOTAVAIL);
 765  765                  }
 766  766                  v6dst = sin6->sin6_addr;
 767  767                  dstport = sin6->sin6_port;
 768  768                  ASSERT(connp->conn_ipversion == IPV6_VERSION);
 769  769                  flowinfo = sin6->sin6_flowinfo;
 770  770                  if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
 771  771                          scopeid = sin6->sin6_scope_id;
 772  772                  srcid = sin6->__sin6_src_id;
 773  773                  if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
 774  774                          /* Due to check above, we know sin6_addr is v6-only. */
 775  775                          if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
 776  776                              B_FALSE, connp->conn_netstack)) {
 777  777                                  /* Mismatch - v6src would be v4mapped. */
 778  778                                  return (EADDRNOTAVAIL);
 779  779                          }
 780  780                  }
 781  781                  break;
 782  782          }
 783  783  
 784  784          /*
 785  785           * If there is a different thread using conn_ixa then we get a new
 786  786           * copy and cut the old one loose from conn_ixa. Otherwise we use
 787  787           * conn_ixa and prevent any other thread from using/changing it.
 788  788           * Once connect() is done other threads can use conn_ixa since the
 789  789           * refcnt will be back at one.
 790  790           * We defer updating conn_ixa until later to handle any concurrent
 791  791           * conn_ixa_cleanup thread.
 792  792           */
 793  793          ixa = conn_get_ixa(connp, B_FALSE);
 794  794          if (ixa == NULL)
 795  795                  return (ENOMEM);
 796  796  
 797  797          mutex_enter(&connp->conn_lock);
 798  798          /*
 799  799           * This icmp_t must have bound already before doing a connect.
 800  800           * Reject if a connect is in progress (we drop conn_lock during
 801  801           * rawip_do_connect).
 802  802           */
 803  803          if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
 804  804                  mutex_exit(&connp->conn_lock);
 805  805                  ixa_refrele(ixa);
 806  806                  return (-TOUTSTATE);
 807  807          }
 808  808  
 809  809          if (icmp->icmp_state == TS_DATA_XFER) {
 810  810                  /* Already connected - clear out state */
 811  811                  if (connp->conn_mcbc_bind)
 812  812                          connp->conn_saddr_v6 = ipv6_all_zeros;
 813  813                  else
 814  814                          connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
 815  815                  connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
 816  816                  connp->conn_faddr_v6 = ipv6_all_zeros;
 817  817                  icmp->icmp_state = TS_IDLE;
 818  818          }
 819  819  
 820  820          /*
 821  821           * Use sin_port/sin6_port since applications like psh use SOCK_RAW
 822  822           * with IPPROTO_TCP.
 823  823           */
 824  824          connp->conn_fport = dstport;
 825  825          if (connp->conn_ipversion == IPV4_VERSION) {
 826  826                  /*
 827  827                   * Interpret a zero destination to mean loopback.
 828  828                   * Update the T_CONN_REQ (sin/sin6) since it is used to
 829  829                   * generate the T_CONN_CON.
 830  830                   */
 831  831                  if (v4dst == INADDR_ANY) {
 832  832                          v4dst = htonl(INADDR_LOOPBACK);
 833  833                          IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 834  834                          ASSERT(connp->conn_family == AF_INET);
 835  835                          sin->sin_addr.s_addr = v4dst;
 836  836                  }
 837  837                  connp->conn_faddr_v6 = v6dst;
 838  838                  connp->conn_flowinfo = 0;
 839  839          } else {
 840  840                  ASSERT(connp->conn_ipversion == IPV6_VERSION);
 841  841                  /*
 842  842                   * Interpret a zero destination to mean loopback.
 843  843                   * Update the T_CONN_REQ (sin/sin6) since it is used to
 844  844                   * generate the T_CONN_CON.
 845  845                   */
 846  846                  if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
 847  847                          v6dst = ipv6_loopback;
 848  848                          sin6->sin6_addr = v6dst;
 849  849                  }
 850  850                  connp->conn_faddr_v6 = v6dst;
 851  851                  connp->conn_flowinfo = flowinfo;
 852  852          }
 853  853  
 854  854          /*
 855  855           * We update our cred/cpid based on the caller of connect
 856  856           */
 857  857          if (connp->conn_cred != cr) {
 858  858                  crhold(cr);
 859  859                  crfree(connp->conn_cred);
 860  860                  connp->conn_cred = cr;
 861  861          }
 862  862          connp->conn_cpid = pid;
 863  863          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
 864  864          ixa->ixa_cred = cr;
 865  865          ixa->ixa_cpid = pid;
 866  866          if (is_system_labeled()) {
 867  867                  /* We need to restart with a label based on the cred */
 868  868                  ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
 869  869          }
 870  870  
 871  871          if (scopeid != 0) {
 872  872                  ixa->ixa_flags |= IXAF_SCOPEID_SET;
 873  873                  ixa->ixa_scopeid = scopeid;
 874  874                  connp->conn_incoming_ifindex = scopeid;
 875  875          } else {
 876  876                  ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 877  877                  connp->conn_incoming_ifindex = connp->conn_bound_if;
 878  878          }
 879  879  
 880  880          /*
 881  881           * conn_connect will drop conn_lock and reacquire it.
 882  882           * To prevent a send* from messing with this icmp_t while the lock
 883  883           * is dropped we set icmp_state and clear conn_v6lastdst.
 884  884           * That will make all send* fail with EISCONN.
 885  885           */
 886  886          connp->conn_v6lastdst = ipv6_all_zeros;
 887  887          icmp->icmp_state = TS_WCON_CREQ;
 888  888  
 889  889          error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
 890  890          mutex_exit(&connp->conn_lock);
 891  891          if (error != 0)
 892  892                  goto connect_failed;
 893  893  
 894  894          /*
 895  895           * The addresses have been verified. Time to insert in
 896  896           * the correct fanout list.
 897  897           */
 898  898          error = ipcl_conn_insert(connp);
 899  899          if (error != 0)
 900  900                  goto connect_failed;
 901  901  
 902  902          mutex_enter(&connp->conn_lock);
 903  903          error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 904  904              &connp->conn_faddr_v6, connp->conn_flowinfo);
 905  905          if (error != 0) {
 906  906                  mutex_exit(&connp->conn_lock);
 907  907                  goto connect_failed;
 908  908          }
 909  909  
 910  910          icmp->icmp_state = TS_DATA_XFER;
 911  911          /* Record this as the "last" send even though we haven't sent any */
 912  912          connp->conn_v6lastdst = connp->conn_faddr_v6;
 913  913          connp->conn_lastipversion = connp->conn_ipversion;
 914  914          connp->conn_lastdstport = connp->conn_fport;
 915  915          connp->conn_lastflowinfo = connp->conn_flowinfo;
 916  916          connp->conn_lastscopeid = scopeid;
 917  917          connp->conn_lastsrcid = srcid;
 918  918          /* Also remember a source to use together with lastdst */
 919  919          connp->conn_v6lastsrc = v6src;
 920  920  
 921  921          oldixa = conn_replace_ixa(connp, ixa);
 922  922          mutex_exit(&connp->conn_lock);
 923  923          ixa_refrele(oldixa);
 924  924  
 925  925          ixa_refrele(ixa);
 926  926          return (0);
 927  927  
 928  928  connect_failed:
 929  929          if (ixa != NULL)
 930  930                  ixa_refrele(ixa);
 931  931          mutex_enter(&connp->conn_lock);
 932  932          icmp->icmp_state = TS_IDLE;
 933  933          /* In case the source address was set above */
 934  934          if (connp->conn_mcbc_bind)
 935  935                  connp->conn_saddr_v6 = ipv6_all_zeros;
 936  936          else
 937  937                  connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
 938  938          connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
 939  939          connp->conn_faddr_v6 = ipv6_all_zeros;
 940  940          connp->conn_v6lastdst = ipv6_all_zeros;
 941  941          connp->conn_flowinfo = 0;
 942  942  
 943  943          (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 944  944              &connp->conn_faddr_v6, connp->conn_flowinfo);
 945  945          mutex_exit(&connp->conn_lock);
 946  946          return (error);
 947  947  }
 948  948  
 949  949  static void
 950  950  rawip_do_close(conn_t *connp)
 951  951  {
 952  952          ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
 953  953  
 954  954          ip_quiesce_conn(connp);
 955  955  
 956  956          if (!IPCL_IS_NONSTR(connp)) {
 957  957                  qprocsoff(connp->conn_rq);
 958  958          }
 959  959  
 960  960          icmp_close_free(connp);
 961  961  
 962  962          /*
 963  963           * Now we are truly single threaded on this stream, and can
 964  964           * delete the things hanging off the connp, and finally the connp.
 965  965           * We removed this connp from the fanout list, it cannot be
 966  966           * accessed thru the fanouts, and we already waited for the
 967  967           * conn_ref to drop to 0. We are already in close, so
 968  968           * there cannot be any other thread from the top. qprocsoff
 969  969           * has completed, and service has completed or won't run in
 970  970           * future.
 971  971           */
 972  972          ASSERT(connp->conn_ref == 1);
 973  973  
 974  974          if (!IPCL_IS_NONSTR(connp)) {
 975  975                  inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
 976  976          } else {
 977  977                  ip_free_helper_stream(connp);
 978  978          }
 979  979  
 980  980          connp->conn_ref--;
 981  981          ipcl_conn_destroy(connp);
 982  982  }
 983  983  
 984  984  static int
 985  985  icmp_close(queue_t *q, int flags)
 986  986  {
 987  987          conn_t  *connp;
 988  988  
 989  989          if (flags & SO_FALLBACK) {
 990  990                  /*
 991  991                   * stream is being closed while in fallback
 992  992                   * simply free the resources that were allocated
 993  993                   */
 994  994                  inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
 995  995                  qprocsoff(q);
 996  996                  goto done;
 997  997          }
 998  998  
 999  999          connp = Q_TO_CONN(q);
1000 1000          (void) rawip_do_close(connp);
1001 1001  done:
1002 1002          q->q_ptr = WR(q)->q_ptr = NULL;
1003 1003          return (0);
1004 1004  }
1005 1005  
1006 1006  static void
1007 1007  icmp_close_free(conn_t *connp)
1008 1008  {
1009 1009          icmp_t *icmp = connp->conn_icmp;
1010 1010  
1011 1011          if (icmp->icmp_filter != NULL) {
1012 1012                  kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
1013 1013                  icmp->icmp_filter = NULL;
1014 1014          }
1015 1015  
1016 1016          if (icmp->icmp_bpf_len != 0) {
1017 1017                  kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
1018 1018                  icmp->icmp_bpf_len = 0;
1019 1019                  icmp->icmp_bpf_prog = NULL;
1020 1020          }
1021 1021  
1022 1022          /*
1023 1023           * Clear any fields which the kmem_cache constructor clears.
1024 1024           * Only icmp_connp needs to be preserved.
1025 1025           * TBD: We should make this more efficient to avoid clearing
1026 1026           * everything.
1027 1027           */
1028 1028          ASSERT(icmp->icmp_connp == connp);
1029 1029          bzero(icmp, sizeof (icmp_t));
1030 1030          icmp->icmp_connp = connp;
1031 1031  }
1032 1032  
1033 1033  /*
1034 1034   * This routine handles each T_DISCON_REQ message passed to icmp
1035 1035   * as an indicating that ICMP is no longer connected. This results
1036 1036   * in telling IP to restore the binding to just the local address.
1037 1037   */
1038 1038  static int
1039 1039  icmp_do_disconnect(conn_t *connp)
1040 1040  {
1041 1041          icmp_t  *icmp = connp->conn_icmp;
1042 1042          int     error;
1043 1043  
1044 1044          mutex_enter(&connp->conn_lock);
1045 1045          if (icmp->icmp_state != TS_DATA_XFER) {
1046 1046                  mutex_exit(&connp->conn_lock);
1047 1047                  return (-TOUTSTATE);
1048 1048          }
1049 1049          if (connp->conn_mcbc_bind)
1050 1050                  connp->conn_saddr_v6 = ipv6_all_zeros;
1051 1051          else
1052 1052                  connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
1053 1053          connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
1054 1054          connp->conn_faddr_v6 = ipv6_all_zeros;
1055 1055          icmp->icmp_state = TS_IDLE;
1056 1056  
1057 1057          connp->conn_v6lastdst = ipv6_all_zeros;
1058 1058          error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
1059 1059              &connp->conn_faddr_v6, connp->conn_flowinfo);
1060 1060          mutex_exit(&connp->conn_lock);
1061 1061          if (error != 0)
1062 1062                  return (error);
1063 1063  
1064 1064          /*
1065 1065           * Tell IP to remove the full binding and revert
1066 1066           * to the local address binding.
1067 1067           */
1068 1068          return (ip_laddr_fanout_insert(connp));
1069 1069  }
1070 1070  
1071 1071  static void
1072 1072  icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
1073 1073  {
1074 1074          conn_t  *connp = Q_TO_CONN(q);
1075 1075          int     error;
1076 1076  
1077 1077          /*
1078 1078           * Allocate the largest primitive we need to send back
1079 1079           * T_error_ack is > than T_ok_ack
1080 1080           */
1081 1081          mp = reallocb(mp, sizeof (struct T_error_ack), 1);
1082 1082          if (mp == NULL) {
1083 1083                  /* Unable to reuse the T_DISCON_REQ for the ack. */
1084 1084                  icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1085 1085                  return;
1086 1086          }
1087 1087  
1088 1088          error = icmp_do_disconnect(connp);
1089 1089  
1090 1090          if (error != 0) {
1091 1091                  if (error > 0) {
1092 1092                          icmp_err_ack(q, mp, 0, error);
1093 1093                  } else {
1094 1094                          icmp_err_ack(q, mp, -error, 0);
1095 1095                  }
1096 1096          } else {
1097 1097                  mp = mi_tpi_ok_ack_alloc(mp);
1098 1098                  ASSERT(mp != NULL);
1099 1099                  qreply(q, mp);
1100 1100          }
1101 1101  }
1102 1102  
1103 1103  static int
1104 1104  icmp_disconnect(conn_t *connp)
1105 1105  {
1106 1106          int     error;
1107 1107  
1108 1108          connp->conn_dgram_errind = B_FALSE;
1109 1109  
1110 1110          error = icmp_do_disconnect(connp);
1111 1111  
1112 1112          if (error < 0)
1113 1113                  error = proto_tlitosyserr(-error);
1114 1114          return (error);
1115 1115  }
1116 1116  
1117 1117  /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1118 1118  static void
1119 1119  icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1120 1120  {
1121 1121          if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1122 1122                  qreply(q, mp);
1123 1123  }
1124 1124  
1125 1125  /* Shorthand to generate and send TPI error acks to our client */
1126 1126  static void
1127 1127  icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1128 1128      t_scalar_t t_error, int sys_error)
1129 1129  {
1130 1130          struct T_error_ack      *teackp;
1131 1131  
1132 1132          if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1133 1133              M_PCPROTO, T_ERROR_ACK)) != NULL) {
1134 1134                  teackp = (struct T_error_ack *)mp->b_rptr;
1135 1135                  teackp->ERROR_prim = primitive;
1136 1136                  teackp->TLI_error = t_error;
1137 1137                  teackp->UNIX_error = sys_error;
1138 1138                  qreply(q, mp);
1139 1139          }
1140 1140  }
1141 1141  
1142 1142  /*
1143 1143   * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
1144 1144   * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1145 1145   * Assumes that IP has pulled up everything up to and including the ICMP header.
1146 1146   */
1147 1147  /* ARGSUSED2 */
1148 1148  static void
1149 1149  icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
1150 1150  {
1151 1151          conn_t          *connp = (conn_t *)arg1;
1152 1152          icmp_t          *icmp = connp->conn_icmp;
1153 1153          icmph_t         *icmph;
1154 1154          ipha_t          *ipha;
1155 1155          int             iph_hdr_length;
1156 1156          sin_t           sin;
1157 1157          mblk_t          *mp1;
1158 1158          int             error = 0;
1159 1159  
1160 1160          ipha = (ipha_t *)mp->b_rptr;
1161 1161  
1162 1162          ASSERT(OK_32PTR(mp->b_rptr));
1163 1163  
1164 1164          if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1165 1165                  ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1166 1166                  icmp_icmp_error_ipv6(connp, mp, ira);
1167 1167                  return;
1168 1168          }
1169 1169          ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
1170 1170  
1171 1171          /* Skip past the outer IP and ICMP headers */
1172 1172          ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
1173 1173          iph_hdr_length = ira->ira_ip_hdr_length;
1174 1174          icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1175 1175          ipha = (ipha_t *)&icmph[1];     /* Inner IP header */
1176 1176  
1177 1177          iph_hdr_length = IPH_HDR_LENGTH(ipha);
1178 1178  
1179 1179          switch (icmph->icmph_type) {
1180 1180          case ICMP_DEST_UNREACHABLE:
1181 1181                  switch (icmph->icmph_code) {
1182 1182                  case ICMP_FRAGMENTATION_NEEDED: {
1183 1183                          ipha_t          *ipha;
1184 1184                          ip_xmit_attr_t  *ixa;
1185 1185                          /*
1186 1186                           * IP has already adjusted the path MTU.
1187 1187                           * But we need to adjust DF for IPv4.
1188 1188                           */
1189 1189                          if (connp->conn_ipversion != IPV4_VERSION)
1190 1190                                  break;
1191 1191  
1192 1192                          ixa = conn_get_ixa(connp, B_FALSE);
1193 1193                          if (ixa == NULL || ixa->ixa_ire == NULL) {
1194 1194                                  /*
1195 1195                                   * Some other thread holds conn_ixa. We will
1196 1196                                   * redo this on the next ICMP too big.
1197 1197                                   */
1198 1198                                  if (ixa != NULL)
1199 1199                                          ixa_refrele(ixa);
1200 1200                                  break;
1201 1201                          }
1202 1202                          (void) ip_get_pmtu(ixa);
1203 1203  
1204 1204                          mutex_enter(&connp->conn_lock);
1205 1205                          ipha = (ipha_t *)connp->conn_ht_iphc;
1206 1206                          if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
1207 1207                                  ipha->ipha_fragment_offset_and_flags |=
1208 1208                                      IPH_DF_HTONS;
1209 1209                          } else {
1210 1210                                  ipha->ipha_fragment_offset_and_flags &=
1211 1211                                      ~IPH_DF_HTONS;
1212 1212                          }
1213 1213                          mutex_exit(&connp->conn_lock);
1214 1214                          ixa_refrele(ixa);
1215 1215                          break;
1216 1216                  }
1217 1217                  case ICMP_PORT_UNREACHABLE:
1218 1218                  case ICMP_PROTOCOL_UNREACHABLE:
1219 1219                          error = ECONNREFUSED;
1220 1220                          break;
1221 1221                  default:
1222 1222                          /* Transient errors */
1223 1223                          break;
1224 1224                  }
1225 1225                  break;
1226 1226          default:
1227 1227                  /* Transient errors */
1228 1228                  break;
1229 1229          }
1230 1230          if (error == 0) {
1231 1231                  freemsg(mp);
1232 1232                  return;
1233 1233          }
1234 1234  
1235 1235          /*
1236 1236           * Deliver T_UDERROR_IND when the application has asked for it.
1237 1237           * The socket layer enables this automatically when connected.
1238 1238           */
1239 1239          if (!connp->conn_dgram_errind) {
1240 1240                  freemsg(mp);
1241 1241                  return;
1242 1242          }
1243 1243  
1244 1244          sin = sin_null;
1245 1245          sin.sin_family = AF_INET;
1246 1246          sin.sin_addr.s_addr = ipha->ipha_dst;
1247 1247  
1248 1248          if (IPCL_IS_NONSTR(connp)) {
1249 1249                  mutex_enter(&connp->conn_lock);
1250 1250                  if (icmp->icmp_state == TS_DATA_XFER) {
1251 1251                          if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
1252 1252                                  mutex_exit(&connp->conn_lock);
1253 1253                                  (*connp->conn_upcalls->su_set_error)
1254 1254                                      (connp->conn_upper_handle, error);
1255 1255                                  goto done;
1256 1256                          }
1257 1257                  } else {
1258 1258                          icmp->icmp_delayed_error = error;
1259 1259                          *((sin_t *)&icmp->icmp_delayed_addr) = sin;
1260 1260                  }
1261 1261                  mutex_exit(&connp->conn_lock);
1262 1262          } else {
1263 1263                  mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
1264 1264                      error);
1265 1265                  if (mp1 != NULL)
1266 1266                          putnext(connp->conn_rq, mp1);
1267 1267          }
1268 1268  done:
1269 1269          freemsg(mp);
1270 1270  }
1271 1271  
1272 1272  /*
1273 1273   * icmp_icmp_error_ipv6 is called by icmp_icmp_input to process ICMP for IPv6.
1274 1274   * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1275 1275   * Assumes that IP has pulled up all the extension headers as well as the
1276 1276   * ICMPv6 header.
1277 1277   */
1278 1278  static void
1279 1279  icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
1280 1280  {
1281 1281          icmp6_t         *icmp6;
1282 1282          ip6_t           *ip6h, *outer_ip6h;
1283 1283          uint16_t        iph_hdr_length;
1284 1284          uint8_t         *nexthdrp;
1285 1285          sin6_t          sin6;
1286 1286          mblk_t          *mp1;
1287 1287          int             error = 0;
1288 1288          icmp_t          *icmp = connp->conn_icmp;
1289 1289  
1290 1290          outer_ip6h = (ip6_t *)mp->b_rptr;
1291 1291  #ifdef DEBUG
1292 1292          if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1293 1293                  iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1294 1294          else
1295 1295                  iph_hdr_length = IPV6_HDR_LEN;
1296 1296          ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
1297 1297  #endif
1298 1298          /* Skip past the outer IP and ICMP headers */
1299 1299          iph_hdr_length = ira->ira_ip_hdr_length;
1300 1300          icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1301 1301  
1302 1302          ip6h = (ip6_t *)&icmp6[1];      /* Inner IP header */
1303 1303          if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1304 1304                  freemsg(mp);
1305 1305                  return;
1306 1306          }
1307 1307  
1308 1308          switch (icmp6->icmp6_type) {
1309 1309          case ICMP6_DST_UNREACH:
1310 1310                  switch (icmp6->icmp6_code) {
1311 1311                  case ICMP6_DST_UNREACH_NOPORT:
1312 1312                          error = ECONNREFUSED;
1313 1313                          break;
1314 1314                  case ICMP6_DST_UNREACH_ADMIN:
1315 1315                  case ICMP6_DST_UNREACH_NOROUTE:
1316 1316                  case ICMP6_DST_UNREACH_BEYONDSCOPE:
1317 1317                  case ICMP6_DST_UNREACH_ADDR:
1318 1318                          /* Transient errors */
1319 1319                          break;
1320 1320                  default:
1321 1321                          break;
1322 1322                  }
1323 1323                  break;
1324 1324          case ICMP6_PACKET_TOO_BIG: {
1325 1325                  struct T_unitdata_ind   *tudi;
1326 1326                  struct T_opthdr         *toh;
1327 1327                  size_t                  udi_size;
1328 1328                  mblk_t                  *newmp;
1329 1329                  t_scalar_t              opt_length = sizeof (struct T_opthdr) +
1330 1330                      sizeof (struct ip6_mtuinfo);
1331 1331                  sin6_t                  *sin6;
1332 1332                  struct ip6_mtuinfo      *mtuinfo;
1333 1333  
1334 1334                  /*
1335 1335                   * If the application has requested to receive path mtu
1336 1336                   * information, send up an empty message containing an
1337 1337                   * IPV6_PATHMTU ancillary data item.
1338 1338                   */
1339 1339                  if (!connp->conn_ipv6_recvpathmtu)
1340 1340                          break;
1341 1341  
1342 1342                  udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1343 1343                      opt_length;
1344 1344                  if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1345 1345                          BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1346 1346                          break;
1347 1347                  }
1348 1348  
1349 1349                  /*
1350 1350                   * newmp->b_cont is left to NULL on purpose.  This is an
1351 1351                   * empty message containing only ancillary data.
1352 1352                   */
1353 1353                  newmp->b_datap->db_type = M_PROTO;
1354 1354                  tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1355 1355                  newmp->b_wptr = (uchar_t *)tudi + udi_size;
1356 1356                  tudi->PRIM_type = T_UNITDATA_IND;
1357 1357                  tudi->SRC_length = sizeof (sin6_t);
1358 1358                  tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1359 1359                  tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1360 1360                  tudi->OPT_length = opt_length;
1361 1361  
1362 1362                  sin6 = (sin6_t *)&tudi[1];
1363 1363                  bzero(sin6, sizeof (sin6_t));
1364 1364                  sin6->sin6_family = AF_INET6;
1365 1365                  sin6->sin6_addr = connp->conn_faddr_v6;
1366 1366  
1367 1367                  toh = (struct T_opthdr *)&sin6[1];
1368 1368                  toh->level = IPPROTO_IPV6;
1369 1369                  toh->name = IPV6_PATHMTU;
1370 1370                  toh->len = opt_length;
1371 1371                  toh->status = 0;
1372 1372  
1373 1373                  mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1374 1374                  bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1375 1375                  mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1376 1376                  mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1377 1377                  mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1378 1378                  /*
1379 1379                   * We've consumed everything we need from the original
1380 1380                   * message.  Free it, then send our empty message.
1381 1381                   */
1382 1382                  freemsg(mp);
1383 1383                  icmp_ulp_recv(connp, newmp, msgdsize(newmp));
1384 1384                  return;
1385 1385          }
1386 1386          case ICMP6_TIME_EXCEEDED:
1387 1387                  /* Transient errors */
1388 1388                  break;
1389 1389          case ICMP6_PARAM_PROB:
1390 1390                  /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1391 1391                  if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1392 1392                      (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1393 1393                      (uchar_t *)nexthdrp) {
1394 1394                          error = ECONNREFUSED;
1395 1395                          break;
1396 1396                  }
1397 1397                  break;
1398 1398          }
1399 1399          if (error == 0) {
1400 1400                  freemsg(mp);
1401 1401                  return;
1402 1402          }
1403 1403  
1404 1404          /*
1405 1405           * Deliver T_UDERROR_IND when the application has asked for it.
1406 1406           * The socket layer enables this automatically when connected.
1407 1407           */
1408 1408          if (!connp->conn_dgram_errind) {
1409 1409                  freemsg(mp);
1410 1410                  return;
1411 1411          }
1412 1412  
1413 1413          sin6 = sin6_null;
1414 1414          sin6.sin6_family = AF_INET6;
1415 1415          sin6.sin6_addr = ip6h->ip6_dst;
1416 1416          sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1417 1417          if (IPCL_IS_NONSTR(connp)) {
1418 1418                  mutex_enter(&connp->conn_lock);
1419 1419                  if (icmp->icmp_state == TS_DATA_XFER) {
1420 1420                          if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1421 1421                              &connp->conn_faddr_v6)) {
1422 1422                                  mutex_exit(&connp->conn_lock);
1423 1423                                  (*connp->conn_upcalls->su_set_error)
1424 1424                                      (connp->conn_upper_handle, error);
1425 1425                                  goto done;
1426 1426                          }
1427 1427                  } else {
1428 1428                          icmp->icmp_delayed_error = error;
1429 1429                          *((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1430 1430                  }
1431 1431                  mutex_exit(&connp->conn_lock);
1432 1432          } else {
1433 1433                  mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1434 1434                      NULL, 0, error);
1435 1435                  if (mp1 != NULL)
1436 1436                          putnext(connp->conn_rq, mp1);
1437 1437          }
1438 1438  done:
1439 1439          freemsg(mp);
1440 1440  }
1441 1441  
1442 1442  /*
1443 1443   * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1444 1444   * The local address is filled in if endpoint is bound. The remote address
1445 1445   * is filled in if remote address has been precified ("connected endpoint")
1446 1446   * (The concept of connected CLTS sockets is alien to published TPI
1447 1447   *  but we support it anyway).
1448 1448   */
1449 1449  static void
1450 1450  icmp_addr_req(queue_t *q, mblk_t *mp)
1451 1451  {
1452 1452          struct sockaddr *sa;
1453 1453          mblk_t  *ackmp;
1454 1454          struct T_addr_ack *taa;
1455 1455          icmp_t  *icmp = Q_TO_ICMP(q);
1456 1456          conn_t  *connp = icmp->icmp_connp;
1457 1457          uint_t  addrlen;
1458 1458  
1459 1459          /* Make it large enough for worst case */
1460 1460          ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1461 1461              2 * sizeof (sin6_t), 1);
1462 1462          if (ackmp == NULL) {
1463 1463                  icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1464 1464                  return;
1465 1465          }
1466 1466          taa = (struct T_addr_ack *)ackmp->b_rptr;
1467 1467  
1468 1468          bzero(taa, sizeof (struct T_addr_ack));
1469 1469          ackmp->b_wptr = (uchar_t *)&taa[1];
1470 1470  
1471 1471          taa->PRIM_type = T_ADDR_ACK;
1472 1472          ackmp->b_datap->db_type = M_PCPROTO;
1473 1473  
1474 1474          if (connp->conn_family == AF_INET)
1475 1475                  addrlen = sizeof (sin_t);
1476 1476          else
1477 1477                  addrlen = sizeof (sin6_t);
1478 1478  
1479 1479          mutex_enter(&connp->conn_lock);
1480 1480          /*
1481 1481           * Note: Following code assumes 32 bit alignment of basic
1482 1482           * data structures like sin_t and struct T_addr_ack.
1483 1483           */
1484 1484          if (icmp->icmp_state != TS_UNBND) {
1485 1485                  /*
1486 1486                   * Fill in local address first
1487 1487                   */
1488 1488                  taa->LOCADDR_offset = sizeof (*taa);
1489 1489                  taa->LOCADDR_length = addrlen;
1490 1490                  sa = (struct sockaddr *)&taa[1];
1491 1491                  (void) conn_getsockname(connp, sa, &addrlen);
1492 1492                  ackmp->b_wptr += addrlen;
1493 1493          }
1494 1494          if (icmp->icmp_state == TS_DATA_XFER) {
1495 1495                  /*
1496 1496                   * connected, fill remote address too
1497 1497                   */
1498 1498                  taa->REMADDR_length = addrlen;
1499 1499                  /* assumed 32-bit alignment */
1500 1500                  taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1501 1501                  sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1502 1502                  (void) conn_getpeername(connp, sa, &addrlen);
1503 1503                  ackmp->b_wptr += addrlen;
1504 1504          }
1505 1505          mutex_exit(&connp->conn_lock);
1506 1506          ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1507 1507          qreply(q, ackmp);
1508 1508  }
1509 1509  
1510 1510  static void
1511 1511  icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1512 1512  {
1513 1513          conn_t          *connp = icmp->icmp_connp;
1514 1514  
1515 1515          *tap = icmp_g_t_info_ack;
1516 1516  
1517 1517          if (connp->conn_family == AF_INET6)
1518 1518                  tap->ADDR_size = sizeof (sin6_t);
1519 1519          else
1520 1520                  tap->ADDR_size = sizeof (sin_t);
1521 1521          tap->CURRENT_state = icmp->icmp_state;
1522 1522          tap->OPT_size = icmp_max_optsize;
1523 1523  }
1524 1524  
1525 1525  static void
1526 1526  icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1527 1527      t_uscalar_t cap_bits1)
1528 1528  {
1529 1529          tcap->CAP_bits1 = 0;
1530 1530  
1531 1531          if (cap_bits1 & TC1_INFO) {
1532 1532                  icmp_copy_info(&tcap->INFO_ack, icmp);
1533 1533                  tcap->CAP_bits1 |= TC1_INFO;
1534 1534          }
1535 1535  }
1536 1536  
1537 1537  /*
1538 1538   * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1539 1539   * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1540 1540   * icmp_g_t_info_ack.  The current state of the stream is copied from
1541 1541   * icmp_state.
1542 1542   */
1543 1543  static void
1544 1544  icmp_capability_req(queue_t *q, mblk_t *mp)
1545 1545  {
1546 1546          icmp_t                  *icmp = Q_TO_ICMP(q);
1547 1547          t_uscalar_t             cap_bits1;
1548 1548          struct T_capability_ack *tcap;
1549 1549  
1550 1550          cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1551 1551  
1552 1552          mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1553 1553              mp->b_datap->db_type, T_CAPABILITY_ACK);
1554 1554          if (!mp)
1555 1555                  return;
1556 1556  
1557 1557          tcap = (struct T_capability_ack *)mp->b_rptr;
1558 1558  
1559 1559          icmp_do_capability_ack(icmp, tcap, cap_bits1);
1560 1560  
1561 1561          qreply(q, mp);
1562 1562  }
1563 1563  
1564 1564  /*
1565 1565   * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1566 1566   * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1567 1567   * The current state of the stream is copied from icmp_state.
1568 1568   */
1569 1569  static void
1570 1570  icmp_info_req(queue_t *q, mblk_t *mp)
1571 1571  {
1572 1572          icmp_t  *icmp = Q_TO_ICMP(q);
1573 1573  
1574 1574          /* Create a T_INFO_ACK message. */
1575 1575          mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1576 1576              T_INFO_ACK);
1577 1577          if (!mp)
1578 1578                  return;
1579 1579          icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1580 1580          qreply(q, mp);
1581 1581  }
1582 1582  
1583 1583  static int
1584 1584  icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1585 1585      int family)
1586 1586  {
1587 1587          conn_t *connp;
1588 1588          dev_t   conn_dev;
1589 1589          int     error;
1590 1590  
1591 1591          /* If the stream is already open, return immediately. */
1592 1592          if (q->q_ptr != NULL)
1593 1593                  return (0);
1594 1594  
1595 1595          if (sflag == MODOPEN)
1596 1596                  return (EINVAL);
1597 1597  
1598 1598          /*
1599 1599           * Since ICMP is not used so heavily, allocating from the small
1600 1600           * arena should be sufficient.
1601 1601           */
1602 1602          if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1603 1603                  return (EBUSY);
1604 1604          }
1605 1605  
1606 1606          if (flag & SO_FALLBACK) {
1607 1607                  /*
1608 1608                   * Non streams socket needs a stream to fallback to
1609 1609                   */
1610 1610                  RD(q)->q_ptr = (void *)conn_dev;
1611 1611                  WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1612 1612                  WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1613 1613                  qprocson(q);
1614 1614                  return (0);
1615 1615          }
1616 1616  
1617 1617          connp = rawip_do_open(family, credp, &error, KM_SLEEP);
1618 1618          if (connp == NULL) {
1619 1619                  ASSERT(error != 0);
1620 1620                  inet_minor_free(ip_minor_arena_sa, conn_dev);
1621 1621                  return (error);
1622 1622          }
1623 1623  
1624 1624          *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1625 1625          connp->conn_dev = conn_dev;
1626 1626          connp->conn_minor_arena = ip_minor_arena_sa;
1627 1627  
1628 1628          /*
1629 1629           * Initialize the icmp_t structure for this stream.
1630 1630           */
1631 1631          q->q_ptr = connp;
1632 1632          WR(q)->q_ptr = connp;
1633 1633          connp->conn_rq = q;
1634 1634          connp->conn_wq = WR(q);
1635 1635  
1636 1636          WR(q)->q_hiwat = connp->conn_sndbuf;
1637 1637          WR(q)->q_lowat = connp->conn_sndlowat;
1638 1638  
1639 1639          qprocson(q);
1640 1640  
1641 1641          /* Set the Stream head write offset. */
1642 1642          (void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
1643 1643          (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
1644 1644  
1645 1645          mutex_enter(&connp->conn_lock);
1646 1646          connp->conn_state_flags &= ~CONN_INCIPIENT;
1647 1647          mutex_exit(&connp->conn_lock);
1648 1648  
1649 1649          icmp_bind_proto(connp->conn_icmp);
1650 1650  
1651 1651          return (0);
1652 1652  }
1653 1653  
1654 1654  /* For /dev/icmp aka AF_INET open */
1655 1655  static int
1656 1656  icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1657 1657  {
1658 1658          return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1659 1659  }
1660 1660  
1661 1661  /* For /dev/icmp6 aka AF_INET6 open */
1662 1662  static int
1663 1663  icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1664 1664  {
1665 1665          return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1666 1666  }
1667 1667  
1668 1668  /*
1669 1669   * This is the open routine for icmp.  It allocates a icmp_t structure for
1670 1670   * the stream and, on the first open of the module, creates an ND table.
1671 1671   */
1672 1672  static conn_t *
1673 1673  rawip_do_open(int family, cred_t *credp, int *err, int flags)
1674 1674  {
1675 1675          icmp_t  *icmp;
1676 1676          conn_t *connp;
1677 1677          zoneid_t zoneid;
1678 1678          netstack_t *ns;
1679 1679          icmp_stack_t *is;
1680 1680          int len;
1681 1681          boolean_t isv6 = B_FALSE;
1682 1682  
1683 1683          *err = secpolicy_net_icmpaccess(credp);
1684 1684          if (*err != 0)
1685 1685                  return (NULL);
1686 1686  
1687 1687          if (family == AF_INET6)
1688 1688                  isv6 = B_TRUE;
1689 1689  
1690 1690          ns = netstack_find_by_cred(credp);
1691 1691          ASSERT(ns != NULL);
1692 1692          is = ns->netstack_icmp;
1693 1693          ASSERT(is != NULL);
1694 1694  
1695 1695          /*
1696 1696           * For exclusive stacks we set the zoneid to zero
1697 1697           * to make ICMP operate as if in the global zone.
1698 1698           */
1699 1699          if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1700 1700                  zoneid = GLOBAL_ZONEID;
1701 1701          else
1702 1702                  zoneid = crgetzoneid(credp);
1703 1703  
1704 1704          ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1705 1705  
1706 1706          connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1707 1707          icmp = connp->conn_icmp;
1708 1708  
1709 1709          /*
1710 1710           * ipcl_conn_create did a netstack_hold. Undo the hold that was
1711 1711           * done by netstack_find_by_cred()
1712 1712           */
1713 1713          netstack_rele(ns);
1714 1714  
1715 1715          /*
1716 1716           * Since this conn_t/icmp_t is not yet visible to anybody else we don't
1717 1717           * need to lock anything.
1718 1718           */
1719 1719          ASSERT(connp->conn_proto == IPPROTO_ICMP);
1720 1720          ASSERT(connp->conn_icmp == icmp);
1721 1721          ASSERT(icmp->icmp_connp == connp);
1722 1722  
1723 1723          /* Set the initial state of the stream and the privilege status. */
1724 1724          icmp->icmp_state = TS_UNBND;
1725 1725          connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1726 1726          if (isv6) {
1727 1727                  connp->conn_family = AF_INET6;
1728 1728                  connp->conn_ipversion = IPV6_VERSION;
1729 1729                  connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
1730 1730                  connp->conn_proto = IPPROTO_ICMPV6;
1731 1731                  /* May be changed by a SO_PROTOTYPE socket option. */
1732 1732                  connp->conn_proto = IPPROTO_ICMPV6;
1733 1733                  connp->conn_ixa->ixa_protocol = connp->conn_proto;
1734 1734                  connp->conn_ixa->ixa_raw_cksum_offset = 2;
1735 1735                  connp->conn_default_ttl = is->is_ipv6_hoplimit;
1736 1736                  len = sizeof (ip6_t);
1737 1737          } else {
1738 1738                  connp->conn_family = AF_INET;
1739 1739                  connp->conn_ipversion = IPV4_VERSION;
1740 1740                  connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
1741 1741                  /* May be changed by a SO_PROTOTYPE socket option. */
1742 1742                  connp->conn_proto = IPPROTO_ICMP;
1743 1743                  connp->conn_ixa->ixa_protocol = connp->conn_proto;
1744 1744                  connp->conn_default_ttl = is->is_ipv4_ttl;
1745 1745                  len = sizeof (ipha_t);
1746 1746          }
1747 1747          connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
1748 1748  
1749 1749          connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1750 1750  
1751 1751          /*
1752 1752           * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
1753 1753           * the checksum is provided in the pre-built packet. We clear
1754 1754           * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
1755 1755           * complete IP header and not to compute the transport checksum.
1756 1756           */
1757 1757          connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
1758 1758          /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1759 1759          connp->conn_ixa->ixa_zoneid = zoneid;
1760 1760  
1761 1761          connp->conn_zoneid = zoneid;
1762 1762  
1763 1763          /*
1764 1764           * If the caller has the process-wide flag set, then default to MAC
1765 1765           * exempt mode.  This allows read-down to unlabeled hosts.
1766 1766           */
1767 1767          if (getpflags(NET_MAC_AWARE, credp) != 0)
1768 1768                  connp->conn_mac_mode = CONN_MAC_AWARE;
1769 1769  
1770 1770          connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
1771 1771  
1772 1772          icmp->icmp_is = is;
1773 1773  
1774 1774          connp->conn_rcvbuf = is->is_recv_hiwat;
1775 1775          connp->conn_sndbuf = is->is_xmit_hiwat;
1776 1776          connp->conn_sndlowat = is->is_xmit_lowat;
1777 1777          connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
1778 1778  
1779 1779          connp->conn_wroff = len + is->is_wroff_extra;
1780 1780          connp->conn_so_type = SOCK_RAW;
1781 1781  
1782 1782          connp->conn_recv = icmp_input;
1783 1783          connp->conn_recvicmp = icmp_icmp_input;
1784 1784          crhold(credp);
1785 1785          connp->conn_cred = credp;
1786 1786          connp->conn_cpid = curproc->p_pid;
1787 1787          connp->conn_open_time = ddi_get_lbolt64();
1788 1788          /* Cache things in ixa without an extra refhold */
1789 1789          ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1790 1790          connp->conn_ixa->ixa_cred = connp->conn_cred;
1791 1791          connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1792 1792          if (is_system_labeled())
1793 1793                  connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1794 1794  
1795 1795          connp->conn_flow_cntrld = B_FALSE;
1796 1796  
1797 1797          if (is->is_pmtu_discovery)
1798 1798                  connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
1799 1799  
1800 1800          return (connp);
1801 1801  }
1802 1802  
1803 1803  /*
1804 1804   * Which ICMP options OK to set through T_UNITDATA_REQ...
1805 1805   */
1806 1806  /* ARGSUSED */
1807 1807  static boolean_t
1808 1808  icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1809 1809  {
1810 1810          return (B_TRUE);
1811 1811  }
1812 1812  
1813 1813  /*
1814 1814   * This routine gets default values of certain options whose default
1815 1815   * values are maintained by protcol specific code
1816 1816   */
1817 1817  int
1818 1818  icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1819 1819  {
1820 1820          icmp_t *icmp = Q_TO_ICMP(q);
1821 1821          icmp_stack_t *is = icmp->icmp_is;
1822 1822          int *i1 = (int *)ptr;
1823 1823  
1824 1824          switch (level) {
1825 1825          case IPPROTO_IP:
1826 1826                  switch (name) {
1827 1827                  case IP_MULTICAST_TTL:
1828 1828                          *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1829 1829                          return (sizeof (uchar_t));
1830 1830                  case IP_MULTICAST_LOOP:
1831 1831                          *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1832 1832                          return (sizeof (uchar_t));
1833 1833                  }
1834 1834                  break;
1835 1835          case IPPROTO_IPV6:
1836 1836                  switch (name) {
1837 1837                  case IPV6_MULTICAST_HOPS:
1838 1838                          *i1 = IP_DEFAULT_MULTICAST_TTL;
1839 1839                          return (sizeof (int));
1840 1840                  case IPV6_MULTICAST_LOOP:
1841 1841                          *i1 = IP_DEFAULT_MULTICAST_LOOP;
1842 1842                          return (sizeof (int));
1843 1843                  case IPV6_UNICAST_HOPS:
1844 1844                          *i1 = is->is_ipv6_hoplimit;
1845 1845                          return (sizeof (int));
1846 1846                  }
1847 1847                  break;
1848 1848          case IPPROTO_ICMPV6:
1849 1849                  switch (name) {
1850 1850                  case ICMP6_FILTER:
1851 1851                          /* Make it look like "pass all" */
1852 1852                          ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1853 1853                          return (sizeof (icmp6_filter_t));
1854 1854                  }
1855 1855                  break;
1856 1856          }
1857 1857          return (-1);
1858 1858  }
1859 1859  
1860 1860  /*
1861 1861   * This routine retrieves the current status of socket options.
1862 1862   * It returns the size of the option retrieved, or -1.
1863 1863   */
1864 1864  int
1865 1865  icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1866 1866  {
1867 1867          icmp_t          *icmp = connp->conn_icmp;
1868 1868          int             *i1 = (int *)ptr;
1869 1869          conn_opt_arg_t  coas;
1870 1870          int             retval;
1871 1871  
1872 1872          coas.coa_connp = connp;
1873 1873          coas.coa_ixa = connp->conn_ixa;
1874 1874          coas.coa_ipp = &connp->conn_xmit_ipp;
1875 1875          coas.coa_ancillary = B_FALSE;
1876 1876          coas.coa_changed = 0;
1877 1877  
1878 1878          /*
1879 1879           * We assume that the optcom framework has checked for the set
1880 1880           * of levels and names that are supported, hence we don't worry
1881 1881           * about rejecting based on that.
1882 1882           * First check for ICMP specific handling, then pass to common routine.
1883 1883           */
1884 1884          switch (level) {
1885 1885          case IPPROTO_IP:
1886 1886                  /*
1887 1887                   * Only allow IPv4 option processing on IPv4 sockets.
1888 1888                   */
1889 1889                  if (connp->conn_family != AF_INET)
1890 1890                          return (-1);
1891 1891  
1892 1892                  switch (name) {
1893 1893                  case IP_OPTIONS:
1894 1894                  case T_IP_OPTIONS:
1895 1895                          /* Options are passed up with each packet */
1896 1896                          return (0);
1897 1897                  case IP_HDRINCL:
1898 1898                          mutex_enter(&connp->conn_lock);
1899 1899                          *i1 = (int)icmp->icmp_hdrincl;
1900 1900                          mutex_exit(&connp->conn_lock);
1901 1901                          return (sizeof (int));
1902 1902                  }
1903 1903                  break;
1904 1904  
1905 1905          case IPPROTO_IPV6:
1906 1906                  /*
1907 1907                   * Only allow IPv6 option processing on native IPv6 sockets.
1908 1908                   */
1909 1909                  if (connp->conn_family != AF_INET6)
1910 1910                          return (-1);
1911 1911  
1912 1912                  switch (name) {
1913 1913                  case IPV6_CHECKSUM:
1914 1914                          /*
1915 1915                           * Return offset or -1 if no checksum offset.
1916 1916                           * Does not apply to IPPROTO_ICMPV6
1917 1917                           */
1918 1918                          if (connp->conn_proto == IPPROTO_ICMPV6)
1919 1919                                  return (-1);
1920 1920  
1921 1921                          mutex_enter(&connp->conn_lock);
1922 1922                          if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
1923 1923                                  *i1 = connp->conn_ixa->ixa_raw_cksum_offset;
1924 1924                          else
1925 1925                                  *i1 = -1;
1926 1926                          mutex_exit(&connp->conn_lock);
1927 1927                          return (sizeof (int));
1928 1928                  }
1929 1929                  break;
1930 1930  
1931 1931          case IPPROTO_ICMPV6:
1932 1932                  /*
1933 1933                   * Only allow IPv6 option processing on native IPv6 sockets.
1934 1934                   */
1935 1935                  if (connp->conn_family != AF_INET6)
1936 1936                          return (-1);
1937 1937  
1938 1938                  if (connp->conn_proto != IPPROTO_ICMPV6)
1939 1939                          return (-1);
1940 1940  
1941 1941                  switch (name) {
1942 1942                  case ICMP6_FILTER:
1943 1943                          mutex_enter(&connp->conn_lock);
1944 1944                          if (icmp->icmp_filter == NULL) {
1945 1945                                  /* Make it look like "pass all" */
1946 1946                                  ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1947 1947                          } else {
1948 1948                                  (void) bcopy(icmp->icmp_filter, ptr,
1949 1949                                      sizeof (icmp6_filter_t));
1950 1950                          }
1951 1951                          mutex_exit(&connp->conn_lock);
1952 1952                          return (sizeof (icmp6_filter_t));
1953 1953                  }
1954 1954          }
1955 1955          mutex_enter(&connp->conn_lock);
1956 1956          retval = conn_opt_get(&coas, level, name, ptr);
1957 1957          mutex_exit(&connp->conn_lock);
1958 1958          return (retval);
1959 1959  }
1960 1960  
1961 1961  /*
1962 1962   * This routine retrieves the current status of socket options.
1963 1963   * It returns the size of the option retrieved, or -1.
1964 1964   */
1965 1965  int
1966 1966  icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1967 1967  {
1968 1968          conn_t          *connp = Q_TO_CONN(q);
1969 1969          int             err;
1970 1970  
1971 1971          err = icmp_opt_get(connp, level, name, ptr);
1972 1972          return (err);
1973 1973  }
1974 1974  
1975 1975  static int
1976 1976  icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp)
1977 1977  {
1978 1978          struct bpf_program prog;
1979 1979          ip_bpf_insn_t *insns = NULL;
1980 1980          unsigned int size;
1981 1981  
1982 1982  #ifdef _LP64
1983 1983          if (get_udatamodel() != DATAMODEL_NATIVE) {
1984 1984                  struct bpf_program32 *prog32;
1985 1985  
1986 1986                  if (inlen != sizeof (struct bpf_program32)) {
1987 1987                          return (EINVAL);
1988 1988                  }
1989 1989                  prog32 = (struct bpf_program32 *)invalp;
1990 1990                  prog.bf_len = prog32->bf_len;
1991 1991                  prog.bf_insns = (void *)(uint64_t)prog32->bf_insns;
1992 1992          } else
1993 1993  #endif
1994 1994          if (inlen == sizeof (struct bpf_program)) {
1995 1995                  bcopy(invalp, &prog, sizeof (prog));
1996 1996          } else {
1997 1997                  return (EINVAL);
1998 1998          }
1999 1999  
2000 2000          if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) {
2001 2001                  return (EINVAL);
2002 2002          }
2003 2003          size = prog.bf_len * sizeof (struct bpf_insn);
2004 2004          insns = kmem_alloc(size, KM_SLEEP);
2005 2005          if (copyin(prog.bf_insns, insns, size) != 0) {
2006 2006                  kmem_free(insns, size);
2007 2007                  return (EFAULT);
2008 2008          }
2009 2009          if (!ip_bpf_validate(insns, prog.bf_len)) {
2010 2010                  kmem_free(insns, size);
2011 2011                  return (EINVAL);
2012 2012          }
2013 2013  
2014 2014          rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
2015 2015          if (icmp->icmp_bpf_len != 0) {
2016 2016                  ASSERT(icmp->icmp_bpf_prog != NULL);
2017 2017  
2018 2018                  kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
2019 2019          }
2020 2020          icmp->icmp_bpf_len = size;
2021 2021          icmp->icmp_bpf_prog = insns;
2022 2022          rw_exit(&icmp->icmp_bpf_lock);
2023 2023          return (0);
2024 2024  }
2025 2025  
2026 2026  static int
2027 2027  icmp_detach_filter(icmp_t *icmp)
2028 2028  {
2029 2029          int error;
2030 2030  
2031 2031          rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
2032 2032          if (icmp->icmp_bpf_len == 0) {
2033 2033                  ASSERT(icmp->icmp_bpf_prog == NULL);
2034 2034                  error = ENOENT;
2035 2035          } else {
2036 2036                  kmem_free(icmp->icmp_bpf_prog,
2037 2037                      icmp->icmp_bpf_len);
2038 2038                  icmp->icmp_bpf_len = 0;
2039 2039                  icmp->icmp_bpf_prog = NULL;
2040 2040                  error = 0;
2041 2041          }
2042 2042          rw_exit(&icmp->icmp_bpf_lock);
2043 2043          return (error);
2044 2044  }
2045 2045  
2046 2046  static boolean_t
2047 2047  icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira)
2048 2048  {
2049 2049          boolean_t res;
2050 2050          uchar_t *buf = mp->b_rptr;
2051 2051          uint_t wirelen, len = MBLKL(mp);
2052 2052  
2053 2053          rw_enter(&icmp->icmp_bpf_lock, RW_READER);
2054 2054          if (icmp->icmp_bpf_len == 0) {
2055 2055                  rw_exit(&icmp->icmp_bpf_lock);
2056 2056                  return (B_FALSE);
2057 2057          }
2058 2058          if (ira->ira_flags & IRAF_IS_IPV4) {
2059 2059                  ipha_t *ipha = (ipha_t *)buf;
2060 2060  
2061 2061                  wirelen = ntohs(ipha->ipha_length);
2062 2062          } else {
2063 2063                  ip6_t *ip6h = (ip6_t *)buf;
2064 2064  
2065 2065                  wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2066 2066          }
2067 2067          res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len);
2068 2068          rw_exit(&icmp->icmp_bpf_lock);
2069 2069  
2070 2070          return (res);
2071 2071  }
2072 2072  
2073 2073  /*
2074 2074   * Set, or when checkonly is B_TRUE only validate, one socket option.
            *
            * The options with raw-IP specific handling are:
            *   SOL_SOCKET:     SO_PROTOTYPE, SO_SNDBUF, SO_RCVBUF,
            *                   SO_ATTACH_FILTER, SO_DETACH_FILTER
            *   IPPROTO_IP:     IP_HDRINCL (AF_INET sockets only)
            *   IPPROTO_IPV6:   IPV6_CHECKSUM (AF_INET6 sockets only)
            *   IPPROTO_ICMPV6: ICMP6_FILTER (AF_INET6 sockets using ICMPv6 only)
            *
            * Cases that end in "break", and any level/name not listed above, continue
            * on to conn_opt_set().  SO_PROTOTYPE (when not checkonly), the two
            * filter options and every error path return directly instead.
            *
            * Follow-up work for the caller is flagged in coa->coa_changed.
            * conn_lock must not be held on entry; it is taken here where needed.
            *
            * Returns 0 on success or an errno value.
2075 2075   */
2076 2076  int
2077 2077  icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
2078 2078      uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
2079 2079  {
2080 2080          conn_t          *connp = coa->coa_connp;
2081 2081          ip_xmit_attr_t  *ixa = coa->coa_ixa;
2082 2082          icmp_t          *icmp = connp->conn_icmp;
2083 2083          icmp_stack_t    *is = icmp->icmp_is;
2084 2084          int             *i1 = (int *)invalp;
2085 2085          boolean_t       onoff = (*i1 == 0) ? 0 : 1;
2086 2086          int             error;
                   /*
                    * NOTE(review): *i1 is read above for every option, before any
                    * length check, yet ICMP6_FILTER below accepts inlen == 0.  Confirm
                    * that callers always pass a readable, int-aligned invalp.
                    */
2087 2087  
2088 2088          ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
2089 2089  
2090 2090          /*
2091 2091           * For fixed length options, no sanity check
2092 2092           * of passed in length is done. It is assumed *_optcom_req()
2093 2093           * routines do the right thing.
2094 2094           */
2095 2095  
2096 2096          switch (level) {
2097 2097          case SOL_SOCKET:
2098 2098                  switch (name) {
2099 2099                  case SO_PROTOTYPE:
                                   /*
                                    * Only the low byte of the value is used.  Any
                                    * protocol other than ICMP/ICMPv6 additionally
                                    * requires secpolicy_net_rawaccess() to succeed.
                                    */
2100 2100                          if ((*i1 & 0xFF) != IPPROTO_ICMP &&
2101 2101                              (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
2102 2102                              secpolicy_net_rawaccess(cr) != 0) {
2103 2103                                  return (EACCES);
2104 2104                          }
2105 2105                          if (checkonly)
2106 2106                                  break;
2107 2107  
2108 2108                          mutex_enter(&connp->conn_lock);
2109 2109                          connp->conn_proto = *i1 & 0xFF;
2110 2110                          ixa->ixa_protocol = connp->conn_proto;
                                   /*
                                    * Pick the header-include and ULP checksum settings
                                    * that go with the new protocol.
                                    */
2111 2111                          if ((connp->conn_proto == IPPROTO_RAW ||
2112 2112                              connp->conn_proto == IPPROTO_IGMP) &&
2113 2113                              connp->conn_family == AF_INET) {
2114 2114                                  icmp->icmp_hdrincl = 1;
2115 2115                                  ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2116 2116                          } else if (connp->conn_proto == IPPROTO_UDP ||
2117 2117                              connp->conn_proto == IPPROTO_TCP ||
2118 2118                              connp->conn_proto == IPPROTO_SCTP) {
2119 2119                                  /* Used by test applications like psh */
2120 2120                                  icmp->icmp_hdrincl = 0;
2121 2121                                  ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2122 2122                          } else {
2123 2123                                  icmp->icmp_hdrincl = 0;
2124 2124                                  ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2125 2125                          }
2126 2126  
2127 2127                          if (connp->conn_family == AF_INET6 &&
2128 2128                              connp->conn_proto == IPPROTO_ICMPV6) {
2129 2129                                  /* Set offset for icmp6_cksum */
2130 2130                                  ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2131 2131                                  ixa->ixa_raw_cksum_offset = 2;
2132 2132                          }
                                   /* An ICMPv6 filter only applies while using ICMPv6. */
2133 2133                          if (icmp->icmp_filter != NULL &&
2134 2134                              connp->conn_proto != IPPROTO_ICMPV6) {
2135 2135                                  kmem_free(icmp->icmp_filter,
2136 2136                                      sizeof (icmp6_filter_t));
2137 2137                                  icmp->icmp_filter = NULL;
2138 2138                          }
2139 2139                          mutex_exit(&connp->conn_lock);
2140 2140  
2141 2141                          coa->coa_changed |= COA_HEADER_CHANGED;
2142 2142                          /*
2143 2143                           * For SCTP, we don't use icmp_bind_proto() for
2144 2144                           * raw socket binding.
2145 2145                           */
2146 2146                          if (connp->conn_proto == IPPROTO_SCTP)
2147 2147                                  return (0);
2148 2148  
2149 2149                          coa->coa_changed |= COA_ICMP_BIND_NEEDED;
2150 2150                          return (0);
2151 2151  
                           /*
                            * The buffer sizes are only range checked against is_max_buf
                            * here; the break hands them on to conn_opt_set().
                            */
2152 2152                  case SO_SNDBUF:
2153 2153                          if (*i1 > is->is_max_buf) {
2154 2154                                  return (ENOBUFS);
2155 2155                          }
2156 2156                          break;
2157 2157                  case SO_RCVBUF:
2158 2158                          if (*i1 > is->is_max_buf) {
2159 2159                                  return (ENOBUFS);
2160 2160                          }
2161 2161                          break;
                           /*
                            * NOTE(review): the two filter options do not consult
                            * checkonly, so a check-only request still attaches or
                            * detaches the program.  Confirm this is intended.
                            */
2162 2162                  case SO_ATTACH_FILTER:
2163 2163                          return (icmp_attach_filter(icmp, inlen, invalp));
2164 2164                  case SO_DETACH_FILTER:
2165 2165                          return (icmp_detach_filter(icmp));
2166 2166                  }
2167 2167                  break;
2168 2168  
2169 2169          case IPPROTO_IP:
2170 2170                  /*
2171 2171                   * Only allow IPv4 option processing on IPv4 sockets.
2172 2172                   */
2173 2173                  if (connp->conn_family != AF_INET)
2174 2174                          return (EINVAL);
2175 2175  
2176 2176                  switch (name) {
2177 2177                  case IP_HDRINCL:
                                   /*
                                    * ULP checksumming is turned off while header-include
                                    * is on, and turned back on when it is cleared.
                                    */
2178 2178                          if (!checkonly) {
2179 2179                                  mutex_enter(&connp->conn_lock);
2180 2180                                  icmp->icmp_hdrincl = onoff;
2181 2181                                  if (onoff)
2182 2182                                          ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2183 2183                                  else
2184 2184                                          ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2185 2185                                  mutex_exit(&connp->conn_lock);
2186 2186                          }
2187 2187                          break;
2188 2188                  }
2189 2189                  break;
2190 2190  
2191 2191          case IPPROTO_IPV6:
                           /* Only allow IPv6 option processing on IPv6 sockets. */
2192 2192                  if (connp->conn_family != AF_INET6)
2193 2193                          return (EINVAL);
2194 2194  
2195 2195                  switch (name) {
2196 2196                  case IPV6_CHECKSUM:
2197 2197                          /*
2198 2198                           * Integer offset into the user data of where the
2199 2199                           * checksum is located.
2200 2200                           * Offset of -1 disables option.
2201 2201                           * Does not apply to IPPROTO_ICMPV6.
2202 2202                           */
2203 2203                          if (connp->conn_proto == IPPROTO_ICMPV6 ||
2204 2204                              coa->coa_ancillary) {
2205 2205                                  return (EINVAL);
2206 2206                          }
2207 2207                          if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2208 2208                                  /* Negative or not 16 bit aligned offset */
2209 2209                                  return (EINVAL);
2210 2210                          }
2211 2211                          if (checkonly)
2212 2212                                  break;
2213 2213  
2214 2214                          mutex_enter(&connp->conn_lock);
2215 2215                          if (*i1 == -1) {
2216 2216                                  ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2217 2217                                  ixa->ixa_raw_cksum_offset = 0;
2218 2218                                  ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2219 2219                          } else {
2220 2220                                  ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
2221 2221                                  ixa->ixa_raw_cksum_offset = *i1;
2222 2222                                  ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2223 2223                          }
2224 2224                          mutex_exit(&connp->conn_lock);
2225 2225                          break;
2226 2226                  }
2227 2227                  break;
2228 2228  
2229 2229          case IPPROTO_ICMPV6:
2230 2230                  /*
2231 2231                   * Only allow IPv6 option processing on IPv6 sockets.
2232 2232                   */
2233 2233                  if (connp->conn_family != AF_INET6)
2234 2234                          return (EINVAL);
2235 2235                  if (connp->conn_proto != IPPROTO_ICMPV6)
2236 2236                          return (EINVAL);
2237 2237  
2238 2238                  switch (name) {
2239 2239                  case ICMP6_FILTER:
                                   /*
                                    * inlen == 0 removes the filter; otherwise inlen must
                                    * be exactly sizeof (icmp6_filter_t) and the filter is
                                    * copied in, allocating storage on first use.
                                    *
                                    * NOTE(review): the checkonly exit comes before the
                                    * length validation, so a check-only request with a
                                    * bad length is not rejected here.
                                    */
2240 2240                          if (checkonly)
2241 2241                                  break;
2242 2242  
2243 2243                          if ((inlen != 0) &&
2244 2244                              (inlen != sizeof (icmp6_filter_t)))
2245 2245                                  return (EINVAL);
2246 2246  
2247 2247                          mutex_enter(&connp->conn_lock);
2248 2248                          if (inlen == 0) {
2249 2249                                  if (icmp->icmp_filter != NULL) {
2250 2250                                          kmem_free(icmp->icmp_filter,
2251 2251                                              sizeof (icmp6_filter_t));
2252 2252                                          icmp->icmp_filter = NULL;
2253 2253                                  }
2254 2254                          } else {
2255 2255                                  if (icmp->icmp_filter == NULL) {
                                                   /* KM_NOSLEEP: conn_lock is held. */
2256 2256                                          icmp->icmp_filter = kmem_alloc(
2257 2257                                              sizeof (icmp6_filter_t),
2258 2258                                              KM_NOSLEEP);
2259 2259                                          if (icmp->icmp_filter == NULL) {
2260 2260                                                  mutex_exit(&connp->conn_lock);
2261 2261                                                  return (ENOBUFS);
2262 2262                                          }
2263 2263                                  }
2264 2264                                  (void) bcopy(invalp, icmp->icmp_filter, inlen);
2265 2265                          }
2266 2266                          mutex_exit(&connp->conn_lock);
2267 2267                          break;
2268 2268                  }
2269 2269                  break;
2270 2270          }
                   /* Everything that did not return above goes through conn_opt_set(). */
2271 2271          error = conn_opt_set(coa, level, name, inlen, invalp,
2272 2272              checkonly, cr);
2273 2273          return (error);
2274 2274  }
2275 2275  
2276 2276  /*
2277 2277   * Socket option set entry point for a raw IP conn.
            *
            * optset_context selects between a check-only request and the negotiate
            * variants.  The option itself is processed by icmp_do_opt_set(); on
            * success the input value is copied to outvalp/outlenp and, unless the
            * option came in as ancillary data, the follow-up work flagged in
            * coa_changed (route, header template, buffer sizes, write offset,
            * protocol bind) is carried out.
            *
            * thisdg_attrs, when non-NULL, is the caller's conn_opt_arg_t for
            * per-datagram (ancillary) options.  Otherwise a local one is set up with
            * a reference on the conn's ixa, which is released before returning.
            *
            * Returns 0 on success or an errno value; *outlenp is set to 0 on the
            * error paths taken before the output value has been filled in.
2278 2278   */
2279 2279  int
2280 2280  icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
2281 2281      uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2282 2282      void *thisdg_attrs, cred_t *cr)
2283 2283  {
2284 2284          icmp_t          *icmp = connp->conn_icmp;
2285 2285          int             err;
2286 2286          conn_opt_arg_t  coas, *coa;
2287 2287          boolean_t       checkonly;
2288 2288          icmp_stack_t    *is = icmp->icmp_is;
2289 2289  
2290 2290          switch (optset_context) {
2291 2291          case SETFN_OPTCOM_CHECKONLY:
2292 2292                  checkonly = B_TRUE;
2293 2293                  /*
2294 2294                   * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2295 2295                   * inlen != 0 implies value supplied and
2296 2296                   *      we have to "pretend" to set it.
2297 2297                   * inlen == 0 implies that there is no
2298 2298                   *      value part in T_CHECK request and just validation
2299 2299                   * done elsewhere should be enough, we just return here.
2300 2300                   */
2301 2301                  if (inlen == 0) {
2302 2302                          *outlenp = 0;
2303 2303                          return (0);
2304 2304                  }
2305 2305                  break;
2306 2306          case SETFN_OPTCOM_NEGOTIATE:
2307 2307                  checkonly = B_FALSE;
2308 2308                  break;
2309 2309          case SETFN_UD_NEGOTIATE:
2310 2310          case SETFN_CONN_NEGOTIATE:
2311 2311                  checkonly = B_FALSE;
2312 2312                  /*
2313 2313                   * Negotiating local and "association-related" options
2314 2314                   * through T_UNITDATA_REQ.
2315 2315                   *
2316 2316                   * Following routine can filter out ones we do not
2317 2317                   * want to be "set" this way.
2318 2318                   */
2319 2319                  if (!icmp_opt_allow_udr_set(level, name)) {
2320 2320                          *outlenp = 0;
2321 2321                          return (EINVAL);
2322 2322                  }
2323 2323                  break;
2324 2324          default:
2325 2325                  /*
2326 2326                   * We should never get here
2327 2327                   */
2328 2328                  *outlenp = 0;
2329 2329                  return (EINVAL);
2330 2330          }
2331 2331  
2332 2332          ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2333 2333              (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2334 2334  
2335 2335          if (thisdg_attrs != NULL) {
2336 2336                  /* Options from T_UNITDATA_REQ */
2337 2337                  coa = (conn_opt_arg_t *)thisdg_attrs;
2338 2338                  ASSERT(coa->coa_connp == connp);
2339 2339                  ASSERT(coa->coa_ixa != NULL);
2340 2340                  ASSERT(coa->coa_ipp != NULL);
2341 2341                  ASSERT(coa->coa_ancillary);
2342 2342          } else {
2343 2343                  coa = &coas;
2344 2344                  coas.coa_connp = connp;
2345 2345                  /* Get a reference on conn_ixa to prevent concurrent mods */
2346 2346                  coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
2347 2347                  if (coas.coa_ixa == NULL) {
2348 2348                          *outlenp = 0;
2349 2349                          return (ENOMEM);
2350 2350                  }
2351 2351                  coas.coa_ipp = &connp->conn_xmit_ipp;
2352 2352                  coas.coa_ancillary = B_FALSE;
2353 2353                  coas.coa_changed = 0;
2354 2354          }
2355 2355  
2356 2356          err = icmp_do_opt_set(coa, level, name, inlen, invalp,
2357 2357              cr, checkonly);
2358 2358          if (err != 0) {
                           /*
                            * Nothing in this function jumps to errout; the label is only
                            * reached by falling into this branch.  The ixa reference is
                            * dropped only when it was taken above (non-ancillary case).
                            */
2359 2359  errout:
2360 2360                  if (!coa->coa_ancillary)
2361 2361                          ixa_refrele(coa->coa_ixa);
2362 2362                  *outlenp = 0;
2363 2363                  return (err);
2364 2364          }
2365 2365  
2366 2366          /*
2367 2367           * Common case of OK return with outval same as inval.
2368 2368           */
2369 2369          if (invalp != outvalp) {
2370 2370                  /* don't trust bcopy for identical src/dst */
2371 2371                  (void) bcopy(invalp, outvalp, inlen);
2372 2372          }
2373 2373          *outlenp = inlen;
2374 2374  
2375 2375          /*
2376 2376           * If this was not ancillary data, then we rebuild the headers,
2377 2377           * update the IRE/NCE, and IPsec as needed.
2378 2378           * Since the label depends on the destination we go through
2379 2379           * ip_set_destination first.
2380 2380           */
2381 2381          if (coa->coa_ancillary) {
2382 2382                  return (0);
2383 2383          }
2384 2384  
2385 2385          if (coa->coa_changed & COA_ROUTE_CHANGED) {
2386 2386                  in6_addr_t saddr, faddr, nexthop;
2387 2387                  in_port_t fport;
2388 2388  
2389 2389                  /*
2390 2390                   * We clear lastdst to make sure we pick up the change
2391 2391                   * next time sending.
2392 2392                   * If we are connected we re-cache the information.
2393 2393                   * We ignore errors to preserve BSD behavior.
2394 2394                   * Note that we don't redo IPsec policy lookup here
2395 2395                   * since the final destination (or source) didn't change.
2396 2396                   */
2397 2397                  mutex_enter(&connp->conn_lock);
2398 2398                  connp->conn_v6lastdst = ipv6_all_zeros;
2399 2399  
2400 2400                  ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
2401 2401                      &connp->conn_faddr_v6, &nexthop);
                           /* Copy the addresses so they can be used after the unlock. */
2402 2402                  saddr = connp->conn_saddr_v6;
2403 2403                  faddr = connp->conn_faddr_v6;
2404 2404                  fport = connp->conn_fport;
2405 2405                  mutex_exit(&connp->conn_lock);
2406 2406  
2407 2407                  if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
2408 2408                      !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
2409 2409                          (void) ip_attr_connect(connp, coa->coa_ixa,
2410 2410                              &saddr, &faddr, &nexthop, fport, NULL, NULL,
2411 2411                              IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
2412 2412                  }
2413 2413          }
2414 2414  
                   /* Done with the ixa; the steps below work on connp only. */
2415 2415          ixa_refrele(coa->coa_ixa);
2416 2416  
2417 2417          if (coa->coa_changed & COA_HEADER_CHANGED) {
2418 2418                  /*
2419 2419                   * Rebuild the header template if we are connected.
2420 2420                   * Otherwise clear conn_v6lastdst so we rebuild the header
2421 2421                   * in the data path.
2422 2422                   */
2423 2423                  mutex_enter(&connp->conn_lock);
2424 2424                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2425 2425                      !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2426 2426                          err = icmp_build_hdr_template(connp,
2427 2427                              &connp->conn_saddr_v6, &connp->conn_faddr_v6,
2428 2428                              connp->conn_flowinfo);
2429 2429                          if (err != 0) {
                                           /*
                                            * NOTE(review): returning here skips the
                                            * RCVBUF/SNDBUF/WROFF/bind steps below, and
                                            * *outlenp has already been set to inlen.
                                            */
2430 2430                                  mutex_exit(&connp->conn_lock);
2431 2431                                  return (err);
2432 2432                          }
2433 2433                  } else {
2434 2434                          connp->conn_v6lastdst = ipv6_all_zeros;
2435 2435                  }
2436 2436                  mutex_exit(&connp->conn_lock);
2437 2437          }
2438 2438          if (coa->coa_changed & COA_RCVBUF_CHANGED) {
2439 2439                  (void) proto_set_rx_hiwat(connp->conn_rq, connp,
2440 2440                      connp->conn_rcvbuf);
2441 2441          }
                   /* The write queue's q_hiwat is only updated for STREAMS conns. */
2442 2442          if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
2443 2443                  connp->conn_wq->q_hiwat = connp->conn_sndbuf;
2444 2444          }
2445 2445          if (coa->coa_changed & COA_WROFF_CHANGED) {
2446 2446                  /* Increase wroff if needed */
2447 2447                  uint_t wroff;
2448 2448  
2449 2449                  mutex_enter(&connp->conn_lock);
2450 2450                  wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
2451 2451                  if (wroff > connp->conn_wroff) {
2452 2452                          connp->conn_wroff = wroff;
                                   /* conn_lock is dropped before calling out. */
2453 2453                          mutex_exit(&connp->conn_lock);
2454 2454                          (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
2455 2455                  } else {
2456 2456                          mutex_exit(&connp->conn_lock);
2457 2457                  }
2458 2458          }
2459 2459          if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
2460 2460                  icmp_bind_proto(icmp);
2461 2461          }
                   /* err is 0 here: every non-zero value has already been returned. */
2462 2462          return (err);
2463 2463  }
2464 2464  
2465 2465  /* This routine sets socket options. */
           /*
            * Queue-based wrapper around icmp_opt_set(): the conn is looked up from q
            * with Q_TO_CONN() and all other arguments, and the return value, are
            * passed through unchanged.
            */
2466 2466  int
2467 2467  icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2468 2468      uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2469 2469      void *thisdg_attrs, cred_t *cr)
2470 2470  {
2471 2471          conn_t  *connp = Q_TO_CONN(q);
2472 2472          int error;
2473 2473  
2474 2474          error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
2475 2475              outlenp, outvalp, thisdg_attrs, cr);
2476 2476          return (error);
2477 2477  }
2478 2478  
2479 2479  /*
2480 2480   * Setup IP headers.
2481 2481   *
2482 2482   * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
2483 2483   * but icmp_output_hdrincl restores ipha_protocol once we return.
            *
            * The headers are prepended to data_mp by conn_prepend_hdr() and the
            * total length is recorded in ixa->ixa_pktlen.  For IPv6, when a ULP
            * checksum has been requested, the checksum field is seeded with the
            * folded partial sum so that IP can complete it.
            *
            * Returns the message to send, or NULL with *errorp set to a non-zero
            * errno value.
2484 2484   */
2485 2485  mblk_t *
2486 2486  icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2487 2487      const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
2488 2488      mblk_t *data_mp, int *errorp)
2489 2489  {
2490 2490          mblk_t          *mp;
2491 2491          icmp_stack_t    *is = connp->conn_netstack->netstack_icmp;
2492 2492          uint_t          data_len;
2493 2493          uint32_t        cksum;
2494 2494  
2495 2495          data_len = msgdsize(data_mp);
2496 2496          mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
2497 2497              flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
2498 2498          if (mp == NULL) {
2499 2499                  ASSERT(*errorp != 0);
2500 2500                  return (NULL);
2501 2501          }
2502 2502  
2503 2503          ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2504 2504  
2505 2505          /*
2506 2506           * If there was a routing option/header then conn_prepend_hdr
2507 2507           * has massaged it and placed the pseudo-header checksum difference
2508 2508           * in the cksum argument.
2509 2509           *
2510 2510           * Prepare for ICMPv6 checksum done in IP.
2511 2511           *
2512 2512           * We make it easy for IP to include our pseudo header
2513 2513           * by putting our length (and any routing header adjustment)
2514 2514           * in the ICMPv6 checksum field.
2515 2515           * The IP source, destination, and length have already been set by
2516 2516           * conn_prepend_hdr.
2517 2517           */
2518 2518          cksum += data_len;
                   /* One fold of the carry; the ASSERT checks the result fits 16 bits. */
2519 2519          cksum = (cksum >> 16) + (cksum & 0xFFFF);
2520 2520          ASSERT(cksum < 0x10000);
2521 2521  
2522 2522          if (ixa->ixa_flags & IXAF_IS_IPV4) {
                           /* IPv4: nothing is written here, only a length sanity check. */
2523 2523                  ipha_t  *ipha = (ipha_t *)mp->b_rptr;
2524 2524  
2525 2525                  ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2526 2526          } else {
2527 2527                  ip6_t   *ip6h = (ip6_t *)mp->b_rptr;
2528 2528                  uint_t  cksum_offset = 0;
2529 2529  
2530 2530                  ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2531 2531  
                           /*
                            * Locate the checksum field relative to the start of the
                            * packet: the icmp6_cksum member for ICMPv6, otherwise the
                            * raw checksum offset stored in the ixa.  cksum_offset
                            * stays 0 when no checksum is to be seeded.
                            */
2532 2532                  if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
2533 2533                          if (connp->conn_proto == IPPROTO_ICMPV6) {
2534 2534                                  cksum_offset = ixa->ixa_ip_hdr_length +
2535 2535                                      offsetof(icmp6_t, icmp6_cksum);
2536 2536                          } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2537 2537                                  cksum_offset = ixa->ixa_ip_hdr_length +
2538 2538                                      ixa->ixa_raw_cksum_offset;
2539 2539                          }
2540 2540                  }
2541 2541                  if (cksum_offset != 0) {
2542 2542                          uint16_t *ptr;
2543 2543  
2544 2544                          /* Make sure the checksum fits in the first mblk */
2545 2545                          if (cksum_offset + sizeof (short) > MBLKL(mp)) {
2546 2546                                  mblk_t *mp1;
2547 2547  
2548 2548                                  mp1 = msgpullup(mp,
2549 2549                                      cksum_offset + sizeof (short));
                                           /*
                                            * The original message is freed whether or
                                            * not the pullup succeeded.
                                            */
2550 2550                                  freemsg(mp);
2551 2551                                  if (mp1 == NULL) {
2552 2552                                          *errorp = ENOMEM;
2553 2553                                          return (NULL);
2554 2554                                  }
2555 2555                                  mp = mp1;
                                           /*
                                            * NOTE(review): ip6h is refreshed for the new
                                            * mblk but is not read again below.
                                            */
2556 2556                                  ip6h = (ip6_t *)mp->b_rptr;
2557 2557                          }
2558 2558                          ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
2559 2559                          *ptr = htons(cksum);
2560 2560                  }
2561 2561          }
2562 2562  
2563 2563          /* Note that we don't try to update wroff due to ancillary data */
2564 2564          return (mp);
2565 2565  }
2566 2566  
           /*
            * Build the header template for connp through conn_build_hdr_template(),
            * after clearing conn_v6lastdst.
            *
            * The caller must hold conn_lock (asserted).
            *
            * Returns 0 on success or the error from conn_build_hdr_template().
            */
2567 2567  static int
2568 2568  icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2569 2569      const in6_addr_t *v6dst, uint32_t flowinfo)
2570 2570  {
2571 2571          int             error;
2572 2572  
2573 2573          ASSERT(MUTEX_HELD(&connp->conn_lock));
2574 2574          /*
2575 2575           * We clear lastdst to make sure we don't use the lastdst path
2576 2576           * next time sending since we might not have set v6dst yet.
2577 2577           */
2578 2578          connp->conn_v6lastdst = ipv6_all_zeros;
2579 2579  
2580 2580          error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
2581 2581          if (error != 0)
2582 2582                  return (error);
2583 2583  
2584 2584          /*
2585 2585           * Any routing header/option has been massaged. The checksum difference
2586 2586           * is stored in conn_sum.
2587 2587           */
2588 2588          return (0);
2589 2589  }
2590 2590  
           /*
            * Decide what to do with an inbound message while a socket is falling
            * back to TPI.
            *
            * While the conn is still marked non-STREAMS, mp is appended to the
            * icmp_fallback_queue_head/tail list (linked through b_next) and NULL is
            * returned.  Otherwise mp is returned for the caller to pass upstream.
            *
            * The caller must hold icmp_recv_lock (asserted).
            */
2591 2591  static mblk_t *
2592 2592  icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
2593 2593  {
2594 2594          ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
2595 2595          if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
2596 2596                  /*
2597 2597                   * fallback has started but messages have not been moved yet
2598 2598                   */
2599 2599                  if (icmp->icmp_fallback_queue_head == NULL) {
                                   /* Empty list: mp becomes both head and tail. */
2600 2600                          ASSERT(icmp->icmp_fallback_queue_tail == NULL);
2601 2601                          icmp->icmp_fallback_queue_head = mp;
2602 2602                          icmp->icmp_fallback_queue_tail = mp;
2603 2603                  } else {
                                   /* Append after the current tail. */
2604 2604                          ASSERT(icmp->icmp_fallback_queue_tail != NULL);
2605 2605                          icmp->icmp_fallback_queue_tail->b_next = mp;
2606 2606                          icmp->icmp_fallback_queue_tail = mp;
2607 2607                  }
2608 2608                  return (NULL);
2609 2609          } else {
2610 2610                  /*
2611 2611                   * Fallback completed, let the caller putnext() the mblk.
2612 2612                   */
2613 2613                  return (mp);
2614 2614          }
2615 2615  }
2616 2616  
2617 2617  /*
2618 2618   * Deliver data to ULP. In case we have a socket, and it's falling back to
2619 2619   * TPI, then we'll queue the mp for later processing.
            *
            * For a STREAMS conn the message is simply passed up with putnext().
            * For a non-STREAMS conn it is handed to the su_recv upcall; len must
            * equal msgdsize(mp) (asserted).  If the upcall fails with ENOSPC the conn
            * is marked flow controlled once that is confirmed under icmp_recv_lock;
            * any other failure is expected to be EOPNOTSUPP and is handled through
            * icmp_queue_fallback().
2620 2620   */
2621 2621  static void
2622 2622  icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
2623 2623  {
2624 2624          if (IPCL_IS_NONSTR(connp)) {
2625 2625                  icmp_t *icmp = connp->conn_icmp;
2626 2626                  int error;
2627 2627  
2628 2628                  ASSERT(len == msgdsize(mp));
2629 2629                  if ((*connp->conn_upcalls->su_recv)
2630 2630                      (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2631 2631                          mutex_enter(&icmp->icmp_recv_lock);
2632 2632                          if (error == ENOSPC) {
2633 2633                                  /*
2634 2634                                   * let's confirm while holding the lock
2635 2635                                   */
                                           /*
                                            * The second upcall passes a NULL message
                                            * and zero length.
                                            * NOTE(review): mp is neither freed nor
                                            * queued on this path; presumably the upper
                                            * layer kept it -- confirm against the
                                            * su_recv contract.
                                            */
2636 2636                                  if ((*connp->conn_upcalls->su_recv)
2637 2637                                      (connp->conn_upper_handle, NULL, 0, 0,
2638 2638                                      &error, NULL) < 0) {
2639 2639                                          ASSERT(error == ENOSPC);
2640 2640                                          if (error == ENOSPC) {
2641 2641                                                  connp->conn_flow_cntrld =
2642 2642                                                      B_TRUE;
2643 2643                                          }
2644 2644                                  }
2645 2645                                  mutex_exit(&icmp->icmp_recv_lock);
2646 2646                          } else {
2647 2647                                  ASSERT(error == EOPNOTSUPP);
                                           /*
                                            * Either queue mp for the fallback, or get
                                            * it back to send up the read queue once the
                                            * lock has been dropped.
                                            */
2648 2648                                  mp = icmp_queue_fallback(icmp, mp);
2649 2649                                  mutex_exit(&icmp->icmp_recv_lock);
2650 2650                                  if (mp != NULL)
2651 2651                                          putnext(connp->conn_rq, mp);
2652 2652                          }
2653 2653                  }
2654 2654                  ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
2655 2655          } else {
2656 2656                  putnext(connp->conn_rq, mp);
2657 2657          }
2658 2658  }
2659 2659  
2660 2660  /*
2661 2661   * This is the inbound data path.
2662 2662   * IP has already pulled up the IP headers and verified alignment
2663 2663   * etc.
2664 2664   */
2665 2665  /* ARGSUSED2 */
2666 2666  static void
2667 2667  icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2668 2668  {
2669 2669          conn_t                  *connp = (conn_t *)arg1;
2670 2670          struct T_unitdata_ind   *tudi;
2671 2671          uchar_t                 *rptr;          /* Pointer to IP header */
2672 2672          int                     ip_hdr_length;
2673 2673          int                     udi_size;       /* Size of T_unitdata_ind */
2674 2674          int                     pkt_len;
2675 2675          icmp_t                  *icmp;
2676 2676          ip_pkt_t                ipps;
2677 2677          ip6_t                   *ip6h;
2678 2678          mblk_t                  *mp1;
2679 2679          crb_t                   recv_ancillary;
2680 2680          icmp_stack_t            *is;
2681 2681          sin_t                   *sin;
2682 2682          sin6_t                  *sin6;
2683 2683          ipha_t                  *ipha;
2684 2684  
2685 2685          ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2686 2686  
2687 2687          icmp = connp->conn_icmp;
2688 2688          is = icmp->icmp_is;
2689 2689          rptr = mp->b_rptr;
2690 2690  
2691 2691          ASSERT(DB_TYPE(mp) == M_DATA);
2692 2692          ASSERT(OK_32PTR(rptr));
2693 2693          ASSERT(ira->ira_pktlen == msgdsize(mp));
2694 2694          pkt_len = ira->ira_pktlen;
2695 2695  
2696 2696          /*
2697 2697           * Get a snapshot of these and allow other threads to change
2698 2698           * them after that. We need the same recv_ancillary when determining
2699 2699           * the size as when adding the ancillary data items.
2700 2700           */
2701 2701          mutex_enter(&connp->conn_lock);
2702 2702          recv_ancillary = connp->conn_recv_ancillary;
2703 2703          mutex_exit(&connp->conn_lock);
2704 2704  
2705 2705          ip_hdr_length = ira->ira_ip_hdr_length;
2706 2706          ASSERT(MBLKL(mp) >= ip_hdr_length);     /* IP did a pullup */
2707 2707  
2708 2708          /* Initialize regardless of IP version */
2709 2709          ipps.ipp_fields = 0;
2710 2710  
2711 2711          /* Apply socket filter, if needed */
2712 2712          if (icmp->icmp_bpf_len != 0) {
2713 2713                  if (icmp_eval_filter(icmp, mp, ira)) {
2714 2714                          freemsg(mp);
2715 2715                          return;
2716 2716                  }
2717 2717          }
2718 2718  
2719 2719          if (ira->ira_flags & IRAF_IS_IPV4) {
2720 2720                  ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
2721 2721                  ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2722 2722                  ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
2723 2723  
2724 2724                  ipha = (ipha_t *)mp->b_rptr;
2725 2725                  if (recv_ancillary.crb_all != 0)
2726 2726                          (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);
2727 2727  
2728 2728                  /*
2729 2729                   * BSD for some reason adjusts ipha_length to exclude the
2730 2730                   * IP header length. We do the same.
2731 2731                   */
2732 2732                  if (is->is_bsd_compat) {
2733 2733                          ushort_t len;
2734 2734  
2735 2735                          len = ntohs(ipha->ipha_length);
2736 2736                          if (mp->b_datap->db_ref > 1) {
2737 2737                                  /*
2738 2738                                   * Allocate a new IP header so that we can
2739 2739                                   * modify ipha_length.
2740 2740                                   */
2741 2741                                  mblk_t  *mp1;
2742 2742  
2743 2743                                  mp1 = allocb(ip_hdr_length, BPRI_MED);
2744 2744                                  if (mp1 == NULL) {
2745 2745                                          freemsg(mp);
2746 2746                                          BUMP_MIB(&is->is_rawip_mib,
2747 2747                                              rawipInErrors);
2748 2748                                          return;
2749 2749                                  }
2750 2750                                  bcopy(rptr, mp1->b_rptr, ip_hdr_length);
2751 2751                                  mp->b_rptr = rptr + ip_hdr_length;
2752 2752                                  rptr = mp1->b_rptr;
2753 2753                                  ipha = (ipha_t *)rptr;
2754 2754                                  mp1->b_cont = mp;
2755 2755                                  mp1->b_wptr = rptr + ip_hdr_length;
2756 2756                                  mp = mp1;
2757 2757                          }
2758 2758                          len -= ip_hdr_length;
2759 2759                          ipha->ipha_length = htons(len);
2760 2760                  }
2761 2761  
2762 2762                  /*
2763 2763                   * For RAW sockets we do not pass ICMP/IPv4 packets to AF_INET6
2764 2764                   * sockets. This is ensured by icmp_bind and the IP fanout code.
2765 2765                   */
2766 2766                  ASSERT(connp->conn_family == AF_INET);
2767 2767  
2768 2768                  /*
2769 2769                   * This is the inbound data path.  Packets are passed upstream
2770 2770                   * as T_UNITDATA_IND messages with full IPv4 headers still
2771 2771                   * attached.
2772 2772                   */
2773 2773  
2774 2774                  /*
2775 2775                   * Normally only send up the source address.
2776 2776                   * If any ancillary data items are wanted we add those.
2777 2777                   */
2778 2778                  udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
2779 2779                  if (recv_ancillary.crb_all != 0) {
2780 2780                          udi_size += conn_recvancillary_size(connp,
2781 2781                              recv_ancillary, ira, mp, &ipps);
2782 2782                  }
2783 2783  
2784 2784                  /* Allocate a message block for the T_UNITDATA_IND structure. */
2785 2785                  mp1 = allocb(udi_size, BPRI_MED);
2786 2786                  if (mp1 == NULL) {
2787 2787                          freemsg(mp);
2788 2788                          BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2789 2789                          return;
2790 2790                  }
2791 2791                  mp1->b_cont = mp;
2792 2792                  tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2793 2793                  mp1->b_datap->db_type = M_PROTO;
2794 2794                  mp1->b_wptr = (uchar_t *)tudi + udi_size;
2795 2795                  tudi->PRIM_type = T_UNITDATA_IND;
2796 2796                  tudi->SRC_length = sizeof (sin_t);
2797 2797                  tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2798 2798                  sin = (sin_t *)&tudi[1];
2799 2799                  *sin = sin_null;
2800 2800                  sin->sin_family = AF_INET;
2801 2801                  sin->sin_addr.s_addr = ipha->ipha_src;
2802 2802                  *(uint32_t *)&sin->sin_zero[0] = 0;
2803 2803                  *(uint32_t *)&sin->sin_zero[4] = 0;
2804 2804                  tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
2805 2805                      sizeof (sin_t);
2806 2806                  udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
2807 2807                  tudi->OPT_length = udi_size;
2808 2808  
2809 2809                  /*
2810 2810                   * Add options if IP_RECVIF etc is set
2811 2811                   */
2812 2812                  if (udi_size != 0) {
2813 2813                          conn_recvancillary_add(connp, recv_ancillary, ira,
2814 2814                              &ipps, (uchar_t *)&sin[1], udi_size);
2815 2815                  }
2816 2816                  goto deliver;
2817 2817          }
2818 2818  
2819 2819          ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
2820 2820          /*
2821 2821           * IPv6 packets can only be received by applications
2822 2822           * that are prepared to receive IPv6 addresses.
2823 2823           * The IP fanout must ensure this.
2824 2824           */
2825 2825          ASSERT(connp->conn_family == AF_INET6);
2826 2826  
2827 2827          /*
2828 2828           * Handle IPv6 packets. We don't pass up the IP headers with the
2829 2829           * payload for IPv6.
2830 2830           */
2831 2831  
2832 2832          ip6h = (ip6_t *)rptr;
2833 2833          if (recv_ancillary.crb_all != 0) {
2834 2834                  /*
2835 2835                   * Call on ip_find_hdr_v6 which gets individual lengths of
2836 2836                   * extension headers (and pointers to them).
2837 2837                   */
2838 2838                  uint8_t         nexthdr;
2839 2839  
2840 2840                  /* We don't care about the length or nextheader. */
2841 2841                  (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);
2842 2842  
2843 2843                  /*
2844 2844                   * We do not pass up hop-by-hop options or any other
2845 2845                   * extension header as part of the packet. Applications
2846 2846                   * that want to see them have to specify IPV6_RECV* socket
2847 2847                   * options. And conn_recvancillary_size/add explicitly
2848 2848                   * drops the TX option from IPV6_HOPOPTS as it does for UDP.
2849 2849                   *
2850 2850                   * If we had multilevel ICMP sockets, then we'd want to
2851 2851                   * modify conn_recvancillary_size/add to
2852 2852                   * allow the user to see the label.
2853 2853                   */
2854 2854          }
2855 2855  
2856 2856          /*
2857 2857           * Check a filter for ICMPv6 types if needed.
2858 2858           * Verify raw checksums if needed.
2859 2859           */
2860 2860          mutex_enter(&connp->conn_lock);
2861 2861          if (icmp->icmp_filter != NULL) {
2862 2862                  int type;
2863 2863  
2864 2864                  /* Assumes that IP has done the pullupmsg */
2865 2865                  type = mp->b_rptr[ip_hdr_length];
2866 2866  
2867 2867                  ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
2868 2868                  if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
2869 2869                          mutex_exit(&connp->conn_lock);
2870 2870                          freemsg(mp);
2871 2871                          return;
2872 2872                  }
2873 2873          }
2874 2874          if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2875 2875                  /* Checksum */
2876 2876                  uint16_t        *up;
2877 2877                  uint32_t        sum;
2878 2878                  int             remlen;
2879 2879  
2880 2880                  up = (uint16_t *)&ip6h->ip6_src;
2881 2881  
2882 2882                  remlen = msgdsize(mp) - ip_hdr_length;
2883 2883                  sum = htons(connp->conn_proto + remlen)
2884 2884                      + up[0] + up[1] + up[2] + up[3]
2885 2885                      + up[4] + up[5] + up[6] + up[7]
2886 2886                      + up[8] + up[9] + up[10] + up[11]
2887 2887                      + up[12] + up[13] + up[14] + up[15];
2888 2888                  sum = (sum & 0xffff) + (sum >> 16);
2889 2889                  sum = IP_CSUM(mp, ip_hdr_length, sum);
2890 2890                  if (sum != 0) {
2891 2891                          /* IPv6 RAW checksum failed */
2892 2892                          ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
2893 2893                          mutex_exit(&connp->conn_lock);
2894 2894                          freemsg(mp);
2895 2895                          BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
2896 2896                          return;
2897 2897                  }
2898 2898          }
2899 2899          mutex_exit(&connp->conn_lock);
2900 2900  
2901 2901          udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2902 2902  
2903 2903          if (recv_ancillary.crb_all != 0) {
2904 2904                  udi_size += conn_recvancillary_size(connp,
2905 2905                      recv_ancillary, ira, mp, &ipps);
2906 2906          }
2907 2907  
2908 2908          mp1 = allocb(udi_size, BPRI_MED);
2909 2909          if (mp1 == NULL) {
2910 2910                  freemsg(mp);
2911 2911                  BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2912 2912                  return;
2913 2913          }
2914 2914          mp1->b_cont = mp;
2915 2915          mp1->b_datap->db_type = M_PROTO;
2916 2916          tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2917 2917          mp1->b_wptr = (uchar_t *)tudi + udi_size;
2918 2918          tudi->PRIM_type = T_UNITDATA_IND;
2919 2919          tudi->SRC_length = sizeof (sin6_t);
2920 2920          tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2921 2921          tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2922 2922          udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
2923 2923          tudi->OPT_length = udi_size;
2924 2924          sin6 = (sin6_t *)&tudi[1];
2925 2925          *sin6 = sin6_null;
2926 2926          sin6->sin6_port = 0;
2927 2927          sin6->sin6_family = AF_INET6;
2928 2928  
2929 2929          sin6->sin6_addr = ip6h->ip6_src;
2930 2930          /* No sin6_flowinfo per API */
2931 2931          sin6->sin6_flowinfo = 0;
2932 2932          /* For link-scope pass up scope id */
2933 2933          if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
2934 2934                  sin6->sin6_scope_id = ira->ira_ruifindex;
2935 2935          else
2936 2936                  sin6->sin6_scope_id = 0;
2937 2937          sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
2938 2938              IPCL_ZONEID(connp), is->is_netstack);
2939 2939  
2940 2940          if (udi_size != 0) {
2941 2941                  conn_recvancillary_add(connp, recv_ancillary, ira,
2942 2942                      &ipps, (uchar_t *)&sin6[1], udi_size);
2943 2943          }
2944 2944  
2945 2945          /* Skip all the IPv6 headers per API */
2946 2946          mp->b_rptr += ip_hdr_length;
2947 2947          pkt_len -= ip_hdr_length;
2948 2948  
2949 2949  deliver:
2950 2950          BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
2951 2951          icmp_ulp_recv(connp, mp1, pkt_len);
2952 2952  }
2953 2953  
2954 2954  /*
2955 2955   * Return the raw IP MIB counters (is_rawip_mib) in the buffer chained at
2956 2956   * mpctl->b_cont. We don't hold any lock and report information that can be
            * changing beneath us.
            *
            * The opthdr located right after the T_optmgmt_ack in mpctl is filled in
            * with level EXPER_RAWIP, name 0 and the size of the appended data, and
            * mpctl is then sent back with qreply().
            *
            * Returns the copy of the original control message made with copymsg().
            * If mpctl is NULL or has no b_cont, both mpctl and the copy are passed to
            * freemsg() and a null pointer is returned.
2957 2957   */
2958 2958  mblk_t *
2959 2959  icmp_snmp_get(queue_t *q, mblk_t *mpctl)
2960 2960  {
2961 2961          mblk_t                  *mpdata;
2962 2962          struct opthdr           *optp;
2963 2963          conn_t                  *connp = Q_TO_CONN(q);
2964 2964          icmp_stack_t            *is = connp->conn_netstack->netstack_icmp;
2965 2965          mblk_t                  *mp2ctl;
2966 2966  
2967 2967          /*
2968 2968           * make a copy of the original message
                    *
                    * NOTE(review): the copy is taken before mpctl is checked for NULL
                    * below, so this relies on copymsg() (and freemsg()) tolerating a
                    * NULL argument -- confirm.
2969 2969           */
2970 2970          mp2ctl = copymsg(mpctl);
2971 2971  
2972 2972          if (mpctl == NULL ||
2973 2973              (mpdata = mpctl->b_cont) == NULL) {
2974 2974                  freemsg(mpctl);
2975 2975                  freemsg(mp2ctl);
2976 2976                  return (0);
2977 2977          }
2978 2978  
2979 2979          /* fixed length structure for IPv4 and IPv6 counters */
2980 2980          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
2981 2981          optp->level = EXPER_RAWIP;
2982 2982          optp->name = 0;
                   /* The result of snmp_append_data() is deliberately ignored. */
2983 2983          (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
2984 2984              sizeof (is->is_rawip_mib));
                   /* Report however much data actually ended up in mpdata. */
2985 2985          optp->len = msgdsize(mpdata);
2986 2986          qreply(q, mpctl);
2987 2987  
2988 2988          return (mp2ctl);
2989 2989  }
2990 2990  
2991 2991  /*
2992 2992   * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
            * Every EXPER_RAWIP request is currently treated as invalid (nothing can
            * be set); q, name, ptr and len are unused.
2993 2993   * TODO:  If this ever actually tries to set anything, it needs
2994 2994   * to do the appropriate locking.
2995 2995   */
2996 2996  /* ARGSUSED */
2997 2997  int
2998 2998  icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
2999 2999      uchar_t *ptr, int len)
3000 3000  {
3001 3001          switch (level) {
3002 3002          case EXPER_RAWIP:
                           /* No raw IP MIB variable is settable. */
3003 3003                  return (0);
3004 3004          default:
                           /* Not ours; report it as acceptable. */
3005 3005                  return (1);
3006 3006          }
3007 3007  }
3008 3008  
3009 3009  /*
3010 3010   * This routine creates a T_UDERROR_IND message and passes it upstream.
3011 3011   * The address and options are copied from the T_UNITDATA_REQ message
3012 3012   * passed in mp.  This message is freed.
            *
            * If mp is too short to hold a T_unitdata_req, or its destination or
            * option offset/length do not lie within the first mblk, no indication
            * is generated and mp is simply freed. The same happens if
            * mi_tpi_uderror_ind() returns NULL.
3013 3013   */
3014 3014  static void
3015 3015  icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
3016 3016  {
3017 3017          struct T_unitdata_req *tudr;
3018 3018          mblk_t  *mp1;
3019 3019          uchar_t *destaddr;
3020 3020          t_scalar_t destlen;
3021 3021          uchar_t *optaddr;
3022 3022          t_scalar_t optlen;
3023 3023  
                   /* The fixed part of the request must be present in this mblk. */
3024 3024          if ((mp->b_wptr < mp->b_rptr) ||
3025 3025              (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
3026 3026                  goto done;
3027 3027          }
3028 3028          tudr = (struct T_unitdata_req *)mp->b_rptr;
                   /*
                    * The destination address must start inside [b_rptr, b_wptr) and
                    * its end must neither fall before b_rptr nor run past b_wptr.
                    */
3029 3029          destaddr = mp->b_rptr + tudr->DEST_offset;
3030 3030          if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
3031 3031              destaddr + tudr->DEST_length < mp->b_rptr ||
3032 3032              destaddr + tudr->DEST_length > mp->b_wptr) {
3033 3033                  goto done;
3034 3034          }
                   /* Same containment check for the options. */
3035 3035          optaddr = mp->b_rptr + tudr->OPT_offset;
3036 3036          if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
3037 3037              optaddr + tudr->OPT_length < mp->b_rptr ||
3038 3038              optaddr + tudr->OPT_length > mp->b_wptr) {
3039 3039                  goto done;
3040 3040          }
3041 3041          destlen = tudr->DEST_length;
3042 3042          optlen = tudr->OPT_length;
3043 3043  
3044 3044          mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
3045 3045              (char *)optaddr, optlen, err);
3046 3046          if (mp1 != NULL)
3047 3047                  qreply(q, mp1);
3048 3048  
                   /* The request is consumed on every path. */
3049 3049  done:
3050 3050          freemsg(mp);
3051 3051  }
3052 3052  
           /*
            * Unbind a raw IP conn.
            *
            * With conn_lock held: fails with -TOUTSTATE if icmp_state is already
            * TS_UNBND; otherwise zeroes the source, bound, local and foreign
            * addresses, the last destination and both ports, clears conn_mcbc_bind,
            * sets icmp_state to TS_UNBND and rebuilds the header template (its result
            * is ignored). ip_unbind() is called after the lock has been dropped.
            *
            * Returns 0 on success. The failure value is negative; icmp_tpi_unbind()
            * negates a negative value before handing it to icmp_err_ack().
            */
3053 3053  static int
3054 3054  rawip_do_unbind(conn_t *connp)
3055 3055  {
3056 3056          icmp_t  *icmp = connp->conn_icmp;
3057 3057  
3058 3058          mutex_enter(&connp->conn_lock);
3059 3059          /* If a bind has not been done, we can't unbind. */
3060 3060          if (icmp->icmp_state == TS_UNBND) {
3061 3061                  mutex_exit(&connp->conn_lock);
3062 3062                  return (-TOUTSTATE);
3063 3063          }
3064 3064          connp->conn_saddr_v6 = ipv6_all_zeros;
3065 3065          connp->conn_bound_addr_v6 = ipv6_all_zeros;
3066 3066          connp->conn_laddr_v6 = ipv6_all_zeros;
3067 3067          connp->conn_mcbc_bind = B_FALSE;
3068 3068          connp->conn_lport = 0;
3069 3069          connp->conn_fport = 0;
3070 3070          /* In case we were also connected */
3071 3071          connp->conn_faddr_v6 = ipv6_all_zeros;
3072 3072          connp->conn_v6lastdst = ipv6_all_zeros;
3073 3073  
3074 3074          icmp->icmp_state = TS_UNBND;
3075 3075  
                   /* Regenerate the template from the now all-zero addresses. */
3076 3076          (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
3077 3077              &connp->conn_faddr_v6, connp->conn_flowinfo);
3078 3078          mutex_exit(&connp->conn_lock);
3079 3079  
3080 3080          ip_unbind(connp);
3081 3081          return (0);
3082 3082  }
3083 3083  
3084 3084  /*
3085 3085   * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
3086 3086   * The unbind itself is done by rawip_do_unbind(); the request mblk is then
            * turned into the reply: an error ack if the unbind failed, otherwise a
            * T_OK_ACK that is sent back with qreply().
3087 3087   */
3088 3088  static void
3089 3089  icmp_tpi_unbind(queue_t *q, mblk_t *mp)
3090 3090  {
3091 3091          conn_t  *connp = Q_TO_CONN(q);
3092 3092          int     error;
3093 3093  
3094 3094          ASSERT(mp->b_cont == NULL);
3095 3095          error = rawip_do_unbind(connp);
3096 3096          if (error) {
                           /*
                            * A negative value is negated and passed as the third
                            * argument of icmp_err_ack(); a positive value is passed
                            * as the fourth. (Presumably TLI error vs. errno --
                            * confirm against icmp_err_ack().)
                            */
3097 3097                  if (error < 0) {
3098 3098                          icmp_err_ack(q, mp, -error, 0);
3099 3099                  } else {
3100 3100                          icmp_err_ack(q, mp, 0, error);
3101 3101                  }
3102 3102                  return;
3103 3103          }
3104 3104  
3105 3105          /*
3106 3106           * Convert mp into a T_OK_ACK
3107 3107           */
3108 3108  
3109 3109          mp = mi_tpi_ok_ack_alloc(mp);
3110 3110  
3111 3111          /*
3112 3112           * should not happen in practice... T_OK_ACK is smaller than the
3113 3113           * original message.
                    *
                    * NOTE(review): a NULL return is only caught by the ASSERT; mp is
                    * dereferenced and passed to qreply() unconditionally below.
3114 3114           */
3115 3115          ASSERT(mp != NULL);
3116 3116          ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
3117 3117          qreply(q, mp);
3118 3118  }
3119 3119  
3120 3120  /*
3121 3121   * Process IPv4 packets that already include an IP header.
3122 3122   * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
3123 3123   * IPPROTO_IGMP).
3124 3124   * In this case we ignore the address and any options in the T_UNITDATA_REQ.
3125 3125   *
3126 3126   * The packet is assumed to have a base (20 byte) IP header followed
3127 3127   * by the upper-layer protocol. We include any IP_OPTIONS including a
3128 3128   * CIPSO label but otherwise preserve the base IP header.
            *
            * The application's TOS, ident, DF flag, TTL, protocol, source and
            * destination are carried over into the header that is finally sent; the
            * version/header-length field is overwritten and all fragment bits other
            * than DF are cleared.
            *
            * cr and pid are installed in the exclusive ixa for the duration of the
            * call and the conn's own cred/pid are put back before returning.
            *
            * mp is consumed on every path: it is either freed here or handed on to
            * the callee that takes it (icmp_prepend_hdr, ip_output_attach_policy,
            * conn_ip_output). Returns 0 or an errno value.
3129 3129   */
3130 3130  static int
3131 3131  icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3132 3132  {
3133 3133          icmp_t          *icmp = connp->conn_icmp;
3134 3134          icmp_stack_t    *is = icmp->icmp_is;
3135 3135          ipha_t          iphas;
3136 3136          ipha_t          *ipha;
3137 3137          int             ip_hdr_length;
3138 3138          int             tp_hdr_len;
3139 3139          ip_xmit_attr_t  *ixa;
3140 3140          ip_pkt_t        *ipp;
3141 3141          in6_addr_t      v6src;
3142 3142          in6_addr_t      v6dst;
3143 3143          in6_addr_t      v6nexthop;
3144 3144          int             error;
3145 3145          boolean_t       do_ipsec;
3146 3146  
3147 3147          /*
3148 3148           * We need an exclusive copy of conn_ixa since the included IP
3149 3149           * header could have any destination.
3150 3150           * That copy has no pointers hence we
3151 3151           * need to set them up once we've parsed the ancillary data.
3152 3152           */
3153 3153          ixa = conn_get_ixa_exclusive(connp);
3154 3154          if (ixa == NULL) {
3155 3155                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3156 3156                  freemsg(mp);
3157 3157                  return (ENOMEM);
3158 3158          }
3159 3159          ASSERT(cr != NULL);
3160 3160          /*
3161 3161           * Caller has a reference on cr; from db_credp or because we
3162 3162           * are running in process context.
3163 3163           */
3164 3164          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3165 3165          ixa->ixa_cred = cr;
3166 3166          ixa->ixa_cpid = pid;
3167 3167          if (is_system_labeled()) {
3168 3168                  /* We need to restart with a label based on the cred */
3169 3169                  ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3170 3170          }
3171 3171  
3172 3172          /* In case previous destination was multicast or multirt */
3173 3173          ip_attr_newdst(ixa);
3174 3174  
3175 3175          /* Get a copy of conn_xmit_ipp since the TX label might change it */
3176 3176          ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3177 3177          if (ipp == NULL) {
                           /*
                            * Cannot use the common exit at "done" since there is no
                            * ipp to free; undo the cred/pid change by hand.
                            */
3178 3178                  ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3179 3179                  ixa->ixa_cred = connp->conn_cred;       /* Restore */
3180 3180                  ixa->ixa_cpid = connp->conn_cpid;
3181 3181                  ixa_refrele(ixa);
3182 3182                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3183 3183                  freemsg(mp);
3184 3184                  return (ENOMEM);
3185 3185          }
                   /* conn_xmit_ipp is read under conn_lock. */
3186 3186          mutex_enter(&connp->conn_lock);
3187 3187          error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3188 3188          mutex_exit(&connp->conn_lock);
3189 3189          if (error != 0) {
3190 3190                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3191 3191                  freemsg(mp);
3192 3192                  goto done;
3193 3193          }
3194 3194  
3195 3195          /* Sanity check length of packet */
3196 3196          ipha = (ipha_t *)mp->b_rptr;
3197 3197  
3198 3198          ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
3199 3199          if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
3200 3200                  if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
                                   /*
                                    * NOTE(review): error is still 0 here, so the
                                    * dropped packet is reported as success to the
                                    * caller -- confirm this is intended.
                                    */
3201 3201                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3202 3202                          freemsg(mp);
3203 3203                          goto done;
3204 3204                  }
                           /* Re-read the header pointer after the pullup. */
3205 3205                  ipha = (ipha_t *)mp->b_rptr;
3206 3206          }
                   /*
                    * Overwrite whatever version/header length the application
                    * supplied with those of a simple (20 byte) IPv4 header.
                    */
3207 3207          ipha->ipha_version_and_hdr_length =
3208 3208              (IP_VERSION<<4) | (ip_hdr_length>>2);
3209 3209  
3210 3210          /*
3211 3211           * We set IXAF_DONTFRAG if the application set DF which makes
3212 3212           * IP not fragment.
                    * Everything in the fragment field except the DF bit is cleared
                    * first.
3213 3213           */
3214 3214          ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
3215 3215          if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
3216 3216                  ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3217 3217          else
3218 3218                  ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3219 3219  
3220 3220          /* Even for multicast and broadcast we honor the apps ttl */
3221 3221          ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
3222 3222  
3223 3223          /*
3224 3224           * No source verification for non-local addresses
3225 3225           */
3226 3226          if (ipha->ipha_src != INADDR_ANY &&
3227 3227              ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
3228 3228              is->is_netstack->netstack_ip, B_FALSE)
3229 3229              != IPVL_UNICAST_UP) {
3230 3230                  ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3231 3231          }
3232 3232  
                   /* An unspecified destination is sent to the loopback address. */
3233 3233          if (ipha->ipha_dst == INADDR_ANY)
3234 3234                  ipha->ipha_dst = htonl(INADDR_LOOPBACK);
3235 3235  
3236 3236          IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
3237 3237          IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
3238 3238  
3239 3239          /* Defer IPsec if it might need to look at ICMP type/code */
3240 3240          do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
3241 3241          ixa->ixa_flags |= IXAF_IS_IPV4;
3242 3242  
3243 3243          ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3244 3244          error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
3245 3245              connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3246 3246              (do_ipsec ? IPDF_IPSEC : 0));
3247 3247          switch (error) {
3248 3248          case 0:
3249 3249                  break;
3250 3250          case EADDRNOTAVAIL:
3251 3251                  /*
3252 3252                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3253 3253                   * Don't have the application see that errno
3254 3254                   */
3255 3255                  error = ENETUNREACH;
3256 3256                  goto failed;
3257 3257          case ENETDOWN:
3258 3258                  /*
3259 3259                   * Have !ipif_addr_ready address; drop packet silently
3260 3260                   * until we can get applications to not send until we
3261 3261                   * are ready.
3262 3262                   */
3263 3263                  error = 0;
3264 3264                  goto failed;
3265 3265          case EHOSTUNREACH:
3266 3266          case ENETUNREACH:
3267 3267                  if (ixa->ixa_ire != NULL) {
3268 3268                          /*
3269 3269                           * Let conn_ip_output/ire_send_noroute return
3270 3270                           * the error and send any local ICMP error.
3271 3271                           */
3272 3272                          error = 0;
3273 3273                          break;
3274 3274                  }
3275 3275                  /* FALLTHRU */
3276 3276          default:
                   /* Shared drop path; error has been set by whoever jumped here. */
3277 3277          failed:
3278 3278                  freemsg(mp);
3279 3279                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3280 3280                  goto done;
3281 3281          }
                   /*
                    * The application left the source unspecified: take it from
                    * v6src (presumably filled in by ip_attr_connect() above, which
                    * was given &v6src -- confirm).
                    */
3282 3282          if (ipha->ipha_src == INADDR_ANY)
3283 3283                  IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
3284 3284  
3285 3285          /*
3286 3286           * We might be going to a different destination than last time,
3287 3287           * thus check that TX allows the communication and compute any
3288 3288           * needed label.
3289 3289           *
3290 3290           * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3291 3291           * don't have to worry about concurrent threads.
3292 3292           */
3293 3293          if (is_system_labeled()) {
3294 3294                  /*
3295 3295                   * Check whether Trusted Solaris policy allows communication
3296 3296                   * with this host, and pretend that the destination is
3297 3297                   * unreachable if not.
3298 3298                   * Compute any needed label and place it in ipp_label_v4/v6.
3299 3299                   *
3300 3300                   * Later conn_build_hdr_template/conn_prepend_hdr takes
3301 3301                   * ipp_label_v4/v6 to form the packet.
3302 3302                   *
3303 3303                   * Tsol note: We have ipp structure local to this thread so
3304 3304                   * no locking is needed.
3305 3305                   */
3306 3306                  error = conn_update_label(connp, ixa, &v6dst, ipp);
3307 3307                  if (error != 0) {
3308 3308                          freemsg(mp);
3309 3309                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3310 3310                          goto done;
3311 3311                  }
3312 3312          }
3313 3313  
3314 3314          /*
3315 3315           * Save away a copy of the IPv4 header the application passed down
3316 3316           * and then prepend an IPv4 header complete with any IP options
3317 3317           * including label.
3318 3318           * We need a struct copy since icmp_prepend_hdr will reuse the available
3319 3319           * space in the mblk.
3320 3320           */
3321 3321          iphas = *ipha;
3322 3322          mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
3323 3323  
3324 3324          mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
3325 3325          if (mp == NULL) {
                           /* Nothing left to free here; error was set by the callee. */
3326 3326                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3327 3327                  ASSERT(error != 0);
3328 3328                  goto done;
3329 3329          }
3330 3330          if (ixa->ixa_pktlen > IP_MAXPACKET) {
3331 3331                  error = EMSGSIZE;
3332 3332                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3333 3333                  freemsg(mp);
3334 3334                  goto done;
3335 3335          }
3336 3336          /* Restore key parts of the header that the application passed down */
3337 3337          ipha = (ipha_t *)mp->b_rptr;
3338 3338          ipha->ipha_type_of_service = iphas.ipha_type_of_service;
3339 3339          ipha->ipha_ident = iphas.ipha_ident;
3340 3340          ipha->ipha_fragment_offset_and_flags =
3341 3341              iphas.ipha_fragment_offset_and_flags;
3342 3342          ipha->ipha_ttl = iphas.ipha_ttl;
3343 3343          ipha->ipha_protocol = iphas.ipha_protocol;
3344 3344          ipha->ipha_src = iphas.ipha_src;
3345 3345          ipha->ipha_dst = iphas.ipha_dst;
3346 3346  
3347 3347          ixa->ixa_protocol = ipha->ipha_protocol;
3348 3348  
3349 3349          /*
3350 3350           * Make sure that the IP header plus any transport header that is
3351 3351           * checksumed by ip_output is in the first mblk. (ip_output assumes
3352 3352           * that at least the checksum field is in the first mblk.)
3353 3353           */
3354 3354          switch (ipha->ipha_protocol) {
3355 3355          case IPPROTO_UDP:
3356 3356                  tp_hdr_len = 8;
3357 3357                  break;
3358 3358          case IPPROTO_TCP:
3359 3359                  tp_hdr_len = 20;
3360 3360                  break;
3361 3361          default:
3362 3362                  tp_hdr_len = 0;
3363 3363                  break;
3364 3364          }
3365 3365          ip_hdr_length = IPH_HDR_LENGTH(ipha);
3366 3366          if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
3367 3367                  if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
3368 3368                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
                                   /*
                                    * A single-mblk message that is too short is the
                                    * application's error; otherwise blame memory.
                                    */
3369 3369                          if (mp->b_cont == NULL)
3370 3370                                  error = EINVAL;
3371 3371                          else
3372 3372                                  error = ENOMEM;
3373 3373                          freemsg(mp);
3374 3374                          goto done;
3375 3375                  }
                           /*
                            * NOTE(review): unlike the first pullup in this function,
                            * ipha is not re-read from mp->b_rptr here although it
                            * is passed to ip_output_attach_policy() below -- confirm
                            * it cannot be stale on that path.
                            */
3376 3376          }
3377 3377  
3378 3378          if (!do_ipsec) {
3379 3379                  /* Policy might differ for different ICMP type/code */
3380 3380                  if (ixa->ixa_ipsec_policy != NULL) {
3381 3381                          IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3382 3382                          ixa->ixa_ipsec_policy = NULL;
3383 3383                          ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3384 3384                  }
3385 3385                  mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
3386 3386                  if (mp == NULL) {
3387 3387                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3388 3388                          error = EHOSTUNREACH;   /* IPsec policy failure */
3389 3389                          goto done;
3390 3390                  }
3391 3391          }
3392 3392  
3393 3393          /* We're done.  Pass the packet to ip. */
3394 3394          BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3395 3395  
3396 3396          error = conn_ip_output(mp, ixa);
3397 3397          /* No rawipOutErrors if an error since IP increases its error counter */
3398 3398          switch (error) {
3399 3399          case 0:
3400 3400                  break;
3401 3401          case EWOULDBLOCK:
                           /* Not reported to the caller; queue for drain instead. */
3402 3402                  (void) ixa_check_drain_insert(connp, ixa);
3403 3403                  error = 0;
3404 3404                  break;
3405 3405          case EADDRNOTAVAIL:
3406 3406                  /*
3407 3407                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3408 3408                   * Don't have the application see that errno
3409 3409                   */
3410 3410                  error = ENETUNREACH;
3411 3411                  break;
3412 3412          }
                   /*
                    * Common exit: put the conn's own cred/pid back into the ixa, drop
                    * our hold on it and release the private copy of the ipp.
                    */
3413 3413  done:
3414 3414          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3415 3415          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3416 3416          ixa->ixa_cpid = connp->conn_cpid;
3417 3417          ixa_refrele(ixa);
3418 3418          ip_pkt_free(ipp);
3419 3419          kmem_free(ipp, sizeof (*ipp));
3420 3420          return (error);
3421 3421  }
3422 3422  
3423 3423  static mblk_t *
3424 3424  icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
3425 3425  {
3426 3426          ipha_t  *ipha = NULL;
3427 3427          ip6_t   *ip6h = NULL;
3428 3428  
3429 3429          if (ixa->ixa_flags & IXAF_IS_IPV4)
3430 3430                  ipha = (ipha_t *)mp->b_rptr;
3431 3431          else
3432 3432                  ip6h = (ip6_t *)mp->b_rptr;
3433 3433  
3434 3434          if (ixa->ixa_ipsec_policy != NULL) {
3435 3435                  IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3436 3436                  ixa->ixa_ipsec_policy = NULL;
3437 3437                  ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3438 3438          }
3439 3439          return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
3440 3440  }
3441 3441  
3442 3442  /*
3443 3443   * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
3444 3444   * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
3445 3445   * the TPI options, otherwise we take them from msg_control.
3446 3446   * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
3447 3447   * Always consumes mp; never consumes tudr_mp.
3448 3448   */
3449 3449  static int
3450 3450  icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
3451 3451      mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
3452 3452  {
3453 3453          icmp_t          *icmp = connp->conn_icmp;
3454 3454          icmp_stack_t    *is = icmp->icmp_is;
3455 3455          int             error;
3456 3456          ip_xmit_attr_t  *ixa;
3457 3457          ip_pkt_t        *ipp;
3458 3458          in6_addr_t      v6src;
3459 3459          in6_addr_t      v6dst;
3460 3460          in6_addr_t      v6nexthop;
3461 3461          in_port_t       dstport;
3462 3462          uint32_t        flowinfo;
3463 3463          int             is_absreq_failure = 0;
3464 3464          conn_opt_arg_t  coas, *coa;
3465 3465  
3466 3466          ASSERT(tudr_mp != NULL || msg != NULL);
3467 3467  
3468 3468          /*
3469 3469           * Get ixa before checking state to handle a disconnect race.
3470 3470           *
3471 3471           * We need an exclusive copy of conn_ixa since the ancillary data
3472 3472           * options might modify it. That copy has no pointers hence we
3473 3473           * need to set them up once we've parsed the ancillary data.
3474 3474           */
3475 3475          ixa = conn_get_ixa_exclusive(connp);
3476 3476          if (ixa == NULL) {
3477 3477                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3478 3478                  freemsg(mp);
3479 3479                  return (ENOMEM);
3480 3480          }
3481 3481          ASSERT(cr != NULL);
3482 3482          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3483 3483          ixa->ixa_cred = cr;
3484 3484          ixa->ixa_cpid = pid;
3485 3485          if (is_system_labeled()) {
3486 3486                  /* We need to restart with a label based on the cred */
3487 3487                  ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3488 3488          }
3489 3489  
3490 3490          /* In case previous destination was multicast or multirt */
3491 3491          ip_attr_newdst(ixa);
3492 3492  
3493 3493          /* Get a copy of conn_xmit_ipp since the options might change it */
3494 3494          ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3495 3495          if (ipp == NULL) {
3496 3496                  ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3497 3497                  ixa->ixa_cred = connp->conn_cred;       /* Restore */
3498 3498                  ixa->ixa_cpid = connp->conn_cpid;
3499 3499                  ixa_refrele(ixa);
3500 3500                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3501 3501                  freemsg(mp);
3502 3502                  return (ENOMEM);
3503 3503          }
3504 3504          mutex_enter(&connp->conn_lock);
3505 3505          error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3506 3506          mutex_exit(&connp->conn_lock);
3507 3507          if (error != 0) {
3508 3508                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3509 3509                  freemsg(mp);
3510 3510                  goto done;
3511 3511          }
3512 3512  
3513 3513          /*
3514 3514           * Parse the options and update ixa and ipp as a result.
3515 3515           */
3516 3516  
3517 3517          coa = &coas;
3518 3518          coa->coa_connp = connp;
3519 3519          coa->coa_ixa = ixa;
3520 3520          coa->coa_ipp = ipp;
3521 3521          coa->coa_ancillary = B_TRUE;
3522 3522          coa->coa_changed = 0;
3523 3523  
3524 3524          if (msg != NULL) {
3525 3525                  error = process_auxiliary_options(connp, msg->msg_control,
3526 3526                      msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
3527 3527          } else {
3528 3528                  struct T_unitdata_req *tudr;
3529 3529  
3530 3530                  tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
3531 3531                  ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
3532 3532                  error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
3533 3533                      &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
3534 3534                      coa, &is_absreq_failure);
3535 3535          }
3536 3536          if (error != 0) {
3537 3537                  /*
3538 3538                   * Note: No special action needed in this
3539 3539                   * module for "is_absreq_failure"
3540 3540                   */
3541 3541                  freemsg(mp);
3542 3542                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3543 3543                  goto done;
3544 3544          }
3545 3545          ASSERT(is_absreq_failure == 0);
3546 3546  
3547 3547          mutex_enter(&connp->conn_lock);
3548 3548          /*
3549 3549           * If laddr is unspecified then we look at sin6_src_id.
3550 3550           * We will give precedence to a source address set with IPV6_PKTINFO
3551 3551           * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
3552 3552           * want ip_attr_connect to select a source (since it can fail) when
3553 3553           * IPV6_PKTINFO is specified.
3554 3554           * If this doesn't result in a source address then we get a source
3555 3555           * from ip_attr_connect() below.
3556 3556           */
3557 3557          v6src = connp->conn_saddr_v6;
3558 3558          if (sin != NULL) {
3559 3559                  IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
3560 3560                  dstport = sin->sin_port;
3561 3561                  flowinfo = 0;
3562 3562                  ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3563 3563                  ixa->ixa_flags |= IXAF_IS_IPV4;
3564 3564          } else if (sin6 != NULL) {
3565 3565                  boolean_t v4mapped;
3566 3566                  uint_t srcid;
3567 3567  
3568 3568                  v6dst = sin6->sin6_addr;
3569 3569                  dstport = sin6->sin6_port;
3570 3570                  flowinfo = sin6->sin6_flowinfo;
3571 3571                  srcid = sin6->__sin6_src_id;
3572 3572                  if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
3573 3573                          ixa->ixa_scopeid = sin6->sin6_scope_id;
3574 3574                          ixa->ixa_flags |= IXAF_SCOPEID_SET;
3575 3575                  } else {
3576 3576                          ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3577 3577                  }
3578 3578                  v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
3579 3579                  if (v4mapped)
3580 3580                          ixa->ixa_flags |= IXAF_IS_IPV4;
3581 3581                  else
3582 3582                          ixa->ixa_flags &= ~IXAF_IS_IPV4;
3583 3583                  if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
3584 3584                          if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
3585 3585                              v4mapped, connp->conn_netstack)) {
3586 3586                                  /* Mismatched v4mapped/v6 specified by srcid. */
3587 3587                                  mutex_exit(&connp->conn_lock);
3588 3588                                  error = EADDRNOTAVAIL;
3589 3589                                  goto failed;    /* Does freemsg() and mib. */
3590 3590                          }
3591 3591                  }
3592 3592          } else {
3593 3593                  /* Connected case */
3594 3594                  v6dst = connp->conn_faddr_v6;
3595 3595                  flowinfo = connp->conn_flowinfo;
3596 3596          }
3597 3597          mutex_exit(&connp->conn_lock);
3598 3598          /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
3599 3599          if (ipp->ipp_fields & IPPF_ADDR) {
3600 3600                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
3601 3601                          if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3602 3602                                  v6src = ipp->ipp_addr;
3603 3603                  } else {
3604 3604                          if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3605 3605                                  v6src = ipp->ipp_addr;
3606 3606                  }
3607 3607          }
3608 3608          /*
3609 3609           * Allow source not assigned to the system
3610 3610           * only if it is not a local addresses
3611 3611           */
3612 3612          if (!V6_OR_V4_INADDR_ANY(v6src)) {
3613 3613                  ip_laddr_t laddr_type;
3614 3614  
3615 3615                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
3616 3616                          ipaddr_t v4src;
3617 3617  
3618 3618                          IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
3619 3619                          laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid,
3620 3620                              is->is_netstack->netstack_ip, B_FALSE);
3621 3621                  } else {
3622 3622                          laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid,
3623 3623                              is->is_netstack->netstack_ip, B_FALSE, B_FALSE);
3624 3624                  }
3625 3625                  if (laddr_type != IPVL_UNICAST_UP)
3626 3626                          ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3627 3627          }
3628 3628  
3629 3629          ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3630 3630          error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
3631 3631              &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
3632 3632  
3633 3633          switch (error) {
3634 3634          case 0:
3635 3635                  break;
3636 3636          case EADDRNOTAVAIL:
3637 3637                  /*
3638 3638                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3639 3639                   * Don't have the application see that errno
3640 3640                   */
3641 3641                  error = ENETUNREACH;
3642 3642                  goto failed;
3643 3643          case ENETDOWN:
3644 3644                  /*
3645 3645                   * Have !ipif_addr_ready address; drop packet silently
3646 3646                   * until we can get applications to not send until we
3647 3647                   * are ready.
3648 3648                   */
3649 3649                  error = 0;
3650 3650                  goto failed;
3651 3651          case EHOSTUNREACH:
3652 3652          case ENETUNREACH:
3653 3653                  if (ixa->ixa_ire != NULL) {
3654 3654                          /*
3655 3655                           * Let conn_ip_output/ire_send_noroute return
3656 3656                           * the error and send any local ICMP error.
3657 3657                           */
3658 3658                          error = 0;
3659 3659                          break;
3660 3660                  }
3661 3661                  /* FALLTHRU */
3662 3662          default:
3663 3663          failed:
3664 3664                  freemsg(mp);
3665 3665                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3666 3666                  goto done;
3667 3667          }
3668 3668  
3669 3669          /*
3670 3670           * We might be going to a different destination than last time,
3671 3671           * thus check that TX allows the communication and compute any
3672 3672           * needed label.
3673 3673           *
3674 3674           * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3675 3675           * don't have to worry about concurrent threads.
3676 3676           */
3677 3677          if (is_system_labeled()) {
3678 3678                  /*
3679 3679                   * Check whether Trusted Solaris policy allows communication
3680 3680                   * with this host, and pretend that the destination is
3681 3681                   * unreachable if not.
3682 3682                   * Compute any needed label and place it in ipp_label_v4/v6.
3683 3683                   *
3684 3684                   * Later conn_build_hdr_template/conn_prepend_hdr takes
3685 3685                   * ipp_label_v4/v6 to form the packet.
3686 3686                   *
3687 3687                   * Tsol note: We have ipp structure local to this thread so
3688 3688                   * no locking is needed.
3689 3689                   */
3690 3690                  error = conn_update_label(connp, ixa, &v6dst, ipp);
3691 3691                  if (error != 0) {
3692 3692                          freemsg(mp);
3693 3693                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3694 3694                          goto done;
3695 3695                  }
3696 3696          }
3697 3697          mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
3698 3698              &error);
3699 3699          if (mp == NULL) {
3700 3700                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3701 3701                  ASSERT(error != 0);
3702 3702                  goto done;
3703 3703          }
3704 3704          if (ixa->ixa_pktlen > IP_MAXPACKET) {
3705 3705                  error = EMSGSIZE;
3706 3706                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3707 3707                  freemsg(mp);
3708 3708                  goto done;
3709 3709          }
3710 3710  
3711 3711          /* Policy might differ for different ICMP type/code */
3712 3712          mp = icmp_output_attach_policy(mp, connp, ixa);
3713 3713          if (mp == NULL) {
3714 3714                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3715 3715                  error = EHOSTUNREACH;   /* IPsec policy failure */
3716 3716                  goto done;
3717 3717          }
3718 3718  
3719 3719          /* We're done.  Pass the packet to ip. */
3720 3720          BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3721 3721  
3722 3722          error = conn_ip_output(mp, ixa);
3723 3723          if (!connp->conn_unspec_src)
3724 3724                  ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
3725 3725          /* No rawipOutErrors if an error since IP increases its error counter */
3726 3726          switch (error) {
3727 3727          case 0:
3728 3728                  break;
3729 3729          case EWOULDBLOCK:
3730 3730                  (void) ixa_check_drain_insert(connp, ixa);
3731 3731                  error = 0;
3732 3732                  break;
3733 3733          case EADDRNOTAVAIL:
3734 3734                  /*
3735 3735                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3736 3736                   * Don't have the application see that errno
3737 3737                   */
3738 3738                  error = ENETUNREACH;
3739 3739                  /* FALLTHRU */
3740 3740          default:
3741 3741                  mutex_enter(&connp->conn_lock);
3742 3742                  /*
3743 3743                   * Clear the source and v6lastdst so we call ip_attr_connect
3744 3744                   * for the next packet and try to pick a better source.
3745 3745                   */
3746 3746                  if (connp->conn_mcbc_bind)
3747 3747                          connp->conn_saddr_v6 = ipv6_all_zeros;
3748 3748                  else
3749 3749                          connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3750 3750                  connp->conn_v6lastdst = ipv6_all_zeros;
3751 3751                  mutex_exit(&connp->conn_lock);
3752 3752                  break;
3753 3753          }
3754 3754  done:
3755 3755          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3756 3756          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3757 3757          ixa->ixa_cpid = connp->conn_cpid;
3758 3758          ixa_refrele(ixa);
3759 3759          ip_pkt_free(ipp);
3760 3760          kmem_free(ipp, sizeof (*ipp));
3761 3761          return (error);
3762 3762  }
3763 3763  
3764 3764  /*
3765 3765   * Handle sending an M_DATA for a connected socket.
3766 3766   * Handles both IPv4 and IPv6.
3767 3767   */
3768 3768  int
3769 3769  icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3770 3770  {
3771 3771          icmp_t          *icmp = connp->conn_icmp;
3772 3772          icmp_stack_t    *is = icmp->icmp_is;
3773 3773          int             error;
3774 3774          ip_xmit_attr_t  *ixa;
3775 3775          boolean_t       do_ipsec;
3776 3776  
3777 3777          /*
3778 3778           * If no other thread is using conn_ixa this just gets a reference to
3779 3779           * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3780 3780           */
3781 3781          ixa = conn_get_ixa(connp, B_FALSE);
3782 3782          if (ixa == NULL) {
3783 3783                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3784 3784                  freemsg(mp);
3785 3785                  return (ENOMEM);
3786 3786          }
3787 3787  
3788 3788          ASSERT(cr != NULL);
3789 3789          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3790 3790          ixa->ixa_cred = cr;
3791 3791          ixa->ixa_cpid = pid;
3792 3792  
3793 3793          /* Defer IPsec if it might need to look at ICMP type/code */
3794 3794          switch (ixa->ixa_protocol) {
3795 3795          case IPPROTO_ICMP:
3796 3796          case IPPROTO_ICMPV6:
3797 3797                  do_ipsec = B_FALSE;
3798 3798                  break;
3799 3799          default:
3800 3800                  do_ipsec = B_TRUE;
3801 3801          }
3802 3802  
3803 3803          mutex_enter(&connp->conn_lock);
3804 3804          mp = icmp_prepend_header_template(connp, ixa, mp,
3805 3805              &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
3806 3806  
3807 3807          if (mp == NULL) {
3808 3808                  ASSERT(error != 0);
3809 3809                  mutex_exit(&connp->conn_lock);
3810 3810                  ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3811 3811                  ixa->ixa_cred = connp->conn_cred;       /* Restore */
3812 3812                  ixa->ixa_cpid = connp->conn_cpid;
3813 3813                  ixa_refrele(ixa);
3814 3814                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3815 3815                  freemsg(mp);
3816 3816                  return (error);
3817 3817          }
3818 3818  
3819 3819          if (!do_ipsec) {
3820 3820                  /* Policy might differ for different ICMP type/code */
3821 3821                  mp = icmp_output_attach_policy(mp, connp, ixa);
3822 3822                  if (mp == NULL) {
3823 3823                          mutex_exit(&connp->conn_lock);
3824 3824                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3825 3825                          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3826 3826                          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3827 3827                          ixa->ixa_cpid = connp->conn_cpid;
3828 3828                          ixa_refrele(ixa);
3829 3829                          return (EHOSTUNREACH);  /* IPsec policy failure */
3830 3830                  }
3831 3831          }
3832 3832  
3833 3833          /*
3834 3834           * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3835 3835           * safe copy, then we need to fill in any pointers in it.
3836 3836           */
3837 3837          if (ixa->ixa_ire == NULL) {
3838 3838                  in6_addr_t      faddr, saddr;
3839 3839                  in6_addr_t      nexthop;
3840 3840                  in_port_t       fport;
3841 3841  
3842 3842                  saddr = connp->conn_saddr_v6;
3843 3843                  faddr = connp->conn_faddr_v6;
3844 3844                  fport = connp->conn_fport;
3845 3845                  ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
3846 3846                  mutex_exit(&connp->conn_lock);
3847 3847  
3848 3848                  error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
3849 3849                      fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3850 3850                      (do_ipsec ? IPDF_IPSEC : 0));
3851 3851                  switch (error) {
3852 3852                  case 0:
3853 3853                          break;
3854 3854                  case EADDRNOTAVAIL:
3855 3855                          /*
3856 3856                           * IXAF_VERIFY_SOURCE tells us to pick a better source.
3857 3857                           * Don't have the application see that errno
3858 3858                           */
3859 3859                          error = ENETUNREACH;
3860 3860                          goto failed;
3861 3861                  case ENETDOWN:
3862 3862                          /*
3863 3863                           * Have !ipif_addr_ready address; drop packet silently
3864 3864                           * until we can get applications to not send until we
3865 3865                           * are ready.
3866 3866                           */
3867 3867                          error = 0;
3868 3868                          goto failed;
3869 3869                  case EHOSTUNREACH:
3870 3870                  case ENETUNREACH:
3871 3871                          if (ixa->ixa_ire != NULL) {
3872 3872                                  /*
3873 3873                                   * Let conn_ip_output/ire_send_noroute return
3874 3874                                   * the error and send any local ICMP error.
3875 3875                                   */
3876 3876                                  error = 0;
3877 3877                                  break;
3878 3878                          }
3879 3879                          /* FALLTHRU */
3880 3880                  default:
3881 3881                  failed:
3882 3882                          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3883 3883                          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3884 3884                          ixa->ixa_cpid = connp->conn_cpid;
3885 3885                          ixa_refrele(ixa);
3886 3886                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3887 3887                          freemsg(mp);
3888 3888                          return (error);
3889 3889                  }
3890 3890          } else {
3891 3891                  /* Done with conn_t */
3892 3892                  mutex_exit(&connp->conn_lock);
3893 3893          }
3894 3894  
3895 3895          /* We're done.  Pass the packet to ip. */
3896 3896          BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3897 3897  
3898 3898          error = conn_ip_output(mp, ixa);
3899 3899          /* No rawipOutErrors if an error since IP increases its error counter */
3900 3900          switch (error) {
3901 3901          case 0:
3902 3902                  break;
3903 3903          case EWOULDBLOCK:
3904 3904                  (void) ixa_check_drain_insert(connp, ixa);
3905 3905                  error = 0;
3906 3906                  break;
3907 3907          case EADDRNOTAVAIL:
3908 3908                  /*
3909 3909                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3910 3910                   * Don't have the application see that errno
3911 3911                   */
3912 3912                  error = ENETUNREACH;
3913 3913                  break;
3914 3914          }
3915 3915          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3916 3916          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3917 3917          ixa->ixa_cpid = connp->conn_cpid;
3918 3918          ixa_refrele(ixa);
3919 3919          return (error);
3920 3920  }
3921 3921  
3922 3922  /*
3923 3923   * Handle sending an M_DATA to the last destination.
3924 3924   * Handles both IPv4 and IPv6.
3925 3925   *
3926 3926   * NOTE: The caller must hold conn_lock and we drop it here.
3927 3927   */
3928 3928  int
3929 3929  icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3930 3930      ip_xmit_attr_t *ixa)
3931 3931  {
3932 3932          icmp_t          *icmp = connp->conn_icmp;
3933 3933          icmp_stack_t    *is = icmp->icmp_is;
3934 3934          int             error;
3935 3935          boolean_t       do_ipsec;
3936 3936  
3937 3937          ASSERT(MUTEX_HELD(&connp->conn_lock));
3938 3938          ASSERT(ixa != NULL);
3939 3939  
3940 3940          ASSERT(cr != NULL);
3941 3941          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3942 3942          ixa->ixa_cred = cr;
3943 3943          ixa->ixa_cpid = pid;
3944 3944  
3945 3945          /* Defer IPsec if it might need to look at ICMP type/code */
3946 3946          switch (ixa->ixa_protocol) {
3947 3947          case IPPROTO_ICMP:
3948 3948          case IPPROTO_ICMPV6:
3949 3949                  do_ipsec = B_FALSE;
3950 3950                  break;
3951 3951          default:
3952 3952                  do_ipsec = B_TRUE;
3953 3953          }
3954 3954  
3955 3955  
3956 3956          mp = icmp_prepend_header_template(connp, ixa, mp,
3957 3957              &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
3958 3958  
3959 3959          if (mp == NULL) {
3960 3960                  ASSERT(error != 0);
3961 3961                  mutex_exit(&connp->conn_lock);
3962 3962                  ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3963 3963                  ixa->ixa_cred = connp->conn_cred;       /* Restore */
3964 3964                  ixa->ixa_cpid = connp->conn_cpid;
3965 3965                  ixa_refrele(ixa);
3966 3966                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3967 3967                  freemsg(mp);
3968 3968                  return (error);
3969 3969          }
3970 3970  
3971 3971          if (!do_ipsec) {
3972 3972                  /* Policy might differ for different ICMP type/code */
3973 3973                  mp = icmp_output_attach_policy(mp, connp, ixa);
3974 3974                  if (mp == NULL) {
3975 3975                          mutex_exit(&connp->conn_lock);
3976 3976                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3977 3977                          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3978 3978                          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3979 3979                          ixa->ixa_cpid = connp->conn_cpid;
3980 3980                          ixa_refrele(ixa);
3981 3981                          return (EHOSTUNREACH);  /* IPsec policy failure */
3982 3982                  }
3983 3983          }
3984 3984  
3985 3985          /*
3986 3986           * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3987 3987           * safe copy, then we need to fill in any pointers in it.
3988 3988           */
3989 3989          if (ixa->ixa_ire == NULL) {
3990 3990                  in6_addr_t      lastdst, lastsrc;
3991 3991                  in6_addr_t      nexthop;
3992 3992                  in_port_t       lastport;
3993 3993  
3994 3994                  lastsrc = connp->conn_v6lastsrc;
3995 3995                  lastdst = connp->conn_v6lastdst;
3996 3996                  lastport = connp->conn_lastdstport;
3997 3997                  ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3998 3998                  mutex_exit(&connp->conn_lock);
3999 3999  
4000 4000                  error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
4001 4001                      &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
4002 4002                      IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
4003 4003                  switch (error) {
4004 4004                  case 0:
4005 4005                          break;
4006 4006                  case EADDRNOTAVAIL:
4007 4007                          /*
4008 4008                           * IXAF_VERIFY_SOURCE tells us to pick a better source.
4009 4009                           * Don't have the application see that errno
4010 4010                           */
4011 4011                          error = ENETUNREACH;
4012 4012                          goto failed;
4013 4013                  case ENETDOWN:
4014 4014                          /*
4015 4015                           * Have !ipif_addr_ready address; drop packet silently
4016 4016                           * until we can get applications to not send until we
4017 4017                           * are ready.
4018 4018                           */
4019 4019                          error = 0;
4020 4020                          goto failed;
4021 4021                  case EHOSTUNREACH:
4022 4022                  case ENETUNREACH:
4023 4023                          if (ixa->ixa_ire != NULL) {
4024 4024                                  /*
4025 4025                                   * Let conn_ip_output/ire_send_noroute return
4026 4026                                   * the error and send any local ICMP error.
4027 4027                                   */
4028 4028                                  error = 0;
4029 4029                                  break;
4030 4030                          }
4031 4031                          /* FALLTHRU */
4032 4032                  default:
4033 4033                  failed:
4034 4034                          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4035 4035                          ixa->ixa_cred = connp->conn_cred;       /* Restore */
4036 4036                          ixa->ixa_cpid = connp->conn_cpid;
4037 4037                          ixa_refrele(ixa);
4038 4038                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4039 4039                          freemsg(mp);
4040 4040                          return (error);
4041 4041                  }
4042 4042          } else {
4043 4043                  /* Done with conn_t */
4044 4044                  mutex_exit(&connp->conn_lock);
4045 4045          }
4046 4046  
4047 4047          /* We're done.  Pass the packet to ip. */
4048 4048          BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4049 4049          error = conn_ip_output(mp, ixa);
4050 4050          /* No rawipOutErrors if an error since IP increases its error counter */
4051 4051          switch (error) {
4052 4052          case 0:
4053 4053                  break;
4054 4054          case EWOULDBLOCK:
4055 4055                  (void) ixa_check_drain_insert(connp, ixa);
4056 4056                  error = 0;
4057 4057                  break;
4058 4058          case EADDRNOTAVAIL:
4059 4059                  /*
4060 4060                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
4061 4061                   * Don't have the application see that errno
4062 4062                   */
4063 4063                  error = ENETUNREACH;
4064 4064                  /* FALLTHRU */
4065 4065          default:
4066 4066                  mutex_enter(&connp->conn_lock);
4067 4067                  /*
4068 4068                   * Clear the source and v6lastdst so we call ip_attr_connect
4069 4069                   * for the next packet and try to pick a better source.
4070 4070                   */
4071 4071                  if (connp->conn_mcbc_bind)
4072 4072                          connp->conn_saddr_v6 = ipv6_all_zeros;
4073 4073                  else
4074 4074                          connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
4075 4075                  connp->conn_v6lastdst = ipv6_all_zeros;
4076 4076                  mutex_exit(&connp->conn_lock);
4077 4077                  break;
4078 4078          }
4079 4079          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4080 4080          ixa->ixa_cred = connp->conn_cred;       /* Restore */
4081 4081          ixa->ixa_cpid = connp->conn_cpid;
4082 4082          ixa_refrele(ixa);
4083 4083          return (error);
4084 4084  }
4085 4085  
4086 4086  
4087 4087  /*
4088 4088   * Prepend the header template and then fill in the source and
4089 4089   * flowinfo. The caller needs to handle the destination address since
4090 4090   * its setting is different if rthdr or source route.
4091 4091   *
4092 4092   * Returns NULL if allocation failed or if the packet would exceed IP_MAXPACKET.
4093 4093   * When it returns NULL it sets errorp.
            * On every NULL return mp has already been freed by this function.
            * On success the returned mblk may be a different one than was passed
            * in, so the caller must use the return value from then on.
            *
            * Also records the total packet length in ixa_pktlen and the IP header
            * length in ixa_ip_hdr_length.
            *
            * The caller must hold conn_lock (asserted below).
4094 4094   */
4095 4095  static mblk_t *
4096 4096  icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
4097 4097      const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
4098 4098  {
4099 4099          icmp_t          *icmp = connp->conn_icmp;
4100 4100          icmp_stack_t    *is = icmp->icmp_is;
4101 4101          uint_t          pktlen;
4102 4102          uint_t          copylen;
4103 4103          uint8_t         *iph;
4104 4104          uint_t          ip_hdr_length;
4105 4105          uint32_t        cksum;
4106 4106          ip_pkt_t        *ipp;
4107 4107  
4108 4108          ASSERT(MUTEX_HELD(&connp->conn_lock));
4109 4109  
4110 4110          /*
4111 4111           * Copy the header template.
                    * pktlen is the template length plus the data length of mp.
4112 4112           */
4113 4113          copylen = connp->conn_ht_iphc_len;
4114 4114          pktlen = copylen + msgdsize(mp);
4115 4115          if (pktlen > IP_MAXPACKET) {
4116 4116                  freemsg(mp);
4117 4117                  *errorp = EMSGSIZE;
4118 4118                  return (NULL);
4119 4119          }
4120 4120          ixa->ixa_pktlen = pktlen;
4121 4121  
4122 4122          /* check/fix buffer config, setup pointers into it */
                   /*
                    * The header is written in front of the existing data only if the
                    * data block is not shared (DB_REF == 1), the header would not start
                    * before the buffer base, and the start passes OK_32PTR. Otherwise a
                    * new mblk is allocated for the header and the data is chained
                    * behind it.
                    */
4123 4123          iph = mp->b_rptr - copylen;
4124 4124          if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
4125 4125                  mblk_t *mp1;
4126 4126  
4127 4127                  mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
4128 4128                  if (mp1 == NULL) {
4129 4129                          freemsg(mp);
4130 4130                          *errorp = ENOMEM;
4131 4131                          return (NULL);
4132 4132                  }
                           /* Place the header at the very end of the new buffer. */
4133 4133                  mp1->b_wptr = DB_LIM(mp1);
4134 4134                  mp1->b_cont = mp;
4135 4135                  mp = mp1;
4136 4136                  iph = (mp->b_wptr - copylen);
4137 4137          }
4138 4138          mp->b_rptr = iph;
4139 4139          bcopy(connp->conn_ht_iphc, iph, copylen);
                   /* The IP header length is the offset of the ULP header in the template. */
4140 4140          ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
4141 4141  
4142 4142          ixa->ixa_ip_hdr_length = ip_hdr_length;
4143 4143  
4144 4144          /*
4145 4145           * Prepare for ICMPv6 checksum done in IP.
4146 4146           *
4147 4147           * icmp_build_hdr_template has already massaged any routing header
4148 4148           * and placed the result in conn_sum.
4149 4149           *
4150 4150           * We make it easy for IP to include our pseudo header
4151 4151           * by putting our length (and any routing header adjustment)
4152 4152           * in the ICMPv6 checksum field.
                    *
                    * The value computed here is only stored in the IPv6 branch below,
                    * and only when a checksum offset has been determined.
4153 4153           */
4154 4154          cksum = pktlen - ip_hdr_length;
4155 4155  
4156 4156          cksum += connp->conn_sum;
                   /* Fold the upper 16 bits back into the lower 16 bits. */
4157 4157          cksum = (cksum >> 16) + (cksum & 0xFFFF);
4158 4158          ASSERT(cksum < 0x10000);
4159 4159  
4160 4160          ipp = &connp->conn_xmit_ipp;
4161 4161          if (ixa->ixa_flags & IXAF_IS_IPV4) {
4162 4162                  ipha_t  *ipha = (ipha_t *)iph;
4163 4163  
4164 4164                  ipha->ipha_length = htons((uint16_t)pktlen);
4165 4165  
4166 4166                  /* if IP_PKTINFO specified an address it wins over bind() */
4167 4167                  if ((ipp->ipp_fields & IPPF_ADDR) &&
4168 4168                      IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4169 4169                          ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
4170 4170                          ipha->ipha_src = ipp->ipp_addr_v4;
4171 4171                  } else {
4172 4172                          IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
4173 4173                  }
4174 4174          } else {
4175 4175                  ip6_t *ip6h = (ip6_t *)iph;
                           /* Zero means there is no checksum field to pre-load. */
4176 4176                  uint_t  cksum_offset = 0;
4177 4177  
                           /* ip6_plen excludes the fixed IPv6 header itself. */
4178 4178                  ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
4179 4179  
4180 4180                  /* if IP_PKTINFO specified an address it wins over bind() */
4181 4181                  if ((ipp->ipp_fields & IPPF_ADDR) &&
4182 4182                      !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4183 4183                          ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
4184 4184                          ip6h->ip6_src = ipp->ipp_addr;
4185 4185                  } else {
4186 4186                          ip6h->ip6_src = *v6src;
4187 4187                  }
                           /*
                            * Take the version bits from the default and everything
                            * else from the caller's flowinfo.
                            */
4188 4188                  ip6h->ip6_vcf =
4189 4189                      (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4190 4190                      (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4191 4191                  if (ipp->ipp_fields & IPPF_TCLASS) {
4192 4192                          /* Overrides the class part of flowinfo */
4193 4193                          ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4194 4194                              ipp->ipp_tclass);
4195 4195                  }
4196 4196  
                           /*
                            * Locate the checksum field: the ICMPv6 header's own field
                            * for ICMPv6, or the offset recorded in
                            * ixa_raw_cksum_offset when IXAF_SET_RAW_CKSUM is set.
                            */
4197 4197                  if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
4198 4198                          if (connp->conn_proto == IPPROTO_ICMPV6) {
4199 4199                                  cksum_offset = ixa->ixa_ip_hdr_length +
4200 4200                                      offsetof(icmp6_t, icmp6_cksum);
4201 4201                          } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
4202 4202                                  cksum_offset = ixa->ixa_ip_hdr_length +
4203 4203                                      ixa->ixa_raw_cksum_offset;
4204 4204                          }
4205 4205                  }
4206 4206                  if (cksum_offset != 0) {
4207 4207                          uint16_t *ptr;
4208 4208  
4209 4209                          /* Make sure the checksum fits in the first mblk */
4210 4210                          if (cksum_offset + sizeof (short) > MBLKL(mp)) {
4211 4211                                  mblk_t *mp1;
4212 4212  
                                           /*
                                            * The original message is freed whether or
                                            * not msgpullup succeeded; on success mp1
                                            * replaces it and the header pointers are
                                            * re-derived from the new mblk.
                                            */
4213 4213                                  mp1 = msgpullup(mp,
4214 4214                                      cksum_offset + sizeof (short));
4215 4215                                  freemsg(mp);
4216 4216                                  if (mp1 == NULL) {
4217 4217                                          *errorp = ENOMEM;
4218 4218                                          return (NULL);
4219 4219                                  }
4220 4220                                  mp = mp1;
4221 4221                                  iph = mp->b_rptr;
4222 4222                                  ip6h = (ip6_t *)iph;
4223 4223                          }
4224 4224                          ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
4225 4225                          *ptr = htons(cksum);
4226 4226                  }
4227 4227          }
4228 4228  
4229 4229          return (mp);
4230 4230  }
4231 4231  
4232 4232  /*
4233 4233   * This routine handles all messages passed downstream.  It either
4234 4234   * consumes the message or passes it downstream; it never queues
4235 4235   * a message.
            *
            * T_UNITDATA_REQ messages (M_PROTO/M_PCPROTO) are sent from here.
            * M_DATA is dropped and counted in rawipOutErrors. Everything else is
            * handed to icmp_wput_other().
            *
            * On failure of a T_UNITDATA_REQ the error is reported through
            * icmp_ud_err(), which frees the TPI header mblk.
4236 4236   */
4237 4237  void
4238 4238  icmp_wput(queue_t *q, mblk_t *mp)
4239 4239  {
4240 4240          sin6_t          *sin6;
4241 4241          sin_t           *sin = NULL;
4242 4242          uint_t          srcid;
4243 4243          conn_t          *connp = Q_TO_CONN(q);
4244 4244          icmp_t          *icmp = connp->conn_icmp;
4245 4245          int             error = 0;
4246 4246          struct sockaddr *addr = NULL;
4247 4247          socklen_t       addrlen;
4248 4248          icmp_stack_t    *is = icmp->icmp_is;
4249 4249          struct T_unitdata_req *tudr;
4250 4250          mblk_t          *data_mp;
4251 4251          cred_t          *cr;
4252 4252          pid_t           pid;
4253 4253  
4254 4254          /*
4255 4255           * We directly handle several cases here: T_UNITDATA_REQ message
4256 4256           * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
4257 4257           * socket.
4258 4258           */
4259 4259          switch (DB_TYPE(mp)) {
4260 4260          case M_DATA:
4261 4261                  /* sockfs never sends down M_DATA */
4262 4262                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4263 4263                  freemsg(mp);
4264 4264                  return;
4265 4265  
4266 4266          case M_PROTO:
4267 4267          case M_PCPROTO:
                           /*
                            * tudr is only dereferenced further down, after the length
                            * check here has passed. Messages that are too short or of
                            * another primitive type go to icmp_wput_other().
                            */
4268 4268                  tudr = (struct T_unitdata_req *)mp->b_rptr;
4269 4269                  if (MBLKL(mp) < sizeof (*tudr) ||
4270 4270                      ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
4271 4271                          icmp_wput_other(q, mp);
4272 4272                          return;
4273 4273                  }
4274 4274                  break;
4275 4275  
4276 4276          default:
4277 4277                  icmp_wput_other(q, mp);
4278 4278                  return;
4279 4279          }
4280 4280  
4281 4281          /* Handle valid T_UNITDATA_REQ here */
4282 4282          data_mp = mp->b_cont;
4283 4283          if (data_mp == NULL) {
4284 4284                  error = EPROTO;
4285 4285                  goto ud_error2;
4286 4286          }
                   /*
                    * Detach the payload: from here on mp is only the TPI header and
                    * data_mp is the data to send.
                    */
4287 4287          mp->b_cont = NULL;
4288 4288  
                   /* The destination address must pass the MBLKIN check against mp. */
4289 4289          if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
4290 4290                  error = EADDRNOTAVAIL;
4291 4291                  goto ud_error2;
4292 4292          }
4293 4293  
4294 4294          /*
4295 4295           * All Solaris components should pass a db_credp
4296 4296           * for this message, hence we ASSERT.
4297 4297           * On production kernels we return an error to be robust against
4298 4298           * random streams modules sitting on top of us.
4299 4299           */
4300 4300          cr = msg_getcred(mp, &pid);
4301 4301          ASSERT(cr != NULL);
4302 4302          if (cr == NULL) {
4303 4303                  error = EINVAL;
4304 4304                  goto ud_error2;
4305 4305          }
4306 4306  
4307 4307          /*
4308 4308           * If a port has not been bound to the stream, fail.
4309 4309           * This is not a problem when sockfs is directly
4310 4310           * above us, because it will ensure that the socket
4311 4311           * is first bound before allowing data to be sent.
4312 4312           */
4313 4313          if (icmp->icmp_state == TS_UNBND) {
4314 4314                  error = EPROTO;
4315 4315                  goto ud_error2;
4316 4316          }
4317 4317          addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
4318 4318          addrlen = tudr->DEST_length;
4319 4319  
4320 4320          switch (connp->conn_family) {
4321 4321          case AF_INET6:
4322 4322                  sin6 = (sin6_t *)addr;
4323 4323                  if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
4324 4324                      (sin6->sin6_family != AF_INET6)) {
4325 4325                          error = EADDRNOTAVAIL;
4326 4326                          goto ud_error2;
4327 4327                  }
4328 4328  
4329 4329                  /* No support for mapped addresses on raw sockets */
4330 4330                  if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4331 4331                          error = EADDRNOTAVAIL;
4332 4332                          goto ud_error2;
4333 4333                  }
4334 4334                  srcid = sin6->__sin6_src_id;
4335 4335  
4336 4336                  /*
4337 4337                   * If the local address is a mapped address return
4338 4338                   * an error.
4339 4339                   * It would be possible to send an IPv6 packet but the
4340 4340                   * response would never make it back to the application
4341 4341                   * since it is bound to a mapped address.
4342 4342                   */
4343 4343                  if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
4344 4344                          error = EADDRNOTAVAIL;
4345 4345                          goto ud_error2;
4346 4346                  }
4347 4347  
                           /* An unspecified destination is rewritten in place to loopback. */
4348 4348                  if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4349 4349                          sin6->sin6_addr = ipv6_loopback;
4350 4350  
                           /* Options present: take the ancillary-data path. */
4351 4351                  if (tudr->OPT_length != 0) {
4352 4352                          /*
4353 4353                           * If we are connected then the destination needs to be
4354 4354                           * the same as the connected one.
4355 4355                           */
4356 4356                          if (icmp->icmp_state == TS_DATA_XFER &&
4357 4357                              !conn_same_as_last_v6(connp, sin6)) {
4358 4358                                  error = EISCONN;
4359 4359                                  goto ud_error2;
4360 4360                          }
4361 4361                          error = icmp_output_ancillary(connp, NULL, sin6,
4362 4362                              data_mp, mp, NULL, cr, pid);
4363 4363                  } else {
4364 4364                          ip_xmit_attr_t *ixa;
4365 4365  
4366 4366                          /*
4367 4367                           * We have to allocate an ip_xmit_attr_t before we grab
4368 4368                           * conn_lock and we need to hold conn_lock once we've
4369 4369                           * checked conn_same_as_last_v6 to handle concurrent
4370 4370                           * send* calls on a socket.
4371 4371                           */
4372 4372                          ixa = conn_get_ixa(connp, B_FALSE);
4373 4373                          if (ixa == NULL) {
4374 4374                                  error = ENOMEM;
4375 4375                                  goto ud_error2;
4376 4376                          }
4377 4377                          mutex_enter(&connp->conn_lock);
4378 4378  
                                   /*
                                    * Use the cached-destination path only when the
                                    * destination and source id match the last send and
                                    * the IPsec outbound policy is still current.
                                    */
4379 4379                          if (conn_same_as_last_v6(connp, sin6) &&
4380 4380                              connp->conn_lastsrcid == srcid &&
4381 4381                              ipsec_outbound_policy_current(ixa)) {
4382 4382                                  /* icmp_output_lastdst drops conn_lock */
4383 4383                                  error = icmp_output_lastdst(connp, data_mp, cr,
4384 4384                                      pid, ixa);
4385 4385                          } else {
4386 4386                                  /* icmp_output_newdst drops conn_lock */
4387 4387                                  error = icmp_output_newdst(connp, data_mp, NULL,
4388 4388                                      sin6, cr, pid, ixa);
4389 4389                          }
4390 4390                          ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4391 4391                  }
                           /* On success only the TPI header mblk is left to free. */
4392 4392                  if (error == 0) {
4393 4393                          freeb(mp);
4394 4394                          return;
4395 4395                  }
4396 4396                  break;
4397 4397  
4398 4398          case AF_INET:
4399 4399                  sin = (sin_t *)addr;
4400 4400                  if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
4401 4401                      (sin->sin_family != AF_INET)) {
4402 4402                          error = EADDRNOTAVAIL;
4403 4403                          goto ud_error2;
4404 4404                  }
                           /* INADDR_ANY as a destination is rewritten in place to loopback. */
4405 4405                  if (sin->sin_addr.s_addr == INADDR_ANY)
4406 4406                          sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
4407 4407  
4408 4408                  /* Protocol 255 contains full IP headers */
4409 4409                  /* Read without holding lock */
4410 4410                  if (icmp->icmp_hdrincl) {
                                   /*
                                    * Make sure at least a simple IP header is contiguous
                                    * in the first data mblk before handing it on.
                                    */
4411 4411                          if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
4412 4412                                  if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
4413 4413                                          error = EINVAL;
4414 4414                                          goto ud_error2;
4415 4415                                  }
4416 4416                          }
4417 4417                          error = icmp_output_hdrincl(connp, data_mp, cr, pid);
4418 4418                          if (error == 0) {
4419 4419                                  freeb(mp);
4420 4420                                  return;
4421 4421                          }
4422 4422                          /* data_mp consumed above */
                                   /* Clear it so that ud_error2 does not free it again. */
4423 4423                          data_mp = NULL;
4424 4424                          goto ud_error2;
4425 4425                  }
4426 4426  
                           /* Options present: take the ancillary-data path. */
4427 4427                  if (tudr->OPT_length != 0) {
4428 4428                          /*
4429 4429                           * If we are connected then the destination needs to be
4430 4430                           * the same as the connected one.
4431 4431                           */
4432 4432                          if (icmp->icmp_state == TS_DATA_XFER &&
4433 4433                              !conn_same_as_last_v4(connp, sin)) {
4434 4434                                  error = EISCONN;
4435 4435                                  goto ud_error2;
4436 4436                          }
4437 4437                          error = icmp_output_ancillary(connp, sin, NULL,
4438 4438                              data_mp, mp, NULL, cr, pid);
4439 4439                  } else {
4440 4440                          ip_xmit_attr_t *ixa;
4441 4441  
4442 4442                          /*
4443 4443                           * We have to allocate an ip_xmit_attr_t before we grab
4444 4444                           * conn_lock and we need to hold conn_lock once we've
4445 4445                           * checked conn_same_as_last_v4 to handle concurrent
4446 4446                           * send* calls on a socket.
4447 4447                           */
4448 4448                          ixa = conn_get_ixa(connp, B_FALSE);
4449 4449                          if (ixa == NULL) {
4450 4450                                  error = ENOMEM;
4451 4451                                  goto ud_error2;
4452 4452                          }
4453 4453                          mutex_enter(&connp->conn_lock);
4454 4454  
                                   /*
                                    * Unlike the IPv6 case there is no source id to
                                    * compare here.
                                    */
4455 4455                          if (conn_same_as_last_v4(connp, sin) &&
4456 4456                              ipsec_outbound_policy_current(ixa)) {
4457 4457                                  /* icmp_output_lastdst drops conn_lock */
4458 4458                                  error = icmp_output_lastdst(connp, data_mp, cr,
4459 4459                                      pid, ixa);
4460 4460                          } else {
4461 4461                                  /* icmp_output_newdst drops conn_lock */
4462 4462                                  error = icmp_output_newdst(connp, data_mp, sin,
4463 4463                                      NULL, cr, pid, ixa);
4464 4464                          }
4465 4465                          ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4466 4466                  }
                           /* On success only the TPI header mblk is left to free. */
4467 4467                  if (error == 0) {
4468 4468                          freeb(mp);
4469 4469                          return;
4470 4470                  }
4471 4471                  break;
4472 4472          }
                   /*
                    * Reached by "break" after one of the output routines returned an
                    * error. data_mp is not freed on this path, unlike at ud_error2.
                    * NOTE(review): presumably the output routines consume data_mp
                    * even on failure -- confirm against their definitions.
                    * NOTE(review): the switch has no default case, so any other
                    * conn_family value would also arrive here, with error still 0.
                    */
4473 4473          ASSERT(mp != NULL);
4474 4474          /* mp is freed by the following routine */
4475 4475          icmp_ud_err(q, mp, (t_scalar_t)error);
4476 4476          return;
4477 4477  
                   /*
                    * Error exit for failures detected in this routine: count the error,
                    * free data_mp (which is NULL when it was never set or was already
                    * consumed) and report the error upstream.
                    */
4478 4478  ud_error2:
4479 4479          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4480 4480          freemsg(data_mp);
4481 4481          ASSERT(mp != NULL);
4482 4482          /* mp is freed by the following routine */
4483 4483          icmp_ud_err(q, mp, (t_scalar_t)error);
4484 4484  }
4485 4485  
4486 4486  /*
4487 4487   * Handle the case of the IP address or flow label being different
4488 4488   * for both IPv4 and IPv6.
4489 4489   *
4490 4490   * NOTE: The caller must hold conn_lock and we drop it here.
4491 4491   */
4492 4492  static int
4493 4493  icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
4494 4494      cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
4495 4495  {
4496 4496          icmp_t          *icmp = connp->conn_icmp;
4497 4497          icmp_stack_t    *is = icmp->icmp_is;
4498 4498          int             error;
4499 4499          ip_xmit_attr_t  *oldixa;
4500 4500          boolean_t       do_ipsec;
4501 4501          uint_t          srcid;
4502 4502          uint32_t        flowinfo;
4503 4503          in6_addr_t      v6src;
4504 4504          in6_addr_t      v6dst;
4505 4505          in6_addr_t      v6nexthop;
4506 4506          in_port_t       dstport;
4507 4507  
4508 4508          ASSERT(MUTEX_HELD(&connp->conn_lock));
4509 4509          ASSERT(ixa != NULL);
4510 4510  
4511 4511          /*
4512 4512           * We hold conn_lock across all the use and modifications of
4513 4513           * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
4514 4514           * stay consistent.
4515 4515           */
4516 4516  
4517 4517          ASSERT(cr != NULL);
4518 4518          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4519 4519          ixa->ixa_cred = cr;
4520 4520          ixa->ixa_cpid = pid;
4521 4521          if (is_system_labeled()) {
4522 4522                  /* We need to restart with a label based on the cred */
4523 4523                  ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
4524 4524          }
4525 4525          /*
4526 4526           * If we are connected then the destination needs to be the
4527 4527           * same as the connected one, which is not the case here since we
4528 4528           * checked for that above.
4529 4529           */
4530 4530          if (icmp->icmp_state == TS_DATA_XFER) {
4531 4531                  mutex_exit(&connp->conn_lock);
4532 4532                  error = EISCONN;
4533 4533                  goto ud_error;
4534 4534          }
4535 4535  
4536 4536          /* In case previous destination was multicast or multirt */
4537 4537          ip_attr_newdst(ixa);
4538 4538  
4539 4539          /*
4540 4540           * If laddr is unspecified then we look at sin6_src_id.
4541 4541           * We will give precedence to a source address set with IPV6_PKTINFO
4542 4542           * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
4543 4543           * want ip_attr_connect to select a source (since it can fail) when
4544 4544           * IPV6_PKTINFO is specified.
4545 4545           * If this doesn't result in a source address then we get a source
4546 4546           * from ip_attr_connect() below.
4547 4547           */
4548 4548          v6src = connp->conn_saddr_v6;
4549 4549          if (sin != NULL) {
4550 4550                  IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
4551 4551                  dstport = sin->sin_port;
4552 4552                  flowinfo = 0;
4553 4553                  /* Don't bother with ip_srcid_find_id(), but indicate anyway. */
4554 4554                  srcid = 0;
4555 4555                  ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4556 4556                  ixa->ixa_flags |= IXAF_IS_IPV4;
4557 4557          } else {
4558 4558                  boolean_t v4mapped;
4559 4559  
4560 4560                  v6dst = sin6->sin6_addr;
4561 4561                  dstport = sin6->sin6_port;
4562 4562                  flowinfo = sin6->sin6_flowinfo;
4563 4563                  srcid = sin6->__sin6_src_id;
4564 4564                  if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
4565 4565                          ixa->ixa_scopeid = sin6->sin6_scope_id;
4566 4566                          ixa->ixa_flags |= IXAF_SCOPEID_SET;
4567 4567                  } else {
4568 4568                          ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4569 4569                  }
4570 4570                  v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
4571 4571                  if (v4mapped)
4572 4572                          ixa->ixa_flags |= IXAF_IS_IPV4;
4573 4573                  else
4574 4574                          ixa->ixa_flags &= ~IXAF_IS_IPV4;
4575 4575                  if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
4576 4576                          if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4577 4577                              v4mapped, connp->conn_netstack)) {
4578 4578                                  /* Mismatched v4mapped/v6 specified by srcid. */
4579 4579                                  mutex_exit(&connp->conn_lock);
4580 4580                                  error = EADDRNOTAVAIL;
4581 4581                                  goto ud_error;
4582 4582                          }
4583 4583                  }
4584 4584          }
4585 4585          /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
4586 4586          if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) {
4587 4587                  ip_pkt_t *ipp = &connp->conn_xmit_ipp;
4588 4588  
4589 4589                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
4590 4590                          if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4591 4591                                  v6src = ipp->ipp_addr;
4592 4592                  } else {
4593 4593                          if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4594 4594                                  v6src = ipp->ipp_addr;
4595 4595                  }
4596 4596          }
4597 4597  
4598 4598          /* Defer IPsec if it might need to look at ICMP type/code */
4599 4599          switch (ixa->ixa_protocol) {
4600 4600          case IPPROTO_ICMP:
4601 4601          case IPPROTO_ICMPV6:
4602 4602                  do_ipsec = B_FALSE;
4603 4603                  break;
4604 4604          default:
4605 4605                  do_ipsec = B_TRUE;
4606 4606          }
4607 4607  
4608 4608          ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
4609 4609          mutex_exit(&connp->conn_lock);
4610 4610  
4611 4611          error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
4612 4612              &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
4613 4613              (do_ipsec ? IPDF_IPSEC : 0));
4614 4614          switch (error) {
4615 4615          case 0:
4616 4616                  break;
4617 4617          case EADDRNOTAVAIL:
4618 4618                  /*
4619 4619                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
4620 4620                   * Don't have the application see that errno
4621 4621                   */
4622 4622                  error = ENETUNREACH;
4623 4623                  goto failed;
4624 4624          case ENETDOWN:
4625 4625                  /*
4626 4626                   * Have !ipif_addr_ready address; drop packet silently
4627 4627                   * until we can get applications to not send until we
4628 4628                   * are ready.
4629 4629                   */
4630 4630                  error = 0;
4631 4631                  goto failed;
4632 4632          case EHOSTUNREACH:
4633 4633          case ENETUNREACH:
4634 4634                  if (ixa->ixa_ire != NULL) {
4635 4635                          /*
4636 4636                           * Let conn_ip_output/ire_send_noroute return
4637 4637                           * the error and send any local ICMP error.
4638 4638                           */
4639 4639                          error = 0;
4640 4640                          break;
4641 4641                  }
4642 4642                  /* FALLTHRU */
4643 4643          default:
4644 4644          failed:
4645 4645                  goto ud_error;
4646 4646          }
4647 4647  
4648 4648          mutex_enter(&connp->conn_lock);
4649 4649          /*
4650 4650           * While we dropped the lock some other thread might have connected
4651 4651           * this socket. If so we bail out with EISCONN to ensure that the
4652 4652           * connecting thread is the one that updates conn_ixa, conn_ht_*
4653 4653           * and conn_*last*.
4654 4654           */
4655 4655          if (icmp->icmp_state == TS_DATA_XFER) {
4656 4656                  mutex_exit(&connp->conn_lock);
4657 4657                  error = EISCONN;
4658 4658                  goto ud_error;
4659 4659          }
4660 4660  
4661 4661          /*
4662 4662           * We need to rebuild the headers if
4663 4663           *  - we are labeling packets (could be different for different
4664 4664           *    destinations)
4665 4665           *  - we have a source route (or routing header) since we need to
4666 4666           *    massage that to get the pseudo-header checksum
4667 4667           *  - a socket option with COA_HEADER_CHANGED has been set which
4668 4668           *    set conn_v6lastdst to zero.
4669 4669           *
4670 4670           * Otherwise the prepend function will just update the src, dst,
4671 4671           * and flow label.
4672 4672           */
4673 4673          if (is_system_labeled()) {
4674 4674                  /* TX MLP requires SCM_UCRED and don't have that here */
4675 4675                  if (connp->conn_mlp_type != mlptSingle) {
4676 4676                          mutex_exit(&connp->conn_lock);
4677 4677                          error = ECONNREFUSED;
4678 4678                          goto ud_error;
4679 4679                  }
4680 4680                  /*
4681 4681                   * Check whether Trusted Solaris policy allows communication
4682 4682                   * with this host, and pretend that the destination is
4683 4683                   * unreachable if not.
4684 4684                   * Compute any needed label and place it in ipp_label_v4/v6.
4685 4685                   *
4686 4686                   * Later conn_build_hdr_template/conn_prepend_hdr takes
4687 4687                   * ipp_label_v4/v6 to form the packet.
4688 4688                   *
4689 4689                   * Tsol note: Since we hold conn_lock we know no other
4690 4690                   * thread manipulates conn_xmit_ipp.
4691 4691                   */
4692 4692                  error = conn_update_label(connp, ixa, &v6dst,
4693 4693                      &connp->conn_xmit_ipp);
4694 4694                  if (error != 0) {
4695 4695                          mutex_exit(&connp->conn_lock);
4696 4696                          goto ud_error;
4697 4697                  }
4698 4698                  /* Rebuild the header template */
4699 4699                  error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4700 4700                      flowinfo);
4701 4701                  if (error != 0) {
4702 4702                          mutex_exit(&connp->conn_lock);
4703 4703                          goto ud_error;
4704 4704                  }
4705 4705          } else if (connp->conn_xmit_ipp.ipp_fields &
4706 4706              (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
4707 4707              IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
4708 4708                  /* Rebuild the header template */
4709 4709                  error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4710 4710                      flowinfo);
4711 4711                  if (error != 0) {
4712 4712                          mutex_exit(&connp->conn_lock);
4713 4713                          goto ud_error;
4714 4714                  }
4715 4715          } else {
4716 4716                  /* Simply update the destination address if no source route */
4717 4717                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
4718 4718                          ipha_t  *ipha = (ipha_t *)connp->conn_ht_iphc;
4719 4719  
4720 4720                          IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
4721 4721                          if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
4722 4722                                  ipha->ipha_fragment_offset_and_flags |=
4723 4723                                      IPH_DF_HTONS;
4724 4724                          } else {
4725 4725                                  ipha->ipha_fragment_offset_and_flags &=
4726 4726                                      ~IPH_DF_HTONS;
4727 4727                          }
4728 4728                  } else {
4729 4729                          ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
4730 4730                          ip6h->ip6_dst = v6dst;
4731 4731                  }
4732 4732          }
4733 4733  
4734 4734          /*
4735 4735           * Remember the dst etc which corresponds to the built header
4736 4736           * template and conn_ixa.
4737 4737           */
4738 4738          oldixa = conn_replace_ixa(connp, ixa);
4739 4739          connp->conn_v6lastdst = v6dst;
4740 4740          connp->conn_lastflowinfo = flowinfo;
4741 4741          connp->conn_lastscopeid = ixa->ixa_scopeid;
4742 4742          connp->conn_lastsrcid = srcid;
4743 4743          /* Also remember a source to use together with lastdst */
4744 4744          connp->conn_v6lastsrc = v6src;
4745 4745  
4746 4746          data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
4747 4747              flowinfo, &error);
4748 4748  
4749 4749          /* Done with conn_t */
4750 4750          mutex_exit(&connp->conn_lock);
4751 4751          ixa_refrele(oldixa);
4752 4752  
4753 4753          if (data_mp == NULL) {
4754 4754                  ASSERT(error != 0);
4755 4755                  goto ud_error;
4756 4756          }
4757 4757  
4758 4758          if (!do_ipsec) {
4759 4759                  /* Policy might differ for different ICMP type/code */
4760 4760                  data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
4761 4761                  if (data_mp == NULL) {
4762 4762                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4763 4763                          error = EHOSTUNREACH;   /* IPsec policy failure */
4764 4764                          goto done;
4765 4765                  }
4766 4766          }
4767 4767  
4768 4768          /* We're done.  Pass the packet to ip. */
4769 4769          BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4770 4770  
4771 4771          error = conn_ip_output(data_mp, ixa);
4772 4772          /* No rawipOutErrors if an error since IP increases its error counter */
4773 4773          switch (error) {
4774 4774          case 0:
4775 4775                  break;
4776 4776          case EWOULDBLOCK:
4777 4777                  (void) ixa_check_drain_insert(connp, ixa);
4778 4778                  error = 0;
4779 4779                  break;
4780 4780          case EADDRNOTAVAIL:
4781 4781                  /*
4782 4782                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
4783 4783                   * Don't have the application see that errno
4784 4784                   */
4785 4785                  error = ENETUNREACH;
4786 4786                  /* FALLTHRU */
4787 4787          default:
4788 4788                  mutex_enter(&connp->conn_lock);
4789 4789                  /*
4790 4790                   * Clear the source and v6lastdst so we call ip_attr_connect
4791 4791                   * for the next packet and try to pick a better source.
4792 4792                   */
4793 4793                  if (connp->conn_mcbc_bind)
4794 4794                          connp->conn_saddr_v6 = ipv6_all_zeros;
4795 4795                  else
4796 4796                          connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
4797 4797                  connp->conn_v6lastdst = ipv6_all_zeros;
4798 4798                  mutex_exit(&connp->conn_lock);
4799 4799                  break;
4800 4800          }
4801 4801  done:
4802 4802          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4803 4803          ixa->ixa_cred = connp->conn_cred;       /* Restore */
4804 4804          ixa->ixa_cpid = connp->conn_cpid;
4805 4805          ixa_refrele(ixa);
4806 4806          return (error);
4807 4807  
4808 4808  ud_error:
4809 4809          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4810 4810          ixa->ixa_cred = connp->conn_cred;       /* Restore */
4811 4811          ixa->ixa_cpid = connp->conn_cpid;
4812 4812          ixa_refrele(ixa);
4813 4813  
4814 4814          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4815 4815          freemsg(data_mp);
4816 4816          return (error);
4817 4817  }
4818 4818  
4819 4819  /* ARGSUSED */
4820 4820  static void
4821 4821  icmp_wput_fallback(queue_t *q, mblk_t *mp)
4822 4822  {
4823 4823  #ifdef DEBUG
4824 4824          cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4825 4825  #endif
4826 4826          freemsg(mp);
4827 4827  }
4828 4828  
4829 4829  static void
4830 4830  icmp_wput_other(queue_t *q, mblk_t *mp)
4831 4831  {
4832 4832          uchar_t *rptr = mp->b_rptr;
4833 4833          struct iocblk *iocp;
4834 4834          conn_t  *connp = Q_TO_CONN(q);
4835 4835          icmp_t  *icmp = connp->conn_icmp;
4836 4836          cred_t *cr;
4837 4837  
4838 4838          switch (mp->b_datap->db_type) {
4839 4839          case M_PROTO:
4840 4840          case M_PCPROTO:
4841 4841                  if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4842 4842                          /*
4843 4843                           * If the message does not contain a PRIM_type,
4844 4844                           * throw it away.
4845 4845                           */
4846 4846                          freemsg(mp);
4847 4847                          return;
4848 4848                  }
4849 4849                  switch (((t_primp_t)rptr)->type) {
4850 4850                  case T_ADDR_REQ:
4851 4851                          icmp_addr_req(q, mp);
4852 4852                          return;
4853 4853                  case O_T_BIND_REQ:
4854 4854                  case T_BIND_REQ:
4855 4855                          icmp_tpi_bind(q, mp);
4856 4856                          return;
4857 4857                  case T_CONN_REQ:
4858 4858                          icmp_tpi_connect(q, mp);
4859 4859                          return;
4860 4860                  case T_CAPABILITY_REQ:
4861 4861                          icmp_capability_req(q, mp);
4862 4862                          return;
4863 4863                  case T_INFO_REQ:
4864 4864                          icmp_info_req(q, mp);
4865 4865                          return;
4866 4866                  case T_UNITDATA_REQ:
4867 4867                          /*
4868 4868                           * If a T_UNITDATA_REQ gets here, the address must
4869 4869                           * be bad.  Valid T_UNITDATA_REQs are handled
4870 4870                           * in icmp_wput.
4871 4871                           */
4872 4872                          icmp_ud_err(q, mp, EADDRNOTAVAIL);
4873 4873                          return;
4874 4874                  case T_UNBIND_REQ:
4875 4875                          icmp_tpi_unbind(q, mp);
4876 4876                          return;
4877 4877                  case T_SVR4_OPTMGMT_REQ:
4878 4878                          /*
4879 4879                           * All Solaris components should pass a db_credp
4880 4880                           * for this TPI message, hence we ASSERT.
4881 4881                           * But in case there is some other M_PROTO that looks
4882 4882                           * like a TPI message sent by some other kernel
4883 4883                           * component, we check and return an error.
4884 4884                           */
4885 4885                          cr = msg_getcred(mp, NULL);
4886 4886                          ASSERT(cr != NULL);
4887 4887                          if (cr == NULL) {
4888 4888                                  icmp_err_ack(q, mp, TSYSERR, EINVAL);
4889 4889                                  return;
4890 4890                          }
4891 4891  
4892 4892                          if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
4893 4893                              cr)) {
4894 4894                                  svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
4895 4895                          }
4896 4896                          return;
4897 4897  
4898 4898                  case T_OPTMGMT_REQ:
4899 4899                          /*
4900 4900                           * All Solaris components should pass a db_credp
4901 4901                           * for this TPI message, hence we ASSERT.
4902 4902                           * But in case there is some other M_PROTO that looks
4903 4903                           * like a TPI message sent by some other kernel
4904 4904                           * component, we check and return an error.
4905 4905                           */
4906 4906                          cr = msg_getcred(mp, NULL);
4907 4907                          ASSERT(cr != NULL);
4908 4908                          if (cr == NULL) {
4909 4909                                  icmp_err_ack(q, mp, TSYSERR, EINVAL);
4910 4910                                  return;
4911 4911                          }
4912 4912                          tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
4913 4913                          return;
4914 4914  
4915 4915                  case T_DISCON_REQ:
4916 4916                          icmp_tpi_disconnect(q, mp);
4917 4917                          return;
4918 4918  
4919 4919                  /* The following TPI message is not supported by icmp. */
4920 4920                  case O_T_CONN_RES:
4921 4921                  case T_CONN_RES:
4922 4922                          icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4923 4923                          return;
4924 4924  
4925 4925                  /* The following 3 TPI requests are illegal for icmp. */
4926 4926                  case T_DATA_REQ:
4927 4927                  case T_EXDATA_REQ:
4928 4928                  case T_ORDREL_REQ:
4929 4929                          icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4930 4930                          return;
4931 4931                  default:
4932 4932                          break;
4933 4933                  }
4934 4934                  break;
4935 4935          case M_FLUSH:
4936 4936                  if (*rptr & FLUSHW)
4937 4937                          flushq(q, FLUSHDATA);
4938 4938                  break;
4939 4939          case M_IOCTL:
4940 4940                  iocp = (struct iocblk *)mp->b_rptr;
4941 4941                  switch (iocp->ioc_cmd) {
4942 4942                  case TI_GETPEERNAME:
4943 4943                          if (icmp->icmp_state != TS_DATA_XFER) {
4944 4944                                  /*
4945 4945                                   * If a default destination address has not
4946 4946                                   * been associated with the stream, then we
4947 4947                                   * don't know the peer's name.
4948 4948                                   */
4949 4949                                  iocp->ioc_error = ENOTCONN;
4950 4950                                  iocp->ioc_count = 0;
4951 4951                                  mp->b_datap->db_type = M_IOCACK;
4952 4952                                  qreply(q, mp);
4953 4953                                  return;
4954 4954                          }
4955 4955                          /* FALLTHRU */
4956 4956                  case TI_GETMYNAME:
4957 4957                          /*
4958 4958                           * For TI_GETPEERNAME and TI_GETMYNAME, we first
4959 4959                           * need to copyin the user's strbuf structure.
4960 4960                           * Processing will continue in the M_IOCDATA case
4961 4961                           * below.
4962 4962                           */
4963 4963                          mi_copyin(q, mp, NULL,
4964 4964                              SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4965 4965                          return;
4966 4966                  default:
4967 4967                          break;
4968 4968                  }
4969 4969                  break;
4970 4970          case M_IOCDATA:
4971 4971                  icmp_wput_iocdata(q, mp);
4972 4972                  return;
4973 4973          default:
4974 4974                  /* Unrecognized messages are passed through without change. */
4975 4975                  break;
4976 4976          }
4977 4977          ip_wput_nondata(q, mp);
4978 4978  }
4979 4979  
4980 4980  /*
4981 4981   * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
4982 4982   * messages.
4983 4983   */
4984 4984  static void
4985 4985  icmp_wput_iocdata(queue_t *q, mblk_t *mp)
4986 4986  {
4987 4987          mblk_t          *mp1;
4988 4988          STRUCT_HANDLE(strbuf, sb);
4989 4989          uint_t          addrlen;
4990 4990          conn_t          *connp = Q_TO_CONN(q);
4991 4991          icmp_t          *icmp = connp->conn_icmp;
4992 4992  
4993 4993          /* Make sure it is one of ours. */
4994 4994          switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4995 4995          case TI_GETMYNAME:
4996 4996          case TI_GETPEERNAME:
4997 4997                  break;
4998 4998          default:
4999 4999                  ip_wput_nondata(q, mp);
5000 5000                  return;
5001 5001          }
5002 5002  
5003 5003          switch (mi_copy_state(q, mp, &mp1)) {
5004 5004          case -1:
5005 5005                  return;
5006 5006          case MI_COPY_CASE(MI_COPY_IN, 1):
5007 5007                  break;
5008 5008          case MI_COPY_CASE(MI_COPY_OUT, 1):
5009 5009                  /*
5010 5010                   * The address has been copied out, so now
5011 5011                   * copyout the strbuf.
5012 5012                   */
5013 5013                  mi_copyout(q, mp);
5014 5014                  return;
5015 5015          case MI_COPY_CASE(MI_COPY_OUT, 2):
5016 5016                  /*
5017 5017                   * The address and strbuf have been copied out.
5018 5018                   * We're done, so just acknowledge the original
5019 5019                   * M_IOCTL.
5020 5020                   */
5021 5021                  mi_copy_done(q, mp, 0);
5022 5022                  return;
5023 5023          default:
5024 5024                  /*
5025 5025                   * Something strange has happened, so acknowledge
5026 5026                   * the original M_IOCTL with an EPROTO error.
5027 5027                   */
5028 5028                  mi_copy_done(q, mp, EPROTO);
5029 5029                  return;
5030 5030          }
5031 5031  
5032 5032          /*
5033 5033           * Now we have the strbuf structure for TI_GETMYNAME
5034 5034           * and TI_GETPEERNAME.  Next we copyout the requested
5035 5035           * address and then we'll copyout the strbuf.
5036 5036           */
5037 5037          STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
5038 5038              (void *)mp1->b_rptr);
5039 5039  
5040 5040          if (connp->conn_family == AF_INET)
5041 5041                  addrlen = sizeof (sin_t);
5042 5042          else
5043 5043                  addrlen = sizeof (sin6_t);
5044 5044  
5045 5045          if (STRUCT_FGET(sb, maxlen) < addrlen) {
5046 5046                  mi_copy_done(q, mp, EINVAL);
5047 5047                  return;
5048 5048          }
5049 5049          switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5050 5050          case TI_GETMYNAME:
5051 5051                  break;
5052 5052          case TI_GETPEERNAME:
5053 5053                  if (icmp->icmp_state != TS_DATA_XFER) {
5054 5054                          mi_copy_done(q, mp, ENOTCONN);
5055 5055                          return;
5056 5056                  }
5057 5057                  break;
5058 5058          default:
5059 5059                  mi_copy_done(q, mp, EPROTO);
5060 5060                  return;
5061 5061          }
5062 5062          mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
5063 5063          if (!mp1)
5064 5064                  return;
5065 5065  
5066 5066          STRUCT_FSET(sb, len, addrlen);
5067 5067          switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5068 5068          case TI_GETMYNAME:
5069 5069                  (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
5070 5070                      &addrlen);
5071 5071                  break;
5072 5072          case TI_GETPEERNAME:
5073 5073                  (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
5074 5074                      &addrlen);
5075 5075                  break;
5076 5076          }
5077 5077          mp1->b_wptr += addrlen;
5078 5078          /* Copy out the address */
5079 5079          mi_copyout(q, mp);
5080 5080  }
5081 5081  
5082 5082  void
5083 5083  icmp_ddi_g_init(void)
5084 5084  {
5085 5085          icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
5086 5086              icmp_opt_obj.odb_opt_arr_cnt);
5087 5087  
5088 5088          /*
5089 5089           * We want to be informed each time a stack is created or
5090 5090           * destroyed in the kernel, so we can maintain the
5091 5091           * set of icmp_stack_t's.
5092 5092           */
5093 5093          netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
5094 5094  }
5095 5095  
5096 5096  void
5097 5097  icmp_ddi_g_destroy(void)
5098 5098  {
5099 5099          netstack_unregister(NS_ICMP);
5100 5100  }
5101 5101  
5102 5102  #define INET_NAME       "ip"
5103 5103  
5104 5104  /*
5105 5105   * Initialize the ICMP stack instance.
5106 5106   */
5107 5107  static void *
5108 5108  rawip_stack_init(netstackid_t stackid, netstack_t *ns)
5109 5109  {
5110 5110          icmp_stack_t    *is;
5111 5111          int             error = 0;
5112 5112          size_t          arrsz;
5113 5113          major_t         major;
5114 5114  
5115 5115          is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
5116 5116          is->is_netstack = ns;
5117 5117  
5118 5118          arrsz = sizeof (icmp_propinfo_tbl);
5119 5119          is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
5120 5120          bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz);
5121 5121  
5122 5122          is->is_ksp = rawip_kstat_init(stackid);
5123 5123  
5124 5124          major = mod_name_to_major(INET_NAME);
5125 5125          error = ldi_ident_from_major(major, &is->is_ldi_ident);
5126 5126          ASSERT(error == 0);
5127 5127          return (is);
5128 5128  }
5129 5129  
5130 5130  /*
5131 5131   * Free the ICMP stack instance.
5132 5132   */
5133 5133  static void
5134 5134  rawip_stack_fini(netstackid_t stackid, void *arg)
5135 5135  {
5136 5136          icmp_stack_t *is = (icmp_stack_t *)arg;
5137 5137  
5138 5138          kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl));
5139 5139          is->is_propinfo_tbl = NULL;
5140 5140  
5141 5141          rawip_kstat_fini(stackid, is->is_ksp);
5142 5142          is->is_ksp = NULL;
5143 5143          ldi_ident_release(is->is_ldi_ident);
5144 5144          kmem_free(is, sizeof (*is));
5145 5145  }
5146 5146  
5147 5147  static void *
5148 5148  rawip_kstat_init(netstackid_t stackid)
5149 5149  {
5150 5150          kstat_t *ksp;
5151 5151  
5152 5152          rawip_named_kstat_t template = {
5153 5153                  { "inDatagrams",        KSTAT_DATA_UINT32, 0 },
5154 5154                  { "inCksumErrs",        KSTAT_DATA_UINT32, 0 },
5155 5155                  { "inErrors",           KSTAT_DATA_UINT32, 0 },
5156 5156                  { "outDatagrams",       KSTAT_DATA_UINT32, 0 },
5157 5157                  { "outErrors",          KSTAT_DATA_UINT32, 0 },
5158 5158          };
5159 5159  
5160 5160          ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5161 5161              KSTAT_TYPE_NAMED, NUM_OF_FIELDS(rawip_named_kstat_t), 0, stackid);
5162 5162          if (ksp == NULL || ksp->ks_data == NULL)
5163 5163                  return (NULL);
5164 5164  
5165 5165          bcopy(&template, ksp->ks_data, sizeof (template));
5166 5166          ksp->ks_update = rawip_kstat_update;
5167 5167          ksp->ks_private = (void *)(uintptr_t)stackid;
5168 5168  
5169 5169          kstat_install(ksp);
5170 5170          return (ksp);
5171 5171  }
5172 5172  
5173 5173  static void
5174 5174  rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5175 5175  {
5176 5176          if (ksp != NULL) {
5177 5177                  ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5178 5178                  kstat_delete_netstack(ksp, stackid);
5179 5179          }
5180 5180  }
5181 5181  
5182 5182  static int
5183 5183  rawip_kstat_update(kstat_t *ksp, int rw)
5184 5184  {
5185 5185          rawip_named_kstat_t *rawipkp;
5186 5186          netstackid_t    stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5187 5187          netstack_t      *ns;
5188 5188          icmp_stack_t    *is;
5189 5189  
5190 5190          if ((ksp == NULL) || (ksp->ks_data == NULL))
5191 5191                  return (EIO);
5192 5192  
5193 5193          if (rw == KSTAT_WRITE)
5194 5194                  return (EACCES);
5195 5195  
5196 5196          rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5197 5197  
5198 5198          ns = netstack_find_by_stackid(stackid);
5199 5199          if (ns == NULL)
5200 5200                  return (-1);
5201 5201          is = ns->netstack_icmp;
5202 5202          if (is == NULL) {
5203 5203                  netstack_rele(ns);
5204 5204                  return (-1);
5205 5205          }
5206 5206          rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5207 5207          rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5208 5208          rawipkp->inErrors.value.ui32 =     is->is_rawip_mib.rawipInErrors;
5209 5209          rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5210 5210          rawipkp->outErrors.value.ui32 =    is->is_rawip_mib.rawipOutErrors;
5211 5211          netstack_rele(ns);
5212 5212          return (0);
5213 5213  }
5214 5214  
5215 5215  /* ARGSUSED */
5216 5216  int
5217 5217  rawip_accept(sock_lower_handle_t lproto_handle,
5218 5218      sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5219 5219      cred_t *cr)
5220 5220  {
5221 5221          return (EOPNOTSUPP);
5222 5222  }
5223 5223  
5224 5224  /* ARGSUSED */
5225 5225  int
5226 5226  rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5227 5227      socklen_t len, cred_t *cr)
5228 5228  {
5229 5229          conn_t  *connp = (conn_t *)proto_handle;
5230 5230          int     error;
5231 5231  
5232 5232          /* All Solaris components should pass a cred for this operation. */
5233 5233          ASSERT(cr != NULL);
5234 5234  
5235 5235          /* Binding to a NULL address really means unbind */
5236 5236          if (sa == NULL)
5237 5237                  error = rawip_do_unbind(connp);
5238 5238          else
5239 5239                  error = rawip_do_bind(connp, sa, len);
5240 5240  
5241 5241          if (error < 0) {
5242 5242                  if (error == -TOUTSTATE)
5243 5243                          error = EINVAL;
5244 5244                  else
5245 5245                          error = proto_tlitosyserr(-error);
5246 5246          }
5247 5247          return (error);
5248 5248  }
5249 5249  
5250 5250  static int
5251 5251  rawip_implicit_bind(conn_t *connp)
5252 5252  {
5253 5253          sin6_t sin6addr;
5254 5254          sin_t *sin;
5255 5255          sin6_t *sin6;
5256 5256          socklen_t len;
5257 5257          int error;
5258 5258  
5259 5259          if (connp->conn_family == AF_INET) {
5260 5260                  len = sizeof (struct sockaddr_in);
5261 5261                  sin = (sin_t *)&sin6addr;
5262 5262                  *sin = sin_null;
5263 5263                  sin->sin_family = AF_INET;
5264 5264                  sin->sin_addr.s_addr = INADDR_ANY;
5265 5265          } else {
5266 5266                  ASSERT(connp->conn_family == AF_INET6);
5267 5267                  len = sizeof (sin6_t);
5268 5268                  sin6 = (sin6_t *)&sin6addr;
5269 5269                  *sin6 = sin6_null;
5270 5270                  sin6->sin6_family = AF_INET6;
5271 5271                  V6_SET_ZERO(sin6->sin6_addr);
5272 5272          }
5273 5273  
5274 5274          error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5275 5275  
5276 5276          return ((error < 0) ? proto_tlitosyserr(-error) : error);
5277 5277  }
5278 5278  
5279 5279  static int
5280 5280  rawip_unbind(conn_t *connp)
5281 5281  {
5282 5282          int error;
5283 5283  
5284 5284          error = rawip_do_unbind(connp);
5285 5285          if (error < 0) {
5286 5286                  error = proto_tlitosyserr(-error);
5287 5287          }
5288 5288          return (error);
5289 5289  }
5290 5290  
5291 5291  /* ARGSUSED */
5292 5292  int
5293 5293  rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5294 5294  {
5295 5295          return (EOPNOTSUPP);
5296 5296  }
5297 5297  
5298 5298  int
5299 5299  rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5300 5300      socklen_t len, sock_connid_t *id, cred_t *cr)
5301 5301  {
5302 5302          conn_t  *connp = (conn_t *)proto_handle;
5303 5303          icmp_t *icmp = connp->conn_icmp;
5304 5304          int     error;
5305 5305          boolean_t did_bind = B_FALSE;
5306 5306          pid_t   pid = curproc->p_pid;
5307 5307  
5308 5308          /* All Solaris components should pass a cred for this operation. */
5309 5309          ASSERT(cr != NULL);
5310 5310  
5311 5311          if (sa == NULL) {
5312 5312                  /*
5313 5313                   * Disconnect
5314 5314                   * Make sure we are connected
5315 5315                   */
5316 5316                  if (icmp->icmp_state != TS_DATA_XFER)
5317 5317                          return (EINVAL);
5318 5318  
5319 5319                  error = icmp_disconnect(connp);
5320 5320                  return (error);
5321 5321          }
5322 5322  
5323 5323          error = proto_verify_ip_addr(connp->conn_family, sa, len);
5324 5324          if (error != 0)
5325 5325                  return (error);
5326 5326  
5327 5327          /* do an implicit bind if necessary */
5328 5328          if (icmp->icmp_state == TS_UNBND) {
5329 5329                  error = rawip_implicit_bind(connp);
5330 5330                  /*
5331 5331                   * We could be racing with an actual bind, in which case
5332 5332                   * we would see EPROTO. We cross our fingers and try
5333 5333                   * to connect.
5334 5334                   */
5335 5335                  if (!(error == 0 || error == EPROTO))
5336 5336                          return (error);
5337 5337                  did_bind = B_TRUE;
5338 5338          }
5339 5339  
5340 5340          /*
5341 5341           * set SO_DGRAM_ERRIND
5342 5342           */
5343 5343          connp->conn_dgram_errind = B_TRUE;
5344 5344  
5345 5345          error = rawip_do_connect(connp, sa, len, cr, pid);
5346 5346          if (error != 0 && did_bind) {
5347 5347                  int unbind_err;
5348 5348  
5349 5349                  unbind_err = rawip_unbind(connp);
5350 5350                  ASSERT(unbind_err == 0);
5351 5351          }
5352 5352  
5353 5353          if (error == 0) {
5354 5354                  *id = 0;
5355 5355                  (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
5356 5356                      0, NULL, -1);
5357 5357          } else if (error < 0) {
5358 5358                  error = proto_tlitosyserr(-error);
5359 5359          }
5360 5360          return (error);
5361 5361  }
5362 5362  
5363 5363  /* ARGSUSED2 */
5364 5364  int
5365 5365  rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
5366 5366      boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
5367 5367      sock_quiesce_arg_t *arg)
5368 5368  {
5369 5369          conn_t  *connp = (conn_t *)proto_handle;
5370 5370          icmp_t  *icmp;
5371 5371          struct T_capability_ack tca;
5372 5372          struct sockaddr_in6 laddr, faddr;
5373 5373          socklen_t laddrlen, faddrlen;
5374 5374          short opts;
5375 5375          struct stroptions *stropt;
5376 5376          mblk_t *mp, *stropt_mp;
5377 5377          int error;
5378 5378  
5379 5379          icmp = connp->conn_icmp;
5380 5380  
5381 5381          stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
5382 5382  
5383 5383          /*
5384 5384           * setup the fallback stream that was allocated
5385 5385           */
5386 5386          connp->conn_dev = (dev_t)RD(q)->q_ptr;
5387 5387          connp->conn_minor_arena = WR(q)->q_ptr;
5388 5388  
5389 5389          RD(q)->q_ptr = WR(q)->q_ptr = connp;
5390 5390  
5391 5391          WR(q)->q_qinfo = &icmpwinit;
5392 5392  
5393 5393          connp->conn_rq = RD(q);
5394 5394          connp->conn_wq = WR(q);
5395 5395  
5396 5396          /* Notify stream head about options before sending up data */
5397 5397          stropt_mp->b_datap->db_type = M_SETOPTS;
5398 5398          stropt_mp->b_wptr += sizeof (*stropt);
5399 5399          stropt = (struct stroptions *)stropt_mp->b_rptr;
5400 5400          stropt->so_flags = SO_WROFF | SO_HIWAT;
5401 5401          stropt->so_wroff = connp->conn_wroff;
5402 5402          stropt->so_hiwat = connp->conn_rcvbuf;
5403 5403          putnext(RD(q), stropt_mp);
5404 5404  
5405 5405          /*
5406 5406           * free helper stream
5407 5407           */
5408 5408          ip_free_helper_stream(connp);
5409 5409  
5410 5410          /*
5411 5411           * Collect the information needed to sync with the sonode
5412 5412           */
5413 5413          icmp_do_capability_ack(icmp, &tca, TC1_INFO);
5414 5414  
5415 5415          laddrlen = faddrlen = sizeof (sin6_t);
5416 5416          (void) rawip_getsockname((sock_lower_handle_t)connp,
5417 5417              (struct sockaddr *)&laddr, &laddrlen, CRED());
5418 5418          error = rawip_getpeername((sock_lower_handle_t)connp,
5419 5419              (struct sockaddr *)&faddr, &faddrlen, CRED());
5420 5420          if (error != 0)
5421 5421                  faddrlen = 0;
5422 5422          opts = 0;
5423 5423          if (connp->conn_dgram_errind)
5424 5424                  opts |= SO_DGRAM_ERRIND;
5425 5425          if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
5426 5426                  opts |= SO_DONTROUTE;
5427 5427  
5428 5428          mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
5429 5429              (struct sockaddr *)&laddr, laddrlen,
5430 5430              (struct sockaddr *)&faddr, faddrlen, opts);
5431 5431  
5432 5432          /*
5433 5433           * Attempts to send data up during fallback will result in it being
5434 5434           * queued in icmp_t. Now we push up any queued packets.
5435 5435           */
5436 5436          mutex_enter(&icmp->icmp_recv_lock);
5437 5437          if (mp != NULL) {
5438 5438                  mp->b_next = icmp->icmp_fallback_queue_head;
5439 5439                  icmp->icmp_fallback_queue_head = mp;
5440 5440          }
5441 5441          while (icmp->icmp_fallback_queue_head != NULL) {
5442 5442                  mp = icmp->icmp_fallback_queue_head;
5443 5443                  icmp->icmp_fallback_queue_head = mp->b_next;
5444 5444                  mp->b_next = NULL;
5445 5445                  mutex_exit(&icmp->icmp_recv_lock);
5446 5446                  putnext(RD(q), mp);
5447 5447                  mutex_enter(&icmp->icmp_recv_lock);
5448 5448          }
5449 5449          icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
5450 5450  
5451 5451          /*
5452 5452           * No longer a streams less socket
5453 5453           */
5454 5454          mutex_enter(&connp->conn_lock);
5455 5455          connp->conn_flags &= ~IPCL_NONSTR;
5456 5456          mutex_exit(&connp->conn_lock);
5457 5457  
5458 5458          mutex_exit(&icmp->icmp_recv_lock);
5459 5459  
5460 5460          ASSERT(icmp->icmp_fallback_queue_head == NULL &&
5461 5461              icmp->icmp_fallback_queue_tail == NULL);
5462 5462  
5463 5463          ASSERT(connp->conn_ref >= 1);
5464 5464  
5465 5465          return (0);
5466 5466  }
5467 5467  
5468 5468  /* ARGSUSED2 */
5469 5469  sock_lower_handle_t
5470 5470  rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
5471 5471      uint_t *smodep, int *errorp, int flags, cred_t *credp)
5472 5472  {
5473 5473          conn_t *connp;
5474 5474  
5475 5475          if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
5476 5476                  *errorp = EPROTONOSUPPORT;
5477 5477                  return (NULL);
5478 5478          }
5479 5479  
5480 5480          connp = rawip_do_open(family, credp, errorp, flags);
5481 5481          if (connp != NULL) {
5482 5482                  connp->conn_flags |= IPCL_NONSTR;
5483 5483  
5484 5484                  mutex_enter(&connp->conn_lock);
5485 5485                  connp->conn_state_flags &= ~CONN_INCIPIENT;
5486 5486                  mutex_exit(&connp->conn_lock);
5487 5487                  *sock_downcalls = &sock_rawip_downcalls;
5488 5488                  *smodep = SM_ATOMIC;
5489 5489          } else {
5490 5490                  ASSERT(*errorp != 0);
5491 5491          }
5492 5492  
5493 5493          return ((sock_lower_handle_t)connp);
5494 5494  }
5495 5495  
5496 5496  /* ARGSUSED3 */
5497 5497  void
5498 5498  rawip_activate(sock_lower_handle_t proto_handle,
5499 5499      sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
5500 5500      cred_t *cr)
5501 5501  {
5502 5502          conn_t                  *connp = (conn_t *)proto_handle;
5503 5503          struct sock_proto_props sopp;
5504 5504  
5505 5505          /* All Solaris components should pass a cred for this operation. */
5506 5506          ASSERT(cr != NULL);
5507 5507  
5508 5508          connp->conn_upcalls = sock_upcalls;
5509 5509          connp->conn_upper_handle = sock_handle;
5510 5510  
5511 5511          sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
5512 5512              SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
5513 5513          sopp.sopp_wroff = connp->conn_wroff;
5514 5514          sopp.sopp_rxhiwat = connp->conn_rcvbuf;
5515 5515          sopp.sopp_rxlowat = connp->conn_rcvlowat;
5516 5516          sopp.sopp_maxblk = INFPSZ;
5517 5517          sopp.sopp_maxpsz = IP_MAXPACKET;
5518 5518          sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
5519 5519              icmp_mod_info.mi_minpsz;
5520 5520  
5521 5521          (*connp->conn_upcalls->su_set_proto_props)
5522 5522              (connp->conn_upper_handle, &sopp);
5523 5523  
5524 5524          icmp_bind_proto(connp->conn_icmp);
5525 5525  }
5526 5526  
5527 5527  /* ARGSUSED3 */
5528 5528  int
5529 5529  rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5530 5530      socklen_t *salenp, cred_t *cr)
5531 5531  {
5532 5532          conn_t  *connp = (conn_t *)proto_handle;
5533 5533          icmp_t  *icmp = connp->conn_icmp;
5534 5534          int     error;
5535 5535  
5536 5536          /* All Solaris components should pass a cred for this operation. */
5537 5537          ASSERT(cr != NULL);
5538 5538  
5539 5539          mutex_enter(&connp->conn_lock);
5540 5540          if (icmp->icmp_state != TS_DATA_XFER)
5541 5541                  error = ENOTCONN;
5542 5542          else
5543 5543                  error = conn_getpeername(connp, sa, salenp);
5544 5544          mutex_exit(&connp->conn_lock);
5545 5545          return (error);
5546 5546  }
5547 5547  
5548 5548  /* ARGSUSED3 */
5549 5549  int
5550 5550  rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5551 5551      socklen_t *salenp, cred_t *cr)
5552 5552  {
5553 5553          conn_t  *connp = (conn_t *)proto_handle;
5554 5554          int     error;
5555 5555  
5556 5556          /* All Solaris components should pass a cred for this operation. */
5557 5557          ASSERT(cr != NULL);
5558 5558  
5559 5559          mutex_enter(&connp->conn_lock);
5560 5560          error = conn_getsockname(connp, sa, salenp);
5561 5561          mutex_exit(&connp->conn_lock);
5562 5562          return (error);
5563 5563  }
5564 5564  
5565 5565  int
5566 5566  rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5567 5567      const void *optvalp, socklen_t optlen, cred_t *cr)
5568 5568  {
5569 5569          conn_t  *connp = (conn_t *)proto_handle;
5570 5570          int error;
5571 5571  
5572 5572          /* All Solaris components should pass a cred for this operation. */
5573 5573          ASSERT(cr != NULL);
5574 5574  
5575 5575          error = proto_opt_check(level, option_name, optlen, NULL,
5576 5576              icmp_opt_obj.odb_opt_des_arr,
5577 5577              icmp_opt_obj.odb_opt_arr_cnt,
5578 5578              B_TRUE, B_FALSE, cr);
5579 5579  
5580 5580          if (error != 0) {
5581 5581                  /*
5582 5582                   * option not recognized
5583 5583                   */
5584 5584                  if (error < 0) {
5585 5585                          error = proto_tlitosyserr(-error);
5586 5586                  }
5587 5587                  return (error);
5588 5588          }
5589 5589  
5590 5590          error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
5591 5591              option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
5592 5592              (uchar_t *)optvalp, NULL, cr);
5593 5593  
5594 5594          ASSERT(error >= 0);
5595 5595  
5596 5596          return (error);
5597 5597  }
5598 5598  
5599 5599  int
5600 5600  rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5601 5601      void *optvalp, socklen_t *optlen, cred_t *cr)
5602 5602  {
5603 5603          int             error;
5604 5604          conn_t          *connp = (conn_t *)proto_handle;
5605 5605          t_uscalar_t     max_optbuf_len;
5606 5606          void            *optvalp_buf;
5607 5607          int             len;
5608 5608  
5609 5609          /* All Solaris components should pass a cred for this operation. */
5610 5610          ASSERT(cr != NULL);
5611 5611  
5612 5612          error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
5613 5613              icmp_opt_obj.odb_opt_des_arr,
5614 5614              icmp_opt_obj.odb_opt_arr_cnt,
5615 5615              B_FALSE, B_TRUE, cr);
5616 5616  
5617 5617          if (error != 0) {
5618 5618                  if (error < 0) {
5619 5619                          error = proto_tlitosyserr(-error);
5620 5620                  }
5621 5621                  return (error);
5622 5622          }
5623 5623  
5624 5624          optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
5625 5625          len = icmp_opt_get(connp, level, option_name, optvalp_buf);
5626 5626          if (len == -1) {
5627 5627                  kmem_free(optvalp_buf, max_optbuf_len);
5628 5628                  return (EINVAL);
5629 5629          }
5630 5630  
5631 5631          /*
5632 5632           * update optlen and copy option value
5633 5633           */
5634 5634          t_uscalar_t size = MIN(len, *optlen);
5635 5635  
5636 5636          bcopy(optvalp_buf, optvalp, size);
5637 5637          bcopy(&size, optlen, sizeof (size));
5638 5638  
5639 5639          kmem_free(optvalp_buf, max_optbuf_len);
5640 5640          return (0);
5641 5641  }
5642 5642  
5643 5643  /* ARGSUSED1 */
5644 5644  int
5645 5645  rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
5646 5646  {
5647 5647          conn_t  *connp = (conn_t *)proto_handle;
5648 5648  
5649 5649          /* All Solaris components should pass a cred for this operation. */
5650 5650          ASSERT(cr != NULL);
5651 5651  
5652 5652          (void) rawip_do_close(connp);
5653 5653          return (0);
5654 5654  }
5655 5655  
5656 5656  /* ARGSUSED2 */
5657 5657  int
5658 5658  rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
5659 5659  {
5660 5660          conn_t  *connp = (conn_t *)proto_handle;
5661 5661  
5662 5662          /* All Solaris components should pass a cred for this operation. */
5663 5663          ASSERT(cr != NULL);
5664 5664  
5665 5665          /* shut down the send side */
5666 5666          if (how != SHUT_RD)
5667 5667                  (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5668 5668                      SOCK_OPCTL_SHUT_SEND, 0);
5669 5669          /* shut down the recv side */
5670 5670          if (how != SHUT_WR)
5671 5671                  (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5672 5672                      SOCK_OPCTL_SHUT_RECV, 0);
5673 5673          return (0);
5674 5674  }
5675 5675  
5676 5676  void
5677 5677  rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
5678 5678  {
5679 5679          conn_t  *connp = (conn_t *)proto_handle;
5680 5680          icmp_t  *icmp = connp->conn_icmp;
5681 5681  
5682 5682          mutex_enter(&icmp->icmp_recv_lock);
5683 5683          connp->conn_flow_cntrld = B_FALSE;
5684 5684          mutex_exit(&icmp->icmp_recv_lock);
5685 5685  }
5686 5686  
5687 5687  int
5688 5688  rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
5689 5689      int mode, int32_t *rvalp, cred_t *cr)
5690 5690  {
5691 5691          conn_t          *connp = (conn_t *)proto_handle;
5692 5692          int             error;
5693 5693  
5694 5694          /* All Solaris components should pass a cred for this operation. */
5695 5695          ASSERT(cr != NULL);
5696 5696  
5697 5697          /*
5698 5698           * If we don't have a helper stream then create one.
5699 5699           * ip_create_helper_stream takes care of locking the conn_t,
5700 5700           * so this check for NULL is just a performance optimization.
5701 5701           */
5702 5702          if (connp->conn_helper_info == NULL) {
5703 5703                  icmp_stack_t *is = connp->conn_icmp->icmp_is;
5704 5704  
5705 5705                  ASSERT(is->is_ldi_ident != NULL);
5706 5706  
5707 5707                  /*
5708 5708                   * Create a helper stream for non-STREAMS socket.
5709 5709                   */
5710 5710                  error = ip_create_helper_stream(connp, is->is_ldi_ident);
5711 5711                  if (error != 0) {
5712 5712                          ip0dbg(("rawip_ioctl: create of IP helper stream "
5713 5713                              "failed %d\n", error));
5714 5714                          return (error);
5715 5715                  }
5716 5716          }
5717 5717  
5718 5718          switch (cmd) {
5719 5719          case _SIOCSOCKFALLBACK:
5720 5720          case TI_GETPEERNAME:
5721 5721          case TI_GETMYNAME:
5722 5722  #ifdef DEBUG
5723 5723                  cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
5724 5724                      " socket", cmd);
5725 5725  #endif
5726 5726                  error = EINVAL;
5727 5727                  break;
5728 5728          default:
5729 5729                  /*
5730 5730                   * Pass on to IP using helper stream
5731 5731                   */
5732 5732                  error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
5733 5733                      cmd, arg, mode, cr, rvalp);
5734 5734                  break;
5735 5735          }
5736 5736          return (error);
5737 5737  }
5738 5738  
5739 5739  int
5740 5740  rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5741 5741      cred_t *cr)
5742 5742  {
5743 5743          sin6_t          *sin6;
5744 5744          sin_t           *sin = NULL;
5745 5745          uint_t          srcid;
5746 5746          conn_t          *connp = (conn_t *)proto_handle;
5747 5747          icmp_t          *icmp = connp->conn_icmp;
5748 5748          int             error = 0;
5749 5749          icmp_stack_t    *is = icmp->icmp_is;
5750 5750          pid_t           pid = curproc->p_pid;
5751 5751          ip_xmit_attr_t  *ixa;
5752 5752  
5753 5753          ASSERT(DB_TYPE(mp) == M_DATA);
5754 5754  
5755 5755          /* All Solaris components should pass a cred for this operation. */
5756 5756          ASSERT(cr != NULL);
5757 5757  
5758 5758          /* do an implicit bind if necessary */
5759 5759          if (icmp->icmp_state == TS_UNBND) {
5760 5760                  error = rawip_implicit_bind(connp);
5761 5761                  /*
5762 5762                   * We could be racing with an actual bind, in which case
5763 5763                   * we would see EPROTO. We cross our fingers and try
5764 5764                   * to connect.
5765 5765                   */
5766 5766                  if (!(error == 0 || error == EPROTO)) {
5767 5767                          freemsg(mp);
5768 5768                          return (error);
5769 5769                  }
5770 5770          }
5771 5771  
5772 5772          /* Protocol 255 contains full IP headers */
5773 5773          /* Read without holding lock */
5774 5774          if (icmp->icmp_hdrincl) {
5775 5775                  ASSERT(connp->conn_ipversion == IPV4_VERSION);
5776 5776                  if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
5777 5777                          if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
5778 5778                                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5779 5779                                  freemsg(mp);
5780 5780                                  return (EINVAL);
5781 5781                          }
5782 5782                  }
5783 5783                  error = icmp_output_hdrincl(connp, mp, cr, pid);
5784 5784                  if (is->is_sendto_ignerr)
5785 5785                          return (0);
5786 5786                  else
5787 5787                          return (error);
5788 5788          }
5789 5789  
5790 5790          /* Connected? */
5791 5791          if (msg->msg_name == NULL) {
5792 5792                  if (icmp->icmp_state != TS_DATA_XFER) {
5793 5793                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5794 5794                          return (EDESTADDRREQ);
5795 5795                  }
5796 5796                  if (msg->msg_controllen != 0) {
5797 5797                          error = icmp_output_ancillary(connp, NULL, NULL, mp,
5798 5798                              NULL, msg, cr, pid);
5799 5799                  } else {
5800 5800                          error = icmp_output_connected(connp, mp, cr, pid);
5801 5801                  }
5802 5802                  if (is->is_sendto_ignerr)
5803 5803                          return (0);
5804 5804                  else
5805 5805                          return (error);
5806 5806          }
5807 5807          if (icmp->icmp_state == TS_DATA_XFER) {
5808 5808                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5809 5809                  return (EISCONN);
5810 5810          }
5811 5811          error = proto_verify_ip_addr(connp->conn_family,
5812 5812              (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5813 5813          if (error != 0) {
5814 5814                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5815 5815                  return (error);
5816 5816          }
5817 5817          switch (connp->conn_family) {
5818 5818          case AF_INET6:
5819 5819                  sin6 = (sin6_t *)msg->msg_name;
5820 5820  
5821 5821                  /* No support for mapped addresses on raw sockets */
5822 5822                  if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5823 5823                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5824 5824                          return (EADDRNOTAVAIL);
5825 5825                  }
5826 5826                  srcid = sin6->__sin6_src_id;
5827 5827  
5828 5828                  /*
5829 5829                   * If the local address is a mapped address return
5830 5830                   * an error.
5831 5831                   * It would be possible to send an IPv6 packet but the
5832 5832                   * response would never make it back to the application
5833 5833                   * since it is bound to a mapped address.
5834 5834                   */
5835 5835                  if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5836 5836                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5837 5837                          return (EADDRNOTAVAIL);
5838 5838                  }
5839 5839  
5840 5840                  if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5841 5841                          sin6->sin6_addr = ipv6_loopback;
5842 5842  
5843 5843                  /*
5844 5844                   * We have to allocate an ip_xmit_attr_t before we grab
5845 5845                   * conn_lock and we need to hold conn_lock once we've check
5846 5846                   * conn_same_as_last_v6 to handle concurrent send* calls on a
5847 5847                   * socket.
5848 5848                   */
5849 5849                  if (msg->msg_controllen == 0) {
5850 5850                          ixa = conn_get_ixa(connp, B_FALSE);
5851 5851                          if (ixa == NULL) {
5852 5852                                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5853 5853                                  return (ENOMEM);
5854 5854                          }
5855 5855                  } else {
5856 5856                          ixa = NULL;
5857 5857                  }
5858 5858                  mutex_enter(&connp->conn_lock);
5859 5859                  if (icmp->icmp_delayed_error != 0) {
5860 5860                          sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
5861 5861  
5862 5862                          error = icmp->icmp_delayed_error;
5863 5863                          icmp->icmp_delayed_error = 0;
5864 5864  
5865 5865                          /* Compare IP address and family */
5866 5866  
5867 5867                          if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
5868 5868                              &sin2->sin6_addr) &&
5869 5869                              sin6->sin6_family == sin2->sin6_family) {
5870 5870                                  mutex_exit(&connp->conn_lock);
5871 5871                                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5872 5872                                  if (ixa != NULL)
5873 5873                                          ixa_refrele(ixa);
5874 5874                                  return (error);
5875 5875                          }
5876 5876                  }
5877 5877                  if (msg->msg_controllen != 0) {
5878 5878                          mutex_exit(&connp->conn_lock);
5879 5879                          ASSERT(ixa == NULL);
5880 5880                          error = icmp_output_ancillary(connp, NULL, sin6, mp,
5881 5881                              NULL, msg, cr, pid);
5882 5882                  } else if (conn_same_as_last_v6(connp, sin6) &&
5883 5883                      connp->conn_lastsrcid == srcid &&
5884 5884                      ipsec_outbound_policy_current(ixa)) {
5885 5885                          /* icmp_output_lastdst drops conn_lock */
5886 5886                          error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5887 5887                  } else {
5888 5888                          /* icmp_output_newdst drops conn_lock */
5889 5889                          error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
5890 5890                              pid, ixa);
5891 5891                  }
5892 5892                  ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5893 5893                  if (is->is_sendto_ignerr)
5894 5894                          return (0);
5895 5895                  else
5896 5896                          return (error);
5897 5897          case AF_INET:
5898 5898                  sin = (sin_t *)msg->msg_name;
5899 5899  
5900 5900                  if (sin->sin_addr.s_addr == INADDR_ANY)
5901 5901                          sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
5902 5902  
5903 5903                  /*
5904 5904                   * We have to allocate an ip_xmit_attr_t before we grab
5905 5905                   * conn_lock and we need to hold conn_lock once we've check
5906 5906                   * conn_same_as_last_v6 to handle concurrent send* on a socket.
5907 5907                   */
5908 5908                  if (msg->msg_controllen == 0) {
5909 5909                          ixa = conn_get_ixa(connp, B_FALSE);
5910 5910                          if (ixa == NULL) {
5911 5911                                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5912 5912                                  return (ENOMEM);
5913 5913                          }
5914 5914                  } else {
5915 5915                          ixa = NULL;
5916 5916                  }
5917 5917                  mutex_enter(&connp->conn_lock);
5918 5918                  if (icmp->icmp_delayed_error != 0) {
5919 5919                          sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
5920 5920  
5921 5921                          error = icmp->icmp_delayed_error;
5922 5922                          icmp->icmp_delayed_error = 0;
5923 5923  
5924 5924                          /* Compare IP address */
5925 5925  
5926 5926                          if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
5927 5927                                  mutex_exit(&connp->conn_lock);
5928 5928                                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5929 5929                                  if (ixa != NULL)
5930 5930                                          ixa_refrele(ixa);
5931 5931                                  return (error);
5932 5932                          }
5933 5933                  }
5934 5934  
5935 5935                  if (msg->msg_controllen != 0) {
5936 5936                          mutex_exit(&connp->conn_lock);
5937 5937                          ASSERT(ixa == NULL);
5938 5938                          error = icmp_output_ancillary(connp, sin, NULL, mp,
5939 5939                              NULL, msg, cr, pid);
5940 5940                  } else if (conn_same_as_last_v4(connp, sin) &&
5941 5941                      ipsec_outbound_policy_current(ixa)) {
5942 5942                          /* icmp_output_lastdst drops conn_lock */
5943 5943                          error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5944 5944                  } else {
5945 5945                          /* icmp_output_newdst drops conn_lock */
5946 5946                          error = icmp_output_newdst(connp, mp, sin, NULL, cr,
5947 5947                              pid, ixa);
5948 5948                  }
5949 5949                  ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5950 5950                  if (is->is_sendto_ignerr)
5951 5951                          return (0);
5952 5952                  else
5953 5953                          return (error);
5954 5954          default:
5955 5955                  return (EINVAL);
5956 5956          }
5957 5957  }
5958 5958  
5959 5959  sock_downcalls_t sock_rawip_downcalls = {
5960 5960          rawip_activate,
5961 5961          rawip_accept,
5962 5962          rawip_bind,
5963 5963          rawip_listen,
5964 5964          rawip_connect,
5965 5965          rawip_getpeername,
5966 5966          rawip_getsockname,
5967 5967          rawip_getsockopt,
5968 5968          rawip_setsockopt,
5969 5969          rawip_send,
5970 5970          NULL,
5971 5971          NULL,
5972 5972          NULL,
5973 5973          rawip_shutdown,
5974 5974          rawip_clr_flowctrl,
5975 5975          rawip_ioctl,
5976 5976          rawip_close
5977 5977  };
  
    | 
      ↓ open down ↓ | 
    5977 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX