Print this page
    
OS-5007 support SO_ATTACH_FILTER on ICMP sockets
Reviewed by: Cody Mello <melloc@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/ip/icmp.c
          +++ new/usr/src/uts/common/inet/ip/icmp.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  
    | 
      ↓ open down ↓ | 
    14 lines elided | 
    
      ↑ open up ↑ | 
  
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
  24   24   * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
       25 + * Copyright 2016 Joyent, Inc.
  25   26   */
  26   27  /* Copyright (c) 1990 Mentat Inc. */
  27   28  
  28   29  #include <sys/types.h>
  29   30  #include <sys/stream.h>
  30   31  #include <sys/stropts.h>
  31   32  #include <sys/strlog.h>
  32   33  #include <sys/strsun.h>
  33   34  #define _SUN_TPI_VERSION 2
  34   35  #include <sys/tihdr.h>
  35   36  #include <sys/timod.h>
  36   37  #include <sys/ddi.h>
  37   38  #include <sys/sunddi.h>
  38   39  #include <sys/strsubr.h>
  39   40  #include <sys/suntpi.h>
  40   41  #include <sys/xti_inet.h>
  41   42  #include <sys/cmn_err.h>
  42   43  #include <sys/kmem.h>
  43   44  #include <sys/cred.h>
  44   45  #include <sys/policy.h>
  45   46  #include <sys/priv.h>
  46   47  #include <sys/ucred.h>
  47   48  #include <sys/zone.h>
  48   49  
  49   50  #include <sys/sockio.h>
  50   51  #include <sys/socket.h>
  51   52  #include <sys/socketvar.h>
  52   53  #include <sys/vtrace.h>
  53   54  #include <sys/sdt.h>
  54   55  #include <sys/debug.h>
  55   56  #include <sys/isa_defs.h>
  56   57  #include <sys/random.h>
  57   58  #include <netinet/in.h>
  58   59  #include <netinet/ip6.h>
  59   60  #include <netinet/icmp6.h>
  60   61  #include <netinet/udp.h>
  61   62  
  62   63  #include <inet/common.h>
  63   64  #include <inet/ip.h>
  64   65  #include <inet/ip_impl.h>
  65   66  #include <inet/ipsec_impl.h>
  66   67  #include <inet/ip6.h>
  67   68  #include <inet/ip_ire.h>
  68   69  #include <inet/ip_if.h>
  69   70  #include <inet/ip_multi.h>
  70   71  #include <inet/ip_ndp.h>
  71   72  #include <inet/proto_set.h>
  72   73  #include <inet/mib2.h>
  
    | 
      ↓ open down ↓ | 
    38 lines elided | 
    
      ↑ open up ↑ | 
  
  73   74  #include <inet/nd.h>
  74   75  #include <inet/optcom.h>
  75   76  #include <inet/snmpcom.h>
  76   77  #include <inet/kstatcom.h>
  77   78  #include <inet/ipclassifier.h>
  78   79  
  79   80  #include <sys/tsol/label.h>
  80   81  #include <sys/tsol/tnet.h>
  81   82  
  82   83  #include <inet/rawip_impl.h>
       84 +#include <net/bpf.h>
  83   85  
  84   86  #include <sys/disp.h>
  85   87  
  86   88  /*
  87   89   * Synchronization notes:
  88   90   *
  89   91   * RAWIP is MT and uses the usual kernel synchronization primitives. We use
  90   92   * conn_lock to protect the icmp_t.
  91   93   *
  92   94   * Plumbing notes:
  93   95   * ICMP is always a device driver. For compatibility with mibopen() code
  94   96   * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
  95   97   * dummy module.
  96   98   */
  97   99  static void     icmp_addr_req(queue_t *q, mblk_t *mp);
  98  100  static void     icmp_tpi_bind(queue_t *q, mblk_t *mp);
  99  101  static void     icmp_bind_proto(icmp_t *icmp);
 100  102  static int      icmp_build_hdr_template(conn_t *, const in6_addr_t *,
 101  103      const in6_addr_t *, uint32_t);
 102  104  static void     icmp_capability_req(queue_t *q, mblk_t *mp);
 103  105  static int      icmp_close(queue_t *q, int flags);
 104  106  static void     icmp_close_free(conn_t *);
 105  107  static void     icmp_tpi_connect(queue_t *q, mblk_t *mp);
 106  108  static void     icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
 107  109  static void     icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
 108  110      int sys_error);
 109  111  static void     icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
 110  112      t_scalar_t tlierr, int sys_error);
 111  113  static void     icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
 112  114      ip_recv_attr_t *);
 113  115  static void     icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
 114  116      ip_recv_attr_t *);
 115  117  static void     icmp_info_req(queue_t *q, mblk_t *mp);
 116  118  static void     icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 117  119  static conn_t   *icmp_open(int family, cred_t *credp, int *err, int flags);
 118  120  static int      icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
 119  121                      cred_t *credp);
 120  122  static int      icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
 121  123                      cred_t *credp);
 122  124  static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
 123  125  int             icmp_opt_set(conn_t *connp, uint_t optset_context,
 124  126                      int level, int name, uint_t inlen,
 125  127                      uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
 126  128                      void *thisdg_attrs, cred_t *cr);
 127  129  int             icmp_opt_get(conn_t *connp, int level, int name,
 128  130                      uchar_t *ptr);
 129  131  static int      icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
 130  132                      sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
 131  133  static mblk_t   *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
 132  134      const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
 133  135  static mblk_t   *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
 134  136      mblk_t *, const in6_addr_t *, uint32_t, int *);
 135  137  static int      icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
 136  138                      uchar_t *ptr, int len);
 137  139  static void     icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
 138  140  static void     icmp_tpi_unbind(queue_t *q, mblk_t *mp);
 139  141  static void     icmp_wput(queue_t *q, mblk_t *mp);
 140  142  static void     icmp_wput_fallback(queue_t *q, mblk_t *mp);
 141  143  static void     icmp_wput_other(queue_t *q, mblk_t *mp);
 142  144  static void     icmp_wput_iocdata(queue_t *q, mblk_t *mp);
 143  145  static void     icmp_wput_restricted(queue_t *q, mblk_t *mp);
 144  146  static void     icmp_ulp_recv(conn_t *, mblk_t *, uint_t);
 145  147  
 146  148  static void     *rawip_stack_init(netstackid_t stackid, netstack_t *ns);
 147  149  static void     rawip_stack_fini(netstackid_t stackid, void *arg);
 148  150  
 149  151  static void     *rawip_kstat_init(netstackid_t stackid);
 150  152  static void     rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
 151  153  static int      rawip_kstat_update(kstat_t *kp, int rw);
 152  154  static void     rawip_stack_shutdown(netstackid_t stackid, void *arg);
 153  155  
 154  156  /* Common routines for TPI and socket module */
 155  157  static conn_t   *rawip_do_open(int, cred_t *, int *, int);
 156  158  static void     rawip_do_close(conn_t *);
 157  159  static int      rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
 158  160  static int      rawip_do_unbind(conn_t *);
 159  161  static int      rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
 160  162      cred_t *, pid_t);
 161  163  
 162  164  int             rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
 163  165                      socklen_t *, cred_t *);
 164  166  int             rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
 165  167                      socklen_t *, cred_t *);
 166  168  
 167  169  static struct module_info icmp_mod_info =  {
 168  170          5707, "icmp", 1, INFPSZ, 512, 128
 169  171  };
 170  172  
 171  173  /*
 172  174   * Entry points for ICMP as a device.
 173  175   * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 174  176   */
 175  177  static struct qinit icmprinitv4 = {
 176  178          NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
 177  179  };
 178  180  
 179  181  static struct qinit icmprinitv6 = {
 180  182          NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
 181  183  };
 182  184  
 183  185  static struct qinit icmpwinit = {
 184  186          (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
 185  187  };
 186  188  
 187  189  /* ICMP entry point during fallback */
 188  190  static struct qinit icmp_fallback_sock_winit = {
 189  191          (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
 190  192  };
 191  193  
 192  194  /* For AF_INET aka /dev/icmp */
 193  195  struct streamtab icmpinfov4 = {
 194  196          &icmprinitv4, &icmpwinit
 195  197  };
 196  198  
 197  199  /* For AF_INET6 aka /dev/icmp6 */
 198  200  struct streamtab icmpinfov6 = {
 199  201          &icmprinitv6, &icmpwinit
 200  202  };
 201  203  
 202  204  /* Default structure copied into T_INFO_ACK messages */
 203  205  static struct T_info_ack icmp_g_t_info_ack = {
 204  206          T_INFO_ACK,
 205  207          IP_MAXPACKET,    /* TSDU_size.  icmp allows maximum size messages. */
 206  208          T_INVALID,      /* ETSDU_size.  icmp does not support expedited data. */
 207  209          T_INVALID,      /* CDATA_size. icmp does not support connect data. */
 208  210          T_INVALID,      /* DDATA_size. icmp does not support disconnect data. */
 209  211          0,              /* ADDR_size - filled in later. */
 210  212          0,              /* OPT_size - not initialized here */
 211  213          IP_MAXPACKET,   /* TIDU_size.  icmp allows maximum size messages. */
 212  214          T_CLTS,         /* SERV_type.  icmp supports connection-less. */
 213  215          TS_UNBND,       /* CURRENT_state.  This is set from icmp_state. */
 214  216          (XPG4_1|SENDZERO) /* PROVIDER_flag */
 215  217  };
 216  218  
 217  219  static int
 218  220  icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo,
 219  221      const char *ifname, const void *pval, uint_t flags)
 220  222  {
 221  223          return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl,
 222  224              stack, cr, pinfo, ifname, pval, flags));
 223  225  }
 224  226  
 225  227  static int
 226  228  icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname,
 227  229      void *val, uint_t psize, uint_t flags)
 228  230  {
 229  231          return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack,
 230  232              pinfo, ifname, val, psize, flags));
 231  233  }
 232  234  
 233  235  /*
 234  236   * All of these are alterable, within the min/max values given, at run time.
 235  237   *
 236  238   * Note: All those tunables which do not start with "icmp_" are Committed and
 237  239   * therefore are public. See PSARC 2010/080.
 238  240   */
 239  241  static mod_prop_info_t icmp_propinfo_tbl[] = {
 240  242          /* tunable - 0 */
 241  243          { "_wroff_extra", MOD_PROTO_RAWIP,
 242  244              mod_set_uint32, mod_get_uint32,
 243  245              {0, 128, 32}, {32} },
 244  246  
 245  247          { "_ipv4_ttl", MOD_PROTO_RAWIP,
 246  248              mod_set_uint32, mod_get_uint32,
 247  249              {1, 255, 255}, {255} },
 248  250  
 249  251          { "_ipv6_hoplimit", MOD_PROTO_RAWIP,
 250  252              mod_set_uint32, mod_get_uint32,
 251  253              {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
 252  254              {IPV6_DEFAULT_HOPS} },
 253  255  
 254  256          { "_bsd_compat", MOD_PROTO_RAWIP,
 255  257              mod_set_boolean, mod_get_boolean,
 256  258              {B_TRUE}, {B_TRUE} },
 257  259  
 258  260          { "send_buf", MOD_PROTO_RAWIP,
 259  261              icmp_set_buf_prop, icmp_get_buf_prop,
 260  262              {4096, 65536, 8192}, {8192} },
 261  263  
 262  264          { "_xmit_lowat", MOD_PROTO_RAWIP,
 263  265              mod_set_uint32, mod_get_uint32,
 264  266              {0, 65536, 1024}, {1024} },
 265  267  
 266  268          { "recv_buf", MOD_PROTO_RAWIP,
 267  269              icmp_set_buf_prop, icmp_get_buf_prop,
 268  270              {4096, 65536, 8192}, {8192} },
 269  271  
 270  272          { "max_buf", MOD_PROTO_RAWIP,
 271  273              mod_set_uint32, mod_get_uint32,
 272  274              {65536, ULP_MAX_BUF, 256*1024}, {256*1024} },
 273  275  
 274  276          { "_pmtu_discovery", MOD_PROTO_RAWIP,
 275  277              mod_set_boolean, mod_get_boolean,
 276  278              {B_FALSE}, {B_FALSE} },
 277  279  
 278  280          { "_sendto_ignerr", MOD_PROTO_RAWIP,
 279  281              mod_set_boolean, mod_get_boolean,
 280  282              {B_FALSE}, {B_FALSE} },
 281  283  
 282  284          { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },
 283  285  
 284  286          { NULL, 0, NULL, NULL, {0}, {0} }
 285  287  };
 286  288  
 287  289  #define is_wroff_extra                  is_propinfo_tbl[0].prop_cur_uval
 288  290  #define is_ipv4_ttl                     is_propinfo_tbl[1].prop_cur_uval
 289  291  #define is_ipv6_hoplimit                is_propinfo_tbl[2].prop_cur_uval
 290  292  #define is_bsd_compat                   is_propinfo_tbl[3].prop_cur_bval
 291  293  #define is_xmit_hiwat                   is_propinfo_tbl[4].prop_cur_uval
 292  294  #define is_xmit_lowat                   is_propinfo_tbl[5].prop_cur_uval
 293  295  #define is_recv_hiwat                   is_propinfo_tbl[6].prop_cur_uval
 294  296  #define is_max_buf                      is_propinfo_tbl[7].prop_cur_uval
 295  297  #define is_pmtu_discovery               is_propinfo_tbl[8].prop_cur_bval
 296  298  #define is_sendto_ignerr                is_propinfo_tbl[9].prop_cur_bval
 297  299  
 298  300  typedef union T_primitives *t_primp_t;
 299  301  
 300  302  /*
 301  303   * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
 302  304   * passed to icmp_wput.
 303  305   * It calls IP to verify the local IP address, and calls IP to insert
 304  306   * the conn_t in the fanout table.
 305  307   * If everything is ok it then sends the T_BIND_ACK back up.
 306  308   */
 307  309  static void
 308  310  icmp_tpi_bind(queue_t *q, mblk_t *mp)
 309  311  {
 310  312          int     error;
 311  313          struct sockaddr *sa;
 312  314          struct T_bind_req *tbr;
 313  315          socklen_t       len;
 314  316          sin_t   *sin;
 315  317          sin6_t  *sin6;
 316  318          icmp_t          *icmp;
 317  319          conn_t  *connp = Q_TO_CONN(q);
 318  320          mblk_t *mp1;
 319  321          cred_t *cr;
 320  322  
 321  323          /*
 322  324           * All Solaris components should pass a db_credp
 323  325           * for this TPI message, hence we ASSERT.
 324  326           * But in case there is some other M_PROTO that looks
 325  327           * like a TPI message sent by some other kernel
 326  328           * component, we check and return an error.
 327  329           */
 328  330          cr = msg_getcred(mp, NULL);
 329  331          ASSERT(cr != NULL);
 330  332          if (cr == NULL) {
 331  333                  icmp_err_ack(q, mp, TSYSERR, EINVAL);
 332  334                  return;
 333  335          }
 334  336  
 335  337          icmp = connp->conn_icmp;
 336  338          if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
 337  339                  (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 338  340                      "icmp_bind: bad req, len %u",
 339  341                      (uint_t)(mp->b_wptr - mp->b_rptr));
 340  342                  icmp_err_ack(q, mp, TPROTO, 0);
 341  343                  return;
 342  344          }
 343  345  
 344  346          if (icmp->icmp_state != TS_UNBND) {
 345  347                  (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 346  348                      "icmp_bind: bad state, %u", icmp->icmp_state);
 347  349                  icmp_err_ack(q, mp, TOUTSTATE, 0);
 348  350                  return;
 349  351          }
 350  352  
 351  353          /*
 352  354           * Reallocate the message to make sure we have enough room for an
 353  355           * address.
 354  356           */
 355  357          mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
 356  358          if (mp1 == NULL) {
 357  359                  icmp_err_ack(q, mp, TSYSERR, ENOMEM);
 358  360                  return;
 359  361          }
 360  362          mp = mp1;
 361  363  
 362  364          /* Reset the message type in preparation for shipping it back. */
 363  365          DB_TYPE(mp) = M_PCPROTO;
 364  366          tbr = (struct T_bind_req *)mp->b_rptr;
 365  367          len = tbr->ADDR_length;
 366  368          switch (len) {
 367  369          case 0: /* request for a generic port */
 368  370                  tbr->ADDR_offset = sizeof (struct T_bind_req);
 369  371                  if (connp->conn_family == AF_INET) {
 370  372                          tbr->ADDR_length = sizeof (sin_t);
 371  373                          sin = (sin_t *)&tbr[1];
 372  374                          *sin = sin_null;
 373  375                          sin->sin_family = AF_INET;
 374  376                          mp->b_wptr = (uchar_t *)&sin[1];
 375  377                          sa = (struct sockaddr *)sin;
 376  378                          len = sizeof (sin_t);
 377  379                  } else {
 378  380                          ASSERT(connp->conn_family == AF_INET6);
 379  381                          tbr->ADDR_length = sizeof (sin6_t);
 380  382                          sin6 = (sin6_t *)&tbr[1];
 381  383                          *sin6 = sin6_null;
 382  384                          sin6->sin6_family = AF_INET6;
 383  385                          mp->b_wptr = (uchar_t *)&sin6[1];
 384  386                          sa = (struct sockaddr *)sin6;
 385  387                          len = sizeof (sin6_t);
 386  388                  }
 387  389                  break;
 388  390  
 389  391          case sizeof (sin_t):    /* Complete IPv4 address */
 390  392                  sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
 391  393                      sizeof (sin_t));
 392  394                  break;
 393  395  
 394  396          case sizeof (sin6_t):   /* Complete IPv6 address */
 395  397                  sa = (struct sockaddr *)mi_offset_param(mp,
 396  398                      tbr->ADDR_offset, sizeof (sin6_t));
 397  399                  break;
 398  400  
 399  401          default:
 400  402                  (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 401  403                      "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
 402  404                  icmp_err_ack(q, mp, TBADADDR, 0);
 403  405                  return;
 404  406          }
 405  407  
 406  408          error = rawip_do_bind(connp, sa, len);
 407  409          if (error != 0) {
 408  410                  if (error > 0) {
 409  411                          icmp_err_ack(q, mp, TSYSERR, error);
 410  412                  } else {
 411  413                          icmp_err_ack(q, mp, -error, 0);
 412  414                  }
 413  415          } else {
 414  416                  tbr->PRIM_type = T_BIND_ACK;
 415  417                  qreply(q, mp);
 416  418          }
 417  419  }
 418  420  
 419  421  static int
 420  422  rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
 421  423  {
 422  424          sin_t           *sin;
 423  425          sin6_t          *sin6;
 424  426          icmp_t          *icmp = connp->conn_icmp;
 425  427          int             error = 0;
 426  428          ip_laddr_t      laddr_type = IPVL_UNICAST_UP;   /* INADDR_ANY */
 427  429          in_port_t       lport;          /* Network byte order */
 428  430          ipaddr_t        v4src;          /* Set if AF_INET */
 429  431          in6_addr_t      v6src;
 430  432          uint_t          scopeid = 0;
 431  433          zoneid_t        zoneid = IPCL_ZONEID(connp);
 432  434          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 433  435  
 434  436          if (sa == NULL || !OK_32PTR((char *)sa)) {
 435  437                  return (EINVAL);
 436  438          }
 437  439  
 438  440          switch (len) {
 439  441          case sizeof (sin_t):    /* Complete IPv4 address */
 440  442                  sin = (sin_t *)sa;
 441  443                  if (sin->sin_family != AF_INET ||
 442  444                      connp->conn_family != AF_INET) {
 443  445                          /* TSYSERR, EAFNOSUPPORT */
 444  446                          return (EAFNOSUPPORT);
 445  447                  }
 446  448                  v4src = sin->sin_addr.s_addr;
 447  449                  IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
 448  450                  if (v4src != INADDR_ANY) {
 449  451                          laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
 450  452                              B_TRUE);
 451  453                  }
 452  454                  lport = sin->sin_port;
 453  455                  break;
 454  456          case sizeof (sin6_t): /* Complete IPv6 address */
 455  457                  sin6 = (sin6_t *)sa;
 456  458                  if (sin6->sin6_family != AF_INET6 ||
 457  459                      connp->conn_family != AF_INET6) {
 458  460                          /* TSYSERR, EAFNOSUPPORT */
 459  461                          return (EAFNOSUPPORT);
 460  462                  }
 461  463                  /* No support for mapped addresses on raw sockets */
 462  464                  if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 463  465                          /* TSYSERR, EADDRNOTAVAIL */
 464  466                          return (EADDRNOTAVAIL);
 465  467                  }
 466  468                  v6src = sin6->sin6_addr;
 467  469                  if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
 468  470                          if (IN6_IS_ADDR_LINKSCOPE(&v6src))
 469  471                                  scopeid = sin6->sin6_scope_id;
 470  472                          laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
 471  473                              B_TRUE, scopeid);
 472  474                  }
 473  475                  lport = sin6->sin6_port;
 474  476                  break;
 475  477  
 476  478          default:
 477  479                  /* TBADADDR */
 478  480                  return (EADDRNOTAVAIL);
 479  481          }
 480  482  
 481  483          /* Is the local address a valid unicast, multicast, or broadcast? */
 482  484          if (laddr_type == IPVL_BAD)
 483  485                  return (EADDRNOTAVAIL);
 484  486  
 485  487          /*
 486  488           * The state must be TS_UNBND.
 487  489           */
 488  490          mutex_enter(&connp->conn_lock);
 489  491          if (icmp->icmp_state != TS_UNBND) {
 490  492                  mutex_exit(&connp->conn_lock);
 491  493                  return (-TOUTSTATE);
 492  494          }
 493  495  
 494  496          /*
 495  497           * Copy the source address into our icmp structure.  This address
 496  498           * may still be zero; if so, ip will fill in the correct address
 497  499           * each time an outbound packet is passed to it.
 498  500           * If we are binding to a broadcast or multicast address then
 499  501           * we just set the conn_bound_addr since we don't want to use
 500  502           * that as the source address when sending.
 501  503           */
 502  504          connp->conn_bound_addr_v6 = v6src;
 503  505          connp->conn_laddr_v6 = v6src;
 504  506          if (scopeid != 0) {
 505  507                  connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
 506  508                  connp->conn_ixa->ixa_scopeid = scopeid;
 507  509                  connp->conn_incoming_ifindex = scopeid;
 508  510          } else {
 509  511                  connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 510  512                  connp->conn_incoming_ifindex = connp->conn_bound_if;
 511  513          }
 512  514  
 513  515          switch (laddr_type) {
 514  516          case IPVL_UNICAST_UP:
 515  517          case IPVL_UNICAST_DOWN:
 516  518                  connp->conn_saddr_v6 = v6src;
 517  519                  connp->conn_mcbc_bind = B_FALSE;
 518  520                  break;
 519  521          case IPVL_MCAST:
 520  522          case IPVL_BCAST:
 521  523                  /* ip_set_destination will pick a source address later */
 522  524                  connp->conn_saddr_v6 = ipv6_all_zeros;
 523  525                  connp->conn_mcbc_bind = B_TRUE;
 524  526                  break;
 525  527          }
 526  528  
 527  529          /* Any errors after this point should use late_error */
 528  530  
 529  531          /*
 530  532           * Use sin_port/sin6_port since applications like psh use SOCK_RAW
 531  533           * with IPPROTO_TCP.
 532  534           */
 533  535          connp->conn_lport = lport;
 534  536          connp->conn_fport = 0;
 535  537  
 536  538          if (connp->conn_family == AF_INET) {
 537  539                  ASSERT(connp->conn_ipversion == IPV4_VERSION);
 538  540          } else {
 539  541                  ASSERT(connp->conn_ipversion == IPV6_VERSION);
 540  542          }
 541  543  
 542  544          icmp->icmp_state = TS_IDLE;
 543  545  
 544  546          /*
 545  547           * We create an initial header template here to make a subsequent
 546  548           * sendto have a starting point. Since conn_last_dst is zero the
 547  549           * first sendto will always follow the 'dst changed' code path.
 548  550           * Note that we defer massaging options and the related checksum
 549  551           * adjustment until we have a destination address.
 550  552           */
 551  553          error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 552  554              &connp->conn_faddr_v6, connp->conn_flowinfo);
 553  555          if (error != 0) {
 554  556                  mutex_exit(&connp->conn_lock);
 555  557                  goto late_error;
 556  558          }
 557  559          /* Just in case */
 558  560          connp->conn_faddr_v6 = ipv6_all_zeros;
 559  561          connp->conn_v6lastdst = ipv6_all_zeros;
 560  562          mutex_exit(&connp->conn_lock);
 561  563  
 562  564          error = ip_laddr_fanout_insert(connp);
 563  565          if (error != 0)
 564  566                  goto late_error;
 565  567  
 566  568          /* Bind succeeded */
 567  569          return (0);
 568  570  
 569  571  late_error:
 570  572          mutex_enter(&connp->conn_lock);
 571  573          connp->conn_saddr_v6 = ipv6_all_zeros;
 572  574          connp->conn_bound_addr_v6 = ipv6_all_zeros;
 573  575          connp->conn_laddr_v6 = ipv6_all_zeros;
 574  576          if (scopeid != 0) {
 575  577                  connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 576  578                  connp->conn_incoming_ifindex = connp->conn_bound_if;
 577  579          }
 578  580          icmp->icmp_state = TS_UNBND;
 579  581          connp->conn_v6lastdst = ipv6_all_zeros;
 580  582          connp->conn_lport = 0;
 581  583  
 582  584          /* Restore the header that was built above - different source address */
 583  585          (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 584  586              &connp->conn_faddr_v6, connp->conn_flowinfo);
 585  587          mutex_exit(&connp->conn_lock);
 586  588          return (error);
 587  589  }
 588  590  
 589  591  /*
 590  592   * Tell IP to just bind to the protocol.
 591  593   */
 592  594  static void
 593  595  icmp_bind_proto(icmp_t *icmp)
 594  596  {
 595  597          conn_t  *connp = icmp->icmp_connp;
 596  598  
 597  599          mutex_enter(&connp->conn_lock);
 598  600          connp->conn_saddr_v6 = ipv6_all_zeros;
 599  601          connp->conn_laddr_v6 = ipv6_all_zeros;
 600  602          connp->conn_faddr_v6 = ipv6_all_zeros;
 601  603          connp->conn_v6lastdst = ipv6_all_zeros;
 602  604          mutex_exit(&connp->conn_lock);
 603  605  
 604  606          (void) ip_laddr_fanout_insert(connp);
 605  607  }
 606  608  
 607  609  /*
 608  610   * This routine handles each T_CONN_REQ message passed to icmp.  It
 609  611   * associates a default destination address with the stream.
 610  612   *
 611  613   * After various error checks are completed, icmp_connect() lays
 612  614   * the target address and port into the composite header template.
 613  615   * Then we ask IP for information, including a source address if we didn't
 614  616   * already have one. Finally we send up the T_OK_ACK reply message.
 615  617   */
 616  618  static void
 617  619  icmp_tpi_connect(queue_t *q, mblk_t *mp)
 618  620  {
 619  621          conn_t  *connp = Q_TO_CONN(q);
 620  622          struct T_conn_req       *tcr;
 621  623          struct sockaddr *sa;
 622  624          socklen_t len;
 623  625          int error;
 624  626          cred_t *cr;
 625  627          pid_t pid;
 626  628          /*
 627  629           * All Solaris components should pass a db_credp
 628  630           * for this TPI message, hence we ASSERT.
 629  631           * But in case there is some other M_PROTO that looks
 630  632           * like a TPI message sent by some other kernel
 631  633           * component, we check and return an error.
 632  634           */
 633  635          cr = msg_getcred(mp, &pid);
 634  636          ASSERT(cr != NULL);
 635  637          if (cr == NULL) {
 636  638                  icmp_err_ack(q, mp, TSYSERR, EINVAL);
 637  639                  return;
 638  640          }
 639  641  
 640  642          tcr = (struct T_conn_req *)mp->b_rptr;
 641  643          /* Sanity checks */
 642  644          if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
 643  645                  icmp_err_ack(q, mp, TPROTO, 0);
 644  646                  return;
 645  647          }
 646  648  
 647  649          if (tcr->OPT_length != 0) {
 648  650                  icmp_err_ack(q, mp, TBADOPT, 0);
 649  651                  return;
 650  652          }
 651  653  
 652  654          len = tcr->DEST_length;
 653  655  
 654  656          switch (len) {
 655  657          default:
 656  658                  icmp_err_ack(q, mp, TBADADDR, 0);
 657  659                  return;
 658  660          case sizeof (sin_t):
 659  661                  sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
 660  662                      sizeof (sin_t));
 661  663                  break;
 662  664          case sizeof (sin6_t):
 663  665                  sa = (struct sockaddr *)mi_offset_param(mp,
 664  666                      tcr->DEST_offset, sizeof (sin6_t));
 665  667                  break;
 666  668          }
 667  669  
 668  670          error = proto_verify_ip_addr(connp->conn_family, sa, len);
 669  671          if (error != 0) {
 670  672                  icmp_err_ack(q, mp, TSYSERR, error);
 671  673                  return;
 672  674          }
 673  675  
 674  676          error = rawip_do_connect(connp, sa, len, cr, pid);
 675  677          if (error != 0) {
 676  678                  if (error < 0) {
 677  679                          icmp_err_ack(q, mp, -error, 0);
 678  680                  } else {
 679  681                          icmp_err_ack(q, mp, 0, error);
 680  682                  }
 681  683          } else {
 682  684                  mblk_t *mp1;
 683  685  
 684  686                  /*
 685  687                   * We have to send a connection confirmation to
 686  688                   * keep TLI happy.
 687  689                   */
 688  690                  if (connp->conn_family == AF_INET) {
 689  691                          mp1 = mi_tpi_conn_con(NULL, (char *)sa,
 690  692                              sizeof (sin_t), NULL, 0);
 691  693                  } else {
 692  694                          ASSERT(connp->conn_family == AF_INET6);
 693  695                          mp1 = mi_tpi_conn_con(NULL, (char *)sa,
 694  696                              sizeof (sin6_t), NULL, 0);
 695  697                  }
 696  698                  if (mp1 == NULL) {
 697  699                          icmp_err_ack(q, mp, TSYSERR, ENOMEM);
 698  700                          return;
 699  701                  }
 700  702  
 701  703                  /*
 702  704                   * Send ok_ack for T_CONN_REQ
 703  705                   */
 704  706                  mp = mi_tpi_ok_ack_alloc(mp);
 705  707                  if (mp == NULL) {
 706  708                          /* Unable to reuse the T_CONN_REQ for the ack. */
 707  709                          icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
 708  710                          return;
 709  711                  }
 710  712                  putnext(connp->conn_rq, mp);
 711  713                  putnext(connp->conn_rq, mp1);
 712  714          }
 713  715  }
 714  716  
           /*
            * Connect the raw IP endpoint `connp' to the destination held in `sa'.
            *
            * `len' selects the sockaddr layout: sizeof (sin_t) for IPv4 and
            * sizeof (sin6_t) for IPv6.  The switch below has no default case, so
            * the caller is expected to have validated `len' already (the TPI
            * caller icmp_tpi_connect() rejects every other length).
            *
            * Returns 0 on success.  Failures are reported either as a positive
            * system error (EINVAL, EADDRNOTAVAIL, ENOMEM, or whatever
            * conn_connect(), ipcl_conn_insert() or icmp_build_hdr_template()
            * returned) or as the negated TLI error -TOUTSTATE; callers tell the
            * two apart by sign.
            *
            * NOTE: although `sa' is const-qualified, an unspecified destination is
            * rewritten to loopback through the sin/sin6 casts below, so the
            * caller's buffer can be modified.
            */
 715  717  static int
 716  718  rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
 717  719      cred_t *cr, pid_t pid)
 718  720  {
 719  721          icmp_t          *icmp;
 720  722          sin_t           *sin;
 721  723          sin6_t          *sin6;
 722  724          int             error;
 723  725          uint16_t        dstport;
 724  726          ipaddr_t        v4dst;
 725  727          in6_addr_t      v6dst;
 726  728          uint32_t        flowinfo;
 727  729          ip_xmit_attr_t  *ixa;
 728  730          ip_xmit_attr_t  *oldixa;
 729  731          uint_t          scopeid = 0;
 730  732          uint_t          srcid = 0;
 731  733          in6_addr_t      v6src = connp->conn_saddr_v6;
 732  734  
 733  735          icmp = connp->conn_icmp;
 734  736  
                   /* The address must be present and 32-bit aligned. */
 735  737          if (sa == NULL || !OK_32PTR((char *)sa)) {
 736  738                  return (EINVAL);
 737  739          }
 738  740  
 739  741          ASSERT(sa != NULL && len != 0);
 740  742  
 741  743          /*
 742  744           * Determine packet type based on type of address passed in
 743  745           * the request should contain an IPv4 or IPv6 address.
 744  746           * Make sure that address family matches the type of
 745  747           * family of the address passed down.
 746  748           */
 747  749          switch (len) {
 748  750          case sizeof (sin_t):
 749  751                  sin = (sin_t *)sa;
 750  752  
 751  753                  v4dst = sin->sin_addr.s_addr;
 752  754                  dstport = sin->sin_port;
 753  755                  IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 754  756                  ASSERT(connp->conn_ipversion == IPV4_VERSION);
 755  757                  break;
 756  758  
 757  759          case sizeof (sin6_t):
 758  760                  sin6 = (sin6_t *)sa;
 759  761  
 760  762                  /* No support for mapped addresses on raw sockets */
 761  763                  if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 762  764                          return (EADDRNOTAVAIL);
 763  765                  }
 764  766                  v6dst = sin6->sin6_addr;
 765  767                  dstport = sin6->sin6_port;
 766  768                  ASSERT(connp->conn_ipversion == IPV6_VERSION);
 767  769                  flowinfo = sin6->sin6_flowinfo;
                           /* A scope id is only honoured for link-local destinations. */
 768  770                  if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
 769  771                          scopeid = sin6->sin6_scope_id;
 770  772                  srcid = sin6->__sin6_src_id;
                           /*
                            * Only look up a source by srcid when the conn has no
                            * source address of its own yet.
                            */
 771  773                  if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
 772  774                          /* Due to check above, we know sin6_addr is v6-only. */
 773  775                          if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
 774  776                              B_FALSE, connp->conn_netstack)) {
 775  777                                  /* Mismatch - v6src would be v4mapped. */
 776  778                                  return (EADDRNOTAVAIL);
 777  779                          }
 778  780                  }
 779  781                  break;
 780  782          }
 781  783  
 782  784          /*
 783  785           * If there is a different thread using conn_ixa then we get a new
 784  786           * copy and cut the old one loose from conn_ixa. Otherwise we use
 785  787           * conn_ixa and prevent any other thread from using/changing it.
 786  788           * Once connect() is done other threads can use conn_ixa since the
 787  789           * refcnt will be back at one.
 788  790           * We defer updating conn_ixa until later to handle any concurrent
 789  791           * conn_ixa_cleanup thread.
 790  792           */
 791  793          ixa = conn_get_ixa(connp, B_FALSE);
 792  794          if (ixa == NULL)
 793  795                  return (ENOMEM);
 794  796  
 795  797          mutex_enter(&connp->conn_lock);
 796  798          /*
 797  799           * This icmp_t must have bound already before doing a connect.
 798  800           * Reject if a connect is in progress (we drop conn_lock during
 799  801           * rawip_do_connect).
 800  802           */
 801  803          if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
 802  804                  mutex_exit(&connp->conn_lock);
 803  805                  ixa_refrele(ixa);
 804  806                  return (-TOUTSTATE);
 805  807          }
 806  808  
 807  809          if (icmp->icmp_state == TS_DATA_XFER) {
 808  810                  /* Already connected - clear out state */
 809  811                  if (connp->conn_mcbc_bind)
 810  812                          connp->conn_saddr_v6 = ipv6_all_zeros;
 811  813                  else
 812  814                          connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
 813  815                  connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
 814  816                  connp->conn_faddr_v6 = ipv6_all_zeros;
 815  817                  icmp->icmp_state = TS_IDLE;
 816  818          }
 817  819  
 818  820          /*
 819  821           * Use sin_port/sin6_port since applications like psh use SOCK_RAW
 820  822           * with IPPROTO_TCP.
 821  823           */
 822  824          connp->conn_fport = dstport;
 823  825          if (connp->conn_ipversion == IPV4_VERSION) {
 824  826                  /*
 825  827                   * Interpret a zero destination to mean loopback.
 826  828                   * Update the T_CONN_REQ (sin/sin6) since it is used to
 827  829                   * generate the T_CONN_CON.
 828  830                   */
 829  831                  if (v4dst == INADDR_ANY) {
 830  832                          v4dst = htonl(INADDR_LOOPBACK);
 831  833                          IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 832  834                          ASSERT(connp->conn_family == AF_INET);
 833  835                          sin->sin_addr.s_addr = v4dst;
 834  836                  }
 835  837                  connp->conn_faddr_v6 = v6dst;
 836  838                  connp->conn_flowinfo = 0;
 837  839          } else {
 838  840                  ASSERT(connp->conn_ipversion == IPV6_VERSION);
 839  841                  /*
 840  842                   * Interpret a zero destination to mean loopback.
 841  843                   * Update the T_CONN_REQ (sin/sin6) since it is used to
 842  844                   * generate the T_CONN_CON.
 843  845                   */
 844  846                  if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
 845  847                          v6dst = ipv6_loopback;
 846  848                          sin6->sin6_addr = v6dst;
 847  849                  }
 848  850                  connp->conn_faddr_v6 = v6dst;
 849  851                  connp->conn_flowinfo = flowinfo;
 850  852          }
 851  853  
 852  854          /*
 853  855           * We update our cred/cpid based on the caller of connect
 854  856           */
 855  857          if (connp->conn_cred != cr) {
 856  858                  crhold(cr);
 857  859                  crfree(connp->conn_cred);
 858  860                  connp->conn_cred = cr;
 859  861          }
 860  862          connp->conn_cpid = pid;
 861  863          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
 862  864          ixa->ixa_cred = cr;
 863  865          ixa->ixa_cpid = pid;
 864  866          if (is_system_labeled()) {
 865  867                  /* We need to restart with a label based on the cred */
 866  868                  ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
 867  869          }
 868  870  
                   /*
                    * With an explicit scope id, it is recorded in the transmit
                    * attributes and also used as the incoming interface index;
                    * otherwise the incoming interface falls back to conn_bound_if.
                    */
 869  871          if (scopeid != 0) {
 870  872                  ixa->ixa_flags |= IXAF_SCOPEID_SET;
 871  873                  ixa->ixa_scopeid = scopeid;
 872  874                  connp->conn_incoming_ifindex = scopeid;
 873  875          } else {
 874  876                  ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 875  877                  connp->conn_incoming_ifindex = connp->conn_bound_if;
 876  878          }
 877  879  
 878  880          /*
 879  881           * conn_connect will drop conn_lock and reacquire it.
 880  882           * To prevent a send* from messing with this icmp_t while the lock
 881  883           * is dropped we set icmp_state and clear conn_v6lastdst.
 882  884           * That will make all send* fail with EISCONN.
 883  885           */
 884  886          connp->conn_v6lastdst = ipv6_all_zeros;
 885  887          icmp->icmp_state = TS_WCON_CREQ;
 886  888  
 887  889          error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
 888  890          mutex_exit(&connp->conn_lock);
 889  891          if (error != 0)
 890  892                  goto connect_failed;
 891  893  
 892  894          /*
 893  895           * The addresses have been verified. Time to insert in
 894  896           * the correct fanout list.
 895  897           */
 896  898          error = ipcl_conn_insert(connp);
 897  899          if (error != 0)
 898  900                  goto connect_failed;
 899  901  
                   /* Rebuild the header template for the new source/destination. */
 900  902          mutex_enter(&connp->conn_lock);
 901  903          error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 902  904              &connp->conn_faddr_v6, connp->conn_flowinfo);
 903  905          if (error != 0) {
 904  906                  mutex_exit(&connp->conn_lock);
 905  907                  goto connect_failed;
 906  908          }
 907  909  
 908  910          icmp->icmp_state = TS_DATA_XFER;
 909  911          /* Record this as the "last" send even though we haven't sent any */
 910  912          connp->conn_v6lastdst = connp->conn_faddr_v6;
 911  913          connp->conn_lastipversion = connp->conn_ipversion;
 912  914          connp->conn_lastdstport = connp->conn_fport;
 913  915          connp->conn_lastflowinfo = connp->conn_flowinfo;
 914  916          connp->conn_lastscopeid = scopeid;
 915  917          connp->conn_lastsrcid = srcid;
 916  918          /* Also remember a source to use together with lastdst */
 917  919          connp->conn_v6lastsrc = v6src;
 918  920  
                   /*
                    * Install the ixa we prepared as the conn's ixa and release the
                    * one it replaces.  The final ixa_refrele(ixa) presumably drops
                    * the hold taken by conn_get_ixa() above - confirm against
                    * conn_get_ixa()/conn_replace_ixa().
                    */
 919  921          oldixa = conn_replace_ixa(connp, ixa);
 920  922          mutex_exit(&connp->conn_lock);
 921  923          ixa_refrele(oldixa);
 922  924  
 923  925          ixa_refrele(ixa);
 924  926          return (0);
 925  927  
                   /*
                    * Common failure path, entered without conn_lock held: release
                    * the ixa, return the endpoint to TS_IDLE, put the addresses back
                    * to their bound values and rebuild the header template (the
                    * rebuild result is deliberately ignored; `error' from the
                    * failing step is what gets returned).
                    */
 926  928  connect_failed:
 927  929          if (ixa != NULL)
 928  930                  ixa_refrele(ixa);
 929  931          mutex_enter(&connp->conn_lock);
 930  932          icmp->icmp_state = TS_IDLE;
 931  933          /* In case the source address was set above */
 932  934          if (connp->conn_mcbc_bind)
 933  935                  connp->conn_saddr_v6 = ipv6_all_zeros;
 934  936          else
 935  937                  connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
 936  938          connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
 937  939          connp->conn_faddr_v6 = ipv6_all_zeros;
 938  940          connp->conn_v6lastdst = ipv6_all_zeros;
 939  941          connp->conn_flowinfo = 0;
 940  942  
 941  943          (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 942  944              &connp->conn_faddr_v6, connp->conn_flowinfo);
 943  945          mutex_exit(&connp->conn_lock);
 944  946          return (error);
 945  947  }
 946  948  
 947  949  static void
 948  950  rawip_do_close(conn_t *connp)
 949  951  {
 950  952          ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
 951  953  
 952  954          ip_quiesce_conn(connp);
 953  955  
 954  956          if (!IPCL_IS_NONSTR(connp)) {
 955  957                  qprocsoff(connp->conn_rq);
 956  958          }
 957  959  
 958  960          icmp_close_free(connp);
 959  961  
 960  962          /*
 961  963           * Now we are truly single threaded on this stream, and can
 962  964           * delete the things hanging off the connp, and finally the connp.
 963  965           * We removed this connp from the fanout list, it cannot be
 964  966           * accessed thru the fanouts, and we already waited for the
 965  967           * conn_ref to drop to 0. We are already in close, so
 966  968           * there cannot be any other thread from the top. qprocsoff
 967  969           * has completed, and service has completed or won't run in
 968  970           * future.
 969  971           */
 970  972          ASSERT(connp->conn_ref == 1);
 971  973  
 972  974          if (!IPCL_IS_NONSTR(connp)) {
 973  975                  inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
 974  976          } else {
 975  977                  ip_free_helper_stream(connp);
 976  978          }
 977  979  
 978  980          connp->conn_ref--;
 979  981          ipcl_conn_destroy(connp);
 980  982  }
 981  983  
 982  984  static int
 983  985  icmp_close(queue_t *q, int flags)
 984  986  {
 985  987          conn_t  *connp;
 986  988  
 987  989          if (flags & SO_FALLBACK) {
 988  990                  /*
 989  991                   * stream is being closed while in fallback
 990  992                   * simply free the resources that were allocated
 991  993                   */
 992  994                  inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
 993  995                  qprocsoff(q);
 994  996                  goto done;
 995  997          }
 996  998  
 997  999          connp = Q_TO_CONN(q);
 998 1000          (void) rawip_do_close(connp);
 999 1001  done:
1000 1002          q->q_ptr = WR(q)->q_ptr = NULL;
1001 1003          return (0);
1002 1004  }
1003 1005  
  
    | 
      ↓ open down ↓ | 
    911 lines elided | 
    
      ↑ open up ↑ | 
  
1004 1006  static void
1005 1007  icmp_close_free(conn_t *connp)
1006 1008  {
1007 1009          icmp_t *icmp = connp->conn_icmp;
1008 1010  
1009 1011          if (icmp->icmp_filter != NULL) {
1010 1012                  kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
1011 1013                  icmp->icmp_filter = NULL;
1012 1014          }
1013 1015  
     1016 +        if (icmp->icmp_bpf_len != 0) {
     1017 +                kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
     1018 +                icmp->icmp_bpf_len = 0;
     1019 +                icmp->icmp_bpf_prog = NULL;
     1020 +        }
     1021 +
1014 1022          /*
1015 1023           * Clear any fields which the kmem_cache constructor clears.
1016 1024           * Only icmp_connp needs to be preserved.
1017 1025           * TBD: We should make this more efficient to avoid clearing
1018 1026           * everything.
1019 1027           */
1020 1028          ASSERT(icmp->icmp_connp == connp);
1021 1029          bzero(icmp, sizeof (icmp_t));
1022 1030          icmp->icmp_connp = connp;
1023 1031  }
1024 1032  
1025 1033  /*
1026 1034   * This routine handles each T_DISCON_REQ message passed to icmp
1027 1035   * as an indicating that ICMP is no longer connected. This results
1028 1036   * in telling IP to restore the binding to just the local address.
1029 1037   */
1030 1038  static int
1031 1039  icmp_do_disconnect(conn_t *connp)
1032 1040  {
1033 1041          icmp_t  *icmp = connp->conn_icmp;
1034 1042          int     error;
1035 1043  
1036 1044          mutex_enter(&connp->conn_lock);
1037 1045          if (icmp->icmp_state != TS_DATA_XFER) {
1038 1046                  mutex_exit(&connp->conn_lock);
1039 1047                  return (-TOUTSTATE);
1040 1048          }
1041 1049          if (connp->conn_mcbc_bind)
1042 1050                  connp->conn_saddr_v6 = ipv6_all_zeros;
1043 1051          else
1044 1052                  connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
1045 1053          connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
1046 1054          connp->conn_faddr_v6 = ipv6_all_zeros;
1047 1055          icmp->icmp_state = TS_IDLE;
1048 1056  
1049 1057          connp->conn_v6lastdst = ipv6_all_zeros;
1050 1058          error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
1051 1059              &connp->conn_faddr_v6, connp->conn_flowinfo);
1052 1060          mutex_exit(&connp->conn_lock);
1053 1061          if (error != 0)
1054 1062                  return (error);
1055 1063  
1056 1064          /*
1057 1065           * Tell IP to remove the full binding and revert
1058 1066           * to the local address binding.
1059 1067           */
1060 1068          return (ip_laddr_fanout_insert(connp));
1061 1069  }
1062 1070  
1063 1071  static void
1064 1072  icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
1065 1073  {
1066 1074          conn_t  *connp = Q_TO_CONN(q);
1067 1075          int     error;
1068 1076  
1069 1077          /*
1070 1078           * Allocate the largest primitive we need to send back
1071 1079           * T_error_ack is > than T_ok_ack
1072 1080           */
1073 1081          mp = reallocb(mp, sizeof (struct T_error_ack), 1);
1074 1082          if (mp == NULL) {
1075 1083                  /* Unable to reuse the T_DISCON_REQ for the ack. */
1076 1084                  icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1077 1085                  return;
1078 1086          }
1079 1087  
1080 1088          error = icmp_do_disconnect(connp);
1081 1089  
1082 1090          if (error != 0) {
1083 1091                  if (error > 0) {
1084 1092                          icmp_err_ack(q, mp, 0, error);
1085 1093                  } else {
1086 1094                          icmp_err_ack(q, mp, -error, 0);
1087 1095                  }
1088 1096          } else {
1089 1097                  mp = mi_tpi_ok_ack_alloc(mp);
1090 1098                  ASSERT(mp != NULL);
1091 1099                  qreply(q, mp);
1092 1100          }
1093 1101  }
1094 1102  
1095 1103  static int
1096 1104  icmp_disconnect(conn_t *connp)
1097 1105  {
1098 1106          int     error;
1099 1107  
1100 1108          connp->conn_dgram_errind = B_FALSE;
1101 1109  
1102 1110          error = icmp_do_disconnect(connp);
1103 1111  
1104 1112          if (error < 0)
1105 1113                  error = proto_tlitosyserr(-error);
1106 1114          return (error);
1107 1115  }
1108 1116  
1109 1117  /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1110 1118  static void
1111 1119  icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1112 1120  {
1113 1121          if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1114 1122                  qreply(q, mp);
1115 1123  }
1116 1124  
1117 1125  /* Shorthand to generate and send TPI error acks to our client */
1118 1126  static void
1119 1127  icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1120 1128      t_scalar_t t_error, int sys_error)
1121 1129  {
1122 1130          struct T_error_ack      *teackp;
1123 1131  
1124 1132          if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1125 1133              M_PCPROTO, T_ERROR_ACK)) != NULL) {
1126 1134                  teackp = (struct T_error_ack *)mp->b_rptr;
1127 1135                  teackp->ERROR_prim = primitive;
1128 1136                  teackp->TLI_error = t_error;
1129 1137                  teackp->UNIX_error = sys_error;
1130 1138                  qreply(q, mp);
1131 1139          }
1132 1140  }
1133 1141  
1134 1142  /*
1135 1143   * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
1136 1144   * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1137 1145   * Assumes that IP has pulled up everything up to and including the ICMP header.
1138 1146   */
1139 1147  /* ARGSUSED2 */
1140 1148  static void
1141 1149  icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
1142 1150  {
1143 1151          conn_t          *connp = (conn_t *)arg1;
1144 1152          icmp_t          *icmp = connp->conn_icmp;
1145 1153          icmph_t         *icmph;
1146 1154          ipha_t          *ipha;
1147 1155          int             iph_hdr_length;
1148 1156          sin_t           sin;
1149 1157          mblk_t          *mp1;
1150 1158          int             error = 0;
1151 1159  
1152 1160          ipha = (ipha_t *)mp->b_rptr;
1153 1161  
1154 1162          ASSERT(OK_32PTR(mp->b_rptr));
1155 1163  
1156 1164          if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1157 1165                  ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1158 1166                  icmp_icmp_error_ipv6(connp, mp, ira);
1159 1167                  return;
1160 1168          }
1161 1169          ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
1162 1170  
1163 1171          /* Skip past the outer IP and ICMP headers */
1164 1172          ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
1165 1173          iph_hdr_length = ira->ira_ip_hdr_length;
1166 1174          icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1167 1175          ipha = (ipha_t *)&icmph[1];     /* Inner IP header */
1168 1176  
1169 1177          iph_hdr_length = IPH_HDR_LENGTH(ipha);
1170 1178  
1171 1179          switch (icmph->icmph_type) {
1172 1180          case ICMP_DEST_UNREACHABLE:
1173 1181                  switch (icmph->icmph_code) {
1174 1182                  case ICMP_FRAGMENTATION_NEEDED: {
1175 1183                          ipha_t          *ipha;
1176 1184                          ip_xmit_attr_t  *ixa;
1177 1185                          /*
1178 1186                           * IP has already adjusted the path MTU.
1179 1187                           * But we need to adjust DF for IPv4.
1180 1188                           */
1181 1189                          if (connp->conn_ipversion != IPV4_VERSION)
1182 1190                                  break;
1183 1191  
1184 1192                          ixa = conn_get_ixa(connp, B_FALSE);
1185 1193                          if (ixa == NULL || ixa->ixa_ire == NULL) {
1186 1194                                  /*
1187 1195                                   * Some other thread holds conn_ixa. We will
1188 1196                                   * redo this on the next ICMP too big.
1189 1197                                   */
1190 1198                                  if (ixa != NULL)
1191 1199                                          ixa_refrele(ixa);
1192 1200                                  break;
1193 1201                          }
1194 1202                          (void) ip_get_pmtu(ixa);
1195 1203  
1196 1204                          mutex_enter(&connp->conn_lock);
1197 1205                          ipha = (ipha_t *)connp->conn_ht_iphc;
1198 1206                          if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
1199 1207                                  ipha->ipha_fragment_offset_and_flags |=
1200 1208                                      IPH_DF_HTONS;
1201 1209                          } else {
1202 1210                                  ipha->ipha_fragment_offset_and_flags &=
1203 1211                                      ~IPH_DF_HTONS;
1204 1212                          }
1205 1213                          mutex_exit(&connp->conn_lock);
1206 1214                          ixa_refrele(ixa);
1207 1215                          break;
1208 1216                  }
1209 1217                  case ICMP_PORT_UNREACHABLE:
1210 1218                  case ICMP_PROTOCOL_UNREACHABLE:
1211 1219                          error = ECONNREFUSED;
1212 1220                          break;
1213 1221                  default:
1214 1222                          /* Transient errors */
1215 1223                          break;
1216 1224                  }
1217 1225                  break;
1218 1226          default:
1219 1227                  /* Transient errors */
1220 1228                  break;
1221 1229          }
1222 1230          if (error == 0) {
1223 1231                  freemsg(mp);
1224 1232                  return;
1225 1233          }
1226 1234  
1227 1235          /*
1228 1236           * Deliver T_UDERROR_IND when the application has asked for it.
1229 1237           * The socket layer enables this automatically when connected.
1230 1238           */
1231 1239          if (!connp->conn_dgram_errind) {
1232 1240                  freemsg(mp);
1233 1241                  return;
1234 1242          }
1235 1243  
1236 1244          sin = sin_null;
1237 1245          sin.sin_family = AF_INET;
1238 1246          sin.sin_addr.s_addr = ipha->ipha_dst;
1239 1247  
1240 1248          if (IPCL_IS_NONSTR(connp)) {
1241 1249                  mutex_enter(&connp->conn_lock);
1242 1250                  if (icmp->icmp_state == TS_DATA_XFER) {
1243 1251                          if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
1244 1252                                  mutex_exit(&connp->conn_lock);
1245 1253                                  (*connp->conn_upcalls->su_set_error)
1246 1254                                      (connp->conn_upper_handle, error);
1247 1255                                  goto done;
1248 1256                          }
1249 1257                  } else {
1250 1258                          icmp->icmp_delayed_error = error;
1251 1259                          *((sin_t *)&icmp->icmp_delayed_addr) = sin;
1252 1260                  }
1253 1261                  mutex_exit(&connp->conn_lock);
1254 1262          } else {
1255 1263                  mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
1256 1264                      error);
1257 1265                  if (mp1 != NULL)
1258 1266                          putnext(connp->conn_rq, mp1);
1259 1267          }
1260 1268  done:
1261 1269          freemsg(mp);
1262 1270  }
1263 1271  
1264 1272  /*
1265 1273   * icmp_icmp_error_ipv6 is called by icmp_icmp_input to process ICMP for IPv6.
1266 1274   * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1267 1275   * Assumes that IP has pulled up all the extension headers as well as the
1268 1276   * ICMPv6 header.
1269 1277   */
1270 1278  static void
1271 1279  icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
1272 1280  {
1273 1281          icmp6_t         *icmp6;
1274 1282          ip6_t           *ip6h, *outer_ip6h;
1275 1283          uint16_t        iph_hdr_length;
1276 1284          uint8_t         *nexthdrp;
1277 1285          sin6_t          sin6;
1278 1286          mblk_t          *mp1;
1279 1287          int             error = 0;
1280 1288          icmp_t          *icmp = connp->conn_icmp;
1281 1289  
1282 1290          outer_ip6h = (ip6_t *)mp->b_rptr;
1283 1291  #ifdef DEBUG
1284 1292          if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1285 1293                  iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1286 1294          else
1287 1295                  iph_hdr_length = IPV6_HDR_LEN;
1288 1296          ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
1289 1297  #endif
1290 1298          /* Skip past the outer IP and ICMP headers */
1291 1299          iph_hdr_length = ira->ira_ip_hdr_length;
1292 1300          icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1293 1301  
1294 1302          ip6h = (ip6_t *)&icmp6[1];      /* Inner IP header */
1295 1303          if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1296 1304                  freemsg(mp);
1297 1305                  return;
1298 1306          }
1299 1307  
1300 1308          switch (icmp6->icmp6_type) {
1301 1309          case ICMP6_DST_UNREACH:
1302 1310                  switch (icmp6->icmp6_code) {
1303 1311                  case ICMP6_DST_UNREACH_NOPORT:
1304 1312                          error = ECONNREFUSED;
1305 1313                          break;
1306 1314                  case ICMP6_DST_UNREACH_ADMIN:
1307 1315                  case ICMP6_DST_UNREACH_NOROUTE:
1308 1316                  case ICMP6_DST_UNREACH_BEYONDSCOPE:
1309 1317                  case ICMP6_DST_UNREACH_ADDR:
1310 1318                          /* Transient errors */
1311 1319                          break;
1312 1320                  default:
1313 1321                          break;
1314 1322                  }
1315 1323                  break;
1316 1324          case ICMP6_PACKET_TOO_BIG: {
1317 1325                  struct T_unitdata_ind   *tudi;
1318 1326                  struct T_opthdr         *toh;
1319 1327                  size_t                  udi_size;
1320 1328                  mblk_t                  *newmp;
1321 1329                  t_scalar_t              opt_length = sizeof (struct T_opthdr) +
1322 1330                      sizeof (struct ip6_mtuinfo);
1323 1331                  sin6_t                  *sin6;
1324 1332                  struct ip6_mtuinfo      *mtuinfo;
1325 1333  
1326 1334                  /*
1327 1335                   * If the application has requested to receive path mtu
1328 1336                   * information, send up an empty message containing an
1329 1337                   * IPV6_PATHMTU ancillary data item.
1330 1338                   */
1331 1339                  if (!connp->conn_ipv6_recvpathmtu)
1332 1340                          break;
1333 1341  
1334 1342                  udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1335 1343                      opt_length;
1336 1344                  if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1337 1345                          BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1338 1346                          break;
1339 1347                  }
1340 1348  
1341 1349                  /*
1342 1350                   * newmp->b_cont is left to NULL on purpose.  This is an
1343 1351                   * empty message containing only ancillary data.
1344 1352                   */
1345 1353                  newmp->b_datap->db_type = M_PROTO;
1346 1354                  tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1347 1355                  newmp->b_wptr = (uchar_t *)tudi + udi_size;
1348 1356                  tudi->PRIM_type = T_UNITDATA_IND;
1349 1357                  tudi->SRC_length = sizeof (sin6_t);
1350 1358                  tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1351 1359                  tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1352 1360                  tudi->OPT_length = opt_length;
1353 1361  
1354 1362                  sin6 = (sin6_t *)&tudi[1];
1355 1363                  bzero(sin6, sizeof (sin6_t));
1356 1364                  sin6->sin6_family = AF_INET6;
1357 1365                  sin6->sin6_addr = connp->conn_faddr_v6;
1358 1366  
1359 1367                  toh = (struct T_opthdr *)&sin6[1];
1360 1368                  toh->level = IPPROTO_IPV6;
1361 1369                  toh->name = IPV6_PATHMTU;
1362 1370                  toh->len = opt_length;
1363 1371                  toh->status = 0;
1364 1372  
1365 1373                  mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1366 1374                  bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1367 1375                  mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1368 1376                  mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1369 1377                  mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1370 1378                  /*
1371 1379                   * We've consumed everything we need from the original
1372 1380                   * message.  Free it, then send our empty message.
1373 1381                   */
1374 1382                  freemsg(mp);
1375 1383                  icmp_ulp_recv(connp, newmp, msgdsize(newmp));
1376 1384                  return;
1377 1385          }
1378 1386          case ICMP6_TIME_EXCEEDED:
1379 1387                  /* Transient errors */
1380 1388                  break;
1381 1389          case ICMP6_PARAM_PROB:
1382 1390                  /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1383 1391                  if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1384 1392                      (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1385 1393                      (uchar_t *)nexthdrp) {
1386 1394                          error = ECONNREFUSED;
1387 1395                          break;
1388 1396                  }
1389 1397                  break;
1390 1398          }
1391 1399          if (error == 0) {
1392 1400                  freemsg(mp);
1393 1401                  return;
1394 1402          }
1395 1403  
1396 1404          /*
1397 1405           * Deliver T_UDERROR_IND when the application has asked for it.
1398 1406           * The socket layer enables this automatically when connected.
1399 1407           */
1400 1408          if (!connp->conn_dgram_errind) {
1401 1409                  freemsg(mp);
1402 1410                  return;
1403 1411          }
1404 1412  
1405 1413          sin6 = sin6_null;
1406 1414          sin6.sin6_family = AF_INET6;
1407 1415          sin6.sin6_addr = ip6h->ip6_dst;
1408 1416          sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1409 1417          if (IPCL_IS_NONSTR(connp)) {
1410 1418                  mutex_enter(&connp->conn_lock);
1411 1419                  if (icmp->icmp_state == TS_DATA_XFER) {
1412 1420                          if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1413 1421                              &connp->conn_faddr_v6)) {
1414 1422                                  mutex_exit(&connp->conn_lock);
1415 1423                                  (*connp->conn_upcalls->su_set_error)
1416 1424                                      (connp->conn_upper_handle, error);
1417 1425                                  goto done;
1418 1426                          }
1419 1427                  } else {
1420 1428                          icmp->icmp_delayed_error = error;
1421 1429                          *((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1422 1430                  }
1423 1431                  mutex_exit(&connp->conn_lock);
1424 1432          } else {
1425 1433                  mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1426 1434                      NULL, 0, error);
1427 1435                  if (mp1 != NULL)
1428 1436                          putnext(connp->conn_rq, mp1);
1429 1437          }
1430 1438  done:
1431 1439          freemsg(mp);
1432 1440  }
1433 1441  
1434 1442  /*
1435 1443   * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1436 1444   * The local address is filled in if endpoint is bound. The remote address
1437 1445   * is filled in if remote address has been specified ("connected endpoint")
1438 1446   * (The concept of connected CLTS sockets is alien to published TPI
1439 1447   *  but we support it anyway).
            *
            * The request mblk is handed to reallocb() with room for a T_addr_ack
            * plus two sin6_t addresses; if that fails, a TSYSERR/ENOMEM error ack
            * is sent through icmp_err_ack() instead.  On success the T_ADDR_ACK is
            * returned as M_PCPROTO via qreply().
1440 1448   */
1441 1449  static void
1442 1450  icmp_addr_req(queue_t *q, mblk_t *mp)
1443 1451  {
1444 1452          struct sockaddr *sa;
1445 1453          mblk_t  *ackmp;
1446 1454          struct T_addr_ack *taa;
1447 1455          icmp_t  *icmp = Q_TO_ICMP(q);
1448 1456          conn_t  *connp = icmp->icmp_connp;
1449 1457          uint_t  addrlen;
1450 1458  
1451 1459          /* Make it large enough for worst case */
1452 1460          ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1453 1461              2 * sizeof (sin6_t), 1);
1454 1462          if (ackmp == NULL) {
1455 1463                  icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1456 1464                  return;
1457 1465          }
1458 1466          taa = (struct T_addr_ack *)ackmp->b_rptr;
1459 1467  
                   /* Start from an all-zero ack; b_wptr initially covers the header only. */
1460 1468          bzero(taa, sizeof (struct T_addr_ack));
1461 1469          ackmp->b_wptr = (uchar_t *)&taa[1];
1462 1470  
1463 1471          taa->PRIM_type = T_ADDR_ACK;
1464 1472          ackmp->b_datap->db_type = M_PCPROTO;
1465 1473  
                   /* The address size follows the socket's family. */
1466 1474          if (connp->conn_family == AF_INET)
1467 1475                  addrlen = sizeof (sin_t);
1468 1476          else
1469 1477                  addrlen = sizeof (sin6_t);
1470 1478  
                   /* conn_lock is held while the state and the addresses are read. */
1471 1479          mutex_enter(&connp->conn_lock);
1472 1480          /*
1473 1481           * Note: Following code assumes 32 bit alignment of basic
1474 1482           * data structures like sin_t and struct T_addr_ack.
1475 1483           */
1476 1484          if (icmp->icmp_state != TS_UNBND) {
1477 1485                  /*
1478 1486                   * Fill in local address first
1479 1487                   */
1480 1488                  taa->LOCADDR_offset = sizeof (*taa);
1481 1489                  taa->LOCADDR_length = addrlen;
1482 1490                  sa = (struct sockaddr *)&taa[1];
                           /*
                            * addrlen is passed by reference; b_wptr is advanced by
                            * the value left in it after the call.
                            */
1483 1491                  (void) conn_getsockname(connp, sa, &addrlen);
1484 1492                  ackmp->b_wptr += addrlen;
1485 1493          }
1486 1494          if (icmp->icmp_state == TS_DATA_XFER) {
1487 1495                  /*
1488 1496                   * connected, fill remote address too
1489 1497                   */
1490 1498                  taa->REMADDR_length = addrlen;
1491 1499                  /* assumed 32-bit alignment */
1492 1500                  taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1493 1501                  sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1494 1502                  (void) conn_getpeername(connp, sa, &addrlen);
1495 1503                  ackmp->b_wptr += addrlen;
1496 1504          }
1497 1505          mutex_exit(&connp->conn_lock);
1498 1506          ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1499 1507          qreply(q, ackmp);
1500 1508  }
1501 1509  
           /*
            * Fill in *tap for a T_INFO_ACK.  The template icmp_g_t_info_ack is copied
            * first; ADDR_size is then set from the socket's address family (sin6_t
            * for AF_INET6, sin_t otherwise), CURRENT_state from icmp_state and
            * OPT_size from icmp_max_optsize.
            */
1502 1510  static void
1503 1511  icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1504 1512  {
1505 1513          conn_t          *connp = icmp->icmp_connp;
1506 1514  
1507 1515          *tap = icmp_g_t_info_ack;
1508 1516  
1509 1517          if (connp->conn_family == AF_INET6)
1510 1518                  tap->ADDR_size = sizeof (sin6_t);
1511 1519          else
1512 1520                  tap->ADDR_size = sizeof (sin_t);
1513 1521          tap->CURRENT_state = icmp->icmp_state;
1514 1522          tap->OPT_size = icmp_max_optsize;
1515 1523  }
1516 1524  
           /*
            * Build the body of a T_CAPABILITY_ACK in *tcap.  CAP_bits1 starts out
            * empty; when the requester asked for TC1_INFO (in cap_bits1) the INFO_ack
            * member is filled by icmp_copy_info() and TC1_INFO is reported back.
            * No other bit of cap_bits1 is examined here.
            */
1517 1525  static void
1518 1526  icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1519 1527      t_uscalar_t cap_bits1)
1520 1528  {
1521 1529          tcap->CAP_bits1 = 0;
1522 1530  
1523 1531          if (cap_bits1 & TC1_INFO) {
1524 1532                  icmp_copy_info(&tcap->INFO_ack, icmp);
1525 1533                  tcap->CAP_bits1 |= TC1_INFO;
1526 1534          }
1527 1535  }
1528 1536  
1529 1537  /*
1530 1538   * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1531 1539   * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1532 1540   * icmp_g_t_info_ack.  The current state of the stream is copied from
1533 1541   * icmp_state.
            *
            * The ack is obtained from tpi_ack_alloc() with the same db_type as the
            * request and sent back with qreply().  If tpi_ack_alloc() returns NULL
            * nothing is sent.
1534 1542   */
1535 1543  static void
1536 1544  icmp_capability_req(queue_t *q, mblk_t *mp)
1537 1545  {
1538 1546          icmp_t                  *icmp = Q_TO_ICMP(q);
1539 1547          t_uscalar_t             cap_bits1;
1540 1548          struct T_capability_ack *tcap;
1541 1549  
                   /* Read the requested bits first: mp is reassigned just below. */
1542 1550          cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1543 1551  
1544 1552          mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1545 1553              mp->b_datap->db_type, T_CAPABILITY_ACK);
                   /*
                    * NOTE(review): presumably tpi_ack_alloc() disposes of the request
                    * on failure, since the old mp is no longer reachable - confirm.
                    */
1546 1554          if (!mp)
1547 1555                  return;
1548 1556  
1549 1557          tcap = (struct T_capability_ack *)mp->b_rptr;
1550 1558  
1551 1559          icmp_do_capability_ack(icmp, tcap, cap_bits1);
1552 1560  
1553 1561          qreply(q, mp);
1554 1562  }
1555 1563  
1556 1564  /*
1557 1565   * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1558 1566   * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1559 1567   * The current state of the stream is copied from icmp_state.
            *
            * The ack is an M_PCPROTO message obtained from tpi_ack_alloc(), filled by
            * icmp_copy_info() and sent back with qreply().  If tpi_ack_alloc()
            * returns NULL nothing is sent.
1560 1568   */
1561 1569  static void
1562 1570  icmp_info_req(queue_t *q, mblk_t *mp)
1563 1571  {
1564 1572          icmp_t  *icmp = Q_TO_ICMP(q);
1565 1573  
1566 1574          /* Create a T_INFO_ACK message. */
1567 1575          mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1568 1576              T_INFO_ACK);
1569 1577          if (!mp)
1570 1578                  return;
1571 1579          icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1572 1580          qreply(q, mp);
1573 1581  }
1574 1582  
           /*
            * Common open routine behind icmp_openv4() and icmp_openv6(); family is
            * the address family (AF_INET or AF_INET6) handed on to rawip_do_open().
            *
            * Returns 0 on success, or immediately when q->q_ptr is already set.
            * Fails with EINVAL for a module open (MODOPEN), EBUSY when no minor
            * number can be allocated, or with the error reported by rawip_do_open()
            * (the minor number is released again in that case).
            *
            * With SO_FALLBACK set in flag no conn_t is created: the minor number and
            * the minor arena are stored in the read and write queue q_ptr fields and
            * the write side is switched to icmp_fallback_sock_winit.
            */
1575 1583  static int
1576 1584  icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1577 1585      int family)
1578 1586  {
1579 1587          conn_t *connp;
1580 1588          dev_t   conn_dev;
1581 1589          int     error;
1582 1590  
1583 1591          /* If the stream is already open, return immediately. */
1584 1592          if (q->q_ptr != NULL)
1585 1593                  return (0);
1586 1594  
1587 1595          if (sflag == MODOPEN)
1588 1596                  return (EINVAL);
1589 1597  
1590 1598          /*
1591 1599           * Since ICMP is not used so heavily, allocating from the small
1592 1600           * arena should be sufficient.
1593 1601           */
1594 1602          if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1595 1603                  return (EBUSY);
1596 1604          }
1597 1605  
1598 1606          if (flag & SO_FALLBACK) {
1599 1607                  /*
1600 1608                   * Non streams socket needs a stream to fallback to
1601 1609                   */
1602 1610                  RD(q)->q_ptr = (void *)conn_dev;
1603 1611                  WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1604 1612                  WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1605 1613                  qprocson(q);
1606 1614                  return (0);
1607 1615          }
1608 1616  
1609 1617          connp = rawip_do_open(family, credp, &error, KM_SLEEP);
1610 1618          if (connp == NULL) {
1611 1619                  ASSERT(error != 0);
1612 1620                  inet_minor_free(ip_minor_arena_sa, conn_dev);
1613 1621                  return (error);
1614 1622          }
1615 1623  
                   /* Report the newly allocated minor number back through *devp. */
1616 1624          *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1617 1625          connp->conn_dev = conn_dev;
1618 1626          connp->conn_minor_arena = ip_minor_arena_sa;
1619 1627  
1620 1628          /*
1621 1629           * Initialize the icmp_t structure for this stream.
1622 1630           */
1623 1631          q->q_ptr = connp;
1624 1632          WR(q)->q_ptr = connp;
1625 1633          connp->conn_rq = q;
1626 1634          connp->conn_wq = WR(q);
1627 1635  
                   /* Write-side flow control limits come from the conn's send buffer. */
1628 1636          WR(q)->q_hiwat = connp->conn_sndbuf;
1629 1637          WR(q)->q_lowat = connp->conn_sndlowat;
1630 1638  
1631 1639          qprocson(q);
1632 1640  
1633 1641          /* Set the Stream head write offset. */
1634 1642          (void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
1635 1643          (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
1636 1644  
                   /* Clear CONN_INCIPIENT under conn_lock, then call icmp_bind_proto(). */
1637 1645          mutex_enter(&connp->conn_lock);
1638 1646          connp->conn_state_flags &= ~CONN_INCIPIENT;
1639 1647          mutex_exit(&connp->conn_lock);
1640 1648  
1641 1649          icmp_bind_proto(connp->conn_icmp);
1642 1650  
1643 1651          return (0);
1644 1652  }
1645 1653  
1646 1654  /* For /dev/icmp aka AF_INET open; defers to icmp_tpi_open() with AF_INET. */
1647 1655  static int
1648 1656  icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1649 1657  {
1650 1658          return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1651 1659  }
1652 1660  
1653 1661  /* For /dev/icmp6 aka AF_INET6 open; defers to icmp_tpi_open() with AF_INET6. */
1654 1662  static int
1655 1663  icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1656 1664  {
1657 1665          return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1658 1666  }
1659 1667  
1660 1668  /*
1661 1669   * This is the open routine for icmp.  It allocates a conn_t/icmp_t pair
1662 1670   * for the given address family and initializes it to its default state.
            *
            * family: AF_INET6 selects the IPv6 setup; any other value gets the
            *         IPv4 setup.
            * credp:  checked with secpolicy_net_icmpaccess(), used to find the
            *         netstack and zone, and held (crhold) as conn_cred.
            * err:    receives the secpolicy_net_icmpaccess() result; it is non-zero
            *         when NULL is returned because of that check.
            * flags:  KM_SLEEP or KM_NOSLEEP, passed on to ipcl_conn_create().
            *
            * NOTE(review): the result of ipcl_conn_create() is dereferenced without
            * a NULL check.  If it can fail (e.g. with KM_NOSLEEP) this would be a
            * NULL dereference, with *err left at 0 and the netstack hold not
            * released - confirm.
1663 1671   */
1664 1672  static conn_t *
1665 1673  rawip_do_open(int family, cred_t *credp, int *err, int flags)
1666 1674  {
1667 1675          icmp_t  *icmp;
1668 1676          conn_t *connp;
1669 1677          zoneid_t zoneid;
1670 1678          netstack_t *ns;
1671 1679          icmp_stack_t *is;
1672 1680          int len;
1673 1681          boolean_t isv6 = B_FALSE;
1674 1682  
1675 1683          *err = secpolicy_net_icmpaccess(credp);
1676 1684          if (*err != 0)
1677 1685                  return (NULL);
1678 1686  
1679 1687          if (family == AF_INET6)
1680 1688                  isv6 = B_TRUE;
1681 1689  
1682 1690          ns = netstack_find_by_cred(credp);
1683 1691          ASSERT(ns != NULL);
1684 1692          is = ns->netstack_icmp;
1685 1693          ASSERT(is != NULL);
1686 1694  
1687 1695          /*
1688 1696           * For exclusive stacks we set the zoneid to zero
1689 1697           * to make ICMP operate as if in the global zone.
1690 1698           */
1691 1699          if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1692 1700                  zoneid = GLOBAL_ZONEID;
1693 1701          else
1694 1702                  zoneid = crgetzoneid(credp);
1695 1703  
1696 1704          ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1697 1705  
1698 1706          connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1699 1707          icmp = connp->conn_icmp;
1700 1708  
1701 1709          /*
1702 1710           * ipcl_conn_create did a netstack_hold. Undo the hold that was
1703 1711           * done by netstack_find_by_cred()
1704 1712           */
1705 1713          netstack_rele(ns);
1706 1714  
1707 1715          /*
1708 1716           * Since this conn_t/icmp_t is not yet visible to anybody else we don't
1709 1717           * need to lock anything.
1710 1718           */
1711 1719          ASSERT(connp->conn_proto == IPPROTO_ICMP);
1712 1720          ASSERT(connp->conn_icmp == icmp);
1713 1721          ASSERT(icmp->icmp_connp == connp);
1714 1722  
1715 1723          /* Set the initial state of the stream and the privilege status. */
1716 1724          icmp->icmp_state = TS_UNBND;
1717 1725          connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1718 1726          if (isv6) {
1719 1727                  connp->conn_family = AF_INET6;
1720 1728                  connp->conn_ipversion = IPV6_VERSION;
1721 1729                  connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
                           /* NOTE(review): the same assignment is repeated just below. */
1722 1730                  connp->conn_proto = IPPROTO_ICMPV6;
1723 1731                  /* May be changed by a SO_PROTOTYPE socket option. */
1724 1732                  connp->conn_proto = IPPROTO_ICMPV6;
1725 1733                  connp->conn_ixa->ixa_protocol = connp->conn_proto;
1726 1734                  connp->conn_ixa->ixa_raw_cksum_offset = 2;
1727 1735                  connp->conn_default_ttl = is->is_ipv6_hoplimit;
1728 1736                  len = sizeof (ip6_t);
1729 1737          } else {
1730 1738                  connp->conn_family = AF_INET;
1731 1739                  connp->conn_ipversion = IPV4_VERSION;
1732 1740                  connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
1733 1741                  /* May be changed by a SO_PROTOTYPE socket option. */
1734 1742                  connp->conn_proto = IPPROTO_ICMP;
1735 1743                  connp->conn_ixa->ixa_protocol = connp->conn_proto;
1736 1744                  connp->conn_default_ttl = is->is_ipv4_ttl;
1737 1745                  len = sizeof (ipha_t);
1738 1746          }
1739 1747          connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
1740 1748  
1741 1749          connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1742 1750  
1743 1751          /*
1744 1752           * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
1745 1753           * the checksum is provided in the pre-built packet. We clear
1746 1754           * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
1747 1755           * complete IP header and not to compute the transport checksum.
                    * Here, at open time, IXAF_SET_ULP_CKSUM is turned on as the default.
1748 1756           */
1749 1757          connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
1750 1758          /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1751 1759          connp->conn_ixa->ixa_zoneid = zoneid;
1752 1760  
1753 1761          connp->conn_zoneid = zoneid;
1754 1762  
1755 1763          /*
1756 1764           * If the caller has the process-wide flag set, then default to MAC
1757 1765           * exempt mode.  This allows read-down to unlabeled hosts.
1758 1766           */
1759 1767          if (getpflags(NET_MAC_AWARE, credp) != 0)
1760 1768                  connp->conn_mac_mode = CONN_MAC_AWARE;
1761 1769  
1762 1770          connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
1763 1771  
1764 1772          icmp->icmp_is = is;
1765 1773  
                   /* Buffer sizes and water marks start from the per-stack defaults. */
1766 1774          connp->conn_rcvbuf = is->is_recv_hiwat;
1767 1775          connp->conn_sndbuf = is->is_xmit_hiwat;
1768 1776          connp->conn_sndlowat = is->is_xmit_lowat;
1769 1777          connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
1770 1778  
                   /* len is the size of the IP header for the chosen family. */
1771 1779          connp->conn_wroff = len + is->is_wroff_extra;
1772 1780          connp->conn_so_type = SOCK_RAW;
1773 1781  
1774 1782          connp->conn_recv = icmp_input;
1775 1783          connp->conn_recvicmp = icmp_icmp_input;
1776 1784          crhold(credp);
1777 1785          connp->conn_cred = credp;
1778 1786          connp->conn_cpid = curproc->p_pid;
1779 1787          connp->conn_open_time = ddi_get_lbolt64();
1780 1788          /* Cache things in ixa without an extra refhold */
1781 1789          ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1782 1790          connp->conn_ixa->ixa_cred = connp->conn_cred;
1783 1791          connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1784 1792          if (is_system_labeled())
1785 1793                  connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1786 1794  
1787 1795          connp->conn_flow_cntrld = B_FALSE;
1788 1796  
1789 1797          if (is->is_pmtu_discovery)
1790 1798                  connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
1791 1799  
1792 1800          return (connp);
1793 1801  }
1794 1802  
1795 1803  /*
1796 1804   * Which ICMP options OK to set through T_UNITDATA_REQ...
            *
            * Every level/name pair is accepted: both arguments are ignored and
            * B_TRUE is always returned.
1797 1805   */
1798 1806  /* ARGSUSED */
1799 1807  static boolean_t
1800 1808  icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1801 1809  {
1802 1810          return (B_TRUE);
1803 1811  }
1804 1812  
1805 1813  /*
1806 1814   * This routine gets default values of certain options whose default
1807 1815   * values are maintained by protocol specific code
            *
            * The default is written to ptr and its size in bytes is returned; -1 is
            * returned for any level/name pair not listed here.  The IPPROTO_IP
            * defaults are stored as a single uchar_t, the IPPROTO_IPV6 ones as an
            * int, and ICMP6_FILTER as an icmp6_filter_t set to "pass all".
1808 1816   */
1809 1817  int
1810 1818  icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1811 1819  {
1812 1820          icmp_t *icmp = Q_TO_ICMP(q);
1813 1821          icmp_stack_t *is = icmp->icmp_is;
1814 1822          int *i1 = (int *)ptr;
1815 1823  
1816 1824          switch (level) {
1817 1825          case IPPROTO_IP:
1818 1826                  switch (name) {
1819 1827                  case IP_MULTICAST_TTL:
1820 1828                          *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1821 1829                          return (sizeof (uchar_t));
1822 1830                  case IP_MULTICAST_LOOP:
1823 1831                          *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1824 1832                          return (sizeof (uchar_t));
1825 1833                  }
1826 1834                  break;
1827 1835          case IPPROTO_IPV6:
1828 1836                  switch (name) {
1829 1837                  case IPV6_MULTICAST_HOPS:
1830 1838                          *i1 = IP_DEFAULT_MULTICAST_TTL;
1831 1839                          return (sizeof (int));
1832 1840                  case IPV6_MULTICAST_LOOP:
1833 1841                          *i1 = IP_DEFAULT_MULTICAST_LOOP;
1834 1842                          return (sizeof (int));
1835 1843                  case IPV6_UNICAST_HOPS:
                                   /* The only default taken from the per-stack settings. */
1836 1844                          *i1 = is->is_ipv6_hoplimit;
1837 1845                          return (sizeof (int));
1838 1846                  }
1839 1847                  break;
1840 1848          case IPPROTO_ICMPV6:
1841 1849                  switch (name) {
1842 1850                  case ICMP6_FILTER:
1843 1851                          /* Make it look like "pass all" */
1844 1852                          ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1845 1853                          return (sizeof (icmp6_filter_t));
1846 1854                  }
1847 1855                  break;
1848 1856          }
1849 1857          return (-1);
1850 1858  }
1851 1859  
1852 1860  /*
1853 1861   * This routine retrieves the current status of socket options.
1854 1862   * It returns the size of the option retrieved, or -1.
            *
            * IP_HDRINCL, IPV6_CHECKSUM and ICMP6_FILTER are answered here under
            * conn_lock, and IP_OPTIONS/T_IP_OPTIONS yield a zero-length result.
            * The IPPROTO_IP, IPPROTO_IPV6 and IPPROTO_ICMPV6 levels are rejected
            * with -1 when the socket's address family does not match.  Everything
            * else is passed to conn_opt_get(), also under conn_lock.
1855 1863   */
1856 1864  int
1857 1865  icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1858 1866  {
1859 1867          icmp_t          *icmp = connp->conn_icmp;
1860 1868          int             *i1 = (int *)ptr;
1861 1869          conn_opt_arg_t  coas;
1862 1870          int             retval;
1863 1871  
                   /* Argument block for the conn_opt_get() call at the bottom. */
1864 1872          coas.coa_connp = connp;
1865 1873          coas.coa_ixa = connp->conn_ixa;
1866 1874          coas.coa_ipp = &connp->conn_xmit_ipp;
1867 1875          coas.coa_ancillary = B_FALSE;
1868 1876          coas.coa_changed = 0;
1869 1877  
1870 1878          /*
1871 1879           * We assume that the optcom framework has checked for the set
1872 1880           * of levels and names that are supported, hence we don't worry
1873 1881           * about rejecting based on that.
1874 1882           * First check for ICMP specific handling, then pass to common routine.
1875 1883           */
1876 1884          switch (level) {
1877 1885          case IPPROTO_IP:
1878 1886                  /*
1879 1887                   * Only allow IPv4 option processing on IPv4 sockets.
1880 1888                   */
1881 1889                  if (connp->conn_family != AF_INET)
1882 1890                          return (-1);
1883 1891  
1884 1892                  switch (name) {
1885 1893                  case IP_OPTIONS:
1886 1894                  case T_IP_OPTIONS:
1887 1895                          /* Options are passed up with each packet */
1888 1896                          return (0);
1889 1897                  case IP_HDRINCL:
1890 1898                          mutex_enter(&connp->conn_lock);
1891 1899                          *i1 = (int)icmp->icmp_hdrincl;
1892 1900                          mutex_exit(&connp->conn_lock);
1893 1901                          return (sizeof (int));
1894 1902                  }
1895 1903                  break;
1896 1904  
1897 1905          case IPPROTO_IPV6:
1898 1906                  /*
1899 1907                   * Only allow IPv6 option processing on native IPv6 sockets.
1900 1908                   */
1901 1909                  if (connp->conn_family != AF_INET6)
1902 1910                          return (-1);
1903 1911  
1904 1912                  switch (name) {
1905 1913                  case IPV6_CHECKSUM:
1906 1914                          /*
1907 1915                           * Return offset or -1 if no checksum offset.
1908 1916                           * Does not apply to IPPROTO_ICMPV6
                                    *
                                    * Note the two different -1 values: for an
                                    * IPPROTO_ICMPV6 socket the function itself returns
                                    * -1 (failure), otherwise -1 may be stored as the
                                    * option value with sizeof (int) returned.
1909 1917                           */
1910 1918                          if (connp->conn_proto == IPPROTO_ICMPV6)
1911 1919                                  return (-1);
1912 1920  
1913 1921                          mutex_enter(&connp->conn_lock);
1914 1922                          if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
1915 1923                                  *i1 = connp->conn_ixa->ixa_raw_cksum_offset;
1916 1924                          else
1917 1925                                  *i1 = -1;
1918 1926                          mutex_exit(&connp->conn_lock);
1919 1927                          return (sizeof (int));
1920 1928                  }
1921 1929                  break;
1922 1930  
1923 1931          case IPPROTO_ICMPV6:
1924 1932                  /*
1925 1933                   * Only allow IPv6 option processing on native IPv6 sockets.
1926 1934                   */
1927 1935                  if (connp->conn_family != AF_INET6)
1928 1936                          return (-1);
1929 1937  
                           /* The socket's protocol must also be ICMPv6. */
1930 1938                  if (connp->conn_proto != IPPROTO_ICMPV6)
1931 1939                          return (-1);
1932 1940  
1933 1941                  switch (name) {
1934 1942                  case ICMP6_FILTER:
1935 1943                          mutex_enter(&connp->conn_lock);
1936 1944                          if (icmp->icmp_filter == NULL) {
1937 1945                                  /* Make it look like "pass all" */
1938 1946                                  ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1939 1947                          } else {
1940 1948                                  (void) bcopy(icmp->icmp_filter, ptr,
1941 1949                                      sizeof (icmp6_filter_t));
1942 1950                          }
1943 1951                          mutex_exit(&connp->conn_lock);
1944 1952                          return (sizeof (icmp6_filter_t));
1945 1953                  }
                           /* Other names leave the switch and reach conn_opt_get(). */
1946 1954          }
1947 1955          mutex_enter(&connp->conn_lock);
1948 1956          retval = conn_opt_get(&coas, level, name, ptr);
1949 1957          mutex_exit(&connp->conn_lock);
1950 1958          return (retval);
1951 1959  }
1952 1960  
1953 1961  /*
1954 1962   * TPI variant of icmp_opt_get(): the conn_t is taken from the queue.
1955 1963   * It returns the size of the option retrieved, or -1.
1956 1964   */
  
    | 
      ↓ open down ↓ | 
    933 lines elided | 
    
      ↑ open up ↑ | 
  
1957 1965  int
1958 1966  icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1959 1967  {
1960 1968          conn_t          *connp = Q_TO_CONN(q);
1961 1969          int             err;
1962 1970  
                   /*
                    * Despite its name, err carries whatever icmp_opt_get() returns
                    * and is handed back to the caller unchanged.
                    */
1963 1971          err = icmp_opt_get(connp, level, name, ptr);
1964 1972          return (err);
1965 1973  }
1966 1974  
           /*
            * Install a BPF filter program on an ICMP endpoint.
            *
            * invalp/inlen describe a struct bpf_program (a struct bpf_program32 when
            * built _LP64 and the caller's data model is not native); any other
            * length yields EINVAL.  bf_len must be in the range 1..BPF_MAXINSNS and
            * the program must pass ip_bpf_validate(), else EINVAL.  The instruction
            * array is fetched from bf_insns with copyin(); a failure there yields
            * EFAULT.  Returns 0 on success.
            *
            * On success the new program replaces any previous one, which is freed,
            * with icmp_bpf_lock held as writer.  Note that icmp_bpf_len records the
            * program size in bytes, not the instruction count.
            *
            * NOTE(review): the buffer is typed ip_bpf_insn_t but sized with
            * sizeof (struct bpf_insn); presumably both have the same size - confirm.
            * NOTE(review): copyin() on bf_insns presumably requires running in the
            * context of the requesting process - confirm for all callers.
            */
     1975 +static int
     1976 +icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp)
     1977 +{
     1978 +        struct bpf_program prog;
     1979 +        ip_bpf_insn_t *insns = NULL;
     1980 +        unsigned int size;
     1981 +
     1982 +#ifdef _LP64
     1983 +        if (get_udatamodel() != DATAMODEL_NATIVE) {
     1984 +                struct bpf_program32 *prog32;
     1985 +
     1986 +                if (inlen != sizeof (struct bpf_program32)) {
     1987 +                        return (EINVAL);
     1988 +                }
     1989 +                prog32 = (struct bpf_program32 *)invalp;
     1990 +                prog.bf_len = prog32->bf_len;
     1991 +                prog.bf_insns = (void *)(uint64_t)prog32->bf_insns;
     1992 +        } else
     1993 +#endif
                   /* Under _LP64 the if below is the else branch of the check above. */
     1994 +        if (inlen == sizeof (struct bpf_program)) {
     1995 +                bcopy(invalp, &prog, sizeof (prog));
     1996 +        } else {
     1997 +                return (EINVAL);
     1998 +        }
     1999 +
     2000 +        if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) {
     2001 +                return (EINVAL);
     2002 +        }
     2003 +        size = prog.bf_len * sizeof (struct bpf_insn);
     2004 +        insns = kmem_alloc(size, KM_SLEEP);
     2005 +        if (copyin(prog.bf_insns, insns, size) != 0) {
     2006 +                kmem_free(insns, size);
     2007 +                return (EFAULT);
     2008 +        }
     2009 +        if (!ip_bpf_validate(insns, prog.bf_len)) {
     2010 +                kmem_free(insns, size);
     2011 +                return (EINVAL);
     2012 +        }
     2013 +
                   /* Swap in the validated program, freeing any previous one. */
     2014 +        rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
     2015 +        if (icmp->icmp_bpf_len != 0) {
     2016 +                ASSERT(icmp->icmp_bpf_prog != NULL);
     2017 +
     2018 +                kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
     2019 +        }
     2020 +        icmp->icmp_bpf_len = size;
     2021 +        icmp->icmp_bpf_prog = insns;
     2022 +        rw_exit(&icmp->icmp_bpf_lock);
     2023 +        return (0);
     2024 +}
     2025 +
           /*
            * Remove the BPF filter program from an ICMP endpoint, if one is attached.
            * Returns 0 after freeing the program and clearing icmp_bpf_len and
            * icmp_bpf_prog, or ENOENT when no program is attached.  The update is
            * made with icmp_bpf_lock held as writer.
            */
     2026 +static int
     2027 +icmp_detach_filter(icmp_t *icmp)
     2028 +{
     2029 +        int error;
     2030 +
     2031 +        rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
     2032 +        if (icmp->icmp_bpf_len == 0) {
     2033 +                ASSERT(icmp->icmp_bpf_prog == NULL);
     2034 +                error = ENOENT;
     2035 +        } else {
     2036 +                kmem_free(icmp->icmp_bpf_prog,
     2037 +                    icmp->icmp_bpf_len);
     2038 +                icmp->icmp_bpf_len = 0;
     2039 +                icmp->icmp_bpf_prog = NULL;
     2040 +                error = 0;
     2041 +        }
     2042 +        rw_exit(&icmp->icmp_bpf_lock);
     2043 +        return (error);
     2044 +}
     2045 +
           /*
            * Run the attached BPF program, if any, over the packet in mp.
            *
            * Returns B_FALSE when no program is attached.  Otherwise returns B_TRUE
            * exactly when ip_bpf_filter() returns zero - presumably meaning the
            * packet was rejected and should be dropped; confirm against the caller.
            * The program is read with icmp_bpf_lock held as reader.
            *
            * The buffer handed to the filter starts at mp->b_rptr; len comes from
            * MBLKL(mp) and any b_cont chain is not consulted here.  wirelen is taken
            * from the IP header: ipha_length for IPv4 (selected by IRAF_IS_IPV4 in
            * ira_flags), or ip6_plen + IPV6_HDR_LEN otherwise.
            *
            * NOTE(review): the IP header is read without checking that len covers
            * it; presumably the caller guarantees that - confirm.
            */
     2046 +static boolean_t
     2047 +icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira)
     2048 +{
     2049 +        boolean_t res;
     2050 +        uchar_t *buf = mp->b_rptr;
     2051 +        uint_t wirelen, len = MBLKL(mp);
     2052 +
     2053 +        rw_enter(&icmp->icmp_bpf_lock, RW_READER);
     2054 +        if (icmp->icmp_bpf_len == 0) {
     2055 +                rw_exit(&icmp->icmp_bpf_lock);
     2056 +                return (B_FALSE);
     2057 +        }
     2058 +        if (ira->ira_flags & IRAF_IS_IPV4) {
     2059 +                ipha_t *ipha = (ipha_t *)buf;
     2060 +
     2061 +                wirelen = ntohs(ipha->ipha_length);
     2062 +        } else {
     2063 +                ip6_t *ip6h = (ip6_t *)buf;
     2064 +
     2065 +                wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
     2066 +        }
     2067 +        res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len);
     2068 +        rw_exit(&icmp->icmp_bpf_lock);
     2069 +
     2070 +        return (res);
     2071 +}
     2072 +
1967 2073  /*
1968 2074   * This routine sets socket options.
            *
            * Options handled here are SO_PROTOTYPE, SO_SNDBUF, SO_RCVBUF,
            * SO_ATTACH_FILTER and SO_DETACH_FILTER (SOL_SOCKET), IP_HDRINCL
            * (IPPROTO_IP), IPV6_CHECKSUM (IPPROTO_IPV6) and ICMP6_FILTER
            * (IPPROTO_ICMPV6).  Cases that break out of the switch continue
            * into conn_opt_set(); cases that return do not.
            *
            * When checkonly is set, SO_PROTOTYPE, IP_HDRINCL, IPV6_CHECKSUM and
            * ICMP6_FILTER skip their state changes in this function.
            *
            * Returns 0 on success or an errno value.
1969 2075   */
1970 2076  int
1971 2077  icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
1972 2078      uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
1973 2079  {
1974 2080          conn_t          *connp = coa->coa_connp;
1975 2081          ip_xmit_attr_t  *ixa = coa->coa_ixa;
1976 2082          icmp_t          *icmp = connp->conn_icmp;
1977 2083          icmp_stack_t    *is = icmp->icmp_is;
1978 2084          int             *i1 = (int *)invalp;
1979 2085          boolean_t       onoff = (*i1 == 0) ? 0 : 1;
1980 2086          int             error;
1981 2087  
1982 2088          ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
1983 2089  
1984 2090          /*
1985 2091           * For fixed length options, no sanity check
1986 2092           * of passed in length is done. It is assumed *_optcom_req()
1987 2093           * routines do the right thing.
1988 2094           */
1989 2095  
1990 2096          switch (level) {
1991 2097          case SOL_SOCKET:
1992 2098                  switch (name) {
1993 2099                  case SO_PROTOTYPE:
                                   /*
                                    * Protocols other than ICMP/ICMPv6 need the
                                    * secpolicy_net_rawaccess() check to pass.
                                    */
1994 2100                          if ((*i1 & 0xFF) != IPPROTO_ICMP &&
1995 2101                              (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
1996 2102                              secpolicy_net_rawaccess(cr) != 0) {
1997 2103                                  return (EACCES);
1998 2104                          }
1999 2105                          if (checkonly)
2000 2106                                  break;
2001 2107  
2002 2108                          mutex_enter(&connp->conn_lock);
2003 2109                          connp->conn_proto = *i1 & 0xFF;
2004 2110                          ixa->ixa_protocol = connp->conn_proto;
2005 2111                          if ((connp->conn_proto == IPPROTO_RAW ||
2006 2112                              connp->conn_proto == IPPROTO_IGMP) &&
2007 2113                              connp->conn_family == AF_INET) {
2008 2114                                  icmp->icmp_hdrincl = 1;
2009 2115                                  ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2010 2116                          } else if (connp->conn_proto == IPPROTO_UDP ||
2011 2117                              connp->conn_proto == IPPROTO_TCP ||
2012 2118                              connp->conn_proto == IPPROTO_SCTP) {
2013 2119                                  /* Used by test applications like psh */
2014 2120                                  icmp->icmp_hdrincl = 0;
2015 2121                                  ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2016 2122                          } else {
2017 2123                                  icmp->icmp_hdrincl = 0;
2018 2124                                  ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2019 2125                          }
2020 2126  
2021 2127                          if (connp->conn_family == AF_INET6 &&
2022 2128                              connp->conn_proto == IPPROTO_ICMPV6) {
2023 2129                                  /* Set offset for icmp6_cksum */
2024 2130                                  ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2025 2131                                  ixa->ixa_raw_cksum_offset = 2;
2026 2132                          }
                                   /*
                                    * An ICMPv6 type filter is dropped once the
                                    * protocol is no longer ICMPv6.
                                    */
2027 2133                          if (icmp->icmp_filter != NULL &&
2028 2134                              connp->conn_proto != IPPROTO_ICMPV6) {
2029 2135                                  kmem_free(icmp->icmp_filter,
2030 2136                                      sizeof (icmp6_filter_t));
2031 2137                                  icmp->icmp_filter = NULL;
2032 2138                          }
2033 2139                          mutex_exit(&connp->conn_lock);
2034 2140  
2035 2141                          coa->coa_changed |= COA_HEADER_CHANGED;
2036 2142                          /*
2037 2143                           * For SCTP, we don't use icmp_bind_proto() for
2038 2144                           * raw socket binding.
2039 2145                           */
2040 2146                          if (connp->conn_proto == IPPROTO_SCTP)
2041 2147                                  return (0);
2042 2148  
2043 2149                          coa->coa_changed |= COA_ICMP_BIND_NEEDED;
2044 2150                          return (0);
2045 2151  
  
    | 
      ↓ open down ↓ | 
    69 lines elided | 
    
      ↑ open up ↑ | 
  
2046 2152                  case SO_SNDBUF:
2047 2153                          if (*i1 > is->is_max_buf) {
2048 2154                                  return (ENOBUFS);
2049 2155                          }
2050 2156                          break;
2051 2157                  case SO_RCVBUF:
2052 2158                          if (*i1 > is->is_max_buf) {
2053 2159                                  return (ENOBUFS);
2054 2160                          }
2055 2161                          break;
                           /*
                            * Filter attach/detach return directly and so skip
                            * conn_opt_set() below.
                            * NOTE(review): checkonly is not consulted on these
                            * two paths -- confirm that is intended.
                            */
     2162 +                case SO_ATTACH_FILTER:
     2163 +                        return (icmp_attach_filter(icmp, inlen, invalp));
     2164 +                case SO_DETACH_FILTER:
     2165 +                        return (icmp_detach_filter(icmp));
2056 2166                  }
2057 2167                  break;
2058 2168  
2059 2169          case IPPROTO_IP:
2060 2170                  /*
2061 2171                   * Only allow IPv4 option processing on IPv4 sockets.
2062 2172                   */
2063 2173                  if (connp->conn_family != AF_INET)
2064 2174                          return (EINVAL);
2065 2175  
2066 2176                  switch (name) {
2067 2177                  case IP_HDRINCL:
2068 2178                          if (!checkonly) {
2069 2179                                  mutex_enter(&connp->conn_lock);
2070 2180                                  icmp->icmp_hdrincl = onoff;
2071 2181                                  if (onoff)
2072 2182                                          ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2073 2183                                  else
2074 2184                                          ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2075 2185                                  mutex_exit(&connp->conn_lock);
2076 2186                          }
2077 2187                          break;
2078 2188                  }
2079 2189                  break;
2080 2190  
2081 2191          case IPPROTO_IPV6:
2082 2192                  if (connp->conn_family != AF_INET6)
2083 2193                          return (EINVAL);
2084 2194  
2085 2195                  switch (name) {
2086 2196                  case IPV6_CHECKSUM:
2087 2197                          /*
2088 2198                           * Integer offset into the user data of where the
2089 2199                           * checksum is located.
2090 2200                           * Offset of -1 disables option.
2091 2201                           * Does not apply to IPPROTO_ICMPV6.
2092 2202                           */
2093 2203                          if (connp->conn_proto == IPPROTO_ICMPV6 ||
2094 2204                              coa->coa_ancillary) {
2095 2205                                  return (EINVAL);
2096 2206                          }
2097 2207                          if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2098 2208                                  /* Negative or not 16 bit aligned offset */
2099 2209                                  return (EINVAL);
2100 2210                          }
2101 2211                          if (checkonly)
2102 2212                                  break;
2103 2213  
2104 2214                          mutex_enter(&connp->conn_lock);
2105 2215                          if (*i1 == -1) {
2106 2216                                  ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2107 2217                                  ixa->ixa_raw_cksum_offset = 0;
2108 2218                                  ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2109 2219                          } else {
2110 2220                                  ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
2111 2221                                  ixa->ixa_raw_cksum_offset = *i1;
2112 2222                                  ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2113 2223                          }
2114 2224                          mutex_exit(&connp->conn_lock);
2115 2225                          break;
2116 2226                  }
2117 2227                  break;
2118 2228  
2119 2229          case IPPROTO_ICMPV6:
2120 2230                  /*
2121 2231                   * Only allow IPv6 option processing on IPv6 sockets.
2122 2232                   */
2123 2233                  if (connp->conn_family != AF_INET6)
2124 2234                          return (EINVAL);
2125 2235                  if (connp->conn_proto != IPPROTO_ICMPV6)
2126 2236                          return (EINVAL);
2127 2237  
2128 2238                  switch (name) {
2129 2239                  case ICMP6_FILTER:
2130 2240                          if (checkonly)
2131 2241                                  break;
2132 2242  
                                   /*
                                    * A zero length clears the filter; otherwise
                                    * the value must be exactly one icmp6_filter_t.
                                    */
2133 2243                          if ((inlen != 0) &&
2134 2244                              (inlen != sizeof (icmp6_filter_t)))
2135 2245                                  return (EINVAL);
2136 2246  
2137 2247                          mutex_enter(&connp->conn_lock);
2138 2248                          if (inlen == 0) {
2139 2249                                  if (icmp->icmp_filter != NULL) {
2140 2250                                          kmem_free(icmp->icmp_filter,
2141 2251                                              sizeof (icmp6_filter_t));
2142 2252                                          icmp->icmp_filter = NULL;
2143 2253                                  }
2144 2254                          } else {
2145 2255                                  if (icmp->icmp_filter == NULL) {
2146 2256                                          icmp->icmp_filter = kmem_alloc(
2147 2257                                              sizeof (icmp6_filter_t),
2148 2258                                              KM_NOSLEEP);
2149 2259                                          if (icmp->icmp_filter == NULL) {
2150 2260                                                  mutex_exit(&connp->conn_lock);
2151 2261                                                  return (ENOBUFS);
2152 2262                                          }
2153 2263                                  }
2154 2264                                  (void) bcopy(invalp, icmp->icmp_filter, inlen);
2155 2265                          }
2156 2266                          mutex_exit(&connp->conn_lock);
2157 2267                          break;
2158 2268                  }
2159 2269                  break;
2160 2270          }
                   /* Every path that did not return above goes to conn_opt_set(). */
2161 2271          error = conn_opt_set(coa, level, name, inlen, invalp,
2162 2272              checkonly, cr);
2163 2273          return (error);
2164 2274  }
2165 2275  
2166 2276  /*
2167 2277   * This routine sets socket options.
            *
            * It maps optset_context to a checkonly flag, builds a conn_opt_arg_t
            * (or uses the one passed in thisdg_attrs), calls icmp_do_opt_set(),
            * and then applies the follow-up work recorded in coa_changed.
            *
            * On success the input value is copied to outvalp (when the buffers
            * differ) and *outlenp is set to inlen.  On the error returns taken
            * before that copy, *outlenp is set to 0.
            *
            * Returns 0 on success or an errno value.
2168 2278   */
2169 2279  int
2170 2280  icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
2171 2281      uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2172 2282      void *thisdg_attrs, cred_t *cr)
2173 2283  {
2174 2284          icmp_t          *icmp = connp->conn_icmp;
2175 2285          int             err;
2176 2286          conn_opt_arg_t  coas, *coa;
2177 2287          boolean_t       checkonly;
2178 2288          icmp_stack_t    *is = icmp->icmp_is;
2179 2289  
2180 2290          switch (optset_context) {
2181 2291          case SETFN_OPTCOM_CHECKONLY:
2182 2292                  checkonly = B_TRUE;
2183 2293                  /*
2184 2294                   * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2185 2295                   * inlen != 0 implies value supplied and
2186 2296                   *      we have to "pretend" to set it.
2187 2297                   * inlen == 0 implies that there is no
2188 2298                   *      value part in T_CHECK request and just validation
2189 2299                   * done elsewhere should be enough, we just return here.
2190 2300                   */
2191 2301                  if (inlen == 0) {
2192 2302                          *outlenp = 0;
2193 2303                          return (0);
2194 2304                  }
2195 2305                  break;
2196 2306          case SETFN_OPTCOM_NEGOTIATE:
2197 2307                  checkonly = B_FALSE;
2198 2308                  break;
2199 2309          case SETFN_UD_NEGOTIATE:
2200 2310          case SETFN_CONN_NEGOTIATE:
2201 2311                  checkonly = B_FALSE;
2202 2312                  /*
2203 2313                   * Negotiating local and "association-related" options
2204 2314                   * through T_UNITDATA_REQ.
2205 2315                   *
2206 2316                   * Following routine can filter out ones we do not
2207 2317                   * want to be "set" this way.
2208 2318                   */
2209 2319                  if (!icmp_opt_allow_udr_set(level, name)) {
2210 2320                          *outlenp = 0;
2211 2321                          return (EINVAL);
2212 2322                  }
2213 2323                  break;
2214 2324          default:
2215 2325                  /*
2216 2326                   * We should never get here
2217 2327                   */
2218 2328                  *outlenp = 0;
2219 2329                  return (EINVAL);
2220 2330          }
2221 2331  
2222 2332          ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2223 2333              (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2224 2334  
2225 2335          if (thisdg_attrs != NULL) {
2226 2336                  /* Options from T_UNITDATA_REQ */
2227 2337                  coa = (conn_opt_arg_t *)thisdg_attrs;
2228 2338                  ASSERT(coa->coa_connp == connp);
2229 2339                  ASSERT(coa->coa_ixa != NULL);
2230 2340                  ASSERT(coa->coa_ipp != NULL);
2231 2341                  ASSERT(coa->coa_ancillary);
2232 2342          } else {
2233 2343                  coa = &coas;
2234 2344                  coas.coa_connp = connp;
2235 2345                  /* Get a reference on conn_ixa to prevent concurrent mods */
2236 2346                  coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
2237 2347                  if (coas.coa_ixa == NULL) {
2238 2348                          *outlenp = 0;
2239 2349                          return (ENOMEM);
2240 2350                  }
2241 2351                  coas.coa_ipp = &connp->conn_xmit_ipp;
2242 2352                  coas.coa_ancillary = B_FALSE;
2243 2353                  coas.coa_changed = 0;
2244 2354          }
2245 2355  
2246 2356          err = icmp_do_opt_set(coa, level, name, inlen, invalp,
2247 2357              cr, checkonly);
                   /*
                    * On failure, drop the ixa reference taken above; the
                    * ancillary case did not take one.
                    * NOTE(review): no "goto errout" is visible inside this
                    * function; confirm whether the label is still needed.
                    */
2248 2358          if (err != 0) {
2249 2359  errout:
2250 2360                  if (!coa->coa_ancillary)
2251 2361                          ixa_refrele(coa->coa_ixa);
2252 2362                  *outlenp = 0;
2253 2363                  return (err);
2254 2364          }
2255 2365  
2256 2366          /*
2257 2367           * Common case of OK return with outval same as inval.
2258 2368           */
2259 2369          if (invalp != outvalp) {
2260 2370                  /* don't trust bcopy for identical src/dst */
2261 2371                  (void) bcopy(invalp, outvalp, inlen);
2262 2372          }
2263 2373          *outlenp = inlen;
2264 2374  
2265 2375          /*
2266 2376           * If this was not ancillary data, then we rebuild the headers,
2267 2377           * update the IRE/NCE, and IPsec as needed.
2268 2378           * Since the label depends on the destination we go through
2269 2379           * ip_set_destination first.
2270 2380           */
2271 2381          if (coa->coa_ancillary) {
2272 2382                  return (0);
2273 2383          }
2274 2384  
2275 2385          if (coa->coa_changed & COA_ROUTE_CHANGED) {
2276 2386                  in6_addr_t saddr, faddr, nexthop;
2277 2387                  in_port_t fport;
2278 2388  
2279 2389                  /*
2280 2390                   * We clear lastdst to make sure we pick up the change
2281 2391                   * next time sending.
2282 2392                   * If we are connected we re-cache the information.
2283 2393                   * We ignore errors to preserve BSD behavior.
2284 2394                   * Note that we don't redo IPsec policy lookup here
2285 2395                   * since the final destination (or source) didn't change.
2286 2396                   */
2287 2397                  mutex_enter(&connp->conn_lock);
2288 2398                  connp->conn_v6lastdst = ipv6_all_zeros;
2289 2399  
2290 2400                  ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
2291 2401                      &connp->conn_faddr_v6, &nexthop);
2292 2402                  saddr = connp->conn_saddr_v6;
2293 2403                  faddr = connp->conn_faddr_v6;
2294 2404                  fport = connp->conn_fport;
2295 2405                  mutex_exit(&connp->conn_lock);
2296 2406  
2297 2407                  if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
2298 2408                      !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
2299 2409                          (void) ip_attr_connect(connp, coa->coa_ixa,
2300 2410                              &saddr, &faddr, &nexthop, fport, NULL, NULL,
2301 2411                              IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
2302 2412                  }
2303 2413          }
2304 2414  
                   /* Done with the ixa; the remaining steps only use connp. */
2305 2415          ixa_refrele(coa->coa_ixa);
2306 2416  
2307 2417          if (coa->coa_changed & COA_HEADER_CHANGED) {
2308 2418                  /*
2309 2419                   * Rebuild the header template if we are connected.
2310 2420                   * Otherwise clear conn_v6lastdst so we rebuild the header
2311 2421                   * in the data path.
2312 2422                   */
2313 2423                  mutex_enter(&connp->conn_lock);
2314 2424                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2315 2425                      !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2316 2426                          err = icmp_build_hdr_template(connp,
2317 2427                              &connp->conn_saddr_v6, &connp->conn_faddr_v6,
2318 2428                              connp->conn_flowinfo);
2319 2429                          if (err != 0) {
2320 2430                                  mutex_exit(&connp->conn_lock);
2321 2431                                  return (err);
2322 2432                          }
2323 2433                  } else {
2324 2434                          connp->conn_v6lastdst = ipv6_all_zeros;
2325 2435                  }
2326 2436                  mutex_exit(&connp->conn_lock);
2327 2437          }
2328 2438          if (coa->coa_changed & COA_RCVBUF_CHANGED) {
2329 2439                  (void) proto_set_rx_hiwat(connp->conn_rq, connp,
2330 2440                      connp->conn_rcvbuf);
2331 2441          }
2332 2442          if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
2333 2443                  connp->conn_wq->q_hiwat = connp->conn_sndbuf;
2334 2444          }
2335 2445          if (coa->coa_changed & COA_WROFF_CHANGED) {
2336 2446                  /* Increase wroff if needed */
2337 2447                  uint_t wroff;
2338 2448  
2339 2449                  mutex_enter(&connp->conn_lock);
2340 2450                  wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
2341 2451                  if (wroff > connp->conn_wroff) {
2342 2452                          connp->conn_wroff = wroff;
2343 2453                          mutex_exit(&connp->conn_lock);
2344 2454                          (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
2345 2455                  } else {
2346 2456                          mutex_exit(&connp->conn_lock);
2347 2457                  }
2348 2458          }
2349 2459          if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
2350 2460                  icmp_bind_proto(icmp);
2351 2461          }
2352 2462          return (err);
2353 2463  }
2354 2464  
2355 2465  /* This routine sets socket options. */
           /*
            * Queue-based entry point: converts the queue to its conn_t with
            * Q_TO_CONN() and passes every other argument unchanged to
            * icmp_opt_set(), returning its result.
            */
2356 2466  int
2357 2467  icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2358 2468      uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2359 2469      void *thisdg_attrs, cred_t *cr)
2360 2470  {
2361 2471          conn_t  *connp = Q_TO_CONN(q);
2362 2472          int error;
2363 2473  
2364 2474          error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
2365 2475              outlenp, outvalp, thisdg_attrs, cr);
2366 2476          return (error);
2367 2477  }
2368 2478  
2369 2479  /*
2370 2480   * Setup IP headers.
2371 2481   *
2372 2482   * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
2373 2483   * but icmp_output_hdrincl restores ipha_protocol once we return.
            *
            * Prepends the headers to data_mp via conn_prepend_hdr() and sets
            * ixa->ixa_pktlen to the data length plus ixa_ip_hdr_length.  For IPv6,
            * when a checksum offset applies, the folded checksum seed is stored
            * at that offset in the first mblk.
            *
            * Returns the resulting message, or NULL with *errorp set when
            * conn_prepend_hdr() fails or the pullup for the checksum field
            * cannot be allocated (ENOMEM).
2374 2484   */
2375 2485  mblk_t *
2376 2486  icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2377 2487      const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
2378 2488      mblk_t *data_mp, int *errorp)
2379 2489  {
2380 2490          mblk_t          *mp;
2381 2491          icmp_stack_t    *is = connp->conn_netstack->netstack_icmp;
2382 2492          uint_t          data_len;
2383 2493          uint32_t        cksum;
2384 2494  
2385 2495          data_len = msgdsize(data_mp);
2386 2496          mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
2387 2497              flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
2388 2498          if (mp == NULL) {
2389 2499                  ASSERT(*errorp != 0);
2390 2500                  return (NULL);
2391 2501          }
2392 2502  
2393 2503          ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2394 2504  
2395 2505          /*
2396 2506           * If there was a routing option/header then conn_prepend_hdr
2397 2507           * has massaged it and placed the pseudo-header checksum difference
2398 2508           * in the cksum argument.
2399 2509           *
2400 2510           * Prepare for ICMPv6 checksum done in IP.
2401 2511           *
2402 2512           * We make it easy for IP to include our pseudo header
2403 2513           * by putting our length (and any routing header adjustment)
2404 2514           * in the ICMPv6 checksum field.
2405 2515           * The IP source, destination, and length have already been set by
2406 2516           * conn_prepend_hdr.
2407 2517           */
2408 2518          cksum += data_len;
                   /* Fold the carry back into the low 16 bits. */
2409 2519          cksum = (cksum >> 16) + (cksum & 0xFFFF);
2410 2520          ASSERT(cksum < 0x10000);
2411 2521  
2412 2522          if (ixa->ixa_flags & IXAF_IS_IPV4) {
2413 2523                  ipha_t  *ipha = (ipha_t *)mp->b_rptr;
2414 2524  
2415 2525                  ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2416 2526          } else {
2417 2527                  ip6_t   *ip6h = (ip6_t *)mp->b_rptr;
2418 2528                  uint_t  cksum_offset = 0;
2419 2529  
2420 2530                  ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2421 2531  
                           /*
                            * cksum_offset stays 0 (nothing is written) unless
                            * IXAF_SET_ULP_CKSUM is set and the protocol is ICMPv6
                            * or IXAF_SET_RAW_CKSUM is set.
                            */
2422 2532                  if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
2423 2533                          if (connp->conn_proto == IPPROTO_ICMPV6) {
2424 2534                                  cksum_offset = ixa->ixa_ip_hdr_length +
2425 2535                                      offsetof(icmp6_t, icmp6_cksum);
2426 2536                          } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2427 2537                                  cksum_offset = ixa->ixa_ip_hdr_length +
2428 2538                                      ixa->ixa_raw_cksum_offset;
2429 2539                          }
2430 2540                  }
2431 2541                  if (cksum_offset != 0) {
2432 2542                          uint16_t *ptr;
2433 2543  
2434 2544                          /* Make sure the checksum fits in the first mblk */
2435 2545                          if (cksum_offset + sizeof (short) > MBLKL(mp)) {
2436 2546                                  mblk_t *mp1;
2437 2547  
2438 2548                                  mp1 = msgpullup(mp,
2439 2549                                      cksum_offset + sizeof (short));
                                           /* The original is freed whether or not the pullup worked. */
2440 2550                                  freemsg(mp);
2441 2551                                  if (mp1 == NULL) {
2442 2552                                          *errorp = ENOMEM;
2443 2553                                          return (NULL);
2444 2554                                  }
2445 2555                                  mp = mp1;
2446 2556                                  ip6h = (ip6_t *)mp->b_rptr;
2447 2557                          }
2448 2558                          ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
2449 2559                          *ptr = htons(cksum);
2450 2560                  }
2451 2561          }
2452 2562  
2453 2563          /* Note that we don't try to update wroff due to ancillary data */
2454 2564          return (mp);
2455 2565  }
2456 2566  
2457 2567  static int
2458 2568  icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2459 2569      const in6_addr_t *v6dst, uint32_t flowinfo)
2460 2570  {
                   /*
                    * Rebuild the connection's header template through
                    * conn_build_hdr_template().  The caller must hold
                    * conn_lock.  Returns 0 on success or the error from
                    * conn_build_hdr_template().
                    */
2461 2571          int             error;
2462 2572  
2463 2573          ASSERT(MUTEX_HELD(&connp->conn_lock));
2464 2574          /*
2465 2575           * We clear lastdst to make sure we don't use the lastdst path
2466 2576           * next time sending since we might not have set v6dst yet.
2467 2577           */
2468 2578          connp->conn_v6lastdst = ipv6_all_zeros;
2469 2579  
2470 2580          error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
2471 2581          if (error != 0)
2472 2582                  return (error);
2473 2583  
2474 2584          /*
2475 2585           * Any routing header/option has been massaged. The checksum difference
2476 2586           * is stored in conn_sum.
2477 2587           */
2478 2588          return (0);
2479 2589  }
2480 2590  
2481 2591  static mblk_t *
2482 2592  icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
2483 2593  {
                   /*
                    * Called with icmp_recv_lock held.  While the conn is still
                    * IPCL_IS_NONSTR, append mp to the fallback queue (linked
                    * through b_next) and return NULL; otherwise return mp so
                    * the caller can deliver it.
                    */
2484 2594          ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
2485 2595          if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
2486 2596                  /*
2487 2597                   * fallback has started but messages have not been moved yet
2488 2598                   */
2489 2599                  if (icmp->icmp_fallback_queue_head == NULL) {
2490 2600                          ASSERT(icmp->icmp_fallback_queue_tail == NULL);
2491 2601                          icmp->icmp_fallback_queue_head = mp;
2492 2602                          icmp->icmp_fallback_queue_tail = mp;
2493 2603                  } else {
2494 2604                          ASSERT(icmp->icmp_fallback_queue_tail != NULL);
2495 2605                          icmp->icmp_fallback_queue_tail->b_next = mp;
2496 2606                          icmp->icmp_fallback_queue_tail = mp;
2497 2607                  }
2498 2608                  return (NULL);
2499 2609          } else {
2500 2610                  /*
2501 2611                   * Fallback completed, let the caller putnext() the mblk.
2502 2612                   */
2503 2613                  return (mp);
2504 2614          }
2505 2615  }
2506 2616  
2507 2617  /*
2508 2618   * Deliver data to ULP. In case we have a socket, and it's falling back to
2509 2619   * TPI, then we'll queue the mp for later processing.
            *
            * For an IPCL_IS_NONSTR conn the message goes up through the su_recv
            * upcall, and len must equal msgdsize(mp).  If the upcall fails with
            * ENOSPC, the condition is re-checked under icmp_recv_lock and
            * conn_flow_cntrld is set.  Any other failure is asserted to be
            * EOPNOTSUPP and the message is handed to icmp_queue_fallback().
            * All other conns get a plain putnext() on conn_rq.
2510 2620   */
2511 2621  static void
2512 2622  icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
2513 2623  {
2514 2624          if (IPCL_IS_NONSTR(connp)) {
2515 2625                  icmp_t *icmp = connp->conn_icmp;
2516 2626                  int error;
2517 2627  
2518 2628                  ASSERT(len == msgdsize(mp));
2519 2629                  if ((*connp->conn_upcalls->su_recv)
2520 2630                      (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2521 2631                          mutex_enter(&icmp->icmp_recv_lock);
2522 2632                          if (error == ENOSPC) {
2523 2633                                  /*
2524 2634                                   * let's confirm while holding the lock
2525 2635                                   */
2526 2636                                  if ((*connp->conn_upcalls->su_recv)
2527 2637                                      (connp->conn_upper_handle, NULL, 0, 0,
2528 2638                                      &error, NULL) < 0) {
2529 2639                                          ASSERT(error == ENOSPC);
2530 2640                                          if (error == ENOSPC) {
2531 2641                                                  connp->conn_flow_cntrld =
2532 2642                                                      B_TRUE;
2533 2643                                          }
2534 2644                                  }
2535 2645                                  mutex_exit(&icmp->icmp_recv_lock);
2536 2646                          } else {
2537 2647                                  ASSERT(error == EOPNOTSUPP);
2538 2648                                  mp = icmp_queue_fallback(icmp, mp);
2539 2649                                  mutex_exit(&icmp->icmp_recv_lock);
                                           /* Non-NULL means fallback already finished. */
2540 2650                                  if (mp != NULL)
2541 2651                                          putnext(connp->conn_rq, mp);
2542 2652                          }
2543 2653                  }
2544 2654                  ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
2545 2655          } else {
2546 2656                  putnext(connp->conn_rq, mp);
2547 2657          }
2548 2658  }
2549 2659  
2550 2660  /*
2551 2661   * This is the inbound data path.
2552 2662   * IP has already pulled up the IP headers and verified alignment
2553 2663   * etc.
2554 2664   */
2555 2665  /* ARGSUSED2 */
2556 2666  static void
2557 2667  icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2558 2668  {
2559 2669          conn_t                  *connp = (conn_t *)arg1;
2560 2670          struct T_unitdata_ind   *tudi;
2561 2671          uchar_t                 *rptr;          /* Pointer to IP header */
2562 2672          int                     ip_hdr_length;
2563 2673          int                     udi_size;       /* Size of T_unitdata_ind */
2564 2674          int                     pkt_len;
2565 2675          icmp_t                  *icmp;
2566 2676          ip_pkt_t                ipps;
2567 2677          ip6_t                   *ip6h;
2568 2678          mblk_t                  *mp1;
2569 2679          crb_t                   recv_ancillary;
2570 2680          icmp_stack_t            *is;
2571 2681          sin_t                   *sin;
2572 2682          sin6_t                  *sin6;
2573 2683          ipha_t                  *ipha;
2574 2684  
2575 2685          ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2576 2686  
2577 2687          icmp = connp->conn_icmp;
2578 2688          is = icmp->icmp_is;
2579 2689          rptr = mp->b_rptr;
2580 2690  
2581 2691          ASSERT(DB_TYPE(mp) == M_DATA);
2582 2692          ASSERT(OK_32PTR(rptr));
2583 2693          ASSERT(ira->ira_pktlen == msgdsize(mp));
2584 2694          pkt_len = ira->ira_pktlen;
2585 2695  
2586 2696          /*
2587 2697           * Get a snapshot of these and allow other threads to change
2588 2698           * them after that. We need the same recv_ancillary when determining
2589 2699           * the size as when adding the ancillary data items.
2590 2700           */
  
    | 
      ↓ open down ↓ | 
    525 lines elided | 
    
      ↑ open up ↑ | 
  
2591 2701          mutex_enter(&connp->conn_lock);
2592 2702          recv_ancillary = connp->conn_recv_ancillary;
2593 2703          mutex_exit(&connp->conn_lock);
2594 2704  
2595 2705          ip_hdr_length = ira->ira_ip_hdr_length;
2596 2706          ASSERT(MBLKL(mp) >= ip_hdr_length);     /* IP did a pullup */
2597 2707  
2598 2708          /* Initialize regardless of IP version */
2599 2709          ipps.ipp_fields = 0;
2600 2710  
     2711 +        /* Apply socket filter, if needed */
     2712 +        if (icmp->icmp_bpf_len != 0) {
     2713 +                if (icmp_eval_filter(icmp, mp, ira)) {
     2714 +                        freemsg(mp);
     2715 +                        return;
     2716 +                }
     2717 +        }
     2718 +
2601 2719          if (ira->ira_flags & IRAF_IS_IPV4) {
2602 2720                  ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
2603 2721                  ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2604 2722                  ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
2605 2723  
2606 2724                  ipha = (ipha_t *)mp->b_rptr;
2607 2725                  if (recv_ancillary.crb_all != 0)
2608 2726                          (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);
2609 2727  
2610 2728                  /*
2611 2729                   * BSD for some reason adjusts ipha_length to exclude the
2612 2730                   * IP header length. We do the same.
2613 2731                   */
2614 2732                  if (is->is_bsd_compat) {
2615 2733                          ushort_t len;
2616 2734  
2617 2735                          len = ntohs(ipha->ipha_length);
2618 2736                          if (mp->b_datap->db_ref > 1) {
2619 2737                                  /*
2620 2738                                   * Allocate a new IP header so that we can
2621 2739                                   * modify ipha_length.
2622 2740                                   */
2623 2741                                  mblk_t  *mp1;
2624 2742  
2625 2743                                  mp1 = allocb(ip_hdr_length, BPRI_MED);
2626 2744                                  if (mp1 == NULL) {
2627 2745                                          freemsg(mp);
2628 2746                                          BUMP_MIB(&is->is_rawip_mib,
2629 2747                                              rawipInErrors);
2630 2748                                          return;
2631 2749                                  }
2632 2750                                  bcopy(rptr, mp1->b_rptr, ip_hdr_length);
2633 2751                                  mp->b_rptr = rptr + ip_hdr_length;
2634 2752                                  rptr = mp1->b_rptr;
2635 2753                                  ipha = (ipha_t *)rptr;
2636 2754                                  mp1->b_cont = mp;
2637 2755                                  mp1->b_wptr = rptr + ip_hdr_length;
2638 2756                                  mp = mp1;
2639 2757                          }
2640 2758                          len -= ip_hdr_length;
2641 2759                          ipha->ipha_length = htons(len);
2642 2760                  }
2643 2761  
2644 2762                  /*
2645 2763                   * For RAW sockets we do not pass ICMP/IPv4 packets to AF_INET6
2646 2764                   * sockets. This is ensured by icmp_bind and the IP fanout code.
2647 2765                   */
2648 2766                  ASSERT(connp->conn_family == AF_INET);
2649 2767  
2650 2768                  /*
2651 2769                   * This is the inbound data path.  Packets are passed upstream
2652 2770                   * as T_UNITDATA_IND messages with full IPv4 headers still
2653 2771                   * attached.
2654 2772                   */
2655 2773  
2656 2774                  /*
2657 2775                   * Normally only send up the source address.
2658 2776                   * If any ancillary data items are wanted we add those.
2659 2777                   */
2660 2778                  udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
2661 2779                  if (recv_ancillary.crb_all != 0) {
2662 2780                          udi_size += conn_recvancillary_size(connp,
2663 2781                              recv_ancillary, ira, mp, &ipps);
2664 2782                  }
2665 2783  
2666 2784                  /* Allocate a message block for the T_UNITDATA_IND structure. */
2667 2785                  mp1 = allocb(udi_size, BPRI_MED);
2668 2786                  if (mp1 == NULL) {
2669 2787                          freemsg(mp);
2670 2788                          BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2671 2789                          return;
2672 2790                  }
2673 2791                  mp1->b_cont = mp;
2674 2792                  tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2675 2793                  mp1->b_datap->db_type = M_PROTO;
2676 2794                  mp1->b_wptr = (uchar_t *)tudi + udi_size;
2677 2795                  tudi->PRIM_type = T_UNITDATA_IND;
2678 2796                  tudi->SRC_length = sizeof (sin_t);
2679 2797                  tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2680 2798                  sin = (sin_t *)&tudi[1];
2681 2799                  *sin = sin_null;
2682 2800                  sin->sin_family = AF_INET;
2683 2801                  sin->sin_addr.s_addr = ipha->ipha_src;
2684 2802                  *(uint32_t *)&sin->sin_zero[0] = 0;
2685 2803                  *(uint32_t *)&sin->sin_zero[4] = 0;
2686 2804                  tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
2687 2805                      sizeof (sin_t);
2688 2806                  udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
2689 2807                  tudi->OPT_length = udi_size;
2690 2808  
2691 2809                  /*
2692 2810                   * Add options if IP_RECVIF etc is set
2693 2811                   */
2694 2812                  if (udi_size != 0) {
2695 2813                          conn_recvancillary_add(connp, recv_ancillary, ira,
2696 2814                              &ipps, (uchar_t *)&sin[1], udi_size);
2697 2815                  }
2698 2816                  goto deliver;
2699 2817          }
2700 2818  
2701 2819          ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
2702 2820          /*
2703 2821           * IPv6 packets can only be received by applications
2704 2822           * that are prepared to receive IPv6 addresses.
2705 2823           * The IP fanout must ensure this.
2706 2824           */
2707 2825          ASSERT(connp->conn_family == AF_INET6);
2708 2826  
2709 2827          /*
2710 2828           * Handle IPv6 packets. We don't pass up the IP headers with the
2711 2829           * payload for IPv6.
2712 2830           */
2713 2831  
2714 2832          ip6h = (ip6_t *)rptr;
2715 2833          if (recv_ancillary.crb_all != 0) {
2716 2834                  /*
2717 2835                   * Call on ip_find_hdr_v6 which gets individual lengths of
2718 2836                   * extension headers (and pointers to them).
2719 2837                   */
2720 2838                  uint8_t         nexthdr;
2721 2839  
2722 2840                  /* We don't care about the length or next header. */
2723 2841                  (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);
2724 2842  
2725 2843                  /*
2726 2844                   * We do not pass up hop-by-hop options or any other
2727 2845                   * extension header as part of the packet. Applications
2728 2846                   * that want to see them have to specify IPV6_RECV* socket
2729 2847                   * options. And conn_recvancillary_size/add explicitly
2730 2848                   * drops the TX option from IPV6_HOPOPTS as it does for UDP.
2731 2849                   *
2732 2850                   * If we had multilevel ICMP sockets, then we'd want to
2733 2851                   * modify conn_recvancillary_size/add to
2734 2852                   * allow the user to see the label.
2735 2853                   */
2736 2854          }
2737 2855  
2738 2856          /*
2739 2857           * Check a filter for ICMPv6 types if needed.
2740 2858           * Verify raw checksums if needed.
2741 2859           */
2742 2860          mutex_enter(&connp->conn_lock);
2743 2861          if (icmp->icmp_filter != NULL) {
2744 2862                  int type;
2745 2863  
2746 2864                  /* Assumes that IP has done the pullupmsg */
2747 2865                  type = mp->b_rptr[ip_hdr_length];
2748 2866  
2749 2867                  ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
2750 2868                  if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
2751 2869                          mutex_exit(&connp->conn_lock);
2752 2870                          freemsg(mp);
2753 2871                          return;
2754 2872                  }
2755 2873          }
2756 2874          if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2757 2875                  /* Checksum */
2758 2876                  uint16_t        *up;
2759 2877                  uint32_t        sum;
2760 2878                  int             remlen;
2761 2879  
2762 2880                  up = (uint16_t *)&ip6h->ip6_src;
2763 2881  
2764 2882                  remlen = msgdsize(mp) - ip_hdr_length;
2765 2883                  sum = htons(connp->conn_proto + remlen)
2766 2884                      + up[0] + up[1] + up[2] + up[3]
2767 2885                      + up[4] + up[5] + up[6] + up[7]
2768 2886                      + up[8] + up[9] + up[10] + up[11]
2769 2887                      + up[12] + up[13] + up[14] + up[15];
2770 2888                  sum = (sum & 0xffff) + (sum >> 16);
2771 2889                  sum = IP_CSUM(mp, ip_hdr_length, sum);
2772 2890                  if (sum != 0) {
2773 2891                          /* IPv6 RAW checksum failed */
2774 2892                          ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
2775 2893                          mutex_exit(&connp->conn_lock);
2776 2894                          freemsg(mp);
2777 2895                          BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
2778 2896                          return;
2779 2897                  }
2780 2898          }
2781 2899          mutex_exit(&connp->conn_lock);
2782 2900  
2783 2901          udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2784 2902  
2785 2903          if (recv_ancillary.crb_all != 0) {
2786 2904                  udi_size += conn_recvancillary_size(connp,
2787 2905                      recv_ancillary, ira, mp, &ipps);
2788 2906          }
2789 2907  
2790 2908          mp1 = allocb(udi_size, BPRI_MED);
2791 2909          if (mp1 == NULL) {
2792 2910                  freemsg(mp);
2793 2911                  BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2794 2912                  return;
2795 2913          }
2796 2914          mp1->b_cont = mp;
2797 2915          mp1->b_datap->db_type = M_PROTO;
2798 2916          tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2799 2917          mp1->b_wptr = (uchar_t *)tudi + udi_size;
2800 2918          tudi->PRIM_type = T_UNITDATA_IND;
2801 2919          tudi->SRC_length = sizeof (sin6_t);
2802 2920          tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2803 2921          tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2804 2922          udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
2805 2923          tudi->OPT_length = udi_size;
2806 2924          sin6 = (sin6_t *)&tudi[1];
2807 2925          *sin6 = sin6_null;
2808 2926          sin6->sin6_port = 0;
2809 2927          sin6->sin6_family = AF_INET6;
2810 2928  
2811 2929          sin6->sin6_addr = ip6h->ip6_src;
2812 2930          /* No sin6_flowinfo per API */
2813 2931          sin6->sin6_flowinfo = 0;
2814 2932          /* For link-scope pass up scope id */
2815 2933          if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
2816 2934                  sin6->sin6_scope_id = ira->ira_ruifindex;
2817 2935          else
2818 2936                  sin6->sin6_scope_id = 0;
2819 2937          sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
2820 2938              IPCL_ZONEID(connp), is->is_netstack);
2821 2939  
2822 2940          if (udi_size != 0) {
2823 2941                  conn_recvancillary_add(connp, recv_ancillary, ira,
2824 2942                      &ipps, (uchar_t *)&sin6[1], udi_size);
2825 2943          }
2826 2944  
2827 2945          /* Skip all the IPv6 headers per API */
2828 2946          mp->b_rptr += ip_hdr_length;
2829 2947          pkt_len -= ip_hdr_length;
2830 2948  
2831 2949  deliver:
2832 2950          BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
2833 2951          icmp_ulp_recv(connp, mp1, pkt_len);
2834 2952  }
2835 2953  
2836 2954  /*
2837 2955   * Return the rawip MIB counters in mpdata. We don't hold any lock and report
2838 2956   * information that can be changing beneath us.
2839 2957   */
2840 2958  mblk_t *
2841 2959  icmp_snmp_get(queue_t *q, mblk_t *mpctl)
2842 2960  {
2843 2961          mblk_t                  *mpdata;
2844 2962          struct opthdr           *optp;
2845 2963          conn_t                  *connp = Q_TO_CONN(q);
2846 2964          icmp_stack_t            *is = connp->conn_netstack->netstack_icmp;
2847 2965          mblk_t                  *mp2ctl;
2848 2966  
2849 2967          /*
2850 2968           * make a copy of the original message
2851 2969           */
2852 2970          mp2ctl = copymsg(mpctl);
2853 2971  
2854 2972          if (mpctl == NULL ||
2855 2973              (mpdata = mpctl->b_cont) == NULL) {
2856 2974                  freemsg(mpctl);
2857 2975                  freemsg(mp2ctl);
2858 2976                  return (0);
2859 2977          }
2860 2978  
2861 2979          /* fixed length structure for IPv4 and IPv6 counters */
2862 2980          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
2863 2981          optp->level = EXPER_RAWIP;
2864 2982          optp->name = 0;
2865 2983          (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
2866 2984              sizeof (is->is_rawip_mib));
2867 2985          optp->len = msgdsize(mpdata);
2868 2986          qreply(q, mpctl);
2869 2987  
2870 2988          return (mp2ctl);
2871 2989  }
2872 2990  
2873 2991  /*
2874 2992   * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
2875 2993   * TODO:  If this ever actually tries to set anything, it needs to
2876 2994   * do the appropriate locking.
2877 2995   */
2878 2996  /* ARGSUSED */
2879 2997  int
2880 2998  icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
2881 2999      uchar_t *ptr, int len)
2882 3000  {
2883 3001          switch (level) {
2884 3002          case EXPER_RAWIP:
2885 3003                  return (0);
2886 3004          default:
2887 3005                  return (1);
2888 3006          }
2889 3007  }
2890 3008  
2891 3009  /*
2892 3010   * This routine creates a T_UDERROR_IND message and passes it upstream.
2893 3011   * The address and options are copied from the T_UNITDATA_REQ message
2894 3012   * passed in mp.  This message is freed.
2895 3013   */
2896 3014  static void
2897 3015  icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
2898 3016  {
2899 3017          struct T_unitdata_req *tudr;
2900 3018          mblk_t  *mp1;
2901 3019          uchar_t *destaddr;
2902 3020          t_scalar_t destlen;
2903 3021          uchar_t *optaddr;
2904 3022          t_scalar_t optlen;
2905 3023  
2906 3024          if ((mp->b_wptr < mp->b_rptr) ||
2907 3025              (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
2908 3026                  goto done;
2909 3027          }
2910 3028          tudr = (struct T_unitdata_req *)mp->b_rptr;
2911 3029          destaddr = mp->b_rptr + tudr->DEST_offset;
2912 3030          if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
2913 3031              destaddr + tudr->DEST_length < mp->b_rptr ||
2914 3032              destaddr + tudr->DEST_length > mp->b_wptr) {
2915 3033                  goto done;
2916 3034          }
2917 3035          optaddr = mp->b_rptr + tudr->OPT_offset;
2918 3036          if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
2919 3037              optaddr + tudr->OPT_length < mp->b_rptr ||
2920 3038              optaddr + tudr->OPT_length > mp->b_wptr) {
2921 3039                  goto done;
2922 3040          }
2923 3041          destlen = tudr->DEST_length;
2924 3042          optlen = tudr->OPT_length;
2925 3043  
2926 3044          mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
2927 3045              (char *)optaddr, optlen, err);
2928 3046          if (mp1 != NULL)
2929 3047                  qreply(q, mp1);
2930 3048  
2931 3049  done:
2932 3050          freemsg(mp);
2933 3051  }
2934 3052  
2935 3053  static int
2936 3054  rawip_do_unbind(conn_t *connp)
2937 3055  {
2938 3056          icmp_t  *icmp = connp->conn_icmp;
2939 3057  
2940 3058          mutex_enter(&connp->conn_lock);
2941 3059          /* If a bind has not been done, we can't unbind. */
2942 3060          if (icmp->icmp_state == TS_UNBND) {
2943 3061                  mutex_exit(&connp->conn_lock);
2944 3062                  return (-TOUTSTATE);
2945 3063          }
2946 3064          connp->conn_saddr_v6 = ipv6_all_zeros;
2947 3065          connp->conn_bound_addr_v6 = ipv6_all_zeros;
2948 3066          connp->conn_laddr_v6 = ipv6_all_zeros;
2949 3067          connp->conn_mcbc_bind = B_FALSE;
2950 3068          connp->conn_lport = 0;
2951 3069          connp->conn_fport = 0;
2952 3070          /* In case we were also connected */
2953 3071          connp->conn_faddr_v6 = ipv6_all_zeros;
2954 3072          connp->conn_v6lastdst = ipv6_all_zeros;
2955 3073  
2956 3074          icmp->icmp_state = TS_UNBND;
2957 3075  
2958 3076          (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
2959 3077              &connp->conn_faddr_v6, connp->conn_flowinfo);
2960 3078          mutex_exit(&connp->conn_lock);
2961 3079  
2962 3080          ip_unbind(connp);
2963 3081          return (0);
2964 3082  }
2965 3083  
2966 3084  /*
2967 3085   * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
2968 3086   * rawip_do_unbind() does the work; on success mp is sent back as a T_OK_ACK.
2969 3087   */
2970 3088  static void
2971 3089  icmp_tpi_unbind(queue_t *q, mblk_t *mp)
2972 3090  {
2973 3091          conn_t  *connp = Q_TO_CONN(q);
2974 3092          int     error;
2975 3093  
2976 3094          ASSERT(mp->b_cont == NULL);
2977 3095          error = rawip_do_unbind(connp);
2978 3096          if (error) {
2979 3097                  if (error < 0) {
2980 3098                          icmp_err_ack(q, mp, -error, 0);
2981 3099                  } else {
2982 3100                          icmp_err_ack(q, mp, 0, error);
2983 3101                  }
2984 3102                  return;
2985 3103          }
2986 3104  
2987 3105          /*
2988 3106           * Convert mp into a T_OK_ACK
2989 3107           */
2990 3108  
2991 3109          mp = mi_tpi_ok_ack_alloc(mp);
2992 3110  
2993 3111          /*
2994 3112           * should not happen in practice... T_OK_ACK is smaller than the
2995 3113           * original message.
2996 3114           */
2997 3115          ASSERT(mp != NULL);
2998 3116          ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
2999 3117          qreply(q, mp);
3000 3118  }
3001 3119  
3002 3120  /*
3003 3121   * Process IPv4 packets that already include an IP header.
3004 3122   * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
3005 3123   * IPPROTO_IGMP).
3006 3124   * In this case we ignore the address and any options in the T_UNITDATA_REQ.
3007 3125   *
3008 3126   * The packet is assumed to have a base (20 byte) IP header followed
3009 3127   * by the upper-layer protocol. We include any IP_OPTIONS including a
3010 3128   * CIPSO label but otherwise preserve the base IP header.
3011 3129   */
3012 3130  static int
3013 3131  icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3014 3132  {
3015 3133          icmp_t          *icmp = connp->conn_icmp;
3016 3134          icmp_stack_t    *is = icmp->icmp_is;
3017 3135          ipha_t          iphas;
3018 3136          ipha_t          *ipha;
3019 3137          int             ip_hdr_length;
3020 3138          int             tp_hdr_len;
3021 3139          ip_xmit_attr_t  *ixa;
3022 3140          ip_pkt_t        *ipp;
3023 3141          in6_addr_t      v6src;
3024 3142          in6_addr_t      v6dst;
3025 3143          in6_addr_t      v6nexthop;
3026 3144          int             error;
3027 3145          boolean_t       do_ipsec;
3028 3146  
3029 3147          /*
3030 3148           * We need an exclusive copy of conn_ixa since the included IP
3031 3149           * header could have any destination.
3032 3150           * That copy has no pointers hence we
3033 3151           * need to set them up once we've parsed the ancillary data.
3034 3152           */
3035 3153          ixa = conn_get_ixa_exclusive(connp);
3036 3154          if (ixa == NULL) {
3037 3155                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3038 3156                  freemsg(mp);
3039 3157                  return (ENOMEM);
3040 3158          }
3041 3159          ASSERT(cr != NULL);
3042 3160          /*
3043 3161           * Caller has a reference on cr; from db_credp or because we
3044 3162           * are running in process context.
3045 3163           */
3046 3164          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3047 3165          ixa->ixa_cred = cr;
3048 3166          ixa->ixa_cpid = pid;
3049 3167          if (is_system_labeled()) {
3050 3168                  /* We need to restart with a label based on the cred */
3051 3169                  ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3052 3170          }
3053 3171  
3054 3172          /* In case previous destination was multicast or multirt */
3055 3173          ip_attr_newdst(ixa);
3056 3174  
3057 3175          /* Get a copy of conn_xmit_ipp since the TX label might change it */
3058 3176          ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3059 3177          if (ipp == NULL) {
3060 3178                  ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3061 3179                  ixa->ixa_cred = connp->conn_cred;       /* Restore */
3062 3180                  ixa->ixa_cpid = connp->conn_cpid;
3063 3181                  ixa_refrele(ixa);
3064 3182                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3065 3183                  freemsg(mp);
3066 3184                  return (ENOMEM);
3067 3185          }
3068 3186          mutex_enter(&connp->conn_lock);
3069 3187          error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3070 3188          mutex_exit(&connp->conn_lock);
3071 3189          if (error != 0) {
3072 3190                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3073 3191                  freemsg(mp);
3074 3192                  goto done;
3075 3193          }
3076 3194  
3077 3195          /* Sanity check length of packet */
3078 3196          ipha = (ipha_t *)mp->b_rptr;
3079 3197  
3080 3198          ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
3081 3199          if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
3082 3200                  if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
3083 3201                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3084 3202                          freemsg(mp);
3085 3203                          goto done;
3086 3204                  }
3087 3205                  ipha = (ipha_t *)mp->b_rptr;
3088 3206          }
3089 3207          ipha->ipha_version_and_hdr_length =
3090 3208              (IP_VERSION<<4) | (ip_hdr_length>>2);
3091 3209  
3092 3210          /*
3093 3211           * We set IXAF_DONTFRAG if the application set DF which makes
3094 3212           * IP not fragment.
3095 3213           */
3096 3214          ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
3097 3215          if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
3098 3216                  ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3099 3217          else
3100 3218                  ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3101 3219  
3102 3220          /* Even for multicast and broadcast we honor the apps ttl */
3103 3221          ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
3104 3222  
3105 3223          /*
3106 3224           * No source verification for non-local addresses
3107 3225           */
3108 3226          if (ipha->ipha_src != INADDR_ANY &&
3109 3227              ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
3110 3228              is->is_netstack->netstack_ip, B_FALSE)
3111 3229              != IPVL_UNICAST_UP) {
3112 3230                  ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3113 3231          }
3114 3232  
3115 3233          if (ipha->ipha_dst == INADDR_ANY)
3116 3234                  ipha->ipha_dst = htonl(INADDR_LOOPBACK);
3117 3235  
3118 3236          IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
3119 3237          IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
3120 3238  
3121 3239          /* Defer IPsec if it might need to look at ICMP type/code */
3122 3240          do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
3123 3241          ixa->ixa_flags |= IXAF_IS_IPV4;
3124 3242  
3125 3243          ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3126 3244          error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
3127 3245              connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3128 3246              (do_ipsec ? IPDF_IPSEC : 0));
3129 3247          switch (error) {
3130 3248          case 0:
3131 3249                  break;
3132 3250          case EADDRNOTAVAIL:
3133 3251                  /*
3134 3252                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3135 3253                   * Don't have the application see that errno
3136 3254                   */
3137 3255                  error = ENETUNREACH;
3138 3256                  goto failed;
3139 3257          case ENETDOWN:
3140 3258                  /*
3141 3259                   * Have !ipif_addr_ready address; drop packet silently
3142 3260                   * until we can get applications to not send until we
3143 3261                   * are ready.
3144 3262                   */
3145 3263                  error = 0;
3146 3264                  goto failed;
3147 3265          case EHOSTUNREACH:
3148 3266          case ENETUNREACH:
3149 3267                  if (ixa->ixa_ire != NULL) {
3150 3268                          /*
3151 3269                           * Let conn_ip_output/ire_send_noroute return
3152 3270                           * the error and send any local ICMP error.
3153 3271                           */
3154 3272                          error = 0;
3155 3273                          break;
3156 3274                  }
3157 3275                  /* FALLTHRU */
3158 3276          default:
3159 3277          failed:
3160 3278                  freemsg(mp);
3161 3279                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3162 3280                  goto done;
3163 3281          }
3164 3282          if (ipha->ipha_src == INADDR_ANY)
3165 3283                  IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
3166 3284  
3167 3285          /*
3168 3286           * We might be going to a different destination than last time,
3169 3287           * thus check that TX allows the communication and compute any
3170 3288           * needed label.
3171 3289           *
3172 3290           * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3173 3291           * don't have to worry about concurrent threads.
3174 3292           */
3175 3293          if (is_system_labeled()) {
3176 3294                  /*
3177 3295                   * Check whether Trusted Solaris policy allows communication
3178 3296                   * with this host, and pretend that the destination is
3179 3297                   * unreachable if not.
3180 3298                   * Compute any needed label and place it in ipp_label_v4/v6.
3181 3299                   *
3182 3300                   * Later conn_build_hdr_template/conn_prepend_hdr takes
3183 3301                   * ipp_label_v4/v6 to form the packet.
3184 3302                   *
3185 3303                   * Tsol note: We have ipp structure local to this thread so
3186 3304                   * no locking is needed.
3187 3305                   */
3188 3306                  error = conn_update_label(connp, ixa, &v6dst, ipp);
3189 3307                  if (error != 0) {
3190 3308                          freemsg(mp);
3191 3309                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3192 3310                          goto done;
3193 3311                  }
3194 3312          }
3195 3313  
3196 3314          /*
3197 3315           * Save away a copy of the IPv4 header the application passed down
3198 3316           * and then prepend an IPv4 header complete with any IP options
3199 3317           * including label.
3200 3318           * We need a struct copy since icmp_prepend_hdr will reuse the available
3201 3319           * space in the mblk.
3202 3320           */
3203 3321          iphas = *ipha;
3204 3322          mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
3205 3323  
3206 3324          mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
3207 3325          if (mp == NULL) {
3208 3326                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3209 3327                  ASSERT(error != 0);
3210 3328                  goto done;
3211 3329          }
3212 3330          if (ixa->ixa_pktlen > IP_MAXPACKET) {
3213 3331                  error = EMSGSIZE;
3214 3332                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3215 3333                  freemsg(mp);
3216 3334                  goto done;
3217 3335          }
3218 3336          /* Restore key parts of the header that the application passed down */
3219 3337          ipha = (ipha_t *)mp->b_rptr;
3220 3338          ipha->ipha_type_of_service = iphas.ipha_type_of_service;
3221 3339          ipha->ipha_ident = iphas.ipha_ident;
3222 3340          ipha->ipha_fragment_offset_and_flags =
3223 3341              iphas.ipha_fragment_offset_and_flags;
3224 3342          ipha->ipha_ttl = iphas.ipha_ttl;
3225 3343          ipha->ipha_protocol = iphas.ipha_protocol;
3226 3344          ipha->ipha_src = iphas.ipha_src;
3227 3345          ipha->ipha_dst = iphas.ipha_dst;
3228 3346  
3229 3347          ixa->ixa_protocol = ipha->ipha_protocol;
3230 3348  
3231 3349          /*
3232 3350           * Make sure that the IP header plus any transport header that is
3233 3351           * checksummed by ip_output is in the first mblk. (ip_output assumes
3234 3352           * that at least the checksum field is in the first mblk.)
3235 3353           */
3236 3354          switch (ipha->ipha_protocol) {
3237 3355          case IPPROTO_UDP:
3238 3356                  tp_hdr_len = 8;
3239 3357                  break;
3240 3358          case IPPROTO_TCP:
3241 3359                  tp_hdr_len = 20;
3242 3360                  break;
3243 3361          default:
3244 3362                  tp_hdr_len = 0;
3245 3363                  break;
3246 3364          }
3247 3365          ip_hdr_length = IPH_HDR_LENGTH(ipha);
3248 3366          if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
3249 3367                  if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
3250 3368                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3251 3369                          if (mp->b_cont == NULL)
3252 3370                                  error = EINVAL;
3253 3371                          else
3254 3372                                  error = ENOMEM;
3255 3373                          freemsg(mp);
3256 3374                          goto done;
3257 3375                  }
3258 3376          }
3259 3377  
3260 3378          if (!do_ipsec) {
3261 3379                  /* Policy might differ for different ICMP type/code */
3262 3380                  if (ixa->ixa_ipsec_policy != NULL) {
3263 3381                          IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3264 3382                          ixa->ixa_ipsec_policy = NULL;
3265 3383                          ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3266 3384                  }
3267 3385                  mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
3268 3386                  if (mp == NULL) {
3269 3387                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3270 3388                          error = EHOSTUNREACH;   /* IPsec policy failure */
3271 3389                          goto done;
3272 3390                  }
3273 3391          }
3274 3392  
3275 3393          /* We're done.  Pass the packet to ip. */
3276 3394          BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3277 3395  
3278 3396          error = conn_ip_output(mp, ixa);
3279 3397          /* No rawipOutErrors if an error since IP increases its error counter */
3280 3398          switch (error) {
3281 3399          case 0:
3282 3400                  break;
3283 3401          case EWOULDBLOCK:
3284 3402                  (void) ixa_check_drain_insert(connp, ixa);
3285 3403                  error = 0;
3286 3404                  break;
3287 3405          case EADDRNOTAVAIL:
3288 3406                  /*
3289 3407                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3290 3408                   * Don't have the application see that errno
3291 3409                   */
3292 3410                  error = ENETUNREACH;
3293 3411                  break;
3294 3412          }
3295 3413  done:
3296 3414          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3297 3415          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3298 3416          ixa->ixa_cpid = connp->conn_cpid;
3299 3417          ixa_refrele(ixa);
3300 3418          ip_pkt_free(ipp);
3301 3419          kmem_free(ipp, sizeof (*ipp));
3302 3420          return (error);
3303 3421  }
3304 3422  
3305 3423  static mblk_t *
3306 3424  icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
3307 3425  {
3308 3426          ipha_t  *ipha = NULL;
3309 3427          ip6_t   *ip6h = NULL;
3310 3428  
3311 3429          if (ixa->ixa_flags & IXAF_IS_IPV4)
3312 3430                  ipha = (ipha_t *)mp->b_rptr;
3313 3431          else
3314 3432                  ip6h = (ip6_t *)mp->b_rptr;
3315 3433  
3316 3434          if (ixa->ixa_ipsec_policy != NULL) {
3317 3435                  IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3318 3436                  ixa->ixa_ipsec_policy = NULL;
3319 3437                  ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3320 3438          }
3321 3439          return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
3322 3440  }
3323 3441  
3324 3442  /*
3325 3443   * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
3326 3444   * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
3327 3445   * the TPI options, otherwise we take them from msg_control.
3328 3446   * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
3329 3447   * Always consumes mp; never consumes tudr_mp.
3330 3448   */
3331 3449  static int
3332 3450  icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
3333 3451      mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
3334 3452  {
3335 3453          icmp_t          *icmp = connp->conn_icmp;
3336 3454          icmp_stack_t    *is = icmp->icmp_is;
3337 3455          int             error;
3338 3456          ip_xmit_attr_t  *ixa;
3339 3457          ip_pkt_t        *ipp;
3340 3458          in6_addr_t      v6src;
3341 3459          in6_addr_t      v6dst;
3342 3460          in6_addr_t      v6nexthop;
3343 3461          in_port_t       dstport;
3344 3462          uint32_t        flowinfo;
3345 3463          int             is_absreq_failure = 0;
3346 3464          conn_opt_arg_t  coas, *coa;
3347 3465  
3348 3466          ASSERT(tudr_mp != NULL || msg != NULL);
3349 3467  
3350 3468          /*
3351 3469           * Get ixa before checking state to handle a disconnect race.
3352 3470           *
3353 3471           * We need an exclusive copy of conn_ixa since the ancillary data
3354 3472           * options might modify it. That copy has no pointers hence we
3355 3473           * need to set them up once we've parsed the ancillary data.
3356 3474           */
3357 3475          ixa = conn_get_ixa_exclusive(connp);
3358 3476          if (ixa == NULL) {
3359 3477                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3360 3478                  freemsg(mp);
3361 3479                  return (ENOMEM);
3362 3480          }
3363 3481          ASSERT(cr != NULL);
3364 3482          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3365 3483          ixa->ixa_cred = cr;
3366 3484          ixa->ixa_cpid = pid;
3367 3485          if (is_system_labeled()) {
3368 3486                  /* We need to restart with a label based on the cred */
3369 3487                  ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3370 3488          }
3371 3489  
3372 3490          /* In case previous destination was multicast or multirt */
3373 3491          ip_attr_newdst(ixa);
3374 3492  
3375 3493          /* Get a copy of conn_xmit_ipp since the options might change it */
3376 3494          ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3377 3495          if (ipp == NULL) {
3378 3496                  ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3379 3497                  ixa->ixa_cred = connp->conn_cred;       /* Restore */
3380 3498                  ixa->ixa_cpid = connp->conn_cpid;
3381 3499                  ixa_refrele(ixa);
3382 3500                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3383 3501                  freemsg(mp);
3384 3502                  return (ENOMEM);
3385 3503          }
3386 3504          mutex_enter(&connp->conn_lock);
3387 3505          error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3388 3506          mutex_exit(&connp->conn_lock);
3389 3507          if (error != 0) {
3390 3508                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3391 3509                  freemsg(mp);
3392 3510                  goto done;
3393 3511          }
3394 3512  
3395 3513          /*
3396 3514           * Parse the options and update ixa and ipp as a result.
3397 3515           */
3398 3516  
3399 3517          coa = &coas;
3400 3518          coa->coa_connp = connp;
3401 3519          coa->coa_ixa = ixa;
3402 3520          coa->coa_ipp = ipp;
3403 3521          coa->coa_ancillary = B_TRUE;
3404 3522          coa->coa_changed = 0;
3405 3523  
3406 3524          if (msg != NULL) {
3407 3525                  error = process_auxiliary_options(connp, msg->msg_control,
3408 3526                      msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
3409 3527          } else {
3410 3528                  struct T_unitdata_req *tudr;
3411 3529  
3412 3530                  tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
3413 3531                  ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
3414 3532                  error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
3415 3533                      &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
3416 3534                      coa, &is_absreq_failure);
3417 3535          }
3418 3536          if (error != 0) {
3419 3537                  /*
3420 3538                   * Note: No special action needed in this
3421 3539                   * module for "is_absreq_failure"
3422 3540                   */
3423 3541                  freemsg(mp);
3424 3542                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3425 3543                  goto done;
3426 3544          }
3427 3545          ASSERT(is_absreq_failure == 0);
3428 3546  
3429 3547          mutex_enter(&connp->conn_lock);
3430 3548          /*
3431 3549           * If laddr is unspecified then we look at sin6_src_id.
3432 3550           * We will give precedence to a source address set with IPV6_PKTINFO
3433 3551           * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
3434 3552           * want ip_attr_connect to select a source (since it can fail) when
3435 3553           * IPV6_PKTINFO is specified.
3436 3554           * If this doesn't result in a source address then we get a source
3437 3555           * from ip_attr_connect() below.
3438 3556           */
3439 3557          v6src = connp->conn_saddr_v6;
3440 3558          if (sin != NULL) {
3441 3559                  IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
3442 3560                  dstport = sin->sin_port;
3443 3561                  flowinfo = 0;
3444 3562                  ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3445 3563                  ixa->ixa_flags |= IXAF_IS_IPV4;
3446 3564          } else if (sin6 != NULL) {
3447 3565                  boolean_t v4mapped;
3448 3566                  uint_t srcid;
3449 3567  
3450 3568                  v6dst = sin6->sin6_addr;
3451 3569                  dstport = sin6->sin6_port;
3452 3570                  flowinfo = sin6->sin6_flowinfo;
3453 3571                  srcid = sin6->__sin6_src_id;
3454 3572                  if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
3455 3573                          ixa->ixa_scopeid = sin6->sin6_scope_id;
3456 3574                          ixa->ixa_flags |= IXAF_SCOPEID_SET;
3457 3575                  } else {
3458 3576                          ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3459 3577                  }
3460 3578                  v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
3461 3579                  if (v4mapped)
3462 3580                          ixa->ixa_flags |= IXAF_IS_IPV4;
3463 3581                  else
3464 3582                          ixa->ixa_flags &= ~IXAF_IS_IPV4;
3465 3583                  if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
3466 3584                          if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
3467 3585                              v4mapped, connp->conn_netstack)) {
3468 3586                                  /* Mismatched v4mapped/v6 specified by srcid. */
3469 3587                                  mutex_exit(&connp->conn_lock);
3470 3588                                  error = EADDRNOTAVAIL;
3471 3589                                  goto failed;    /* Does freemsg() and mib. */
3472 3590                          }
3473 3591                  }
3474 3592          } else {
3475 3593                  /* Connected case */
3476 3594                  v6dst = connp->conn_faddr_v6;
3477 3595                  flowinfo = connp->conn_flowinfo;
3478 3596          }
3479 3597          mutex_exit(&connp->conn_lock);
3480 3598          /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
3481 3599          if (ipp->ipp_fields & IPPF_ADDR) {
3482 3600                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
3483 3601                          if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3484 3602                                  v6src = ipp->ipp_addr;
3485 3603                  } else {
3486 3604                          if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3487 3605                                  v6src = ipp->ipp_addr;
3488 3606                  }
3489 3607          }
3490 3608          /*
3491 3609           * Allow source not assigned to the system
3492 3610           * only if it is not a local addresses
3493 3611           */
3494 3612          if (!V6_OR_V4_INADDR_ANY(v6src)) {
3495 3613                  ip_laddr_t laddr_type;
3496 3614  
3497 3615                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
3498 3616                          ipaddr_t v4src;
3499 3617  
3500 3618                          IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
3501 3619                          laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid,
3502 3620                              is->is_netstack->netstack_ip, B_FALSE);
3503 3621                  } else {
3504 3622                          laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid,
3505 3623                              is->is_netstack->netstack_ip, B_FALSE, B_FALSE);
3506 3624                  }
3507 3625                  if (laddr_type != IPVL_UNICAST_UP)
3508 3626                          ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3509 3627          }
3510 3628  
3511 3629          ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3512 3630          error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
3513 3631              &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
3514 3632  
3515 3633          switch (error) {
3516 3634          case 0:
3517 3635                  break;
3518 3636          case EADDRNOTAVAIL:
3519 3637                  /*
3520 3638                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3521 3639                   * Don't have the application see that errno
3522 3640                   */
3523 3641                  error = ENETUNREACH;
3524 3642                  goto failed;
3525 3643          case ENETDOWN:
3526 3644                  /*
3527 3645                   * Have !ipif_addr_ready address; drop packet silently
3528 3646                   * until we can get applications to not send until we
3529 3647                   * are ready.
3530 3648                   */
3531 3649                  error = 0;
3532 3650                  goto failed;
3533 3651          case EHOSTUNREACH:
3534 3652          case ENETUNREACH:
3535 3653                  if (ixa->ixa_ire != NULL) {
3536 3654                          /*
3537 3655                           * Let conn_ip_output/ire_send_noroute return
3538 3656                           * the error and send any local ICMP error.
3539 3657                           */
3540 3658                          error = 0;
3541 3659                          break;
3542 3660                  }
3543 3661                  /* FALLTHRU */
3544 3662          default:
3545 3663          failed:
3546 3664                  freemsg(mp);
3547 3665                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3548 3666                  goto done;
3549 3667          }
3550 3668  
3551 3669          /*
3552 3670           * We might be going to a different destination than last time,
3553 3671           * thus check that TX allows the communication and compute any
3554 3672           * needed label.
3555 3673           *
3556 3674           * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3557 3675           * don't have to worry about concurrent threads.
3558 3676           */
3559 3677          if (is_system_labeled()) {
3560 3678                  /*
3561 3679                   * Check whether Trusted Solaris policy allows communication
3562 3680                   * with this host, and pretend that the destination is
3563 3681                   * unreachable if not.
3564 3682                   * Compute any needed label and place it in ipp_label_v4/v6.
3565 3683                   *
3566 3684                   * Later conn_build_hdr_template/conn_prepend_hdr takes
3567 3685                   * ipp_label_v4/v6 to form the packet.
3568 3686                   *
3569 3687                   * Tsol note: We have ipp structure local to this thread so
3570 3688                   * no locking is needed.
3571 3689                   */
3572 3690                  error = conn_update_label(connp, ixa, &v6dst, ipp);
3573 3691                  if (error != 0) {
3574 3692                          freemsg(mp);
3575 3693                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3576 3694                          goto done;
3577 3695                  }
3578 3696          }
3579 3697          mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
3580 3698              &error);
3581 3699          if (mp == NULL) {
3582 3700                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3583 3701                  ASSERT(error != 0);
3584 3702                  goto done;
3585 3703          }
3586 3704          if (ixa->ixa_pktlen > IP_MAXPACKET) {
3587 3705                  error = EMSGSIZE;
3588 3706                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3589 3707                  freemsg(mp);
3590 3708                  goto done;
3591 3709          }
3592 3710  
3593 3711          /* Policy might differ for different ICMP type/code */
3594 3712          mp = icmp_output_attach_policy(mp, connp, ixa);
3595 3713          if (mp == NULL) {
3596 3714                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3597 3715                  error = EHOSTUNREACH;   /* IPsec policy failure */
3598 3716                  goto done;
3599 3717          }
3600 3718  
3601 3719          /* We're done.  Pass the packet to ip. */
3602 3720          BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3603 3721  
3604 3722          error = conn_ip_output(mp, ixa);
3605 3723          if (!connp->conn_unspec_src)
3606 3724                  ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
3607 3725          /* No rawipOutErrors if an error since IP increases its error counter */
3608 3726          switch (error) {
3609 3727          case 0:
3610 3728                  break;
3611 3729          case EWOULDBLOCK:
3612 3730                  (void) ixa_check_drain_insert(connp, ixa);
3613 3731                  error = 0;
3614 3732                  break;
3615 3733          case EADDRNOTAVAIL:
3616 3734                  /*
3617 3735                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3618 3736                   * Don't have the application see that errno
3619 3737                   */
3620 3738                  error = ENETUNREACH;
3621 3739                  /* FALLTHRU */
3622 3740          default:
3623 3741                  mutex_enter(&connp->conn_lock);
3624 3742                  /*
3625 3743                   * Clear the source and v6lastdst so we call ip_attr_connect
3626 3744                   * for the next packet and try to pick a better source.
3627 3745                   */
3628 3746                  if (connp->conn_mcbc_bind)
3629 3747                          connp->conn_saddr_v6 = ipv6_all_zeros;
3630 3748                  else
3631 3749                          connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3632 3750                  connp->conn_v6lastdst = ipv6_all_zeros;
3633 3751                  mutex_exit(&connp->conn_lock);
3634 3752                  break;
3635 3753          }
3636 3754  done:
3637 3755          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3638 3756          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3639 3757          ixa->ixa_cpid = connp->conn_cpid;
3640 3758          ixa_refrele(ixa);
3641 3759          ip_pkt_free(ipp);
3642 3760          kmem_free(ipp, sizeof (*ipp));
3643 3761          return (error);
3644 3762  }
3645 3763  
3646 3764  /*
3647 3765   * Handle sending an M_DATA for a connected socket.
3648 3766   * Handles both IPv4 and IPv6.
3649 3767   */
3650 3768  int
3651 3769  icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3652 3770  {
3653 3771          icmp_t          *icmp = connp->conn_icmp;
3654 3772          icmp_stack_t    *is = icmp->icmp_is;
3655 3773          int             error;
3656 3774          ip_xmit_attr_t  *ixa;
3657 3775          boolean_t       do_ipsec;
3658 3776  
3659 3777          /*
3660 3778           * If no other thread is using conn_ixa this just gets a reference to
3661 3779           * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3662 3780           */
3663 3781          ixa = conn_get_ixa(connp, B_FALSE);
3664 3782          if (ixa == NULL) {
3665 3783                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3666 3784                  freemsg(mp);
3667 3785                  return (ENOMEM);
3668 3786          }
3669 3787  
3670 3788          ASSERT(cr != NULL);
3671 3789          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3672 3790          ixa->ixa_cred = cr;
3673 3791          ixa->ixa_cpid = pid;
3674 3792  
3675 3793          /* Defer IPsec if it might need to look at ICMP type/code */
3676 3794          switch (ixa->ixa_protocol) {
3677 3795          case IPPROTO_ICMP:
3678 3796          case IPPROTO_ICMPV6:
3679 3797                  do_ipsec = B_FALSE;
3680 3798                  break;
3681 3799          default:
3682 3800                  do_ipsec = B_TRUE;
3683 3801          }
3684 3802  
3685 3803          mutex_enter(&connp->conn_lock);
3686 3804          mp = icmp_prepend_header_template(connp, ixa, mp,
3687 3805              &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
3688 3806  
3689 3807          if (mp == NULL) {
3690 3808                  ASSERT(error != 0);
3691 3809                  mutex_exit(&connp->conn_lock);
3692 3810                  ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3693 3811                  ixa->ixa_cred = connp->conn_cred;       /* Restore */
3694 3812                  ixa->ixa_cpid = connp->conn_cpid;
3695 3813                  ixa_refrele(ixa);
3696 3814                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3697 3815                  freemsg(mp);
3698 3816                  return (error);
3699 3817          }
3700 3818  
3701 3819          if (!do_ipsec) {
3702 3820                  /* Policy might differ for different ICMP type/code */
3703 3821                  mp = icmp_output_attach_policy(mp, connp, ixa);
3704 3822                  if (mp == NULL) {
3705 3823                          mutex_exit(&connp->conn_lock);
3706 3824                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3707 3825                          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3708 3826                          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3709 3827                          ixa->ixa_cpid = connp->conn_cpid;
3710 3828                          ixa_refrele(ixa);
3711 3829                          return (EHOSTUNREACH);  /* IPsec policy failure */
3712 3830                  }
3713 3831          }
3714 3832  
3715 3833          /*
3716 3834           * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3717 3835           * safe copy, then we need to fill in any pointers in it.
3718 3836           */
3719 3837          if (ixa->ixa_ire == NULL) {
3720 3838                  in6_addr_t      faddr, saddr;
3721 3839                  in6_addr_t      nexthop;
3722 3840                  in_port_t       fport;
3723 3841  
3724 3842                  saddr = connp->conn_saddr_v6;
3725 3843                  faddr = connp->conn_faddr_v6;
3726 3844                  fport = connp->conn_fport;
3727 3845                  ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
3728 3846                  mutex_exit(&connp->conn_lock);
3729 3847  
3730 3848                  error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
3731 3849                      fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3732 3850                      (do_ipsec ? IPDF_IPSEC : 0));
3733 3851                  switch (error) {
3734 3852                  case 0:
3735 3853                          break;
3736 3854                  case EADDRNOTAVAIL:
3737 3855                          /*
3738 3856                           * IXAF_VERIFY_SOURCE tells us to pick a better source.
3739 3857                           * Don't have the application see that errno
3740 3858                           */
3741 3859                          error = ENETUNREACH;
3742 3860                          goto failed;
3743 3861                  case ENETDOWN:
3744 3862                          /*
3745 3863                           * Have !ipif_addr_ready address; drop packet silently
3746 3864                           * until we can get applications to not send until we
3747 3865                           * are ready.
3748 3866                           */
3749 3867                          error = 0;
3750 3868                          goto failed;
3751 3869                  case EHOSTUNREACH:
3752 3870                  case ENETUNREACH:
3753 3871                          if (ixa->ixa_ire != NULL) {
3754 3872                                  /*
3755 3873                                   * Let conn_ip_output/ire_send_noroute return
3756 3874                                   * the error and send any local ICMP error.
3757 3875                                   */
3758 3876                                  error = 0;
3759 3877                                  break;
3760 3878                          }
3761 3879                          /* FALLTHRU */
3762 3880                  default:
3763 3881                  failed:
3764 3882                          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3765 3883                          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3766 3884                          ixa->ixa_cpid = connp->conn_cpid;
3767 3885                          ixa_refrele(ixa);
3768 3886                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3769 3887                          freemsg(mp);
3770 3888                          return (error);
3771 3889                  }
3772 3890          } else {
3773 3891                  /* Done with conn_t */
3774 3892                  mutex_exit(&connp->conn_lock);
3775 3893          }
3776 3894  
3777 3895          /* We're done.  Pass the packet to ip. */
3778 3896          BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3779 3897  
3780 3898          error = conn_ip_output(mp, ixa);
3781 3899          /* No rawipOutErrors if an error since IP increases its error counter */
3782 3900          switch (error) {
3783 3901          case 0:
3784 3902                  break;
3785 3903          case EWOULDBLOCK:
3786 3904                  (void) ixa_check_drain_insert(connp, ixa);
3787 3905                  error = 0;
3788 3906                  break;
3789 3907          case EADDRNOTAVAIL:
3790 3908                  /*
3791 3909                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3792 3910                   * Don't have the application see that errno
3793 3911                   */
3794 3912                  error = ENETUNREACH;
3795 3913                  break;
3796 3914          }
3797 3915          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3798 3916          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3799 3917          ixa->ixa_cpid = connp->conn_cpid;
3800 3918          ixa_refrele(ixa);
3801 3919          return (error);
3802 3920  }
3803 3921  
3804 3922  /*
3805 3923   * Handle sending an M_DATA to the last destination.
3806 3924   * Handles both IPv4 and IPv6.
3807 3925   *
3808 3926   * NOTE: The caller must hold conn_lock and we drop it here.
3809 3927   */
3810 3928  int
3811 3929  icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3812 3930      ip_xmit_attr_t *ixa)
3813 3931  {
3814 3932          icmp_t          *icmp = connp->conn_icmp;
3815 3933          icmp_stack_t    *is = icmp->icmp_is;
3816 3934          int             error;
3817 3935          boolean_t       do_ipsec;
3818 3936  
3819 3937          ASSERT(MUTEX_HELD(&connp->conn_lock));
3820 3938          ASSERT(ixa != NULL);
3821 3939  
3822 3940          ASSERT(cr != NULL);
3823 3941          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3824 3942          ixa->ixa_cred = cr;
3825 3943          ixa->ixa_cpid = pid;
3826 3944  
3827 3945          /* Defer IPsec if it might need to look at ICMP type/code */
3828 3946          switch (ixa->ixa_protocol) {
3829 3947          case IPPROTO_ICMP:
3830 3948          case IPPROTO_ICMPV6:
3831 3949                  do_ipsec = B_FALSE;
3832 3950                  break;
3833 3951          default:
3834 3952                  do_ipsec = B_TRUE;
3835 3953          }
3836 3954  
3837 3955  
3838 3956          mp = icmp_prepend_header_template(connp, ixa, mp,
3839 3957              &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
3840 3958  
3841 3959          if (mp == NULL) {
3842 3960                  ASSERT(error != 0);
3843 3961                  mutex_exit(&connp->conn_lock);
3844 3962                  ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3845 3963                  ixa->ixa_cred = connp->conn_cred;       /* Restore */
3846 3964                  ixa->ixa_cpid = connp->conn_cpid;
3847 3965                  ixa_refrele(ixa);
3848 3966                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3849 3967                  freemsg(mp);
3850 3968                  return (error);
3851 3969          }
3852 3970  
3853 3971          if (!do_ipsec) {
3854 3972                  /* Policy might differ for different ICMP type/code */
3855 3973                  mp = icmp_output_attach_policy(mp, connp, ixa);
3856 3974                  if (mp == NULL) {
3857 3975                          mutex_exit(&connp->conn_lock);
3858 3976                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3859 3977                          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3860 3978                          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3861 3979                          ixa->ixa_cpid = connp->conn_cpid;
3862 3980                          ixa_refrele(ixa);
3863 3981                          return (EHOSTUNREACH);  /* IPsec policy failure */
3864 3982                  }
3865 3983          }
3866 3984  
3867 3985          /*
3868 3986           * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3869 3987           * safe copy, then we need to fill in any pointers in it.
3870 3988           */
3871 3989          if (ixa->ixa_ire == NULL) {
3872 3990                  in6_addr_t      lastdst, lastsrc;
3873 3991                  in6_addr_t      nexthop;
3874 3992                  in_port_t       lastport;
3875 3993  
3876 3994                  lastsrc = connp->conn_v6lastsrc;
3877 3995                  lastdst = connp->conn_v6lastdst;
3878 3996                  lastport = connp->conn_lastdstport;
3879 3997                  ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3880 3998                  mutex_exit(&connp->conn_lock);
3881 3999  
3882 4000                  error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
3883 4001                      &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
3884 4002                      IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
3885 4003                  switch (error) {
3886 4004                  case 0:
3887 4005                          break;
3888 4006                  case EADDRNOTAVAIL:
3889 4007                          /*
3890 4008                           * IXAF_VERIFY_SOURCE tells us to pick a better source.
3891 4009                           * Don't have the application see that errno
3892 4010                           */
3893 4011                          error = ENETUNREACH;
3894 4012                          goto failed;
3895 4013                  case ENETDOWN:
3896 4014                          /*
3897 4015                           * Have !ipif_addr_ready address; drop packet silently
3898 4016                           * until we can get applications to not send until we
3899 4017                           * are ready.
3900 4018                           */
3901 4019                          error = 0;
3902 4020                          goto failed;
3903 4021                  case EHOSTUNREACH:
3904 4022                  case ENETUNREACH:
3905 4023                          if (ixa->ixa_ire != NULL) {
3906 4024                                  /*
3907 4025                                   * Let conn_ip_output/ire_send_noroute return
3908 4026                                   * the error and send any local ICMP error.
3909 4027                                   */
3910 4028                                  error = 0;
3911 4029                                  break;
3912 4030                          }
3913 4031                          /* FALLTHRU */
3914 4032                  default:
3915 4033                  failed:
3916 4034                          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3917 4035                          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3918 4036                          ixa->ixa_cpid = connp->conn_cpid;
3919 4037                          ixa_refrele(ixa);
3920 4038                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3921 4039                          freemsg(mp);
3922 4040                          return (error);
3923 4041                  }
3924 4042          } else {
3925 4043                  /* Done with conn_t */
3926 4044                  mutex_exit(&connp->conn_lock);
3927 4045          }
3928 4046  
3929 4047          /* We're done.  Pass the packet to ip. */
3930 4048          BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3931 4049          error = conn_ip_output(mp, ixa);
3932 4050          /* No rawipOutErrors if an error since IP increases its error counter */
3933 4051          switch (error) {
3934 4052          case 0:
3935 4053                  break;
3936 4054          case EWOULDBLOCK:
3937 4055                  (void) ixa_check_drain_insert(connp, ixa);
3938 4056                  error = 0;
3939 4057                  break;
3940 4058          case EADDRNOTAVAIL:
3941 4059                  /*
3942 4060                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
3943 4061                   * Don't have the application see that errno
3944 4062                   */
3945 4063                  error = ENETUNREACH;
3946 4064                  /* FALLTHRU */
3947 4065          default:
3948 4066                  mutex_enter(&connp->conn_lock);
3949 4067                  /*
3950 4068                   * Clear the source and v6lastdst so we call ip_attr_connect
3951 4069                   * for the next packet and try to pick a better source.
3952 4070                   */
3953 4071                  if (connp->conn_mcbc_bind)
3954 4072                          connp->conn_saddr_v6 = ipv6_all_zeros;
3955 4073                  else
3956 4074                          connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3957 4075                  connp->conn_v6lastdst = ipv6_all_zeros;
3958 4076                  mutex_exit(&connp->conn_lock);
3959 4077                  break;
3960 4078          }
3961 4079          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3962 4080          ixa->ixa_cred = connp->conn_cred;       /* Restore */
3963 4081          ixa->ixa_cpid = connp->conn_cpid;
3964 4082          ixa_refrele(ixa);
3965 4083          return (error);
3966 4084  }
3967 4085  
3968 4086  
3969 4087  /*
3970 4088   * Prepend the header template and then fill in the source and
3971 4089   * flowinfo. The caller needs to handle the destination address since
3972 4090   * its setting is different if rthdr or source route.
3973 4091   *
3974 4092   * Returns NULL if allocation failed or if the packet would exceed IP_MAXPACKET.
3975 4093   * In that case mp has been freed and *errorp is set (EMSGSIZE or ENOMEM).
3976 4094   */
3977 4095  static mblk_t *
3978 4096  icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3979 4097      const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
3980 4098  {
3981 4099          icmp_t          *icmp = connp->conn_icmp;
3982 4100          icmp_stack_t    *is = icmp->icmp_is;
3983 4101          uint_t          pktlen;
3984 4102          uint_t          copylen;
3985 4103          uint8_t         *iph;
3986 4104          uint_t          ip_hdr_length;
3987 4105          uint32_t        cksum;
3988 4106          ip_pkt_t        *ipp;
3989 4107  
3990 4108          ASSERT(MUTEX_HELD(&connp->conn_lock));
3991 4109  
3992 4110          /*
3993 4111           * Copy the header template.
3994 4112           */
3995 4113          copylen = connp->conn_ht_iphc_len;
3996 4114          pktlen = copylen + msgdsize(mp);
3997 4115          if (pktlen > IP_MAXPACKET) {
3998 4116                  freemsg(mp);
3999 4117                  *errorp = EMSGSIZE;
4000 4118                  return (NULL);
4001 4119          }
4002 4120          ixa->ixa_pktlen = pktlen;
4003 4121  
4004 4122          /* check/fix buffer config, setup pointers into it */
4005 4123          iph = mp->b_rptr - copylen;
4006 4124          if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
4007 4125                  mblk_t *mp1;
4008 4126  
4009 4127                  mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
4010 4128                  if (mp1 == NULL) {
4011 4129                          freemsg(mp);
4012 4130                          *errorp = ENOMEM;
4013 4131                          return (NULL);
4014 4132                  }
4015 4133                  mp1->b_wptr = DB_LIM(mp1);
4016 4134                  mp1->b_cont = mp;
4017 4135                  mp = mp1;
4018 4136                  iph = (mp->b_wptr - copylen);
4019 4137          }
4020 4138          mp->b_rptr = iph;
4021 4139          bcopy(connp->conn_ht_iphc, iph, copylen);
4022 4140          ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
4023 4141  
4024 4142          ixa->ixa_ip_hdr_length = ip_hdr_length;
4025 4143  
4026 4144          /*
4027 4145           * Prepare for ICMPv6 checksum done in IP.
4028 4146           *
4029 4147           * icmp_build_hdr_template has already massaged any routing header
4030 4148           * and placed the result in conn_sum.
4031 4149           *
4032 4150           * We make it easy for IP to include our pseudo header
4033 4151           * by putting our length (and any routing header adjustment)
4034 4152           * in the ICMPv6 checksum field.
4035 4153           */
4036 4154          cksum = pktlen - ip_hdr_length;
4037 4155  
4038 4156          cksum += connp->conn_sum;
4039 4157          cksum = (cksum >> 16) + (cksum & 0xFFFF);
4040 4158          ASSERT(cksum < 0x10000);
4041 4159  
4042 4160          ipp = &connp->conn_xmit_ipp;
4043 4161          if (ixa->ixa_flags & IXAF_IS_IPV4) {
4044 4162                  ipha_t  *ipha = (ipha_t *)iph;
4045 4163  
4046 4164                  ipha->ipha_length = htons((uint16_t)pktlen);
4047 4165  
4048 4166                  /* if IP_PKTINFO specified an address it wins over bind() */
4049 4167                  if ((ipp->ipp_fields & IPPF_ADDR) &&
4050 4168                      IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4051 4169                          ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
4052 4170                          ipha->ipha_src = ipp->ipp_addr_v4;
4053 4171                  } else {
4054 4172                          IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
4055 4173                  }
4056 4174          } else {
4057 4175                  ip6_t *ip6h = (ip6_t *)iph;
4058 4176                  uint_t  cksum_offset = 0;
4059 4177  
4060 4178                  ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
4061 4179  
4062 4180                  /* if IP_PKTINFO specified an address it wins over bind() */
4063 4181                  if ((ipp->ipp_fields & IPPF_ADDR) &&
4064 4182                      !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4065 4183                          ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
4066 4184                          ip6h->ip6_src = ipp->ipp_addr;
4067 4185                  } else {
4068 4186                          ip6h->ip6_src = *v6src;
4069 4187                  }
4070 4188                  ip6h->ip6_vcf =
4071 4189                      (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4072 4190                      (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4073 4191                  if (ipp->ipp_fields & IPPF_TCLASS) {
4074 4192                          /* Overrides the class part of flowinfo */
4075 4193                          ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4076 4194                              ipp->ipp_tclass);
4077 4195                  }
4078 4196  
4079 4197                  if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
4080 4198                          if (connp->conn_proto == IPPROTO_ICMPV6) {
4081 4199                                  cksum_offset = ixa->ixa_ip_hdr_length +
4082 4200                                      offsetof(icmp6_t, icmp6_cksum);
4083 4201                          } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
4084 4202                                  cksum_offset = ixa->ixa_ip_hdr_length +
4085 4203                                      ixa->ixa_raw_cksum_offset;
4086 4204                          }
4087 4205                  }
4088 4206                  if (cksum_offset != 0) {
4089 4207                          uint16_t *ptr;
4090 4208  
4091 4209                          /* Make sure the checksum fits in the first mblk */
4092 4210                          if (cksum_offset + sizeof (short) > MBLKL(mp)) {
4093 4211                                  mblk_t *mp1;
4094 4212  
4095 4213                                  mp1 = msgpullup(mp,
4096 4214                                      cksum_offset + sizeof (short));
4097 4215                                  freemsg(mp);
4098 4216                                  if (mp1 == NULL) {
4099 4217                                          *errorp = ENOMEM;
4100 4218                                          return (NULL);
4101 4219                                  }
4102 4220                                  mp = mp1;
4103 4221                                  iph = mp->b_rptr;
4104 4222                                  ip6h = (ip6_t *)iph;
4105 4223                          }
4106 4224                          ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
4107 4225                          *ptr = htons(cksum);
4108 4226                  }
4109 4227          }
4110 4228  
4111 4229          return (mp);
4112 4230  }
4113 4231  
4114 4232  /*
4115 4233   * This routine handles all messages passed downstream.  It either
4116 4234   * consumes the message or passes it downstream; it never queues a
4117 4235   * message.
4118 4236   */
4119 4237  void
4120 4238  icmp_wput(queue_t *q, mblk_t *mp)
4121 4239  {
4122 4240          sin6_t          *sin6;
4123 4241          sin_t           *sin = NULL;
4124 4242          uint_t          srcid;
4125 4243          conn_t          *connp = Q_TO_CONN(q);
4126 4244          icmp_t          *icmp = connp->conn_icmp;
4127 4245          int             error = 0;
4128 4246          struct sockaddr *addr = NULL;
4129 4247          socklen_t       addrlen;
4130 4248          icmp_stack_t    *is = icmp->icmp_is;
4131 4249          struct T_unitdata_req *tudr;
4132 4250          mblk_t          *data_mp;
4133 4251          cred_t          *cr;
4134 4252          pid_t           pid;
4135 4253  
4136 4254          /*
4137 4255           * We directly handle several cases here: T_UNITDATA_REQ message
4138 4256           * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
4139 4257           * socket.
4140 4258           */
4141 4259          switch (DB_TYPE(mp)) {
4142 4260          case M_DATA:
4143 4261                  /* sockfs never sends down M_DATA */
4144 4262                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4145 4263                  freemsg(mp);
4146 4264                  return;
4147 4265  
4148 4266          case M_PROTO:
4149 4267          case M_PCPROTO:
4150 4268                  tudr = (struct T_unitdata_req *)mp->b_rptr;
4151 4269                  if (MBLKL(mp) < sizeof (*tudr) ||
4152 4270                      ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
4153 4271                          icmp_wput_other(q, mp);
4154 4272                          return;
4155 4273                  }
4156 4274                  break;
4157 4275  
4158 4276          default:
4159 4277                  icmp_wput_other(q, mp);
4160 4278                  return;
4161 4279          }
4162 4280  
4163 4281          /* Handle valid T_UNITDATA_REQ here */
4164 4282          data_mp = mp->b_cont;
4165 4283          if (data_mp == NULL) {
4166 4284                  error = EPROTO;
4167 4285                  goto ud_error2;
4168 4286          }
4169 4287          mp->b_cont = NULL;
4170 4288  
4171 4289          if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
4172 4290                  error = EADDRNOTAVAIL;
4173 4291                  goto ud_error2;
4174 4292          }
4175 4293  
4176 4294          /*
4177 4295           * All Solaris components should pass a db_credp
4178 4296           * for this message, hence we ASSERT.
4179 4297           * On production kernels we return an error to be robust against
4180 4298           * random streams modules sitting on top of us.
4181 4299           */
4182 4300          cr = msg_getcred(mp, &pid);
4183 4301          ASSERT(cr != NULL);
4184 4302          if (cr == NULL) {
4185 4303                  error = EINVAL;
4186 4304                  goto ud_error2;
4187 4305          }
4188 4306  
4189 4307          /*
4190 4308           * If a port has not been bound to the stream, fail.
4191 4309           * This is not a problem when sockfs is directly
4192 4310           * above us, because it will ensure that the socket
4193 4311           * is first bound before allowing data to be sent.
4194 4312           */
4195 4313          if (icmp->icmp_state == TS_UNBND) {
4196 4314                  error = EPROTO;
4197 4315                  goto ud_error2;
4198 4316          }
4199 4317          addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
4200 4318          addrlen = tudr->DEST_length;
4201 4319  
4202 4320          switch (connp->conn_family) {
4203 4321          case AF_INET6:
4204 4322                  sin6 = (sin6_t *)addr;
4205 4323                  if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
4206 4324                      (sin6->sin6_family != AF_INET6)) {
4207 4325                          error = EADDRNOTAVAIL;
4208 4326                          goto ud_error2;
4209 4327                  }
4210 4328  
4211 4329                  /* No support for mapped addresses on raw sockets */
4212 4330                  if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4213 4331                          error = EADDRNOTAVAIL;
4214 4332                          goto ud_error2;
4215 4333                  }
4216 4334                  srcid = sin6->__sin6_src_id;
4217 4335  
4218 4336                  /*
4219 4337                   * If the local address is a mapped address return
4220 4338                   * an error.
4221 4339                   * It would be possible to send an IPv6 packet but the
4222 4340                   * response would never make it back to the application
4223 4341                   * since it is bound to a mapped address.
4224 4342                   */
4225 4343                  if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
4226 4344                          error = EADDRNOTAVAIL;
4227 4345                          goto ud_error2;
4228 4346                  }
4229 4347  
4230 4348                  if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4231 4349                          sin6->sin6_addr = ipv6_loopback;
4232 4350  
4233 4351                  if (tudr->OPT_length != 0) {
4234 4352                          /*
4235 4353                           * If we are connected then the destination needs to be
4236 4354                           * the same as the connected one.
4237 4355                           */
4238 4356                          if (icmp->icmp_state == TS_DATA_XFER &&
4239 4357                              !conn_same_as_last_v6(connp, sin6)) {
4240 4358                                  error = EISCONN;
4241 4359                                  goto ud_error2;
4242 4360                          }
4243 4361                          error = icmp_output_ancillary(connp, NULL, sin6,
4244 4362                              data_mp, mp, NULL, cr, pid);
4245 4363                  } else {
4246 4364                          ip_xmit_attr_t *ixa;
4247 4365  
4248 4366                          /*
4249 4367                           * We have to allocate an ip_xmit_attr_t before we grab
4250 4368                           * conn_lock and we need to hold conn_lock once we've
4251 4369                           * checked conn_same_as_last_v6 to handle concurrent
4252 4370                           * send* calls on a socket.
4253 4371                           */
4254 4372                          ixa = conn_get_ixa(connp, B_FALSE);
4255 4373                          if (ixa == NULL) {
4256 4374                                  error = ENOMEM;
4257 4375                                  goto ud_error2;
4258 4376                          }
4259 4377                          mutex_enter(&connp->conn_lock);
4260 4378  
4261 4379                          if (conn_same_as_last_v6(connp, sin6) &&
4262 4380                              connp->conn_lastsrcid == srcid &&
4263 4381                              ipsec_outbound_policy_current(ixa)) {
4264 4382                                  /* icmp_output_lastdst drops conn_lock */
4265 4383                                  error = icmp_output_lastdst(connp, data_mp, cr,
4266 4384                                      pid, ixa);
4267 4385                          } else {
4268 4386                                  /* icmp_output_newdst drops conn_lock */
4269 4387                                  error = icmp_output_newdst(connp, data_mp, NULL,
4270 4388                                      sin6, cr, pid, ixa);
4271 4389                          }
4272 4390                          ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4273 4391                  }
4274 4392                  if (error == 0) {
4275 4393                          freeb(mp);
4276 4394                          return;
4277 4395                  }
4278 4396                  break;
4279 4397  
4280 4398          case AF_INET:
4281 4399                  sin = (sin_t *)addr;
4282 4400                  if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
4283 4401                      (sin->sin_family != AF_INET)) {
4284 4402                          error = EADDRNOTAVAIL;
4285 4403                          goto ud_error2;
4286 4404                  }
4287 4405                  if (sin->sin_addr.s_addr == INADDR_ANY)
4288 4406                          sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
4289 4407  
4290 4408                  /* Protocol 255 contains full IP headers */
4291 4409                  /* Read without holding lock */
4292 4410                  if (icmp->icmp_hdrincl) {
4293 4411                          if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
4294 4412                                  if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
4295 4413                                          error = EINVAL;
4296 4414                                          goto ud_error2;
4297 4415                                  }
4298 4416                          }
4299 4417                          error = icmp_output_hdrincl(connp, data_mp, cr, pid);
4300 4418                          if (error == 0) {
4301 4419                                  freeb(mp);
4302 4420                                  return;
4303 4421                          }
4304 4422                          /* data_mp consumed above */
4305 4423                          data_mp = NULL;
4306 4424                          goto ud_error2;
4307 4425                  }
4308 4426  
4309 4427                  if (tudr->OPT_length != 0) {
4310 4428                          /*
4311 4429                           * If we are connected then the destination needs to be
4312 4430                           * the same as the connected one.
4313 4431                           */
4314 4432                          if (icmp->icmp_state == TS_DATA_XFER &&
4315 4433                              !conn_same_as_last_v4(connp, sin)) {
4316 4434                                  error = EISCONN;
4317 4435                                  goto ud_error2;
4318 4436                          }
4319 4437                          error = icmp_output_ancillary(connp, sin, NULL,
4320 4438                              data_mp, mp, NULL, cr, pid);
4321 4439                  } else {
4322 4440                          ip_xmit_attr_t *ixa;
4323 4441  
4324 4442                          /*
4325 4443                           * We have to allocate an ip_xmit_attr_t before we grab
4326 4444                           * conn_lock and we need to hold conn_lock once we've
4327 4445                           * checked conn_same_as_last_v4 to handle concurrent
4328 4446                           * send* calls on a socket.
4329 4447                           */
4330 4448                          ixa = conn_get_ixa(connp, B_FALSE);
4331 4449                          if (ixa == NULL) {
4332 4450                                  error = ENOMEM;
4333 4451                                  goto ud_error2;
4334 4452                          }
4335 4453                          mutex_enter(&connp->conn_lock);
4336 4454  
4337 4455                          if (conn_same_as_last_v4(connp, sin) &&
4338 4456                              ipsec_outbound_policy_current(ixa)) {
4339 4457                                  /* icmp_output_lastdst drops conn_lock */
4340 4458                                  error = icmp_output_lastdst(connp, data_mp, cr,
4341 4459                                      pid, ixa);
4342 4460                          } else {
4343 4461                                  /* icmp_output_newdst drops conn_lock */
4344 4462                                  error = icmp_output_newdst(connp, data_mp, sin,
4345 4463                                      NULL, cr, pid, ixa);
4346 4464                          }
4347 4465                          ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4348 4466                  }
4349 4467                  if (error == 0) {
4350 4468                          freeb(mp);
4351 4469                          return;
4352 4470                  }
4353 4471                  break;
4354 4472          }
4355 4473          ASSERT(mp != NULL);
4356 4474          /* mp is freed by the following routine */
4357 4475          icmp_ud_err(q, mp, (t_scalar_t)error);
4358 4476          return;
4359 4477  
4360 4478  ud_error2:
4361 4479          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4362 4480          freemsg(data_mp);
4363 4481          ASSERT(mp != NULL);
4364 4482          /* mp is freed by the following routine */
4365 4483          icmp_ud_err(q, mp, (t_scalar_t)error);
4366 4484  }
4367 4485  
4368 4486  /*
4369 4487   * Handle the case of the IP address or flow label being different
4370 4488   * for both IPv4 and IPv6.
4371 4489   *
4372 4490   * NOTE: The caller must hold conn_lock and we drop it here.
4373 4491   */
4374 4492  static int
4375 4493  icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
4376 4494      cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
4377 4495  {
4378 4496          icmp_t          *icmp = connp->conn_icmp;
4379 4497          icmp_stack_t    *is = icmp->icmp_is;
4380 4498          int             error;
4381 4499          ip_xmit_attr_t  *oldixa;
4382 4500          boolean_t       do_ipsec;
4383 4501          uint_t          srcid;
4384 4502          uint32_t        flowinfo;
4385 4503          in6_addr_t      v6src;
4386 4504          in6_addr_t      v6dst;
4387 4505          in6_addr_t      v6nexthop;
4388 4506          in_port_t       dstport;
4389 4507  
4390 4508          ASSERT(MUTEX_HELD(&connp->conn_lock));
4391 4509          ASSERT(ixa != NULL);
4392 4510  
4393 4511          /*
4394 4512           * We hold conn_lock across all the use and modifications of
4395 4513           * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
4396 4514           * stay consistent.
4397 4515           */
4398 4516  
4399 4517          ASSERT(cr != NULL);
4400 4518          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4401 4519          ixa->ixa_cred = cr;
4402 4520          ixa->ixa_cpid = pid;
4403 4521          if (is_system_labeled()) {
4404 4522                  /* We need to restart with a label based on the cred */
4405 4523                  ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
4406 4524          }
4407 4525          /*
4408 4526           * If we are connected then the destination needs to be the
4409 4527           * same as the connected one, which is not the case here since we
4410 4528           * checked for that above.
4411 4529           */
4412 4530          if (icmp->icmp_state == TS_DATA_XFER) {
4413 4531                  mutex_exit(&connp->conn_lock);
4414 4532                  error = EISCONN;
4415 4533                  goto ud_error;
4416 4534          }
4417 4535  
4418 4536          /* In case previous destination was multicast or multirt */
4419 4537          ip_attr_newdst(ixa);
4420 4538  
4421 4539          /*
4422 4540           * If laddr is unspecified then we look at sin6_src_id.
4423 4541           * We will give precedence to a source address set with IPV6_PKTINFO
4424 4542           * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
4425 4543           * want ip_attr_connect to select a source (since it can fail) when
4426 4544           * IPV6_PKTINFO is specified.
4427 4545           * If this doesn't result in a source address then we get a source
4428 4546           * from ip_attr_connect() below.
4429 4547           */
4430 4548          v6src = connp->conn_saddr_v6;
4431 4549          if (sin != NULL) {
4432 4550                  IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
4433 4551                  dstport = sin->sin_port;
4434 4552                  flowinfo = 0;
4435 4553                  /* Don't bother with ip_srcid_find_id(), but indicate anyway. */
4436 4554                  srcid = 0;
4437 4555                  ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4438 4556                  ixa->ixa_flags |= IXAF_IS_IPV4;
4439 4557          } else {
4440 4558                  boolean_t v4mapped;
4441 4559  
4442 4560                  v6dst = sin6->sin6_addr;
4443 4561                  dstport = sin6->sin6_port;
4444 4562                  flowinfo = sin6->sin6_flowinfo;
4445 4563                  srcid = sin6->__sin6_src_id;
4446 4564                  if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
4447 4565                          ixa->ixa_scopeid = sin6->sin6_scope_id;
4448 4566                          ixa->ixa_flags |= IXAF_SCOPEID_SET;
4449 4567                  } else {
4450 4568                          ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4451 4569                  }
4452 4570                  v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
4453 4571                  if (v4mapped)
4454 4572                          ixa->ixa_flags |= IXAF_IS_IPV4;
4455 4573                  else
4456 4574                          ixa->ixa_flags &= ~IXAF_IS_IPV4;
4457 4575                  if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
4458 4576                          if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4459 4577                              v4mapped, connp->conn_netstack)) {
4460 4578                                  /* Mismatched v4mapped/v6 specified by srcid. */
4461 4579                                  mutex_exit(&connp->conn_lock);
4462 4580                                  error = EADDRNOTAVAIL;
4463 4581                                  goto ud_error;
4464 4582                          }
4465 4583                  }
4466 4584          }
4467 4585          /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
4468 4586          if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) {
4469 4587                  ip_pkt_t *ipp = &connp->conn_xmit_ipp;
4470 4588  
4471 4589                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
4472 4590                          if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4473 4591                                  v6src = ipp->ipp_addr;
4474 4592                  } else {
4475 4593                          if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4476 4594                                  v6src = ipp->ipp_addr;
4477 4595                  }
4478 4596          }
4479 4597  
4480 4598          /* Defer IPsec if it might need to look at ICMP type/code */
4481 4599          switch (ixa->ixa_protocol) {
4482 4600          case IPPROTO_ICMP:
4483 4601          case IPPROTO_ICMPV6:
4484 4602                  do_ipsec = B_FALSE;
4485 4603                  break;
4486 4604          default:
4487 4605                  do_ipsec = B_TRUE;
4488 4606          }
4489 4607  
4490 4608          ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
4491 4609          mutex_exit(&connp->conn_lock);
4492 4610  
4493 4611          error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
4494 4612              &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
4495 4613              (do_ipsec ? IPDF_IPSEC : 0));
4496 4614          switch (error) {
4497 4615          case 0:
4498 4616                  break;
4499 4617          case EADDRNOTAVAIL:
4500 4618                  /*
4501 4619                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
4502 4620                   * Don't have the application see that errno
4503 4621                   */
4504 4622                  error = ENETUNREACH;
4505 4623                  goto failed;
4506 4624          case ENETDOWN:
4507 4625                  /*
4508 4626                   * Have !ipif_addr_ready address; drop packet silently
4509 4627                   * until we can get applications to not send until we
4510 4628                   * are ready.
4511 4629                   */
4512 4630                  error = 0;
4513 4631                  goto failed;
4514 4632          case EHOSTUNREACH:
4515 4633          case ENETUNREACH:
4516 4634                  if (ixa->ixa_ire != NULL) {
4517 4635                          /*
4518 4636                           * Let conn_ip_output/ire_send_noroute return
4519 4637                           * the error and send any local ICMP error.
4520 4638                           */
4521 4639                          error = 0;
4522 4640                          break;
4523 4641                  }
4524 4642                  /* FALLTHRU */
4525 4643          default:
4526 4644          failed:
4527 4645                  goto ud_error;
4528 4646          }
4529 4647  
4530 4648          mutex_enter(&connp->conn_lock);
4531 4649          /*
4532 4650           * While we dropped the lock some other thread might have connected
4533 4651           * this socket. If so we bail out with EISCONN to ensure that the
4534 4652           * connecting thread is the one that updates conn_ixa, conn_ht_*
4535 4653           * and conn_*last*.
4536 4654           */
4537 4655          if (icmp->icmp_state == TS_DATA_XFER) {
4538 4656                  mutex_exit(&connp->conn_lock);
4539 4657                  error = EISCONN;
4540 4658                  goto ud_error;
4541 4659          }
4542 4660  
4543 4661          /*
4544 4662           * We need to rebuild the headers if
4545 4663           *  - we are labeling packets (could be different for different
4546 4664           *    destinations)
4547 4665           *  - we have a source route (or routing header) since we need to
4548 4666           *    massage that to get the pseudo-header checksum
4549 4667           *  - a socket option with COA_HEADER_CHANGED has been set which
4550 4668           *    set conn_v6lastdst to zero.
4551 4669           *
4552 4670           * Otherwise the prepend function will just update the src, dst,
4553 4671           * and flow label.
4554 4672           */
4555 4673          if (is_system_labeled()) {
4556 4674                  /* TX MLP requires SCM_UCRED and don't have that here */
4557 4675                  if (connp->conn_mlp_type != mlptSingle) {
4558 4676                          mutex_exit(&connp->conn_lock);
4559 4677                          error = ECONNREFUSED;
4560 4678                          goto ud_error;
4561 4679                  }
4562 4680                  /*
4563 4681                   * Check whether Trusted Solaris policy allows communication
4564 4682                   * with this host, and pretend that the destination is
4565 4683                   * unreachable if not.
4566 4684                   * Compute any needed label and place it in ipp_label_v4/v6.
4567 4685                   *
4568 4686                   * Later conn_build_hdr_template/conn_prepend_hdr takes
4569 4687                   * ipp_label_v4/v6 to form the packet.
4570 4688                   *
4571 4689                   * Tsol note: Since we hold conn_lock we know no other
4572 4690                   * thread manipulates conn_xmit_ipp.
4573 4691                   */
4574 4692                  error = conn_update_label(connp, ixa, &v6dst,
4575 4693                      &connp->conn_xmit_ipp);
4576 4694                  if (error != 0) {
4577 4695                          mutex_exit(&connp->conn_lock);
4578 4696                          goto ud_error;
4579 4697                  }
4580 4698                  /* Rebuild the header template */
4581 4699                  error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4582 4700                      flowinfo);
4583 4701                  if (error != 0) {
4584 4702                          mutex_exit(&connp->conn_lock);
4585 4703                          goto ud_error;
4586 4704                  }
4587 4705          } else if (connp->conn_xmit_ipp.ipp_fields &
4588 4706              (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
4589 4707              IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
4590 4708                  /* Rebuild the header template */
4591 4709                  error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4592 4710                      flowinfo);
4593 4711                  if (error != 0) {
4594 4712                          mutex_exit(&connp->conn_lock);
4595 4713                          goto ud_error;
4596 4714                  }
4597 4715          } else {
4598 4716                  /* Simply update the destination address if no source route */
4599 4717                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
4600 4718                          ipha_t  *ipha = (ipha_t *)connp->conn_ht_iphc;
4601 4719  
4602 4720                          IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
4603 4721                          if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
4604 4722                                  ipha->ipha_fragment_offset_and_flags |=
4605 4723                                      IPH_DF_HTONS;
4606 4724                          } else {
4607 4725                                  ipha->ipha_fragment_offset_and_flags &=
4608 4726                                      ~IPH_DF_HTONS;
4609 4727                          }
4610 4728                  } else {
4611 4729                          ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
4612 4730                          ip6h->ip6_dst = v6dst;
4613 4731                  }
4614 4732          }
4615 4733  
4616 4734          /*
4617 4735           * Remember the dst etc which corresponds to the built header
4618 4736           * template and conn_ixa.
4619 4737           */
4620 4738          oldixa = conn_replace_ixa(connp, ixa);
4621 4739          connp->conn_v6lastdst = v6dst;
4622 4740          connp->conn_lastflowinfo = flowinfo;
4623 4741          connp->conn_lastscopeid = ixa->ixa_scopeid;
4624 4742          connp->conn_lastsrcid = srcid;
4625 4743          /* Also remember a source to use together with lastdst */
4626 4744          connp->conn_v6lastsrc = v6src;
4627 4745  
4628 4746          data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
4629 4747              flowinfo, &error);
4630 4748  
4631 4749          /* Done with conn_t */
4632 4750          mutex_exit(&connp->conn_lock);
4633 4751          ixa_refrele(oldixa);
4634 4752  
4635 4753          if (data_mp == NULL) {
4636 4754                  ASSERT(error != 0);
4637 4755                  goto ud_error;
4638 4756          }
4639 4757  
4640 4758          if (!do_ipsec) {
4641 4759                  /* Policy might differ for different ICMP type/code */
4642 4760                  data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
4643 4761                  if (data_mp == NULL) {
4644 4762                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4645 4763                          error = EHOSTUNREACH;   /* IPsec policy failure */
4646 4764                          goto done;
4647 4765                  }
4648 4766          }
4649 4767  
4650 4768          /* We're done.  Pass the packet to ip. */
4651 4769          BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4652 4770  
4653 4771          error = conn_ip_output(data_mp, ixa);
4654 4772          /* No rawipOutErrors if an error since IP increases its error counter */
4655 4773          switch (error) {
4656 4774          case 0:
4657 4775                  break;
4658 4776          case EWOULDBLOCK:
4659 4777                  (void) ixa_check_drain_insert(connp, ixa);
4660 4778                  error = 0;
4661 4779                  break;
4662 4780          case EADDRNOTAVAIL:
4663 4781                  /*
4664 4782                   * IXAF_VERIFY_SOURCE tells us to pick a better source.
4665 4783                   * Don't have the application see that errno
4666 4784                   */
4667 4785                  error = ENETUNREACH;
4668 4786                  /* FALLTHRU */
4669 4787          default:
4670 4788                  mutex_enter(&connp->conn_lock);
4671 4789                  /*
4672 4790                   * Clear the source and v6lastdst so we call ip_attr_connect
4673 4791                   * for the next packet and try to pick a better source.
4674 4792                   */
4675 4793                  if (connp->conn_mcbc_bind)
4676 4794                          connp->conn_saddr_v6 = ipv6_all_zeros;
4677 4795                  else
4678 4796                          connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
4679 4797                  connp->conn_v6lastdst = ipv6_all_zeros;
4680 4798                  mutex_exit(&connp->conn_lock);
4681 4799                  break;
4682 4800          }
4683 4801  done:
4684 4802          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4685 4803          ixa->ixa_cred = connp->conn_cred;       /* Restore */
4686 4804          ixa->ixa_cpid = connp->conn_cpid;
4687 4805          ixa_refrele(ixa);
4688 4806          return (error);
4689 4807  
4690 4808  ud_error:
4691 4809          ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4692 4810          ixa->ixa_cred = connp->conn_cred;       /* Restore */
4693 4811          ixa->ixa_cpid = connp->conn_cpid;
4694 4812          ixa_refrele(ixa);
4695 4813  
4696 4814          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4697 4815          freemsg(data_mp);
4698 4816          return (error);
4699 4817  }
4700 4818  
4701 4819  /* ARGSUSED */
4702 4820  static void
4703 4821  icmp_wput_fallback(queue_t *q, mblk_t *mp)
4704 4822  {
4705 4823  #ifdef DEBUG
4706 4824          cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4707 4825  #endif
4708 4826          freemsg(mp);
4709 4827  }
4710 4828  
4711 4829  static void
4712 4830  icmp_wput_other(queue_t *q, mblk_t *mp)
4713 4831  {
4714 4832          uchar_t *rptr = mp->b_rptr;
4715 4833          struct iocblk *iocp;
4716 4834          conn_t  *connp = Q_TO_CONN(q);
4717 4835          icmp_t  *icmp = connp->conn_icmp;
4718 4836          cred_t *cr;
4719 4837  
4720 4838          switch (mp->b_datap->db_type) {
4721 4839          case M_PROTO:
4722 4840          case M_PCPROTO:
4723 4841                  if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4724 4842                          /*
4725 4843                           * If the message does not contain a PRIM_type,
4726 4844                           * throw it away.
4727 4845                           */
4728 4846                          freemsg(mp);
4729 4847                          return;
4730 4848                  }
4731 4849                  switch (((t_primp_t)rptr)->type) {
4732 4850                  case T_ADDR_REQ:
4733 4851                          icmp_addr_req(q, mp);
4734 4852                          return;
4735 4853                  case O_T_BIND_REQ:
4736 4854                  case T_BIND_REQ:
4737 4855                          icmp_tpi_bind(q, mp);
4738 4856                          return;
4739 4857                  case T_CONN_REQ:
4740 4858                          icmp_tpi_connect(q, mp);
4741 4859                          return;
4742 4860                  case T_CAPABILITY_REQ:
4743 4861                          icmp_capability_req(q, mp);
4744 4862                          return;
4745 4863                  case T_INFO_REQ:
4746 4864                          icmp_info_req(q, mp);
4747 4865                          return;
4748 4866                  case T_UNITDATA_REQ:
4749 4867                          /*
4750 4868                           * If a T_UNITDATA_REQ gets here, the address must
4751 4869                           * be bad.  Valid T_UNITDATA_REQs are handled
4752 4870                           * in icmp_wput.
4753 4871                           */
4754 4872                          icmp_ud_err(q, mp, EADDRNOTAVAIL);
4755 4873                          return;
4756 4874                  case T_UNBIND_REQ:
4757 4875                          icmp_tpi_unbind(q, mp);
4758 4876                          return;
4759 4877                  case T_SVR4_OPTMGMT_REQ:
4760 4878                          /*
4761 4879                           * All Solaris components should pass a db_credp
4762 4880                           * for this TPI message, hence we ASSERT.
4763 4881                           * But in case there is some other M_PROTO that looks
4764 4882                           * like a TPI message sent by some other kernel
4765 4883                           * component, we check and return an error.
4766 4884                           */
4767 4885                          cr = msg_getcred(mp, NULL);
4768 4886                          ASSERT(cr != NULL);
4769 4887                          if (cr == NULL) {
4770 4888                                  icmp_err_ack(q, mp, TSYSERR, EINVAL);
4771 4889                                  return;
4772 4890                          }
4773 4891  
4774 4892                          if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
4775 4893                              cr)) {
4776 4894                                  svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
4777 4895                          }
4778 4896                          return;
4779 4897  
4780 4898                  case T_OPTMGMT_REQ:
4781 4899                          /*
4782 4900                           * All Solaris components should pass a db_credp
4783 4901                           * for this TPI message, hence we ASSERT.
4784 4902                           * But in case there is some other M_PROTO that looks
4785 4903                           * like a TPI message sent by some other kernel
4786 4904                           * component, we check and return an error.
4787 4905                           */
4788 4906                          cr = msg_getcred(mp, NULL);
4789 4907                          ASSERT(cr != NULL);
4790 4908                          if (cr == NULL) {
4791 4909                                  icmp_err_ack(q, mp, TSYSERR, EINVAL);
4792 4910                                  return;
4793 4911                          }
4794 4912                          tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
4795 4913                          return;
4796 4914  
4797 4915                  case T_DISCON_REQ:
4798 4916                          icmp_tpi_disconnect(q, mp);
4799 4917                          return;
4800 4918  
4801 4919                  /* The following TPI message is not supported by icmp. */
4802 4920                  case O_T_CONN_RES:
4803 4921                  case T_CONN_RES:
4804 4922                          icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4805 4923                          return;
4806 4924  
4807 4925                  /* The following 3 TPI requests are illegal for icmp. */
4808 4926                  case T_DATA_REQ:
4809 4927                  case T_EXDATA_REQ:
4810 4928                  case T_ORDREL_REQ:
4811 4929                          icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4812 4930                          return;
4813 4931                  default:
4814 4932                          break;
4815 4933                  }
4816 4934                  break;
4817 4935          case M_FLUSH:
4818 4936                  if (*rptr & FLUSHW)
4819 4937                          flushq(q, FLUSHDATA);
4820 4938                  break;
4821 4939          case M_IOCTL:
4822 4940                  iocp = (struct iocblk *)mp->b_rptr;
4823 4941                  switch (iocp->ioc_cmd) {
4824 4942                  case TI_GETPEERNAME:
4825 4943                          if (icmp->icmp_state != TS_DATA_XFER) {
4826 4944                                  /*
4827 4945                                   * If a default destination address has not
4828 4946                                   * been associated with the stream, then we
4829 4947                                   * don't know the peer's name.
4830 4948                                   */
4831 4949                                  iocp->ioc_error = ENOTCONN;
4832 4950                                  iocp->ioc_count = 0;
4833 4951                                  mp->b_datap->db_type = M_IOCACK;
4834 4952                                  qreply(q, mp);
4835 4953                                  return;
4836 4954                          }
4837 4955                          /* FALLTHRU */
4838 4956                  case TI_GETMYNAME:
4839 4957                          /*
4840 4958                           * For TI_GETPEERNAME and TI_GETMYNAME, we first
4841 4959                           * need to copyin the user's strbuf structure.
4842 4960                           * Processing will continue in the M_IOCDATA case
4843 4961                           * below.
4844 4962                           */
4845 4963                          mi_copyin(q, mp, NULL,
4846 4964                              SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4847 4965                          return;
4848 4966                  default:
4849 4967                          break;
4850 4968                  }
4851 4969                  break;
4852 4970          case M_IOCDATA:
4853 4971                  icmp_wput_iocdata(q, mp);
4854 4972                  return;
4855 4973          default:
4856 4974                  /* Unrecognized messages are passed through without change. */
4857 4975                  break;
4858 4976          }
4859 4977          ip_wput_nondata(q, mp);
4860 4978  }
4861 4979  
4862 4980  /*
4863 4981   * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
4864 4982   * messages.
4865 4983   */
4866 4984  static void
4867 4985  icmp_wput_iocdata(queue_t *q, mblk_t *mp)
4868 4986  {
4869 4987          mblk_t          *mp1;
4870 4988          STRUCT_HANDLE(strbuf, sb);
4871 4989          uint_t          addrlen;
4872 4990          conn_t          *connp = Q_TO_CONN(q);
4873 4991          icmp_t          *icmp = connp->conn_icmp;
4874 4992  
4875 4993          /* Make sure it is one of ours. */
4876 4994          switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4877 4995          case TI_GETMYNAME:
4878 4996          case TI_GETPEERNAME:
4879 4997                  break;
4880 4998          default:
4881 4999                  ip_wput_nondata(q, mp);
4882 5000                  return;
4883 5001          }
4884 5002  
4885 5003          switch (mi_copy_state(q, mp, &mp1)) {
4886 5004          case -1:
4887 5005                  return;
4888 5006          case MI_COPY_CASE(MI_COPY_IN, 1):
4889 5007                  break;
4890 5008          case MI_COPY_CASE(MI_COPY_OUT, 1):
4891 5009                  /*
4892 5010                   * The address has been copied out, so now
4893 5011                   * copyout the strbuf.
4894 5012                   */
4895 5013                  mi_copyout(q, mp);
4896 5014                  return;
4897 5015          case MI_COPY_CASE(MI_COPY_OUT, 2):
4898 5016                  /*
4899 5017                   * The address and strbuf have been copied out.
4900 5018                   * We're done, so just acknowledge the original
4901 5019                   * M_IOCTL.
4902 5020                   */
4903 5021                  mi_copy_done(q, mp, 0);
4904 5022                  return;
4905 5023          default:
4906 5024                  /*
4907 5025                   * Something strange has happened, so acknowledge
4908 5026                   * the original M_IOCTL with an EPROTO error.
4909 5027                   */
4910 5028                  mi_copy_done(q, mp, EPROTO);
4911 5029                  return;
4912 5030          }
4913 5031  
4914 5032          /*
4915 5033           * Now we have the strbuf structure for TI_GETMYNAME
4916 5034           * and TI_GETPEERNAME.  Next we copyout the requested
4917 5035           * address and then we'll copyout the strbuf.
4918 5036           */
4919 5037          STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
4920 5038              (void *)mp1->b_rptr);
4921 5039  
4922 5040          if (connp->conn_family == AF_INET)
4923 5041                  addrlen = sizeof (sin_t);
4924 5042          else
4925 5043                  addrlen = sizeof (sin6_t);
4926 5044  
4927 5045          if (STRUCT_FGET(sb, maxlen) < addrlen) {
4928 5046                  mi_copy_done(q, mp, EINVAL);
4929 5047                  return;
4930 5048          }
4931 5049          switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4932 5050          case TI_GETMYNAME:
4933 5051                  break;
4934 5052          case TI_GETPEERNAME:
4935 5053                  if (icmp->icmp_state != TS_DATA_XFER) {
4936 5054                          mi_copy_done(q, mp, ENOTCONN);
4937 5055                          return;
4938 5056                  }
4939 5057                  break;
4940 5058          default:
4941 5059                  mi_copy_done(q, mp, EPROTO);
4942 5060                  return;
4943 5061          }
4944 5062          mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
4945 5063          if (!mp1)
4946 5064                  return;
4947 5065  
4948 5066          STRUCT_FSET(sb, len, addrlen);
4949 5067          switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4950 5068          case TI_GETMYNAME:
4951 5069                  (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
4952 5070                      &addrlen);
4953 5071                  break;
4954 5072          case TI_GETPEERNAME:
4955 5073                  (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
4956 5074                      &addrlen);
4957 5075                  break;
4958 5076          }
4959 5077          mp1->b_wptr += addrlen;
4960 5078          /* Copy out the address */
4961 5079          mi_copyout(q, mp);
4962 5080  }
4963 5081  
4964 5082  void
4965 5083  icmp_ddi_g_init(void)
4966 5084  {
4967 5085          icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
4968 5086              icmp_opt_obj.odb_opt_arr_cnt);
4969 5087  
4970 5088          /*
4971 5089           * We want to be informed each time a stack is created or
4972 5090           * destroyed in the kernel, so we can maintain the
4973 5091           * set of icmp_stack_t's.
4974 5092           */
4975 5093          netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
4976 5094  }
4977 5095  
4978 5096  void
4979 5097  icmp_ddi_g_destroy(void)
4980 5098  {
4981 5099          netstack_unregister(NS_ICMP);
4982 5100  }
4983 5101  
4984 5102  #define INET_NAME       "ip"
4985 5103  
4986 5104  /*
4987 5105   * Initialize the ICMP stack instance.
4988 5106   */
4989 5107  static void *
4990 5108  rawip_stack_init(netstackid_t stackid, netstack_t *ns)
4991 5109  {
4992 5110          icmp_stack_t    *is;
4993 5111          int             error = 0;
4994 5112          size_t          arrsz;
4995 5113          major_t         major;
4996 5114  
4997 5115          is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
4998 5116          is->is_netstack = ns;
4999 5117  
5000 5118          arrsz = sizeof (icmp_propinfo_tbl);
5001 5119          is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
5002 5120          bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz);
5003 5121  
5004 5122          is->is_ksp = rawip_kstat_init(stackid);
5005 5123  
5006 5124          major = mod_name_to_major(INET_NAME);
5007 5125          error = ldi_ident_from_major(major, &is->is_ldi_ident);
5008 5126          ASSERT(error == 0);
5009 5127          return (is);
5010 5128  }
5011 5129  
5012 5130  /*
5013 5131   * Free the ICMP stack instance.
5014 5132   */
5015 5133  static void
5016 5134  rawip_stack_fini(netstackid_t stackid, void *arg)
5017 5135  {
5018 5136          icmp_stack_t *is = (icmp_stack_t *)arg;
5019 5137  
  
    | 
      ↓ open down ↓ | 
    2409 lines elided | 
    
      ↑ open up ↑ | 
  
5020 5138          kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl));
5021 5139          is->is_propinfo_tbl = NULL;
5022 5140  
5023 5141          rawip_kstat_fini(stackid, is->is_ksp);
5024 5142          is->is_ksp = NULL;
5025 5143          ldi_ident_release(is->is_ldi_ident);
5026 5144          kmem_free(is, sizeof (*is));
5027 5145  }
5028 5146  
5029 5147  static void *
5030      -rawip_kstat_init(netstackid_t stackid) {
     5148 +rawip_kstat_init(netstackid_t stackid)
     5149 +{
5031 5150          kstat_t *ksp;
5032 5151  
5033 5152          rawip_named_kstat_t template = {
5034 5153                  { "inDatagrams",        KSTAT_DATA_UINT32, 0 },
5035 5154                  { "inCksumErrs",        KSTAT_DATA_UINT32, 0 },
5036 5155                  { "inErrors",           KSTAT_DATA_UINT32, 0 },
5037 5156                  { "outDatagrams",       KSTAT_DATA_UINT32, 0 },
5038 5157                  { "outErrors",          KSTAT_DATA_UINT32, 0 },
5039 5158          };
5040 5159  
5041 5160          ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5042      -                                        KSTAT_TYPE_NAMED,
5043      -                                        NUM_OF_FIELDS(rawip_named_kstat_t),
5044      -                                        0, stackid);
     5161 +            KSTAT_TYPE_NAMED, NUM_OF_FIELDS(rawip_named_kstat_t), 0, stackid);
5045 5162          if (ksp == NULL || ksp->ks_data == NULL)
5046 5163                  return (NULL);
5047 5164  
5048 5165          bcopy(&template, ksp->ks_data, sizeof (template));
5049 5166          ksp->ks_update = rawip_kstat_update;
5050 5167          ksp->ks_private = (void *)(uintptr_t)stackid;
5051 5168  
5052 5169          kstat_install(ksp);
5053 5170          return (ksp);
5054 5171  }
5055 5172  
5056 5173  static void
5057 5174  rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5058 5175  {
5059 5176          if (ksp != NULL) {
5060 5177                  ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5061 5178                  kstat_delete_netstack(ksp, stackid);
5062 5179          }
5063 5180  }
5064 5181  
5065 5182  static int
5066 5183  rawip_kstat_update(kstat_t *ksp, int rw)
5067 5184  {
5068 5185          rawip_named_kstat_t *rawipkp;
5069 5186          netstackid_t    stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5070 5187          netstack_t      *ns;
5071 5188          icmp_stack_t    *is;
5072 5189  
5073 5190          if ((ksp == NULL) || (ksp->ks_data == NULL))
5074 5191                  return (EIO);
5075 5192  
5076 5193          if (rw == KSTAT_WRITE)
5077 5194                  return (EACCES);
5078 5195  
5079 5196          rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5080 5197  
5081 5198          ns = netstack_find_by_stackid(stackid);
5082 5199          if (ns == NULL)
5083 5200                  return (-1);
5084 5201          is = ns->netstack_icmp;
5085 5202          if (is == NULL) {
5086 5203                  netstack_rele(ns);
5087 5204                  return (-1);
5088 5205          }
5089 5206          rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5090 5207          rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5091 5208          rawipkp->inErrors.value.ui32 =     is->is_rawip_mib.rawipInErrors;
5092 5209          rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5093 5210          rawipkp->outErrors.value.ui32 =    is->is_rawip_mib.rawipOutErrors;
5094 5211          netstack_rele(ns);
5095 5212          return (0);
5096 5213  }
5097 5214  
5098 5215  /* ARGSUSED */
5099 5216  int
5100 5217  rawip_accept(sock_lower_handle_t lproto_handle,
5101 5218      sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5102 5219      cred_t *cr)
5103 5220  {
5104 5221          return (EOPNOTSUPP);
5105 5222  }
5106 5223  
5107 5224  /* ARGSUSED */
5108 5225  int
5109 5226  rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5110 5227      socklen_t len, cred_t *cr)
5111 5228  {
5112 5229          conn_t  *connp = (conn_t *)proto_handle;
5113 5230          int     error;
5114 5231  
5115 5232          /* All Solaris components should pass a cred for this operation. */
5116 5233          ASSERT(cr != NULL);
5117 5234  
5118 5235          /* Binding to a NULL address really means unbind */
5119 5236          if (sa == NULL)
5120 5237                  error = rawip_do_unbind(connp);
5121 5238          else
5122 5239                  error = rawip_do_bind(connp, sa, len);
5123 5240  
5124 5241          if (error < 0) {
5125 5242                  if (error == -TOUTSTATE)
5126 5243                          error = EINVAL;
5127 5244                  else
5128 5245                          error = proto_tlitosyserr(-error);
5129 5246          }
5130 5247          return (error);
5131 5248  }
5132 5249  
5133 5250  static int
5134 5251  rawip_implicit_bind(conn_t *connp)
5135 5252  {
5136 5253          sin6_t sin6addr;
5137 5254          sin_t *sin;
5138 5255          sin6_t *sin6;
5139 5256          socklen_t len;
5140 5257          int error;
5141 5258  
5142 5259          if (connp->conn_family == AF_INET) {
5143 5260                  len = sizeof (struct sockaddr_in);
5144 5261                  sin = (sin_t *)&sin6addr;
5145 5262                  *sin = sin_null;
5146 5263                  sin->sin_family = AF_INET;
5147 5264                  sin->sin_addr.s_addr = INADDR_ANY;
5148 5265          } else {
5149 5266                  ASSERT(connp->conn_family == AF_INET6);
5150 5267                  len = sizeof (sin6_t);
5151 5268                  sin6 = (sin6_t *)&sin6addr;
5152 5269                  *sin6 = sin6_null;
5153 5270                  sin6->sin6_family = AF_INET6;
5154 5271                  V6_SET_ZERO(sin6->sin6_addr);
5155 5272          }
5156 5273  
5157 5274          error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5158 5275  
5159 5276          return ((error < 0) ? proto_tlitosyserr(-error) : error);
5160 5277  }
5161 5278  
5162 5279  static int
5163 5280  rawip_unbind(conn_t *connp)
5164 5281  {
5165 5282          int error;
5166 5283  
5167 5284          error = rawip_do_unbind(connp);
5168 5285          if (error < 0) {
5169 5286                  error = proto_tlitosyserr(-error);
5170 5287          }
5171 5288          return (error);
5172 5289  }
5173 5290  
5174 5291  /* ARGSUSED */
5175 5292  int
5176 5293  rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5177 5294  {
5178 5295          return (EOPNOTSUPP);
5179 5296  }
5180 5297  
5181 5298  int
5182 5299  rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5183 5300      socklen_t len, sock_connid_t *id, cred_t *cr)
5184 5301  {
5185 5302          conn_t  *connp = (conn_t *)proto_handle;
5186 5303          icmp_t *icmp = connp->conn_icmp;
5187 5304          int     error;
5188 5305          boolean_t did_bind = B_FALSE;
5189 5306          pid_t   pid = curproc->p_pid;
5190 5307  
5191 5308          /* All Solaris components should pass a cred for this operation. */
5192 5309          ASSERT(cr != NULL);
5193 5310  
5194 5311          if (sa == NULL) {
5195 5312                  /*
5196 5313                   * Disconnect
5197 5314                   * Make sure we are connected
5198 5315                   */
5199 5316                  if (icmp->icmp_state != TS_DATA_XFER)
5200 5317                          return (EINVAL);
5201 5318  
5202 5319                  error = icmp_disconnect(connp);
5203 5320                  return (error);
5204 5321          }
5205 5322  
5206 5323          error = proto_verify_ip_addr(connp->conn_family, sa, len);
5207 5324          if (error != 0)
5208 5325                  return (error);
5209 5326  
5210 5327          /* do an implicit bind if necessary */
5211 5328          if (icmp->icmp_state == TS_UNBND) {
5212 5329                  error = rawip_implicit_bind(connp);
5213 5330                  /*
5214 5331                   * We could be racing with an actual bind, in which case
5215 5332                   * we would see EPROTO. We cross our fingers and try
5216 5333                   * to connect.
5217 5334                   */
5218 5335                  if (!(error == 0 || error == EPROTO))
5219 5336                          return (error);
5220 5337                  did_bind = B_TRUE;
5221 5338          }
5222 5339  
5223 5340          /*
5224 5341           * set SO_DGRAM_ERRIND
5225 5342           */
5226 5343          connp->conn_dgram_errind = B_TRUE;
5227 5344  
5228 5345          error = rawip_do_connect(connp, sa, len, cr, pid);
5229 5346          if (error != 0 && did_bind) {
5230 5347                  int unbind_err;
5231 5348  
5232 5349                  unbind_err = rawip_unbind(connp);
5233 5350                  ASSERT(unbind_err == 0);
5234 5351          }
5235 5352  
5236 5353          if (error == 0) {
5237 5354                  *id = 0;
5238 5355                  (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
5239 5356                      0, NULL, -1);
5240 5357          } else if (error < 0) {
5241 5358                  error = proto_tlitosyserr(-error);
5242 5359          }
5243 5360          return (error);
5244 5361  }
5245 5362  
           /*
            * Convert a non-STREAMS raw IP socket into a STREAMS-based one that
            * uses the queue pair of `q'.
            *
            * The queue pair and the conn_t are pointed at each other, an
            * M_SETOPTS message carrying the write offset and receive high-water
            * mark is sent upstream, the helper stream is freed, and the
            * capability ack, local address, peer address and socket options are
            * handed to quiesced_cb.  Any message returned by the callback,
            * followed by everything held on the icmp_t fallback queue, is then
            * pushed up the new stream before IPCL_NONSTR is cleared.
            *
            * Always returns 0.
            */
5246 5363  /* ARGSUSED2 */
5247 5364  int
5248 5365  rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
5249 5366      boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
5250 5367      sock_quiesce_arg_t *arg)
5251 5368  {
5252 5369          conn_t  *connp = (conn_t *)proto_handle;
5253 5370          icmp_t  *icmp;
5254 5371          struct T_capability_ack tca;
5255 5372          struct sockaddr_in6 laddr, faddr;
5256 5373          socklen_t laddrlen, faddrlen;
5257 5374          short opts;
5258 5375          struct stroptions *stropt;
5259 5376          mblk_t *mp, *stropt_mp;
5260 5377          int error;
5261 5378  
5262 5379          icmp = connp->conn_icmp;
5263 5380  
5264 5381          stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
5265 5382  
5266 5383          /*
5267 5384           * setup the fallback stream that was allocated
5268 5385           */
                   /*
                    * Save what the two q_ptr fields held on entry; they are
                    * overwritten with connp just below.
                    */
5269 5386          connp->conn_dev = (dev_t)RD(q)->q_ptr;
5270 5387          connp->conn_minor_arena = WR(q)->q_ptr;
5271 5388  
5272 5389          RD(q)->q_ptr = WR(q)->q_ptr = connp;
5273 5390  
5274 5391          WR(q)->q_qinfo = &icmpwinit;
5275 5392  
5276 5393          connp->conn_rq = RD(q);
5277 5394          connp->conn_wq = WR(q);
5278 5395  
5279 5396          /* Notify stream head about options before sending up data */
5280 5397          stropt_mp->b_datap->db_type = M_SETOPTS;
5281 5398          stropt_mp->b_wptr += sizeof (*stropt);
5282 5399          stropt = (struct stroptions *)stropt_mp->b_rptr;
5283 5400          stropt->so_flags = SO_WROFF | SO_HIWAT;
5284 5401          stropt->so_wroff = connp->conn_wroff;
5285 5402          stropt->so_hiwat = connp->conn_rcvbuf;
5286 5403          putnext(RD(q), stropt_mp);
5287 5404  
5288 5405          /*
5289 5406           * free helper stream
5290 5407           */
5291 5408          ip_free_helper_stream(connp);
5292 5409  
5293 5410          /*
5294 5411           * Collect the information needed to sync with the sonode
5295 5412           */
5296 5413          icmp_do_capability_ack(icmp, &tca, TC1_INFO);
5297 5414  
5298 5415          laddrlen = faddrlen = sizeof (sin6_t);
5299 5416          (void) rawip_getsockname((sock_lower_handle_t)connp,
5300 5417              (struct sockaddr *)&laddr, &laddrlen, CRED());
5301 5418          error = rawip_getpeername((sock_lower_handle_t)connp,
5302 5419              (struct sockaddr *)&faddr, &faddrlen, CRED());
                   /*
                    * No peer address is available (rawip_getpeername() fails with
                    * ENOTCONN when the endpoint is not in TS_DATA_XFER state), so
                    * report a zero-length peer address.
                    */
5303 5420          if (error != 0)
5304 5421                  faddrlen = 0;
5305 5422          opts = 0;
5306 5423          if (connp->conn_dgram_errind)
5307 5424                  opts |= SO_DGRAM_ERRIND;
5308 5425          if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
5309 5426                  opts |= SO_DONTROUTE;
5310 5427  
5311 5428          mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
5312 5429              (struct sockaddr *)&laddr, laddrlen,
5313 5430              (struct sockaddr *)&faddr, faddrlen, opts);
5314 5431  
5315 5432          /*
5316 5433           * Attempts to send data up during fallback will result in it being
5317 5434           * queued in icmp_t. Now we push up any queued packets.
5318 5435           */
5319 5436          mutex_enter(&icmp->icmp_recv_lock);
                   /*
                    * A message returned by the callback is placed at the head of
                    * the fallback queue so that it is sent up first.
                    */
5320 5437          if (mp != NULL) {
5321 5438                  mp->b_next = icmp->icmp_fallback_queue_head;
5322 5439                  icmp->icmp_fallback_queue_head = mp;
5323 5440          }
5324 5441          while (icmp->icmp_fallback_queue_head != NULL) {
5325 5442                  mp = icmp->icmp_fallback_queue_head;
5326 5443                  icmp->icmp_fallback_queue_head = mp->b_next;
5327 5444                  mp->b_next = NULL;
                           /* icmp_recv_lock is not held across putnext() */
5328 5445                  mutex_exit(&icmp->icmp_recv_lock);
5329 5446                  putnext(RD(q), mp);
5330 5447                  mutex_enter(&icmp->icmp_recv_lock);
5331 5448          }
                   /* The loop exits only with a NULL head, so this clears the tail */
5332 5449          icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
5333 5450  
5334 5451          /*
5335 5452           * No longer a streams less socket
5336 5453           */
5337 5454          mutex_enter(&connp->conn_lock);
5338 5455          connp->conn_flags &= ~IPCL_NONSTR;
5339 5456          mutex_exit(&connp->conn_lock);
5340 5457  
5341 5458          mutex_exit(&icmp->icmp_recv_lock);
5342 5459  
5343 5460          ASSERT(icmp->icmp_fallback_queue_head == NULL &&
5344 5461              icmp->icmp_fallback_queue_tail == NULL);
5345 5462  
5346 5463          ASSERT(connp->conn_ref >= 1);
5347 5464  
5348 5465          return (0);
5349 5466  }
5350 5467  
           /*
            * Create a raw IP endpoint for a non-STREAMS socket.
            *
            * Only SOCK_RAW sockets in the AF_INET or AF_INET6 family are
            * accepted; anything else sets *errorp to EPROTONOSUPPORT and returns
            * NULL.  Otherwise the conn_t is obtained from rawip_do_open().  On
            * success it is flagged IPCL_NONSTR, CONN_INCIPIENT is cleared, and
            * *sock_downcalls and *smodep are filled in.  On failure NULL is
            * returned and *errorp is expected to have been set by
            * rawip_do_open().
            *
            * The proto argument is not referenced here.
            */
5351 5468  /* ARGSUSED2 */
5352 5469  sock_lower_handle_t
5353 5470  rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
5354 5471      uint_t *smodep, int *errorp, int flags, cred_t *credp)
5355 5472  {
5356 5473          conn_t *connp;
5357 5474  
5358 5475          if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
5359 5476                  *errorp = EPROTONOSUPPORT;
5360 5477                  return (NULL);
5361 5478          }
5362 5479  
5363 5480          connp = rawip_do_open(family, credp, errorp, flags);
5364 5481          if (connp != NULL) {
5365 5482                  connp->conn_flags |= IPCL_NONSTR;
5366 5483  
5367 5484                  mutex_enter(&connp->conn_lock);
5368 5485                  connp->conn_state_flags &= ~CONN_INCIPIENT;
5369 5486                  mutex_exit(&connp->conn_lock);
5370 5487                  *sock_downcalls = &sock_rawip_downcalls;
5371 5488                  *smodep = SM_ATOMIC;
5372 5489          } else {
5373 5490                  ASSERT(*errorp != 0);
5374 5491          }
5375 5492  
5376 5493          return ((sock_lower_handle_t)connp);
5377 5494  }
5378 5495  
           /*
            * Attach the socket layer to a raw IP endpoint.
            *
            * Records the upcall vector and the upper handle in the conn_t,
            * reports the protocol properties (write offset, receive high and
            * low water marks, maximum block size, maximum and minimum packet
            * size) through the su_set_proto_props upcall, and then calls
            * icmp_bind_proto().
            *
            * The flags argument is not referenced; cr is only checked by the
            * ASSERT.
            */
5379 5496  /* ARGSUSED3 */
5380 5497  void
5381 5498  rawip_activate(sock_lower_handle_t proto_handle,
5382 5499      sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
5383 5500      cred_t *cr)
5384 5501  {
5385 5502          conn_t                  *connp = (conn_t *)proto_handle;
5386 5503          struct sock_proto_props sopp;
5387 5504  
5388 5505          /* All Solaris components should pass a cred for this operation. */
5389 5506          ASSERT(cr != NULL);
5390 5507  
5391 5508          connp->conn_upcalls = sock_upcalls;
5392 5509          connp->conn_upper_handle = sock_handle;
5393 5510  
5394 5511          sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
5395 5512              SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
5396 5513          sopp.sopp_wroff = connp->conn_wroff;
5397 5514          sopp.sopp_rxhiwat = connp->conn_rcvbuf;
5398 5515          sopp.sopp_rxlowat = connp->conn_rcvlowat;
5399 5516          sopp.sopp_maxblk = INFPSZ;
5400 5517          sopp.sopp_maxpsz = IP_MAXPACKET;
                   /* A module minimum packet size of 1 is reported as 0 */
5401 5518          sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
5402 5519              icmp_mod_info.mi_minpsz;
5403 5520  
5404 5521          (*connp->conn_upcalls->su_set_proto_props)
5405 5522              (connp->conn_upper_handle, &sopp);
5406 5523  
5407 5524          icmp_bind_proto(connp->conn_icmp);
5408 5525  }
5409 5526  
           /*
            * Return the peer address of a raw IP socket in sa/salenp.
            *
            * Fails with ENOTCONN unless the endpoint is in TS_DATA_XFER state;
            * otherwise returns whatever conn_getpeername() returns.  The state
            * check and the lookup are both done under conn_lock.
            */
5410 5527  /* ARGSUSED3 */
5411 5528  int
5412 5529  rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5413 5530      socklen_t *salenp, cred_t *cr)
5414 5531  {
5415 5532          conn_t  *connp = (conn_t *)proto_handle;
5416 5533          icmp_t  *icmp = connp->conn_icmp;
5417 5534          int     error;
5418 5535  
5419 5536          /* All Solaris components should pass a cred for this operation. */
5420 5537          ASSERT(cr != NULL);
5421 5538  
5422 5539          mutex_enter(&connp->conn_lock);
5423 5540          if (icmp->icmp_state != TS_DATA_XFER)
5424 5541                  error = ENOTCONN;
5425 5542          else
5426 5543                  error = conn_getpeername(connp, sa, salenp);
5427 5544          mutex_exit(&connp->conn_lock);
5428 5545          return (error);
5429 5546  }
5430 5547  
           /*
            * Return the local address of a raw IP socket in sa/salenp by calling
            * conn_getsockname() under conn_lock.  The result of that call is
            * returned unchanged.
            */
5431 5548  /* ARGSUSED3 */
5432 5549  int
5433 5550  rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5434 5551      socklen_t *salenp, cred_t *cr)
5435 5552  {
5436 5553          conn_t  *connp = (conn_t *)proto_handle;
5437 5554          int     error;
5438 5555  
5439 5556          /* All Solaris components should pass a cred for this operation. */
5440 5557          ASSERT(cr != NULL);
5441 5558  
5442 5559          mutex_enter(&connp->conn_lock);
5443 5560          error = conn_getsockname(connp, sa, salenp);
5444 5561          mutex_exit(&connp->conn_lock);
5445 5562          return (error);
5446 5563  }
5447 5564  
           /*
            * Set a socket option on a raw IP socket.
            *
            * The request is first checked by proto_opt_check() against the ICMP
            * option table (icmp_opt_obj).  If that check fails its error is
            * returned, with negative values converted by proto_tlitosyserr().
            * Otherwise the option is applied with icmp_opt_set() and that
            * result, asserted to be non-negative, is returned.
            */
5448 5565  int
5449 5566  rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5450 5567      const void *optvalp, socklen_t optlen, cred_t *cr)
5451 5568  {
5452 5569          conn_t  *connp = (conn_t *)proto_handle;
5453 5570          int error;
5454 5571  
5455 5572          /* All Solaris components should pass a cred for this operation. */
5456 5573          ASSERT(cr != NULL);
5457 5574  
5458 5575          error = proto_opt_check(level, option_name, optlen, NULL,
5459 5576              icmp_opt_obj.odb_opt_des_arr,
5460 5577              icmp_opt_obj.odb_opt_arr_cnt,
5461 5578              B_TRUE, B_FALSE, cr);
5462 5579  
5463 5580          if (error != 0) {
5464 5581                  /*
5465 5582                   * option not recognized
5466 5583                   */
5467 5584                  if (error < 0) {
5468 5585                          error = proto_tlitosyserr(-error);
5469 5586                  }
5470 5587                  return (error);
5471 5588          }
5472 5589  
                   /*
                    * optvalp and &optlen are passed as both the input and the
                    * output value/length arguments.
                    */
5473 5590          error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
5474 5591              option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
5475 5592              (uchar_t *)optvalp, NULL, cr);
5476 5593  
5477 5594          ASSERT(error >= 0);
5478 5595  
5479 5596          return (error);
5480 5597  }
5481 5598  
           /*
            * Get a socket option from a raw IP socket.
            *
            * The request is checked by proto_opt_check() against the ICMP
            * option table, which also yields max_optbuf_len.  If that check
            * fails its error is returned, with negative values converted by
            * proto_tlitosyserr().  The value is fetched by icmp_opt_get() into
            * a temporary buffer of max_optbuf_len bytes; a result of -1 is
            * returned to the caller as EINVAL.  On success at most *optlen
            * bytes are copied to optvalp, the number of bytes copied is stored
            * through optlen, and 0 is returned.
            */
5482 5599  int
5483 5600  rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5484 5601      void *optvalp, socklen_t *optlen, cred_t *cr)
5485 5602  {
5486 5603          int             error;
5487 5604          conn_t          *connp = (conn_t *)proto_handle;
5488 5605          t_uscalar_t     max_optbuf_len;
5489 5606          void            *optvalp_buf;
5490 5607          int             len;
5491 5608  
5492 5609          /* All Solaris components should pass a cred for this operation. */
5493 5610          ASSERT(cr != NULL);
5494 5611  
5495 5612          error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
5496 5613              icmp_opt_obj.odb_opt_des_arr,
5497 5614              icmp_opt_obj.odb_opt_arr_cnt,
5498 5615              B_FALSE, B_TRUE, cr);
5499 5616  
5500 5617          if (error != 0) {
5501 5618                  if (error < 0) {
5502 5619                          error = proto_tlitosyserr(-error);
5503 5620                  }
5504 5621                  return (error);
5505 5622          }
5506 5623  
5507 5624          optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
5508 5625          len = icmp_opt_get(connp, level, option_name, optvalp_buf);
5509 5626          if (len == -1) {
5510 5627                  kmem_free(optvalp_buf, max_optbuf_len);
5511 5628                  return (EINVAL);
5512 5629          }
5513 5630  
5514 5631          /*
5515 5632           * update optlen and copy option value
5516 5633           */
                   /* Copy no more than the caller said its buffer can hold */
5517 5634          t_uscalar_t size = MIN(len, *optlen);
5518 5635  
5519 5636          bcopy(optvalp_buf, optvalp, size);
5520 5637          bcopy(&size, optlen, sizeof (size));
5521 5638  
5522 5639          kmem_free(optvalp_buf, max_optbuf_len);
5523 5640          return (0);
5524 5641  }
5525 5642  
           /*
            * Close a raw IP socket by calling rawip_do_close().  The result of
            * that call is discarded and 0 is always returned.  The flags
            * argument is not referenced.
            */
5526 5643  /* ARGSUSED1 */
5527 5644  int
5528 5645  rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
5529 5646  {
5530 5647          conn_t  *connp = (conn_t *)proto_handle;
5531 5648  
5532 5649          /* All Solaris components should pass a cred for this operation. */
5533 5650          ASSERT(cr != NULL);
5534 5651  
5535 5652          (void) rawip_do_close(connp);
5536 5653          return (0);
5537 5654  }
5538 5655  
           /*
            * Shut down one or both directions of a raw IP socket by notifying
            * the socket layer through the su_opctl upcall.
            *
            * SHUT_RD issues only SOCK_OPCTL_SHUT_RECV, SHUT_WR issues only
            * SOCK_OPCTL_SHUT_SEND, and any other value of `how' issues both.
            * Always returns 0.
            */
5539 5656  /* ARGSUSED2 */
5540 5657  int
5541 5658  rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
5542 5659  {
5543 5660          conn_t  *connp = (conn_t *)proto_handle;
5544 5661  
5545 5662          /* All Solaris components should pass a cred for this operation. */
5546 5663          ASSERT(cr != NULL);
5547 5664  
5548 5665          /* shut down the send side */
5549 5666          if (how != SHUT_RD)
5550 5667                  (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5551 5668                      SOCK_OPCTL_SHUT_SEND, 0);
5552 5669          /* shut down the recv side */
5553 5670          if (how != SHUT_WR)
5554 5671                  (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5555 5672                      SOCK_OPCTL_SHUT_RECV, 0);
5556 5673          return (0);
5557 5674  }
5558 5675  
           /*
            * Clear the receive-side flow control indication for a raw IP socket
            * by resetting conn_flow_cntrld to B_FALSE under icmp_recv_lock.
            */
5559 5676  void
5560 5677  rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
5561 5678  {
5562 5679          conn_t  *connp = (conn_t *)proto_handle;
5563 5680          icmp_t  *icmp = connp->conn_icmp;
5564 5681  
5565 5682          mutex_enter(&icmp->icmp_recv_lock);
5566 5683          connp->conn_flow_cntrld = B_FALSE;
5567 5684          mutex_exit(&icmp->icmp_recv_lock);
5568 5685  }
5569 5686  
           /*
            * Handle an ioctl on a non-STREAMS raw IP socket.
            *
            * A helper stream is created first if the conn_t does not have one
            * yet; failure to create it is returned to the caller.  Note that
            * this happens before cmd is examined, so it also applies to the
            * commands rejected below.
            *
            * _SIOCSOCKFALLBACK, TI_GETPEERNAME and TI_GETMYNAME are rejected
            * with EINVAL.  Every other command is passed to ldi_ioctl() on the
            * helper stream and its result is returned.
            */
5570 5687  int
5571 5688  rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
5572 5689      int mode, int32_t *rvalp, cred_t *cr)
5573 5690  {
5574 5691          conn_t          *connp = (conn_t *)proto_handle;
5575 5692          int             error;
5576 5693  
5577 5694          /* All Solaris components should pass a cred for this operation. */
5578 5695          ASSERT(cr != NULL);
5579 5696  
5580 5697          /*
5581 5698           * If we don't have a helper stream then create one.
5582 5699           * ip_create_helper_stream takes care of locking the conn_t,
5583 5700           * so this check for NULL is just a performance optimization.
5584 5701           */
5585 5702          if (connp->conn_helper_info == NULL) {
5586 5703                  icmp_stack_t *is = connp->conn_icmp->icmp_is;
5587 5704  
5588 5705                  ASSERT(is->is_ldi_ident != NULL);
5589 5706  
5590 5707                  /*
5591 5708                   * Create a helper stream for non-STREAMS socket.
5592 5709                   */
5593 5710                  error = ip_create_helper_stream(connp, is->is_ldi_ident);
5594 5711                  if (error != 0) {
5595 5712                          ip0dbg(("rawip_ioctl: create of IP helper stream "
5596 5713                              "failed %d\n", error));
5597 5714                          return (error);
5598 5715                  }
5599 5716          }
5600 5717  
5601 5718          switch (cmd) {
                   /* These commands are not accepted on a non-STREAMS socket */
5602 5719          case _SIOCSOCKFALLBACK:
5603 5720          case TI_GETPEERNAME:
5604 5721          case TI_GETMYNAME:
5605 5722  #ifdef DEBUG
5606 5723                  cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
5607 5724                      " socket", cmd);
5608 5725  #endif
5609 5726                  error = EINVAL;
5610 5727                  break;
5611 5728          default:
5612 5729                  /*
5613 5730                   * Pass on to IP using helper stream
5614 5731                   */
5615 5732                  error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
5616 5733                      cmd, arg, mode, cr, rvalp);
5617 5734                  break;
5618 5735          }
5619 5736          return (error);
5620 5737  }
5621 5738  
           /*
            * Send downcall for non-STREAMS raw IP sockets.  mp is an M_DATA
            * message; msg supplies the optional destination address
            * (msg_name/msg_namelen) and the ancillary data length
            * (msg_controllen).
            *
            * An unbound endpoint is implicitly bound first.  The message is
            * then dispatched as follows:
            *
            *  - icmp_hdrincl set: icmp_output_hdrincl(), after making sure the
            *    first IP_SIMPLE_HDR_LENGTH bytes are contiguous.
            *  - no destination given: the endpoint must be in TS_DATA_XFER
            *    state (else EDESTADDRREQ); the message goes to
            *    icmp_output_ancillary() when ancillary data is present and to
            *    icmp_output_connected() otherwise.
            *  - destination given: the endpoint must not be in TS_DATA_XFER
            *    state (else EISCONN) and the address must pass
            *    proto_verify_ip_addr(); the message goes to
            *    icmp_output_ancillary(), icmp_output_lastdst() or
            *    icmp_output_newdst().
            *
            * When is_sendto_ignerr is set, the result of the icmp_output_*()
            * routines is discarded and 0 is returned instead.
            *
            * NOTE(review): only the implicit-bind and pullupmsg() failure paths
            * free mp in this function; the other early error returns
            * (EDESTADDRREQ, EISCONN, address verification, EADDRNOTAVAIL,
            * ENOMEM, delayed error) leave it untouched.  Confirm against the
            * socket layer whether the protocol is expected to free mp on
            * failure.
            */
5622 5739  int
5623 5740  rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5624 5741      cred_t *cr)
5625 5742  {
5626 5743          sin6_t          *sin6;
5627 5744          sin_t           *sin = NULL;
5628 5745          uint_t          srcid;
5629 5746          conn_t          *connp = (conn_t *)proto_handle;
5630 5747          icmp_t          *icmp = connp->conn_icmp;
5631 5748          int             error = 0;
5632 5749          icmp_stack_t    *is = icmp->icmp_is;
5633 5750          pid_t           pid = curproc->p_pid;
5634 5751          ip_xmit_attr_t  *ixa;
5635 5752  
5636 5753          ASSERT(DB_TYPE(mp) == M_DATA);
5637 5754  
5638 5755          /* All Solaris components should pass a cred for this operation. */
5639 5756          ASSERT(cr != NULL);
5640 5757  
5641 5758          /* do an implicit bind if necessary */
5642 5759          if (icmp->icmp_state == TS_UNBND) {
5643 5760                  error = rawip_implicit_bind(connp);
5644 5761                  /*
5645 5762                   * We could be racing with an actual bind, in which case
5646 5763                   * we would see EPROTO. We cross our fingers and try
5647 5764                   * to send anyway.
5648 5765                   */
5649 5766                  if (!(error == 0 || error == EPROTO)) {
5650 5767                          freemsg(mp);
5651 5768                          return (error);
5652 5769                  }
5653 5770          }
5654 5771  
5655 5772          /* Protocol 255 contains full IP headers */
5656 5773          /* Read without holding lock */
5657 5774          if (icmp->icmp_hdrincl) {
5658 5775                  ASSERT(connp->conn_ipversion == IPV4_VERSION);
                           /* Make the simple IP header contiguous in the first mblk */
5659 5776                  if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
5660 5777                          if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
5661 5778                                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5662 5779                                  freemsg(mp);
5663 5780                                  return (EINVAL);
5664 5781                          }
5665 5782                  }
5666 5783                  error = icmp_output_hdrincl(connp, mp, cr, pid);
5667 5784                  if (is->is_sendto_ignerr)
5668 5785                          return (0);
5669 5786                  else
5670 5787                          return (error);
5671 5788          }
5672 5789  
5673 5790          /* Connected? */
5674 5791          if (msg->msg_name == NULL) {
5675 5792                  if (icmp->icmp_state != TS_DATA_XFER) {
5676 5793                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5677 5794                          return (EDESTADDRREQ);
5678 5795                  }
5679 5796                  if (msg->msg_controllen != 0) {
5680 5797                          error = icmp_output_ancillary(connp, NULL, NULL, mp,
5681 5798                              NULL, msg, cr, pid);
5682 5799                  } else {
5683 5800                          error = icmp_output_connected(connp, mp, cr, pid);
5684 5801                  }
5685 5802                  if (is->is_sendto_ignerr)
5686 5803                          return (0);
5687 5804                  else
5688 5805                          return (error);
5689 5806          }
                   /* A destination was supplied; that is an error once connected */
5690 5807          if (icmp->icmp_state == TS_DATA_XFER) {
5691 5808                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5692 5809                  return (EISCONN);
5693 5810          }
5694 5811          error = proto_verify_ip_addr(connp->conn_family,
5695 5812              (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5696 5813          if (error != 0) {
5697 5814                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5698 5815                  return (error);
5699 5816          }
5700 5817          switch (connp->conn_family) {
5701 5818          case AF_INET6:
5702 5819                  sin6 = (sin6_t *)msg->msg_name;
5703 5820  
5704 5821                  /* No support for mapped addresses on raw sockets */
5705 5822                  if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5706 5823                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5707 5824                          return (EADDRNOTAVAIL);
5708 5825                  }
5709 5826                  srcid = sin6->__sin6_src_id;
5710 5827  
5711 5828                  /*
5712 5829                   * If the local address is a mapped address return
5713 5830                   * an error.
5714 5831                   * It would be possible to send an IPv6 packet but the
5715 5832                   * response would never make it back to the application
5716 5833                   * since it is bound to a mapped address.
5717 5834                   */
5718 5835                  if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5719 5836                          BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5720 5837                          return (EADDRNOTAVAIL);
5721 5838                  }
5722 5839  
                           /* An unspecified destination is replaced by loopback */
5723 5840                  if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5724 5841                          sin6->sin6_addr = ipv6_loopback;
5725 5842  
5726 5843                  /*
5727 5844                   * We have to allocate an ip_xmit_attr_t before we grab
5728 5845                   * conn_lock and we need to hold conn_lock once we've checked
5729 5846                   * conn_same_as_last_v6 to handle concurrent send* calls on a
5730 5847                   * socket.
5731 5848                   */
5732 5849                  if (msg->msg_controllen == 0) {
5733 5850                          ixa = conn_get_ixa(connp, B_FALSE);
5734 5851                          if (ixa == NULL) {
5735 5852                                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5736 5853                                  return (ENOMEM);
5737 5854                          }
5738 5855                  } else {
5739 5856                          ixa = NULL;
5740 5857                  }
5741 5858                  mutex_enter(&connp->conn_lock);
                           /*
                            * A recorded delayed error is always cleared here, but it
                            * is only returned when it was recorded for this same
                            * destination address and family.
                            */
5742 5859                  if (icmp->icmp_delayed_error != 0) {
5743 5860                          sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
5744 5861  
5745 5862                          error = icmp->icmp_delayed_error;
5746 5863                          icmp->icmp_delayed_error = 0;
5747 5864  
5748 5865                          /* Compare IP address and family */
5749 5866  
5750 5867                          if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
5751 5868                              &sin2->sin6_addr) &&
5752 5869                              sin6->sin6_family == sin2->sin6_family) {
5753 5870                                  mutex_exit(&connp->conn_lock);
5754 5871                                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5755 5872                                  if (ixa != NULL)
5756 5873                                          ixa_refrele(ixa);
5757 5874                                  return (error);
5758 5875                          }
5759 5876                  }
5760 5877                  if (msg->msg_controllen != 0) {
5761 5878                          mutex_exit(&connp->conn_lock);
5762 5879                          ASSERT(ixa == NULL);
5763 5880                          error = icmp_output_ancillary(connp, NULL, sin6, mp,
5764 5881                              NULL, msg, cr, pid);
5765 5882                  } else if (conn_same_as_last_v6(connp, sin6) &&
5766 5883                      connp->conn_lastsrcid == srcid &&
5767 5884                      ipsec_outbound_policy_current(ixa)) {
5768 5885                          /* icmp_output_lastdst drops conn_lock */
5769 5886                          error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5770 5887                  } else {
5771 5888                          /* icmp_output_newdst drops conn_lock */
5772 5889                          error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
5773 5890                              pid, ixa);
5774 5891                  }
5775 5892                  ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5776 5893                  if (is->is_sendto_ignerr)
5777 5894                          return (0);
5778 5895                  else
5779 5896                          return (error);
5780 5897          case AF_INET:
5781 5898                  sin = (sin_t *)msg->msg_name;
5782 5899  
                           /* An INADDR_ANY destination is replaced by loopback */
5783 5900                  if (sin->sin_addr.s_addr == INADDR_ANY)
5784 5901                          sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
5785 5902  
5786 5903                  /*
5787 5904                   * We have to allocate an ip_xmit_attr_t before we grab
5788 5905                   * conn_lock and we need to hold conn_lock once we've checked
5789 5906                   * conn_same_as_last_v4 to handle concurrent send* on a socket.
5790 5907                   */
5791 5908                  if (msg->msg_controllen == 0) {
5792 5909                          ixa = conn_get_ixa(connp, B_FALSE);
5793 5910                          if (ixa == NULL) {
5794 5911                                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5795 5912                                  return (ENOMEM);
5796 5913                          }
5797 5914                  } else {
5798 5915                          ixa = NULL;
5799 5916                  }
5800 5917                  mutex_enter(&connp->conn_lock);
                           /*
                            * A recorded delayed error is always cleared here, but it
                            * is only returned when it was recorded for this same
                            * destination address.
                            */
5801 5918                  if (icmp->icmp_delayed_error != 0) {
5802 5919                          sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
5803 5920  
5804 5921                          error = icmp->icmp_delayed_error;
5805 5922                          icmp->icmp_delayed_error = 0;
5806 5923  
5807 5924                          /* Compare IP address */
5808 5925  
5809 5926                          if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
5810 5927                                  mutex_exit(&connp->conn_lock);
5811 5928                                  BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5812 5929                                  if (ixa != NULL)
5813 5930                                          ixa_refrele(ixa);
5814 5931                                  return (error);
5815 5932                          }
5816 5933                  }
5817 5934  
5818 5935                  if (msg->msg_controllen != 0) {
5819 5936                          mutex_exit(&connp->conn_lock);
5820 5937                          ASSERT(ixa == NULL);
5821 5938                          error = icmp_output_ancillary(connp, sin, NULL, mp,
5822 5939                              NULL, msg, cr, pid);
5823 5940                  } else if (conn_same_as_last_v4(connp, sin) &&
5824 5941                      ipsec_outbound_policy_current(ixa)) {
5825 5942                          /* icmp_output_lastdst drops conn_lock */
5826 5943                          error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5827 5944                  } else {
5828 5945                          /* icmp_output_newdst drops conn_lock */
5829 5946                          error = icmp_output_newdst(connp, mp, sin, NULL, cr,
5830 5947                              pid, ixa);
5831 5948                  }
5832 5949                  ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5833 5950                  if (is->is_sendto_ignerr)
5834 5951                          return (0);
5835 5952                  else
5836 5953                          return (error);
5837 5954          default:
5838 5955                  return (EINVAL);
5839 5956          }
5840 5957  }
5841 5958  
           /*
            * Downcall vector for non-STREAMS raw IP sockets; rawip_create()
            * hands its address back to the socket layer.  The initializer is
            * positional, so each entry must stay in the order of the members of
            * sock_downcalls_t.  The three NULL entries leave the corresponding
            * downcalls unset.
            */
5842 5959  sock_downcalls_t sock_rawip_downcalls = {
5843 5960          rawip_activate,
5844 5961          rawip_accept,
5845 5962          rawip_bind,
5846 5963          rawip_listen,
5847 5964          rawip_connect,
5848 5965          rawip_getpeername,
5849 5966          rawip_getsockname,
5850 5967          rawip_getsockopt,
5851 5968          rawip_setsockopt,
5852 5969          rawip_send,
5853 5970          NULL,
5854 5971          NULL,
5855 5972          NULL,
5856 5973          rawip_shutdown,
5857 5974          rawip_clr_flowctrl,
5858 5975          rawip_ioctl,
5859 5976          rawip_close
5860 5977  };
  
    | 
      ↓ open down ↓ | 
    806 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX