io-lx-public Wdiff usr/src/uts/common/inet/sockmods/sockmod_pfp.c

Print this page

OS-5549 move bpf filter functions into ip module
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/inet/sockmods/sockmod_pfp.c
          +++ new/usr/src/uts/common/inet/sockmods/sockmod_pfp.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each

↓ open down ↓

13 lines elided

↑ open up ↑

  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24      - * Copyright 2015 Joyent, Inc. All rights reserved.
       24 + * Copyright 2016 Joyent, Inc.
  25   25   */
  26   26  
  27   27  #include <sys/types.h>
  28   28  #include <sys/param.h>
  29   29  #include <sys/systm.h>
  30   30  #include <sys/stropts.h>
  31   31  #include <sys/socket.h>
  32   32  #include <sys/socketvar.h>
  33   33  #include <sys/socket_proto.h>
  34   34  #include <sys/sockio.h>

  35   35  #include <sys/strsun.h>
  36   36  #include <sys/kstat.h>
  37   37  #include <sys/modctl.h>
  38   38  #include <sys/policy.h>
  39   39  #include <sys/priv_const.h>
  40   40  #include <sys/tihdr.h>
  41   41  #include <sys/zone.h>
  42   42  #include <sys/time.h>
  43   43  #include <sys/ethernet.h>

↓ open down ↓

9 lines elided

↑ open up ↑

  44   44  #include <sys/llc1.h>
  45   45  #include <fs/sockfs/sockcommon.h>
  46   46  #include <net/if.h>
  47   47  #include <inet/ip_arp.h>
  48   48  
  49   49  #include <sys/dls.h>
  50   50  #include <sys/mac.h>
  51   51  #include <sys/mac_client.h>
  52   52  #include <sys/mac_provider.h>
  53   53  #include <sys/mac_client_priv.h>
       54 +#include <inet/bpf.h>
  54   55  
  55   56  #include <netpacket/packet.h>
  56   57  
  57   58  static void pfp_close(mac_handle_t, mac_client_handle_t);
  58   59  static int pfp_dl_to_arphrd(int);
  59   60  static int pfp_getpacket_sockopt(sock_lower_handle_t, int, void *,
  60   61      socklen_t *);
  61   62  static int pfp_ifreq_getlinkid(intptr_t, struct ifreq *, datalink_id_t *, int);
  62   63  static int pfp_lifreq_getlinkid(intptr_t, struct lifreq *, datalink_id_t *,
  63   64      int);

  64   65  static int pfp_open_index(int, mac_handle_t *, mac_client_handle_t *,
  65   66      cred_t *);
  66   67  static void pfp_packet(void *, mac_resource_handle_t, mblk_t *, boolean_t);
  67   68  static void pfp_release_bpf(struct pfpsock *);
  68   69  static int pfp_set_promisc(struct pfpsock *, mac_client_promisc_type_t);
  69   70  static int pfp_setsocket_sockopt(sock_lower_handle_t, int, const void *,
  70   71      socklen_t);
  71   72  static int pfp_setpacket_sockopt(sock_lower_handle_t, int, const void *,
  72   73      socklen_t);
  73   74  
  74   75  /*
  75   76   * PFP sockfs operations
  76   77   * Most are currently no-ops because they have no meaning for a connectionless
  77   78   * socket.
  78   79   */
  79   80  static void sdpfp_activate(sock_lower_handle_t, sock_upper_handle_t,
  80   81      sock_upcalls_t *, int, struct cred *);
  81   82  static int sdpfp_bind(sock_lower_handle_t, struct sockaddr *, socklen_t,
  82   83      struct cred *);
  83   84  static int sdpfp_close(sock_lower_handle_t, int, struct cred *);
  84   85  static void sdpfp_clr_flowctrl(sock_lower_handle_t);
  85   86  static int sdpfp_getsockopt(sock_lower_handle_t, int, int, void *,
  86   87      socklen_t *, struct cred *);
  87   88  static int sdpfp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
  88   89      struct cred *);
  89   90  static int sdpfp_senduio(sock_lower_handle_t, struct uio *, struct nmsghdr *,
  90   91      struct cred *);
  91   92  static int sdpfp_setsockopt(sock_lower_handle_t, int, int, const void *,
  92   93      socklen_t, struct cred *);
  93   94  
  94   95  static sock_lower_handle_t sockpfp_create(int, int, int, sock_downcalls_t **,
  95   96      uint_t *, int *, int, cred_t *);
  96   97  
  97   98  static int sockpfp_init(void);
  98   99  static void sockpfp_fini(void);
  99  100  
 100  101  static kstat_t *pfp_ksp;
 101  102  static pfp_kstats_t ks_stats;
 102  103  static pfp_kstats_t pfp_kstats = {
 103  104          /*
 104  105           * Each one of these kstats is a different return path in handling
 105  106           * a packet received from the mac layer.
 106  107           */
 107  108          { "recvMacHeaderFail",  KSTAT_DATA_UINT64 },
 108  109          { "recvBadProtocol",    KSTAT_DATA_UINT64 },
 109  110          { "recvAllocbFail",     KSTAT_DATA_UINT64 },
 110  111          { "recvOk",             KSTAT_DATA_UINT64 },
 111  112          { "recvFail",           KSTAT_DATA_UINT64 },
 112  113          { "recvFiltered",       KSTAT_DATA_UINT64 },
 113  114          { "recvFlowControl",    KSTAT_DATA_UINT64 },
 114  115          /*
 115  116           * A global set of counters is maintained to track the behaviour
 116  117           * of the system (kernel & applications) in sending packets.
 117  118           */
 118  119          { "sendUnbound",        KSTAT_DATA_UINT64 },
 119  120          { "sendFailed",         KSTAT_DATA_UINT64 },
 120  121          { "sendTooBig",         KSTAT_DATA_UINT64 },
 121  122          { "sendAllocFail",      KSTAT_DATA_UINT64 },
 122  123          { "sendUiomoveFail",    KSTAT_DATA_UINT64 },
 123  124          { "sendNoMemory",       KSTAT_DATA_UINT64 },
 124  125          { "sendOpenFail",       KSTAT_DATA_UINT64 },
 125  126          { "sendWrongFamily",    KSTAT_DATA_UINT64 },
 126  127          { "sendShortMsg",       KSTAT_DATA_UINT64 },
 127  128          { "sendOk",             KSTAT_DATA_UINT64 }
 128  129  };
 129  130  
 130  131  sock_downcalls_t pfp_downcalls = {
 131  132          sdpfp_activate,
 132  133          sock_accept_notsupp,
 133  134          sdpfp_bind,
 134  135          sock_listen_notsupp,
 135  136          sock_connect_notsupp,
 136  137          sock_getpeername_notsupp,
 137  138          sock_getsockname_notsupp,
 138  139          sdpfp_getsockopt,
 139  140          sdpfp_setsockopt,
 140  141          sock_send_notsupp,
 141  142          sdpfp_senduio,
 142  143          NULL,
 143  144          sock_poll_notsupp,
 144  145          sock_shutdown_notsupp,
 145  146          sdpfp_clr_flowctrl,
 146  147          sdpfp_ioctl,
 147  148          sdpfp_close,
 148  149  };
 149  150  
 150  151  static smod_reg_t sinfo = {
 151  152          SOCKMOD_VERSION,
 152  153          "sockpfp",
 153  154          SOCK_UC_VERSION,
 154  155          SOCK_DC_VERSION,
 155  156          sockpfp_create,
 156  157          NULL
 157  158  };
 158  159  
 159  160  static int accepted_protos[3][2] = {
 160  161          { ETH_P_ALL,    0 },
 161  162          { ETH_P_802_2,  LLC_SNAP_SAP },
 162  163          { ETH_P_803_3,  0 },
 163  164  };
 164  165  
 165  166  /*
 166  167   * This sets an upper bound on the size of the receive buffer for a PF_PACKET
 167  168   * socket. More properly, this should be controlled through ipadm, ala TCP, UDP,
 168  169   * SCTP, etc. Until that's done, this provides a hard cap of 4 MB and allows an
 169  170   * opportunity for it to be changed, should it be needed.
 170  171   */
 171  172  int sockmod_pfp_rcvbuf_max = 1024 * 1024 * 4;
 172  173  
 173  174  /*
 174  175   * Module linkage information for the kernel.
 175  176   */
 176  177  static struct modlsockmod modlsockmod = {
 177  178          &mod_sockmodops, "PF Packet socket module", &sinfo
 178  179  };
 179  180  
 180  181  static struct modlinkage modlinkage = {
 181  182          MODREV_1,
 182  183          &modlsockmod,
 183  184          NULL
 184  185  };
 185  186  
 186  187  int
 187  188  _init(void)
 188  189  {
 189  190          int error;
 190  191  
 191  192          error = sockpfp_init();
 192  193          if (error != 0)
 193  194                  return (error);
 194  195  
 195  196          error = mod_install(&modlinkage);
 196  197          if (error != 0)
 197  198                  sockpfp_fini();
 198  199  
 199  200          return (error);
 200  201  }
 201  202  
 202  203  int
 203  204  _fini(void)
 204  205  {
 205  206          int error;
 206  207  
 207  208          error = mod_remove(&modlinkage);
 208  209          if (error == 0)
 209  210                  sockpfp_fini();
 210  211  
 211  212          return (error);
 212  213  }
 213  214  
 214  215  int
 215  216  _info(struct modinfo *modinfop)
 216  217  {
 217  218          return (mod_info(&modlinkage, modinfop));
 218  219  }
 219  220  
 220  221  /*
 221  222   * sockpfp_init: called as part of the initialisation of the module when
 222  223   * loaded into the kernel.
 223  224   *
 224  225   * Being able to create and record the kstats data in the kernel is not
 225  226   * considered to be vital to the operation of this kernel module, thus
 226  227   * its failure is tolerated.
 227  228   */
 228  229  static int
 229  230  sockpfp_init(void)
 230  231  {
 231  232          (void) memset(&ks_stats, 0, sizeof (ks_stats));
 232  233  
 233  234          (void) memcpy(&ks_stats, &pfp_kstats, sizeof (pfp_kstats));
 234  235  
 235  236          pfp_ksp = kstat_create("pfpacket", 0, "global", "misc",
 236  237              KSTAT_TYPE_NAMED, sizeof (pfp_kstats) / sizeof (kstat_named_t),
 237  238              KSTAT_FLAG_VIRTUAL);
 238  239          if (pfp_ksp != NULL) {
 239  240                  pfp_ksp->ks_data = &ks_stats;
 240  241                  kstat_install(pfp_ksp);
 241  242          }
 242  243  
 243  244          return (0);
 244  245  }
 245  246  
 246  247  /*
 247  248   * sockpfp_fini: called when the operating system wants to unload the
 248  249   * socket module from the kernel.
 249  250   */
 250  251  static void
 251  252  sockpfp_fini(void)
 252  253  {
 253  254          if (pfp_ksp != NULL)
 254  255                  kstat_delete(pfp_ksp);
 255  256  }
 256  257  
 257  258  /*
 258  259   * Due to sockets being created read-write by default, all PF_PACKET sockets
 259  260   * therefore require the NET_RAWACCESS priviliege, even if the socket is only
 260  261   * being used for reading packets from.
 261  262   *
 262  263   * This create function enforces this module only being used with PF_PACKET
 263  264   * sockets and the policy that we support via the config file in sock2path.d:
 264  265   * PF_PACKET sockets must be either SOCK_DGRAM or SOCK_RAW.
 265  266   */
 266  267  /* ARGSUSED */
 267  268  static sock_lower_handle_t
 268  269  sockpfp_create(int family, int type, int proto,
 269  270      sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp,
 270  271      int sflags, cred_t *cred)
 271  272  {
 272  273          struct pfpsock *ps;
 273  274          int kmflags;
 274  275          int newproto;
 275  276          int i;
 276  277  
 277  278          if (secpolicy_net_rawaccess(cred) != 0) {
 278  279                  *errorp = EACCES;
 279  280                  return (NULL);
 280  281          }
 281  282  
 282  283          if (family != AF_PACKET) {
 283  284                  *errorp = EAFNOSUPPORT;
 284  285                  return (NULL);
 285  286          }
 286  287  
 287  288          if ((type != SOCK_RAW) && (type != SOCK_DGRAM)) {
 288  289                  *errorp = ESOCKTNOSUPPORT;
 289  290                  return (NULL);
 290  291          }
 291  292  
 292  293          /*
 293  294           * First check to see if the protocol number passed in via the socket
 294  295           * creation should be mapped to a different number for internal use.
 295  296           */
 296  297          for (i = 0, newproto = -1;
 297  298              i < sizeof (accepted_protos)/ sizeof (accepted_protos[0]); i++) {
 298  299                  if (accepted_protos[i][0] == proto) {
 299  300                          newproto = accepted_protos[i][1];
 300  301                          break;
 301  302                  }
 302  303          }
 303  304  
 304  305          /*
 305  306           * If the mapping of the protocol that was under 0x800 failed to find
 306  307           * a local equivalent then fail the socket creation. If the protocol
 307  308           * for the socket is over 0x800 and it was not found in the mapping
 308  309           * table above, then use the value as is.
 309  310           */
 310  311          if (newproto == -1) {
 311  312                  if (proto < 0x800) {
 312  313                          *errorp = ENOPROTOOPT;
 313  314                          return (NULL);
 314  315                  }
 315  316                  newproto = proto;
 316  317          }
 317  318          proto = newproto;
 318  319  
 319  320          kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
 320  321          ps = kmem_zalloc(sizeof (*ps), kmflags);
 321  322          if (ps == NULL) {
 322  323                  *errorp = ENOMEM;
 323  324                  return (NULL);
 324  325          }
 325  326  
 326  327          ps->ps_type = type;
 327  328          ps->ps_proto = proto;
 328  329          rw_init(&ps->ps_bpflock, NULL, RW_DRIVER, NULL);
 329  330          mutex_init(&ps->ps_lock, NULL, MUTEX_DRIVER, NULL);
 330  331  
 331  332          *sock_downcalls = &pfp_downcalls;
 332  333          /*
 333  334           * Setting this causes bytes from a packet that do not fit into the
 334  335           * destination user buffer to be discarded. Thus the API is one
 335  336           * packet per receive and callers are required to use a buffer large
 336  337           * enough for the biggest packet that the interface can provide.
 337  338           */
 338  339          *smodep = SM_ATOMIC;
 339  340  
 340  341          return ((sock_lower_handle_t)ps);
 341  342  }
 342  343  
 343  344  /* ************************************************************************* */
 344  345  
 345  346  /*
 346  347   * pfp_packet is the callback function that is given to the mac layer for
 347  348   * PF_PACKET to receive packets with. One packet at a time is passed into
 348  349   * this function from the mac layer. Each packet is a private copy given
 349  350   * to PF_PACKET to modify or free as it wishes and does not harm the original
 350  351   * packet from which it was cloned.
 351  352   */
 352  353  /* ARGSUSED */
 353  354  static void
 354  355  pfp_packet(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t flag)
 355  356  {
 356  357          struct T_unitdata_ind *tunit;
 357  358          struct sockaddr_ll *sll;
 358  359          struct sockaddr_ll *sol;
 359  360          mac_header_info_t hdr;
 360  361          struct pfpsock *ps;
 361  362          size_t tusz;
 362  363          mblk_t *mp0;
 363  364          int error;
 364  365  
 365  366          if (mp == NULL)
 366  367                  return;
 367  368  
 368  369          ps = arg;
 369  370          if (ps->ps_flow_ctrld) {
 370  371                  ps->ps_flow_ctrl_drops++;
 371  372                  ps->ps_stats.tp_drops++;
 372  373                  ks_stats.kp_recv_flow_cntrld.value.ui64++;
 373  374                  freemsg(mp);
 374  375                  return;
 375  376          }
 376  377  
 377  378          if (mac_header_info(ps->ps_mh, mp, &hdr) != 0) {
 378  379                  /*
 379  380                   * Can't decode the packet header information so drop it.
 380  381                   */
 381  382                  ps->ps_stats.tp_drops++;
 382  383                  ks_stats.kp_recv_mac_hdr_fail.value.ui64++;
 383  384                  freemsg(mp);
 384  385                  return;
 385  386          }
 386  387  
 387  388          if (mac_type(ps->ps_mh) == DL_ETHER &&
 388  389              hdr.mhi_bindsap == ETHERTYPE_VLAN) {
 389  390                  struct ether_vlan_header *evhp;
 390  391                  struct ether_vlan_header evh;
 391  392  
 392  393                  hdr.mhi_hdrsize = sizeof (struct ether_vlan_header);
 393  394                  hdr.mhi_istagged = B_TRUE;
 394  395  
 395  396                  if (MBLKL(mp) >= sizeof (*evhp)) {
 396  397                          evhp = (struct ether_vlan_header *)mp->b_rptr;
 397  398                  } else {
 398  399                          int sz = sizeof (*evhp);
 399  400                          char *s = (char *)&evh;
 400  401                          mblk_t *tmp;
 401  402                          int len;
 402  403  
 403  404                          for (tmp = mp; sz > 0 && tmp != NULL;
 404  405                              tmp = tmp->b_cont) {
 405  406                                  len = min(sz, MBLKL(tmp));
 406  407                                  bcopy(tmp->b_rptr, s, len);
 407  408                                  sz -= len;
 408  409                          }
 409  410                          evhp = &evh;
 410  411                  }
 411  412                  hdr.mhi_tci = ntohs(evhp->ether_tci);
 412  413                  hdr.mhi_bindsap = ntohs(evhp->ether_type);
 413  414          }
 414  415  
 415  416          if ((ps->ps_proto != 0) && (ps->ps_proto != hdr.mhi_bindsap)) {
 416  417                  /*
 417  418                   * The packet is not of interest to this socket so
 418  419                   * drop it on the floor. Here the SAP is being used
 419  420                   * as a very course filter.
 420  421                   */
 421  422                  ps->ps_stats.tp_drops++;
 422  423                  ks_stats.kp_recv_bad_proto.value.ui64++;
 423  424                  freemsg(mp);
 424  425                  return;
 425  426          }
 426  427  
 427  428          /*
 428  429           * This field is not often set, even for ethernet,
 429  430           * by mac_header_info, so compute it if it is 0.
 430  431           */
 431  432          if (hdr.mhi_pktsize == 0)
 432  433                  hdr.mhi_pktsize = msgdsize(mp);
 433  434  
 434  435          /*
 435  436           * If a BPF filter is present, pass the raw packet into that.
 436  437           * A failed match will result in zero being returned, indicating
 437  438           * that this socket is not interested in the packet.
 438  439           */
 439  440          if (ps->ps_bpf.bf_len != 0) {
 440  441                  uchar_t *buffer;

↓ open down ↓

377 lines elided

↑ open up ↑

 441  442                  int buflen;
 442  443  
 443  444                  buflen = MBLKL(mp);
 444  445                  if (hdr.mhi_pktsize == buflen) {
 445  446                          buffer = mp->b_rptr;
 446  447                  } else {
 447  448                          buflen = 0;
 448  449                          buffer = (uchar_t *)mp;
 449  450                  }
 450  451                  rw_enter(&ps->ps_bpflock, RW_READER);
 451      -                if (bpf_filter(ps->ps_bpf.bf_insns, buffer,
      452 +                if (ip_bpf_filter((ip_bpf_insn_t *)ps->ps_bpf.bf_insns, buffer,
 452  453                      hdr.mhi_pktsize, buflen) == 0) {
 453  454                          rw_exit(&ps->ps_bpflock);
 454  455                          ps->ps_stats.tp_drops++;
 455  456                          ks_stats.kp_recv_filtered.value.ui64++;
 456  457                          freemsg(mp);
 457  458                          return;
 458  459                  }
 459  460                  rw_exit(&ps->ps_bpflock);
 460  461          }
 461  462

 462  463          if (ps->ps_type == SOCK_DGRAM) {
 463  464                  /*
 464  465                   * SOCK_DGRAM socket expect a "layer 3" packet, so advance
 465  466                   * past the link layer header.
 466  467                   */
 467  468                  mp->b_rptr += hdr.mhi_hdrsize;
 468  469                  hdr.mhi_pktsize -= hdr.mhi_hdrsize;
 469  470          }
 470  471  
 471  472          tusz = sizeof (struct T_unitdata_ind) + sizeof (struct sockaddr_ll);
 472  473          if (ps->ps_auxdata) {
 473  474                  tusz += _TPI_ALIGN_TOPT(sizeof (struct tpacket_auxdata));
 474  475                  tusz += _TPI_ALIGN_TOPT(sizeof (struct T_opthdr));
 475  476          }
 476  477  
 477  478          /*
 478  479           * It is tempting to think that this could be optimised by having
 479  480           * the base mblk_t allocated and hung off the pfpsock structure,
 480  481           * except that then another one would need to be allocated for the
 481  482           * sockaddr_ll that is included. Even creating a template to copy
 482  483           * from is of questionable value, as read-write from one structure
 483  484           * to the other is going to be slower than all of the initialisation.
 484  485           */
 485  486          mp0 = allocb(tusz, BPRI_HI);
 486  487          if (mp0 == NULL) {
 487  488                  ps->ps_stats.tp_drops++;
 488  489                  ks_stats.kp_recv_alloc_fail.value.ui64++;
 489  490                  freemsg(mp);
 490  491                  return;
 491  492          }
 492  493  
 493  494          (void) memset(mp0->b_rptr, 0, tusz);
 494  495  
 495  496          mp0->b_datap->db_type = M_PROTO;
 496  497          mp0->b_wptr = mp0->b_rptr + tusz;
 497  498  
 498  499          tunit = (struct T_unitdata_ind *)mp0->b_rptr;
 499  500          tunit->PRIM_type = T_UNITDATA_IND;
 500  501          tunit->SRC_length = sizeof (struct sockaddr);
 501  502          tunit->SRC_offset = sizeof (*tunit);
 502  503  
 503  504          sol = &ps->ps_sock;
 504  505          sll = (struct sockaddr_ll *)(mp0->b_rptr + sizeof (*tunit));
 505  506          sll->sll_ifindex = sol->sll_ifindex;
 506  507          sll->sll_hatype = (uint16_t)hdr.mhi_origsap;
 507  508          sll->sll_halen = sol->sll_halen;
 508  509          if (hdr.mhi_saddr != NULL)
 509  510                  (void) memcpy(sll->sll_addr, hdr.mhi_saddr, sll->sll_halen);
 510  511  
 511  512          switch (hdr.mhi_dsttype) {
 512  513          case MAC_ADDRTYPE_MULTICAST :
 513  514                  sll->sll_pkttype = PACKET_MULTICAST;
 514  515                  break;
 515  516          case MAC_ADDRTYPE_BROADCAST :
 516  517                  sll->sll_pkttype = PACKET_BROADCAST;
 517  518                  break;
 518  519          case MAC_ADDRTYPE_UNICAST :
 519  520                  if (memcmp(sol->sll_addr, hdr.mhi_daddr, sol->sll_halen) == 0)
 520  521                          sll->sll_pkttype = PACKET_HOST;
 521  522                  else
 522  523                          sll->sll_pkttype = PACKET_OTHERHOST;
 523  524                  break;
 524  525          }
 525  526  
 526  527          if (ps->ps_auxdata) {
 527  528                  struct tpacket_auxdata *aux;
 528  529                  struct T_opthdr *topt;
 529  530  
 530  531                  tunit->OPT_offset = _TPI_ALIGN_TOPT(tunit->SRC_offset +
 531  532                      sizeof (struct sockaddr_ll));
 532  533                  tunit->OPT_length = _TPI_ALIGN_TOPT(sizeof (struct T_opthdr)) +
 533  534                      _TPI_ALIGN_TOPT(sizeof (struct tpacket_auxdata));
 534  535  
 535  536                  topt = (struct T_opthdr *)(mp0->b_rptr + tunit->OPT_offset);
 536  537                  aux = (struct tpacket_auxdata *)
 537  538                      ((char *)topt + _TPI_ALIGN_TOPT(sizeof (*topt)));
 538  539  
 539  540                  topt->len = tunit->OPT_length;
 540  541                  topt->level = SOL_PACKET;
 541  542                  topt->name = PACKET_AUXDATA;
 542  543                  topt->status = 0;
 543  544                  /*
 544  545                   * libpcap doesn't seem to use any other field,
 545  546                   * so it isn't clear how they should be filled in.
 546  547                   */
 547  548                  aux->tp_vlan_vci = hdr.mhi_tci;
 548  549          }
 549  550  
 550  551          linkb(mp0, mp);
 551  552  
 552  553          (void) gethrestime(&ps->ps_timestamp);
 553  554  
 554  555          ps->ps_upcalls->su_recv(ps->ps_upper, mp0, hdr.mhi_pktsize, 0,
 555  556              &error, NULL);
 556  557  
 557  558          if (error == 0) {
 558  559                  ps->ps_stats.tp_packets++;
 559  560                  ks_stats.kp_recv_ok.value.ui64++;
 560  561          } else {
 561  562                  mutex_enter(&ps->ps_lock);
 562  563                  if (error == ENOSPC) {
 563  564                          ps->ps_upcalls->su_recv(ps->ps_upper, NULL, 0, 0,
 564  565                              &error, NULL);
 565  566                          if (error == ENOSPC)
 566  567                                  ps->ps_flow_ctrld = B_TRUE;
 567  568                  }
 568  569                  mutex_exit(&ps->ps_lock);
 569  570                  ps->ps_stats.tp_drops++;
 570  571                  ks_stats.kp_recv_fail.value.ui64++;
 571  572          }
 572  573  }
 573  574  
 574  575  /*
 575  576   * Bind a PF_PACKET socket to a network interface.
 576  577   *
 577  578   * The default operation of this bind() is to place the socket (and thus the
 578  579   * network interface) into promiscuous mode. It is then up to the application
 579  580   * to turn that down by issuing the relevant ioctls, if desired.
 580  581   */
 581  582  static int
 582  583  sdpfp_bind(sock_lower_handle_t handle, struct sockaddr *addr,
 583  584      socklen_t addrlen, struct cred *cred)
 584  585  {
 585  586          struct sockaddr_ll *addr_ll, *sol;
 586  587          mac_client_handle_t mch;
 587  588          struct pfpsock *ps;
 588  589          mac_handle_t mh;
 589  590          int error;
 590  591  
 591  592          ps = (struct pfpsock *)handle;
 592  593          if (ps->ps_bound)
 593  594                  return (EINVAL);
 594  595  
 595  596          if (addrlen < sizeof (struct sockaddr_ll) || addr == NULL)
 596  597                  return (EINVAL);
 597  598  
 598  599          addr_ll = (struct sockaddr_ll *)addr;
 599  600  
 600  601          error = pfp_open_index(addr_ll->sll_ifindex, &mh, &mch, cred);
 601  602          if (error != 0)
 602  603                  return (error);
 603  604          /*
 604  605           * Ensure that each socket is only bound once.
 605  606           */
 606  607          mutex_enter(&ps->ps_lock);
 607  608          if (ps->ps_mh != 0) {
 608  609                  mutex_exit(&ps->ps_lock);
 609  610                  pfp_close(mh, mch);
 610  611                  return (EADDRINUSE);
 611  612          }
 612  613          ps->ps_mh = mh;
 613  614          ps->ps_mch = mch;
 614  615          mutex_exit(&ps->ps_lock);
 615  616  
 616  617          /*
 617  618           * Cache all of the information from bind so that it's in an easy
 618  619           * place to get at when packets are received.
 619  620           */
 620  621          sol = &ps->ps_sock;
 621  622          sol->sll_family = AF_PACKET;
 622  623          sol->sll_ifindex = addr_ll->sll_ifindex;
 623  624          sol->sll_protocol = addr_ll->sll_protocol;
 624  625          sol->sll_halen = mac_addr_len(ps->ps_mh);
 625  626          mac_unicast_primary_get(ps->ps_mh, sol->sll_addr);
 626  627          mac_sdu_get(ps->ps_mh, NULL, &ps->ps_max_sdu);
 627  628          ps->ps_linkid = addr_ll->sll_ifindex;
 628  629  
 629  630          error = mac_promisc_add(ps->ps_mch, MAC_CLIENT_PROMISC_ALL,
 630  631              pfp_packet, ps, &ps->ps_phd, MAC_PROMISC_FLAGS_VLAN_TAG_STRIP);
 631  632          if (error == 0) {
 632  633                  ps->ps_promisc = MAC_CLIENT_PROMISC_ALL;
 633  634                  ps->ps_bound = B_TRUE;
 634  635          }
 635  636  
 636  637          return (error);
 637  638  }
 638  639  
 639  640  /* ARGSUSED */
 640  641  static void
 641  642  sdpfp_activate(sock_lower_handle_t lower, sock_upper_handle_t upper,
 642  643      sock_upcalls_t *upcalls, int flags, cred_t *cred)
 643  644  {
 644  645          struct pfpsock *ps;
 645  646  
 646  647          ps = (struct pfpsock *)lower;
 647  648          ps->ps_upper = upper;
 648  649          ps->ps_upcalls = upcalls;
 649  650  }
 650  651  
 651  652  /*
 652  653   * This module only implements getting socket options for the new socket
 653  654   * option level (SOL_PACKET) that it introduces. All other requests are
 654  655   * passed back to the sockfs layer.
 655  656   */
 656  657  /* ARGSUSED */
 657  658  static int
 658  659  sdpfp_getsockopt(sock_lower_handle_t handle, int level, int option_name,
 659  660      void *optval, socklen_t *optlenp, struct cred *cred)
 660  661  {
 661  662          struct pfpsock *ps;
 662  663          int error = 0;
 663  664  
 664  665          ps = (struct pfpsock *)handle;
 665  666  
 666  667          switch (level) {
 667  668          case SOL_PACKET :
 668  669                  error = pfp_getpacket_sockopt(handle, option_name, optval,
 669  670                      optlenp);
 670  671                  break;
 671  672  
 672  673          case SOL_SOCKET :
 673  674                  if (option_name == SO_RCVBUF) {
 674  675                          if (*optlenp < sizeof (int32_t))
 675  676                                  return (EINVAL);
 676  677                          *((int32_t *)optval) = ps->ps_rcvbuf;
 677  678                          *optlenp = sizeof (int32_t);
 678  679                  } else {
 679  680                          error = ENOPROTOOPT;
 680  681                  }
 681  682                  break;
 682  683  
 683  684          default :
 684  685                  /*
 685  686                   * If sockfs code receives this error in return from the
 686  687                   * getsockopt downcall it handles the option locally, if
 687  688                   * it can.
 688  689                   */
 689  690                  error = ENOPROTOOPT;
 690  691                  break;
 691  692          }
 692  693  
 693  694          return (error);
 694  695  }
 695  696  
 696  697  /*
 697  698   * PF_PACKET supports setting socket options at only two levels:
 698  699   * SOL_SOCKET and SOL_PACKET.
 699  700   */
 700  701  /* ARGSUSED */
 701  702  static int
 702  703  sdpfp_setsockopt(sock_lower_handle_t handle, int level, int option_name,
 703  704      const void *optval, socklen_t optlen, struct cred *cred)
 704  705  {
 705  706          int error = 0;
 706  707  
 707  708          switch (level) {
 708  709          case SOL_SOCKET :
 709  710                  error = pfp_setsocket_sockopt(handle, option_name, optval,
 710  711                      optlen);
 711  712                  break;
 712  713          case SOL_PACKET :
 713  714                  error = pfp_setpacket_sockopt(handle, option_name, optval,
 714  715                      optlen);
 715  716                  break;
 716  717          default :
 717  718                  error = EINVAL;
 718  719                  break;
 719  720          }
 720  721  
 721  722          return (error);
 722  723  }
 723  724  
 724  725  /*
 725  726   * This function is incredibly inefficient for sending any packet that
 726  727   * comes with a msghdr asking to be sent to an interface to which the
 727  728   * socket has not been bound. Some possibilities here are keeping a
 728  729   * cache of all open mac's and mac_client's, for the purpose of sending,
 729  730   * and closing them after some amount of inactivity. Clearly, applications
 730  731   * should not be written to use one socket for multiple interfaces if
 731  732   * performance is desired with the code as is.
 732  733   */
 733  734  /* ARGSUSED */
 734  735  static int
 735  736  sdpfp_senduio(sock_lower_handle_t handle, struct uio *uiop,
 736  737      struct nmsghdr *msg, struct cred *cred)
 737  738  {
 738  739          struct sockaddr_ll *sol;
 739  740          mac_client_handle_t mch;
 740  741          struct pfpsock *ps;
 741  742          boolean_t new_open;
 742  743          mac_handle_t mh;
 743  744          size_t mpsize;
 744  745          uint_t maxsdu;
 745  746          mblk_t *mp0;
 746  747          mblk_t *mp;
 747  748          int error;
 748  749  
 749  750          mp = NULL;
 750  751          mp0 = NULL;
 751  752          new_open = B_FALSE;
 752  753          ps = (struct pfpsock *)handle;
 753  754          mh = ps->ps_mh;
 754  755          mch = ps->ps_mch;
 755  756          maxsdu = ps->ps_max_sdu;
 756  757  
 757  758          sol = (struct sockaddr_ll *)msg->msg_name;
 758  759          if (sol == NULL) {
 759  760                  /*
 760  761                   * If no sockaddr_ll has been provided with the send call,
 761  762                   * use the one constructed when the socket was bound to an
 762  763                   * interface and fail if it hasn't been bound.
 763  764                   */
 764  765                  if (!ps->ps_bound) {
 765  766                          ks_stats.kp_send_unbound.value.ui64++;
 766  767                          return (EPROTO);
 767  768                  }
 768  769                  sol = &ps->ps_sock;
 769  770          } else {
 770  771                  /*
 771  772                   * Verify the sockaddr_ll message passed down before using
 772  773                   * it to send a packet out with. If it refers to an interface
 773  774                   * that has not been bound, it is necessary to open it.
 774  775                   */
 775  776                  struct sockaddr_ll *sll;
 776  777  
 777  778                  if (msg->msg_namelen < sizeof (struct sockaddr_ll)) {
 778  779                          ks_stats.kp_send_short_msg.value.ui64++;
 779  780                          return (EINVAL);
 780  781                  }
 781  782  
 782  783                  if (sol->sll_family != AF_PACKET) {
 783  784                          ks_stats.kp_send_wrong_family.value.ui64++;
 784  785                          return (EAFNOSUPPORT);
 785  786                  }
 786  787  
 787  788                  sll = &ps->ps_sock;
 788  789                  if (sol->sll_ifindex != sll->sll_ifindex) {
 789  790                          error = pfp_open_index(sol->sll_ifindex, &mh, &mch,
 790  791                              cred);
 791  792                          if (error != 0) {
 792  793                                  ks_stats.kp_send_open_fail.value.ui64++;
 793  794                                  return (error);
 794  795                          }
 795  796                          mac_sdu_get(mh, NULL, &maxsdu);
 796  797                          new_open = B_TRUE;
 797  798                  }
 798  799          }
 799  800  
 800  801          mpsize = uiop->uio_resid;
 801  802          if (mpsize > maxsdu) {
 802  803                  ks_stats.kp_send_too_big.value.ui64++;
 803  804                  error = EMSGSIZE;
 804  805                  goto done;
 805  806          }
 806  807  
 807  808          if ((mp = allocb(mpsize, BPRI_HI)) == NULL) {
 808  809                  ks_stats.kp_send_alloc_fail.value.ui64++;
 809  810                  error = ENOBUFS;
 810  811                  goto done;
 811  812          }
 812  813  
 813  814          mp->b_wptr = mp->b_rptr + mpsize;
 814  815          error = uiomove(mp->b_rptr, mpsize, UIO_WRITE, uiop);
 815  816          if (error != 0) {
 816  817                  ks_stats.kp_send_uiomove_fail.value.ui64++;
 817  818                  goto done;
 818  819          }
 819  820  
 820  821          if (ps->ps_type == SOCK_DGRAM) {
 821  822                  mp0 = mac_header(mh, sol->sll_addr, sol->sll_protocol, mp, 0);
 822  823                  if (mp0 == NULL) {
 823  824                          ks_stats.kp_send_no_memory.value.ui64++;
 824  825                          error = ENOBUFS;
 825  826                          goto done;
 826  827                  }
 827  828                  linkb(mp0, mp);
 828  829                  mp = mp0;
 829  830          }
 830  831  
 831  832          /*
 832  833           * As this is sending datagrams and no promise is made about
 833  834           * how or if a packet will be sent/delivered, no effort is to
 834  835           * be expended in recovering from a situation where the packet
 835  836           * cannot be sent - it is just dropped.
 836  837           */
 837  838          error = mac_tx(mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL);
 838  839          if (error == 0) {
 839  840                  mp = NULL;
 840  841                  ks_stats.kp_send_ok.value.ui64++;
 841  842          } else {
 842  843                  ks_stats.kp_send_failed.value.ui64++;
 843  844          }
 844  845  
 845  846  done:
 846  847  
 847  848          if (new_open) {
 848  849                  ASSERT(mch != ps->ps_mch);
 849  850                  ASSERT(mh != ps->ps_mh);
 850  851                  pfp_close(mh, mch);
 851  852          }
 852  853          if (mp != NULL)
 853  854                  freemsg(mp);
 854  855  
 855  856          return (error);
 856  857  
 857  858  }
 858  859  
 859  860  /*
 860  861   * There's no use of a lock here, or at the bottom of pfp_packet() where
 861  862   * ps_flow_ctrld is set to true, because in a situation where these two
 862  863   * are racing to set the flag one way or the other, the end result is
 863  864   * going to be ultimately determined by the scheduler anyway - which of
 864  865   * the two threads gets the lock first? In such an operational environment,
 865  866   * we've got packets arriving too fast to be delt with so packets are going
 866  867   * to be dropped. Grabbing a lock just makes the drop more expensive.
 867  868   */
 868  869  static void
 869  870  sdpfp_clr_flowctrl(sock_lower_handle_t handle)
 870  871  {
 871  872          struct pfpsock *ps;
 872  873  
 873  874          ps = (struct pfpsock *)handle;
 874  875  
 875  876          mutex_enter(&ps->ps_lock);
 876  877          ps->ps_flow_ctrld = B_FALSE;
 877  878          mutex_exit(&ps->ps_lock);
 878  879  }
 879  880  
 880  881  /*
 881  882   * The implementation of this ioctl() handler is intended to function
 882  883   * in the absence of a bind() being made before it is called. Thus the
 883  884   * function calls mac_open() itself to provide a handle
 884  885   * This function is structured like this:
 885  886   * - determine the linkid for the interface being targetted
 886  887   * - open the interface with said linkid
 887  888   * - perform ioctl
 888  889   * - copy results back to caller
 889  890   *
 890  891   * The ioctls that interact with interface flags have been implented below
 891  892   * to assume that the interface is always up and running (IFF_RUNNING) and
 892  893   * to use the state of this socket to determine whether or not the network
 893  894   * interface is in promiscuous mode. Thus an ioctl to get the interface flags
 894  895   * of an interface that has been put in promiscuous mode by another socket
 895  896   * (in the same program or different), will not report that status.
 896  897   */
 897  898  /* ARGSUSED */
 898  899  static int
 899  900  sdpfp_ioctl(sock_lower_handle_t handle, int cmd, intptr_t arg, int mod,
 900  901      int32_t *rval, struct cred *cr)
 901  902  {
 902  903          struct timeval tival;
 903  904          mac_client_promisc_type_t mtype;
 904  905          struct sockaddr_dl *sock;
 905  906          datalink_id_t linkid;
 906  907          struct lifreq lifreq;
 907  908          struct ifreq ifreq;
 908  909          struct pfpsock *ps;
 909  910          mac_handle_t mh;
 910  911          int error;
 911  912  
 912  913          ps = (struct pfpsock *)handle;
 913  914  
 914  915          switch (cmd) {
 915  916          /*
 916  917           * ioctls that work on "struct lifreq"
 917  918           */
 918  919          case SIOCSLIFFLAGS :
 919  920          case SIOCGLIFINDEX :
 920  921          case SIOCGLIFFLAGS :
 921  922          case SIOCGLIFMTU :
 922  923          case SIOCGLIFHWADDR :
 923  924                  error = pfp_lifreq_getlinkid(arg, &lifreq, &linkid, mod);
 924  925                  if (error != 0)
 925  926                          return (error);
 926  927                  break;
 927  928  
 928  929          /*
 929  930           * ioctls that work on "struct ifreq".
 930  931           * Not all of these have a "struct lifreq" partner, for example
 931  932           * SIOCGIFHWADDR, for the simple reason that the logical interface
 932  933           * does not have a hardware address.
 933  934           */
 934  935          case SIOCSIFFLAGS :
 935  936          case SIOCGIFINDEX :
 936  937          case SIOCGIFFLAGS :
 937  938          case SIOCGIFMTU :
 938  939          case SIOCGIFHWADDR :
 939  940                  error = pfp_ifreq_getlinkid(arg, &ifreq, &linkid, mod);
 940  941                  if (error != 0)
 941  942                          return (error);
 942  943                  break;
 943  944  
 944  945          case SIOCGSTAMP :
 945  946                  tival.tv_sec = (time_t)ps->ps_timestamp.tv_sec;
 946  947                  tival.tv_usec = ps->ps_timestamp.tv_nsec / 1000;
 947  948                  if (get_udatamodel() == DATAMODEL_NATIVE) {
 948  949                          error = ddi_copyout(&tival, (void *)arg,
 949  950                              sizeof (tival), mod);
 950  951                  }
 951  952  #ifdef _SYSCALL32_IMPL
 952  953                  else {
 953  954                          struct timeval32 tv32;
 954  955                          TIMEVAL_TO_TIMEVAL32(&tv32, &tival);
 955  956                          error = ddi_copyout(&tv32, (void *)arg,
 956  957                              sizeof (tv32), mod);
 957  958                  }
 958  959  #endif
 959  960                  return (error);
 960  961          }
 961  962  
 962  963          error =  mac_open_by_linkid(linkid, &mh);
 963  964          if (error != 0)
 964  965                  return (error);
 965  966  
 966  967          switch (cmd) {
 967  968          case SIOCGLIFINDEX :
 968  969                  lifreq.lifr_index = linkid;
 969  970                  break;
 970  971  
 971  972          case SIOCGIFINDEX :
 972  973                  ifreq.ifr_index = linkid;
 973  974                  break;
 974  975  
 975  976          case SIOCGIFFLAGS :
 976  977                  ifreq.ifr_flags = IFF_RUNNING;
 977  978                  if (ps->ps_promisc == MAC_CLIENT_PROMISC_ALL)
 978  979                          ifreq.ifr_flags |= IFF_PROMISC;
 979  980                  break;
 980  981  
 981  982          case SIOCGLIFFLAGS :
 982  983                  lifreq.lifr_flags = IFF_RUNNING;
 983  984                  if (ps->ps_promisc == MAC_CLIENT_PROMISC_ALL)
 984  985                          lifreq.lifr_flags |= IFF_PROMISC;
 985  986                  break;
 986  987  
 987  988          case SIOCSIFFLAGS :
 988  989                  if (linkid != ps->ps_linkid) {
 989  990                          error = EINVAL;
 990  991                  } else {
 991  992                          if ((ifreq.ifr_flags & IFF_PROMISC) != 0)
 992  993                                  mtype = MAC_CLIENT_PROMISC_ALL;
 993  994                          else
 994  995                                  mtype = MAC_CLIENT_PROMISC_FILTERED;
 995  996                          error = pfp_set_promisc(ps, mtype);
 996  997                  }
 997  998                  break;
 998  999  
 999 1000          case SIOCSLIFFLAGS :
1000 1001                  if (linkid != ps->ps_linkid) {
1001 1002                          error = EINVAL;
1002 1003                  } else {
1003 1004                          if ((lifreq.lifr_flags & IFF_PROMISC) != 0)
1004 1005                                  mtype = MAC_CLIENT_PROMISC_ALL;
1005 1006                          else
1006 1007                                  mtype = MAC_CLIENT_PROMISC_FILTERED;
1007 1008                          error = pfp_set_promisc(ps, mtype);
1008 1009                  }
1009 1010                  break;
1010 1011  
1011 1012          case SIOCGIFMTU :
1012 1013                  mac_sdu_get(mh, NULL, &ifreq.ifr_mtu);
1013 1014                  break;
1014 1015  
1015 1016          case SIOCGLIFMTU :
1016 1017                  mac_sdu_get(mh, NULL, &lifreq.lifr_mtu);
1017 1018                  break;
1018 1019  
1019 1020          case SIOCGIFHWADDR :
1020 1021                  if (mac_addr_len(mh) > sizeof (ifreq.ifr_addr.sa_data)) {
1021 1022                          error = EPFNOSUPPORT;
1022 1023                          break;
1023 1024                  }
1024 1025  
1025 1026                  if (mac_addr_len(mh) == 0) {
1026 1027                          (void) memset(ifreq.ifr_addr.sa_data, 0,
1027 1028                              sizeof (ifreq.ifr_addr.sa_data));
1028 1029                  } else {
1029 1030                          mac_unicast_primary_get(mh,
1030 1031                              (uint8_t *)ifreq.ifr_addr.sa_data);
1031 1032                  }
1032 1033  
1033 1034                  /*
1034 1035                   * The behaviour here in setting sa_family is consistent
1035 1036                   * with what applications such as tcpdump would expect
1036 1037                   * for a Linux PF_PACKET socket.
1037 1038                   */
1038 1039                  ifreq.ifr_addr.sa_family = pfp_dl_to_arphrd(mac_type(mh));
1039 1040                  break;
1040 1041  
1041 1042          case SIOCGLIFHWADDR :
1042 1043                  lifreq.lifr_type = 0;
1043 1044                  sock = (struct sockaddr_dl *)&lifreq.lifr_addr;
1044 1045  
1045 1046                  if (mac_addr_len(mh) > sizeof (sock->sdl_data)) {
1046 1047                          error = EPFNOSUPPORT;
1047 1048                          break;
1048 1049                  }
1049 1050  
1050 1051                  /*
1051 1052                   * Fill in the sockaddr_dl with link layer details. Of note,
1052 1053                   * the index is returned as 0 for a couple of reasons:
1053 1054                   * (1) there is no public API that uses or requires it
1054 1055                   * (2) the MAC index is currently 32bits and sdl_index is 16.
1055 1056                   */
1056 1057                  sock->sdl_family = AF_LINK;
1057 1058                  sock->sdl_index = 0;
1058 1059                  sock->sdl_type = mac_type(mh);
1059 1060                  sock->sdl_nlen = 0;
1060 1061                  sock->sdl_alen = mac_addr_len(mh);
1061 1062                  sock->sdl_slen = 0;
1062 1063                  if (mac_addr_len(mh) == 0) {
1063 1064                          (void) memset(sock->sdl_data, 0,
1064 1065                              sizeof (sock->sdl_data));
1065 1066                  } else {
1066 1067                          mac_unicast_primary_get(mh, (uint8_t *)sock->sdl_data);
1067 1068                  }
1068 1069                  break;
1069 1070  
1070 1071          default :
1071 1072                  break;
1072 1073          }
1073 1074  
1074 1075          mac_close(mh);
1075 1076  
1076 1077          if (error == 0) {
1077 1078                  /*
1078 1079                   * Only the "GET" ioctls need to copy data back to userace.
1079 1080                   */
1080 1081                  switch (cmd) {
1081 1082                  case SIOCGLIFINDEX :
1082 1083                  case SIOCGLIFFLAGS :
1083 1084                  case SIOCGLIFMTU :
1084 1085                  case SIOCGLIFHWADDR :
1085 1086                          error = ddi_copyout(&lifreq, (void *)arg,
1086 1087                              sizeof (lifreq), mod);
1087 1088                          break;
1088 1089  
1089 1090                  case SIOCGIFINDEX :
1090 1091                  case SIOCGIFFLAGS :
1091 1092                  case SIOCGIFMTU :
1092 1093                  case SIOCGIFHWADDR :
1093 1094                          error = ddi_copyout(&ifreq, (void *)arg,
1094 1095                              sizeof (ifreq), mod);
1095 1096                          break;
1096 1097                  default :
1097 1098                          break;
1098 1099                  }
1099 1100          }
1100 1101  
1101 1102          return (error);
1102 1103  }
1103 1104  
1104 1105  /*
1105 1106   * Closing the socket requires that all open references to network
1106 1107   * interfaces be closed.
1107 1108   */
1108 1109  /* ARGSUSED */
1109 1110  static int
1110 1111  sdpfp_close(sock_lower_handle_t handle, int flag, struct cred *cr)
1111 1112  {
1112 1113          struct pfpsock *ps = (struct pfpsock *)handle;
1113 1114  
1114 1115          if (ps->ps_phd != 0) {
1115 1116                  mac_promisc_remove(ps->ps_phd);
1116 1117                  ps->ps_phd = 0;
1117 1118          }
1118 1119  
1119 1120          if (ps->ps_mch != 0) {
1120 1121                  mac_client_close(ps->ps_mch, 0);
1121 1122                  ps->ps_mch = 0;
1122 1123          }
1123 1124  
1124 1125          if (ps->ps_mh != 0) {
1125 1126                  mac_close(ps->ps_mh);
1126 1127                  ps->ps_mh = 0;
1127 1128          }
1128 1129  
1129 1130          kmem_free(ps, sizeof (*ps));
1130 1131  
1131 1132          return (0);
1132 1133  }
1133 1134  
1134 1135  /* ************************************************************************* */
1135 1136  
1136 1137  /*
1137 1138   * Given a pointer (arg) to a "struct ifreq" (potentially in user space),
1138 1139   * determine the linkid for the interface name stored in that structure.
1139 1140   * name is used as a buffer so that we can ensure a trailing \0 is appended
1140 1141   * to the name safely.
1141 1142   */
1142 1143  static int
1143 1144  pfp_ifreq_getlinkid(intptr_t arg, struct ifreq *ifreqp,
1144 1145      datalink_id_t *linkidp, int mode)
1145 1146  {
1146 1147          char name[IFNAMSIZ + 1];
1147 1148          int error;
1148 1149  
1149 1150          if (ddi_copyin((void *)arg, ifreqp, sizeof (*ifreqp), mode) != 0)
1150 1151                  return (EFAULT);
1151 1152  
1152 1153          (void) strlcpy(name, ifreqp->ifr_name, sizeof (name));
1153 1154  
1154 1155          error = dls_mgmt_get_linkid(name, linkidp);
1155 1156          if (error != 0)
1156 1157                  error = dls_devnet_macname2linkid(name, linkidp);
1157 1158  
1158 1159          return (error);
1159 1160  }
1160 1161  
1161 1162  /*
1162 1163   * Given a pointer (arg) to a "struct lifreq" (potentially in user space),
1163 1164   * determine the linkid for the interface name stored in that structure.
1164 1165   * name is used as a buffer so that we can ensure a trailing \0 is appended
1165 1166   * to the name safely.
1166 1167   */
1167 1168  static int
1168 1169  pfp_lifreq_getlinkid(intptr_t arg, struct lifreq *lifreqp,
1169 1170      datalink_id_t *linkidp, int mode)
1170 1171  {
1171 1172          char name[LIFNAMSIZ + 1];
1172 1173          int error;
1173 1174  
1174 1175          if (ddi_copyin((void *)arg, lifreqp, sizeof (*lifreqp), mode) != 0)
1175 1176                  return (EFAULT);
1176 1177  
1177 1178          (void) strlcpy(name, lifreqp->lifr_name, sizeof (name));
1178 1179  
1179 1180          error = dls_mgmt_get_linkid(name, linkidp);
1180 1181          if (error != 0)
1181 1182                  error = dls_devnet_macname2linkid(name, linkidp);
1182 1183  
1183 1184          return (error);
1184 1185  }
1185 1186  
1186 1187  /*
1187 1188   * Although there are several new SOL_PACKET options that can be set and
1188 1189   * are specific to this implementation of PF_PACKET, the current API does
1189 1190   * not support doing a get on them to retrieve accompanying status. Thus
1190 1191   * it is only currently possible to use SOL_PACKET with getsockopt to
1191 1192   * retrieve statistical information. This remains consistant with the
1192 1193   * Linux API at the time of writing.
1193 1194   */
1194 1195  static int
1195 1196  pfp_getpacket_sockopt(sock_lower_handle_t handle, int option_name,
1196 1197      void *optval, socklen_t *optlenp)
1197 1198  {
1198 1199          struct pfpsock *ps;
1199 1200          struct tpacket_stats_short tpss;
1200 1201          int error = 0;
1201 1202  
1202 1203          ps = (struct pfpsock *)handle;
1203 1204  
1204 1205          switch (option_name) {
1205 1206          case PACKET_STATISTICS :
1206 1207                  if (*optlenp < sizeof (ps->ps_stats)) {
1207 1208                          error = EINVAL;
1208 1209                          break;
1209 1210                  }
1210 1211                  *optlenp = sizeof (ps->ps_stats);
1211 1212                  bcopy(&ps->ps_stats, optval, sizeof (ps->ps_stats));
1212 1213                  break;
1213 1214          case PACKET_STATISTICS_SHORT :
1214 1215                  if (*optlenp < sizeof (tpss)) {
1215 1216                          error = EINVAL;
1216 1217                          break;
1217 1218                  }
1218 1219                  *optlenp = sizeof (tpss);
1219 1220                  tpss.tp_packets = ps->ps_stats.tp_packets;
1220 1221                  tpss.tp_drops = ps->ps_stats.tp_drops;
1221 1222                  bcopy(&tpss, optval, sizeof (tpss));
1222 1223                  break;
1223 1224          default :
1224 1225                  error = EINVAL;
1225 1226                  break;
1226 1227          }
1227 1228  
1228 1229          return (error);
1229 1230  }
1230 1231  
1231 1232  /*
1232 1233   * The SOL_PACKET level for socket options supports three options,
1233 1234   * PACKET_ADD_MEMBERSHIP, PACKET_DROP_MEMBERSHIP and PACKET_AUXDATA.
1234 1235   * This function is responsible for mapping the two socket options
1235 1236   * that manage multicast membership into the appropriate internal
1236 1237   * function calls to bring the option into effect. Whilst direct
1237 1238   * changes to the multicast membership (ADD/DROP) groups is handled
1238 1239   * by calls directly into the mac module, changes to the promiscuos
1239 1240   * mode are vectored through pfp_set_promisc() so that the logic for
1240 1241   * managing the promiscuous mode is in one place.
1241 1242   */
1242 1243  /* ARGSUSED */
1243 1244  static int
1244 1245  pfp_setpacket_sockopt(sock_lower_handle_t handle, int option_name,
1245 1246      const void *optval, socklen_t optlen)
1246 1247  {
1247 1248          struct packet_mreq mreq;
1248 1249          struct pfpsock *ps;
1249 1250          int error = 0;
1250 1251          int opt;
1251 1252  
1252 1253          ps = (struct pfpsock *)handle;
1253 1254          if (!ps->ps_bound)
1254 1255                  return (EPROTO);
1255 1256  
1256 1257          if ((option_name == PACKET_ADD_MEMBERSHIP) ||
1257 1258              (option_name == PACKET_DROP_MEMBERSHIP)) {
1258 1259                  if (!ps->ps_bound)
1259 1260                          return (EPROTO);
1260 1261                  bcopy(optval, &mreq, sizeof (mreq));
1261 1262                  if (ps->ps_linkid != mreq.mr_ifindex)
1262 1263                          return (EINVAL);
1263 1264          }
1264 1265  
1265 1266          switch (option_name) {
1266 1267          case PACKET_ADD_MEMBERSHIP :
1267 1268                  switch (mreq.mr_type) {
1268 1269                  case PACKET_MR_MULTICAST :
1269 1270                          if (mreq.mr_alen != ps->ps_sock.sll_halen)
1270 1271                                  return (EINVAL);
1271 1272  
1272 1273                          error = mac_multicast_add(ps->ps_mch, mreq.mr_address);
1273 1274                          break;
1274 1275  
1275 1276                  case PACKET_MR_PROMISC :
1276 1277                          error = pfp_set_promisc(ps, MAC_CLIENT_PROMISC_ALL);
1277 1278                          break;
1278 1279  
1279 1280                  case PACKET_MR_ALLMULTI :
1280 1281                          error = pfp_set_promisc(ps, MAC_CLIENT_PROMISC_MULTI);
1281 1282                          break;
1282 1283                  }
1283 1284                  break;
1284 1285  
1285 1286          case PACKET_DROP_MEMBERSHIP :
1286 1287                  switch (mreq.mr_type) {
1287 1288                  case PACKET_MR_MULTICAST :
1288 1289                          if (mreq.mr_alen != ps->ps_sock.sll_halen)
1289 1290                                  return (EINVAL);
1290 1291  
1291 1292                          mac_multicast_remove(ps->ps_mch, mreq.mr_address);
1292 1293                          break;
1293 1294  
1294 1295                  case PACKET_MR_PROMISC :
1295 1296                          if (ps->ps_promisc != MAC_CLIENT_PROMISC_ALL)
1296 1297                                  return (EINVAL);
1297 1298                          error = pfp_set_promisc(ps,
1298 1299                              MAC_CLIENT_PROMISC_FILTERED);
1299 1300                          break;
1300 1301  
1301 1302                  case PACKET_MR_ALLMULTI :
1302 1303                          if (ps->ps_promisc != MAC_CLIENT_PROMISC_MULTI)
1303 1304                                  return (EINVAL);
1304 1305                          error = pfp_set_promisc(ps,
1305 1306                              MAC_CLIENT_PROMISC_FILTERED);
1306 1307                          break;
1307 1308                  }
1308 1309                  break;
1309 1310  
1310 1311          case PACKET_AUXDATA :
1311 1312                  if (optlen == sizeof (int)) {
1312 1313                          opt = *(int *)optval;
1313 1314                          ps->ps_auxdata = (opt != 0);
1314 1315                  } else {
1315 1316                          error = EINVAL;
1316 1317                  }
1317 1318                  break;
1318 1319          default :
1319 1320                  error = EINVAL;
1320 1321                  break;
1321 1322          }
1322 1323  
1323 1324          return (error);
1324 1325  }
1325 1326  
1326 1327  /*
1327 1328   * There are only two special setsockopt's for SOL_SOCKET with PF_PACKET:
1328 1329   * SO_ATTACH_FILTER and SO_DETACH_FILTER.

↓ open down ↓

867 lines elided

↑ open up ↑

1329 1330   *
1330 1331   * Both of these setsockopt values are candidates for being handled by the
1331 1332   * socket layer itself in future, however this requires understanding how
1332 1333   * they would interact with all other sockets.
1333 1334   */
1334 1335  static int
1335 1336  pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name,
1336 1337      const void *optval, socklen_t optlen)
1337 1338  {
1338 1339          struct bpf_program prog;
1339      -        struct bpf_insn *fcode;
     1340 +        ip_bpf_insn_t *fcode;
1340 1341          struct pfpsock *ps;
1341 1342          struct sock_proto_props sopp;
1342 1343          int error = 0;
1343 1344          int size;
1344 1345  
1345 1346          ps = (struct pfpsock *)handle;
1346 1347  
1347 1348          switch (option_name) {
1348 1349          case SO_ATTACH_FILTER :
1349 1350  #ifdef _LP64

1350 1351                  if (optlen == sizeof (struct bpf_program32)) {
1351 1352                          struct bpf_program32 prog32;
1352 1353  
1353 1354                          bcopy(optval, &prog32, sizeof (prog32));
1354 1355                          prog.bf_len = prog32.bf_len;
1355 1356                          prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
1356 1357                  } else
1357 1358  #endif
1358 1359                  if (optlen == sizeof (struct bpf_program)) {
1359 1360                          bcopy(optval, &prog, sizeof (prog));
1360 1361                  } else if (optlen != sizeof (struct bpf_program)) {
1361 1362                          return (EINVAL);
1362 1363                  }

↓ open down ↓

13 lines elided

↑ open up ↑

1363 1364                  if (prog.bf_len > BPF_MAXINSNS)
1364 1365                          return (EINVAL);
1365 1366  
1366 1367                  size = prog.bf_len * sizeof (*prog.bf_insns);
1367 1368                  fcode = kmem_alloc(size, KM_SLEEP);
1368 1369                  if (ddi_copyin(prog.bf_insns, fcode, size, 0) != 0) {
1369 1370                          kmem_free(fcode, size);
1370 1371                          return (EFAULT);
1371 1372                  }
1372 1373  
1373      -                if (bpf_validate(fcode, (int)prog.bf_len)) {
     1374 +                if (ip_bpf_validate(fcode, prog.bf_len)) {
1374 1375                          rw_enter(&ps->ps_bpflock, RW_WRITER);
1375 1376                          pfp_release_bpf(ps);
1376      -                        ps->ps_bpf.bf_insns = fcode;
     1377 +                        ps->ps_bpf.bf_insns = (struct bpf_insn *)fcode;
1377 1378                          ps->ps_bpf.bf_len = size;
1378 1379                          rw_exit(&ps->ps_bpflock);
1379 1380  
1380 1381                          return (0);
1381 1382                  }
1382 1383                  kmem_free(fcode, size);
1383 1384                  error = EINVAL;
1384 1385                  break;
1385 1386  
1386 1387          case SO_DETACH_FILTER :

1387 1388                  pfp_release_bpf(ps);
1388 1389                  break;
1389 1390  
1390 1391          case SO_RCVBUF :
1391 1392                  size = *(int32_t *)optval;
1392 1393                  if (size > sockmod_pfp_rcvbuf_max || size < 0)
1393 1394                          return (ENOBUFS);
1394 1395                  sopp.sopp_flags = SOCKOPT_RCVHIWAT;
1395 1396                  sopp.sopp_rxhiwat = size;
1396 1397                  ps->ps_upcalls->su_set_proto_props(ps->ps_upper, &sopp);
1397 1398                  ps->ps_rcvbuf = size;
1398 1399                  break;
1399 1400  
1400 1401          default :
1401 1402                  error = ENOPROTOOPT;
1402 1403                  break;
1403 1404          }
1404 1405  
1405 1406          return (error);
1406 1407  }
1407 1408  
1408 1409  /*
1409 1410   * pfp_open_index is an internal function used to open a MAC device by
1410 1411   * its index. Both a mac_handle_t and mac_client_handle_t are acquired
1411 1412   * because some of the interfaces provided by the mac layer require either
1412 1413   * only the mac_handle_t or both it and mac_handle_t.
1413 1414   *
1414 1415   * Whilst inside the kernel we can access data structures supporting any
1415 1416   * zone, access to interfaces from non-global zones is restricted to those
1416 1417   * interfaces (if any) that are exclusively assigned to a zone.
1417 1418   */
1418 1419  static int
1419 1420  pfp_open_index(int index, mac_handle_t *mhp, mac_client_handle_t *mcip,
1420 1421      cred_t *cred)
1421 1422  {
1422 1423          mac_client_handle_t mch;
1423 1424          zoneid_t ifzoneid;
1424 1425          mac_handle_t mh;
1425 1426          zoneid_t zoneid;
1426 1427          int error;
1427 1428  
1428 1429          mh = 0;
1429 1430          mch = 0;
1430 1431          error = mac_open_by_linkid(index, &mh);
1431 1432          if (error != 0)
1432 1433                  goto bad_open;
1433 1434  
1434 1435          error = mac_client_open(mh, &mch, NULL,
1435 1436              MAC_OPEN_FLAGS_USE_DATALINK_NAME);
1436 1437          if (error != 0)
1437 1438                  goto bad_open;
1438 1439  
1439 1440          zoneid = crgetzoneid(cred);
1440 1441          if (zoneid != GLOBAL_ZONEID) {
1441 1442                  mac_perim_handle_t perim;
1442 1443  
1443 1444                  mac_perim_enter_by_mh(mh, &perim);
1444 1445                  error = dls_link_getzid(mac_name(mh), &ifzoneid);
1445 1446                  mac_perim_exit(perim);
1446 1447                  if (error != 0)
1447 1448                          goto bad_open;
1448 1449                  if (ifzoneid != zoneid) {
1449 1450                          error = EACCES;
1450 1451                          goto bad_open;
1451 1452                  }
1452 1453          }
1453 1454  
1454 1455          *mcip = mch;
1455 1456          *mhp = mh;
1456 1457  
1457 1458          return (0);
1458 1459  bad_open:
1459 1460          if (mch != 0)
1460 1461                  mac_client_close(mch, 0);
1461 1462          if (mh != 0)
1462 1463                  mac_close(mh);
1463 1464          return (error);
1464 1465  }
1465 1466  
1466 1467  static void
1467 1468  pfp_close(mac_handle_t mh, mac_client_handle_t mch)
1468 1469  {
1469 1470          mac_client_close(mch, 0);
1470 1471          mac_close(mh);
1471 1472  }
1472 1473  
1473 1474  /*
1474 1475   * The purpose of this function is to provide a single place where we free
1475 1476   * the loaded BPF program and reset all pointers/counters associated with
1476 1477   * it.
1477 1478   */
1478 1479  static void
1479 1480  pfp_release_bpf(struct pfpsock *ps)
1480 1481  {
1481 1482          if (ps->ps_bpf.bf_len != 0) {
1482 1483                  kmem_free(ps->ps_bpf.bf_insns, ps->ps_bpf.bf_len);
1483 1484                  ps->ps_bpf.bf_len = 0;
1484 1485                  ps->ps_bpf.bf_insns = NULL;
1485 1486          }
1486 1487  }
1487 1488  
1488 1489  /*
1489 1490   * Set the promiscuous mode of a network interface.
1490 1491   * This function only calls the mac layer when there is a change to the
1491 1492   * status of a network interface's promiscous mode. Tracking of how many
1492 1493   * sockets have the network interface in promiscuous mode, and thus the
1493 1494   * control over the physical device's status, is left to the mac layer.
1494 1495   */
1495 1496  static int
1496 1497  pfp_set_promisc(struct pfpsock *ps, mac_client_promisc_type_t turnon)
1497 1498  {
1498 1499          int error = 0;
1499 1500          int flags;
1500 1501  
1501 1502          /*
1502 1503           * There are 4 combinations of turnon/ps_promisc.
1503 1504           * This if handles 2 (both false, both true) and the if() below
1504 1505           * handles the remaining one - when change is required.
1505 1506           */
1506 1507          if (turnon == ps->ps_promisc)
1507 1508                  return (error);
1508 1509  
1509 1510          if (ps->ps_phd != 0) {
1510 1511                  mac_promisc_remove(ps->ps_phd);
1511 1512                  ps->ps_phd = 0;
1512 1513  
1513 1514                  /*
1514 1515                   * ps_promisc is set here in case the call to mac_promisc_add
1515 1516                   * fails: leaving it to indicate that the interface is still
1516 1517                   * in some sort of promiscuous mode is false.
1517 1518                   */
1518 1519                  if (ps->ps_promisc != MAC_CLIENT_PROMISC_FILTERED) {
1519 1520                          ps->ps_promisc = MAC_CLIENT_PROMISC_FILTERED;
1520 1521                          flags = MAC_PROMISC_FLAGS_NO_PHYS;
1521 1522                  } else {
1522 1523                          flags = 0;
1523 1524                  }
1524 1525                  flags |= MAC_PROMISC_FLAGS_VLAN_TAG_STRIP;
1525 1526          }
1526 1527  
1527 1528          error = mac_promisc_add(ps->ps_mch, turnon, pfp_packet, ps,
1528 1529              &ps->ps_phd, flags);
1529 1530          if (error == 0)
1530 1531                  ps->ps_promisc = turnon;
1531 1532  
1532 1533          return (error);
1533 1534  }
1534 1535  
1535 1536  /*
1536 1537   * This table maps the MAC types in Solaris to the ARPHRD_* values used
1537 1538   * on Linux. This is used with the SIOCGIFHWADDR/SIOCGLIFHWADDR ioctl.
1538 1539   *
1539 1540   * The symbols in this table are *not* pulled in from <net/if_arp.h>,
1540 1541   * they are pulled from <netpacket/packet.h>, thus it acts as a source
1541 1542   * of supplementary information to the ARP table.
1542 1543   */
1543 1544  static uint_t arphrd_to_dl[][2] = {
1544 1545          { ARPHRD_IEEE80211,     DL_WIFI },
1545 1546          { ARPHRD_TUNNEL,        DL_IPV4 },
1546 1547          { ARPHRD_TUNNEL,        DL_IPV6 },
1547 1548          { ARPHRD_TUNNEL,        DL_6TO4 },
1548 1549          { ARPHRD_AX25,          DL_X25 },
1549 1550          { ARPHRD_ATM,           DL_ATM },
1550 1551          { 0,                    0 }
1551 1552  };
1552 1553  
1553 1554  static int
1554 1555  pfp_dl_to_arphrd(int dltype)
1555 1556  {
1556 1557          int i;
1557 1558  
1558 1559          for (i = 0; arphrd_to_dl[i][0] != 0; i++)
1559 1560                  if (arphrd_to_dl[i][1] == dltype)
1560 1561                          return (arphrd_to_dl[i][0]);
1561 1562          return (arp_hw_type(dltype));
1562 1563  }

↓ open down ↓

176 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX