Print this page
8634 epoll fails to wake on certain edge-triggered conditions
8635 epoll should not emit POLLNVAL
8636 recursive epoll should emit EPOLLRDNORM
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/io/bpf/bpf.c
+++ new/usr/src/uts/common/io/bpf/bpf.c
1 1 /* $NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $ */
2 2
3 3 /*
4 4 * Copyright (c) 1990, 1991, 1993
5 5 * The Regents of the University of California. All rights reserved.
6 6 *
7 7 * This code is derived from the Stanford/CMU enet packet filter,
8 8 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
9 9 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
10 10 * Berkeley Laboratory.
11 11 *
12 12 * Redistribution and use in source and binary forms, with or without
13 13 * modification, are permitted provided that the following conditions
14 14 * are met:
15 15 * 1. Redistributions of source code must retain the above copyright
16 16 * notice, this list of conditions and the following disclaimer.
17 17 * 2. Redistributions in binary form must reproduce the above copyright
18 18 * notice, this list of conditions and the following disclaimer in the
19 19 * documentation and/or other materials provided with the distribution.
20 20 * 3. Neither the name of the University nor the names of its contributors
21 21 * may be used to endorse or promote products derived from this software
22 22 * without specific prior written permission.
23 23 *
24 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
↓ open down ↓ |
32 lines elided |
↑ open up ↑ |
33 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 34 * SUCH DAMAGE.
35 35 *
36 36 * @(#)bpf.c 8.4 (Berkeley) 1/9/95
37 37 * static char rcsid[] =
38 38 * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
39 39 */
40 40 /*
41 41 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
42 42 * Use is subject to license terms.
43 + * Copyright 2017 Joyent, Inc.
43 44 */
44 45
45 46 /*
46 47 * The BPF implements the following access controls for zones attempting
47 48 * to read and write data. Writing of data requires that the net_rawaccess
48 49 * privilege is held whilst reading data requires either net_rawaccess or
49 50 * net_observerability.
50 51 *
51 52 * | Shared | Exclusive | Global
52 53 * -----------------------------+--------+------------+------------+
53 54 * DLT_IPNET in local zone | Read | Read | Read |
54 55 * -----------------------------+--------+------------+------------+
55 56 * Raw access to local zone NIC | None | Read/Write | Read/Write |
56 57 * -----------------------------+--------+------------+------------+
57 58 * Raw access to all NICs | None | None | Read/Write |
58 59 * -----------------------------+--------+------------+------------+
59 60 *
60 61 * The BPF driver is written as a cloning driver: each call to bpfopen()
61 62 * allocates a new minor number. This provides BPF with a 1:1 relationship
62 63 * between open's and close's. There is some amount of "descriptor state"
63 64 * that is kept per open. Pointers to this data are stored in a hash table
64 65 * (bpf_hash) that is index'd by the minor device number for each open file.
65 66 */
66 67 #include <sys/param.h>
67 68 #include <sys/systm.h>
68 69 #include <sys/time.h>
69 70 #include <sys/ioctl.h>
70 71 #include <sys/queue.h>
71 72 #include <sys/filio.h>
72 73 #include <sys/policy.h>
73 74 #include <sys/cmn_err.h>
74 75 #include <sys/uio.h>
75 76 #include <sys/file.h>
76 77 #include <sys/sysmacros.h>
77 78 #include <sys/zone.h>
78 79
79 80 #include <sys/socket.h>
80 81 #include <sys/errno.h>
81 82 #include <sys/poll.h>
82 83 #include <sys/dlpi.h>
83 84 #include <sys/neti.h>
84 85
85 86 #include <net/if.h>
86 87
87 88 #include <net/bpf.h>
88 89 #include <net/bpfdesc.h>
89 90 #include <net/dlt.h>
90 91
91 92 #include <netinet/in.h>
92 93 #include <sys/mac.h>
93 94 #include <sys/mac_client.h>
94 95 #include <sys/mac_impl.h>
95 96 #include <sys/time_std_impl.h>
96 97 #include <sys/hook.h>
97 98 #include <sys/hook_event.h>
98 99
99 100
100 101 #define mtod(_v, _t) (_t)((_v)->b_rptr)
101 102 #define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr)
102 103
103 104 /*
104 105 * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
105 106 * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
106 107 */
107 108 #define BPF_BUFSIZE (32 * 1024)
108 109
109 110 typedef void *(*cp_fn_t)(void *, const void *, size_t);
110 111
111 112 /*
112 113 * The default read buffer size, and limit for BIOCSBLEN.
113 114 */
114 115 int bpf_bufsize = BPF_BUFSIZE;
115 116 int bpf_maxbufsize = (16 * 1024 * 1024);
116 117 static mod_hash_t *bpf_hash = NULL;
117 118
118 119 /*
119 120 * Use a mutex to avoid a race condition between gathering the stats/peers
120 121 * and opening/closing the device.
121 122 */
122 123 static kcondvar_t bpf_dlt_waiter;
123 124 static kmutex_t bpf_mtx;
124 125 static bpf_kstats_t ks_stats;
125 126 static bpf_kstats_t bpf_kstats = {
126 127 { "readWait", KSTAT_DATA_UINT64 },
127 128 { "writeOk", KSTAT_DATA_UINT64 },
128 129 { "writeError", KSTAT_DATA_UINT64 },
129 130 { "receive", KSTAT_DATA_UINT64 },
130 131 { "captured", KSTAT_DATA_UINT64 },
131 132 { "dropped", KSTAT_DATA_UINT64 },
132 133 };
133 134 static kstat_t *bpf_ksp;
134 135
135 136 /*
136 137 * bpf_list is a list of the BPF descriptors currently open
137 138 */
138 139 LIST_HEAD(, bpf_d) bpf_list;
139 140
140 141 static int bpf_allocbufs(struct bpf_d *);
141 142 static void bpf_clear_timeout(struct bpf_d *);
142 143 static void bpf_deliver(struct bpf_d *, cp_fn_t,
143 144 void *, uint_t, uint_t, boolean_t);
144 145 static void bpf_freed(struct bpf_d *);
145 146 static int bpf_ifname(struct bpf_d *d, char *, int);
146 147 static void *bpf_mcpy(void *, const void *, size_t);
147 148 static int bpf_attachd(struct bpf_d *, const char *, int);
148 149 static void bpf_detachd(struct bpf_d *);
149 150 static int bpf_setif(struct bpf_d *, char *, int);
150 151 static void bpf_timed_out(void *);
151 152 static inline void
152 153 bpf_wakeup(struct bpf_d *);
153 154 static void catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
154 155 cp_fn_t, struct timeval *);
155 156 static void reset_d(struct bpf_d *);
156 157 static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
157 158 static int bpf_setdlt(struct bpf_d *, void *);
158 159 static void bpf_dev_add(struct bpf_d *);
159 160 static struct bpf_d *bpf_dev_find(minor_t);
160 161 static struct bpf_d *bpf_dev_get(minor_t);
161 162 static void bpf_dev_remove(struct bpf_d *);
162 163
163 164 static int
164 165 bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)
165 166 {
166 167 mblk_t *m;
167 168 int error;
168 169 int len;
169 170 int hlen;
170 171 int align;
171 172
172 173 /*
173 174 * Build a sockaddr based on the data link layer type.
174 175 * We do this at this level because the ethernet header
175 176 * is copied directly into the data field of the sockaddr.
176 177 * In the case of SLIP, there is no header and the packet
177 178 * is forwarded as is.
178 179 * Also, we are careful to leave room at the front of the mbuf
179 180 * for the link level header.
180 181 */
181 182 switch (linktype) {
182 183
183 184 case DLT_EN10MB:
184 185 hlen = sizeof (struct ether_header);
185 186 break;
186 187
187 188 case DLT_FDDI:
188 189 hlen = 16;
189 190 break;
190 191
191 192 case DLT_NULL:
192 193 hlen = 0;
193 194 break;
194 195
195 196 case DLT_IPOIB:
196 197 hlen = 44;
197 198 break;
198 199
199 200 default:
200 201 return (EIO);
201 202 }
202 203
203 204 align = 4 - (hlen & 3);
204 205
205 206 len = uio->uio_resid;
206 207 /*
207 208 * If there aren't enough bytes for a link level header or the
208 209 * packet length exceeds the interface mtu, return an error.
209 210 */
210 211 if (len < hlen || len - hlen > mtu)
211 212 return (EMSGSIZE);
212 213
213 214 m = allocb(len + align, BPRI_MED);
214 215 if (m == NULL) {
215 216 error = ENOBUFS;
216 217 goto bad;
217 218 }
218 219
219 220 /* Insure the data is properly aligned */
220 221 if (align > 0)
221 222 m->b_rptr += align;
222 223 m->b_wptr = m->b_rptr + len;
223 224
224 225 error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
225 226 if (error)
226 227 goto bad;
227 228 *mp = m;
228 229 return (0);
229 230
230 231 bad:
231 232 if (m != NULL)
232 233 freemsg(m);
233 234 return (error);
234 235 }
235 236
236 237
237 238 /*
238 239 * Attach file to the bpf interface, i.e. make d listen on bp.
239 240 */
240 241 static int
241 242 bpf_attachd(struct bpf_d *d, const char *ifname, int dlt)
242 243 {
243 244 bpf_provider_list_t *bp;
244 245 bpf_provider_t *bpr;
245 246 boolean_t zonematch;
246 247 zoneid_t niczone;
247 248 uintptr_t mcip;
248 249 zoneid_t zone;
249 250 uint_t nicdlt;
250 251 uintptr_t mh;
251 252 int hdrlen;
252 253 int error;
253 254
254 255 ASSERT(d->bd_bif == NULL);
255 256 ASSERT(d->bd_mcip == NULL);
256 257 zone = d->bd_zone;
257 258 zonematch = B_TRUE;
258 259 again:
259 260 mh = 0;
260 261 mcip = 0;
261 262 LIST_FOREACH(bp, &bpf_providers, bpl_next) {
262 263 bpr = bp->bpl_what;
263 264 error = MBPF_OPEN(bpr, ifname, &mh, zone);
264 265 if (error != 0)
265 266 goto next;
266 267 error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
267 268 if (error != 0)
268 269 goto next;
269 270 error = MBPF_GET_DLT(bpr, mh, &nicdlt);
270 271 if (error != 0)
271 272 goto next;
272 273
273 274 nicdlt = bpf_dl_to_dlt(nicdlt);
274 275 if (dlt != -1 && dlt != nicdlt) {
275 276 error = ENOENT;
276 277 goto next;
277 278 }
278 279
279 280 error = MBPF_GET_ZONE(bpr, mh, &niczone);
280 281 if (error != 0)
281 282 goto next;
282 283
283 284 DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr,
284 285 uintptr_t, mh, int, nicdlt, zoneid_t, niczone);
285 286
286 287 if (zonematch && niczone != zone) {
287 288 error = ENOENT;
288 289 goto next;
289 290 }
290 291 break;
291 292 next:
292 293 if (mcip != 0) {
293 294 MBPF_CLIENT_CLOSE(bpr, mcip);
294 295 mcip = 0;
295 296 }
296 297 if (mh != NULL) {
297 298 MBPF_CLOSE(bpr, mh);
298 299 mh = 0;
299 300 }
300 301 }
301 302 if (error != 0) {
302 303 if (zonematch && (zone == GLOBAL_ZONEID)) {
303 304 /*
304 305 * If we failed to do an exact match for the global
305 306 * zone using the global zoneid, try again in case
306 307 * the network interface is owned by a local zone.
307 308 */
308 309 zonematch = B_FALSE;
309 310 goto again;
310 311 }
311 312 return (error);
312 313 }
313 314
314 315 d->bd_mac = *bpr;
315 316 d->bd_mcip = mcip;
316 317 d->bd_bif = mh;
317 318 d->bd_dlt = nicdlt;
318 319 hdrlen = bpf_dl_hdrsize(nicdlt);
319 320 d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
320 321
321 322 (void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip),
322 323 sizeof (d->bd_ifname));
323 324
324 325 (void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid,
325 326 zone);
326 327 (void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d,
327 328 &d->bd_promisc_handle, d->bd_promisc_flags);
328 329 return (0);
329 330 }
330 331
331 332 /*
332 333 * Detach a file from its interface.
333 334 */
334 335 static void
335 336 bpf_detachd(struct bpf_d *d)
336 337 {
337 338 uintptr_t mph;
338 339 uintptr_t mch;
339 340 uintptr_t mh;
340 341
341 342 ASSERT(d->bd_inuse == -1);
342 343 mch = d->bd_mcip;
343 344 d->bd_mcip = 0;
344 345 mh = d->bd_bif;
345 346 d->bd_bif = 0;
346 347
347 348 /*
348 349 * Check if this descriptor had requested promiscuous mode.
349 350 * If so, turn it off. There's no need to take any action
350 351 * here, that is done when MBPF_PROMISC_REMOVE is used;
351 352 * bd_promisc is just a local flag to stop promiscuous mode
352 353 * from being set more than once.
353 354 */
354 355 if (d->bd_promisc)
355 356 d->bd_promisc = 0;
356 357
357 358 /*
358 359 * Take device out of "promiscuous" mode. Since we were able to
359 360 * enter "promiscuous" mode, we should be able to turn it off.
360 361 * Note, this field stores a pointer used to support both
361 362 * promiscuous and non-promiscuous callbacks for packets.
362 363 */
363 364 mph = d->bd_promisc_handle;
364 365 d->bd_promisc_handle = 0;
365 366
366 367 /*
367 368 * The lock has to be dropped here because mac_promisc_remove may
368 369 * need to wait for mac_promisc_dispatch, which has called into
369 370 * bpf and catchpacket is waiting for bd_lock...
370 371 * i.e mac_promisc_remove() needs to be called with none of the
371 372 * locks held that are part of the bpf_mtap() call path.
372 373 */
373 374 mutex_exit(&d->bd_lock);
374 375 if (mph != 0)
375 376 MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
376 377
377 378 if (mch != 0)
378 379 MBPF_CLIENT_CLOSE(&d->bd_mac, mch);
379 380
380 381 if (mh != 0)
381 382 MBPF_CLOSE(&d->bd_mac, mh);
382 383
383 384 /*
384 385 * Because this function is called with bd_lock held, so it must
385 386 * exit with it held.
386 387 */
387 388 mutex_enter(&d->bd_lock);
388 389 *d->bd_ifname = '\0';
389 390 (void) memset(&d->bd_mac, 0, sizeof (d->bd_mac));
390 391 }
391 392
392 393
393 394 /*
394 395 * bpfilterattach() is called at load time.
395 396 */
396 397 int
397 398 bpfilterattach(void)
398 399 {
399 400
400 401 bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
401 402 mod_hash_null_keydtor);
402 403 if (bpf_hash == NULL)
403 404 return (ENOMEM);
404 405
405 406 (void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));
406 407
407 408 bpf_ksp = kstat_create("bpf", 0, "global", "misc",
408 409 KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
409 410 KSTAT_FLAG_VIRTUAL);
410 411 if (bpf_ksp != NULL) {
411 412 bpf_ksp->ks_data = &ks_stats;
412 413 kstat_install(bpf_ksp);
413 414 } else {
414 415 mod_hash_destroy_idhash(bpf_hash);
415 416 bpf_hash = NULL;
416 417 return (EEXIST);
417 418 }
418 419
419 420 cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
420 421 mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);
421 422
422 423 LIST_INIT(&bpf_list);
423 424
424 425 return (0);
425 426 }
426 427
427 428
428 429 /*
429 430 * bpfilterdetach() is called at unload time.
430 431 */
431 432 int
432 433 bpfilterdetach(void)
433 434 {
434 435
435 436 if (bpf_ksp != NULL) {
436 437 kstat_delete(bpf_ksp);
437 438 bpf_ksp = NULL;
438 439 }
439 440
440 441 mod_hash_destroy_idhash(bpf_hash);
441 442 bpf_hash = NULL;
442 443
443 444 cv_destroy(&bpf_dlt_waiter);
444 445 mutex_destroy(&bpf_mtx);
445 446
446 447 return (0);
447 448 }
448 449
449 450 /*
450 451 * Open ethernet device. Clones.
451 452 */
452 453 /* ARGSUSED */
453 454 int
454 455 bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)
455 456 {
456 457 struct bpf_d *d;
457 458 uint_t dmin;
458 459
459 460 /*
460 461 * The security policy described at the top of this file is
461 462 * enforced here.
462 463 */
463 464 if ((flag & FWRITE) != 0) {
464 465 if (secpolicy_net_rawaccess(cred) != 0)
465 466 return (EACCES);
466 467 }
467 468
468 469 if ((flag & FREAD) != 0) {
469 470 if ((secpolicy_net_observability(cred) != 0) &&
470 471 (secpolicy_net_rawaccess(cred) != 0))
471 472 return (EACCES);
472 473 }
473 474
474 475 if ((flag & (FWRITE|FREAD)) == 0)
475 476 return (ENXIO);
476 477
477 478 /*
478 479 * A structure is allocated per open file in BPF to store settings
479 480 * such as buffer capture size, provide private buffers, etc.
480 481 */
481 482 d = (struct bpf_d *)kmem_zalloc(sizeof (*d), KM_SLEEP);
482 483 d->bd_bufsize = bpf_bufsize;
483 484 d->bd_fmode = flag;
484 485 d->bd_zone = crgetzoneid(cred);
485 486 d->bd_seesent = 1;
486 487 d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS|
487 488 MAC_PROMISC_FLAGS_NO_COPY;
488 489 mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
489 490 cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);
490 491
491 492 mutex_enter(&bpf_mtx);
492 493 /*
493 494 * Find an unused minor number. Obviously this is an O(n) algorithm
494 495 * and doesn't scale particularly well, so if there are large numbers
495 496 * of open file descriptors happening in real use, this design may
496 497 * need to be revisited.
497 498 */
498 499 for (dmin = 0; dmin < L_MAXMIN; dmin++)
499 500 if (bpf_dev_find(dmin) == NULL)
500 501 break;
501 502 if (dmin == L_MAXMIN) {
502 503 mutex_exit(&bpf_mtx);
503 504 kmem_free(d, sizeof (*d));
504 505 return (ENXIO);
505 506 }
506 507 d->bd_dev = dmin;
507 508 LIST_INSERT_HEAD(&bpf_list, d, bd_list);
508 509 bpf_dev_add(d);
509 510 mutex_exit(&bpf_mtx);
510 511
511 512 *devp = makedevice(getmajor(*devp), dmin);
512 513
513 514 return (0);
514 515 }
515 516
516 517 /*
517 518 * Close the descriptor by detaching it from its interface,
518 519 * deallocating its buffers, and marking it free.
519 520 *
520 521 * Because we only allow a device to be opened once, there is always a
521 522 * 1 to 1 relationship between opens and closes supporting this function.
522 523 */
523 524 /* ARGSUSED */
524 525 int
525 526 bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
526 527 {
527 528 struct bpf_d *d = bpf_dev_get(getminor(dev));
528 529
529 530 mutex_enter(&d->bd_lock);
530 531
531 532 while (d->bd_inuse != 0) {
532 533 d->bd_waiting++;
533 534 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
534 535 d->bd_waiting--;
535 536 mutex_exit(&d->bd_lock);
536 537 return (EINTR);
537 538 }
538 539 d->bd_waiting--;
539 540 }
540 541
541 542 d->bd_inuse = -1;
542 543 if (d->bd_state == BPF_WAITING)
543 544 bpf_clear_timeout(d);
544 545 d->bd_state = BPF_IDLE;
545 546 if (d->bd_bif)
546 547 bpf_detachd(d);
547 548 mutex_exit(&d->bd_lock);
548 549
549 550 mutex_enter(&bpf_mtx);
550 551 LIST_REMOVE(d, bd_list);
551 552 bpf_dev_remove(d);
552 553 mutex_exit(&bpf_mtx);
553 554
554 555 mutex_enter(&d->bd_lock);
555 556 mutex_destroy(&d->bd_lock);
556 557 cv_destroy(&d->bd_wait);
557 558
558 559 bpf_freed(d);
559 560 kmem_free(d, sizeof (*d));
560 561
561 562 return (0);
562 563 }
563 564
/*
 * Rotate the packet buffers in descriptor d. Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 *
 * Wrapped in do { } while (0) so the multi-statement macro expands as a
 * single statement and is safe inside unbraced if/else bodies.
 */
#define	ROTATE_BUFFERS(d)			\
	do {					\
		(d)->bd_hbuf = (d)->bd_sbuf;	\
		(d)->bd_hlen = (d)->bd_slen;	\
		(d)->bd_sbuf = (d)->bd_fbuf;	\
		(d)->bd_slen = 0;		\
		(d)->bd_fbuf = 0;		\
	} while (0)
575 576 /*
576 577 * bpfread - read next chunk of packets from buffers
577 578 */
578 579 /* ARGSUSED */
579 580 int
580 581 bpfread(dev_t dev, struct uio *uio, cred_t *cred)
581 582 {
582 583 struct bpf_d *d = bpf_dev_get(getminor(dev));
583 584 int timed_out;
584 585 ulong_t delay;
585 586 int error;
586 587
587 588 if ((d->bd_fmode & FREAD) == 0)
588 589 return (EBADF);
589 590
590 591 /*
591 592 * Restrict application to use a buffer the same size as
592 593 * the kernel buffers.
593 594 */
594 595 if (uio->uio_resid != d->bd_bufsize)
595 596 return (EINVAL);
596 597
597 598 mutex_enter(&d->bd_lock);
598 599 if (d->bd_state == BPF_WAITING)
599 600 bpf_clear_timeout(d);
600 601 timed_out = (d->bd_state == BPF_TIMED_OUT);
601 602 d->bd_state = BPF_IDLE;
602 603 /*
603 604 * If the hold buffer is empty, then do a timed sleep, which
604 605 * ends when the timeout expires or when enough packets
605 606 * have arrived to fill the store buffer.
606 607 */
607 608 while (d->bd_hbuf == 0) {
608 609 if (d->bd_nonblock) {
609 610 if (d->bd_slen == 0) {
610 611 mutex_exit(&d->bd_lock);
611 612 return (EWOULDBLOCK);
612 613 }
613 614 ROTATE_BUFFERS(d);
614 615 break;
615 616 }
616 617
617 618 if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
618 619 /*
619 620 * A packet(s) either arrived since the previous
620 621 * read or arrived while we were asleep.
621 622 * Rotate the buffers and return what's here.
622 623 */
623 624 ROTATE_BUFFERS(d);
624 625 break;
625 626 }
626 627 ks_stats.kp_read_wait.value.ui64++;
627 628 delay = ddi_get_lbolt() + d->bd_rtout;
628 629 error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);
629 630 if (error == 0) {
630 631 mutex_exit(&d->bd_lock);
631 632 return (EINTR);
632 633 }
633 634 if (error == -1) {
634 635 /*
635 636 * On a timeout, return what's in the buffer,
636 637 * which may be nothing. If there is something
637 638 * in the store buffer, we can rotate the buffers.
638 639 */
639 640 if (d->bd_hbuf)
640 641 /*
641 642 * We filled up the buffer in between
642 643 * getting the timeout and arriving
643 644 * here, so we don't need to rotate.
644 645 */
645 646 break;
646 647
647 648 if (d->bd_slen == 0) {
648 649 mutex_exit(&d->bd_lock);
649 650 return (0);
650 651 }
651 652 ROTATE_BUFFERS(d);
652 653 }
653 654 }
654 655 /*
655 656 * At this point, we know we have something in the hold slot.
656 657 */
657 658 mutex_exit(&d->bd_lock);
658 659
659 660 /*
660 661 * Move data from hold buffer into user space.
661 662 * We know the entire buffer is transferred since
662 663 * we checked above that the read buffer is bpf_bufsize bytes.
663 664 */
664 665 error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);
665 666
666 667 mutex_enter(&d->bd_lock);
667 668 d->bd_fbuf = d->bd_hbuf;
668 669 d->bd_hbuf = 0;
669 670 d->bd_hlen = 0;
670 671 done:
671 672 mutex_exit(&d->bd_lock);
672 673 return (error);
673 674 }
674 675
675 676
676 677 /*
677 678 * If there are processes sleeping on this descriptor, wake them up.
678 679 * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
679 680 * so there is no code here grabbing it.
680 681 */
681 682 static inline void
682 683 bpf_wakeup(struct bpf_d *d)
683 684 {
684 685 cv_signal(&d->bd_wait);
685 686 }
686 687
687 688 static void
688 689 bpf_timed_out(void *arg)
689 690 {
690 691 struct bpf_d *d = arg;
691 692
692 693 mutex_enter(&d->bd_lock);
693 694 if (d->bd_state == BPF_WAITING) {
694 695 d->bd_state = BPF_TIMED_OUT;
695 696 if (d->bd_slen != 0)
696 697 cv_signal(&d->bd_wait);
697 698 }
698 699 mutex_exit(&d->bd_lock);
699 700 }
700 701
701 702
702 703 /* ARGSUSED */
703 704 int
704 705 bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)
705 706 {
706 707 struct bpf_d *d = bpf_dev_get(getminor(dev));
707 708 uintptr_t mch;
708 709 uint_t mtu;
709 710 mblk_t *m;
710 711 int error;
711 712 int dlt;
712 713
713 714 if ((d->bd_fmode & FWRITE) == 0)
714 715 return (EBADF);
715 716
716 717 mutex_enter(&d->bd_lock);
717 718 if (d->bd_bif == 0 || d->bd_mcip == 0 || d->bd_bif == 0) {
718 719 mutex_exit(&d->bd_lock);
719 720 return (EINTR);
720 721 }
721 722
722 723 if (uio->uio_resid == 0) {
723 724 mutex_exit(&d->bd_lock);
724 725 return (0);
725 726 }
726 727
727 728 while (d->bd_inuse < 0) {
728 729 d->bd_waiting++;
729 730 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
730 731 d->bd_waiting--;
731 732 mutex_exit(&d->bd_lock);
732 733 return (EINTR);
733 734 }
734 735 d->bd_waiting--;
735 736 }
736 737
737 738 mutex_exit(&d->bd_lock);
738 739
739 740 dlt = d->bd_dlt;
740 741 mch = d->bd_mcip;
741 742 MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu);
742 743 d->bd_inuse++;
743 744
744 745 m = NULL;
745 746 if (dlt == DLT_IPNET) {
746 747 error = EIO;
747 748 goto done;
748 749 }
749 750
750 751 error = bpf_movein(uio, dlt, mtu, &m);
751 752 if (error)
752 753 goto done;
753 754
754 755 DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt,
755 756 uint_t, mtu, mblk_t *, m);
756 757
757 758 if (M_LEN(m) > mtu) {
758 759 error = EMSGSIZE;
759 760 goto done;
760 761 }
761 762
762 763 error = MBPF_TX(&d->bd_mac, mch, m);
763 764 /*
764 765 * The "tx" action here is required to consume the mblk_t.
765 766 */
766 767 m = NULL;
767 768
768 769 done:
769 770 if (error == 0)
770 771 ks_stats.kp_write_ok.value.ui64++;
771 772 else
772 773 ks_stats.kp_write_error.value.ui64++;
773 774 if (m != NULL)
774 775 freemsg(m);
775 776
776 777 mutex_enter(&d->bd_lock);
777 778 d->bd_inuse--;
778 779 if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
779 780 cv_signal(&d->bd_wait);
780 781 mutex_exit(&d->bd_lock);
781 782
782 783 /*
783 784 * The driver frees the mbuf.
784 785 */
785 786 return (error);
786 787 }
787 788
788 789
789 790 /*
790 791 * Reset a descriptor by flushing its packet buffer and clearing the
791 792 * receive and drop counts. Should be called at splnet.
792 793 */
793 794 static void
794 795 reset_d(struct bpf_d *d)
795 796 {
796 797 if (d->bd_hbuf) {
797 798 /* Free the hold buffer. */
798 799 d->bd_fbuf = d->bd_hbuf;
799 800 d->bd_hbuf = 0;
800 801 }
801 802 d->bd_slen = 0;
802 803 d->bd_hlen = 0;
803 804 d->bd_rcount = 0;
804 805 d->bd_dcount = 0;
805 806 d->bd_ccount = 0;
806 807 }
807 808
808 809 /*
809 810 * FIONREAD Check for read packet available.
810 811 * BIOCGBLEN Get buffer len [for read()].
811 812 * BIOCSETF Set ethernet read filter.
812 813 * BIOCFLUSH Flush read packet buffer.
813 814 * BIOCPROMISC Put interface into promiscuous mode.
814 815 * BIOCGDLT Get link layer type.
815 816 * BIOCGETIF Get interface name.
816 817 * BIOCSETIF Set interface.
817 818 * BIOCSRTIMEOUT Set read timeout.
818 819 * BIOCGRTIMEOUT Get read timeout.
819 820 * BIOCGSTATS Get packet stats.
820 821 * BIOCIMMEDIATE Set immediate mode.
821 822 * BIOCVERSION Get filter language version.
822 823 * BIOCGHDRCMPLT Get "header already complete" flag.
823 824 * BIOCSHDRCMPLT Set "header already complete" flag.
824 825 */
825 826 /* ARGSUSED */
826 827 int
827 828 bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)
828 829 {
829 830 struct bpf_d *d = bpf_dev_get(getminor(dev));
830 831 struct bpf_program prog;
831 832 struct lifreq lifreq;
832 833 struct ifreq ifreq;
833 834 int error = 0;
834 835 uint_t size;
835 836
836 837 /*
837 838 * Refresh the PID associated with this bpf file.
838 839 */
839 840 mutex_enter(&d->bd_lock);
840 841 if (d->bd_state == BPF_WAITING)
841 842 bpf_clear_timeout(d);
842 843 d->bd_state = BPF_IDLE;
843 844 mutex_exit(&d->bd_lock);
844 845
845 846 switch (cmd) {
846 847
847 848 default:
848 849 error = EINVAL;
849 850 break;
850 851
851 852 /*
852 853 * Check for read packet available.
853 854 */
854 855 case FIONREAD:
855 856 {
856 857 int n;
857 858
858 859 mutex_enter(&d->bd_lock);
859 860 n = d->bd_slen;
860 861 if (d->bd_hbuf)
861 862 n += d->bd_hlen;
862 863 mutex_exit(&d->bd_lock);
863 864
864 865 *(int *)addr = n;
865 866 break;
866 867 }
867 868
868 869 /*
869 870 * Get buffer len [for read()].
870 871 */
871 872 case BIOCGBLEN:
872 873 error = copyout(&d->bd_bufsize, (void *)addr,
873 874 sizeof (d->bd_bufsize));
874 875 break;
875 876
876 877 /*
877 878 * Set buffer length.
878 879 */
879 880 case BIOCSBLEN:
880 881 if (copyin((void *)addr, &size, sizeof (size)) != 0) {
881 882 error = EFAULT;
882 883 break;
883 884 }
884 885
885 886 mutex_enter(&d->bd_lock);
886 887 if (d->bd_bif != 0) {
887 888 error = EINVAL;
888 889 } else {
889 890 if (size > bpf_maxbufsize)
890 891 size = bpf_maxbufsize;
891 892 else if (size < BPF_MINBUFSIZE)
892 893 size = BPF_MINBUFSIZE;
893 894
894 895 d->bd_bufsize = size;
895 896 }
896 897 mutex_exit(&d->bd_lock);
897 898
898 899 if (error == 0)
899 900 error = copyout(&size, (void *)addr, sizeof (size));
900 901 break;
901 902
902 903 /*
903 904 * Set link layer read filter.
904 905 */
905 906 case BIOCSETF:
906 907 if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {
907 908 error = EFAULT;
908 909 break;
909 910 }
910 911 error = bpf_setf(d, &prog);
911 912 break;
912 913
913 914 /*
914 915 * Flush read packet buffer.
915 916 */
916 917 case BIOCFLUSH:
917 918 mutex_enter(&d->bd_lock);
918 919 reset_d(d);
919 920 mutex_exit(&d->bd_lock);
920 921 break;
921 922
922 923 /*
923 924 * Put interface into promiscuous mode.
924 925 * This is a one-way ioctl, it is not used to turn promiscuous
925 926 * mode off.
926 927 */
927 928 case BIOCPROMISC:
928 929 if (d->bd_bif == 0) {
929 930 /*
930 931 * No interface attached yet.
931 932 */
932 933 error = EINVAL;
933 934 break;
934 935 }
935 936 mutex_enter(&d->bd_lock);
936 937 if (d->bd_promisc == 0) {
937 938
938 939 if (d->bd_promisc_handle) {
939 940 uintptr_t mph;
940 941
941 942 mph = d->bd_promisc_handle;
942 943 d->bd_promisc_handle = 0;
943 944
944 945 mutex_exit(&d->bd_lock);
945 946 MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
946 947 mutex_enter(&d->bd_lock);
947 948 }
948 949
949 950 d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
950 951 error = MBPF_PROMISC_ADD(&d->bd_mac,
951 952 d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
952 953 &d->bd_promisc_handle, d->bd_promisc_flags);
953 954 if (error == 0)
954 955 d->bd_promisc = 1;
955 956 }
956 957 mutex_exit(&d->bd_lock);
957 958 break;
958 959
959 960 /*
960 961 * Get device parameters.
961 962 */
962 963 case BIOCGDLT:
963 964 if (d->bd_bif == 0)
964 965 error = EINVAL;
965 966 else
966 967 error = copyout(&d->bd_dlt, (void *)addr,
967 968 sizeof (d->bd_dlt));
968 969 break;
969 970
970 971 /*
971 972 * Get a list of supported device parameters.
972 973 */
973 974 case BIOCGDLTLIST:
974 975 if (d->bd_bif == 0) {
975 976 error = EINVAL;
976 977 } else {
977 978 struct bpf_dltlist list;
978 979
979 980 if (copyin((void *)addr, &list, sizeof (list)) != 0) {
980 981 error = EFAULT;
981 982 break;
982 983 }
983 984 error = bpf_getdltlist(d, &list);
984 985 if ((error == 0) &&
985 986 copyout(&list, (void *)addr, sizeof (list)) != 0)
986 987 error = EFAULT;
987 988 }
988 989 break;
989 990
990 991 /*
991 992 * Set device parameters.
992 993 */
993 994 case BIOCSDLT:
994 995 error = bpf_setdlt(d, (void *)addr);
995 996 break;
996 997
997 998 /*
998 999 * Get interface name.
999 1000 */
1000 1001 case BIOCGETIF:
1001 1002 if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1002 1003 error = EFAULT;
1003 1004 break;
1004 1005 }
1005 1006 error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1006 1007 if ((error == 0) &&
1007 1008 copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {
1008 1009 error = EFAULT;
1009 1010 break;
1010 1011 }
1011 1012 break;
1012 1013
1013 1014 /*
1014 1015 * Set interface.
1015 1016 */
1016 1017 case BIOCSETIF:
1017 1018 if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1018 1019 error = EFAULT;
1019 1020 break;
1020 1021 }
1021 1022 error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1022 1023 break;
1023 1024
1024 1025 /*
1025 1026 * Get interface name.
1026 1027 */
1027 1028 case BIOCGETLIF:
1028 1029 if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1029 1030 error = EFAULT;
1030 1031 break;
1031 1032 }
1032 1033 error = bpf_ifname(d, lifreq.lifr_name,
1033 1034 sizeof (lifreq.lifr_name));
1034 1035 if ((error == 0) &&
1035 1036 copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {
1036 1037 error = EFAULT;
1037 1038 break;
1038 1039 }
1039 1040 break;
1040 1041
1041 1042 /*
1042 1043 * Set interface.
1043 1044 */
1044 1045 case BIOCSETLIF:
1045 1046 if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1046 1047 error = EFAULT;
1047 1048 break;
1048 1049 }
1049 1050 error = bpf_setif(d, lifreq.lifr_name,
1050 1051 sizeof (lifreq.lifr_name));
1051 1052 break;
1052 1053
1053 1054 #ifdef _SYSCALL32_IMPL
1054 1055 /*
1055 1056 * Set read timeout.
1056 1057 */
1057 1058 case BIOCSRTIMEOUT32:
1058 1059 {
1059 1060 struct timeval32 tv;
1060 1061
1061 1062 if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1062 1063 error = EFAULT;
1063 1064 break;
1064 1065 }
1065 1066
1066 1067 /* Convert the timeout in microseconds to ticks */
1067 1068 d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1068 1069 tv.tv_usec);
1069 1070 if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1070 1071 d->bd_rtout = 1;
1071 1072 break;
1072 1073 }
1073 1074
1074 1075 /*
1075 1076 * Get read timeout.
1076 1077 */
1077 1078 case BIOCGRTIMEOUT32:
1078 1079 {
1079 1080 struct timeval32 tv;
1080 1081 clock_t ticks;
1081 1082
1082 1083 ticks = drv_hztousec(d->bd_rtout);
1083 1084 tv.tv_sec = ticks / 1000000;
1084 1085 tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1085 1086 error = copyout(&tv, (void *)addr, sizeof (tv));
1086 1087 break;
1087 1088 }
1088 1089
1089 1090 /*
1090 1091 * Get a list of supported device parameters.
1091 1092 */
1092 1093 case BIOCGDLTLIST32:
1093 1094 if (d->bd_bif == 0) {
1094 1095 error = EINVAL;
1095 1096 } else {
1096 1097 struct bpf_dltlist32 lst32;
1097 1098 struct bpf_dltlist list;
1098 1099
1099 1100 if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {
1100 1101 error = EFAULT;
1101 1102 break;
1102 1103 }
1103 1104
1104 1105 list.bfl_len = lst32.bfl_len;
1105 1106 list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
1106 1107 error = bpf_getdltlist(d, &list);
1107 1108 if (error == 0) {
1108 1109 lst32.bfl_len = list.bfl_len;
1109 1110
1110 1111 if (copyout(&lst32, (void *)addr,
1111 1112 sizeof (lst32)) != 0)
1112 1113 error = EFAULT;
1113 1114 }
1114 1115 }
1115 1116 break;
1116 1117
1117 1118 /*
1118 1119 * Set link layer read filter.
1119 1120 */
1120 1121 case BIOCSETF32: {
1121 1122 struct bpf_program32 prog32;
1122 1123
1123 1124 if (ddi_copyin((void *)addr, &prog32, sizeof (prog), mode)) {
1124 1125 error = EFAULT;
1125 1126 break;
1126 1127 }
1127 1128 prog.bf_len = prog32.bf_len;
1128 1129 prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
1129 1130 error = bpf_setf(d, &prog);
1130 1131 break;
1131 1132 }
1132 1133 #endif
1133 1134
1134 1135 /*
1135 1136 * Set read timeout.
1136 1137 */
1137 1138 case BIOCSRTIMEOUT:
1138 1139 {
1139 1140 struct timeval tv;
1140 1141
1141 1142 if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1142 1143 error = EFAULT;
1143 1144 break;
1144 1145 }
1145 1146
1146 1147 /* Convert the timeout in microseconds to ticks */
1147 1148 d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1148 1149 tv.tv_usec);
1149 1150 if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1150 1151 d->bd_rtout = 1;
1151 1152 break;
1152 1153 }
1153 1154
1154 1155 /*
1155 1156 * Get read timeout.
1156 1157 */
1157 1158 case BIOCGRTIMEOUT:
1158 1159 {
1159 1160 struct timeval tv;
1160 1161 clock_t ticks;
1161 1162
1162 1163 ticks = drv_hztousec(d->bd_rtout);
1163 1164 tv.tv_sec = ticks / 1000000;
1164 1165 tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1165 1166 if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)
1166 1167 error = EFAULT;
1167 1168 break;
1168 1169 }
1169 1170
1170 1171 /*
1171 1172 * Get packet stats.
1172 1173 */
1173 1174 case BIOCGSTATS:
1174 1175 {
1175 1176 struct bpf_stat bs;
1176 1177
1177 1178 bs.bs_recv = d->bd_rcount;
1178 1179 bs.bs_drop = d->bd_dcount;
1179 1180 bs.bs_capt = d->bd_ccount;
1180 1181 if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)
1181 1182 error = EFAULT;
1182 1183 break;
1183 1184 }
1184 1185
1185 1186 /*
1186 1187 * Set immediate mode.
1187 1188 */
1188 1189 case BIOCIMMEDIATE:
1189 1190 if (copyin((void *)addr, &d->bd_immediate,
1190 1191 sizeof (d->bd_immediate)) != 0)
1191 1192 error = EFAULT;
1192 1193 break;
1193 1194
1194 1195 case BIOCVERSION:
1195 1196 {
1196 1197 struct bpf_version bv;
1197 1198
1198 1199 bv.bv_major = BPF_MAJOR_VERSION;
1199 1200 bv.bv_minor = BPF_MINOR_VERSION;
1200 1201 if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)
1201 1202 error = EFAULT;
1202 1203 break;
1203 1204 }
1204 1205
1205 1206 case BIOCGHDRCMPLT: /* get "header already complete" flag */
1206 1207 if (copyout(&d->bd_hdrcmplt, (void *)addr,
1207 1208 sizeof (d->bd_hdrcmplt)) != 0)
1208 1209 error = EFAULT;
1209 1210 break;
1210 1211
1211 1212 case BIOCSHDRCMPLT: /* set "header already complete" flag */
1212 1213 if (copyin((void *)addr, &d->bd_hdrcmplt,
1213 1214 sizeof (d->bd_hdrcmplt)) != 0)
1214 1215 error = EFAULT;
1215 1216 break;
1216 1217
1217 1218 /*
1218 1219 * Get "see sent packets" flag
1219 1220 */
1220 1221 case BIOCGSEESENT:
1221 1222 if (copyout(&d->bd_seesent, (void *)addr,
1222 1223 sizeof (d->bd_seesent)) != 0)
1223 1224 error = EFAULT;
1224 1225 break;
1225 1226
1226 1227 /*
1227 1228 * Set "see sent" packets flag
1228 1229 */
1229 1230 case BIOCSSEESENT:
1230 1231 if (copyin((void *)addr, &d->bd_seesent,
1231 1232 sizeof (d->bd_seesent)) != 0)
1232 1233 error = EFAULT;
1233 1234 break;
1234 1235
1235 1236 case FIONBIO: /* Non-blocking I/O */
1236 1237 if (copyin((void *)addr, &d->bd_nonblock,
1237 1238 sizeof (d->bd_nonblock)) != 0)
1238 1239 error = EFAULT;
1239 1240 break;
1240 1241 }
1241 1242 return (error);
1242 1243 }
1243 1244
1244 1245 /*
1245 1246 * Set d's packet filter program to fp. If this file already has a filter,
1246 1247 * free it and replace it. If the new filter is "empty" (has a 0 size), then
1247 1248 * the result is to just remove and free the existing filter.
1248 1249 * Returns EINVAL for bogus requests.
1249 1250 */
1250 1251 int
1251 1252 bpf_setf(struct bpf_d *d, struct bpf_program *fp)
1252 1253 {
1253 1254 struct bpf_insn *fcode, *old;
1254 1255 uint_t flen, size;
1255 1256 size_t oldsize;
1256 1257
1257 1258 if (fp->bf_insns == 0) {
1258 1259 if (fp->bf_len != 0)
1259 1260 return (EINVAL);
1260 1261 mutex_enter(&d->bd_lock);
1261 1262 old = d->bd_filter;
1262 1263 oldsize = d->bd_filter_size;
1263 1264 d->bd_filter = 0;
1264 1265 d->bd_filter_size = 0;
1265 1266 reset_d(d);
1266 1267 mutex_exit(&d->bd_lock);
1267 1268 if (old != 0)
1268 1269 kmem_free(old, oldsize);
1269 1270 return (0);
1270 1271 }
1271 1272 flen = fp->bf_len;
1272 1273 if (flen > BPF_MAXINSNS)
1273 1274 return (EINVAL);
1274 1275
1275 1276 size = flen * sizeof (*fp->bf_insns);
1276 1277 fcode = kmem_alloc(size, KM_SLEEP);
1277 1278 if (copyin(fp->bf_insns, fcode, size) != 0)
1278 1279 return (EFAULT);
1279 1280
1280 1281 if (bpf_validate(fcode, (int)flen)) {
1281 1282 mutex_enter(&d->bd_lock);
1282 1283 old = d->bd_filter;
1283 1284 oldsize = d->bd_filter_size;
1284 1285 d->bd_filter = fcode;
1285 1286 d->bd_filter_size = size;
1286 1287 reset_d(d);
1287 1288 mutex_exit(&d->bd_lock);
1288 1289 if (old != 0)
1289 1290 kmem_free(old, oldsize);
1290 1291
1291 1292 return (0);
1292 1293 }
1293 1294 kmem_free(fcode, size);
1294 1295 return (EINVAL);
1295 1296 }
1296 1297
/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifname.
 * Return an errno or 0.
 */
static int
bpf_setif(struct bpf_d *d, char *ifname, int namesize)
{
	int unit_seen;
	int error = 0;
	char *cp;
	int i;

	/*
	 * Make sure the provided name has a unit number, and default
	 * it to '0' if not specified.
	 * XXX This is ugly ... do this differently?
	 */
	unit_seen = 0;
	cp = ifname;
	cp[namesize - 1] = '\0';	/* sanity */
	while (*cp++)
		if (*cp >= '0' && *cp <= '9')
			unit_seen = 1;
	if (!unit_seen) {
		/* Make sure to leave room for the '\0'. */
		for (i = 0; i < (namesize - 1); ++i) {
			if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
			    (ifname[i] >= 'A' && ifname[i] <= 'Z'))
				continue;
			/* First non-alphabetic slot gets the unit digit. */
			ifname[i] = '0';
		}
	}

	/*
	 * Make sure that only one call to this function happens at a time
	 * and that we're not interleaving a read/write
	 */
	mutex_enter(&d->bd_lock);
	while (d->bd_inuse != 0) {
		/* Someone else holds the descriptor; sleep until signalled. */
		d->bd_waiting++;
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			/* Interrupted by a signal: undo and bail out. */
			d->bd_waiting--;
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		d->bd_waiting--;
	}
	/* -1 marks exclusive (writer-style) use of this descriptor. */
	d->bd_inuse = -1;
	mutex_exit(&d->bd_lock);

	/* Allocate the store/free packet buffers on first attach. */
	if (d->bd_sbuf == 0)
		error = bpf_allocbufs(d);

	if (error == 0) {
		mutex_enter(&d->bd_lock);
		if (d->bd_bif)
			/*
			 * Detach if attached to something else.
			 */
			bpf_detachd(d);

		error = bpf_attachd(d, ifname, -1);
		reset_d(d);
		/* Release exclusive use and wake any waiters. */
		d->bd_inuse = 0;
		if (d->bd_waiting != 0)
			cv_signal(&d->bd_wait);
		mutex_exit(&d->bd_lock);
		return (error);
	}

	/*
	 * Buffer allocation failed: release exclusive use before falling
	 * through to the provider tickle below.
	 * NOTE(review): the allocation error (ENOBUFS) is discarded here and
	 * replaced by bpf_provider_tickle()'s return value — confirm this is
	 * intentional.
	 */
	mutex_enter(&d->bd_lock);
	d->bd_inuse = 0;
	if (d->bd_waiting != 0)
		cv_signal(&d->bd_wait);
	mutex_exit(&d->bd_lock);

	/*
	 * Try tickle the mac layer into attaching the device...
	 */
	return (bpf_provider_tickle(ifname, d->bd_zone));
}
1379 1380
1380 1381 /*
1381 1382 * Copy the interface name to the ifreq.
1382 1383 */
1383 1384 static int
1384 1385 bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)
1385 1386 {
1386 1387
1387 1388 mutex_enter(&d->bd_lock);
1388 1389 if (d->bd_bif == NULL) {
|
↓ open down ↓ |
1336 lines elided |
↑ open up ↑ |
1389 1390 mutex_exit(&d->bd_lock);
1390 1391 return (EINVAL);
1391 1392 }
1392 1393
1393 1394 (void) strlcpy(buffer, d->bd_ifname, bufsize);
1394 1395 mutex_exit(&d->bd_lock);
1395 1396
1396 1397 return (0);
1397 1398 }
1398 1399
/* ARGSUSED */
int
bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));

	/*
	 * Until this driver is modified to issue proper pollwakeup() calls on
	 * its pollhead, edge-triggered polling is not allowed.
	 */
	if (events & POLLET) {
		return (EPERM);
	}

	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * An imitation of the FIONREAD ioctl code: readable when the
		 * hold buffer has data, or (in immediate mode / after a read
		 * timeout) when the store buffer does.
		 */
		mutex_enter(&d->bd_lock);
		if (d->bd_hlen != 0 ||
		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
		    d->bd_slen != 0)) {
			*reventsp |= events & (POLLIN | POLLRDNORM);
		} else {
			/*
			 * Until the bpf driver has been updated to include
			 * adequate pollwakeup() logic, no pollhead will be
			 * emitted here, preventing the resource from being
			 * cached by poll()/devpoll/epoll.
			 */
			*reventsp = 0;
			/* Start the read timeout if necessary */
			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
				bpf_clear_timeout(d);
				/*
				 * Only allow the timeout to be set once.
				 */
				if (d->bd_callout == 0)
					d->bd_callout = timeout(bpf_timed_out,
					    d, d->bd_rtout);
				d->bd_state = BPF_WAITING;
			}
		}
		mutex_exit(&d->bd_lock);
	}

	return (0);
}
1443 1449
1444 1450 /*
1445 1451 * Copy data from an mblk_t chain into a buffer. This works for ipnet
1446 1452 * because the dl_ipnetinfo_t is placed in an mblk_t that leads the
1447 1453 * packet itself.
1448 1454 */
1449 1455 static void *
1450 1456 bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
1451 1457 {
1452 1458 const mblk_t *m;
1453 1459 uint_t count;
1454 1460 uchar_t *dst;
1455 1461
1456 1462 m = src_arg;
1457 1463 dst = dst_arg;
1458 1464 while (len > 0) {
1459 1465 if (m == NULL)
1460 1466 panic("bpf_mcpy");
1461 1467 count = (uint_t)min(M_LEN(m), len);
1462 1468 (void) memcpy(dst, mtod(m, const void *), count);
1463 1469 m = m->b_cont;
1464 1470 dst += count;
1465 1471 len -= count;
1466 1472 }
1467 1473 return (dst_arg);
1468 1474 }
1469 1475
/*
 * Dispatch a packet to all the listeners on interface bp.
 *
 * marg    pointer to the packet, either a data buffer or an mbuf chain
 * buflen  buffer length, if marg is a data buffer
 * cpfn    a function that can copy marg into the listener's buffer
 * pktlen  length of the packet
 * issent  boolean indicating whether the packet was sent or received
 */
static inline void
bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen,
    uint_t buflen, boolean_t issent)
{
	struct timeval tv;
	uint_t slen;

	/* Skip sent packets unless the listener asked to see them. */
	if (!d->bd_seesent && issent)
		return;

	/*
	 * Accuracy of the packet counters in BPF is vital so it
	 * is important to protect even the outer ones.
	 */
	mutex_enter(&d->bd_lock);
	/* slen is the snapshot length returned by the filter (0 = reject). */
	slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
	DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif,
	    struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen);
	d->bd_rcount++;
	ks_stats.kp_receive.value.ui64++;
	if (slen != 0) {
		/* Timestamp the match and store it for the reader. */
		uniqtime(&tv);
		catchpacket(d, marg, pktlen, slen, cpfn, &tv);
	}
	mutex_exit(&d->bd_lock);
}
1505 1511
1506 1512 /*
1507 1513 * Incoming linkage from device drivers.
1508 1514 */
1509 1515 /* ARGSUSED */
1510 1516 void
1511 1517 bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent)
1512 1518 {
1513 1519 cp_fn_t cpfn;
1514 1520 struct bpf_d *d = arg;
1515 1521 uint_t pktlen, buflen;
1516 1522 void *marg;
1517 1523
1518 1524 pktlen = msgdsize(m);
1519 1525
1520 1526 if (pktlen == M_LEN(m)) {
1521 1527 cpfn = (cp_fn_t)memcpy;
1522 1528 marg = mtod(m, void *);
1523 1529 buflen = pktlen;
1524 1530 } else {
1525 1531 cpfn = bpf_mcpy;
1526 1532 marg = m;
1527 1533 buflen = 0;
1528 1534 }
1529 1535
1530 1536 bpf_deliver(d, cpfn, marg, pktlen, buflen, issent);
1531 1537 }
1532 1538
1533 1539 /*
1534 1540 * Incoming linkage from ipnet.
1535 1541 * In ipnet, there is only one event, NH_OBSERVE, that delivers packets
1536 1542 * from all network interfaces. Thus the tap function needs to apply a
1537 1543 * filter using the interface index/id to immitate snoop'ing on just the
1538 1544 * specified interface.
1539 1545 */
1540 1546 /* ARGSUSED */
1541 1547 void
1542 1548 bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length)
1543 1549 {
1544 1550 hook_pkt_observe_t *hdr;
1545 1551 struct bpf_d *d = arg;
1546 1552
1547 1553 hdr = (hook_pkt_observe_t *)m->b_rptr;
1548 1554 if (ntohl(hdr->hpo_ifindex) != d->bd_linkid)
1549 1555 return;
1550 1556 bpf_deliver(d, bpf_mcpy, m, length, 0, issent);
1551 1557
1552 1558 }
1553 1559
/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer. Return 1 if it's time to wakeup a listener (buffer full),
 * otherwise 0. "copy" is the routine called to do the actual data
 * transfer. memcpy is passed in to copy contiguous chunks, while
 * bpf_mcpy is passed in to copy mbuf chains. In the latter case,
 * pkt is really an mbuf.
 *
 * Called with bd_lock held (see bpf_deliver()).
 */
static void
catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
    cp_fn_t cpfn, struct timeval *tv)
{
	struct bpf_hdr *hp;
	int totlen, curlen;
	int hdrlen = d->bd_hdrlen;
	int do_wakeup = 0;

	++d->bd_ccount;
	ks_stats.kp_capture.value.ui64++;
	/*
	 * Figure out how many bytes to move.  If the packet is
	 * greater or equal to the snapshot length, transfer that
	 * much.  Otherwise, transfer the whole packet (unless
	 * we hit the buffer size limit).
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;

	/*
	 * Round up the end of the previous packet to the next longword.
	 */
	curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize) {
		/*
		 * This packet will overflow the storage buffer.
		 * Rotate the buffers if we can, then wakeup any
		 * pending reads.
		 */
		if (d->bd_fbuf == 0) {
			/*
			 * We haven't completed the previous read yet,
			 * so drop the packet.
			 */
			++d->bd_dcount;
			ks_stats.kp_dropped.value.ui64++;
			return;
		}
		/* Store buffer becomes the hold buffer; free becomes store. */
		ROTATE_BUFFERS(d);
		do_wakeup = 1;
		curlen = 0;
	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
		/*
		 * Immediate mode is set, or the read timeout has
		 * already expired during a select call.  A packet
		 * arrived, so the reader should be woken up.
		 */
		do_wakeup = 1;
	}

	/*
	 * Append the bpf header to the existing buffer before we add
	 * on the actual packet data.
	 */
	hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
	hp->bh_tstamp.tv_sec = tv->tv_sec;
	hp->bh_tstamp.tv_usec = tv->tv_usec;
	hp->bh_datalen = pktlen;
	hp->bh_hdrlen = (uint16_t)hdrlen;
	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	(*cpfn)((uchar_t *)hp + hdrlen, pkt,
	    (hp->bh_caplen = totlen - hdrlen));
	d->bd_slen = curlen + totlen;

	/*
	 * Call bpf_wakeup after bd_slen has been updated.
	 */
	if (do_wakeup)
		bpf_wakeup(d);
}
1636 1642
1637 1643 /*
1638 1644 * Initialize all nonzero fields of a descriptor.
1639 1645 */
1640 1646 static int
1641 1647 bpf_allocbufs(struct bpf_d *d)
1642 1648 {
1643 1649
1644 1650 d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1645 1651 if (!d->bd_fbuf)
1646 1652 return (ENOBUFS);
1647 1653 d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1648 1654 if (!d->bd_sbuf) {
1649 1655 kmem_free(d->bd_fbuf, d->bd_bufsize);
1650 1656 return (ENOBUFS);
1651 1657 }
1652 1658 d->bd_slen = 0;
1653 1659 d->bd_hlen = 0;
1654 1660 return (0);
1655 1661 }
1656 1662
1657 1663 /*
1658 1664 * Free buffers currently in use by a descriptor.
1659 1665 * Called on close.
1660 1666 */
1661 1667 static void
1662 1668 bpf_freed(struct bpf_d *d)
1663 1669 {
1664 1670 /*
1665 1671 * At this point the descriptor has been detached from its
1666 1672 * interface and it yet hasn't been marked free.
1667 1673 */
1668 1674 if (d->bd_sbuf != 0) {
1669 1675 kmem_free(d->bd_sbuf, d->bd_bufsize);
1670 1676 if (d->bd_hbuf != 0)
1671 1677 kmem_free(d->bd_hbuf, d->bd_bufsize);
1672 1678 if (d->bd_fbuf != 0)
1673 1679 kmem_free(d->bd_fbuf, d->bd_bufsize);
1674 1680 }
1675 1681 if (d->bd_filter)
1676 1682 kmem_free(d->bd_filter, d->bd_filter_size);
1677 1683 }
1678 1684
1679 1685 /*
1680 1686 * Get a list of available data link type of the interface.
1681 1687 */
1682 1688 static int
1683 1689 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp)
1684 1690 {
1685 1691 bpf_provider_list_t *bp;
1686 1692 bpf_provider_t *bpr;
1687 1693 zoneid_t zoneid;
1688 1694 uintptr_t mcip;
1689 1695 uint_t nicdlt;
1690 1696 uintptr_t mh;
1691 1697 int error;
1692 1698 int n;
1693 1699
1694 1700 n = 0;
1695 1701 mh = 0;
1696 1702 mcip = 0;
1697 1703 error = 0;
1698 1704 mutex_enter(&d->bd_lock);
1699 1705 LIST_FOREACH(bp, &bpf_providers, bpl_next) {
1700 1706 bpr = bp->bpl_what;
1701 1707 error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone);
1702 1708 if (error != 0)
1703 1709 goto next;
1704 1710 error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
1705 1711 if (error != 0)
1706 1712 goto next;
1707 1713 error = MBPF_GET_ZONE(bpr, mh, &zoneid);
1708 1714 if (error != 0)
1709 1715 goto next;
1710 1716 if (d->bd_zone != GLOBAL_ZONEID &&
1711 1717 d->bd_zone != zoneid)
1712 1718 goto next;
1713 1719 error = MBPF_GET_DLT(bpr, mh, &nicdlt);
1714 1720 if (error != 0)
1715 1721 goto next;
1716 1722 nicdlt = bpf_dl_to_dlt(nicdlt);
1717 1723 if (listp->bfl_list != NULL) {
1718 1724 if (n >= listp->bfl_len) {
1719 1725 MBPF_CLIENT_CLOSE(bpr, mcip);
1720 1726 MBPF_CLOSE(bpr, mh);
1721 1727 break;
1722 1728 }
1723 1729 /*
1724 1730 * Bumping of bd_inuse ensures the structure does not
1725 1731 * disappear while the copyout runs and allows the for
1726 1732 * loop to be continued.
1727 1733 */
1728 1734 d->bd_inuse++;
1729 1735 mutex_exit(&d->bd_lock);
1730 1736 if (copyout(&nicdlt,
1731 1737 listp->bfl_list + n, sizeof (uint_t)) != 0)
1732 1738 error = EFAULT;
1733 1739 mutex_enter(&d->bd_lock);
1734 1740 if (error != 0)
1735 1741 break;
1736 1742 d->bd_inuse--;
1737 1743 }
1738 1744 n++;
1739 1745 next:
1740 1746 if (mcip != 0) {
1741 1747 MBPF_CLIENT_CLOSE(bpr, mcip);
1742 1748 mcip = 0;
1743 1749 }
1744 1750 if (mh != 0) {
1745 1751 MBPF_CLOSE(bpr, mh);
1746 1752 mh = 0;
1747 1753 }
1748 1754 }
1749 1755 mutex_exit(&d->bd_lock);
1750 1756
1751 1757 /*
1752 1758 * It is quite possible that one or more provider to BPF may not
1753 1759 * know about a link name whlist others do. In that case, so long
1754 1760 * as we have one success, do not declare an error unless it was
1755 1761 * an EFAULT as this indicates a problem that needs to be reported.
1756 1762 */
1757 1763 if ((error != EFAULT) && (n > 0))
1758 1764 error = 0;
1759 1765
1760 1766 listp->bfl_len = n;
1761 1767 return (error);
1762 1768 }
1763 1769
/*
 * Set the data link type of a BPF instance.
 * Copies the requested DLT in from userland, checks zone permissions and
 * then re-attaches the descriptor to the same interface with the new DLT.
 */
static int
bpf_setdlt(struct bpf_d *d, void *addr)
{
	char ifname[LIFNAMSIZ+1];
	zoneid_t niczone;
	int error;
	int dlt;

	if (copyin(addr, &dlt, sizeof (dlt)) != 0)
		return (EFAULT);

	mutex_enter(&d->bd_lock);

	if (d->bd_bif == 0) {			/* Interface not set */
		mutex_exit(&d->bd_lock);
		return (EINVAL);
	}
	if (d->bd_dlt == dlt) {	/* NULL-op */
		mutex_exit(&d->bd_lock);
		return (0);
	}

	error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone);
	if (error != 0) {
		mutex_exit(&d->bd_lock);
		return (error);
	}

	/*
	 * See the matrix at the top of the file for the permissions table
	 * enforced by this driver: only DLT_IPNET may cross zones for a
	 * non-global-zone opener.
	 */
	if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) &&
	    (niczone != d->bd_zone)) {
		mutex_exit(&d->bd_lock);
		return (EINVAL);
	}

	/*
	 * Re-attach to the same link with the new DLT.  bd_inuse = -1
	 * marks the descriptor as exclusively busy for the duration.
	 */
	(void) strlcpy(ifname, d->bd_ifname, sizeof (ifname));
	d->bd_inuse = -1;
	bpf_detachd(d);
	error = bpf_attachd(d, ifname, dlt);
	reset_d(d);
	d->bd_inuse = 0;

	mutex_exit(&d->bd_lock);
	return (error);
}
1815 1821
/*
 * bpf_clear_timeout is called with the bd_lock mutex held, providing it
 * with the necessary protection to retrieve and modify bd_callout but it
 * does not hold the lock for its entire duration... see below...
 */
static void
bpf_clear_timeout(struct bpf_d *d)
{
	timeout_id_t tid = d->bd_callout;
	d->bd_callout = 0;
	/* Hold the descriptor across the window where bd_lock is dropped. */
	d->bd_inuse++;

	/*
	 * If the timeout has fired and is waiting on bd_lock, we could
	 * deadlock here because untimeout blocks if the handler is running
	 * and would wait for bpf_timed_out to finish, which in turn is
	 * waiting on bd_lock.  So drop the lock around the untimeout call.
	 */
	if (tid != 0) {
		mutex_exit(&d->bd_lock);
		(void) untimeout(tid);
		mutex_enter(&d->bd_lock);
	}

	d->bd_inuse--;
}
1841 1847
1842 1848 /*
1843 1849 * As a cloning device driver, BPF needs to keep track of which device
1844 1850 * numbers are in use and which ones are not. A hash table, indexed by
1845 1851 * the minor device number, is used to store the pointers to the
1846 1852 * individual descriptors that are allocated in bpfopen().
1847 1853 * The functions below present the interface for that hash table to
1848 1854 * the rest of the driver.
1849 1855 */
1850 1856 static struct bpf_d *
1851 1857 bpf_dev_find(minor_t minor)
1852 1858 {
1853 1859 struct bpf_d *d = NULL;
1854 1860
1855 1861 (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1856 1862 (mod_hash_val_t *)&d);
1857 1863
1858 1864 return (d);
1859 1865 }
1860 1866
1861 1867 static void
1862 1868 bpf_dev_add(struct bpf_d *d)
1863 1869 {
1864 1870 (void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1865 1871 (mod_hash_val_t)d);
1866 1872 }
1867 1873
1868 1874 static void
1869 1875 bpf_dev_remove(struct bpf_d *d)
1870 1876 {
1871 1877 struct bpf_d *stor;
1872 1878
1873 1879 (void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1874 1880 (mod_hash_val_t *)&stor);
1875 1881 ASSERT(stor == d);
1876 1882 }
1877 1883
1878 1884 /*
1879 1885 * bpf_def_get should only ever be called for a minor number that exists,
1880 1886 * thus there should always be a pointer in the hash table that corresponds
1881 1887 * to it.
1882 1888 */
1883 1889 static struct bpf_d *
1884 1890 bpf_dev_get(minor_t minor)
1885 1891 {
1886 1892 struct bpf_d *d = NULL;
1887 1893
1888 1894 (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1889 1895 (mod_hash_val_t *)&d);
1890 1896 ASSERT(d != NULL);
1891 1897
1892 1898 return (d);
1893 1899 }
|
↓ open down ↓ |
458 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX