epoll New usr/src/uts/common/io/bpf/bpf.c

   1 /*      $NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $    */
   2 
   3 /*
   4  * Copyright (c) 1990, 1991, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from the Stanford/CMU enet packet filter,
   8  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
   9  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
  10  * Berkeley Laboratory.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  *      @(#)bpf.c       8.4 (Berkeley) 1/9/95
  37  * static char rcsid[] =
  38  * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
  39  */
  40 /*
  41  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  42  * Use is subject to license terms.
  43  * Copyright 2017 Joyent, Inc.
  44  */
  45 
  46 /*
  47  * The BPF implements the following access controls for zones attempting
  48  * to read and write data. Writing of data requires that the net_rawaccess
  49  * privilege is held whilst reading data requires either net_rawaccess or
  50  * net_observability.
  51  *
  52  *                              | Shared |  Exclusive |   Global
  53  * -----------------------------+--------+------------+------------+
  54  * DLT_IPNET in local zone      |  Read  |    Read    |    Read    |
  55  * -----------------------------+--------+------------+------------+
  56  * Raw access to local zone NIC |  None  | Read/Write | Read/Write |
  57  * -----------------------------+--------+------------+------------+
  58  * Raw access to all NICs       |  None  |    None    | Read/Write |
  59  * -----------------------------+--------+------------+------------+
  60  *
  61  * The BPF driver is written as a cloning driver: each call to bpfopen()
  62  * allocates a new minor number. This provides BPF with a 1:1 relationship
  63  * between opens and closes. There is some amount of "descriptor state"
  64  * that is kept per open. Pointers to this data are stored in a hash table
  65  * (bpf_hash) that is indexed by the minor device number for each open file.
  66  */
  67 #include <sys/param.h>
  68 #include <sys/systm.h>
  69 #include <sys/time.h>
  70 #include <sys/ioctl.h>
  71 #include <sys/queue.h>
  72 #include <sys/filio.h>
  73 #include <sys/policy.h>
  74 #include <sys/cmn_err.h>
  75 #include <sys/uio.h>
  76 #include <sys/file.h>
  77 #include <sys/sysmacros.h>
  78 #include <sys/zone.h>
  79 
  80 #include <sys/socket.h>
  81 #include <sys/errno.h>
  82 #include <sys/poll.h>
  83 #include <sys/dlpi.h>
  84 #include <sys/neti.h>
  85 
  86 #include <net/if.h>
  87 
  88 #include <net/bpf.h>
  89 #include <net/bpfdesc.h>
  90 #include <net/dlt.h>
  91 
  92 #include <netinet/in.h>
  93 #include <sys/mac.h>
  94 #include <sys/mac_client.h>
  95 #include <sys/mac_impl.h>
  96 #include <sys/time_std_impl.h>
  97 #include <sys/hook.h>
  98 #include <sys/hook_event.h>
  99 
 100 
 101 #define mtod(_v, _t)    (_t)((_v)->b_rptr)
 102 #define M_LEN(_m)       ((_m)->b_wptr - (_m)->b_rptr)
 103 
 104 /*
 105  * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
 106  * jumbo frames (circa 9k), ATM, or Intel gig/10gig Ethernet jumbos (16k).
 107  */
 108 #define BPF_BUFSIZE (32 * 1024)        /* initial value of bpf_bufsize */
 109 
 110 typedef void *(*cp_fn_t)(void *, const void *, size_t); /* memcpy-shaped */
 111 
 112 /*
 113  * Default read buffer size (see bpfopen()) and the ceiling used by BIOCSBLEN.
 114  */
 115 int bpf_bufsize = BPF_BUFSIZE;         /* copied to bd_bufsize at open */
 116 int bpf_maxbufsize = (16 * 1024 * 1024);       /* BIOCSBLEN clamps to this */
 117 static mod_hash_t *bpf_hash = NULL;    /* created in bpfilterattach() */
 118 
 119 /*
 120  * bpf_mtx is held by bpfopen()/bpfclose() while bpf_list and the minor table
 121  * change, so gathering the stats/peers cannot race with opening/closing.
 122  */
 123 static kcondvar_t bpf_dlt_waiter;
 124 static kmutex_t bpf_mtx;
 125 static bpf_kstats_t ks_stats;          /* live counters, seeded from below */
 126 static bpf_kstats_t bpf_kstats = {     /* template: kstat names and types */
 127         { "readWait",           KSTAT_DATA_UINT64 },
 128         { "writeOk",            KSTAT_DATA_UINT64 },
 129         { "writeError",         KSTAT_DATA_UINT64 },
 130         { "receive",            KSTAT_DATA_UINT64 },
 131         { "captured",           KSTAT_DATA_UINT64 },
 132         { "dropped",            KSTAT_DATA_UINT64 },
 133 };
 134 static kstat_t *bpf_ksp;               /* handle from kstat_create() */
 135 
 136 /*
 137  * bpf_list links every open BPF descriptor: bpfopen() adds, bpfclose() removes
 138  */
 139 LIST_HEAD(, bpf_d) bpf_list;
 140 
 141 static int      bpf_allocbufs(struct bpf_d *);
 142 static void     bpf_clear_timeout(struct bpf_d *);
 143 static void     bpf_deliver(struct bpf_d *, cp_fn_t,
 144                     void *, uint_t, uint_t, boolean_t);
 145 static void     bpf_freed(struct bpf_d *);
 146 static int      bpf_ifname(struct bpf_d *d, char *, int);
 147 static void     *bpf_mcpy(void *, const void *, size_t);
 148 static int      bpf_attachd(struct bpf_d *, const char *, int);
 149 static void     bpf_detachd(struct bpf_d *);
 150 static int      bpf_setif(struct bpf_d *, char *, int);
 151 static void     bpf_timed_out(void *);
 152 static inline void
 153                 bpf_wakeup(struct bpf_d *);
 154 static void     catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
 155                     cp_fn_t, struct timeval *);
 156 static void     reset_d(struct bpf_d *);
 157 static int      bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
 158 static int      bpf_setdlt(struct bpf_d *, void *);
 159 static void     bpf_dev_add(struct bpf_d *);
 160 static struct bpf_d *bpf_dev_find(minor_t);
 161 static struct bpf_d *bpf_dev_get(minor_t);
 162 static void     bpf_dev_remove(struct bpf_d *);
 163 
 164 static int
 165 bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)
 166 {
 167         mblk_t *m;
 168         int error;
 169         int len;
 170         int hlen;
 171         int align;
 172 
 173         /*
 174          * Build a sockaddr based on the data link layer type.
 175          * We do this at this level because the ethernet header
 176          * is copied directly into the data field of the sockaddr.
 177          * In the case of SLIP, there is no header and the packet
 178          * is forwarded as is.
 179          * Also, we are careful to leave room at the front of the mbuf
 180          * for the link level header.
 181          */
 182         switch (linktype) {
 183 
 184         case DLT_EN10MB:
 185                 hlen = sizeof (struct ether_header);
 186                 break;
 187 
 188         case DLT_FDDI:
 189                 hlen = 16;
 190                 break;
 191 
 192         case DLT_NULL:
 193                 hlen = 0;
 194                 break;
 195 
 196         case DLT_IPOIB:
 197                 hlen = 44;
 198                 break;
 199 
 200         default:
 201                 return (EIO);
 202         }
 203 
 204         align = 4 - (hlen & 3);
 205 
 206         len = uio->uio_resid;
 207         /*
 208          * If there aren't enough bytes for a link level header or the
 209          * packet length exceeds the interface mtu, return an error.
 210          */
 211         if (len < hlen || len - hlen > mtu)
 212                 return (EMSGSIZE);
 213 
 214         m = allocb(len + align, BPRI_MED);
 215         if (m == NULL) {
 216                 error = ENOBUFS;
 217                 goto bad;
 218         }
 219 
 220         /* Insure the data is properly aligned */
 221         if (align > 0)
 222                 m->b_rptr += align;
 223         m->b_wptr = m->b_rptr + len;
 224 
 225         error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
 226         if (error)
 227                 goto bad;
 228         *mp = m;
 229         return (0);
 230 
 231 bad:
 232         if (m != NULL)
 233                 freemsg(m);
 234         return (error);
 235 }
 236 
 237 
 238 /*
 239  * Attach file to the bpf interface, i.e. make d listen on bp.
 240  */
 241 static int
 242 bpf_attachd(struct bpf_d *d, const char *ifname, int dlt)
 243 {
 244         bpf_provider_list_t *bp;
 245         bpf_provider_t *bpr;
 246         boolean_t zonematch;
 247         zoneid_t niczone;
 248         uintptr_t mcip;
 249         zoneid_t zone;
 250         uint_t nicdlt;
 251         uintptr_t mh;
 252         int hdrlen;
 253         int error;
 254 
 255         ASSERT(d->bd_bif == NULL);
 256         ASSERT(d->bd_mcip == NULL);
 257         zone = d->bd_zone;
 258         zonematch = B_TRUE;
 259 again:
 260         mh = 0;
 261         mcip = 0;
 262         LIST_FOREACH(bp, &bpf_providers, bpl_next) {
 263                 bpr = bp->bpl_what;
 264                 error = MBPF_OPEN(bpr, ifname, &mh, zone);
 265                 if (error != 0)
 266                         goto next;
 267                 error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
 268                 if (error != 0)
 269                         goto next;
 270                 error = MBPF_GET_DLT(bpr, mh, &nicdlt);
 271                 if (error != 0)
 272                         goto next;
 273 
 274                 nicdlt = bpf_dl_to_dlt(nicdlt);
 275                 if (dlt != -1 && dlt != nicdlt) {
 276                         error = ENOENT;
 277                         goto next;
 278                 }
 279 
 280                 error = MBPF_GET_ZONE(bpr, mh, &niczone);
 281                 if (error != 0)
 282                         goto next;
 283 
 284                 DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr,
 285                     uintptr_t, mh, int, nicdlt, zoneid_t, niczone);
 286 
 287                 if (zonematch && niczone != zone) {
 288                         error = ENOENT;
 289                         goto next;
 290                 }
 291                 break;
 292 next:
 293                 if (mcip != 0) {
 294                         MBPF_CLIENT_CLOSE(bpr, mcip);
 295                         mcip = 0;
 296                 }
 297                 if (mh != NULL) {
 298                         MBPF_CLOSE(bpr, mh);
 299                         mh = 0;
 300                 }
 301         }
 302         if (error != 0) {
 303                 if (zonematch && (zone == GLOBAL_ZONEID)) {
 304                         /*
 305                          * If we failed to do an exact match for the global
 306                          * zone using the global zoneid, try again in case
 307                          * the network interface is owned by a local zone.
 308                          */
 309                         zonematch = B_FALSE;
 310                         goto again;
 311                 }
 312                 return (error);
 313         }
 314 
 315         d->bd_mac = *bpr;
 316         d->bd_mcip = mcip;
 317         d->bd_bif = mh;
 318         d->bd_dlt = nicdlt;
 319         hdrlen = bpf_dl_hdrsize(nicdlt);
 320         d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
 321 
 322         (void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip),
 323             sizeof (d->bd_ifname));
 324 
 325         (void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid,
 326             zone);
 327         (void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d,
 328             &d->bd_promisc_handle, d->bd_promisc_flags);
 329         return (0);
 330 }
 331 
 332 /*
 333  * Detach a file from its interface.
 334  */
 335 static void
 336 bpf_detachd(struct bpf_d *d)
 337 {
 338         uintptr_t mph;
 339         uintptr_t mch;
 340         uintptr_t mh;
 341 
 342         ASSERT(d->bd_inuse == -1);
 343         mch = d->bd_mcip;
 344         d->bd_mcip = 0;
 345         mh = d->bd_bif;
 346         d->bd_bif = 0;
 347 
 348         /*
 349          * Check if this descriptor had requested promiscuous mode.
 350          * If so, turn it off. There's no need to take any action
 351          * here, that is done when MBPF_PROMISC_REMOVE is used;
 352          * bd_promisc is just a local flag to stop promiscuous mode
 353          * from being set more than once.
 354          */
 355         if (d->bd_promisc)
 356                 d->bd_promisc = 0;
 357 
 358         /*
 359          * Take device out of "promiscuous" mode.  Since we were able to
 360          * enter "promiscuous" mode, we should be able to turn it off.
 361          * Note, this field stores a pointer used to support both
 362          * promiscuous and non-promiscuous callbacks for packets.
 363          */
 364         mph = d->bd_promisc_handle;
 365         d->bd_promisc_handle = 0;
 366 
 367         /*
 368          * The lock has to be dropped here because mac_promisc_remove may
 369          * need to wait for mac_promisc_dispatch, which has called into
 370          * bpf and catchpacket is waiting for bd_lock...
 371          * i.e mac_promisc_remove() needs to be called with none of the
 372          * locks held that are part of the bpf_mtap() call path.
 373          */
 374         mutex_exit(&d->bd_lock);
 375         if (mph != 0)
 376                 MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
 377 
 378         if (mch != 0)
 379                 MBPF_CLIENT_CLOSE(&d->bd_mac, mch);
 380 
 381         if (mh != 0)
 382                 MBPF_CLOSE(&d->bd_mac, mh);
 383 
 384         /*
 385          * Because this function is called with bd_lock held, so it must
 386          * exit with it held.
 387          */
 388         mutex_enter(&d->bd_lock);
 389         *d->bd_ifname = '\0';
 390         (void) memset(&d->bd_mac, 0, sizeof (d->bd_mac));
 391 }
 392 
 393 
 394 /*
 395  * bpfilterattach() is called at load time.
 396  */
 397 int
 398 bpfilterattach(void)
 399 {
 400 
 401         bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
 402             mod_hash_null_keydtor);
 403         if (bpf_hash == NULL)
 404                 return (ENOMEM);
 405 
 406         (void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));
 407 
 408         bpf_ksp = kstat_create("bpf", 0, "global", "misc",
 409             KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
 410             KSTAT_FLAG_VIRTUAL);
 411         if (bpf_ksp != NULL) {
 412                 bpf_ksp->ks_data = &ks_stats;
 413                 kstat_install(bpf_ksp);
 414         } else {
 415                 mod_hash_destroy_idhash(bpf_hash);
 416                 bpf_hash = NULL;
 417                 return (EEXIST);
 418         }
 419 
 420         cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
 421         mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);
 422 
 423         LIST_INIT(&bpf_list);
 424 
 425         return (0);
 426 }
 427 
 428 
 429 /*
 430  * bpfilterdetach() is called at unload time.
 431  */
 432 int
 433 bpfilterdetach(void)
 434 {
 435 
 436         if (bpf_ksp != NULL) {
 437                 kstat_delete(bpf_ksp);
 438                 bpf_ksp = NULL;
 439         }
 440 
 441         mod_hash_destroy_idhash(bpf_hash);
 442         bpf_hash = NULL;
 443 
 444         cv_destroy(&bpf_dlt_waiter);
 445         mutex_destroy(&bpf_mtx);
 446 
 447         return (0);
 448 }
 449 
 450 /*
 451  * Open ethernet device. Clones.
 452  */
 453 /* ARGSUSED */
 454 int
 455 bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)
 456 {
 457         struct bpf_d *d;
 458         uint_t dmin;
 459 
 460         /*
 461          * The security policy described at the top of this file is
 462          * enforced here.
 463          */
 464         if ((flag & FWRITE) != 0) {
 465                 if (secpolicy_net_rawaccess(cred) != 0)
 466                         return (EACCES);
 467         }
 468 
 469         if ((flag & FREAD) != 0) {
 470                 if ((secpolicy_net_observability(cred) != 0) &&
 471                     (secpolicy_net_rawaccess(cred) != 0))
 472                         return (EACCES);
 473         }
 474 
 475         if ((flag & (FWRITE|FREAD)) == 0)
 476                 return (ENXIO);
 477 
 478         /*
 479          * A structure is allocated per open file in BPF to store settings
 480          * such as buffer capture size, provide private buffers, etc.
 481          */
 482         d = (struct bpf_d *)kmem_zalloc(sizeof (*d), KM_SLEEP);
 483         d->bd_bufsize = bpf_bufsize;
 484         d->bd_fmode = flag;
 485         d->bd_zone = crgetzoneid(cred);
 486         d->bd_seesent = 1;
 487         d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS|
 488             MAC_PROMISC_FLAGS_NO_COPY;
 489         mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
 490         cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);
 491 
 492         mutex_enter(&bpf_mtx);
 493         /*
 494          * Find an unused minor number. Obviously this is an O(n) algorithm
 495          * and doesn't scale particularly well, so if there are large numbers
 496          * of open file descriptors happening in real use, this design may
 497          * need to be revisited.
 498          */
 499         for (dmin = 0; dmin < L_MAXMIN; dmin++)
 500                 if (bpf_dev_find(dmin) == NULL)
 501                         break;
 502         if (dmin == L_MAXMIN) {
 503                 mutex_exit(&bpf_mtx);
 504                 kmem_free(d, sizeof (*d));
 505                 return (ENXIO);
 506         }
 507         d->bd_dev = dmin;
 508         LIST_INSERT_HEAD(&bpf_list, d, bd_list);
 509         bpf_dev_add(d);
 510         mutex_exit(&bpf_mtx);
 511 
 512         *devp = makedevice(getmajor(*devp), dmin);
 513 
 514         return (0);
 515 }
 516 
 517 /*
 518  * Close the descriptor by detaching it from its interface,
 519  * deallocating its buffers, and marking it free.
 520  *
 521  * Because we only allow a device to be opened once, there is always a
 522  * 1 to 1 relationship between opens and closes supporting this function.
 523  */
 524 /* ARGSUSED */
 525 int
 526 bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
 527 {
 528         struct bpf_d *d = bpf_dev_get(getminor(dev));
 529 
 530         mutex_enter(&d->bd_lock);
 531 
 532         while (d->bd_inuse != 0) {
 533                 d->bd_waiting++;
 534                 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
 535                         d->bd_waiting--;
 536                         mutex_exit(&d->bd_lock);
 537                         return (EINTR);
 538                 }
 539                 d->bd_waiting--;
 540         }
 541 
 542         d->bd_inuse = -1;
 543         if (d->bd_state == BPF_WAITING)
 544                 bpf_clear_timeout(d);
 545         d->bd_state = BPF_IDLE;
 546         if (d->bd_bif)
 547                 bpf_detachd(d);
 548         mutex_exit(&d->bd_lock);
 549 
 550         mutex_enter(&bpf_mtx);
 551         LIST_REMOVE(d, bd_list);
 552         bpf_dev_remove(d);
 553         mutex_exit(&bpf_mtx);
 554 
 555         mutex_enter(&d->bd_lock);
 556         mutex_destroy(&d->bd_lock);
 557         cv_destroy(&d->bd_wait);
 558 
 559         bpf_freed(d);
 560         kmem_free(d, sizeof (*d));
 561 
 562         return (0);
 563 }
 564 
 565 /*
 566  * Rotate the packet buffers in descriptor d.  Move the store buffer
 567  * into the hold slot, and the free buffer into the store slot.
 568  * Zero the length of the new store buffer.
 569  */
 570 #define ROTATE_BUFFERS(d) \
 571         (d)->bd_hbuf = (d)->bd_sbuf; \
 572         (d)->bd_hlen = (d)->bd_slen; \
 573         (d)->bd_sbuf = (d)->bd_fbuf; \
 574         (d)->bd_slen = 0; \
 575         (d)->bd_fbuf = 0;
 576 /*
 577  *  bpfread - read next chunk of packets from buffers
 578  */
 579 /* ARGSUSED */
 580 int
 581 bpfread(dev_t dev, struct uio *uio, cred_t *cred)
 582 {
 583         struct bpf_d *d = bpf_dev_get(getminor(dev));
 584         int timed_out;
 585         ulong_t delay;
 586         int error;
 587 
 588         if ((d->bd_fmode & FREAD) == 0)
 589                 return (EBADF);
 590 
 591         /*
 592          * Restrict application to use a buffer the same size as
 593          * the kernel buffers.
 594          */
 595         if (uio->uio_resid != d->bd_bufsize)
 596                 return (EINVAL);
 597 
 598         mutex_enter(&d->bd_lock);
 599         if (d->bd_state == BPF_WAITING)
 600                 bpf_clear_timeout(d);
 601         timed_out = (d->bd_state == BPF_TIMED_OUT);
 602         d->bd_state = BPF_IDLE;
 603         /*
 604          * If the hold buffer is empty, then do a timed sleep, which
 605          * ends when the timeout expires or when enough packets
 606          * have arrived to fill the store buffer.
 607          */
 608         while (d->bd_hbuf == 0) {
 609                 if (d->bd_nonblock) {
 610                         if (d->bd_slen == 0) {
 611                                 mutex_exit(&d->bd_lock);
 612                                 return (EWOULDBLOCK);
 613                         }
 614                         ROTATE_BUFFERS(d);
 615                         break;
 616                 }
 617 
 618                 if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
 619                         /*
 620                          * A packet(s) either arrived since the previous
 621                          * read or arrived while we were asleep.
 622                          * Rotate the buffers and return what's here.
 623                          */
 624                         ROTATE_BUFFERS(d);
 625                         break;
 626                 }
 627                 ks_stats.kp_read_wait.value.ui64++;
 628                 delay = ddi_get_lbolt() + d->bd_rtout;
 629                 error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);
 630                 if (error == 0) {
 631                         mutex_exit(&d->bd_lock);
 632                         return (EINTR);
 633                 }
 634                 if (error == -1) {
 635                         /*
 636                          * On a timeout, return what's in the buffer,
 637                          * which may be nothing.  If there is something
 638                          * in the store buffer, we can rotate the buffers.
 639                          */
 640                         if (d->bd_hbuf)
 641                                 /*
 642                                  * We filled up the buffer in between
 643                                  * getting the timeout and arriving
 644                                  * here, so we don't need to rotate.
 645                                  */
 646                                 break;
 647 
 648                         if (d->bd_slen == 0) {
 649                                 mutex_exit(&d->bd_lock);
 650                                 return (0);
 651                         }
 652                         ROTATE_BUFFERS(d);
 653                 }
 654         }
 655         /*
 656          * At this point, we know we have something in the hold slot.
 657          */
 658         mutex_exit(&d->bd_lock);
 659 
 660         /*
 661          * Move data from hold buffer into user space.
 662          * We know the entire buffer is transferred since
 663          * we checked above that the read buffer is bpf_bufsize bytes.
 664          */
 665         error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);
 666 
 667         mutex_enter(&d->bd_lock);
 668         d->bd_fbuf = d->bd_hbuf;
 669         d->bd_hbuf = 0;
 670         d->bd_hlen = 0;
 671 done:
 672         mutex_exit(&d->bd_lock);
 673         return (error);
 674 }
 675 
 676 
 677 /*
 678  * If there are processes sleeping on this descriptor, wake them up.
 679  * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
 680  * so there is no code here grabbing it.
 681  */
 682 static inline void
 683 bpf_wakeup(struct bpf_d *d)
 684 {
 685         cv_signal(&d->bd_wait);
 686 }
 687 
 688 static void
 689 bpf_timed_out(void *arg)
 690 {
 691         struct bpf_d *d = arg;
 692 
 693         mutex_enter(&d->bd_lock);
 694         if (d->bd_state == BPF_WAITING) {
 695                 d->bd_state = BPF_TIMED_OUT;
 696                 if (d->bd_slen != 0)
 697                         cv_signal(&d->bd_wait);
 698         }
 699         mutex_exit(&d->bd_lock);
 700 }
 701 
 702 
 703 /* ARGSUSED */
 704 int
 705 bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)
 706 {
 707         struct bpf_d *d = bpf_dev_get(getminor(dev));
 708         uintptr_t mch;
 709         uint_t mtu;
 710         mblk_t *m;
 711         int error;
 712         int dlt;
 713 
 714         if ((d->bd_fmode & FWRITE) == 0)
 715                 return (EBADF);
 716 
 717         mutex_enter(&d->bd_lock);
 718         if (d->bd_bif == 0 || d->bd_mcip == 0 || d->bd_bif == 0) {
 719                 mutex_exit(&d->bd_lock);
 720                 return (EINTR);
 721         }
 722 
 723         if (uio->uio_resid == 0) {
 724                 mutex_exit(&d->bd_lock);
 725                 return (0);
 726         }
 727 
 728         while (d->bd_inuse < 0) {
 729                 d->bd_waiting++;
 730                 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
 731                         d->bd_waiting--;
 732                         mutex_exit(&d->bd_lock);
 733                         return (EINTR);
 734                 }
 735                 d->bd_waiting--;
 736         }
 737 
 738         mutex_exit(&d->bd_lock);
 739 
 740         dlt = d->bd_dlt;
 741         mch = d->bd_mcip;
 742         MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu);
 743         d->bd_inuse++;
 744 
 745         m = NULL;
 746         if (dlt == DLT_IPNET) {
 747                 error = EIO;
 748                 goto done;
 749         }
 750 
 751         error = bpf_movein(uio, dlt, mtu, &m);
 752         if (error)
 753                 goto done;
 754 
 755         DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt,
 756             uint_t, mtu, mblk_t *, m);
 757 
 758         if (M_LEN(m) > mtu) {
 759                 error = EMSGSIZE;
 760                 goto done;
 761         }
 762 
 763         error = MBPF_TX(&d->bd_mac, mch, m);
 764         /*
 765          * The "tx" action here is required to consume the mblk_t.
 766          */
 767         m = NULL;
 768 
 769 done:
 770         if (error == 0)
 771                 ks_stats.kp_write_ok.value.ui64++;
 772         else
 773                 ks_stats.kp_write_error.value.ui64++;
 774         if (m != NULL)
 775                 freemsg(m);
 776 
 777         mutex_enter(&d->bd_lock);
 778         d->bd_inuse--;
 779         if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
 780                 cv_signal(&d->bd_wait);
 781         mutex_exit(&d->bd_lock);
 782 
 783         /*
 784          * The driver frees the mbuf.
 785          */
 786         return (error);
 787 }
 788 
 789 
 790 /*
 791  * Reset a descriptor by flushing its packet buffer and clearing the
 792  * receive and drop counts.  Should be called at splnet.
 793  */
 794 static void
 795 reset_d(struct bpf_d *d)
 796 {
 797         if (d->bd_hbuf) {
 798                 /* Free the hold buffer. */
 799                 d->bd_fbuf = d->bd_hbuf;
 800                 d->bd_hbuf = 0;
 801         }
 802         d->bd_slen = 0;
 803         d->bd_hlen = 0;
 804         d->bd_rcount = 0;
 805         d->bd_dcount = 0;
 806         d->bd_ccount = 0;
 807 }
 808 
 809 /*
 810  *  FIONREAD            Check for read packet available.
 811  *  BIOCGBLEN           Get buffer len [for read()].
 812  *  BIOCSETF            Set ethernet read filter.
 813  *  BIOCFLUSH           Flush read packet buffer.
 814  *  BIOCPROMISC         Put interface into promiscuous mode.
 815  *  BIOCGDLT            Get link layer type.
 816  *  BIOCGETIF           Get interface name.
 817  *  BIOCSETIF           Set interface.
 818  *  BIOCSRTIMEOUT       Set read timeout.
 819  *  BIOCGRTIMEOUT       Get read timeout.
 820  *  BIOCGSTATS          Get packet stats.
 821  *  BIOCIMMEDIATE       Set immediate mode.
 822  *  BIOCVERSION         Get filter language version.
 823  *  BIOCGHDRCMPLT       Get "header already complete" flag.
 824  *  BIOCSHDRCMPLT       Set "header already complete" flag.
 825  */
 826 /* ARGSUSED */
 827 int
 828 bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)
 829 {
 830         struct bpf_d *d = bpf_dev_get(getminor(dev));
 831         struct bpf_program prog;
 832         struct lifreq lifreq;
 833         struct ifreq ifreq;
 834         int error = 0;
 835         uint_t size;
 836 
 837         /*
 838          * Refresh the PID associated with this bpf file.
 839          */
 840         mutex_enter(&d->bd_lock);
 841         if (d->bd_state == BPF_WAITING)
 842                 bpf_clear_timeout(d);
 843         d->bd_state = BPF_IDLE;
 844         mutex_exit(&d->bd_lock);
 845 
 846         switch (cmd) {
 847 
 848         default:
 849                 error = EINVAL;
 850                 break;
 851 
 852         /*
 853          * Check for read packet available.
 854          */
 855         case FIONREAD:
 856                 {
 857                         int n;
 858 
 859                         mutex_enter(&d->bd_lock);
 860                         n = d->bd_slen;
 861                         if (d->bd_hbuf)
 862                                 n += d->bd_hlen;
 863                         mutex_exit(&d->bd_lock);
 864 
 865                         *(int *)addr = n;
 866                         break;
 867                 }
 868 
 869         /*
 870          * Get buffer len [for read()].
 871          */
 872         case BIOCGBLEN:
 873                 error = copyout(&d->bd_bufsize, (void *)addr,
 874                     sizeof (d->bd_bufsize));
 875                 break;
 876 
 877         /*
 878          * Set buffer length.
 879          */
 880         case BIOCSBLEN:
 881                 if (copyin((void *)addr, &size, sizeof (size)) != 0) {
 882                         error = EFAULT;
 883                         break;
 884                 }
 885 
 886                 mutex_enter(&d->bd_lock);
 887                 if (d->bd_bif != 0) {
 888                         error = EINVAL;
 889                 } else {
 890                         if (size > bpf_maxbufsize)
 891                                 size = bpf_maxbufsize;
 892                         else if (size < BPF_MINBUFSIZE)
 893                                 size = BPF_MINBUFSIZE;
 894 
 895                         d->bd_bufsize = size;
 896                 }
 897                 mutex_exit(&d->bd_lock);
 898 
 899                 if (error == 0)
 900                         error = copyout(&size, (void *)addr, sizeof (size));
 901                 break;
 902 
 903         /*
 904          * Set link layer read filter.
 905          */
 906         case BIOCSETF:
 907                 if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {
 908                         error = EFAULT;
 909                         break;
 910                 }
 911                 error = bpf_setf(d, &prog);
 912                 break;
 913 
 914         /*
 915          * Flush read packet buffer.
 916          */
 917         case BIOCFLUSH:
 918                 mutex_enter(&d->bd_lock);
 919                 reset_d(d);
 920                 mutex_exit(&d->bd_lock);
 921                 break;
 922 
 923         /*
 924          * Put interface into promiscuous mode.
 925          * This is a one-way ioctl, it is not used to turn promiscuous
 926          * mode off.
 927          */
 928         case BIOCPROMISC:
 929                 if (d->bd_bif == 0) {
 930                         /*
 931                          * No interface attached yet.
 932                          */
 933                         error = EINVAL;
 934                         break;
 935                 }
 936                 mutex_enter(&d->bd_lock);
 937                 if (d->bd_promisc == 0) {
 938 
 939                         if (d->bd_promisc_handle) {
 940                                 uintptr_t mph;
 941 
 942                                 mph = d->bd_promisc_handle;
 943                                 d->bd_promisc_handle = 0;
 944 
 945                                 mutex_exit(&d->bd_lock);
 946                                 MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
 947                                 mutex_enter(&d->bd_lock);
 948                         }
 949 
 950                         d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
 951                         error = MBPF_PROMISC_ADD(&d->bd_mac,
 952                             d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
 953                             &d->bd_promisc_handle, d->bd_promisc_flags);
 954                         if (error == 0)
 955                                 d->bd_promisc = 1;
 956                 }
 957                 mutex_exit(&d->bd_lock);
 958                 break;
 959 
 960         /*
 961          * Get device parameters.
 962          */
 963         case BIOCGDLT:
 964                 if (d->bd_bif == 0)
 965                         error = EINVAL;
 966                 else
 967                         error = copyout(&d->bd_dlt, (void *)addr,
 968                             sizeof (d->bd_dlt));
 969                 break;
 970 
 971         /*
 972          * Get a list of supported device parameters.
 973          */
 974         case BIOCGDLTLIST:
 975                 if (d->bd_bif == 0) {
 976                         error = EINVAL;
 977                 } else {
 978                         struct bpf_dltlist list;
 979 
 980                         if (copyin((void *)addr, &list, sizeof (list)) != 0) {
 981                                 error = EFAULT;
 982                                 break;
 983                         }
 984                         error = bpf_getdltlist(d, &list);
 985                         if ((error == 0) &&
 986                             copyout(&list, (void *)addr, sizeof (list)) != 0)
 987                                 error = EFAULT;
 988                 }
 989                 break;
 990 
 991         /*
 992          * Set device parameters.
 993          */
 994         case BIOCSDLT:
 995                 error = bpf_setdlt(d, (void *)addr);
 996                 break;
 997 
 998         /*
 999          * Get interface name.
1000          */
1001         case BIOCGETIF:
1002                 if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1003                         error = EFAULT;
1004                         break;
1005                 }
1006                 error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1007                 if ((error == 0) &&
1008                     copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {
1009                         error = EFAULT;
1010                         break;
1011                 }
1012                 break;
1013 
1014         /*
1015          * Set interface.
1016          */
1017         case BIOCSETIF:
1018                 if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1019                         error = EFAULT;
1020                         break;
1021                 }
1022                 error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1023                 break;
1024 
1025         /*
1026          * Get interface name.
1027          */
1028         case BIOCGETLIF:
1029                 if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1030                         error = EFAULT;
1031                         break;
1032                 }
1033                 error = bpf_ifname(d, lifreq.lifr_name,
1034                     sizeof (lifreq.lifr_name));
1035                 if ((error == 0) &&
1036                     copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {
1037                         error = EFAULT;
1038                         break;
1039                 }
1040                 break;
1041 
1042         /*
1043          * Set interface.
1044          */
1045         case BIOCSETLIF:
1046                 if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1047                         error = EFAULT;
1048                         break;
1049                 }
1050                 error = bpf_setif(d, lifreq.lifr_name,
1051                     sizeof (lifreq.lifr_name));
1052                 break;
1053 
1054 #ifdef _SYSCALL32_IMPL
1055         /*
1056          * Set read timeout.
1057          */
1058         case BIOCSRTIMEOUT32:
1059                 {
1060                         struct timeval32 tv;
1061 
1062                         if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1063                                 error = EFAULT;
1064                                 break;
1065                         }
1066 
1067                         /* Convert the timeout in microseconds to ticks */
1068                         d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1069                             tv.tv_usec);
1070                         if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1071                                 d->bd_rtout = 1;
1072                         break;
1073                 }
1074 
1075         /*
1076          * Get read timeout.
1077          */
1078         case BIOCGRTIMEOUT32:
1079                 {
1080                         struct timeval32 tv;
1081                         clock_t ticks;
1082 
1083                         ticks = drv_hztousec(d->bd_rtout);
1084                         tv.tv_sec = ticks / 1000000;
1085                         tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1086                         error = copyout(&tv, (void *)addr, sizeof (tv));
1087                         break;
1088                 }
1089 
1090         /*
1091          * Get a list of supported device parameters.
1092          */
1093         case BIOCGDLTLIST32:
1094                 if (d->bd_bif == 0) {
1095                         error = EINVAL;
1096                 } else {
1097                         struct bpf_dltlist32 lst32;
1098                         struct bpf_dltlist list;
1099 
1100                         if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {
1101                                 error = EFAULT;
1102                                 break;
1103                         }
1104 
1105                         list.bfl_len = lst32.bfl_len;
1106                         list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
1107                         error = bpf_getdltlist(d, &list);
1108                         if (error == 0) {
1109                                 lst32.bfl_len = list.bfl_len;
1110 
1111                                 if (copyout(&lst32, (void *)addr,
1112                                     sizeof (lst32)) != 0)
1113                                         error = EFAULT;
1114                         }
1115                 }
1116                 break;
1117 
1118         /*
1119          * Set link layer read filter.
1120          */
1121         case BIOCSETF32: {
1122                 struct bpf_program32 prog32;
1123 
1124                 if (ddi_copyin((void *)addr, &prog32, sizeof (prog), mode)) {
1125                         error = EFAULT;
1126                         break;
1127                 }
1128                 prog.bf_len = prog32.bf_len;
1129                 prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
1130                 error = bpf_setf(d, &prog);
1131                 break;
1132         }
1133 #endif
1134 
1135         /*
1136          * Set read timeout.
1137          */
1138         case BIOCSRTIMEOUT:
1139                 {
1140                         struct timeval tv;
1141 
1142                         if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1143                                 error = EFAULT;
1144                                 break;
1145                         }
1146 
1147                         /* Convert the timeout in microseconds to ticks */
1148                         d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1149                             tv.tv_usec);
1150                         if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1151                                 d->bd_rtout = 1;
1152                         break;
1153                 }
1154 
1155         /*
1156          * Get read timeout.
1157          */
1158         case BIOCGRTIMEOUT:
1159                 {
1160                         struct timeval tv;
1161                         clock_t ticks;
1162 
1163                         ticks = drv_hztousec(d->bd_rtout);
1164                         tv.tv_sec = ticks / 1000000;
1165                         tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1166                         if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)
1167                                 error = EFAULT;
1168                         break;
1169                 }
1170 
1171         /*
1172          * Get packet stats.
1173          */
1174         case BIOCGSTATS:
1175                 {
1176                         struct bpf_stat bs;
1177 
1178                         bs.bs_recv = d->bd_rcount;
1179                         bs.bs_drop = d->bd_dcount;
1180                         bs.bs_capt = d->bd_ccount;
1181                         if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)
1182                                 error = EFAULT;
1183                         break;
1184                 }
1185 
1186         /*
1187          * Set immediate mode.
1188          */
1189         case BIOCIMMEDIATE:
1190                 if (copyin((void *)addr, &d->bd_immediate,
1191                     sizeof (d->bd_immediate)) != 0)
1192                         error = EFAULT;
1193                 break;
1194 
1195         case BIOCVERSION:
1196                 {
1197                         struct bpf_version bv;
1198 
1199                         bv.bv_major = BPF_MAJOR_VERSION;
1200                         bv.bv_minor = BPF_MINOR_VERSION;
1201                         if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)
1202                                 error = EFAULT;
1203                         break;
1204                 }
1205 
1206         case BIOCGHDRCMPLT:     /* get "header already complete" flag */
1207                 if (copyout(&d->bd_hdrcmplt, (void *)addr,
1208                     sizeof (d->bd_hdrcmplt)) != 0)
1209                         error = EFAULT;
1210                 break;
1211 
1212         case BIOCSHDRCMPLT:     /* set "header already complete" flag */
1213                 if (copyin((void *)addr, &d->bd_hdrcmplt,
1214                     sizeof (d->bd_hdrcmplt)) != 0)
1215                         error = EFAULT;
1216                 break;
1217 
1218         /*
1219          * Get "see sent packets" flag
1220          */
1221         case BIOCGSEESENT:
1222                 if (copyout(&d->bd_seesent, (void *)addr,
1223                     sizeof (d->bd_seesent)) != 0)
1224                         error = EFAULT;
1225                 break;
1226 
1227         /*
1228          * Set "see sent" packets flag
1229          */
1230         case BIOCSSEESENT:
1231                 if (copyin((void *)addr, &d->bd_seesent,
1232                     sizeof (d->bd_seesent)) != 0)
1233                         error = EFAULT;
1234                 break;
1235 
1236         case FIONBIO:           /* Non-blocking I/O */
1237                 if (copyin((void *)addr, &d->bd_nonblock,
1238                     sizeof (d->bd_nonblock)) != 0)
1239                         error = EFAULT;
1240                 break;
1241         }
1242         return (error);
1243 }
1244 
1245 /*
1246  * Set d's packet filter program to fp.  If this file already has a filter,
1247  * free it and replace it. If the new filter is "empty" (has a 0 size), then
1248  * the result is to just remove and free the existing filter.
1249  * Returns EINVAL for bogus requests.
1250  */
1251 int
1252 bpf_setf(struct bpf_d *d, struct bpf_program *fp)
1253 {
1254         struct bpf_insn *fcode, *old;
1255         uint_t flen, size;
1256         size_t oldsize;
1257 
1258         if (fp->bf_insns == 0) {
1259                 if (fp->bf_len != 0)
1260                         return (EINVAL);
1261                 mutex_enter(&d->bd_lock);
1262                 old = d->bd_filter;
1263                 oldsize = d->bd_filter_size;
1264                 d->bd_filter = 0;
1265                 d->bd_filter_size = 0;
1266                 reset_d(d);
1267                 mutex_exit(&d->bd_lock);
1268                 if (old != 0)
1269                         kmem_free(old, oldsize);
1270                 return (0);
1271         }
1272         flen = fp->bf_len;
1273         if (flen > BPF_MAXINSNS)
1274                 return (EINVAL);
1275 
1276         size = flen * sizeof (*fp->bf_insns);
1277         fcode = kmem_alloc(size, KM_SLEEP);
1278         if (copyin(fp->bf_insns, fcode, size) != 0)
1279                 return (EFAULT);
1280 
1281         if (bpf_validate(fcode, (int)flen)) {
1282                 mutex_enter(&d->bd_lock);
1283                 old = d->bd_filter;
1284                 oldsize = d->bd_filter_size;
1285                 d->bd_filter = fcode;
1286                 d->bd_filter_size = size;
1287                 reset_d(d);
1288                 mutex_exit(&d->bd_lock);
1289                 if (old != 0)
1290                         kmem_free(old, oldsize);
1291 
1292                 return (0);
1293         }
1294         kmem_free(fcode, size);
1295         return (EINVAL);
1296 }
1297 
1298 /*
1299  * Detach a file from its current interface (if attached at all) and attach
1300  * to the interface indicated by the name stored in ifname.
1301  * Return an errno or 0.
1302  */
1303 static int
1304 bpf_setif(struct bpf_d *d, char *ifname, int namesize)
1305 {
1306         int unit_seen;
1307         int error = 0;
1308         char *cp;
1309         int i;
1310 
1311         /*
1312          * Make sure the provided name has a unit number, and default
1313          * it to '0' if not specified.
1314          * XXX This is ugly ... do this differently?
1315          */
1316         unit_seen = 0;
1317         cp = ifname;
1318         cp[namesize - 1] = '\0';        /* sanity */
1319         while (*cp++)
1320                 if (*cp >= '0' && *cp <= '9')
1321                         unit_seen = 1;
1322         if (!unit_seen) {
1323                 /* Make sure to leave room for the '\0'. */
1324                 for (i = 0; i < (namesize - 1); ++i) {
1325                         if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
1326                             (ifname[i] >= 'A' && ifname[i] <= 'Z'))
1327                                 continue;
1328                         ifname[i] = '0';
1329                 }
1330         }
1331 
1332         /*
1333          * Make sure that only one call to this function happens at a time
1334          * and that we're not interleaving a read/write
1335          */
1336         mutex_enter(&d->bd_lock);
1337         while (d->bd_inuse != 0) {
1338                 d->bd_waiting++;
1339                 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
1340                         d->bd_waiting--;
1341                         mutex_exit(&d->bd_lock);
1342                         return (EINTR);
1343                 }
1344                 d->bd_waiting--;
1345         }
1346         d->bd_inuse = -1;
1347         mutex_exit(&d->bd_lock);
1348 
1349         if (d->bd_sbuf == 0)
1350                 error = bpf_allocbufs(d);
1351 
1352         if (error == 0) {
1353                 mutex_enter(&d->bd_lock);
1354                 if (d->bd_bif)
1355                         /*
1356                          * Detach if attached to something else.
1357                          */
1358                         bpf_detachd(d);
1359 
1360                 error = bpf_attachd(d, ifname, -1);
1361                 reset_d(d);
1362                 d->bd_inuse = 0;
1363                 if (d->bd_waiting != 0)
1364                         cv_signal(&d->bd_wait);
1365                 mutex_exit(&d->bd_lock);
1366                 return (error);
1367         }
1368 
1369         mutex_enter(&d->bd_lock);
1370         d->bd_inuse = 0;
1371         if (d->bd_waiting != 0)
1372                 cv_signal(&d->bd_wait);
1373         mutex_exit(&d->bd_lock);
1374 
1375         /*
1376          * Try tickle the mac layer into attaching the device...
1377          */
1378         return (bpf_provider_tickle(ifname, d->bd_zone));
1379 }
1380 
1381 /*
1382  * Copy the interface name to the ifreq.
1383  */
1384 static int
1385 bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)
1386 {
1387 
1388         mutex_enter(&d->bd_lock);
1389         if (d->bd_bif == NULL) {
1390                 mutex_exit(&d->bd_lock);
1391                 return (EINVAL);
1392         }
1393 
1394         (void) strlcpy(buffer, d->bd_ifname, bufsize);
1395         mutex_exit(&d->bd_lock);
1396 
1397         return (0);
1398 }
1399 
1400 /* ARGSUSED */
1401 int
1402 bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
1403     struct pollhead **phpp)
1404 {
1405         struct bpf_d *d = bpf_dev_get(getminor(dev));
1406 
1407         /*
1408          * Until this driver is modified to issue proper pollwakeup() calls on
1409          * its pollhead, edge-triggered polling is not allowed.
1410          */
1411         if (events & POLLET) {
1412                 return (EPERM);
1413         }
1414 
1415         if (events & (POLLIN | POLLRDNORM)) {
1416                 /*
1417                  * An imitation of the FIONREAD ioctl code.
1418                  */
1419                 mutex_enter(&d->bd_lock);
1420                 if (d->bd_hlen != 0 ||
1421                     ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1422                     d->bd_slen != 0)) {
1423                         *reventsp |= events & (POLLIN | POLLRDNORM);
1424                 } else {
1425                         /*
1426                          * Until the bpf driver has been updated to include
1427                          * adequate pollwakeup() logic, no pollhead will be
1428                          * emitted here, preventing the resource from being
1429                          * cached by poll()/devpoll/epoll.
1430                          */
1431                         *reventsp = 0;
1432                         /* Start the read timeout if necessary */
1433                         if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1434                                 bpf_clear_timeout(d);
1435                                 /*
1436                                  * Only allow the timeout to be set once.
1437                                  */
1438                                 if (d->bd_callout == 0)
1439                                         d->bd_callout = timeout(bpf_timed_out,
1440                                             d, d->bd_rtout);
1441                                 d->bd_state = BPF_WAITING;
1442                         }
1443                 }
1444                 mutex_exit(&d->bd_lock);
1445         }
1446 
1447         return (0);
1448 }
1449 
1450 /*
1451  * Copy data from an mblk_t chain into a buffer. This works for ipnet
1452  * because the dl_ipnetinfo_t is placed in an mblk_t that leads the
1453  * packet itself.
1454  */
1455 static void *
1456 bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
1457 {
1458         const mblk_t *m;
1459         uint_t count;
1460         uchar_t *dst;
1461 
1462         m = src_arg;
1463         dst = dst_arg;
1464         while (len > 0) {
1465                 if (m == NULL)
1466                         panic("bpf_mcpy");
1467                 count = (uint_t)min(M_LEN(m), len);
1468                 (void) memcpy(dst, mtod(m, const void *), count);
1469                 m = m->b_cont;
1470                 dst += count;
1471                 len -= count;
1472         }
1473         return (dst_arg);
1474 }
1475 
1476 /*
1477  * Dispatch a packet to all the listeners on interface bp.
1478  *
1479  * marg    pointer to the packet, either a data buffer or an mbuf chain
1480  * buflen  buffer length, if marg is a data buffer
1481  * cpfn    a function that can copy marg into the listener's buffer
1482  * pktlen  length of the packet
1483  * issent  boolean indicating whether the packet was sent or receive
1484  */
1485 static inline void
1486 bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen,
1487     uint_t buflen, boolean_t issent)
1488 {
1489         struct timeval tv;
1490         uint_t slen;
1491 
1492         if (!d->bd_seesent && issent)
1493                 return;
1494 
1495         /*
1496          * Accuracy of the packet counters in BPF is vital so it
1497          * is important to protect even the outer ones.
1498          */
1499         mutex_enter(&d->bd_lock);
1500         slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
1501         DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif,
1502             struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen);
1503         d->bd_rcount++;
1504         ks_stats.kp_receive.value.ui64++;
1505         if (slen != 0) {
1506                 uniqtime(&tv);
1507                 catchpacket(d, marg, pktlen, slen, cpfn, &tv);
1508         }
1509         mutex_exit(&d->bd_lock);
1510 }
1511 
1512 /*
1513  * Incoming linkage from device drivers.
1514  */
1515 /* ARGSUSED */
1516 void
1517 bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent)
1518 {
1519         cp_fn_t cpfn;
1520         struct bpf_d *d = arg;
1521         uint_t pktlen, buflen;
1522         void *marg;
1523 
1524         pktlen = msgdsize(m);
1525 
1526         if (pktlen == M_LEN(m)) {
1527                 cpfn = (cp_fn_t)memcpy;
1528                 marg = mtod(m, void *);
1529                 buflen = pktlen;
1530         } else {
1531                 cpfn = bpf_mcpy;
1532                 marg = m;
1533                 buflen = 0;
1534         }
1535 
1536         bpf_deliver(d, cpfn, marg, pktlen, buflen, issent);
1537 }
1538 
1539 /*
1540  * Incoming linkage from ipnet.
1541  * In ipnet, there is only one event, NH_OBSERVE, that delivers packets
1542  * from all network interfaces. Thus the tap function needs to apply a
1543  * filter using the interface index/id to immitate snoop'ing on just the
1544  * specified interface.
1545  */
1546 /* ARGSUSED */
1547 void
1548 bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length)
1549 {
1550         hook_pkt_observe_t *hdr;
1551         struct bpf_d *d = arg;
1552 
1553         hdr = (hook_pkt_observe_t *)m->b_rptr;
1554         if (ntohl(hdr->hpo_ifindex) != d->bd_linkid)
1555                 return;
1556         bpf_deliver(d, bpf_mcpy, m, length, 0, issent);
1557 
1558 }
1559 
1560 /*
1561  * Move the packet data from interface memory (pkt) into the
1562  * store buffer.  Return 1 if it's time to wakeup a listener (buffer full),
1563  * otherwise 0.  "copy" is the routine called to do the actual data
1564  * transfer.  memcpy is passed in to copy contiguous chunks, while
1565  * bpf_mcpy is passed in to copy mbuf chains.  In the latter case,
1566  * pkt is really an mbuf.
1567  */
1568 static void
1569 catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
1570     cp_fn_t cpfn, struct timeval *tv)
1571 {
1572         struct bpf_hdr *hp;
1573         int totlen, curlen;
1574         int hdrlen = d->bd_hdrlen;
1575         int do_wakeup = 0;
1576 
1577         ++d->bd_ccount;
1578         ks_stats.kp_capture.value.ui64++;
1579         /*
1580          * Figure out how many bytes to move.  If the packet is
1581          * greater or equal to the snapshot length, transfer that
1582          * much.  Otherwise, transfer the whole packet (unless
1583          * we hit the buffer size limit).
1584          */
1585         totlen = hdrlen + min(snaplen, pktlen);
1586         if (totlen > d->bd_bufsize)
1587                 totlen = d->bd_bufsize;
1588 
1589         /*
1590          * Round up the end of the previous packet to the next longword.
1591          */
1592         curlen = BPF_WORDALIGN(d->bd_slen);
1593         if (curlen + totlen > d->bd_bufsize) {
1594                 /*
1595                  * This packet will overflow the storage buffer.
1596                  * Rotate the buffers if we can, then wakeup any
1597                  * pending reads.
1598                  */
1599                 if (d->bd_fbuf == 0) {
1600                         /*
1601                          * We haven't completed the previous read yet,
1602                          * so drop the packet.
1603                          */
1604                         ++d->bd_dcount;
1605                         ks_stats.kp_dropped.value.ui64++;
1606                         return;
1607                 }
1608                 ROTATE_BUFFERS(d);
1609                 do_wakeup = 1;
1610                 curlen = 0;
1611         } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
1612                 /*
1613                  * Immediate mode is set, or the read timeout has
1614                  * already expired during a select call.  A packet
1615                  * arrived, so the reader should be woken up.
1616                  */
1617                 do_wakeup = 1;
1618         }
1619 
1620         /*
1621          * Append the bpf header to the existing buffer before we add
1622          * on the actual packet data.
1623          */
1624         hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
1625         hp->bh_tstamp.tv_sec = tv->tv_sec;
1626         hp->bh_tstamp.tv_usec = tv->tv_usec;
1627         hp->bh_datalen = pktlen;
1628         hp->bh_hdrlen = (uint16_t)hdrlen;
1629         /*
1630          * Copy the packet data into the store buffer and update its length.
1631          */
1632         (*cpfn)((uchar_t *)hp + hdrlen, pkt,
1633             (hp->bh_caplen = totlen - hdrlen));
1634         d->bd_slen = curlen + totlen;
1635 
1636         /*
1637          * Call bpf_wakeup after bd_slen has been updated.
1638          */
1639         if (do_wakeup)
1640                 bpf_wakeup(d);
1641 }
1642 
1643 /*
1644  * Initialize all nonzero fields of a descriptor.
1645  */
1646 static int
1647 bpf_allocbufs(struct bpf_d *d)
1648 {
1649 
1650         d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1651         if (!d->bd_fbuf)
1652                 return (ENOBUFS);
1653         d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1654         if (!d->bd_sbuf) {
1655                 kmem_free(d->bd_fbuf, d->bd_bufsize);
1656                 return (ENOBUFS);
1657         }
1658         d->bd_slen = 0;
1659         d->bd_hlen = 0;
1660         return (0);
1661 }
1662 
1663 /*
1664  * Free buffers currently in use by a descriptor.
1665  * Called on close.
1666  */
1667 static void
1668 bpf_freed(struct bpf_d *d)
1669 {
1670         /*
1671          * At this point the descriptor has been detached from its
1672          * interface and it yet hasn't been marked free.
1673          */
1674         if (d->bd_sbuf != 0) {
1675                 kmem_free(d->bd_sbuf, d->bd_bufsize);
1676                 if (d->bd_hbuf != 0)
1677                         kmem_free(d->bd_hbuf, d->bd_bufsize);
1678                 if (d->bd_fbuf != 0)
1679                         kmem_free(d->bd_fbuf, d->bd_bufsize);
1680         }
1681         if (d->bd_filter)
1682                 kmem_free(d->bd_filter, d->bd_filter_size);
1683 }
1684 
1685 /*
1686  * Get a list of available data link type of the interface.
1687  */
1688 static int
1689 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp)
1690 {
1691         bpf_provider_list_t *bp;
1692         bpf_provider_t *bpr;
1693         zoneid_t zoneid;
1694         uintptr_t mcip;
1695         uint_t nicdlt;
1696         uintptr_t mh;
1697         int error;
1698         int n;
1699 
1700         n = 0;
1701         mh = 0;
1702         mcip = 0;
1703         error = 0;
1704         mutex_enter(&d->bd_lock);
1705         LIST_FOREACH(bp, &bpf_providers, bpl_next) {
1706                 bpr = bp->bpl_what;
1707                 error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone);
1708                 if (error != 0)
1709                         goto next;
1710                 error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
1711                 if (error != 0)
1712                         goto next;
1713                 error = MBPF_GET_ZONE(bpr, mh, &zoneid);
1714                 if (error != 0)
1715                         goto next;
1716                 if (d->bd_zone != GLOBAL_ZONEID &&
1717                     d->bd_zone != zoneid)
1718                         goto next;
1719                 error = MBPF_GET_DLT(bpr, mh, &nicdlt);
1720                 if (error != 0)
1721                         goto next;
1722                 nicdlt = bpf_dl_to_dlt(nicdlt);
1723                 if (listp->bfl_list != NULL) {
1724                         if (n >= listp->bfl_len) {
1725                                 MBPF_CLIENT_CLOSE(bpr, mcip);
1726                                 MBPF_CLOSE(bpr, mh);
1727                                 break;
1728                         }
1729                         /*
1730                          * Bumping of bd_inuse ensures the structure does not
1731                          * disappear while the copyout runs and allows the for
1732                          * loop to be continued.
1733                          */
1734                         d->bd_inuse++;
1735                         mutex_exit(&d->bd_lock);
1736                         if (copyout(&nicdlt,
1737                             listp->bfl_list + n, sizeof (uint_t)) != 0)
1738                                 error = EFAULT;
1739                         mutex_enter(&d->bd_lock);
1740                         if (error != 0)
1741                                 break;
1742                         d->bd_inuse--;
1743                 }
1744                 n++;
1745 next:
1746                 if (mcip != 0) {
1747                         MBPF_CLIENT_CLOSE(bpr, mcip);
1748                         mcip = 0;
1749                 }
1750                 if (mh != 0) {
1751                         MBPF_CLOSE(bpr, mh);
1752                         mh = 0;
1753                 }
1754         }
1755         mutex_exit(&d->bd_lock);
1756 
1757         /*
1758          * It is quite possible that one or more providers to BPF may not
1759          * know about a link name whilst others do. In that case, so long
1760          * as we have one success, do not declare an error unless it was
1761          * an EFAULT as this indicates a problem that needs to be reported.
1762          */
1763         if ((error != EFAULT) && (n > 0))
1764                 error = 0;
1765 
1766         listp->bfl_len = n;
1767         return (error);
1768 }
1769 
1770 /*
1771  * Set the data link type of a BPF instance.
      *
      * The requested DLT is fetched from addr with copyin() before bd_lock is
      * taken.  Returns EFAULT if that copyin fails, EINVAL if no interface is
      * set on the descriptor or the zone check rejects the request, the
      * MBPF_GET_ZONE() error if the zone lookup fails, 0 if the descriptor
      * already has the requested DLT, and otherwise whatever bpf_attachd()
      * returns.  bd_lock is released on every return path.
1772  */
1773 static int
1774 bpf_setdlt(struct bpf_d *d, void *addr)
1775 {
1776         char ifname[LIFNAMSIZ+1];
1777         zoneid_t niczone;
1778         int error;
1779         int dlt;
1780 
1781         if (copyin(addr, &dlt, sizeof (dlt)) != 0)
1782                 return (EFAULT);
1783 
1784         mutex_enter(&d->bd_lock);
1785 
1786         if (d->bd_bif == 0) {                        /* Interface not set */
1787                 mutex_exit(&d->bd_lock);
1788                 return (EINVAL);
1789         }
1790         if (d->bd_dlt == dlt) {      /* NULL-op */
1791                 mutex_exit(&d->bd_lock);
1792                 return (0);
1793         }
1794 
1795         error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone);
1796         if (error != 0) {
1797                 mutex_exit(&d->bd_lock);
1798                 return (error);
1799         }
1800 
1801         /*
1802          * See the matrix at the top of the file for the permissions table
1803          * enforced by this driver.
              *
              * As coded here: a descriptor whose bd_zone is not the global zone
              * is refused unless the request is for DLT_IPNET or the interface's
              * zone matches the descriptor's zone.
1804          */
1805         if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) &&
1806             (niczone != d->bd_zone)) {
1807                 mutex_exit(&d->bd_lock);
1808                 return (EINVAL);
1809         }
1810 
             /*
              * Take a private copy of the interface name, detach, and attach
              * again to the same name with the new DLT.  reset_d() runs whether
              * or not bpf_attachd() succeeded; its error is still returned.
              *
              * NOTE(review): bd_inuse is forced to -1 for the duration of the
              * detach/attach and then to 0; presumably -1 marks the descriptor
              * as exclusively busy to other code that checks bd_inuse -- confirm
              * against those consumers.
              */
1811         (void) strlcpy(ifname, d->bd_ifname, sizeof (ifname));
1812         d->bd_inuse = -1;
1813         bpf_detachd(d);
1814         error = bpf_attachd(d, ifname, dlt);
1815         reset_d(d);
1816         d->bd_inuse = 0;
1817 
1818         mutex_exit(&d->bd_lock);
1819         return (error);
1820 }
1821 
1822 /*
1823  * bpf_clear_timeout is called with the bd_lock mutex held, providing it
1824  * with the necessary protection to retrieve and modify bd_callout but it
1825  * does not hold the lock for its entire duration... see below...
      *
      * bd_callout is zeroed before the lock is dropped, and bd_lock is held
      * again when the function returns.
1826  */
1827 static void
1828 bpf_clear_timeout(struct bpf_d *d)
1829 {
1830         timeout_id_t tid = d->bd_callout;
1831         d->bd_callout = 0;
             /*
              * Bump bd_inuse across the window in which bd_lock may be dropped;
              * presumably this keeps the descriptor from being torn down in the
              * meantime -- confirm against the consumers of bd_inuse.
              */
1832         d->bd_inuse++;
1833 
1834         /*
1835          * If the timeout has fired and bpf_timed_out is waiting on bd_lock,
1836          * we could deadlock here if untimeout were called with bd_lock held:
1837          * it would wait for bpf_timed_out to finish and it never would.
              * So release bd_lock around the untimeout call and retake it after.
1838          */
1839         if (tid != 0) {
1840                 mutex_exit(&d->bd_lock);
1841                 (void) untimeout(tid);
1842                 mutex_enter(&d->bd_lock);
1843         }
1844 
1845         d->bd_inuse--;
1846 }
1847 
1848 /*
1849  * As a cloning device driver, BPF needs to keep track of which device
1850  * numbers are in use and which ones are not. A hash table, indexed by
1851  * the minor device number, is used to store the pointers to the
1852  * individual descriptors that are allocated in bpfopen().
1853  * The functions below present the interface for that hash table to
1854  * the rest of the driver.
1855  */
1856 static struct bpf_d *
1857 bpf_dev_find(minor_t minor)
1858 {
1859         struct bpf_d *d = NULL;
1860 
             /*
              * The mod_hash_find() status is deliberately discarded: d starts
              * out NULL, so NULL is what the caller gets back unless the lookup
              * stored a descriptor pointer for this minor number.
              */
1861         (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1862             (mod_hash_val_t *)&d);
1863 
1864         return (d);
1865 }
1866 
1867 static void
1868 bpf_dev_add(struct bpf_d *d)
1869 {
             /*
              * Enter d into bpf_hash, keyed by its device number (d->bd_dev).
              * The mod_hash_insert() status is discarded, so a failed insert is
              * not reported to the caller.
              */
1870         (void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1871             (mod_hash_val_t)d);
1872 }
1873 
1874 static void
1875 bpf_dev_remove(struct bpf_d *d)
1876 {
             /*
              * Remove the bpf_hash entry keyed by d->bd_dev and check that the
              * value stored under that key was d itself.
              *
              * stor starts out NULL so that the ASSERT below compares against a
              * defined value even if mod_hash_remove() stores nothing (its
              * status is discarded); previously it was read uninitialized in
              * that case.
              */
1877         struct bpf_d *stor = NULL;
1878 
1879         (void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1880             (mod_hash_val_t *)&stor);
1881         ASSERT(stor == d);
1882 }
1883 
1884 /*
1885  * bpf_dev_get should only ever be called for a minor number that exists,
1886  * thus there should always be a pointer in the hash table that corresponds
1887  * to it.
      *
      * This performs the same lookup as bpf_dev_find(), but a missing entry is
      * treated as a programming error and trips the ASSERT below.
      * NOTE(review): if ASSERT is compiled out in the build in use, a missing
      * entry makes this return NULL -- confirm callers tolerate that.
1888  */
1889 static struct bpf_d *
1890 bpf_dev_get(minor_t minor)
1891 {
1892         struct bpf_d *d = NULL;
1893 
1894         (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1895             (mod_hash_val_t *)&d);
1896         ASSERT(d != NULL);
1897 
1898         return (d);
1899 }