1 /*      $NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $    */
   2 
   3 /*
   4  * Copyright (c) 1990, 1991, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from the Stanford/CMU enet packet filter,
   8  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
   9  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
  10  * Berkeley Laboratory.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  *      @(#)bpf.c       8.4 (Berkeley) 1/9/95
  37  * static char rcsid[] =
  38  * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
  39  */
  40 /*
  41  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  42  * Use is subject to license terms.
  43  */
  44 
  45 /*
   46  * BPF implements the following access controls for zones attempting
   47  * to read and write data. Writing data requires that the net_rawaccess
   48  * privilege is held, while reading data requires either net_rawaccess or
   49  * net_observability.
  50  *
  51  *                              | Shared |  Exclusive |   Global
  52  * -----------------------------+--------+------------+------------+
  53  * DLT_IPNET in local zone      |  Read  |    Read    |    Read    |
  54  * -----------------------------+--------+------------+------------+
  55  * Raw access to local zone NIC |  None  | Read/Write | Read/Write |
  56  * -----------------------------+--------+------------+------------+
  57  * Raw access to all NICs       |  None  |    None    | Read/Write |
  58  * -----------------------------+--------+------------+------------+
  59  *
  60  * The BPF driver is written as a cloning driver: each call to bpfopen()
  61  * allocates a new minor number. This provides BPF with a 1:1 relationship
   62  * between opens and closes. There is some amount of "descriptor state"
   63  * that is kept per open. Pointers to this data are stored in a hash table
   64  * (bpf_hash) that is indexed by the minor device number for each open file.
  65  */
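      /*
       * Typical userland usage is roughly as follows.  This is only a
       * sketch: the device path "/dev/bpf" and the interface name "net0"
       * are examples, and error handling is omitted.  BIOCSETIF attaches
       * the descriptor to a NIC, and read() must be issued with a buffer
       * exactly as large as the kernel buffer reported by BIOCGBLEN (see
       * bpfread()); BIOCSBLEN is only honoured before the attach.
       *
       *      int fd = open("/dev/bpf", O_RDWR);
       *      struct ifreq ifr;
       *      (void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
       *      (void) ioctl(fd, BIOCSETIF, &ifr);
       *      uint_t blen;
       *      (void) ioctl(fd, BIOCGBLEN, &blen);
       *      char *buf = malloc(blen);
       *      ssize_t n = read(fd, buf, blen);
       */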
  66 #include <sys/param.h>
  67 #include <sys/systm.h>
  68 #include <sys/time.h>
  69 #include <sys/ioctl.h>
  70 #include <sys/queue.h>
  71 #include <sys/filio.h>
  72 #include <sys/policy.h>
  73 #include <sys/cmn_err.h>
  74 #include <sys/uio.h>
  75 #include <sys/file.h>
  76 #include <sys/sysmacros.h>
  77 #include <sys/zone.h>
  78 
  79 #include <sys/socket.h>
  80 #include <sys/errno.h>
  81 #include <sys/poll.h>
  82 #include <sys/dlpi.h>
  83 #include <sys/neti.h>
  84 
  85 #include <net/if.h>
  86 
  87 #include <net/bpf.h>
  88 #include <net/bpfdesc.h>
  89 #include <net/dlt.h>
  90 
  91 #include <netinet/in.h>
  92 #include <sys/mac.h>
  93 #include <sys/mac_client.h>
  94 #include <sys/mac_impl.h>
  95 #include <sys/time_std_impl.h>
  96 #include <sys/hook.h>
  97 #include <sys/hook_event.h>
  98 
  99 
 100 #define mtod(_v, _t)    (_t)((_v)->b_rptr)
 101 #define M_LEN(_m)       ((_m)->b_wptr - (_m)->b_rptr)
 102 
 103 /*
 104  * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
 105  * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
 106  */
 107 #define BPF_BUFSIZE (32 * 1024)
 108 
 109 typedef void *(*cp_fn_t)(void *, const void *, size_t);
 110 
 111 /*
 112  * The default read buffer size, and limit for BIOCSBLEN.
 113  */
 114 int bpf_bufsize = BPF_BUFSIZE;
 115 int bpf_maxbufsize = (16 * 1024 * 1024);
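      /*
       * As ordinary global variables of the bpf module, the defaults above
       * can normally be overridden at boot time via /etc/system, e.g.
       * (the value is purely illustrative):
       *
       *      set bpf:bpf_bufsize = 0x10000
       */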
 116 static mod_hash_t *bpf_hash = NULL;
 117 
 118 /*
 119  * Use a mutex to avoid a race condition between gathering the stats/peers
 120  * and opening/closing the device.
 121  */
 122 static kcondvar_t bpf_dlt_waiter;
 123 static kmutex_t bpf_mtx;
 124 static bpf_kstats_t ks_stats;
 125 static bpf_kstats_t bpf_kstats = {
 126         { "readWait",           KSTAT_DATA_UINT64 },
 127         { "writeOk",            KSTAT_DATA_UINT64 },
 128         { "writeError",         KSTAT_DATA_UINT64 },
 129         { "receive",            KSTAT_DATA_UINT64 },
 130         { "captured",           KSTAT_DATA_UINT64 },
 131         { "dropped",            KSTAT_DATA_UINT64 },
 132 };
 133 static kstat_t *bpf_ksp;
 134 
 135 /*
 136  *  bpf_list is a list of the BPF descriptors currently open
 137  */
 138 LIST_HEAD(, bpf_d) bpf_list;
 139 
 140 static int      bpf_allocbufs(struct bpf_d *);
 141 static void     bpf_clear_timeout(struct bpf_d *);
 142 static void     bpf_deliver(struct bpf_d *, cp_fn_t,
 143                     void *, uint_t, uint_t, boolean_t);
 144 static void     bpf_freed(struct bpf_d *);
 145 static int      bpf_ifname(struct bpf_d *d, char *, int);
 146 static void     *bpf_mcpy(void *, const void *, size_t);
 147 static int      bpf_attachd(struct bpf_d *, const char *, int);
 148 static void     bpf_detachd(struct bpf_d *);
 149 static int      bpf_setif(struct bpf_d *, char *, int);
 150 static void     bpf_timed_out(void *);
 151 static inline void
 152                 bpf_wakeup(struct bpf_d *);
 153 static void     catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
 154                     cp_fn_t, struct timeval *);
 155 static void     reset_d(struct bpf_d *);
 156 static int      bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
 157 static int      bpf_setdlt(struct bpf_d *, void *);
 158 static void     bpf_dev_add(struct bpf_d *);
 159 static struct bpf_d *bpf_dev_find(minor_t);
 160 static struct bpf_d *bpf_dev_get(minor_t);
 161 static void     bpf_dev_remove(struct bpf_d *);
 162 
 163 static int
 164 bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)
 165 {
 166         mblk_t *m;
 167         int error;
 168         int len;
 169         int hlen;
 170         int align;
 171 
 172         /*
  173          * Determine the length of the link level header that is
  174          * expected at the start of the packet for this data link
  175          * type.  The packet, including any link level header, is
  176          * copied in from the uio as is; no header is constructed
  177          * here.  The header length is used to validate the packet
  178          * size against the interface mtu and to choose how the data
  179          * is aligned within the mblk_t.
 180          */
 181         switch (linktype) {
 182 
 183         case DLT_EN10MB:
 184                 hlen = sizeof (struct ether_header);
 185                 break;
 186 
 187         case DLT_FDDI:
 188                 hlen = 16;
 189                 break;
 190 
 191         case DLT_NULL:
 192                 hlen = 0;
 193                 break;
 194 
 195         case DLT_IPOIB:
 196                 hlen = 44;
 197                 break;
 198 
 199         default:
 200                 return (EIO);
 201         }
 202 
 203         align = 4 - (hlen & 3);
 204 
 205         len = uio->uio_resid;
 206         /*
 207          * If there aren't enough bytes for a link level header or the
 208          * packet length exceeds the interface mtu, return an error.
 209          */
 210         if (len < hlen || len - hlen > mtu)
 211                 return (EMSGSIZE);
 212 
 213         m = allocb(len + align, BPRI_MED);
 214         if (m == NULL) {
 215                 error = ENOBUFS;
 216                 goto bad;
 217         }
 218 
  219         /* Ensure the data is properly aligned */
 220         if (align > 0)
 221                 m->b_rptr += align;
 222         m->b_wptr = m->b_rptr + len;
 223 
 224         error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
 225         if (error)
 226                 goto bad;
 227         *mp = m;
 228         return (0);
 229 
 230 bad:
 231         if (m != NULL)
 232                 freemsg(m);
 233         return (error);
 234 }
 235 
 236 
 237 /*
  238  * Attach file to the bpf interface, i.e. make d listen on ifname.
 239  */
 240 static int
 241 bpf_attachd(struct bpf_d *d, const char *ifname, int dlt)
 242 {
 243         bpf_provider_list_t *bp;
 244         bpf_provider_t *bpr;
 245         boolean_t zonematch;
 246         zoneid_t niczone;
 247         uintptr_t mcip;
 248         zoneid_t zone;
 249         uint_t nicdlt;
 250         uintptr_t mh;
 251         int hdrlen;
 252         int error;
 253 
 254         ASSERT(d->bd_bif == NULL);
 255         ASSERT(d->bd_mcip == NULL);
 256         zone = d->bd_zone;
 257         zonematch = B_TRUE;
 258 again:
  259         mh = 0;
  260         mcip = 0;
              error = ENOENT; /* returned if no provider is found */
 261         LIST_FOREACH(bp, &bpf_providers, bpl_next) {
 262                 bpr = bp->bpl_what;
 263                 error = MBPF_OPEN(bpr, ifname, &mh, zone);
 264                 if (error != 0)
 265                         goto next;
 266                 error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
 267                 if (error != 0)
 268                         goto next;
 269                 error = MBPF_GET_DLT(bpr, mh, &nicdlt);
 270                 if (error != 0)
 271                         goto next;
 272 
 273                 nicdlt = bpf_dl_to_dlt(nicdlt);
 274                 if (dlt != -1 && dlt != nicdlt) {
 275                         error = ENOENT;
 276                         goto next;
 277                 }
 278 
 279                 error = MBPF_GET_ZONE(bpr, mh, &niczone);
 280                 if (error != 0)
 281                         goto next;
 282 
 283                 DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr,
 284                     uintptr_t, mh, int, nicdlt, zoneid_t, niczone);
 285 
 286                 if (zonematch && niczone != zone) {
 287                         error = ENOENT;
 288                         goto next;
 289                 }
 290                 break;
 291 next:
 292                 if (mcip != 0) {
 293                         MBPF_CLIENT_CLOSE(bpr, mcip);
 294                         mcip = 0;
 295                 }
 296                 if (mh != NULL) {
 297                         MBPF_CLOSE(bpr, mh);
 298                         mh = 0;
 299                 }
 300         }
 301         if (error != 0) {
 302                 if (zonematch && (zone == GLOBAL_ZONEID)) {
 303                         /*
 304                          * If we failed to do an exact match for the global
 305                          * zone using the global zoneid, try again in case
 306                          * the network interface is owned by a local zone.
 307                          */
 308                         zonematch = B_FALSE;
 309                         goto again;
 310                 }
 311                 return (error);
 312         }
 313 
 314         d->bd_mac = *bpr;
 315         d->bd_mcip = mcip;
 316         d->bd_bif = mh;
 317         d->bd_dlt = nicdlt;
 318         hdrlen = bpf_dl_hdrsize(nicdlt);
 319         d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
 320 
 321         (void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip),
 322             sizeof (d->bd_ifname));
 323 
 324         (void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid,
 325             zone);
 326         (void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d,
 327             &d->bd_promisc_handle, d->bd_promisc_flags);
 328         return (0);
 329 }
 330 
 331 /*
 332  * Detach a file from its interface.
 333  */
 334 static void
 335 bpf_detachd(struct bpf_d *d)
 336 {
 337         uintptr_t mph;
 338         uintptr_t mch;
 339         uintptr_t mh;
 340 
 341         ASSERT(d->bd_inuse == -1);
 342         mch = d->bd_mcip;
 343         d->bd_mcip = 0;
 344         mh = d->bd_bif;
 345         d->bd_bif = 0;
 346 
 347         /*
 348          * Check if this descriptor had requested promiscuous mode.
 349          * If so, turn it off. There's no need to take any action
  350          * here; that is done when MBPF_PROMISC_REMOVE is used.
 351          * bd_promisc is just a local flag to stop promiscuous mode
 352          * from being set more than once.
 353          */
 354         if (d->bd_promisc)
 355                 d->bd_promisc = 0;
 356 
 357         /*
 358          * Take device out of "promiscuous" mode.  Since we were able to
 359          * enter "promiscuous" mode, we should be able to turn it off.
 360          * Note, this field stores a pointer used to support both
 361          * promiscuous and non-promiscuous callbacks for packets.
 362          */
 363         mph = d->bd_promisc_handle;
 364         d->bd_promisc_handle = 0;
 365 
 366         /*
 367          * The lock has to be dropped here because mac_promisc_remove may
 368          * need to wait for mac_promisc_dispatch, which has called into
 369          * bpf and catchpacket is waiting for bd_lock...
  370          * i.e. mac_promisc_remove() needs to be called with none of the
 371          * locks held that are part of the bpf_mtap() call path.
 372          */
 373         mutex_exit(&d->bd_lock);
 374         if (mph != 0)
 375                 MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
 376 
 377         if (mch != 0)
 378                 MBPF_CLIENT_CLOSE(&d->bd_mac, mch);
 379 
 380         if (mh != 0)
 381                 MBPF_CLOSE(&d->bd_mac, mh);
 382 
 383         /*
  384          * This function is called with bd_lock held, so it must
 385          * exit with it held.
 386          */
 387         mutex_enter(&d->bd_lock);
 388         *d->bd_ifname = '\0';
 389         (void) memset(&d->bd_mac, 0, sizeof (d->bd_mac));
 390 }
 391 
 392 
 393 /*
 394  * bpfilterattach() is called at load time.
 395  */
 396 int
 397 bpfilterattach(void)
 398 {
 399 
 400         bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
 401             mod_hash_null_keydtor);
 402         if (bpf_hash == NULL)
 403                 return (ENOMEM);
 404 
 405         (void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));
 406 
 407         bpf_ksp = kstat_create("bpf", 0, "global", "misc",
 408             KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
 409             KSTAT_FLAG_VIRTUAL);
 410         if (bpf_ksp != NULL) {
 411                 bpf_ksp->ks_data = &ks_stats;
 412                 kstat_install(bpf_ksp);
 413         } else {
 414                 mod_hash_destroy_idhash(bpf_hash);
 415                 bpf_hash = NULL;
 416                 return (EEXIST);
 417         }
 418 
 419         cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
 420         mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);
 421 
 422         LIST_INIT(&bpf_list);
 423 
 424         return (0);
 425 }
 426 
 427 
 428 /*
 429  * bpfilterdetach() is called at unload time.
 430  */
 431 int
 432 bpfilterdetach(void)
 433 {
 434 
 435         if (bpf_ksp != NULL) {
 436                 kstat_delete(bpf_ksp);
 437                 bpf_ksp = NULL;
 438         }
 439 
 440         mod_hash_destroy_idhash(bpf_hash);
 441         bpf_hash = NULL;
 442 
 443         cv_destroy(&bpf_dlt_waiter);
 444         mutex_destroy(&bpf_mtx);
 445 
 446         return (0);
 447 }
 448 
 449 /*
  450  * Open the bpf device; each open clones a new minor.
 451  */
 452 /* ARGSUSED */
 453 int
 454 bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)
 455 {
 456         struct bpf_d *d;
 457         uint_t dmin;
 458 
 459         /*
 460          * The security policy described at the top of this file is
 461          * enforced here.
 462          */
 463         if ((flag & FWRITE) != 0) {
 464                 if (secpolicy_net_rawaccess(cred) != 0)
 465                         return (EACCES);
 466         }
 467 
 468         if ((flag & FREAD) != 0) {
 469                 if ((secpolicy_net_observability(cred) != 0) &&
 470                     (secpolicy_net_rawaccess(cred) != 0))
 471                         return (EACCES);
 472         }
 473 
 474         if ((flag & (FWRITE|FREAD)) == 0)
 475                 return (ENXIO);
 476 
 477         /*
  478          * A structure is allocated per open file in BPF to store settings
  479          * such as the capture buffer size, private buffers, etc.
 480          */
 481         d = (struct bpf_d *)kmem_zalloc(sizeof (*d), KM_SLEEP);
 482         d->bd_bufsize = bpf_bufsize;
 483         d->bd_fmode = flag;
 484         d->bd_zone = crgetzoneid(cred);
 485         d->bd_seesent = 1;
 486         d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS|
 487             MAC_PROMISC_FLAGS_NO_COPY;
 488         mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
 489         cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);
 490 
 491         mutex_enter(&bpf_mtx);
 492         /*
 493          * Find an unused minor number. Obviously this is an O(n) algorithm
  494          * and doesn't scale particularly well, so if large numbers of
  495          * descriptors are open at once in real use, this design may
 496          * need to be revisited.
 497          */
 498         for (dmin = 0; dmin < L_MAXMIN; dmin++)
 499                 if (bpf_dev_find(dmin) == NULL)
 500                         break;
 501         if (dmin == L_MAXMIN) {
 502                 mutex_exit(&bpf_mtx);
 503                 kmem_free(d, sizeof (*d));
 504                 return (ENXIO);
 505         }
 506         d->bd_dev = dmin;
 507         LIST_INSERT_HEAD(&bpf_list, d, bd_list);
 508         bpf_dev_add(d);
 509         mutex_exit(&bpf_mtx);
 510 
 511         *devp = makedevice(getmajor(*devp), dmin);
 512 
 513         return (0);
 514 }
 515 
 516 /*
 517  * Close the descriptor by detaching it from its interface,
 518  * deallocating its buffers, and marking it free.
 519  *
  520  * Because each open of the device allocates its own minor, there is
  521  * always a 1:1 relationship between opens and closes for this function.
 522  */
 523 /* ARGSUSED */
 524 int
 525 bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
 526 {
 527         struct bpf_d *d = bpf_dev_get(getminor(dev));
 528 
 529         mutex_enter(&d->bd_lock);
 530 
 531         while (d->bd_inuse != 0) {
 532                 d->bd_waiting++;
 533                 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
 534                         d->bd_waiting--;
 535                         mutex_exit(&d->bd_lock);
 536                         return (EINTR);
 537                 }
 538                 d->bd_waiting--;
 539         }
 540 
 541         d->bd_inuse = -1;
 542         if (d->bd_state == BPF_WAITING)
 543                 bpf_clear_timeout(d);
 544         d->bd_state = BPF_IDLE;
 545         if (d->bd_bif)
 546                 bpf_detachd(d);
 547         mutex_exit(&d->bd_lock);
 548 
 549         mutex_enter(&bpf_mtx);
 550         LIST_REMOVE(d, bd_list);
 551         bpf_dev_remove(d);
 552         mutex_exit(&bpf_mtx);
 553 
 554         mutex_enter(&d->bd_lock);
 555         mutex_destroy(&d->bd_lock);
 556         cv_destroy(&d->bd_wait);
 557 
 558         bpf_freed(d);
 559         kmem_free(d, sizeof (*d));
 560 
 561         return (0);
 562 }
 563 
 564 /*
 565  * Rotate the packet buffers in descriptor d.  Move the store buffer
 566  * into the hold slot, and the free buffer into the store slot.
 567  * Zero the length of the new store buffer.
 568  */
 569 #define ROTATE_BUFFERS(d) \
 570         (d)->bd_hbuf = (d)->bd_sbuf; \
 571         (d)->bd_hlen = (d)->bd_slen; \
 572         (d)->bd_sbuf = (d)->bd_fbuf; \
 573         (d)->bd_slen = 0; \
 574         (d)->bd_fbuf = 0;
 575 /*
 576  *  bpfread - read next chunk of packets from buffers
 577  */
 578 /* ARGSUSED */
 579 int
 580 bpfread(dev_t dev, struct uio *uio, cred_t *cred)
 581 {
 582         struct bpf_d *d = bpf_dev_get(getminor(dev));
 583         int timed_out;
 584         ulong_t delay;
 585         int error;
 586 
 587         if ((d->bd_fmode & FREAD) == 0)
 588                 return (EBADF);
 589 
 590         /*
  591          * Restrict the application to using a buffer the same size as
 592          * the kernel buffers.
 593          */
 594         if (uio->uio_resid != d->bd_bufsize)
 595                 return (EINVAL);
 596 
 597         mutex_enter(&d->bd_lock);
 598         if (d->bd_state == BPF_WAITING)
 599                 bpf_clear_timeout(d);
 600         timed_out = (d->bd_state == BPF_TIMED_OUT);
 601         d->bd_state = BPF_IDLE;
 602         /*
 603          * If the hold buffer is empty, then do a timed sleep, which
 604          * ends when the timeout expires or when enough packets
 605          * have arrived to fill the store buffer.
 606          */
 607         while (d->bd_hbuf == 0) {
 608                 if (d->bd_nonblock) {
 609                         if (d->bd_slen == 0) {
 610                                 mutex_exit(&d->bd_lock);
 611                                 return (EWOULDBLOCK);
 612                         }
 613                         ROTATE_BUFFERS(d);
 614                         break;
 615                 }
 616 
 617                 if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
 618                         /*
  619                          * One or more packets arrived since the previous
  620                          * read or while we were asleep.
 621                          * Rotate the buffers and return what's here.
 622                          */
 623                         ROTATE_BUFFERS(d);
 624                         break;
 625                 }
 626                 ks_stats.kp_read_wait.value.ui64++;
 627                 delay = ddi_get_lbolt() + d->bd_rtout;
 628                 error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);
 629                 if (error == 0) {
 630                         mutex_exit(&d->bd_lock);
 631                         return (EINTR);
 632                 }
 633                 if (error == -1) {
 634                         /*
 635                          * On a timeout, return what's in the buffer,
 636                          * which may be nothing.  If there is something
 637                          * in the store buffer, we can rotate the buffers.
 638                          */
 639                         if (d->bd_hbuf)
 640                                 /*
 641                                  * We filled up the buffer in between
 642                                  * getting the timeout and arriving
 643                                  * here, so we don't need to rotate.
 644                                  */
 645                                 break;
 646 
 647                         if (d->bd_slen == 0) {
 648                                 mutex_exit(&d->bd_lock);
 649                                 return (0);
 650                         }
 651                         ROTATE_BUFFERS(d);
 652                 }
 653         }
 654         /*
 655          * At this point, we know we have something in the hold slot.
 656          */
 657         mutex_exit(&d->bd_lock);
 658 
 659         /*
 660          * Move data from hold buffer into user space.
 661          * We know the entire buffer is transferred since
 662          * we checked above that the read buffer is bpf_bufsize bytes.
 663          */
 664         error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);
 665 
 666         mutex_enter(&d->bd_lock);
 667         d->bd_fbuf = d->bd_hbuf;
 668         d->bd_hbuf = 0;
 669         d->bd_hlen = 0;
 670 done:
 671         mutex_exit(&d->bd_lock);
 672         return (error);
 673 }
 674 
 675 
 676 /*
 677  * If there are processes sleeping on this descriptor, wake them up.
 678  * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
 679  * so there is no code here grabbing it.
 680  */
 681 static inline void
 682 bpf_wakeup(struct bpf_d *d)
 683 {
 684         cv_signal(&d->bd_wait);
 685 }
 686 
 687 static void
 688 bpf_timed_out(void *arg)
 689 {
 690         struct bpf_d *d = arg;
 691 
 692         mutex_enter(&d->bd_lock);
 693         if (d->bd_state == BPF_WAITING) {
 694                 d->bd_state = BPF_TIMED_OUT;
 695                 if (d->bd_slen != 0)
 696                         cv_signal(&d->bd_wait);
 697         }
 698         mutex_exit(&d->bd_lock);
 699 }
 700 
 701 
 702 /* ARGSUSED */
 703 int
 704 bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)
 705 {
 706         struct bpf_d *d = bpf_dev_get(getminor(dev));
 707         uintptr_t mch;
 708         uint_t mtu;
 709         mblk_t *m;
 710         int error;
 711         int dlt;
 712 
 713         if ((d->bd_fmode & FWRITE) == 0)
 714                 return (EBADF);
 715 
 716         mutex_enter(&d->bd_lock);
  717         if (d->bd_bif == 0 || d->bd_mcip == 0) {
 718                 mutex_exit(&d->bd_lock);
 719                 return (EINTR);
 720         }
 721 
 722         if (uio->uio_resid == 0) {
 723                 mutex_exit(&d->bd_lock);
 724                 return (0);
 725         }
 726 
 727         while (d->bd_inuse < 0) {
 728                 d->bd_waiting++;
 729                 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
 730                         d->bd_waiting--;
 731                         mutex_exit(&d->bd_lock);
 732                         return (EINTR);
 733                 }
 734                 d->bd_waiting--;
 735         }
 736 
 737         mutex_exit(&d->bd_lock);
 738 
 739         dlt = d->bd_dlt;
 740         mch = d->bd_mcip;
 741         MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu);
 742         d->bd_inuse++;
 743 
 744         m = NULL;
 745         if (dlt == DLT_IPNET) {
 746                 error = EIO;
 747                 goto done;
 748         }
 749 
 750         error = bpf_movein(uio, dlt, mtu, &m);
 751         if (error)
 752                 goto done;
 753 
 754         DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt,
 755             uint_t, mtu, mblk_t *, m);
 756 
 757         if (M_LEN(m) > mtu) {
 758                 error = EMSGSIZE;
 759                 goto done;
 760         }
 761 
 762         error = MBPF_TX(&d->bd_mac, mch, m);
 763         /*
  764          * MBPF_TX() consumes the mblk_t, so it must not be freed here.
 765          */
 766         m = NULL;
 767 
 768 done:
 769         if (error == 0)
 770                 ks_stats.kp_write_ok.value.ui64++;
 771         else
 772                 ks_stats.kp_write_error.value.ui64++;
 773         if (m != NULL)
 774                 freemsg(m);
 775 
 776         mutex_enter(&d->bd_lock);
 777         d->bd_inuse--;
 778         if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
 779                 cv_signal(&d->bd_wait);
 780         mutex_exit(&d->bd_lock);
 781 
 782         /*
  783          * The mblk_t was either consumed by MBPF_TX() or freed above.
 784          */
 785         return (error);
 786 }
 787 
 788 
 789 /*
 790  * Reset a descriptor by flushing its packet buffer and clearing the
  791  * receive and drop counts.  Called with bd_lock held.
 792  */
 793 static void
 794 reset_d(struct bpf_d *d)
 795 {
 796         if (d->bd_hbuf) {
 797                 /* Free the hold buffer. */
 798                 d->bd_fbuf = d->bd_hbuf;
 799                 d->bd_hbuf = 0;
 800         }
 801         d->bd_slen = 0;
 802         d->bd_hlen = 0;
 803         d->bd_rcount = 0;
 804         d->bd_dcount = 0;
 805         d->bd_ccount = 0;
 806 }
 807 
 808 /*
 809  *  FIONREAD            Check for read packet available.
 810  *  BIOCGBLEN           Get buffer len [for read()].
  811  *  BIOCSETF            Set link layer read filter.
 812  *  BIOCFLUSH           Flush read packet buffer.
 813  *  BIOCPROMISC         Put interface into promiscuous mode.
 814  *  BIOCGDLT            Get link layer type.
 815  *  BIOCGETIF           Get interface name.
 816  *  BIOCSETIF           Set interface.
 817  *  BIOCSRTIMEOUT       Set read timeout.
 818  *  BIOCGRTIMEOUT       Get read timeout.
 819  *  BIOCGSTATS          Get packet stats.
 820  *  BIOCIMMEDIATE       Set immediate mode.
 821  *  BIOCVERSION         Get filter language version.
 822  *  BIOCGHDRCMPLT       Get "header already complete" flag.
 823  *  BIOCSHDRCMPLT       Set "header already complete" flag.
 824  */
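      /*
       * For illustration, a reader might combine some of these as follows;
       * "fd" is assumed to be an open, attached BPF descriptor and error
       * handling is omitted:
       *
       *      struct timeval tv = { 1, 0 };
       *      (void) ioctl(fd, BIOCSRTIMEOUT, &tv);
       *      struct bpf_stat bs;
       *      (void) ioctl(fd, BIOCGSTATS, &bs);
       *
       * After this, a blocked read() returns within about a second even if
       * the buffer has not filled, and bs carries the bs_recv, bs_drop and
       * bs_capt counters maintained below.
       */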
 825 /* ARGSUSED */
 826 int
 827 bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)
 828 {
 829         struct bpf_d *d = bpf_dev_get(getminor(dev));
 830         struct bpf_program prog;
 831         struct lifreq lifreq;
 832         struct ifreq ifreq;
 833         int error = 0;
 834         uint_t size;
 835 
 836         /*
  837          * Cancel any pending read timeout before processing the ioctl.
 838          */
 839         mutex_enter(&d->bd_lock);
 840         if (d->bd_state == BPF_WAITING)
 841                 bpf_clear_timeout(d);
 842         d->bd_state = BPF_IDLE;
 843         mutex_exit(&d->bd_lock);
 844 
 845         switch (cmd) {
 846 
 847         default:
 848                 error = EINVAL;
 849                 break;
 850 
 851         /*
 852          * Check for read packet available.
 853          */
 854         case FIONREAD:
 855                 {
 856                         int n;
 857 
 858                         mutex_enter(&d->bd_lock);
 859                         n = d->bd_slen;
 860                         if (d->bd_hbuf)
 861                                 n += d->bd_hlen;
 862                         mutex_exit(&d->bd_lock);
 863 
 864                         *(int *)addr = n;
 865                         break;
 866                 }
 867 
 868         /*
 869          * Get buffer len [for read()].
 870          */
 871         case BIOCGBLEN:
 872                 error = copyout(&d->bd_bufsize, (void *)addr,
 873                     sizeof (d->bd_bufsize));
 874                 break;
 875 
 876         /*
 877          * Set buffer length.
 878          */
 879         case BIOCSBLEN:
 880                 if (copyin((void *)addr, &size, sizeof (size)) != 0) {
 881                         error = EFAULT;
 882                         break;
 883                 }
 884 
 885                 mutex_enter(&d->bd_lock);
 886                 if (d->bd_bif != 0) {
 887                         error = EINVAL;
 888                 } else {
 889                         if (size > bpf_maxbufsize)
 890                                 size = bpf_maxbufsize;
 891                         else if (size < BPF_MINBUFSIZE)
 892                                 size = BPF_MINBUFSIZE;
 893 
 894                         d->bd_bufsize = size;
 895                 }
 896                 mutex_exit(&d->bd_lock);
 897 
 898                 if (error == 0)
 899                         error = copyout(&size, (void *)addr, sizeof (size));
 900                 break;
 901 
 902         /*
 903          * Set link layer read filter.
 904          */
 905         case BIOCSETF:
 906                 if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {
 907                         error = EFAULT;
 908                         break;
 909                 }
 910                 error = bpf_setf(d, &prog);
 911                 break;
 912 
 913         /*
 914          * Flush read packet buffer.
 915          */
 916         case BIOCFLUSH:
 917                 mutex_enter(&d->bd_lock);
 918                 reset_d(d);
 919                 mutex_exit(&d->bd_lock);
 920                 break;
 921 
 922         /*
 923          * Put interface into promiscuous mode.
  924          * This is a one-way ioctl; it is not used to turn promiscuous
 925          * mode off.
 926          */
 927         case BIOCPROMISC:
 928                 if (d->bd_bif == 0) {
 929                         /*
 930                          * No interface attached yet.
 931                          */
 932                         error = EINVAL;
 933                         break;
 934                 }
 935                 mutex_enter(&d->bd_lock);
 936                 if (d->bd_promisc == 0) {
 937 
 938                         if (d->bd_promisc_handle) {
 939                                 uintptr_t mph;
 940 
 941                                 mph = d->bd_promisc_handle;
 942                                 d->bd_promisc_handle = 0;
 943 
 944                                 mutex_exit(&d->bd_lock);
 945                                 MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
 946                                 mutex_enter(&d->bd_lock);
 947                         }
 948 
 949                         d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
 950                         error = MBPF_PROMISC_ADD(&d->bd_mac,
 951                             d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
 952                             &d->bd_promisc_handle, d->bd_promisc_flags);
 953                         if (error == 0)
 954                                 d->bd_promisc = 1;
 955                 }
 956                 mutex_exit(&d->bd_lock);
 957                 break;
 958 
 959         /*
 960          * Get device parameters.
 961          */
 962         case BIOCGDLT:
 963                 if (d->bd_bif == 0)
 964                         error = EINVAL;
 965                 else
 966                         error = copyout(&d->bd_dlt, (void *)addr,
 967                             sizeof (d->bd_dlt));
 968                 break;
 969 
 970         /*
 971          * Get a list of supported device parameters.
 972          */
 973         case BIOCGDLTLIST:
 974                 if (d->bd_bif == 0) {
 975                         error = EINVAL;
 976                 } else {
 977                         struct bpf_dltlist list;
 978 
 979                         if (copyin((void *)addr, &list, sizeof (list)) != 0) {
 980                                 error = EFAULT;
 981                                 break;
 982                         }
 983                         error = bpf_getdltlist(d, &list);
 984                         if ((error == 0) &&
 985                             copyout(&list, (void *)addr, sizeof (list)) != 0)
 986                                 error = EFAULT;
 987                 }
 988                 break;
 989 
 990         /*
 991          * Set device parameters.
 992          */
 993         case BIOCSDLT:
 994                 error = bpf_setdlt(d, (void *)addr);
 995                 break;
 996 
 997         /*
 998          * Get interface name.
 999          */
1000         case BIOCGETIF:
1001                 if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1002                         error = EFAULT;
1003                         break;
1004                 }
1005                 error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1006                 if ((error == 0) &&
1007                     copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {
1008                         error = EFAULT;
1009                         break;
1010                 }
1011                 break;
1012 
1013         /*
1014          * Set interface.
1015          */
1016         case BIOCSETIF:
1017                 if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1018                         error = EFAULT;
1019                         break;
1020                 }
1021                 error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1022                 break;
1023 
1024         /*
1025          * Get interface name.
1026          */
1027         case BIOCGETLIF:
1028                 if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1029                         error = EFAULT;
1030                         break;
1031                 }
1032                 error = bpf_ifname(d, lifreq.lifr_name,
1033                     sizeof (lifreq.lifr_name));
1034                 if ((error == 0) &&
1035                     copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {
1036                         error = EFAULT;
1037                         break;
1038                 }
1039                 break;
1040 
1041         /*
1042          * Set interface.
1043          */
1044         case BIOCSETLIF:
1045                 if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1046                         error = EFAULT;
1047                         break;
1048                 }
1049                 error = bpf_setif(d, lifreq.lifr_name,
1050                     sizeof (lifreq.lifr_name));
1051                 break;
1052 
1053 #ifdef _SYSCALL32_IMPL
1054         /*
1055          * Set read timeout.
1056          */
1057         case BIOCSRTIMEOUT32:
1058                 {
1059                         struct timeval32 tv;
1060 
1061                         if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1062                                 error = EFAULT;
1063                                 break;
1064                         }
1065 
1066                         /* Convert the timeout in microseconds to ticks */
1067                         d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1068                             tv.tv_usec);
1069                         if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1070                                 d->bd_rtout = 1;
1071                         break;
1072                 }
1073 
1074         /*
1075          * Get read timeout.
1076          */
1077         case BIOCGRTIMEOUT32:
1078                 {
1079                         struct timeval32 tv;
1080                         clock_t ticks;
1081 
1082                         ticks = drv_hztousec(d->bd_rtout);
1083                         tv.tv_sec = ticks / 1000000;
1084                         tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1085                         error = copyout(&tv, (void *)addr, sizeof (tv));
1086                         break;
1087                 }
1088 
1089         /*
1090          * Get a list of supported device parameters.
1091          */
1092         case BIOCGDLTLIST32:
1093                 if (d->bd_bif == 0) {
1094                         error = EINVAL;
1095                 } else {
1096                         struct bpf_dltlist32 lst32;
1097                         struct bpf_dltlist list;
1098 
1099                         if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {
1100                                 error = EFAULT;
1101                                 break;
1102                         }
1103 
1104                         list.bfl_len = lst32.bfl_len;
1105                         list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
1106                         error = bpf_getdltlist(d, &list);
1107                         if (error == 0) {
1108                                 lst32.bfl_len = list.bfl_len;
1109 
1110                                 if (copyout(&lst32, (void *)addr,
1111                                     sizeof (lst32)) != 0)
1112                                         error = EFAULT;
1113                         }
1114                 }
1115                 break;
1116 
1117         /*
1118          * Set link layer read filter.
1119          */
1120         case BIOCSETF32: {
1121                 struct bpf_program32 prog32;
1122 
 1123                 if (ddi_copyin((void *)addr, &prog32, sizeof (prog32), mode)) {
1124                         error = EFAULT;
1125                         break;
1126                 }
1127                 prog.bf_len = prog32.bf_len;
1128                 prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
1129                 error = bpf_setf(d, &prog);
1130                 break;
1131         }
1132 #endif
1133 
1134         /*
1135          * Set read timeout.
1136          */
1137         case BIOCSRTIMEOUT:
1138                 {
1139                         struct timeval tv;
1140 
1141                         if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1142                                 error = EFAULT;
1143                                 break;
1144                         }
1145 
1146                         /* Convert the timeout in microseconds to ticks */
1147                         d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1148                             tv.tv_usec);
1149                         if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1150                                 d->bd_rtout = 1;
1151                         break;
1152                 }
1153 
1154         /*
1155          * Get read timeout.
1156          */
1157         case BIOCGRTIMEOUT:
1158                 {
1159                         struct timeval tv;
1160                         clock_t ticks;
1161 
1162                         ticks = drv_hztousec(d->bd_rtout);
1163                         tv.tv_sec = ticks / 1000000;
1164                         tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1165                         if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)
1166                                 error = EFAULT;
1167                         break;
1168                 }
1169 
1170         /*
1171          * Get packet stats.
1172          */
1173         case BIOCGSTATS:
1174                 {
1175                         struct bpf_stat bs;
1176 
1177                         bs.bs_recv = d->bd_rcount;
1178                         bs.bs_drop = d->bd_dcount;
1179                         bs.bs_capt = d->bd_ccount;
1180                         if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)
1181                                 error = EFAULT;
1182                         break;
1183                 }
1184 
1185         /*
1186          * Set immediate mode.
1187          */
1188         case BIOCIMMEDIATE:
1189                 if (copyin((void *)addr, &d->bd_immediate,
1190                     sizeof (d->bd_immediate)) != 0)
1191                         error = EFAULT;
1192                 break;
1193 
1194         case BIOCVERSION:
1195                 {
1196                         struct bpf_version bv;
1197 
1198                         bv.bv_major = BPF_MAJOR_VERSION;
1199                         bv.bv_minor = BPF_MINOR_VERSION;
1200                         if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)
1201                                 error = EFAULT;
1202                         break;
1203                 }
1204 
1205         case BIOCGHDRCMPLT:     /* get "header already complete" flag */
1206                 if (copyout(&d->bd_hdrcmplt, (void *)addr,
1207                     sizeof (d->bd_hdrcmplt)) != 0)
1208                         error = EFAULT;
1209                 break;
1210 
1211         case BIOCSHDRCMPLT:     /* set "header already complete" flag */
1212                 if (copyin((void *)addr, &d->bd_hdrcmplt,
1213                     sizeof (d->bd_hdrcmplt)) != 0)
1214                         error = EFAULT;
1215                 break;
1216 
1217         /*
1218          * Get "see sent packets" flag
1219          */
1220         case BIOCGSEESENT:
1221                 if (copyout(&d->bd_seesent, (void *)addr,
1222                     sizeof (d->bd_seesent)) != 0)
1223                         error = EFAULT;
1224                 break;
1225 
1226         /*
1227          * Set "see sent" packets flag
1228          */
1229         case BIOCSSEESENT:
1230                 if (copyin((void *)addr, &d->bd_seesent,
1231                     sizeof (d->bd_seesent)) != 0)
1232                         error = EFAULT;
1233                 break;
1234 
1235         case FIONBIO:           /* Non-blocking I/O */
1236                 if (copyin((void *)addr, &d->bd_nonblock,
1237                     sizeof (d->bd_nonblock)) != 0)
1238                         error = EFAULT;
1239                 break;
1240         }
1241         return (error);
1242 }
1243 
1244 /*
1245  * Set d's packet filter program to fp.  If this file already has a filter,
1246  * free it and replace it. If the new filter is "empty" (has a 0 size), then
1247  * the result is to just remove and free the existing filter.
1248  * Returns EINVAL for bogus requests.
1249  */
1250 int
1251 bpf_setf(struct bpf_d *d, struct bpf_program *fp)
1252 {
1253         struct bpf_insn *fcode, *old;
1254         uint_t flen, size;
1255         size_t oldsize;
1256 
1257         if (fp->bf_insns == 0) {
1258                 if (fp->bf_len != 0)
1259                         return (EINVAL);
1260                 mutex_enter(&d->bd_lock);
1261                 old = d->bd_filter;
1262                 oldsize = d->bd_filter_size;
1263                 d->bd_filter = 0;
1264                 d->bd_filter_size = 0;
1265                 reset_d(d);
1266                 mutex_exit(&d->bd_lock);
1267                 if (old != 0)
1268                         kmem_free(old, oldsize);
1269                 return (0);
1270         }
1271         flen = fp->bf_len;
1272         if (flen > BPF_MAXINSNS)
1273                 return (EINVAL);
1274 
1275         size = flen * sizeof (*fp->bf_insns);
1276         fcode = kmem_alloc(size, KM_SLEEP);
 1277         if (copyin(fp->bf_insns, fcode, size) != 0) {
                      kmem_free(fcode, size); /* don't leak the copied filter */
 1278                 return (EFAULT);
              }
1279 
1280         if (bpf_validate(fcode, (int)flen)) {
1281                 mutex_enter(&d->bd_lock);
1282                 old = d->bd_filter;
1283                 oldsize = d->bd_filter_size;
1284                 d->bd_filter = fcode;
1285                 d->bd_filter_size = size;
1286                 reset_d(d);
1287                 mutex_exit(&d->bd_lock);
1288                 if (old != 0)
1289                         kmem_free(old, oldsize);
1290 
1291                 return (0);
1292         }
1293         kmem_free(fcode, size);
1294         return (EINVAL);
1295 }
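
      /*
       * For example, a minimal "accept everything" filter could be installed
       * from userland as follows (a sketch; "fd" is an open BPF descriptor
       * and error handling is omitted):
       *
       *      struct bpf_insn insns[] = {
       *              BPF_STMT(BPF_RET + BPF_K, (uint_t)-1)
       *      };
       *      struct bpf_program prog = { 1, insns };
       *      (void) ioctl(fd, BIOCSETF, &prog);
       *
       * A return value of (uint_t)-1 accepts the whole packet; a return
       * value of 0 rejects it.
       */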
1296 
1297 /*
1298  * Detach a file from its current interface (if attached at all) and attach
1299  * to the interface indicated by the name stored in ifname.
1300  * Return an errno or 0.
1301  */
1302 static int
1303 bpf_setif(struct bpf_d *d, char *ifname, int namesize)
1304 {
1305         int unit_seen;
1306         int error = 0;
1307         char *cp;
1308         int i;
1309 
1310         /*
1311          * Make sure the provided name has a unit number, and default
1312          * it to '0' if not specified.
1313          * XXX This is ugly ... do this differently?
1314          */
1315         unit_seen = 0;
1316         cp = ifname;
1317         cp[namesize - 1] = '\0';        /* sanity */
1318         while (*cp++)
1319                 if (*cp >= '0' && *cp <= '9')
1320                         unit_seen = 1;
1321         if (!unit_seen) {
1322                 /* Make sure to leave room for the '\0'. */
1323                 for (i = 0; i < (namesize - 1); ++i) {
1324                         if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
1325                             (ifname[i] >= 'A' && ifname[i] <= 'Z'))
1326                                 continue;
1327                         ifname[i] = '0';
1328                 }
1329         }
1330 
1331         /*
1332          * Make sure that only one call to this function happens at a time
1333          * and that we're not interleaving a read/write
1334          */
1335         mutex_enter(&d->bd_lock);
1336         while (d->bd_inuse != 0) {
1337                 d->bd_waiting++;
1338                 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
1339                         d->bd_waiting--;
1340                         mutex_exit(&d->bd_lock);
1341                         return (EINTR);
1342                 }
1343                 d->bd_waiting--;
1344         }
1345         d->bd_inuse = -1;
1346         mutex_exit(&d->bd_lock);
1347 
1348         if (d->bd_sbuf == 0)
1349                 error = bpf_allocbufs(d);
1350 
1351         if (error == 0) {
1352                 mutex_enter(&d->bd_lock);
1353                 if (d->bd_bif)
1354                         /*
1355                          * Detach if attached to something else.
1356                          */
1357                         bpf_detachd(d);
1358 
1359                 error = bpf_attachd(d, ifname, -1);
1360                 reset_d(d);
1361                 d->bd_inuse = 0;
1362                 if (d->bd_waiting != 0)
1363                         cv_signal(&d->bd_wait);
1364                 mutex_exit(&d->bd_lock);
1365                 return (error);
1366         }
1367 
1368         mutex_enter(&d->bd_lock);
1369         d->bd_inuse = 0;
1370         if (d->bd_waiting != 0)
1371                 cv_signal(&d->bd_wait);
1372         mutex_exit(&d->bd_lock);
1373 
1374         /*
 1375          * Try to tickle the mac layer into attaching the device...
1376          */
1377         return (bpf_provider_tickle(ifname, d->bd_zone));
1378 }
1379 
1380 /*
1381  * Copy the interface name to the ifreq.
1382  */
1383 static int
1384 bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)
1385 {
1386 
1387         mutex_enter(&d->bd_lock);
1388         if (d->bd_bif == NULL) {
1389                 mutex_exit(&d->bd_lock);
1390                 return (EINVAL);
1391         }
1392 
1393         (void) strlcpy(buffer, d->bd_ifname, bufsize);
1394         mutex_exit(&d->bd_lock);
1395 
1396         return (0);
1397 }
1398 
1399 /*
1400  * Support for poll() system call
1401  *
 1402  * Indicate in *reventsp which operations will not block indefinitely, with
 1403  * the assumption that it is safe to positively acknowledge a request for the
 1404  * ability to write to the BPF device.
 1405  * Otherwise, return no events and hand the pollhead back in *phpp.
1406  */
1407 int
1408 bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
1409     struct pollhead **phpp)
1410 {
1411         struct bpf_d *d = bpf_dev_get(getminor(dev));
1412 
1413         if (events & (POLLIN | POLLRDNORM)) {
1414                 /*
1415                  * An imitation of the FIONREAD ioctl code.
1416                  */
1417                 mutex_enter(&d->bd_lock);
1418                 if (d->bd_hlen != 0 ||
1419                     ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1420                     d->bd_slen != 0)) {
1421                         *reventsp |= events & (POLLIN | POLLRDNORM);
1422                 } else {
1423                         *reventsp = 0;
1424                         if (!anyyet)
1425                                 *phpp = &d->bd_poll;
1426                         /* Start the read timeout if necessary */
1427                         if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1428                                 bpf_clear_timeout(d);
1429                                 /*
1430                                  * Only allow the timeout to be set once.
1431                                  */
1432                                 if (d->bd_callout == 0)
1433                                         d->bd_callout = timeout(bpf_timed_out,
1434                                             d, d->bd_rtout);
1435                                 d->bd_state = BPF_WAITING;
1436                         }
1437                 }
1438                 mutex_exit(&d->bd_lock);
1439         }
1440 
1441         return (0);
1442 }
1443 
1444 /*
1445  * Copy data from an mblk_t chain into a buffer. This works for ipnet
1446  * because the dl_ipnetinfo_t is placed in an mblk_t that leads the
1447  * packet itself.
1448  */
1449 static void *
1450 bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
1451 {
1452         const mblk_t *m;
1453         uint_t count;
1454         uchar_t *dst;
1455 
1456         m = src_arg;
1457         dst = dst_arg;
1458         while (len > 0) {
1459                 if (m == NULL)
1460                         panic("bpf_mcpy");
1461                 count = (uint_t)min(M_LEN(m), len);
1462                 (void) memcpy(dst, mtod(m, const void *), count);
1463                 m = m->b_cont;
1464                 dst += count;
1465                 len -= count;
1466         }
1467         return (dst_arg);
1468 }
1469 
1470 /*
 1471  * Dispatch a packet to the listening descriptor d.
 1472  *
 1473  * marg    pointer to the packet, either a data buffer or an mblk_t chain
 1474  * buflen  buffer length, if marg is a data buffer
 1475  * cpfn    a function that can copy marg into the listener's buffer
 1476  * pktlen  length of the packet
 1477  * issent  boolean indicating whether the packet was sent or received
1478  */
1479 static inline void
1480 bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen,
1481     uint_t buflen, boolean_t issent)
1482 {
1483         struct timeval tv;
1484         uint_t slen;
1485 
1486         if (!d->bd_seesent && issent)
1487                 return;
1488 
1489         /*
 1490          * Accuracy of the packet counters in BPF is vital, so even
 1491          * the outer counters are updated with bd_lock held.
1492          */
1493         mutex_enter(&d->bd_lock);
1494         slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
1495         DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif,
1496             struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen);
1497         d->bd_rcount++;
1498         ks_stats.kp_receive.value.ui64++;
1499         if (slen != 0) {
1500                 uniqtime(&tv);
1501                 catchpacket(d, marg, pktlen, slen, cpfn, &tv);
1502         }
1503         mutex_exit(&d->bd_lock);
1504 }
1505 
1506 /*
1507  * Incoming linkage from device drivers.
1508  */
1509 /* ARGSUSED */
1510 void
1511 bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent)
1512 {
1513         cp_fn_t cpfn;
1514         struct bpf_d *d = arg;
1515         uint_t pktlen, buflen;
1516         void *marg;
1517 
1518         pktlen = msgdsize(m);
1519 
1520         if (pktlen == M_LEN(m)) {
1521                 cpfn = (cp_fn_t)memcpy;
1522                 marg = mtod(m, void *);
1523                 buflen = pktlen;
1524         } else {
1525                 cpfn = bpf_mcpy;
1526                 marg = m;
1527                 buflen = 0;
1528         }
1529 
1530         bpf_deliver(d, cpfn, marg, pktlen, buflen, issent);
1531 }
1532 
1533 /*
1534  * Incoming linkage from ipnet.
1535  * In ipnet, there is only one event, NH_OBSERVE, that delivers packets
1536  * from all network interfaces. Thus the tap function needs to apply a
 1537  * filter using the interface index/id to imitate snooping on just the
1538  * specified interface.
1539  */
1540 /* ARGSUSED */
1541 void
1542 bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length)
1543 {
1544         hook_pkt_observe_t *hdr;
1545         struct bpf_d *d = arg;
1546 
1547         hdr = (hook_pkt_observe_t *)m->b_rptr;
1548         if (ntohl(hdr->hpo_ifindex) != d->bd_linkid)
1549                 return;
1550         bpf_deliver(d, bpf_mcpy, m, length, 0, issent);
1551 
1552 }
1553 
1554 /*
1555  * Move the packet data from interface memory (pkt) into the
1556  * store buffer.  Return 1 if it's time to wakeup a listener (buffer full),
 1557  * otherwise 0.  "cpfn" is the routine called to do the actual data
 1558  * transfer.  memcpy is passed in to copy contiguous chunks, while
 1559  * bpf_mcpy is passed in to copy mblk_t chains.  In the latter case,
 1560  * pkt is really an mblk_t.
1561  */
1562 static void
1563 catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
1564     cp_fn_t cpfn, struct timeval *tv)
1565 {
1566         struct bpf_hdr *hp;
1567         int totlen, curlen;
1568         int hdrlen = d->bd_hdrlen;
1569         int do_wakeup = 0;
1570 
1571         ++d->bd_ccount;
1572         ks_stats.kp_capture.value.ui64++;
1573         /*
1574          * Figure out how many bytes to move.  If the packet is
1575          * greater than or equal to the snapshot length, transfer that
1576          * much.  Otherwise, transfer the whole packet (unless
1577          * we hit the buffer size limit).
1578          */
1579         totlen = hdrlen + min(snaplen, pktlen);
1580         if (totlen > d->bd_bufsize)
1581                 totlen = d->bd_bufsize;
1582 
1583         /*
1584          * Round up the end of the previous packet to the next longword.
1585          */
1586         curlen = BPF_WORDALIGN(d->bd_slen);
1587         if (curlen + totlen > d->bd_bufsize) {
1588                 /*
1589                  * This packet will overflow the storage buffer.
1590                  * Rotate the buffers if we can, then wakeup any
1591                  * pending reads.
1592                  */
1593                 if (d->bd_fbuf == 0) {
1594                         /*
1595                          * We haven't completed the previous read yet,
1596                          * so drop the packet.
1597                          */
1598                         ++d->bd_dcount;
1599                         ks_stats.kp_dropped.value.ui64++;
1600                         return;
1601                 }
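                /*
                 * ROTATE_BUFFERS() makes the current store buffer the new
                 * hold buffer and installs the free buffer as the store
                 * buffer for subsequent packets.
                 */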
1602                 ROTATE_BUFFERS(d);
1603                 do_wakeup = 1;
1604                 curlen = 0;
1605         } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
1606                 /*
1607                  * Immediate mode is set, or the read timeout has
1608                  * already expired during a select call.  A packet
1609                  * arrived, so the reader should be woken up.
1610                  */
1611                 do_wakeup = 1;
1612         }
1613 
1614         /*
1615          * Append the bpf header to the existing buffer before we add
1616          * on the actual packet data.
1617          */
1618         hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
1619         hp->bh_tstamp.tv_sec = tv->tv_sec;
1620         hp->bh_tstamp.tv_usec = tv->tv_usec;
1621         hp->bh_datalen = pktlen;
1622         hp->bh_hdrlen = (uint16_t)hdrlen;
1623         /*
1624          * Copy the packet data into the store buffer and update its length.
1625          */
1626         (*cpfn)((uchar_t *)hp + hdrlen, pkt,
1627             (hp->bh_caplen = totlen - hdrlen));
1628         d->bd_slen = curlen + totlen;
1629 
1630         /*
1631          * Call bpf_wakeup after bd_slen has been updated.
1632          */
1633         if (do_wakeup)
1634                 bpf_wakeup(d);
1635 }
1636 
1637 /*
1638  * Allocate the initial free and store buffers for a descriptor.
1639  */
1640 static int
1641 bpf_allocbufs(struct bpf_d *d)
1642 {
1643 
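        /*
         * Allocate with KM_NOSLEEP so that a memory shortage is reported
         * to the caller as ENOBUFS instead of blocking here.
         */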
1644         d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1645         if (!d->bd_fbuf)
1646                 return (ENOBUFS);
1647         d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1648         if (!d->bd_sbuf) {
1649                 kmem_free(d->bd_fbuf, d->bd_bufsize);
1650                 return (ENOBUFS);
1651         }
1652         d->bd_slen = 0;
1653         d->bd_hlen = 0;
1654         return (0);
1655 }
1656 
1657 /*
1658  * Free buffers currently in use by a descriptor.
1659  * Called on close.
1660  */
1661 static void
1662 bpf_freed(struct bpf_d *d)
1663 {
1664         /*
1665          * At this point the descriptor has been detached from its
1666          * interface but has not yet been marked free.
1667          */
1668         if (d->bd_sbuf != 0) {
1669                 kmem_free(d->bd_sbuf, d->bd_bufsize);
1670                 if (d->bd_hbuf != 0)
1671                         kmem_free(d->bd_hbuf, d->bd_bufsize);
1672                 if (d->bd_fbuf != 0)
1673                         kmem_free(d->bd_fbuf, d->bd_bufsize);
1674         }
1675         if (d->bd_filter)
1676                 kmem_free(d->bd_filter, d->bd_filter_size);
1677 }
1678 
1679 /*
1680  * Get the list of available data link types for the interface.
1681  */
1682 static int
1683 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp)
1684 {
1685         bpf_provider_list_t *bp;
1686         bpf_provider_t *bpr;
1687         zoneid_t zoneid;
1688         uintptr_t mcip;
1689         uint_t nicdlt;
1690         uintptr_t mh;
1691         int error;
1692         int n;
1693 
1694         n = 0;
1695         mh = 0;
1696         mcip = 0;
1697         error = 0;
1698         mutex_enter(&d->bd_lock);
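        /*
         * Walk every registered BPF provider, asking each in turn whether
         * it knows about bd_ifname and, if so, which zone and DLT the
         * link has.  Providers that cannot open the link are skipped via
         * the "next" label.
         */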
1699         LIST_FOREACH(bp, &bpf_providers, bpl_next) {
1700                 bpr = bp->bpl_what;
1701                 error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone);
1702                 if (error != 0)
1703                         goto next;
1704                 error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
1705                 if (error != 0)
1706                         goto next;
1707                 error = MBPF_GET_ZONE(bpr, mh, &zoneid);
1708                 if (error != 0)
1709                         goto next;
1710                 if (d->bd_zone != GLOBAL_ZONEID &&
1711                     d->bd_zone != zoneid)
1712                         goto next;
1713                 error = MBPF_GET_DLT(bpr, mh, &nicdlt);
1714                 if (error != 0)
1715                         goto next;
1716                 nicdlt = bpf_dl_to_dlt(nicdlt);
1717                 if (listp->bfl_list != NULL) {
1718                         if (n >= listp->bfl_len) {
1719                                 MBPF_CLIENT_CLOSE(bpr, mcip);
1720                                 MBPF_CLOSE(bpr, mh);
1721                                 break;
1722                         }
1723                         /*
1724                          * Bumping bd_inuse ensures the structure does not
1725                          * disappear while bd_lock is dropped for the copyout
1726                          * and allows the loop to continue afterwards.
1727                          */
1728                         d->bd_inuse++;
1729                         mutex_exit(&d->bd_lock);
1730                         if (copyout(&nicdlt,
1731                             listp->bfl_list + n, sizeof (uint_t)) != 0)
1732                                 error = EFAULT;
1733                         mutex_enter(&d->bd_lock);
1734                         d->bd_inuse--;
1735                         if (error != 0)
1736                                 break;
1737                 }
1738                 n++;
1739 next:
1740                 if (mcip != 0) {
1741                         MBPF_CLIENT_CLOSE(bpr, mcip);
1742                         mcip = 0;
1743                 }
1744                 if (mh != 0) {
1745                         MBPF_CLOSE(bpr, mh);
1746                         mh = 0;
1747                 }
1748         }
1749         mutex_exit(&d->bd_lock);
1750 
1751         /*
1752          * It is quite possible that one or more providers to BPF may not
1753          * know about a link name whilst others do. In that case, so long
1754          * as we have one success, do not declare an error unless it was
1755          * an EFAULT as this indicates a problem that needs to be reported.
1756          */
1757         if ((error != EFAULT) && (n > 0))
1758                 error = 0;
1759 
1760         listp->bfl_len = n;
1761         return (error);
1762 }
1763 
1764 /*
1765  * Set the data link type of a BPF instance.
1766  */
1767 static int
1768 bpf_setdlt(struct bpf_d *d, void *addr)
1769 {
1770         char ifname[LIFNAMSIZ+1];
1771         zoneid_t niczone;
1772         int error;
1773         int dlt;
1774 
1775         if (copyin(addr, &dlt, sizeof (dlt)) != 0)
1776                 return (EFAULT);
1777 
1778         mutex_enter(&d->bd_lock);
1779 
1780         if (d->bd_bif == 0) {                        /* Interface not set */
1781                 mutex_exit(&d->bd_lock);
1782                 return (EINVAL);
1783         }
1784         if (d->bd_dlt == dlt) {      /* no-op */
1785                 mutex_exit(&d->bd_lock);
1786                 return (0);
1787         }
1788 
1789         error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone);
1790         if (error != 0) {
1791                 mutex_exit(&d->bd_lock);
1792                 return (error);
1793         }
1794 
1795         /*
1796          * See the matrix at the top of the file for the permissions table
1797          * enforced by this driver.
1798          */
1799         if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) &&
1800             (niczone != d->bd_zone)) {
1801                 mutex_exit(&d->bd_lock);
1802                 return (EINVAL);
1803         }
1804 
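        /*
         * Changing the DLT is done by detaching from the current provider
         * and reattaching to the same link name with the new DLT, after
         * which reset_d() discards any previously captured state.
         */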
1805         (void) strlcpy(ifname, d->bd_ifname, sizeof (ifname));
1806         d->bd_inuse = -1;
1807         bpf_detachd(d);
1808         error = bpf_attachd(d, ifname, dlt);
1809         reset_d(d);
1810         d->bd_inuse = 0;
1811 
1812         mutex_exit(&d->bd_lock);
1813         return (error);
1814 }
1815 
1816 /*
1817  * bpf_clear_timeout is called with the bd_lock mutex held, which gives
1818  * it the protection needed to retrieve and clear bd_callout, but the
1819  * lock is not held for the entire duration of the call; see below.
1820  */
1821 static void
1822 bpf_clear_timeout(struct bpf_d *d)
1823 {
1824         timeout_id_t tid = d->bd_callout;
1825         d->bd_callout = 0;
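        /*
         * Bump bd_inuse so that the descriptor is not torn down while
         * bd_lock is dropped around the untimeout() call below.
         */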
1826         d->bd_inuse++;
1827 
1828         /*
1829          * If the timeout has fired and bpf_timed_out is blocked on
1830          * bd_lock, calling untimeout with bd_lock held would deadlock:
1831          * untimeout waits for bpf_timed_out to finish, which it never could.
1832          */
1833         if (tid != 0) {
1834                 mutex_exit(&d->bd_lock);
1835                 (void) untimeout(tid);
1836                 mutex_enter(&d->bd_lock);
1837         }
1838 
1839         d->bd_inuse--;
1840 }
1841 
1842 /*
1843  * As a cloning device driver, BPF needs to keep track of which device
1844  * numbers are in use and which ones are not. A hash table, indexed by
1845  * the minor device number, is used to store the pointers to the
1846  * individual descriptors that are allocated in bpfopen().
1847  * The functions below present the interface for that hash table to
1848  * the rest of the driver.
1849  */
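/*
 * A sketch of the expected lifecycle: bpfopen() picks a free minor number,
 * records it in bd_dev and calls bpf_dev_add(); other entry points map the
 * minor number they are handed back to a descriptor with bpf_dev_get() or
 * bpf_dev_find(); the close path calls bpf_dev_remove() when the descriptor
 * is torn down.
 */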
1850 static struct bpf_d *
1851 bpf_dev_find(minor_t minor)
1852 {
1853         struct bpf_d *d = NULL;
1854 
1855         (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1856             (mod_hash_val_t *)&d);
1857 
1858         return (d);
1859 }
1860 
1861 static void
1862 bpf_dev_add(struct bpf_d *d)
1863 {
1864         (void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1865             (mod_hash_val_t)d);
1866 }
1867 
1868 static void
1869 bpf_dev_remove(struct bpf_d *d)
1870 {
1871         struct bpf_d *stor;
1872 
1873         (void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1874             (mod_hash_val_t *)&stor);
1875         ASSERT(stor == d);
1876 }
1877 
1878 /*
1879  * bpf_dev_get should only ever be called for a minor number that exists,
1880  * thus there should always be a pointer in the hash table that corresponds
1881  * to it.
1882  */
1883 static struct bpf_d *
1884 bpf_dev_get(minor_t minor)
1885 {
1886         struct bpf_d *d = NULL;
1887 
1888         (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1889             (mod_hash_val_t *)&d);
1890         ASSERT(d != NULL);
1891 
1892         return (d);
1893 }