/* $NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $ */

/*
 * Copyright (c) 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from the Stanford/CMU enet packet filter,
 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
 * Berkeley Laboratory.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)bpf.c	8.4 (Berkeley) 1/9/95
 * static char rcsid[] =
 * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2017 Joyent, Inc.
 */

/*
 * The BPF implements the following access controls for zones attempting
 * to read and write data.  Writing data requires that the net_rawaccess
 * privilege is held, whilst reading data requires either net_rawaccess or
 * net_observability.
 *
 *                              | Shared |  Exclusive |   Global
 * -----------------------------+--------+------------+------------+
 * DLT_IPNET in local zone      |  Read  |    Read    |    Read    |
 * -----------------------------+--------+------------+------------+
 * Raw access to local zone NIC |  None  | Read/Write | Read/Write |
 * -----------------------------+--------+------------+------------+
 * Raw access to all NICs       |  None  |    None    | Read/Write |
 * -----------------------------+--------+------------+------------+
 *
 * The BPF driver is written as a cloning driver: each call to bpfopen()
 * allocates a new minor number.  This provides BPF with a 1:1 relationship
 * between opens and closes.  There is some amount of "descriptor state"
 * that is kept per open.  Pointers to this data are stored in a hash table
 * (bpf_hash) that is indexed by the minor device number for each open file.
 */
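
/*
 * A minimal sketch of the expected userland usage (illustrative only;
 * the device path and "net0" link name are assumptions and error
 * handling is elided):
 *
 *	int fd = open("/dev/bpf", O_RDONLY);		bpfopen(): new minor
 *	struct ifreq ifr;
 *	(void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
 *	(void) ioctl(fd, BIOCSETIF, &ifr);		bpf_setif()
 *	uint_t bsize;
 *	(void) ioctl(fd, BIOCGBLEN, &bsize);
 *	char *buf = malloc(bsize);
 *	ssize_t n = read(fd, buf, bsize);		bpfread()
 *
 * Each open(2) gets a private descriptor, so two consumers never share
 * buffers or filters.
 */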
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/filio.h>
#include <sys/policy.h>
#include <sys/cmn_err.h>
#include <sys/uio.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/zone.h>

#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/poll.h>
#include <sys/dlpi.h>
#include <sys/neti.h>

#include <net/if.h>

#include <net/bpf.h>
#include <net/bpfdesc.h>
#include <net/dlt.h>

#include <netinet/in.h>
#include <sys/mac.h>
#include <sys/mac_client.h>
#include <sys/mac_impl.h>
#include <sys/time_std_impl.h>
#include <sys/hook.h>
#include <sys/hook_event.h>


#define	mtod(_v, _t)	(_t)((_v)->b_rptr)
#define	M_LEN(_m)	((_m)->b_wptr - (_m)->b_rptr)

/*
 * 4096 is too small for FDDI frames.  8192 is too small for gigabit Ethernet
 * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
 */
#define	BPF_BUFSIZE	(32 * 1024)

typedef void *(*cp_fn_t)(void *, const void *, size_t);

/*
 * The default read buffer size, and limit for BIOCSBLEN.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = (16 * 1024 * 1024);
static mod_hash_t *bpf_hash = NULL;

/*
 * Use a mutex to avoid a race condition between gathering the stats/peers
 * and opening/closing the device.
 */
static kcondvar_t bpf_dlt_waiter;
static kmutex_t bpf_mtx;
static bpf_kstats_t ks_stats;
static bpf_kstats_t bpf_kstats = {
	{ "readWait",		KSTAT_DATA_UINT64 },
	{ "writeOk",		KSTAT_DATA_UINT64 },
	{ "writeError",		KSTAT_DATA_UINT64 },
	{ "receive",		KSTAT_DATA_UINT64 },
	{ "captured",		KSTAT_DATA_UINT64 },
	{ "dropped",		KSTAT_DATA_UINT64 },
};
static kstat_t *bpf_ksp;

/*
 * bpf_list is a list of the BPF descriptors currently open
 */
LIST_HEAD(, bpf_d) bpf_list;

static int	bpf_allocbufs(struct bpf_d *);
static void	bpf_clear_timeout(struct bpf_d *);
static void	bpf_deliver(struct bpf_d *, cp_fn_t,
		    void *, uint_t, uint_t, boolean_t);
static void	bpf_freed(struct bpf_d *);
static int	bpf_ifname(struct bpf_d *d, char *, int);
static void	*bpf_mcpy(void *, const void *, size_t);
static int	bpf_attachd(struct bpf_d *, const char *, int);
static void	bpf_detachd(struct bpf_d *);
static int	bpf_setif(struct bpf_d *, char *, int);
static void	bpf_timed_out(void *);
static inline void	bpf_wakeup(struct bpf_d *);
static void	catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
		    cp_fn_t, struct timeval *);
static void	reset_d(struct bpf_d *);
static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int	bpf_setdlt(struct bpf_d *, void *);
static void	bpf_dev_add(struct bpf_d *);
static struct bpf_d *bpf_dev_find(minor_t);
static struct bpf_d *bpf_dev_get(minor_t);
static void	bpf_dev_remove(struct bpf_d *);

static int
bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)
{
	mblk_t *m;
	int error;
	int len;
	int hlen;
	int align;

	/*
	 * Set the link-level header length based on the data link type;
	 * the caller supplies the header as part of the data copied in
	 * below.  In the case of DLT_NULL, there is no header and the
	 * packet is forwarded as is.
	 * Also, we are careful to leave room at the front of the mblk
	 * so that the payload following the link-level header is word
	 * aligned.
	 */
	switch (linktype) {

	case DLT_EN10MB:
		hlen = sizeof (struct ether_header);
		break;

	case DLT_FDDI:
		hlen = 16;
		break;

	case DLT_NULL:
		hlen = 0;
		break;

	case DLT_IPOIB:
		hlen = 44;
		break;

	default:
		return (EIO);
	}

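	/*
	 * "align" pads the front of the mblk so that the payload
	 * following the link-level header lands on a word boundary.
	 * For DLT_EN10MB, for instance, hlen is 14, so align is
	 * 4 - (14 & 3) = 2 and the payload starts at offset 16.
	 * (When hlen is already a multiple of 4, align is 4 rather
	 * than 0, which merely wastes four bytes.)
	 */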
	align = 4 - (hlen & 3);

	len = uio->uio_resid;
	/*
	 * If there aren't enough bytes for a link level header or the
	 * packet length exceeds the interface mtu, return an error.
	 */
	if (len < hlen || len - hlen > mtu)
		return (EMSGSIZE);

	m = allocb(len + align, BPRI_MED);
	if (m == NULL) {
		error = ENOBUFS;
		goto bad;
	}

	/* Ensure the data is properly aligned */
	if (align > 0)
		m->b_rptr += align;
	m->b_wptr = m->b_rptr + len;

	error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
	if (error)
		goto bad;
	*mp = m;
	return (0);

bad:
	if (m != NULL)
		freemsg(m);
	return (error);
}


/*
 * Attach file to the bpf interface, i.e. make d listen on the interface
 * named by ifname.
 */
static int
bpf_attachd(struct bpf_d *d, const char *ifname, int dlt)
{
	bpf_provider_list_t *bp;
	bpf_provider_t *bpr;
	boolean_t zonematch;
	zoneid_t niczone;
	uintptr_t mcip;
	zoneid_t zone;
	uint_t nicdlt;
	uintptr_t mh;
	int hdrlen;
	int error;

	ASSERT(d->bd_bif == NULL);
	ASSERT(d->bd_mcip == NULL);
	zone = d->bd_zone;
	zonematch = B_TRUE;
again:
	mh = 0;
	mcip = 0;
	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
		bpr = bp->bpl_what;
		error = MBPF_OPEN(bpr, ifname, &mh, zone);
		if (error != 0)
			goto next;
		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
		if (error != 0)
			goto next;
		error = MBPF_GET_DLT(bpr, mh, &nicdlt);
		if (error != 0)
			goto next;

		nicdlt = bpf_dl_to_dlt(nicdlt);
		if (dlt != -1 && dlt != nicdlt) {
			error = ENOENT;
			goto next;
		}

		error = MBPF_GET_ZONE(bpr, mh, &niczone);
		if (error != 0)
			goto next;

		DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr,
		    uintptr_t, mh, int, nicdlt, zoneid_t, niczone);

		if (zonematch && niczone != zone) {
			error = ENOENT;
			goto next;
		}
		break;
next:
		if (mcip != 0) {
			MBPF_CLIENT_CLOSE(bpr, mcip);
			mcip = 0;
		}
		if (mh != 0) {
			MBPF_CLOSE(bpr, mh);
			mh = 0;
		}
	}
	if (error != 0) {
		if (zonematch && (zone == GLOBAL_ZONEID)) {
			/*
			 * If we failed to do an exact match for the global
			 * zone using the global zoneid, try again in case
			 * the network interface is owned by a local zone.
			 */
			zonematch = B_FALSE;
			goto again;
		}
		return (error);
	}

	d->bd_mac = *bpr;
	d->bd_mcip = mcip;
	d->bd_bif = mh;
	d->bd_dlt = nicdlt;
	hdrlen = bpf_dl_hdrsize(nicdlt);
	d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;

	(void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip),
	    sizeof (d->bd_ifname));

	(void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid,
	    zone);
	(void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d,
	    &d->bd_promisc_handle, d->bd_promisc_flags);
	return (0);
}

/*
 * Detach a file from its interface.
 */
static void
bpf_detachd(struct bpf_d *d)
{
	uintptr_t mph;
	uintptr_t mch;
	uintptr_t mh;

	ASSERT(d->bd_inuse == -1);
	mch = d->bd_mcip;
	d->bd_mcip = 0;
	mh = d->bd_bif;
	d->bd_bif = 0;

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.  There's no need to take any action
	 * here, that is done when MBPF_PROMISC_REMOVE is used;
	 * bd_promisc is just a local flag to stop promiscuous mode
	 * from being set more than once.
	 */
	if (d->bd_promisc)
		d->bd_promisc = 0;

	/*
	 * Take device out of "promiscuous" mode.  Since we were able to
	 * enter "promiscuous" mode, we should be able to turn it off.
	 * Note, this field stores a pointer used to support both
	 * promiscuous and non-promiscuous callbacks for packets.
	 */
	mph = d->bd_promisc_handle;
	d->bd_promisc_handle = 0;

	/*
	 * The lock has to be dropped here because mac_promisc_remove may
	 * need to wait for mac_promisc_dispatch, which has called into
	 * bpf and catchpacket is waiting for bd_lock...
	 * i.e. mac_promisc_remove() needs to be called with none of the
	 * locks held that are part of the bpf_mtap() call path.
	 */
	mutex_exit(&d->bd_lock);
	if (mph != 0)
		MBPF_PROMISC_REMOVE(&d->bd_mac, mph);

	if (mch != 0)
		MBPF_CLIENT_CLOSE(&d->bd_mac, mch);

	if (mh != 0)
		MBPF_CLOSE(&d->bd_mac, mh);

	/*
	 * This function is called with bd_lock held, so it must also
	 * exit with it held.
	 */
	mutex_enter(&d->bd_lock);
	*d->bd_ifname = '\0';
	(void) memset(&d->bd_mac, 0, sizeof (d->bd_mac));
}


/*
 * bpfilterattach() is called at load time.
 */
int
bpfilterattach(void)
{

	bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
	    mod_hash_null_keydtor);
	if (bpf_hash == NULL)
		return (ENOMEM);

	(void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));

	bpf_ksp = kstat_create("bpf", 0, "global", "misc",
	    KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (bpf_ksp != NULL) {
		bpf_ksp->ks_data = &ks_stats;
		kstat_install(bpf_ksp);
	} else {
		mod_hash_destroy_idhash(bpf_hash);
		bpf_hash = NULL;
		return (EEXIST);
	}

	cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
	mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);

	LIST_INIT(&bpf_list);

	return (0);
}


/*
 * bpfilterdetach() is called at unload time.
 */
int
bpfilterdetach(void)
{

	if (bpf_ksp != NULL) {
		kstat_delete(bpf_ksp);
		bpf_ksp = NULL;
	}

	mod_hash_destroy_idhash(bpf_hash);
	bpf_hash = NULL;

	cv_destroy(&bpf_dlt_waiter);
	mutex_destroy(&bpf_mtx);

	return (0);
}
449
450 /*
451 * Open ethernet device. Clones.
452 */
453 /* ARGSUSED */
454 int
455 bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)
456 {
457 struct bpf_d *d;
458 uint_t dmin;
459
460 /*
461 * The security policy described at the top of this file is
462 * enforced here.
463 */
464 if ((flag & FWRITE) != 0) {
465 if (secpolicy_net_rawaccess(cred) != 0)
466 return (EACCES);
467 }
468
469 if ((flag & FREAD) != 0) {
470 if ((secpolicy_net_observability(cred) != 0) &&
471 (secpolicy_net_rawaccess(cred) != 0))
472 return (EACCES);
473 }
474
475 if ((flag & (FWRITE|FREAD)) == 0)
476 return (ENXIO);
477
478 /*
479 * A structure is allocated per open file in BPF to store settings
480 * such as buffer capture size, provide private buffers, etc.
481 */
482 d = (struct bpf_d *)kmem_zalloc(sizeof (*d), KM_SLEEP);
483 d->bd_bufsize = bpf_bufsize;
484 d->bd_fmode = flag;
485 d->bd_zone = crgetzoneid(cred);
486 d->bd_seesent = 1;
487 d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS|
488 MAC_PROMISC_FLAGS_NO_COPY;
489 mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
490 cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);
491
492 mutex_enter(&bpf_mtx);
493 /*
494 * Find an unused minor number. Obviously this is an O(n) algorithm
495 * and doesn't scale particularly well, so if there are large numbers
496 * of open file descriptors happening in real use, this design may
497 * need to be revisited.
498 */
499 for (dmin = 0; dmin < L_MAXMIN; dmin++)
500 if (bpf_dev_find(dmin) == NULL)
501 break;
502 if (dmin == L_MAXMIN) {
503 mutex_exit(&bpf_mtx);
504 kmem_free(d, sizeof (*d));
505 return (ENXIO);
506 }
507 d->bd_dev = dmin;
508 LIST_INSERT_HEAD(&bpf_list, d, bd_list);
509 bpf_dev_add(d);
510 mutex_exit(&bpf_mtx);
511
512 *devp = makedevice(getmajor(*devp), dmin);
513
514 return (0);
515 }

/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 *
 * Because we only allow a device to be opened once, there is always a
 * 1:1 relationship between opens and closes, which this function relies
 * on.
 */
/* ARGSUSED */
int
bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));

	mutex_enter(&d->bd_lock);

	while (d->bd_inuse != 0) {
		d->bd_waiting++;
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			d->bd_waiting--;
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		d->bd_waiting--;
	}

	d->bd_inuse = -1;
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	d->bd_state = BPF_IDLE;
	if (d->bd_bif)
		bpf_detachd(d);
	mutex_exit(&d->bd_lock);

	mutex_enter(&bpf_mtx);
	LIST_REMOVE(d, bd_list);
	bpf_dev_remove(d);
	mutex_exit(&bpf_mtx);

	mutex_enter(&d->bd_lock);
	mutex_destroy(&d->bd_lock);
	cv_destroy(&d->bd_wait);

	bpf_freed(d);
	kmem_free(d, sizeof (*d));

	return (0);
}

/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 */
#define	ROTATE_BUFFERS(d) \
	(d)->bd_hbuf = (d)->bd_sbuf; \
	(d)->bd_hlen = (d)->bd_slen; \
	(d)->bd_sbuf = (d)->bd_fbuf; \
	(d)->bd_slen = 0; \
	(d)->bd_fbuf = 0;
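
/*
 * The three capture buffers cycle through fixed roles:
 *
 *	free (bd_fbuf) -> store (bd_sbuf) -> hold (bd_hbuf) -> free ...
 *
 * catchpacket() appends to the store buffer; a rotation moves a full
 * store buffer into the hold slot for bpfread() to drain, and bpfread()
 * returns the drained hold buffer to the free slot.
 */
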
/*
 * bpfread - read next chunk of packets from buffers
 */
/* ARGSUSED */
int
bpfread(dev_t dev, struct uio *uio, cred_t *cred)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));
	int timed_out;
	ulong_t delay;
	int error;

	if ((d->bd_fmode & FREAD) == 0)
		return (EBADF);

	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize)
		return (EINVAL);

	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	timed_out = (d->bd_state == BPF_TIMED_OUT);
	d->bd_state = BPF_IDLE;
	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == 0) {
		if (d->bd_nonblock) {
			if (d->bd_slen == 0) {
				mutex_exit(&d->bd_lock);
				return (EWOULDBLOCK);
			}
			ROTATE_BUFFERS(d);
			break;
		}

		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
			/*
			 * One or more packets either arrived since the
			 * previous read or arrived while we were asleep.
			 * Rotate the buffers and return what's here.
			 */
			ROTATE_BUFFERS(d);
			break;
		}
		ks_stats.kp_read_wait.value.ui64++;
		delay = ddi_get_lbolt() + d->bd_rtout;
		error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);
		if (error == 0) {
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		if (error == -1) {
			/*
			 * On a timeout, return what's in the buffer,
			 * which may be nothing.  If there is something
			 * in the store buffer, we can rotate the buffers.
			 */
			if (d->bd_hbuf)
				/*
				 * We filled up the buffer in between
				 * getting the timeout and arriving
				 * here, so we don't need to rotate.
				 */
				break;

			if (d->bd_slen == 0) {
				mutex_exit(&d->bd_lock);
				return (0);
			}
			ROTATE_BUFFERS(d);
		}
	}
	/*
	 * At this point, we know we have something in the hold slot.
	 */
	mutex_exit(&d->bd_lock);

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);

	mutex_enter(&d->bd_lock);
	d->bd_fbuf = d->bd_hbuf;
	d->bd_hbuf = 0;
	d->bd_hlen = 0;
	mutex_exit(&d->bd_lock);
	return (error);
}
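
/*
 * A sketch of how a reader is expected to walk the buffer returned by
 * bpfread() (standard BPF framing; process() is a hypothetical consumer
 * function): each packet is prefixed with a struct bpf_hdr and padded
 * so that the next header is word aligned.
 *
 *	struct bpf_hdr *hp;
 *	for (char *p = buf; p < buf + n; ) {
 *		hp = (struct bpf_hdr *)p;
 *		process(p + hp->bh_hdrlen, hp->bh_caplen);
 *		p += BPF_WORDALIGN(hp->bh_hdrlen + hp->bh_caplen);
 *	}
 */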


/*
 * If there are processes sleeping on this descriptor, wake them up.
 * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
 * so there is no code here grabbing it.
 */
static inline void
bpf_wakeup(struct bpf_d *d)
{
	cv_signal(&d->bd_wait);
}

static void
bpf_timed_out(void *arg)
{
	struct bpf_d *d = arg;

	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING) {
		d->bd_state = BPF_TIMED_OUT;
		if (d->bd_slen != 0)
			cv_signal(&d->bd_wait);
	}
	mutex_exit(&d->bd_lock);
}


/* ARGSUSED */
int
bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));
	uintptr_t mch;
	uint_t mtu;
	mblk_t *m;
	int error;
	int dlt;

	if ((d->bd_fmode & FWRITE) == 0)
		return (EBADF);

	mutex_enter(&d->bd_lock);
	if (d->bd_bif == 0 || d->bd_mcip == 0) {
		mutex_exit(&d->bd_lock);
		return (EINTR);
	}

	if (uio->uio_resid == 0) {
		mutex_exit(&d->bd_lock);
		return (0);
	}

	while (d->bd_inuse < 0) {
		d->bd_waiting++;
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			d->bd_waiting--;
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		d->bd_waiting--;
	}

	mutex_exit(&d->bd_lock);

	dlt = d->bd_dlt;
	mch = d->bd_mcip;
	MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu);
	d->bd_inuse++;

	m = NULL;
	if (dlt == DLT_IPNET) {
		error = EIO;
		goto done;
	}

	error = bpf_movein(uio, dlt, mtu, &m);
	if (error)
		goto done;

	DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt,
	    uint_t, mtu, mblk_t *, m);

	if (M_LEN(m) > mtu) {
		error = EMSGSIZE;
		goto done;
	}

	error = MBPF_TX(&d->bd_mac, mch, m);
	/*
	 * The "tx" action here is required to consume the mblk_t.
	 */
	m = NULL;

done:
	if (error == 0)
		ks_stats.kp_write_ok.value.ui64++;
	else
		ks_stats.kp_write_error.value.ui64++;
	if (m != NULL)
		freemsg(m);

	mutex_enter(&d->bd_lock);
	d->bd_inuse--;
	if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
		cv_signal(&d->bd_wait);
	mutex_exit(&d->bd_lock);

	/*
	 * On the transmit path the mblk is consumed by MBPF_TX(),
	 * so it is only freed here on an error path.
	 */
	return (error);
}


/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.  Must be called with bd_lock held.
 */
static void
reset_d(struct bpf_d *d)
{
	if (d->bd_hbuf) {
		/* Free the hold buffer. */
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = 0;
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	d->bd_rcount = 0;
	d->bd_dcount = 0;
	d->bd_ccount = 0;
}

/*
 * FIONREAD		Check for read packet available.
 * FIONBIO		Set non-blocking I/O.
 * BIOCGBLEN		Get buffer len [for read()].
 * BIOCSBLEN		Set buffer length.
 * BIOCSETF		Set link layer read filter.
 * BIOCFLUSH		Flush read packet buffer.
 * BIOCPROMISC		Put interface into promiscuous mode.
 * BIOCGDLT		Get link layer type.
 * BIOCGDLTLIST		Get list of available link layer types.
 * BIOCSDLT		Set link layer type.
 * BIOCGETIF		Get interface name.
 * BIOCSETIF		Set interface.
 * BIOCSRTIMEOUT	Set read timeout.
 * BIOCGRTIMEOUT	Get read timeout.
 * BIOCGSTATS		Get packet stats.
 * BIOCIMMEDIATE	Set immediate mode.
 * BIOCVERSION		Get filter language version.
 * BIOCGHDRCMPLT	Get "header already complete" flag.
 * BIOCSHDRCMPLT	Set "header already complete" flag.
 * BIOCGSEESENT		Get "see sent packets" flag.
 * BIOCSSEESENT		Set "see sent packets" flag.
 */
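/*
 * One ordering constraint worth noting: BIOCSBLEN is only honoured
 * before an interface is attached (see the bd_bif check below), because
 * the capture buffers are sized when BIOCSETIF/BIOCSETLIF allocates
 * them, so a consumer wanting a non-default buffer size must issue
 * BIOCSBLEN first.
 */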
/* ARGSUSED */
int
bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));
	struct bpf_program prog;
	struct lifreq lifreq;
	struct ifreq ifreq;
	int error = 0;
	uint_t size;

	/*
	 * Refresh the PID associated with this bpf file.
	 */
	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	d->bd_state = BPF_IDLE;
	mutex_exit(&d->bd_lock);

	switch (cmd) {

	default:
		error = EINVAL;
		break;

	/*
	 * Check for read packet available.
	 */
	case FIONREAD:
	{
		int n;

		mutex_enter(&d->bd_lock);
		n = d->bd_slen;
		if (d->bd_hbuf)
			n += d->bd_hlen;
		mutex_exit(&d->bd_lock);

		*(int *)addr = n;
		break;
	}

	/*
	 * Get buffer len [for read()].
	 */
	case BIOCGBLEN:
		error = copyout(&d->bd_bufsize, (void *)addr,
		    sizeof (d->bd_bufsize));
		break;

	/*
	 * Set buffer length.
	 */
	case BIOCSBLEN:
		if (copyin((void *)addr, &size, sizeof (size)) != 0) {
			error = EFAULT;
			break;
		}

		mutex_enter(&d->bd_lock);
		if (d->bd_bif != 0) {
			error = EINVAL;
		} else {
			if (size > bpf_maxbufsize)
				size = bpf_maxbufsize;
			else if (size < BPF_MINBUFSIZE)
				size = BPF_MINBUFSIZE;

			d->bd_bufsize = size;
		}
		mutex_exit(&d->bd_lock);

		if (error == 0)
			error = copyout(&size, (void *)addr, sizeof (size));
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF:
		if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {
			error = EFAULT;
			break;
		}
		error = bpf_setf(d, &prog);
		break;

	/*
	 * Flush read packet buffer.
	 */
	case BIOCFLUSH:
		mutex_enter(&d->bd_lock);
		reset_d(d);
		mutex_exit(&d->bd_lock);
		break;

	/*
	 * Put interface into promiscuous mode.
	 * This is a one-way ioctl; it is not used to turn promiscuous
	 * mode off.
	 */
	case BIOCPROMISC:
		if (d->bd_bif == 0) {
			/*
			 * No interface attached yet.
			 */
			error = EINVAL;
			break;
		}
		mutex_enter(&d->bd_lock);
		if (d->bd_promisc == 0) {

			if (d->bd_promisc_handle) {
				uintptr_t mph;

				mph = d->bd_promisc_handle;
				d->bd_promisc_handle = 0;

				mutex_exit(&d->bd_lock);
				MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
				mutex_enter(&d->bd_lock);
			}

			d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
			error = MBPF_PROMISC_ADD(&d->bd_mac,
			    d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
			    &d->bd_promisc_handle, d->bd_promisc_flags);
			if (error == 0)
				d->bd_promisc = 1;
		}
		mutex_exit(&d->bd_lock);
		break;

	/*
	 * Get device parameters.
	 */
	case BIOCGDLT:
		if (d->bd_bif == 0)
			error = EINVAL;
		else
			error = copyout(&d->bd_dlt, (void *)addr,
			    sizeof (d->bd_dlt));
		break;

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST:
		if (d->bd_bif == 0) {
			error = EINVAL;
		} else {
			struct bpf_dltlist list;

			if (copyin((void *)addr, &list, sizeof (list)) != 0) {
				error = EFAULT;
				break;
			}
			error = bpf_getdltlist(d, &list);
			if ((error == 0) &&
			    copyout(&list, (void *)addr, sizeof (list)) != 0)
				error = EFAULT;
		}
		break;

	/*
	 * Set device parameters.
	 */
	case BIOCSDLT:
		error = bpf_setdlt(d, (void *)addr);
		break;

	/*
	 * Get interface name.
	 */
	case BIOCGETIF:
		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
		if ((error == 0) &&
		    copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {
			error = EFAULT;
			break;
		}
		break;

	/*
	 * Set interface.
	 */
	case BIOCSETIF:
		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
		break;

	/*
	 * Get interface name (long form).
	 */
	case BIOCGETLIF:
		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_ifname(d, lifreq.lifr_name,
		    sizeof (lifreq.lifr_name));
		if ((error == 0) &&
		    copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {
			error = EFAULT;
			break;
		}
		break;

	/*
	 * Set interface (long form).
	 */
	case BIOCSETLIF:
		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_setif(d, lifreq.lifr_name,
		    sizeof (lifreq.lifr_name));
		break;

#ifdef _SYSCALL32_IMPL
	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT32:
	{
		struct timeval32 tv;

		if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
			error = EFAULT;
			break;
		}

		/* Convert the timeout in microseconds to ticks */
		d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
		    tv.tv_usec);
		if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
			d->bd_rtout = 1;
		break;
	}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT32:
	{
		struct timeval32 tv;
		clock_t usecs;

		usecs = drv_hztousec(d->bd_rtout);
		tv.tv_sec = usecs / 1000000;
		tv.tv_usec = usecs - (tv.tv_sec * 1000000);
		error = copyout(&tv, (void *)addr, sizeof (tv));
		break;
	}

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST32:
		if (d->bd_bif == 0) {
			error = EINVAL;
		} else {
			struct bpf_dltlist32 lst32;
			struct bpf_dltlist list;

			if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {
				error = EFAULT;
				break;
			}

			list.bfl_len = lst32.bfl_len;
			list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
			error = bpf_getdltlist(d, &list);
			if (error == 0) {
				lst32.bfl_len = list.bfl_len;

				if (copyout(&lst32, (void *)addr,
				    sizeof (lst32)) != 0)
					error = EFAULT;
			}
		}
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF32: {
		struct bpf_program32 prog32;

		if (ddi_copyin((void *)addr, &prog32, sizeof (prog32), mode)) {
			error = EFAULT;
			break;
		}
		prog.bf_len = prog32.bf_len;
		prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
		error = bpf_setf(d, &prog);
		break;
	}
#endif

	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT:
	{
		struct timeval tv;

		if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
			error = EFAULT;
			break;
		}

		/* Convert the timeout in microseconds to ticks */
		d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
		    tv.tv_usec);
		if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
			d->bd_rtout = 1;
		break;
	}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT:
	{
		struct timeval tv;
		clock_t usecs;

		usecs = drv_hztousec(d->bd_rtout);
		tv.tv_sec = usecs / 1000000;
		tv.tv_usec = usecs - (tv.tv_sec * 1000000);
		if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)
			error = EFAULT;
		break;
	}

	/*
	 * Get packet stats.
	 */
	case BIOCGSTATS:
	{
		struct bpf_stat bs;

		bs.bs_recv = d->bd_rcount;
		bs.bs_drop = d->bd_dcount;
		bs.bs_capt = d->bd_ccount;
		if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)
			error = EFAULT;
		break;
	}

	/*
	 * Set immediate mode.
	 */
	case BIOCIMMEDIATE:
		if (copyin((void *)addr, &d->bd_immediate,
		    sizeof (d->bd_immediate)) != 0)
			error = EFAULT;
		break;

	case BIOCVERSION:
	{
		struct bpf_version bv;

		bv.bv_major = BPF_MAJOR_VERSION;
		bv.bv_minor = BPF_MINOR_VERSION;
		if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)
			error = EFAULT;
		break;
	}

	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
		if (copyout(&d->bd_hdrcmplt, (void *)addr,
		    sizeof (d->bd_hdrcmplt)) != 0)
			error = EFAULT;
		break;

	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
		if (copyin((void *)addr, &d->bd_hdrcmplt,
		    sizeof (d->bd_hdrcmplt)) != 0)
			error = EFAULT;
		break;

	/*
	 * Get "see sent packets" flag
	 */
	case BIOCGSEESENT:
		if (copyout(&d->bd_seesent, (void *)addr,
		    sizeof (d->bd_seesent)) != 0)
			error = EFAULT;
		break;

	/*
	 * Set "see sent packets" flag
	 */
	case BIOCSSEESENT:
		if (copyin((void *)addr, &d->bd_seesent,
		    sizeof (d->bd_seesent)) != 0)
			error = EFAULT;
		break;

	case FIONBIO:		/* Non-blocking I/O */
		if (copyin((void *)addr, &d->bd_nonblock,
		    sizeof (d->bd_nonblock)) != 0)
			error = EFAULT;
		break;
	}
	return (error);
}

/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  If the new filter is "empty" (has a 0 size), then
 * the result is to just remove and free the existing filter.
 * Returns EINVAL for bogus requests.
 */
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp)
{
	struct bpf_insn *fcode, *old;
	uint_t flen, size;
	size_t oldsize;

	if (fp->bf_insns == 0) {
		if (fp->bf_len != 0)
			return (EINVAL);
		mutex_enter(&d->bd_lock);
		old = d->bd_filter;
		oldsize = d->bd_filter_size;
		d->bd_filter = 0;
		d->bd_filter_size = 0;
		reset_d(d);
		mutex_exit(&d->bd_lock);
		if (old != 0)
			kmem_free(old, oldsize);
		return (0);
	}
	flen = fp->bf_len;
	if (flen > BPF_MAXINSNS)
		return (EINVAL);

	size = flen * sizeof (*fp->bf_insns);
	fcode = kmem_alloc(size, KM_SLEEP);
	if (copyin(fp->bf_insns, fcode, size) != 0) {
		kmem_free(fcode, size);
		return (EFAULT);
	}

	if (bpf_validate(fcode, (int)flen)) {
		mutex_enter(&d->bd_lock);
		old = d->bd_filter;
		oldsize = d->bd_filter_size;
		d->bd_filter = fcode;
		d->bd_filter_size = size;
		reset_d(d);
		mutex_exit(&d->bd_lock);
		if (old != 0)
			kmem_free(old, oldsize);

		return (0);
	}
	kmem_free(fcode, size);
	return (EINVAL);
}
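
/*
 * For illustration, a minimal "IPv4 only" filter of the kind a consumer
 * might load through BIOCSETF over a DLT_EN10MB link (a sketch; real
 * programs are normally generated by libpcap):
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),	 load ethertype
 *		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0x0800, 0, 1),
 *		BPF_STMT(BPF_RET+BPF_K, (uint_t)-1),	 accept whole packet
 *		BPF_STMT(BPF_RET+BPF_K, 0),		 reject
 *	};
 *	struct bpf_program prog = { 4, insns };
 *
 * bpf_validate() above is what rejects malformed programs of this form.
 */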

/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifname.
 * Return an errno or 0.
 */
static int
bpf_setif(struct bpf_d *d, char *ifname, int namesize)
{
	int unit_seen;
	int error = 0;
	char *cp;
	int i;

	/*
	 * Make sure the provided name has a unit number, and default
	 * it to '0' if not specified.
	 * XXX This is ugly ... do this differently?
	 */
	unit_seen = 0;
	cp = ifname;
	cp[namesize - 1] = '\0';	/* sanity */
	while (*cp++)
		if (*cp >= '0' && *cp <= '9')
			unit_seen = 1;
	if (!unit_seen) {
		/* Make sure to leave room for the '\0'. */
		for (i = 0; i < (namesize - 1); ++i) {
			if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
			    (ifname[i] >= 'A' && ifname[i] <= 'Z'))
				continue;
			ifname[i] = '0';
		}
	}

	/*
	 * Make sure that only one call to this function happens at a time
	 * and that we're not interleaving a read/write
	 */
	mutex_enter(&d->bd_lock);
	while (d->bd_inuse != 0) {
		d->bd_waiting++;
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			d->bd_waiting--;
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		d->bd_waiting--;
	}
	d->bd_inuse = -1;
	mutex_exit(&d->bd_lock);

	if (d->bd_sbuf == 0)
		error = bpf_allocbufs(d);

	if (error == 0) {
		mutex_enter(&d->bd_lock);
		if (d->bd_bif)
			/*
			 * Detach if attached to something else.
			 */
			bpf_detachd(d);

		error = bpf_attachd(d, ifname, -1);
		reset_d(d);
		d->bd_inuse = 0;
		if (d->bd_waiting != 0)
			cv_signal(&d->bd_wait);
		mutex_exit(&d->bd_lock);
		return (error);
	}

	mutex_enter(&d->bd_lock);
	d->bd_inuse = 0;
	if (d->bd_waiting != 0)
		cv_signal(&d->bd_wait);
	mutex_exit(&d->bd_lock);

	/*
	 * Try to tickle the mac layer into attaching the device...
	 */
	return (bpf_provider_tickle(ifname, d->bd_zone));
}

/*
 * Copy the interface name to the ifreq.
 */
static int
bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)
{

	mutex_enter(&d->bd_lock);
	if (d->bd_bif == NULL) {
		mutex_exit(&d->bd_lock);
		return (EINVAL);
	}

	(void) strlcpy(buffer, d->bd_ifname, bufsize);
	mutex_exit(&d->bd_lock);

	return (0);
}

/* ARGSUSED */
int
bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));

	/*
	 * Until this driver is modified to issue proper pollwakeup() calls on
	 * its pollhead, edge-triggered polling is not allowed.
	 */
	if (events & POLLET) {
		return (EPERM);
	}

	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * An imitation of the FIONREAD ioctl code.
		 */
		mutex_enter(&d->bd_lock);
		if (d->bd_hlen != 0 ||
		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
		    d->bd_slen != 0)) {
			*reventsp |= events & (POLLIN | POLLRDNORM);
		} else {
			/*
			 * Until the bpf driver has been updated to include
			 * adequate pollwakeup() logic, no pollhead will be
			 * emitted here, preventing the resource from being
			 * cached by poll()/devpoll/epoll.
			 */
			*reventsp = 0;
			/* Start the read timeout if necessary */
			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
				bpf_clear_timeout(d);
				/*
				 * Only allow the timeout to be set once.
				 */
				if (d->bd_callout == 0)
					d->bd_callout = timeout(bpf_timed_out,
					    d, d->bd_rtout);
				d->bd_state = BPF_WAITING;
			}
		}
		mutex_exit(&d->bd_lock);
	}

	return (0);
}

/*
 * Copy data from an mblk_t chain into a buffer.  This works for ipnet
 * because the dl_ipnetinfo_t is placed in an mblk_t that leads the
 * packet itself.
 */
static void *
bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
{
	const mblk_t *m;
	uint_t count;
	uchar_t *dst;

	m = src_arg;
	dst = dst_arg;
	while (len > 0) {
		if (m == NULL)
			panic("bpf_mcpy");
		count = (uint_t)min(M_LEN(m), len);
		(void) memcpy(dst, mtod(m, const void *), count);
		m = m->b_cont;
		dst += count;
		len -= count;
	}
	return (dst_arg);
}

/*
 * Dispatch a packet to the listening descriptor d.
 *
 * marg		pointer to the packet, either a data buffer or an mblk chain
 * buflen	buffer length, if marg is a data buffer
 * cpfn		a function that can copy marg into the listener's buffer
 * pktlen	length of the packet
 * issent	boolean indicating whether the packet was sent or received
 */
static inline void
bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen,
    uint_t buflen, boolean_t issent)
{
	struct timeval tv;
	uint_t slen;

	if (!d->bd_seesent && issent)
		return;

	/*
	 * Accuracy of the packet counters in BPF is vital so it
	 * is important to protect even the outer ones.
	 */
	mutex_enter(&d->bd_lock);
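	/*
	 * bpf_filter() returns the number of bytes to be captured
	 * (the snapshot length): 0 means the filter rejected the
	 * packet, and a NULL filter accepts everything.
	 */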
	slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
	DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif,
	    struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen);
	d->bd_rcount++;
	ks_stats.kp_receive.value.ui64++;
	if (slen != 0) {
		uniqtime(&tv);
		catchpacket(d, marg, pktlen, slen, cpfn, &tv);
	}
	mutex_exit(&d->bd_lock);
}

/*
 * Incoming linkage from device drivers.
 */
/* ARGSUSED */
void
bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent)
{
	cp_fn_t cpfn;
	struct bpf_d *d = arg;
	uint_t pktlen, buflen;
	void *marg;

	pktlen = msgdsize(m);

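	/*
	 * A single-fragment message can be filtered and copied in place
	 * with memcpy(); a multi-fragment chain must go through
	 * bpf_mcpy(), with buflen 0 indicating to bpf_filter() that
	 * marg is an mblk chain rather than a flat buffer.
	 */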
	if (pktlen == M_LEN(m)) {
		cpfn = (cp_fn_t)memcpy;
		marg = mtod(m, void *);
		buflen = pktlen;
	} else {
		cpfn = bpf_mcpy;
		marg = m;
		buflen = 0;
	}

	bpf_deliver(d, cpfn, marg, pktlen, buflen, issent);
}

/*
 * Incoming linkage from ipnet.
 * In ipnet, there is only one event, NH_OBSERVE, that delivers packets
 * from all network interfaces.  Thus the tap function needs to apply a
 * filter using the interface index/id to imitate snooping on just the
 * specified interface.
 */
/* ARGSUSED */
void
bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length)
{
	hook_pkt_observe_t *hdr;
	struct bpf_d *d = arg;

	hdr = (hook_pkt_observe_t *)m->b_rptr;
	if (ntohl(hdr->hpo_ifindex) != d->bd_linkid)
		return;
	bpf_deliver(d, bpf_mcpy, m, length, 0, issent);
}

/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer.  Wake up a pending read if the buffers are rotated.
 * "cpfn" is the routine called to do the actual data transfer:
 * memcpy is passed in to copy contiguous chunks, while bpf_mcpy is
 * passed in to copy mblk chains.  In the latter case, pkt is really
 * an mblk.
 */
static void
catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
    cp_fn_t cpfn, struct timeval *tv)
{
	struct bpf_hdr *hp;
	int totlen, curlen;
	int hdrlen = d->bd_hdrlen;
	int do_wakeup = 0;

	++d->bd_ccount;
	ks_stats.kp_capture.value.ui64++;
	/*
	 * Figure out how many bytes to move.  If the packet is
	 * greater or equal to the snapshot length, transfer that
	 * much.  Otherwise, transfer the whole packet (unless
	 * we hit the buffer size limit).
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;

	/*
	 * Round up the end of the previous packet to the next longword.
	 */
	curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize) {
		/*
		 * This packet will overflow the storage buffer.
		 * Rotate the buffers if we can, then wakeup any
		 * pending reads.
		 */
		if (d->bd_fbuf == 0) {
			/*
			 * We haven't completed the previous read yet,
			 * so drop the packet.
			 */
			++d->bd_dcount;
			ks_stats.kp_dropped.value.ui64++;
			return;
		}
		ROTATE_BUFFERS(d);
		do_wakeup = 1;
		curlen = 0;
	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
		/*
		 * Immediate mode is set, or the read timeout has
		 * already expired during a select call.  A packet
		 * arrived, so the reader should be woken up.
		 */
		do_wakeup = 1;
	}

	/*
	 * Append the bpf header to the existing buffer before we add
	 * on the actual packet data.
	 */
	hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
	hp->bh_tstamp.tv_sec = tv->tv_sec;
	hp->bh_tstamp.tv_usec = tv->tv_usec;
	hp->bh_datalen = pktlen;
	hp->bh_hdrlen = (uint16_t)hdrlen;
	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	(*cpfn)((uchar_t *)hp + hdrlen, pkt,
	    (hp->bh_caplen = totlen - hdrlen));
	d->bd_slen = curlen + totlen;

	/*
	 * Call bpf_wakeup after bd_slen has been updated.
	 */
	if (do_wakeup)
		bpf_wakeup(d);
}

/*
 * Allocate the capture buffers for a descriptor.
 */
static int
bpf_allocbufs(struct bpf_d *d)
{

	d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
	if (!d->bd_fbuf)
		return (ENOBUFS);
	d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
	if (!d->bd_sbuf) {
		kmem_free(d->bd_fbuf, d->bd_bufsize);
		return (ENOBUFS);
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	return (0);
}

/*
 * Free buffers currently in use by a descriptor.
 * Called on close.
 */
static void
bpf_freed(struct bpf_d *d)
{
	/*
	 * At this point the descriptor has been detached from its
	 * interface and it has not yet been marked free.
	 */
	if (d->bd_sbuf != 0) {
		kmem_free(d->bd_sbuf, d->bd_bufsize);
		if (d->bd_hbuf != 0)
			kmem_free(d->bd_hbuf, d->bd_bufsize);
		if (d->bd_fbuf != 0)
			kmem_free(d->bd_fbuf, d->bd_bufsize);
	}
	if (d->bd_filter)
		kmem_free(d->bd_filter, d->bd_filter_size);
}

/*
 * Get a list of the data link types available on the interface.
 */
static int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp)
{
	bpf_provider_list_t *bp;
	bpf_provider_t *bpr;
	zoneid_t zoneid;
	uintptr_t mcip;
	uint_t nicdlt;
	uintptr_t mh;
	int error;
	int n;

	n = 0;
	mh = 0;
	mcip = 0;
	error = 0;
	mutex_enter(&d->bd_lock);
	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
		bpr = bp->bpl_what;
		error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone);
		if (error != 0)
			goto next;
		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
		if (error != 0)
			goto next;
		error = MBPF_GET_ZONE(bpr, mh, &zoneid);
		if (error != 0)
			goto next;
		if (d->bd_zone != GLOBAL_ZONEID &&
		    d->bd_zone != zoneid)
			goto next;
		error = MBPF_GET_DLT(bpr, mh, &nicdlt);
		if (error != 0)
			goto next;
		nicdlt = bpf_dl_to_dlt(nicdlt);
		if (listp->bfl_list != NULL) {
			if (n >= listp->bfl_len) {
				MBPF_CLIENT_CLOSE(bpr, mcip);
				MBPF_CLOSE(bpr, mh);
				break;
			}
			/*
			 * Bumping of bd_inuse ensures the structure does not
			 * disappear while the copyout runs and allows the for
			 * loop to be continued.
			 */
			d->bd_inuse++;
			mutex_exit(&d->bd_lock);
			if (copyout(&nicdlt,
			    listp->bfl_list + n, sizeof (uint_t)) != 0)
				error = EFAULT;
			mutex_enter(&d->bd_lock);
			d->bd_inuse--;
			if (error != 0) {
				MBPF_CLIENT_CLOSE(bpr, mcip);
				MBPF_CLOSE(bpr, mh);
				break;
			}
		}
		n++;
next:
		if (mcip != 0) {
			MBPF_CLIENT_CLOSE(bpr, mcip);
			mcip = 0;
		}
		if (mh != 0) {
			MBPF_CLOSE(bpr, mh);
			mh = 0;
		}
	}
	mutex_exit(&d->bd_lock);

	/*
	 * It is quite possible that one or more providers to BPF may not
	 * know about a link name whilst others do.  In that case, so long
	 * as we have one success, do not declare an error unless it was
	 * an EFAULT, as this indicates a problem that needs to be reported.
	 */
	if ((error != EFAULT) && (n > 0))
		error = 0;

	listp->bfl_len = n;
	return (error);
}

/*
 * Set the data link type of a BPF instance.
 */
static int
bpf_setdlt(struct bpf_d *d, void *addr)
{
	char ifname[LIFNAMSIZ+1];
	zoneid_t niczone;
	int error;
	int dlt;

	if (copyin(addr, &dlt, sizeof (dlt)) != 0)
		return (EFAULT);

	mutex_enter(&d->bd_lock);

	if (d->bd_bif == 0) {			/* Interface not set */
		mutex_exit(&d->bd_lock);
		return (EINVAL);
	}
	if (d->bd_dlt == dlt) {			/* NULL-op */
		mutex_exit(&d->bd_lock);
		return (0);
	}

	error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone);
	if (error != 0) {
		mutex_exit(&d->bd_lock);
		return (error);
	}

	/*
	 * See the matrix at the top of the file for the permissions table
	 * enforced by this driver.
	 */
	if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) &&
	    (niczone != d->bd_zone)) {
		mutex_exit(&d->bd_lock);
		return (EINVAL);
	}

	(void) strlcpy(ifname, d->bd_ifname, sizeof (ifname));
	d->bd_inuse = -1;
	bpf_detachd(d);
	error = bpf_attachd(d, ifname, dlt);
	reset_d(d);
	d->bd_inuse = 0;

	mutex_exit(&d->bd_lock);
	return (error);
}

/*
 * bpf_clear_timeout is called with the bd_lock mutex held, providing it
 * with the necessary protection to retrieve and modify bd_callout but it
 * does not hold the lock for its entire duration... see below...
 */
static void
bpf_clear_timeout(struct bpf_d *d)
{
	timeout_id_t tid = d->bd_callout;
	d->bd_callout = 0;
	d->bd_inuse++;

	/*
	 * If the timeout has fired and is blocked on bd_lock, we could
	 * deadlock here: untimeout() waits for bpf_timed_out() to finish,
	 * and with bd_lock still held it never would.  So drop the lock
	 * around the untimeout() call.
	 */
	if (tid != 0) {
		mutex_exit(&d->bd_lock);
		(void) untimeout(tid);
		mutex_enter(&d->bd_lock);
	}

	d->bd_inuse--;
}

/*
 * As a cloning device driver, BPF needs to keep track of which device
 * numbers are in use and which ones are not.  A hash table, indexed by
 * the minor device number, is used to store the pointers to the
 * individual descriptors that are allocated in bpfopen().
 * The functions below present the interface for that hash table to
 * the rest of the driver.
 */
static struct bpf_d *
bpf_dev_find(minor_t minor)
{
	struct bpf_d *d = NULL;

	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
	    (mod_hash_val_t *)&d);

	return (d);
}

static void
bpf_dev_add(struct bpf_d *d)
{
	(void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
	    (mod_hash_val_t)d);
}

static void
bpf_dev_remove(struct bpf_d *d)
{
	struct bpf_d *stor;

	(void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
	    (mod_hash_val_t *)&stor);
	ASSERT(stor == d);
}

/*
 * bpf_dev_get should only ever be called for a minor number that exists,
 * thus there should always be a pointer in the hash table that corresponds
 * to it.
 */
static struct bpf_d *
bpf_dev_get(minor_t minor)
{
	struct bpf_d *d = NULL;

	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
	    (mod_hash_val_t *)&d);
	ASSERT(d != NULL);

	return (d);
}