1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2019, Joyent, Inc.
14 */
15
16 /* IPF oddness for compilation in userland for IPF tests. */
17 #if defined(KERNEL) || defined(_KERNEL)
18 #undef KERNEL
19 #undef _KERNEL
20 #define KERNEL 1
21 #define _KERNEL 1
22 #endif
23
24 #include <sys/errno.h>
25 #include <sys/types.h>
26 #include <sys/param.h>
27 #include <sys/time.h>
28 #include <sys/socket.h>
29 #include <net/if.h>
30 #include <net/route.h>
31 #include <netinet/in.h>
32 #include <netinet/in_systm.h>
33 #include <netinet/ip.h>
34 #include <netinet/ip_var.h>
35 #include <netinet/tcp.h>
36 #include "netinet/ip_compat.h"
37 #ifdef USE_INET6
38 #include <netinet/icmp6.h>
39 #endif
40 #include <netinet/tcpip.h>
41 #include "netinet/ip_fil.h"
42 #include "netinet/ip_nat.h"
43 #include "netinet/ip_frag.h"
44 #include "netinet/ip_state.h"
45 #include "netinet/ip_proxy.h"
46 #include "netinet/ip_auth.h"
47 #include "netinet/ipf_stack.h"
48 #ifdef IPFILTER_SCAN
49 #include "netinet/ip_scan.h"
50 #endif
51 #ifdef IPFILTER_SYNC
52 #include "netinet/ip_sync.h"
53 #endif
54 #include "netinet/ip_pool.h"
55 #include "netinet/ip_htable.h"
56 #ifdef IPFILTER_COMPILED
57 #include "netinet/ip_rules.h"
58 #endif
59 #if defined(_KERNEL)
60 #include <sys/sunddi.h>
61 #endif
62
63 #include "netinet/ipf_cfw.h"
64 #include <sys/file.h>
65 #include <sys/uio.h>
66 #include <sys/cred.h>
67 #include <sys/ddi.h>
68
/*
 * cfw == Cloud Firewall ==> routines for a global-zone data collector about
 * ipf events for SmartOS.  The only ones that CFW cares about are ones
 * enforced by global-zone-controlled rulesets.
 *
 * The variable below is mdb-hackable to experiment with turning it on and
 * off.  Eventually this will tie into a new ipf (GZ-only) device that flips
 * this on when there is an open instance.  It may also consume an fr_flag
 * to have per-rule granularity.
 *
 * It has no initializer, so as a file-scope object it starts out zeroed.
 */
boolean_t ipf_cfwlog_enabled;
80
81 /*
82 * Because ipf's test tools in $SRC/cmd insert all of these files, we need to
83 * stub out what we can vs. drag in even more headers and who knows what else.
84 */
85 #ifdef _KERNEL
86
87 /*
88 * CFW event ring buffer. Remember, this is for ALL ZONES because only a
89 * global-zone event-reader will be consuming these. In other words, it's
90 * not something to instantiate per-netstack.
91 */
92
93 /*
94 * We may want to get more sophisticated and performant (e.g. per-processor),
95 * but for now keep the ring buffer simple and stupid.
96 */
97
98 /* Must be a power of 2, to be bitmaskable, and must be countable by a uint_t */
99
100 #define IPF_CFW_DEFAULT_RING_BUFS 1024
101 #define IPF_CFW_MIN_RING_BUFS 8
102 #define IPF_CFW_MAX_RING_BUFS (1U << 31U)
103
104 /* Assume C's init-to-zero is sufficient for these types... */
105 static kmutex_t cfw_ringlock;
106 static kcondvar_t cfw_ringcv;
107
108 static cfwev_t *cfw_ring; /* NULL by default. */
109 static uint32_t cfw_ringsize; /* 0 by default, number of array elements. */
110 static uint32_t cfw_ringmask; /* 0 by default. */
111
112 /* If these are equal, we're either empty or full. */
113 static uint_t cfw_ringstart, cfw_ringend;
114 static boolean_t cfw_ringfull; /* Tell the difference here! */
115 static uint64_t cfw_evreports;
116 static uint64_t cfw_evdrops;
117
118 /*
119 * Place an event in the CFW event ring buffer.
120 *
121 * For now, be simple and drop the oldest event if we overflow. We may wish to
122 * selectively drop older events based on type in the future.
123 */
124 static void
125 ipf_cfwev_report(cfwev_t *event)
126 {
127 mutex_enter(&cfw_ringlock);
128 if (cfw_ringfull) {
129 cfw_ringstart++;
130 cfw_ringstart &= cfw_ringmask;
131 cfw_ringend++;
132 cfw_ringend &= cfw_ringmask;
133 DTRACE_PROBE(ipf__cfw__evdrop);
134 cfw_evdrops++;
135 cfw_ring[cfw_ringend] = *event;
136 } else {
137 cfw_ring[cfw_ringend] = *event;
138 cfw_ringend++;
139 cfw_ringend &= cfw_ringmask;
140 cfw_ringfull = (cfw_ringend == cfw_ringstart);
141 }
142 cfw_evreports++;
143 cv_broadcast(&cfw_ringcv);
144 mutex_exit(&cfw_ringlock);
145 }
146
#if 0
/*
 * Simple event consumer which copies one event from the ring buffer into
 * what's provided.  In the future, maybe lock-then-callback, even with a
 * request for multiple events?
 *
 * If there are no events, either cv_wait() or return B_FALSE, depending on
 * "block".
 *
 * NOTE: This is currently compiled out.  It is kept in step with the ring's
 * run-time-sized index mask (cfw_ringmask) so that it still builds if it is
 * ever re-enabled.
 */
boolean_t
ipf_cfwev_consume(cfwev_t *event, boolean_t block)
{
	mutex_enter(&cfw_ringlock);

	/*
	 * Return B_FALSE if non-block and no data, OR if we receive a signal.
	 */
	while ((cfw_ringstart == cfw_ringend) && !cfw_ringfull) {
		if (!block || !cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) {
			mutex_exit(&cfw_ringlock);
			return (B_FALSE);
		}
	}

	*event = cfw_ring[cfw_ringstart];
	cfw_ringstart++;
	cfw_ringstart &= cfw_ringmask;
	/* Having removed an event, the ring cannot be full any more. */
	cfw_ringfull = B_FALSE;
	mutex_exit(&cfw_ringlock);
	return (B_TRUE);
}
#endif
179
180 /*
181 * More sophisticated access to multiple CFW events that can allow copying
182 * straight from the ring buffer up to userland. Requires a callback (which
183 * could call uiomove() directly, OR to a local still-in-kernel buffer) that
184 * must do the data copying-out.
185 *
186 * Callback function is of the form:
187 *
 *	uint_t cfw_many_cb(cfwev_t *evptr, uint_t num_avail, void *cbarg);
189 *
190 * The function must return how many events got consumed, which MUST be <= the
191 * number available. The function must ALSO UNDERSTAND that cfw_ringlock is
192 * held during this time. The function may be called more than once, if the
193 * available buffers wrap-around OR "block" is set and we don't have enough
194 * buffers. If any callback returns 0, exit the function with however many
195 * were consumed.
196 *
197 * This function, like the callback, returns the number of events *CONSUMED*.
198 */
199
200 /*
201 * If you wish to attempt to coalesce reads (to reduce the likelihood of one
202 * event at a time during high load) change the number of tries below to
203 * something not 0. Early experiments set this to 10.
204 *
205 * The wait between tries is in usecs in cfw_timeout_wait. The pessimal
206 * case for this is a timeout_wait-spaced trickle of one event at a time.
207 */
208 int cfw_timeout_tries = 0;
209 int cfw_timeout_wait = 10000; /* 10ms wait. */
210
211 uint_t
212 ipf_cfwev_consume_many(uint_t num_requested, boolean_t block,
213 cfwmanycb_t cfw_many_cb, void *cbarg)
214 {
215 uint_t consumed = 0, cb_consumed, contig_size;
216 int timeout_tries = cfw_timeout_tries;
217
218 mutex_enter(&cfw_ringlock);
219
220 /* Silly reality checks */
221 ASSERT3U(cfw_ringstart, <, cfw_ringsize);
222 ASSERT3U(cfw_ringend, <, cfw_ringsize);
223
224 /*
225 * Can goto here again if caller wants blocking. NOTE that
226 * num_requested may have been decremented and consumed may have been
227 * incremented if we arrive here via a goto after a cv_wait.
228 */
229 from_the_top:
230 if (cfw_ringstart > cfw_ringend || cfw_ringfull)
231 contig_size = cfw_ringsize - cfw_ringstart;
232 else if (cfw_ringstart < cfw_ringend)
233 contig_size = cfw_ringend - cfw_ringstart;
234 else if (block && cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) {
235 /* Maybe something to consume now, try again. */
236 goto from_the_top;
237 } else {
238 /* Nothing (more) to consume, return! */
239 goto bail;
240 }
241
242 ASSERT(contig_size + cfw_ringstart == cfw_ringend ||
243 contig_size + cfw_ringstart == cfw_ringsize);
244
245 if (num_requested < contig_size)
246 contig_size = num_requested;
247
248 cb_consumed = cfw_many_cb(&(cfw_ring[cfw_ringstart]), contig_size,
249 cbarg);
250 ASSERT(cb_consumed <= contig_size);
251 cfw_ringstart += cb_consumed;
252 consumed += cb_consumed;
253 cfw_ringfull = (cfw_ringfull && cb_consumed == 0);
254 if (cb_consumed < contig_size) {
255 /* Caller clearly had a problem. Reality check and bail. */
256 ASSERT((cfw_ringstart & cfw_ringmask) == cfw_ringstart);
257 goto bail;
258 }
259 ASSERT(cb_consumed == contig_size);
260 cfw_ringstart &= cfw_ringmask; /* In case of wraparound. */
261 num_requested -= contig_size;
262
263 if (num_requested > 0 && cfw_ringstart != cfw_ringend) {
264 /* We must have wrapped around the end of the buffer! */
265 ASSERT(cfw_ringstart == 0);
266 ASSERT(!cfw_ringfull);
267 contig_size = cfw_ringend;
268 if (num_requested < contig_size)
269 contig_size = num_requested;
270 cb_consumed = cfw_many_cb(&(cfw_ring[cfw_ringstart]),
271 contig_size, cbarg);
272 cfw_ringstart += cb_consumed;
273 consumed += cb_consumed;
274 if (cb_consumed < contig_size) {
275 /*
276 * Caller clearly had a problem. Reality check and
277 * bail.
278 */
279 ASSERT(cfw_ringend > cfw_ringstart);
280 goto bail;
281 }
282 ASSERT(cb_consumed == contig_size);
283 num_requested -= contig_size;
284 }
285
286 ASSERT(consumed > 0);
287
288 if (num_requested > 0 && block && timeout_tries > 0) {
289 clock_t delta = drv_usectohz(cfw_timeout_wait);
290
291 timeout_tries--;
292
293 /*
294 * We obtained some of the events we requested, but not all.
295 * Since we have nothing to consume, wait *a little bit*
296 * longer.
297 */
298 switch (cv_reltimedwait_sig(&cfw_ringcv, &cfw_ringlock, delta,
299 TR_CLOCK_TICK)) {
300 case 0:
301 /* Received signal! Throw out what we have. */
302 DTRACE_PROBE1(ipf__cfw__sigdiscard, int, consumed);
303 cfw_evdrops += consumed;
304 consumed = 0;
305 break;
306 case -1:
307 /* Time reached! Bail with what we got. */
308 DTRACE_PROBE(ipf__cfw__timedexpired);
309 break;
310 default:
311 /* Aha! We've got more! */
312 DTRACE_PROBE(ipf__cfw__moredata);
313 goto from_the_top;
314 }
315 }
316
317 bail:
318 mutex_exit(&cfw_ringlock);
319 return (consumed);
320 }
321
322 static inline zoneid_t
323 ifs_to_did(ipf_stack_t *ifs)
324 {
325 if (ifs->ifs_zone_did == 0) {
326 zone_t *zone;
327
328 /*
329 * Because we can't get the zone_did at initialization time
330 * because most zone data isn't readily available then,
331 * cement the did in place now.
332 */
333 ASSERT(ifs->ifs_zone != GLOBAL_ZONEID);
334 zone = zone_find_by_id(ifs->ifs_zone);
335 if (zone != NULL) {
336 ifs->ifs_zone_did = zone->zone_did;
337 zone_rele(zone);
338 }
339 /* Else we are either in shutdown or something weirder. */
340 }
341 return (ifs->ifs_zone_did);
342 }
343
344 /*
345 * ipf_block_cfwlog()
346 *
347 * Called by fr_check(). Record drop events for a global-zone data collector.
348 * Use rest-of-ipf-style names for the parameters.
349 */
350 void
351 ipf_block_cfwlog(frentry_t *fr, fr_info_t *fin, ipf_stack_t *ifs)
352 {
353 cfwev_t event = {0};
354
355 /*
356 * We need a rule.
357 * Capture failure by using dtrace on this function's entry.
358 * 'ipf_block_cfwlog:entry /arg0 == NULL/ { printf("GOTCHA!\n"); }'
359 */
360 if (fr == NULL)
361 return;
362
363 event.cfwev_type = CFWEV_BLOCK;
364 event.cfwev_length = sizeof (event);
365 /*
366 * IPF code elsewhere does the cheesy single-flag check, even thogh
367 * there are two flags in a rule (one for in, one for out).
368 */
369 event.cfwev_direction = (fr->fr_flags & FR_INQUE) ?
370 CFWDIR_IN : CFWDIR_OUT;
371
372 event.cfwev_protocol = fin->fin_p;
373 /*
374 * NOTE: fin_*port is in host/native order, and ICMP info is here too.
375 */
376 event.cfwev_sport = htons(fin->fin_sport);
377 event.cfwev_dport = htons(fin->fin_dport);
378
379 if (fin->fin_v == IPV4_VERSION) {
380 IN6_INADDR_TO_V4MAPPED(&fin->fin_src, &event.cfwev_saddr);
381 IN6_INADDR_TO_V4MAPPED(&fin->fin_dst, &event.cfwev_daddr);
382 } else {
383 ASSERT3U(fin->fin_v, ==, IPV6_VERSION);
384 event.cfwev_saddr = fin->fin_src6.in6;
385 event.cfwev_daddr = fin->fin_dst6.in6;
386 }
387
388 /*
389 * uniqtime() is what ipf's GETKTIME() uses.
390 * If cfwev_tstamp needs to be sourced from elsewhere, fix that here.
391 */
392 uniqtime(&event.cfwev_tstamp);
393 event.cfwev_zonedid = ifs_to_did(ifs);
394 ASSERT(fin->fin_rule <= 0xffff); /* Must fit in uint16_t... */
395 event.cfwev_ruleid = fin->fin_rule;
396 memcpy(event.cfwev_ruleuuid, fr->fr_uuid, sizeof (uuid_t));
397
398 ipf_cfwev_report(&event);
399 }
400
401 /*
402 * ipf_log_cfwlog()
403 *
404 * Twin of ipstate_log(), but records state events for a global-zone data
405 * collector.
406 */
407 void
408 ipf_log_cfwlog(struct ipstate *is, uint_t type, ipf_stack_t *ifs)
409 {
410 cfwev_t event = {0};
411
412 switch (type) {
413 case ISL_NEW:
414 case ISL_CLONE:
415 event.cfwev_type = CFWEV_BEGIN;
416 break;
417 case ISL_EXPIRE:
418 case ISL_FLUSH:
419 case ISL_REMOVE:
420 case ISL_KILLED:
421 case ISL_ORPHAN:
422 #if 0
423 event.cfwev_type = CFWEV_END;
424 break;
425 #else
426 /*
427 * We don't care about session disappearances in CFW logging
428 * for now.
429 */
430 return;
431 #endif
432 default:
433 event.cfwev_type = CFWEV_BLOCK;
434 break;
435 }
436
437 /*
438 * IPF code elsewhere does the cheesy single-flag check, even thogh
439 * there are two flags in a rule (one for in, one for out).
440 */
441 event.cfwev_length = sizeof (event);
442 ASSERT(is->is_rule != NULL);
443 event.cfwev_direction = (is->is_rule->fr_flags & FR_INQUE) ?
444 CFWDIR_IN : CFWDIR_OUT;
445 event.cfwev_protocol = is->is_p;
446 switch (is->is_p) {
447 case IPPROTO_TCP:
448 case IPPROTO_UDP:
449 /* NOTE: is_*port is in network order. */
450 event.cfwev_sport = is->is_sport;
451 event.cfwev_dport = is->is_dport;
452 break;
453 case IPPROTO_ICMP:
454 case IPPROTO_ICMPV6:
455 /* Scribble the ICMP type in sport... */
456 event.cfwev_sport = is->is_icmp.ici_type;
457 break;
458 }
459
460 if (is->is_v == IPV4_VERSION) {
461 IN6_INADDR_TO_V4MAPPED(&is->is_src.in4, &event.cfwev_saddr);
462 IN6_INADDR_TO_V4MAPPED(&is->is_dst.in4, &event.cfwev_daddr);
463 } else {
464 ASSERT3U(is->is_v, ==, IPV6_VERSION);
465 event.cfwev_saddr = is->is_src.in6;
466 event.cfwev_daddr = is->is_dst.in6;
467 }
468
469 /*
470 * uniqtime() is what ipf's GETKTIME() uses.
471 * If cfwev_tstamp needs to be sourced from elsewhere, fix that here.
472 */
473 uniqtime(&event.cfwev_tstamp);
474 event.cfwev_zonedid = ifs_to_did(ifs);
475 ASSERT(is->is_rulen <= 0xffff); /* Must fit in uint16_t... */
476 event.cfwev_ruleid = is->is_rulen;
477 memcpy(event.cfwev_ruleuuid, is->is_uuid, sizeof (uuid_t));
478
479 ipf_cfwev_report(&event);
480 }
481
/*
 * Callback argument used when reading events out through uiomove(): the
 * uio to copy into, plus the error (if any) that uiomove() returned.
 */
typedef struct uio_error_s {
	struct uio *ue_uio;	/* Destination handed to uiomove(). */
	int ue_error;		/* 0, or the errno from a failed uiomove(). */
} uio_error_t;
486
487 /* Returning 0 means error indication. */
488 static uint_t
489 cfwlog_read_manycb(cfwev_t *evptr, uint_t num_avail, void *cbarg)
490 {
491 uio_error_t *ue = (uio_error_t *)cbarg;
492
493 ASSERT(MUTEX_HELD(&cfw_ringlock));
494
495 if (ue->ue_error != 0)
496 return (0);
497
498 ue->ue_error = uiomove((caddr_t)evptr, num_avail * sizeof (*evptr),
499 UIO_READ, ue->ue_uio);
500 if (ue->ue_error != 0)
501 return (0);
502
503 return (num_avail);
504 }
505
506 int
507 ipf_cfw_ring_resize(uint32_t newsize)
508 {
509 ASSERT(MUTEX_HELD(&cfw_ringlock) || newsize == IPF_CFW_RING_ALLOCATE ||
510 newsize == IPF_CFW_RING_DESTROY);
511
512 if (newsize == IPF_CFW_RING_ALLOCATE) {
513 if (cfw_ring != NULL)
514 return (EBUSY);
515 newsize = IPF_CFW_DEFAULT_RING_BUFS;
516 /* Fall through to allocating a new ring buffer. */
517 } else {
518 /* We may be called during error cleanup, so be liberal here. */
519 if (cfw_ring == NULL && newsize == IPF_CFW_RING_DESTROY)
520 return (0);
521 kmem_free(cfw_ring, cfw_ringsize * sizeof (cfwev_t));
522 cfw_ring = NULL;
523 if (cfw_ringfull) {
524 cfw_evdrops += cfw_ringsize;
525 } else if (cfw_ringstart > cfw_ringend) {
526 cfw_evdrops += cfw_ringend +
527 (cfw_ringsize - cfw_ringstart);
528 } else {
529 cfw_evdrops += cfw_ringend - cfw_ringstart;
530 }
531 cfw_ringsize = cfw_ringmask = cfw_ringstart = cfw_ringend = 0;
532 cfw_ringfull = B_FALSE;
533
534 if (newsize == IPF_CFW_RING_DESTROY)
535 return (0);
536 /*
537 * Keep the reports & drops around because if we're just
538 * resizing, we need to know what we lost.
539 */
540 }
541
542 ASSERT(ISP2(newsize));
543 cfw_ring = kmem_alloc(newsize * sizeof (cfwev_t), KM_SLEEP);
544 /* KM_SLEEP means we always succeed. */
545 cfw_ringsize = newsize;
546 cfw_ringmask = cfw_ringsize - 1;
547
548 return (0);
549 }
550
551 /* ARGSUSED */
552 int
553 ipf_cfwlog_ioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cp,
554 int *rp)
555 {
556 ipfcfwcfg_t cfginfo;
557 int error;
558
559 if (cmd != SIOCIPFCFWCFG && cmd != SIOCIPFCFWNEWSZ)
560 return (EIO);
561
562 if (crgetzoneid(cp) != GLOBAL_ZONEID)
563 return (EACCES);
564
565 error = COPYIN((caddr_t)data, (caddr_t)&cfginfo, sizeof (cfginfo));
566 if (error != 0)
567 return (EFAULT);
568
569 cfginfo.ipfcfwc_maxevsize = sizeof (cfwev_t);
570 mutex_enter(&cfw_ringlock);
571 cfginfo.ipfcfwc_evreports = cfw_evreports;
572 cfginfo.ipfcfwc_evdrops = cfw_evdrops;
573 if (cmd == SIOCIPFCFWNEWSZ) {
574 uint32_t newsize = cfginfo.ipfcfwc_evringsize;
575
576 /* Do ioctl parameter checking here, then call the resizer. */
577 if (newsize < IPF_CFW_MIN_RING_BUFS ||
578 newsize > IPF_CFW_MAX_RING_BUFS || !ISP2(newsize)) {
579 error = EINVAL;
580 } else {
581 error = ipf_cfw_ring_resize(cfginfo.ipfcfwc_evringsize);
582 }
583 } else {
584 error = 0;
585 }
586 cfginfo.ipfcfwc_evringsize = cfw_ringsize;
587 mutex_exit(&cfw_ringlock);
588
589 if (error != 0)
590 return (error);
591
592 error = COPYOUT((caddr_t)&cfginfo, (caddr_t)data, sizeof (cfginfo));
593 if (error != 0)
594 return (EFAULT);
595
596 return (0);
597 }
598
599 /* ARGSUSED */
600 int
601 ipf_cfwlog_read(dev_t dev, struct uio *uio, cred_t *cp)
602 {
603 uint_t requested, consumed;
604 uio_error_t ue = {uio, 0};
605 boolean_t block;
606
607 if (uio->uio_resid == 0)
608 return (0);
609 if (uio->uio_resid < sizeof (cfwev_t))
610 return (EINVAL);
611 /* XXX KEBE ASKS: Check for resid being too big?!? */
612
613 block = ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) == 0);
614 requested = uio->uio_resid / sizeof (cfwev_t);
615 ASSERT(requested > 0);
616
617 /*
618 * As stated earlier, ipf_cfwev_consume_many() takes a callback.
619 * The callback may be called multiple times before we return.
620 * The callback will execute uiomove().
621 */
622 consumed = ipf_cfwev_consume_many(requested, block, cfwlog_read_manycb,
623 &ue);
624 ASSERT3U(consumed, <=, requested);
625 if (!block && consumed == 0 && ue.ue_error == 0) {
626 /* No data available. */
627 ue.ue_error = EWOULDBLOCK;
628 } else if (ue.ue_error != 0 || (block && consumed == 0)) {
629 /* We had a problem... */
630 if (ue.ue_error == 0) {
631 /* Cover case of cv_wait_sig() receiving a signal. */
632 ue.ue_error = EINTR;
633 }
634 mutex_enter(&cfw_ringlock);
635 DTRACE_PROBE1(ipf__cfw__uiodiscard, int, consumed);
636 cfw_evdrops += consumed;
637 mutex_exit(&cfw_ringlock);
638 }
639 return (ue.ue_error);
640 }
641
642 #else
643
644 /* Blank stubs to satisfy userland's test compilations. */
645
/* Userland stub: there is no ring to resize, so always report success. */
/* ARGSUSED */
int
ipf_cfw_ring_resize(uint32_t newsize)
{
	return (0);
}
651
652 void
653 ipf_log_cfwlog(struct ipstate *a, uint_t b, ipf_stack_t *c)
654 {
655 }
656
657 void
658 ipf_block_cfwlog(frentry_t *a, fr_info_t *b, ipf_stack_t *c)
659 {
660 }
661
662 #endif /* _KERNEL */