/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019, Joyent, Inc.
 */

/* IPF oddness for compilation in userland for IPF tests. */
#if defined(KERNEL) || defined(_KERNEL)
#undef KERNEL
#undef _KERNEL
#define	KERNEL	1
#define	_KERNEL	1
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include "netinet/ip_compat.h"
#ifdef USE_INET6
#include <netinet/icmp6.h>
#endif
#include <netinet/tcpip.h>
#include "netinet/ip_fil.h"
#include "netinet/ip_nat.h"
#include "netinet/ip_frag.h"
#include "netinet/ip_state.h"
#include "netinet/ip_proxy.h"
#include "netinet/ip_auth.h"
#include "netinet/ipf_stack.h"
#ifdef IPFILTER_SCAN
#include "netinet/ip_scan.h"
#endif
#ifdef IPFILTER_SYNC
#include "netinet/ip_sync.h"
#endif
#include "netinet/ip_pool.h"
#include "netinet/ip_htable.h"
#ifdef IPFILTER_COMPILED
#include "netinet/ip_rules.h"
#endif
#if defined(_KERNEL)
#include <sys/sunddi.h>
#endif

#include "netinet/ipf_cfw.h"
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/cred.h>
#include <sys/ddi.h>

/*
 * cfw == Cloud Firewall ==> routines for a global-zone data collector about
 * ipf events for SmartOS. The only events CFW cares about are those enforced
 * by global-zone-controlled rulesets.
 *
 * The variable below is mdb-hackable to experiment with turning it on and
 * off. Eventually this will tie into a new ipf (GZ-only) device that flips
 * this on when there is an open instance. It may also consume an fr_flag
 * to have per-rule granularity.
 */
boolean_t ipf_cfwlog_enabled;
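
/*
 * For example (illustrative only, with the usual caveats about poking live
 * kernel state), the knob above can be flipped from the global zone with:
 *
 *	echo 'ipf_cfwlog_enabled/W 1' | mdb -kw
 */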

/*
 * Because ipf's test tools in $SRC/cmd insert all of these files, we need to
 * stub out what we can vs. drag in even more headers and who knows what else.
 */
#ifdef _KERNEL

/*
 * CFW event ring buffer. Remember, this is for ALL ZONES because only a
 * global-zone event-reader will be consuming these. In other words, it's
 * not something to instantiate per-netstack.
 */

/*
 * We may want to get more sophisticated and performant (e.g. per-processor),
 * but for now keep the ring buffer simple and stupid.
 */

/* Must be a power of 2, to be bitmaskable, and must be countable by a uint_t */

#define	IPF_CFW_RING_BUFS	1024
#define	IPF_CFW_RING_MASK	(IPF_CFW_RING_BUFS - 1)

/* Assume C's init-to-zero is sufficient for these types... */
static kmutex_t cfw_ringlock;
static kcondvar_t cfw_ringcv;

static cfwev_t cfw_evring[IPF_CFW_RING_BUFS];
/* If these are equal, we're either empty or full. */
static uint_t cfw_ringstart, cfw_ringend;
static boolean_t cfw_ringfull;	/* Tell the difference here! */
static uint64_t cfw_evreports;
static uint64_t cfw_evdrops;
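
/*
 * A minimal sketch (not compiled in; kept under #if 0 like other illustrative
 * code in this file) of how a reader can tell the "empty" and "full" cases
 * apart, since both have cfw_ringstart == cfw_ringend. The helper name
 * cfw_ring_count is hypothetical. cfw_ringlock must be held.
 */
#if 0
static uint_t
cfw_ring_count(void)
{
	ASSERT(MUTEX_HELD(&cfw_ringlock));
	if (cfw_ringfull)
		return (IPF_CFW_RING_BUFS);
	/* Masked subtraction copes with cfw_ringend having wrapped around. */
	return ((cfw_ringend - cfw_ringstart) & IPF_CFW_RING_MASK);
}
#endif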

/*
 * Place an event in the CFW event ring buffer.
 *
 * For now, be simple and drop the oldest event if we overflow. We may wish to
 * selectively drop older events based on type in the future.
 */
static void
ipf_cfwev_report(cfwev_t *event)
{
	mutex_enter(&cfw_ringlock);
	if (cfw_ringfull) {
		/*
		 * Drop the oldest event: overwrite its slot (the current
		 * start == end position) and advance both indices.
		 */
		cfw_ringstart++;
		cfw_ringstart &= IPF_CFW_RING_MASK;
		DTRACE_PROBE(ipf__cfw__evdrop);
		cfw_evdrops++;
		cfw_evring[cfw_ringend] = *event;
		cfw_ringend++;
		cfw_ringend &= IPF_CFW_RING_MASK;
		ASSERT3U(cfw_ringstart, ==, cfw_ringend);
	} else {
		cfw_evring[cfw_ringend] = *event;
		cfw_ringend++;
		cfw_ringend &= IPF_CFW_RING_MASK;
		cfw_ringfull = (cfw_ringend == cfw_ringstart);
	}
	cfw_evreports++;
	cv_broadcast(&cfw_ringcv);
	mutex_exit(&cfw_ringlock);
}

#if 0
/*
 * Simple event consumer which copies one event from the ring buffer into
 * what's provided. In the future, maybe lock-then-callback, even with a
 * request for multiple events?
 *
 * If there are no events, either cv_wait_sig() or return B_FALSE, depending
 * on "block".
 */
boolean_t
ipf_cfwev_consume(cfwev_t *event, boolean_t block)
{
	mutex_enter(&cfw_ringlock);

	/*
	 * Return B_FALSE if non-block and no data, OR if we receive a signal.
	 */
	while ((cfw_ringstart == cfw_ringend) && !cfw_ringfull) {
		if (!block || !cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) {
			mutex_exit(&cfw_ringlock);
			return (B_FALSE);
		}
	}

	*event = cfw_evring[cfw_ringstart];
	cfw_ringstart++;
	cfw_ringstart &= IPF_CFW_RING_MASK;
	cfw_ringfull = B_FALSE;
	mutex_exit(&cfw_ringlock);
	return (B_TRUE);
}
#endif

/*
 * More sophisticated access to multiple CFW events that can allow copying
 * straight from the ring buffer up to userland. Requires a callback (which
 * could call uiomove() directly, OR copy to a local still-in-kernel buffer)
 * that must do the data copying-out.
 *
 * Callback function is of the form:
 *
 *	uint_t cfw_many_cb(cfwev_t *evptr, uint_t num_avail, void *cbarg);
 *
 * The function must return how many events got consumed, which MUST be <= the
 * number available. The function must ALSO UNDERSTAND that cfw_ringlock is
 * held during this time. The function may be called more than once, if the
 * available buffers wrap-around OR "block" is set and we don't have enough
 * buffers. If any callback returns 0, exit the function with however many
 * were consumed.
 *
 * This function, like the callback, returns the number of events *CONSUMED*.
 */
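
/*
 * To make the contract above concrete, here is a minimal sketch of a
 * conforming callback (not compiled in; the cfw_copybuf_t type and the
 * cfw_copy_cb name are hypothetical and exist only for illustration). It
 * copies events into a flat in-kernel buffer handed in via cbarg and
 * consumes everything offered.
 */
#if 0
typedef struct cfw_copybuf {
	cfwev_t *ccb_buf;	/* Destination array, large enough for all. */
	uint_t ccb_filled;	/* Events copied so far. */
} cfw_copybuf_t;

static uint_t
cfw_copy_cb(cfwev_t *evptr, uint_t num_avail, void *cbarg)
{
	cfw_copybuf_t *ccb = cbarg;

	/* Per the contract above, the ring lock is held while we run. */
	ASSERT(MUTEX_HELD(&cfw_ringlock));
	bcopy(evptr, ccb->ccb_buf + ccb->ccb_filled,
	    num_avail * sizeof (cfwev_t));
	ccb->ccb_filled += num_avail;
	/* Never claim more than num_avail; claiming less means "stop". */
	return (num_avail);
}
#endif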

/*
 * If you wish to attempt to coalesce reads (to reduce the likelihood of one
 * event at a time during high load) change the number of tries below to
 * something not 0. Early experiments set this to 10.
 *
 * The wait between tries is in usecs in cfw_timeout_wait. The pessimal
 * case for this is a timeout_wait-spaced trickle of one event at a time.
 */
int cfw_timeout_tries = 0;
int cfw_timeout_wait = 10000;	/* 10ms wait. */

uint_t
ipf_cfwev_consume_many(uint_t num_requested, boolean_t block,
    cfwmanycb_t cfw_many_cb, void *cbarg)
{
	uint_t consumed = 0, cb_consumed, contig_size;
	int timeout_tries = cfw_timeout_tries;

	mutex_enter(&cfw_ringlock);

	/* Silly reality checks */
	ASSERT3U(cfw_ringstart, <, IPF_CFW_RING_BUFS);
	ASSERT3U(cfw_ringend, <, IPF_CFW_RING_BUFS);

	/*
	 * Can goto here again if caller wants blocking. NOTE that
	 * num_requested may have been decremented and consumed may have been
	 * incremented if we arrive here via a goto after a cv_wait.
	 */
from_the_top:
	if (cfw_ringstart > cfw_ringend || cfw_ringfull)
		contig_size = IPF_CFW_RING_BUFS - cfw_ringstart;
	else if (cfw_ringstart < cfw_ringend)
		contig_size = cfw_ringend - cfw_ringstart;
	else if (block && cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) {
		/* Maybe something to consume now, try again. */
		goto from_the_top;
	} else {
		/* Nothing (more) to consume, return! */
		goto bail;
	}

	ASSERT(contig_size + cfw_ringstart == cfw_ringend ||
	    contig_size + cfw_ringstart == IPF_CFW_RING_BUFS);

	if (num_requested < contig_size)
		contig_size = num_requested;

	cb_consumed = cfw_many_cb(&(cfw_evring[cfw_ringstart]), contig_size,
	    cbarg);
	ASSERT(cb_consumed <= contig_size);
	cfw_ringstart += cb_consumed;
	consumed += cb_consumed;
	cfw_ringfull = (cfw_ringfull && cb_consumed == 0);
	if (cb_consumed < contig_size) {
		/* Caller clearly had a problem. Reality check and bail. */
		ASSERT((cfw_ringstart & IPF_CFW_RING_MASK) == cfw_ringstart);
		goto bail;
	}
	ASSERT(cb_consumed == contig_size);
	cfw_ringstart &= IPF_CFW_RING_MASK;	/* In case of wraparound. */
	num_requested -= contig_size;

	if (num_requested > 0 && cfw_ringstart != cfw_ringend) {
		/* We must have wrapped around the end of the buffer! */
		ASSERT(cfw_ringstart == 0);
		ASSERT(!cfw_ringfull);
		contig_size = cfw_ringend;
		if (num_requested < contig_size)
			contig_size = num_requested;
		cb_consumed = cfw_many_cb(&(cfw_evring[cfw_ringstart]),
		    contig_size, cbarg);
		cfw_ringstart += cb_consumed;
		consumed += cb_consumed;
		if (cb_consumed < contig_size) {
			/*
			 * Caller clearly had a problem. Reality check and
			 * bail.
			 */
			ASSERT(cfw_ringend > cfw_ringstart);
			goto bail;
		}
		ASSERT(cb_consumed == contig_size);
		num_requested -= contig_size;
	}

	ASSERT(consumed > 0);

	if (num_requested > 0 && block && timeout_tries > 0) {
		clock_t delta = drv_usectohz(cfw_timeout_wait);

		timeout_tries--;

		/*
		 * We obtained some of the events we requested, but not all.
		 * There is nothing more to consume right now, so wait *a
		 * little bit* in case more events arrive.
		 */
		switch (cv_reltimedwait_sig(&cfw_ringcv, &cfw_ringlock, delta,
		    TR_CLOCK_TICK)) {
		case 0:
			/* Received signal! Throw out what we have. */
			DTRACE_PROBE1(ipf__cfw__sigdiscard, int, consumed);
			cfw_evdrops += consumed;
			consumed = 0;
			break;
		case -1:
			/* Time reached! Bail with what we got. */
			DTRACE_PROBE(ipf__cfw__timedexpired);
			break;
		default:
			/* Aha! We've got more! */
			DTRACE_PROBE(ipf__cfw__moredata);
			goto from_the_top;
		}
	}

bail:
	mutex_exit(&cfw_ringlock);
	return (consumed);
}

static inline zoneid_t
ifs_to_did(ipf_stack_t *ifs)
{
	if (ifs->ifs_zone_did == 0) {
		zone_t *zone;

		/*
		 * We can't get the zone_did at initialization time because
		 * most zone data isn't readily available then, so cement the
		 * did in place now.
		 */
		ASSERT(ifs->ifs_zone != GLOBAL_ZONEID);
		zone = zone_find_by_id(ifs->ifs_zone);
		if (zone != NULL) {
			ifs->ifs_zone_did = zone->zone_did;
			zone_rele(zone);
		}
		/* Else we are either in shutdown or something weirder. */
	}
	return (ifs->ifs_zone_did);
}

/*
 * ipf_block_cfwlog()
 *
 * Called by fr_check(). Record drop events for a global-zone data collector.
 * Use rest-of-ipf-style names for the parameters.
 */
void
ipf_block_cfwlog(frentry_t *fr, fr_info_t *fin, ipf_stack_t *ifs)
{
	cfwev_t event = {0};

	/*
	 * We need a rule.
	 * Capture failure by using dtrace on this function's entry.
	 * 'ipf_block_cfwlog:entry /arg0 == NULL/ { printf("GOTCHA!\n"); }'
	 */
	if (fr == NULL)
		return;

	event.cfwev_type = CFWEV_BLOCK;
	event.cfwev_length = sizeof (event);
	/*
	 * IPF code elsewhere does the cheesy single-flag check, even though
	 * there are two flags in a rule (one for in, one for out).
	 */
	event.cfwev_direction = (fr->fr_flags & FR_INQUE) ?
	    CFWDIR_IN : CFWDIR_OUT;

	event.cfwev_protocol = fin->fin_p;
	/*
	 * NOTE: fin_*port is in host/native order, and ICMP info is here too.
	 */
	event.cfwev_sport = htons(fin->fin_sport);
	event.cfwev_dport = htons(fin->fin_dport);

	if (fin->fin_v == IPV4_VERSION) {
		IN6_INADDR_TO_V4MAPPED(&fin->fin_src, &event.cfwev_saddr);
		IN6_INADDR_TO_V4MAPPED(&fin->fin_dst, &event.cfwev_daddr);
	} else {
		ASSERT3U(fin->fin_v, ==, IPV6_VERSION);
		event.cfwev_saddr = fin->fin_src6.in6;
		event.cfwev_daddr = fin->fin_dst6.in6;
	}

	/*
	 * uniqtime() is what ipf's GETKTIME() uses.
	 * If cfwev_tstamp needs to be sourced from elsewhere, fix that here.
	 */
	uniqtime(&event.cfwev_tstamp);
	event.cfwev_zonedid = ifs_to_did(ifs);
	ASSERT(fin->fin_rule <= 0xffff);	/* Must fit in uint16_t... */
	event.cfwev_ruleid = fin->fin_rule;
	memcpy(event.cfwev_ruleuuid, fr->fr_uuid, sizeof (uuid_t));

	ipf_cfwev_report(&event);
}

/*
 * ipf_log_cfwlog()
 *
 * Twin of ipstate_log(), but records state events for a global-zone data
 * collector.
 */
void
ipf_log_cfwlog(struct ipstate *is, uint_t type, ipf_stack_t *ifs)
{
	cfwev_t event = {0};

	switch (type) {
	case ISL_NEW:
	case ISL_CLONE:
		event.cfwev_type = CFWEV_BEGIN;
		break;
	case ISL_EXPIRE:
	case ISL_FLUSH:
	case ISL_REMOVE:
	case ISL_KILLED:
	case ISL_ORPHAN:
#if 0
		event.cfwev_type = CFWEV_END;
		break;
#else
		/*
		 * We don't care about session disappearances in CFW logging
		 * for now.
		 */
		return;
#endif
	default:
		event.cfwev_type = CFWEV_BLOCK;
		break;
	}

	/*
	 * IPF code elsewhere does the cheesy single-flag check, even though
	 * there are two flags in a rule (one for in, one for out).
	 */
	event.cfwev_length = sizeof (event);
	ASSERT(is->is_rule != NULL);
	event.cfwev_direction = (is->is_rule->fr_flags & FR_INQUE) ?
	    CFWDIR_IN : CFWDIR_OUT;
	event.cfwev_protocol = is->is_p;
	switch (is->is_p) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		/* NOTE: is_*port is in network order. */
		event.cfwev_sport = is->is_sport;
		event.cfwev_dport = is->is_dport;
		break;
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* Scribble the ICMP type in sport... */
		event.cfwev_sport = is->is_icmp.ici_type;
		break;
	}

	if (is->is_v == IPV4_VERSION) {
		IN6_INADDR_TO_V4MAPPED(&is->is_src.in4, &event.cfwev_saddr);
		IN6_INADDR_TO_V4MAPPED(&is->is_dst.in4, &event.cfwev_daddr);
	} else {
		ASSERT3U(is->is_v, ==, IPV6_VERSION);
		event.cfwev_saddr = is->is_src.in6;
		event.cfwev_daddr = is->is_dst.in6;
	}

	/*
	 * uniqtime() is what ipf's GETKTIME() uses.
	 * If cfwev_tstamp needs to be sourced from elsewhere, fix that here.
	 */
	uniqtime(&event.cfwev_tstamp);
	event.cfwev_zonedid = ifs_to_did(ifs);
	ASSERT(is->is_rulen <= 0xffff);	/* Must fit in uint16_t... */
	event.cfwev_ruleid = is->is_rulen;
	memcpy(event.cfwev_ruleuuid, is->is_uuid, sizeof (uuid_t));

	ipf_cfwev_report(&event);
}

typedef struct uio_error_s {
	struct uio *ue_uio;
	int ue_error;
} uio_error_t;

/* Returning 0 from this callback indicates an error to the caller. */
static uint_t
cfwlog_read_manycb(cfwev_t *evptr, uint_t num_avail, void *cbarg)
{
	uio_error_t *ue = (uio_error_t *)cbarg;

	ASSERT(MUTEX_HELD(&cfw_ringlock));

	if (ue->ue_error != 0)
		return (0);

	ue->ue_error = uiomove((caddr_t)evptr, num_avail * sizeof (*evptr),
	    UIO_READ, ue->ue_uio);
	if (ue->ue_error != 0)
		return (0);

	return (num_avail);
}

/* ARGSUSED */
int
ipf_cfwlog_ioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cp,
    int *rp)
{
	ipfcfwcfg_t cfginfo;
	int error;

	if (cmd != SIOCIPFCFWCFG)
		return (EIO);

	if (crgetzoneid(cp) != GLOBAL_ZONEID)
		return (EACCES);

#ifdef notyet
	error = COPYIN((caddr_t)data, (caddr_t)&cfginfo, sizeof (cfginfo));
	if (error != 0)
		return (EFAULT);
	/* TODO: Resize ring buffer based on cfginfo.ipfcfwc_evringsize. */
#endif

	cfginfo.ipfcfwc_maxevsize = sizeof (cfwev_t);
	cfginfo.ipfcfwc_evringsize = IPF_CFW_RING_BUFS;

	error = COPYOUT((caddr_t)&cfginfo, (caddr_t)data, sizeof (cfginfo));
	if (error != 0)
		return (EFAULT);

	return (0);
}

/* ARGSUSED */
int
ipf_cfwlog_read(dev_t dev, struct uio *uio, cred_t *cp)
{
	uint_t requested, consumed;
	uio_error_t ue = {uio, 0};
	boolean_t block;

	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_resid < sizeof (cfwev_t))
		return (EINVAL);
	/* XXX KEBE ASKS: Check for resid being too big?!? */

	block = ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) == 0);
	requested = uio->uio_resid / sizeof (cfwev_t);
	ASSERT(requested > 0);

	/*
	 * As stated earlier, ipf_cfwev_consume_many() takes a callback.
	 * The callback may be called multiple times before we return.
	 * The callback will execute uiomove().
	 */
	consumed = ipf_cfwev_consume_many(requested, block, cfwlog_read_manycb,
	    &ue);
	ASSERT3U(consumed, <=, requested);
	if (!block && consumed == 0 && ue.ue_error == 0) {
		/* No data available. */
		ue.ue_error = EWOULDBLOCK;
	} else if (ue.ue_error != 0 || (block && consumed == 0)) {
		/* We had a problem... */
		if (ue.ue_error == 0) {
			/* Cover case of cv_wait_sig() receiving a signal. */
			ue.ue_error = EINTR;
		}
		mutex_enter(&cfw_ringlock);
		DTRACE_PROBE1(ipf__cfw__uiodiscard, int, consumed);
		cfw_evdrops += consumed;
		mutex_exit(&cfw_ringlock);
	}
	return (ue.ue_error);
}
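
/*
 * A hedged userland sketch (not compiled here) of how a global-zone consumer
 * might use the config ioctl and read(2) whole cfwev_t records. The device
 * path "/dev/ipfev" and the consume_events() name are assumptions made only
 * for illustration.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

static void
consume_events(void)
{
	ipfcfwcfg_t cfg;
	cfwev_t events[16];
	ssize_t n;
	int fd = open("/dev/ipfev", O_RDONLY);

	if (fd == -1)
		return;
	/* Learn the event size and ring depth this kernel was built with. */
	if (ioctl(fd, SIOCIPFCFWCFG, &cfg) == 0 &&
	    cfg.ipfcfwc_maxevsize <= sizeof (cfwev_t)) {
		/* Reads must be at least one event in size; this is 16. */
		n = read(fd, events, sizeof (events));
		/* On success, n / sizeof (cfwev_t) events are valid. */
	}
	(void) close(fd);
}
#endif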

#else

/* Blank stubs to satisfy userland's test compilations. */

void
ipf_log_cfwlog(struct ipstate *a, uint_t b, ipf_stack_t *c)
{
}

void
ipf_block_cfwlog(frentry_t *a, fr_info_t *b, ipf_stack_t *c)
{
}

#endif	/* _KERNEL */