1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2019, Joyent, Inc.
  14  */
  15 
  16 /* IPF oddness for compilation in userland for IPF tests. */
  17 #if defined(KERNEL) || defined(_KERNEL)
  18 #undef KERNEL
  19 #undef _KERNEL
  20 #define KERNEL  1
  21 #define _KERNEL 1
  22 #endif
  23 
  24 #include <sys/errno.h>
  25 #include <sys/types.h>
  26 #include <sys/param.h>
  27 #include <sys/time.h>
  28 #include <sys/socket.h>
  29 #include <net/if.h>
  30 #include <net/route.h>
  31 #include <netinet/in.h>
  32 #include <netinet/in_systm.h>
  33 #include <netinet/ip.h>
  34 #include <netinet/ip_var.h>
  35 #include <netinet/tcp.h>
  36 #include "netinet/ip_compat.h"
  37 #ifdef  USE_INET6
  38 #include <netinet/icmp6.h>
  39 #endif
  40 #include <netinet/tcpip.h>
  41 #include "netinet/ip_fil.h"
  42 #include "netinet/ip_nat.h"
  43 #include "netinet/ip_frag.h"
  44 #include "netinet/ip_state.h"
  45 #include "netinet/ip_proxy.h"
  46 #include "netinet/ip_auth.h"
  47 #include "netinet/ipf_stack.h"
  48 #ifdef IPFILTER_SCAN
  49 #include "netinet/ip_scan.h"
  50 #endif
  51 #ifdef IPFILTER_SYNC
  52 #include "netinet/ip_sync.h"
  53 #endif
  54 #include "netinet/ip_pool.h"
  55 #include "netinet/ip_htable.h"
  56 #ifdef IPFILTER_COMPILED
  57 #include "netinet/ip_rules.h"
  58 #endif
  59 #if defined(_KERNEL)
  60 #include <sys/sunddi.h>
  61 #endif
  62 
  63 #include "netinet/ipf_cfw.h"
  64 #include <sys/file.h>
  65 #include <sys/uio.h>
  66 #include <sys/cred.h>
  67 #include <sys/ddi.h>
  68 
  69 /*
  70  * cfw == Cloud Firewall ==> routines for a global-zone data collector about
  71  * ipf events for SmartOS.  The only ones that CFW cares about are ones
  72  * enforced by global-zone-controlled rulesets.
  73  *
  74  * The variable below is mdb-hackable to experiment with turning it on and
  75  * off. Eventually this will tie into a new ipf (GZ-only) device that flips
  76  * this on when there is an open instance.  It may also consume an fr_flag
  77  * to have per-rule granularity.
  78  */
boolean_t ipf_cfwlog_enabled;	/* Emit CFW events? Currently toggled via mdb. */
  80 
  81 /*
  82  * Because ipf's test tools in $SRC/cmd insert all of these files, we need to
  83  * stub out what we can vs. drag in even more headers and who knows what else.
  84  */
  85 #ifdef _KERNEL
  86 
  87 /*
  88  * CFW event ring buffer.  Remember, this is for ALL ZONES because only a
  89  * global-zone event-reader will be consuming these.  In other words, it's
  90  * not something to instantiate per-netstack.
  91  */
  92 
  93 /*
  94  * We may want to get more sophisticated and performant (e.g. per-processor),
  95  * but for now keep the ring buffer simple and stupid.
  96  */
  97 
  98 /* Must be a power of 2, to be bitmaskable, and must be countable by a uint_t */
  99 
#define IPF_CFW_RING_BUFS       1024
#define IPF_CFW_RING_MASK (IPF_CFW_RING_BUFS - 1)

/* Assume C's init-to-zero is sufficient for these types... */
static kmutex_t cfw_ringlock;	/* Protects everything below. */
static kcondvar_t cfw_ringcv;	/* Signalled whenever an event is reported. */

static cfwev_t cfw_evring[IPF_CFW_RING_BUFS];
/* If these are equal, we're either empty or full. */
static uint_t cfw_ringstart, cfw_ringend;	/* Next read / next write slot. */
static boolean_t cfw_ringfull;  /* Tell the difference here! */
static uint64_t cfw_evreports;	/* Total events ever reported. */
static uint64_t cfw_evdrops;	/* Events lost to overflow/signal/uio error. */
 113 
 114 /*
 115  * Place an event in the CFW event ring buffer.
 116  *
 117  * For now, be simple and drop the oldest event if we overflow. We may wish to
 118  * selectively drop older events based on type in the future.
 119  */
 120 static void
 121 ipf_cfwev_report(cfwev_t *event)
 122 {
 123         mutex_enter(&cfw_ringlock);
 124         if (cfw_ringfull) {
 125                 cfw_ringstart++;
 126                 cfw_ringstart &= IPF_CFW_RING_MASK;
 127                 cfw_ringend++;
 128                 cfw_ringend &= IPF_CFW_RING_MASK;
 129                 DTRACE_PROBE(ipf__cfw__evdrop);
 130                 cfw_evdrops++;
 131                 cfw_evring[cfw_ringend] = *event;
 132         } else {
 133                 cfw_evring[cfw_ringend] = *event;
 134                 cfw_ringend++;
 135                 cfw_ringend &= IPF_CFW_RING_MASK;
 136                 cfw_ringfull = (cfw_ringend == cfw_ringstart);
 137         }
 138         cfw_evreports++;
 139         cv_broadcast(&cfw_ringcv);
 140         mutex_exit(&cfw_ringlock);
 141 }
 142 
 143 #if 0
 144 /*
 145  * Simple event consumer which copies one event from the ring buffer into
 146  * what's provided.  In the future, maybe lock-then-callback, even with a
 147  * request for multiple events?
 148  *
 149  * If there are no events, either cv_wait() or return B_FALSE, depending on
 150  * "block".
 151  */
boolean_t
ipf_cfwev_consume(cfwev_t *event, boolean_t block)
{
	/* NOTE: currently compiled out via #if 0 — kept as reference code. */
	mutex_enter(&cfw_ringlock);

	/*
	 * Return B_FALSE if non-block and no data, OR if we receive a signal.
	 */
	while ((cfw_ringstart == cfw_ringend) && !cfw_ringfull) {
		if (!block || !cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) {
			mutex_exit(&cfw_ringlock);
			return (B_FALSE);
		}
	}

	/* Copy out the oldest event and advance the read index. */
	*event = cfw_evring[cfw_ringstart];
	cfw_ringstart++;
	cfw_ringstart &= IPF_CFW_RING_MASK;
	/* We just freed a slot, so the ring cannot be full any more. */
	cfw_ringfull = B_FALSE;
	mutex_exit(&cfw_ringlock);
	return (B_TRUE);
}
 174 #endif
 175 
 176 /*
 177  * More sophisticated access to multiple CFW events that can allow copying
 178  * straight from the ring buffer up to userland.  Requires a callback (which
 179  * could call uiomove() directly, OR to a local still-in-kernel buffer) that
 180  * must do the data copying-out.
 181  *
 182  * Callback function is of the form:
 183  *
 *      uint_t cfw_many_cb(cfwev_t *evptr, uint_t num_avail, void *cbarg);
 185  *
 186  * The function must return how many events got consumed, which MUST be <= the
 187  * number available.  The function must ALSO UNDERSTAND that cfw_ringlock is
 188  * held during this time.  The function may be called more than once, if the
 189  * available buffers wrap-around OR "block" is set and we don't have enough
 190  * buffers.  If any callback returns 0, exit the function with however many
 191  * were consumed.
 192  *
 193  * This function, like the callback, returns the number of events *CONSUMED*.
 194  */
 195 
 196 /*
 197  * If you wish to attempt to coalesce reads (to reduce the likelihood of one
 198  * event at a time during high load) change the number of tries below to
 199  * something not 0. Early experiments set this to 10.
 200  *
 201  * The wait between tries is in usecs in cfw_timeout_wait. The pessimal
 202  * case for this is a timeout_wait-spaced trickle of one event at a time.
 203  */
int cfw_timeout_tries = 0;	/* Extra coalescing waits; 0 disables them. */
int cfw_timeout_wait = 10000;   /* 10ms wait. */
 206 
uint_t
ipf_cfwev_consume_many(uint_t num_requested, boolean_t block,
    cfwmanycb_t cfw_many_cb, void *cbarg)
{
        uint_t consumed = 0, cb_consumed, contig_size;
        int timeout_tries = cfw_timeout_tries;

        mutex_enter(&cfw_ringlock);

        /* Silly reality checks */
        ASSERT3U(cfw_ringstart, <, IPF_CFW_RING_BUFS);
        ASSERT3U(cfw_ringend, <, IPF_CFW_RING_BUFS);

        /*
         * Can goto here again if caller wants blocking. NOTE that
         * num_requested may have been decremented and consumed may have been
         * incremented if we arrive here via a goto after a cv_wait.
         */
from_the_top:
        /*
         * Find the largest run of events that is contiguous in the array:
         * a wrapped (or full) ring can only hand out slots up to the end of
         * the array; an unwrapped ring can hand out everything up to
         * cfw_ringend.  An empty ring either blocks or bails.
         */
        if (cfw_ringstart > cfw_ringend || cfw_ringfull)
                contig_size = IPF_CFW_RING_BUFS - cfw_ringstart;
        else if (cfw_ringstart < cfw_ringend)
                contig_size = cfw_ringend - cfw_ringstart;
        else if (block && cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) {
                /* Maybe something to consume now, try again. */
                goto from_the_top;
        } else {
                /* Nothing (more) to consume, return! */
                goto bail;
        }

        ASSERT(contig_size + cfw_ringstart == cfw_ringend ||
            contig_size + cfw_ringstart == IPF_CFW_RING_BUFS);

        if (num_requested < contig_size)
                contig_size = num_requested;

        cb_consumed = cfw_many_cb(&(cfw_evring[cfw_ringstart]), contig_size,
            cbarg);
        ASSERT(cb_consumed <= contig_size);
        cfw_ringstart += cb_consumed;
        consumed += cb_consumed;
        /* Consuming anything at all means the ring is no longer full. */
        cfw_ringfull = (cfw_ringfull && cb_consumed == 0);
        if (cb_consumed < contig_size) {
                /* Caller clearly had a problem. Reality check and bail. */
                ASSERT((cfw_ringstart & IPF_CFW_RING_MASK) == cfw_ringstart);
                goto bail;
        }
        ASSERT(cb_consumed == contig_size);
        cfw_ringstart &= IPF_CFW_RING_MASK; /* In case of wraparound. */
        num_requested -= contig_size;

        if (num_requested > 0 && cfw_ringstart != cfw_ringend) {
                /* We must have wrapped around the end of the buffer! */
                ASSERT(cfw_ringstart == 0);
                ASSERT(!cfw_ringfull);
                /* Second leg: events from slot 0 up to cfw_ringend. */
                contig_size = cfw_ringend;
                if (num_requested < contig_size)
                        contig_size = num_requested;
                cb_consumed = cfw_many_cb(&(cfw_evring[cfw_ringstart]),
                    contig_size, cbarg);
                cfw_ringstart += cb_consumed;
                consumed += cb_consumed;
                if (cb_consumed < contig_size) {
                        /*
                         * Caller clearly had a problem. Reality check and
                         * bail.
                         */
                        ASSERT(cfw_ringend > cfw_ringstart);
                        goto bail;
                }
                ASSERT(cb_consumed == contig_size);
                num_requested -= contig_size;
        }

        ASSERT(consumed > 0);

        if (num_requested > 0 && block && timeout_tries > 0) {
                clock_t delta = drv_usectohz(cfw_timeout_wait);

                timeout_tries--;

                /*
                 * We obtained some of the events we requested, but not all.
                 * Since we have nothing to consume, wait *a little bit*
                 * longer.
                 */
                switch (cv_reltimedwait_sig(&cfw_ringcv, &cfw_ringlock, delta,
                    TR_CLOCK_TICK)) {
                case 0:
                        /*
                         * Received signal! Throw out what we have; the
                         * already-consumed events are accounted as drops.
                         */
                        DTRACE_PROBE1(ipf__cfw__sigdiscard, int, consumed);
                        cfw_evdrops += consumed;
                        consumed = 0;
                        break;
                case -1:
                        /* Time reached! Bail with what we got. */
                        DTRACE_PROBE(ipf__cfw__timedexpired);
                        break;
                default:
                        /* Aha! We've got more! */
                        DTRACE_PROBE(ipf__cfw__moredata);
                        goto from_the_top;
                }
        }

bail:
        mutex_exit(&cfw_ringlock);
        return (consumed);
}
 317 
 318 static inline zoneid_t
 319 ifs_to_did(ipf_stack_t *ifs)
 320 {
 321         if (ifs->ifs_zone_did == 0) {
 322                 zone_t *zone;
 323 
 324                 /*
 325                  * Because we can't get the zone_did at initialization time
 326                  * because most zone data isn't readily available then,
 327                  * cement the did in place now.
 328                  */
 329                 ASSERT(ifs->ifs_zone != GLOBAL_ZONEID);
 330                 zone = zone_find_by_id(ifs->ifs_zone);
 331                 if (zone != NULL) {
 332                         ifs->ifs_zone_did = zone->zone_did;
 333                         zone_rele(zone);
 334                 }
 335                 /* Else we are either in shutdown or something weirder. */
 336         }
 337         return (ifs->ifs_zone_did);
 338 }
 339 
 340 /*
 341  * ipf_block_cfwlog()
 342  *
 343  * Called by fr_check().  Record drop events for a global-zone data collector.
 344  * Use rest-of-ipf-style names for the parameters.
 345  */
 346 void
 347 ipf_block_cfwlog(frentry_t *fr, fr_info_t *fin, ipf_stack_t *ifs)
 348 {
 349         cfwev_t event = {0};
 350 
 351         /*
 352          * We need a rule.
 353          * Capture failure by using dtrace on this function's entry.
 354          * 'ipf_block_cfwlog:entry /arg0 == NULL/ { printf("GOTCHA!\n"); }'
 355          */
 356         if (fr == NULL)
 357                 return;
 358 
 359         event.cfwev_type = CFWEV_BLOCK;
 360         event.cfwev_length = sizeof (event);
 361         /*
 362          * IPF code elsewhere does the cheesy single-flag check, even thogh
 363          * there are two flags in a rule (one for in, one for out).
 364          */
 365         event.cfwev_direction = (fr->fr_flags & FR_INQUE) ?
 366             CFWDIR_IN : CFWDIR_OUT;
 367 
 368         event.cfwev_protocol = fin->fin_p;
 369         /*
 370          * NOTE: fin_*port is in host/native order, and ICMP info is here too.
 371          */
 372         event.cfwev_sport = htons(fin->fin_sport);
 373         event.cfwev_dport = htons(fin->fin_dport);
 374 
 375         if (fin->fin_v == IPV4_VERSION) {
 376                 IN6_INADDR_TO_V4MAPPED(&fin->fin_src, &event.cfwev_saddr);
 377                 IN6_INADDR_TO_V4MAPPED(&fin->fin_dst, &event.cfwev_daddr);
 378         } else {
 379                 ASSERT3U(fin->fin_v, ==, IPV6_VERSION);
 380                 event.cfwev_saddr = fin->fin_src6.in6;
 381                 event.cfwev_daddr = fin->fin_dst6.in6;
 382         }
 383 
 384         /*
 385          * uniqtime() is what ipf's GETKTIME() uses.
 386          * If cfwev_tstamp needs to be sourced from elsewhere, fix that here.
 387          */
 388         uniqtime(&event.cfwev_tstamp);
 389         event.cfwev_zonedid = ifs_to_did(ifs);
 390         ASSERT(fin->fin_rule <= 0xffff);  /* Must fit in uint16_t... */
 391         event.cfwev_ruleid = fin->fin_rule;
 392         memcpy(event.cfwev_ruleuuid, fr->fr_uuid, sizeof (uuid_t));
 393 
 394         ipf_cfwev_report(&event);
 395 }
 396 
 397 /*
 398  * ipf_log_cfwlog()
 399  *
 400  * Twin of ipstate_log(), but records state events for a global-zone data
 401  * collector.
 402  */
void
ipf_log_cfwlog(struct ipstate *is, uint_t type, ipf_stack_t *ifs)
{
        cfwev_t event = {0};

        /* Map the ipstate_log()-style event type onto a CFW event type. */
        switch (type) {
        case ISL_NEW:
        case ISL_CLONE:
                event.cfwev_type = CFWEV_BEGIN;
                break;
        case ISL_EXPIRE:
        case ISL_FLUSH:
        case ISL_REMOVE:
        case ISL_KILLED:
        case ISL_ORPHAN:
#if 0
                event.cfwev_type = CFWEV_END;
                break;
#else
                /*
                 * We don't care about session disappearances in CFW logging
                 * for now.
                 */
                return;
#endif
        default:
                event.cfwev_type = CFWEV_BLOCK;
                break;
        }

        /*
         * IPF code elsewhere does the cheesy single-flag check, even though
         * there are two flags in a rule (one for in, one for out).
         */
        event.cfwev_length = sizeof (event);
        ASSERT(is->is_rule != NULL);
        event.cfwev_direction = (is->is_rule->fr_flags & FR_INQUE) ?
            CFWDIR_IN : CFWDIR_OUT;
        event.cfwev_protocol = is->is_p;
        switch (is->is_p) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
                /* NOTE: is_*port is in network order. */
                event.cfwev_sport = is->is_sport;
                event.cfwev_dport = is->is_dport;
                break;
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                /* Scribble the ICMP type in sport... */
                event.cfwev_sport = is->is_icmp.ici_type;
                break;
        }

        /* Addresses are stored as IPv6; v4 addresses become v4-mapped. */
        if (is->is_v == IPV4_VERSION) {
                IN6_INADDR_TO_V4MAPPED(&is->is_src.in4, &event.cfwev_saddr);
                IN6_INADDR_TO_V4MAPPED(&is->is_dst.in4, &event.cfwev_daddr);
        } else {
                ASSERT3U(is->is_v, ==, IPV6_VERSION);
                event.cfwev_saddr = is->is_src.in6;
                event.cfwev_daddr = is->is_dst.in6;
        }

        /*
         * uniqtime() is what ipf's GETKTIME() uses.
         * If cfwev_tstamp needs to be sourced from elsewhere, fix that here.
         */
        uniqtime(&event.cfwev_tstamp);
        event.cfwev_zonedid = ifs_to_did(ifs);
        ASSERT(is->is_rulen <= 0xffff);   /* Must fit in uint16_t... */
        event.cfwev_ruleid = is->is_rulen;
        memcpy(event.cfwev_ruleuuid, is->is_uuid, sizeof (uuid_t));

        ipf_cfwev_report(&event);
}
 477 
/*
 * Bundles the destination uio with a sticky error so the many-events
 * callback can stop consuming after the first uiomove() failure.
 */
typedef struct uio_error_s {
        struct uio *ue_uio;	/* Destination for uiomove(). */
        int ue_error;		/* First uiomove() error, or 0. */
} uio_error_t;
 482 
 483 /* Returning 0 means error indication. */
 484 static uint_t
 485 cfwlog_read_manycb(cfwev_t *evptr, uint_t num_avail, void *cbarg)
 486 {
 487         uio_error_t *ue = (uio_error_t *)cbarg;
 488 
 489         ASSERT(MUTEX_HELD(&cfw_ringlock));
 490 
 491         if (ue->ue_error != 0)
 492                 return (0);
 493 
 494         ue->ue_error = uiomove((caddr_t)evptr, num_avail * sizeof (*evptr),
 495             UIO_READ, ue->ue_uio);
 496         if (ue->ue_error != 0)
 497                 return (0);
 498 
 499         return (num_avail);
 500 }
 501 
 502 /* ARGSUSED */
 503 int
 504 ipf_cfwlog_ioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cp,
 505     int *rp)
 506 {
 507         ipfcfwcfg_t cfginfo;
 508         int error;
 509 
 510         if (cmd != SIOCIPFCFWCFG)
 511                 return (EIO);
 512 
 513         if (crgetzoneid(cp) != GLOBAL_ZONEID)
 514                 return (EACCES);
 515 
 516 #ifdef notyet
 517         error = COPYIN((caddr_t)data, (caddr_t)&cfginfo, sizeof (cfginfo));
 518         if (error != 0)
 519                 return (EFAULT);
 520         /* TODO: Resize ring buffer based on cfginfo.ipfcfwc_evringsize. */
 521 #endif
 522 
 523         cfginfo.ipfcfwc_maxevsize = sizeof (cfwev_t);
 524         cfginfo.ipfcfwc_evringsize = IPF_CFW_RING_BUFS;
 525 
 526         error = COPYOUT((caddr_t)&cfginfo, (caddr_t)data, sizeof (cfginfo));
 527         if (error != 0)
 528                 return (EFAULT);
 529 
 530         return (0);
 531 }
 532 
/* ARGSUSED */
int
ipf_cfwlog_read(dev_t dev, struct uio *uio, cred_t *cp)
{
        uint_t requested, consumed;
        uio_error_t ue = {uio, 0};
        boolean_t block;

        /* Reads must be a positive whole number of cfwev_t records. */
        if (uio->uio_resid == 0)
                return (0);
        if (uio->uio_resid < sizeof (cfwev_t))
                return (EINVAL);
        /* XXX KEBE ASKS: Check for resid being too big?!? */

        /* Block only if the file wasn't opened O_NDELAY/O_NONBLOCK. */
        block = ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) == 0);
        requested = uio->uio_resid / sizeof (cfwev_t);
        ASSERT(requested > 0);

        /*
         * As stated earlier, ipf_cfwev_consume_many() takes a callback.
         * The callback may be called multiple times before we return.
         * The callback will execute uiomove().
         */
        consumed = ipf_cfwev_consume_many(requested, block, cfwlog_read_manycb,
            &ue);
        ASSERT3U(consumed, <=, requested);
        if (!block && consumed == 0 && ue.ue_error == 0) {
                /* No data available. */
                ue.ue_error = EWOULDBLOCK;
        } else if (ue.ue_error != 0 || (block && consumed == 0)) {
                /* We had a problem... */
                if (ue.ue_error == 0) {
                        /* Cover case of cv_wait_sig() receiving a signal. */
                        ue.ue_error = EINTR;
                }
                /*
                 * Events consumed from the ring but never delivered to the
                 * caller are gone; account them as drops (under the lock).
                 */
                mutex_enter(&cfw_ringlock);
                DTRACE_PROBE1(ipf__cfw__uiodiscard, int, consumed);
                cfw_evdrops += consumed;
                mutex_exit(&cfw_ringlock);
        }
        return (ue.ue_error);
}
 575 
 576 #else
 577 
 578 /* Blank stubs to satisfy userland's test compilations. */
 579 
/* No-op stub: CFW state-event logging exists only in kernel builds. */
void
ipf_log_cfwlog(struct ipstate *a, uint_t b, ipf_stack_t *c)
{
}
 584 
/* No-op stub: CFW block-event logging exists only in kernel builds. */
void
ipf_block_cfwlog(frentry_t *a, fr_info_t *b, ipf_stack_t *c)
{
}
 589 
 590 #endif  /* _KERNEL */