1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2021 Joyent, Inc.
  25  */
  26 
  27 #include <sys/param.h>
  28 #include <sys/thread.h>
  29 #include <sys/cpuvar.h>
  30 #include <sys/inttypes.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/time.h>
  33 #include <sys/ksynch.h>
  34 #include <sys/systm.h>
  35 #include <sys/kcpc.h>
  36 #include <sys/cpc_impl.h>
  37 #include <sys/cpc_pcbe.h>
  38 #include <sys/atomic.h>
  39 #include <sys/sunddi.h>
  40 #include <sys/modctl.h>
  41 #include <sys/sdt.h>
  42 #include <sys/archsystm.h>
  43 #include <sys/promif.h>
  44 #include <sys/x_call.h>
  45 #include <sys/cap_util.h>
  46 #if defined(__x86)
  47 #include <asm/clock.h>
  48 #include <sys/xc_levels.h>
  49 #endif
  50 
/*
 * Hash table of all contexts ever allocated; kcpc_ctx_alloc() inserts,
 * kcpc_ctx_free() removes.  Each bucket has its own lock.
 */
static kmutex_t kcpc_ctx_llock[CPC_HASH_BUCKETS];       /* protects ctx_list */
static kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS];     /* head of list */


krwlock_t       kcpc_cpuctx_lock;       /* lock for 'kcpc_cpuctx' below */
int             kcpc_cpuctx;            /* number of cpu-specific contexts */

int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */

/*
 * These are set when a PCBE module is loaded.
 */
uint_t          cpc_ncounters = 0;
pcbe_ops_t      *pcbe_ops = NULL;

/*
 * Statistics on (mis)behavior
 */
static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */

/*
 * By setting 'kcpc_nullctx_panic' to 1, any overflow interrupts in a thread
 * with no valid context will result in a panic.
 */
static int kcpc_nullctx_panic = 0;

static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
static void kcpc_restore(kcpc_ctx_t *ctx);
static void kcpc_save(kcpc_ctx_t *ctx);
static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
static kcpc_set_t *kcpc_set_create(kcpc_request_t *reqs, int nreqs,
    int set_flags, int kmem_flags);

/*
 * Macros to manipulate context flags. All flag updates should use one of these
 * two macros
 *
 * Flags should be always be updated atomically since some of the updates are
 * not protected by locks.
 */
#define KCPC_CTX_FLAG_SET(ctx, flag) atomic_or_uint(&(ctx)->kc_flags, (flag))
#define KCPC_CTX_FLAG_CLR(ctx, flag) atomic_and_uint(&(ctx)->kc_flags, ~(flag))

/*
 * The IS_HIPIL() macro verifies that the code is executed either from a
 * cross-call or from high-PIL interrupt
 *
 * NOTE(review): on non-DEBUG kernels this expands to nothing, so it is
 * presumably only ever used inside ASSERT() — verify before using it in a
 * plain conditional.
 */
#ifdef DEBUG
#define IS_HIPIL() (getpil() >= XCALL_PIL)
#else
#define IS_HIPIL()
#endif  /* DEBUG */


extern int kcpc_hw_load_pcbe(void);

/*
 * Return value from kcpc_hw_load_pcbe()
 */
static int kcpc_pcbe_error = 0;
 114 
 115 /*
 116  * Perform one-time initialization of kcpc framework.
 117  * This function performs the initialization only the first time it is called.
 118  * It is safe to call it multiple times.
 119  */
 120 int
 121 kcpc_init(void)
 122 {
 123         long hash;
 124         static uint32_t kcpc_initialized = 0;
 125 
 126         /*
 127          * We already tried loading platform pcbe module and failed
 128          */
 129         if (kcpc_pcbe_error != 0)
 130                 return (-1);
 131 
 132         /*
 133          * The kcpc framework should be initialized at most once
 134          */
 135         if (atomic_cas_32(&kcpc_initialized, 0, 1) != 0)
 136                 return (0);
 137 
 138         rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
 139         for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
 140                 mutex_init(&kcpc_ctx_llock[hash],
 141                     NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);
 142 
 143         /*
 144          * Load platform-specific pcbe module
 145          */
 146         kcpc_pcbe_error = kcpc_hw_load_pcbe();
 147 
 148         return (kcpc_pcbe_error == 0 ? 0 : -1);
 149 }
 150 
/*
 * Called by a PCBE module to publish its ops vector and the number of
 * hardware counters it exposes.  The framework reads pcbe_ops without
 * locking thereafter, so registration must happen before any use.
 */
void
kcpc_register_pcbe(pcbe_ops_t *ops)
{
	pcbe_ops = ops;
	cpc_ncounters = pcbe_ops->pcbe_ncounters();
}
 157 
/*
 * Install the DTrace cpc provider's overflow callback; presumably called
 * when the dcpc provider comes online.
 */
void
kcpc_register_dcpc(void (*func)(uint64_t))
{
	dtrace_cpc_fire = func;
}
 163 
/*
 * Remove the DTrace cpc provider's overflow callback.
 */
void
kcpc_unregister_dcpc(void)
{
	dtrace_cpc_fire = NULL;
}
 169 
/*
 * Bind 'set' to the counters of the CPU identified by 'cpuid' and program
 * the hardware.  The calling thread must already be bound to that CPU.
 *
 * Returns 0 on success; EINVAL if the requests cannot be assigned or
 * configured (with a more specific error in *subcode); EAGAIN if the CPU
 * was DR'd out, already has a bound set, or the thread's binding changed.
 */
int
kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
{
	cpu_t		*cp;
	kcpc_ctx_t	*ctx;
	int		error;
	int		save_spl;

	ctx = kcpc_ctx_alloc(KM_SLEEP);

	if (kcpc_assign_reqs(set, ctx) != 0) {
		kcpc_ctx_free(ctx);
		*subcode = CPC_RESOURCE_UNAVAIL;
		return (EINVAL);
	}

	ctx->kc_cpuid = cpuid;
	ctx->kc_thread = curthread;

	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);

	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
		kcpc_ctx_free(ctx);
		return (error);
	}

	set->ks_ctx = ctx;
	ctx->kc_set = set;

	/*
	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
	 * we are manipulating the cpu_t and programming the hardware, else
	 * the cpu_t could go away while we're looking at it.
	 */
	mutex_enter(&cpu_lock);
	cp = cpu_get(cpuid);

	if (cp == NULL)
		/*
		 * The CPU could have been DRd out while we were getting set up.
		 */
		goto unbound;

	mutex_enter(&cp->cpu_cpc_ctxlock);
	kpreempt_disable();
	save_spl = spl_xcall();

	/*
	 * Check to see whether counters for CPU already being used by someone
	 * other than kernel for capacity and utilization (since kernel will
	 * let go of counters for user in kcpc_program() below)
	 */
	if (cp->cpu_cpc_ctx != NULL && !CU_CPC_ON(cp)) {
		/*
		 * If this CPU already has a bound set, return an error.
		 */
		splx(save_spl);
		kpreempt_enable();
		mutex_exit(&cp->cpu_cpc_ctxlock);
		goto unbound;
	}

	/*
	 * The thread's CPU binding may have been changed (or broken) since
	 * we last checked; refuse rather than program a foreign CPU.
	 */
	if (curthread->t_bind_cpu != cpuid) {
		splx(save_spl);
		kpreempt_enable();
		mutex_exit(&cp->cpu_cpc_ctxlock);
		goto unbound;
	}

	kcpc_program(ctx, B_FALSE, B_TRUE);

	splx(save_spl);
	kpreempt_enable();

	mutex_exit(&cp->cpu_cpc_ctxlock);
	mutex_exit(&cpu_lock);

	/*
	 * Mark the set bound and wake anyone (e.g. kcpc_unbind()) waiting
	 * for binding to complete.
	 */
	mutex_enter(&set->ks_lock);
	set->ks_state |= KCPC_SET_BOUND;
	cv_signal(&set->ks_condv);
	mutex_exit(&set->ks_lock);

	return (0);

unbound:
	mutex_exit(&cpu_lock);
	set->ks_ctx = NULL;
	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
	kcpc_ctx_free(ctx);
	return (EAGAIN);
}
 262 
/*
 * Bind 'set' to thread 't' and program the hardware if t is the current
 * thread.  At most one set may be bound per thread.
 *
 * Returns 0 on success; EEXIST if t already has a context; EINVAL if the
 * requests cannot be assigned (subcode = CPC_RESOURCE_UNAVAIL); or the
 * error from kcpc_configure_reqs() with *subcode set.
 */
int
kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
{
	kcpc_ctx_t	*ctx;
	int		error;

	/*
	 * Only one set is allowed per context, so ensure there is no
	 * existing context.
	 */

	if (t->t_cpc_ctx != NULL)
		return (EEXIST);

	ctx = kcpc_ctx_alloc(KM_SLEEP);

	/*
	 * The context must begin life frozen until it has been properly
	 * programmed onto the hardware. This prevents the context ops from
	 * worrying about it until we're ready.
	 */
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
	ctx->kc_hrtime = gethrtime();

	if (kcpc_assign_reqs(set, ctx) != 0) {
		kcpc_ctx_free(ctx);
		*subcode = CPC_RESOURCE_UNAVAIL;
		return (EINVAL);
	}

	/* kc_cpuid == -1 marks this as a thread-bound (not CPU-bound) ctx */
	ctx->kc_cpuid = -1;
	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_LWPINHERIT);
	ctx->kc_thread = t;
	t->t_cpc_ctx = ctx;
	/*
	 * Permit threads to look at their own hardware counters from userland.
	 */
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_NONPRIV);

	/*
	 * Create the data store for this set.
	 */
	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);

	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
		kcpc_ctx_free(ctx);
		t->t_cpc_ctx = NULL;
		return (error);
	}

	set->ks_ctx = ctx;
	ctx->kc_set = set;

	/*
	 * Add a device context to the subject thread so the counters are
	 * saved/restored across context switches and freed at thread exit.
	 */
	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
	    kcpc_lwp_create, NULL, kcpc_free, NULL);

	/*
	 * Ask the backend to program the hardware.
	 */
	if (t == curthread) {
		int save_spl;

		kpreempt_disable();
		save_spl = spl_xcall();
		kcpc_program(ctx, B_TRUE, B_TRUE);
		splx(save_spl);
		kpreempt_enable();
	} else {
		/*
		 * Since we are the agent LWP, we know the victim LWP is stopped
		 * until we're done here; no need to worry about preemption or
		 * migration here. We still use an atomic op to clear the flag
		 * to ensure the flags are always self-consistent; they can
		 * still be accessed from, for instance, another CPU doing a
		 * kcpc_invalidate_all().
		 */
		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
	}

	/*
	 * Mark the set bound and wake anyone waiting on the binding.
	 */
	mutex_enter(&set->ks_lock);
	set->ks_state |= KCPC_SET_BOUND;
	cv_signal(&set->ks_condv);
	mutex_exit(&set->ks_lock);

	return (0);
}
 354 
 355 /*
 356  * Walk through each request in the set and ask the PCBE to configure a
 357  * corresponding counter.
 358  */
/*
 * Walk through each request in the set and ask the PCBE to configure a
 * corresponding counter.
 *
 * On failure, frees any configs created so far (kcpc_free_configs), stores
 * the PCBE error in *subcode, and returns EACCES for privilege/hypervisor
 * errors or EINVAL otherwise.  Returns 0 on success.
 */
int
kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
{
	int		i;
	int		ret;
	kcpc_request_t	*rp;

	for (i = 0; i < set->ks_nreqs; i++) {
		int n;
		rp = &set->ks_req[i];

		n = rp->kr_picnum;

		ASSERT(n >= 0 && n < cpc_ncounters);

		/* Assignment must not have doubled up a counter. */
		ASSERT(ctx->kc_pics[n].kp_req == NULL);

		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
			    == 0) {
				*subcode = -1;
				return (ENOTSUP);
			}
			/*
			 * If any of the counters have requested overflow
			 * notification, we flag the context as being one that
			 * cares about overflow.
			 */
			KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_SIGOVF);
		}

		rp->kr_config = NULL;
		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
		    &(rp->kr_config), (void *)ctx)) != 0) {
			/* Undo the configs built on earlier iterations. */
			kcpc_free_configs(set);
			*subcode = ret;
			switch (ret) {
			case CPC_ATTR_REQUIRES_PRIVILEGE:
			case CPC_HV_NO_ACCESS:
				return (EACCES);
			default:
				return (EINVAL);
			}
		}

		/* Cross-link request and pic; seed the data store. */
		ctx->kc_pics[n].kp_req = rp;
		rp->kr_picp = &ctx->kc_pics[n];
		rp->kr_data = set->ks_data + rp->kr_index;
		*rp->kr_data = rp->kr_preset;
	}

	return (0);
}
 413 
 414 void
 415 kcpc_free_configs(kcpc_set_t *set)
 416 {
 417         int i;
 418 
 419         for (i = 0; i < set->ks_nreqs; i++)
 420                 if (set->ks_req[i].kr_config != NULL)
 421                         pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
 422 }
 423 
 424 /*
 425  * buf points to a user address and the data should be copied out to that
 426  * address in the current process.
 427  */
/*
 * buf points to a user address and the data should be copied out to that
 * address in the current process.
 *
 * Returns 0 on success, EINVAL if the set is not bound, EAGAIN if the
 * context is invalid or the CPU binding no longer matches, or EFAULT if
 * any copyout fails.
 */
int
kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
{
	kcpc_ctx_t	*ctx = set->ks_ctx;
	int		save_spl;

	mutex_enter(&set->ks_lock);
	if ((set->ks_state & KCPC_SET_BOUND) == 0) {
		mutex_exit(&set->ks_lock);
		return (EINVAL);
	}
	mutex_exit(&set->ks_lock);

	/*
	 * Kernel preemption must be disabled while reading the hardware regs,
	 * and if this is a CPU-bound context, while checking the CPU binding of
	 * the current thread.
	 */
	kpreempt_disable();
	save_spl = spl_xcall();

	if (ctx->kc_flags & KCPC_CTX_INVALID) {
		splx(save_spl);
		kpreempt_enable();
		return (EAGAIN);
	}

	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
		/*
		 * CPU-bound context: the caller must still be bound to that
		 * CPU, else the counters we'd read belong to someone else.
		 */
		if (ctx->kc_cpuid != -1) {
			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
				splx(save_spl);
				kpreempt_enable();
				return (EAGAIN);
			}
		}

		if (ctx->kc_thread == curthread) {
			uint64_t curtick = KCPC_GET_TICK();

			ctx->kc_hrtime = gethrtime_waitfree();
			pcbe_ops->pcbe_sample(ctx);
			/* Accumulate virtualized tick count for this ctx. */
			ctx->kc_vtick += curtick - ctx->kc_rawtick;
			ctx->kc_rawtick = curtick;
		}

		/*
		 * The config may have been invalidated by
		 * the pcbe_sample op.
		 */
		if (ctx->kc_flags & KCPC_CTX_INVALID) {
			splx(save_spl);
			kpreempt_enable();
			return (EAGAIN);
		}

	}

	splx(save_spl);
	kpreempt_enable();

	/* Copy counts, timestamp, and virtual tick count out to userland. */
	if (copyout(set->ks_data, buf,
	    set->ks_nreqs * sizeof (uint64_t)) == -1)
		return (EFAULT);
	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
		return (EFAULT);
	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
		return (EFAULT);

	return (0);
}
 498 
 499 /*
 500  * Stop the counters on the CPU this context is bound to.
 501  */
 502 static void
 503 kcpc_stop_hw(kcpc_ctx_t *ctx)
 504 {
 505         cpu_t *cp;
 506 
 507         kpreempt_disable();
 508 
 509         if (ctx->kc_cpuid == CPU->cpu_id) {
 510                 cp = CPU;
 511         } else {
 512                 cp = cpu_get(ctx->kc_cpuid);
 513         }
 514 
 515         ASSERT(cp != NULL && cp->cpu_cpc_ctx == ctx);
 516         kcpc_cpu_stop(cp, B_FALSE);
 517 
 518         kpreempt_enable();
 519 }
 520 
/*
 * Tear down the binding of 'set': invalidate its context, stop the hardware
 * if needed, and detach from the owning thread or CPU.  Waits first for any
 * in-progress bind to complete.  Always returns 0.
 */
int
kcpc_unbind(kcpc_set_t *set)
{
	kcpc_ctx_t	*ctx;
	kthread_t	*t;

	/*
	 * We could be racing with the process's agent thread as it
	 * binds the set; we must wait for the set to finish binding
	 * before attempting to tear it down.
	 */
	mutex_enter(&set->ks_lock);
	while ((set->ks_state & KCPC_SET_BOUND) == 0)
		cv_wait(&set->ks_condv, &set->ks_lock);
	mutex_exit(&set->ks_lock);

	ctx = set->ks_ctx;

	/*
	 * Use kc_lock to synchronize with kcpc_restore().
	 */
	mutex_enter(&ctx->kc_lock);
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
	mutex_exit(&ctx->kc_lock);

	if (ctx->kc_cpuid == -1) {
		t = ctx->kc_thread;
		/*
		 * The context is thread-bound and therefore has a device
		 * context.  It will be freed via removectx() calling
		 * freectx() calling kcpc_free().
		 */
		if (t == curthread) {
			int save_spl;

			kpreempt_disable();
			save_spl = spl_xcall();
			/* Only unprogram if the counters are still running. */
			if (!(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED))
				kcpc_unprogram(ctx, B_TRUE);
			splx(save_spl);
			kpreempt_enable();
		}
#ifdef DEBUG
		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
		    kcpc_lwp_create, NULL, kcpc_free) == 0)
			panic("kcpc_unbind: context %p not preset on thread %p",
			    (void *)ctx, (void *)t);
#else
		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
		    kcpc_lwp_create, NULL, kcpc_free);
#endif /* DEBUG */
		t->t_cpc_set = NULL;
		t->t_cpc_ctx = NULL;
	} else {
		/*
		 * If we are unbinding a CPU-bound set from a remote CPU, the
		 * native CPU's idle thread could be in the midst of programming
		 * this context onto the CPU. We grab the context's lock here to
		 * ensure that the idle thread is done with it. When we release
		 * the lock, the CPU no longer has a context and the idle thread
		 * will move on.
		 *
		 * cpu_lock must be held to prevent the CPU from being DR'd out
		 * while we disassociate the context from the cpu_t.
		 */
		cpu_t *cp;
		mutex_enter(&cpu_lock);
		cp = cpu_get(ctx->kc_cpuid);
		if (cp != NULL) {
			/*
			 * The CPU may have been DR'd out of the system.
			 */
			mutex_enter(&cp->cpu_cpc_ctxlock);
			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
				kcpc_stop_hw(ctx);
			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
			mutex_exit(&cp->cpu_cpc_ctxlock);
		}
		mutex_exit(&cpu_lock);
		/*
		 * CPU-bound contexts have no device context; free the ctx
		 * directly if we are the binding thread.
		 */
		if (ctx->kc_thread == curthread) {
			kcpc_free(ctx, 0);
			curthread->t_cpc_set = NULL;
		}
	}

	return (0);
}
 608 
 609 int
 610 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
 611 {
 612         int i;
 613 
 614         ASSERT(set != NULL);
 615         ASSERT(set->ks_state & KCPC_SET_BOUND);
 616         ASSERT(set->ks_ctx->kc_thread == curthread);
 617         ASSERT(set->ks_ctx->kc_cpuid == -1);
 618 
 619         if (index < 0 || index >= set->ks_nreqs)
 620                 return (EINVAL);
 621 
 622         for (i = 0; i < set->ks_nreqs; i++)
 623                 if (set->ks_req[i].kr_index == index)
 624                         break;
 625         ASSERT(i != set->ks_nreqs);
 626 
 627         set->ks_req[i].kr_preset = preset;
 628         return (0);
 629 }
 630 
/*
 * Restart a thread-bound set owned by the current thread: reload each
 * counter's preset into its data store and config, then reprogram the
 * hardware with the context unfrozen.  Always returns 0.
 */
int
kcpc_restart(kcpc_set_t *set)
{
	kcpc_ctx_t	*ctx = set->ks_ctx;
	int		i;
	int		save_spl;

	ASSERT(set->ks_state & KCPC_SET_BOUND);
	ASSERT(ctx->kc_thread == curthread);
	ASSERT(ctx->kc_cpuid == -1);

	for (i = 0; i < set->ks_nreqs; i++) {
		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
		/*
		 * NOTE(review): return value ignored; presumably reloading
		 * an existing kr_config in place cannot fail — confirm
		 * against the PCBE's pcbe_configure contract.
		 */
		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
	}

	kpreempt_disable();
	save_spl = spl_xcall();

	/*
	 * If the user is doing this on a running set, make sure the counters
	 * are stopped first.
	 */
	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
		pcbe_ops->pcbe_allstop();

	/*
	 * Ask the backend to program the hardware.
	 */
	ctx->kc_rawtick = KCPC_GET_TICK();
	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
	pcbe_ops->pcbe_program(ctx);
	splx(save_spl);
	kpreempt_enable();

	return (0);
}
 669 
 670 /*
 671  * Caller must hold kcpc_cpuctx_lock.
 672  */
/*
 * Caller must hold kcpc_cpuctx_lock.
 *
 * Handle CPC_ENABLE/CPC_DISABLE (freeze/unfreeze the context) and
 * CPC_USR_EVENTS/CPC_SYS_EVENTS (toggle user/system counting, which
 * requires a full unbind/re-bind with updated request flags).
 */
int
kcpc_enable(kthread_t *t, int cmd, int enable)
{
	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
	kcpc_set_t	*set = t->t_cpc_set;
	kcpc_set_t	*newset;
	int		i;
	int		flag;
	int		err;

	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));

	if (ctx == NULL) {
		/*
		 * This thread has a set but no context; it must be a
		 * CPU-bound set.
		 */
		ASSERT(t->t_cpc_set != NULL);
		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
		return (EINVAL);
	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
		return (EAGAIN);

	if (cmd == CPC_ENABLE) {
		/* Enabling an already-running context is an error. */
		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
			return (EINVAL);
		kpreempt_disable();
		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
		kcpc_restore(ctx);
		kpreempt_enable();
	} else if (cmd == CPC_DISABLE) {
		/* Disabling an already-frozen context is an error. */
		if (ctx->kc_flags & KCPC_CTX_FREEZE)
			return (EINVAL);
		kpreempt_disable();
		kcpc_save(ctx);
		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
		kpreempt_enable();
	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
		/*
		 * Strategy for usr/sys: stop counters and update set's presets
		 * with current counter values, unbind, update requests with
		 * new config, then re-bind.
		 */
		flag = (cmd == CPC_USR_EVENTS) ?
		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;

		kpreempt_disable();
		KCPC_CTX_FLAG_SET(ctx,
		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
		pcbe_ops->pcbe_allstop();
		kpreempt_enable();

		/* Carry current counts forward as the new presets. */
		for (i = 0; i < set->ks_nreqs; i++) {
			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
			if (enable)
				set->ks_req[i].kr_flags |= flag;
			else
				set->ks_req[i].kr_flags &= ~flag;
		}
		newset = kcpc_dup_set(set);
		if (kcpc_unbind(set) != 0)
			return (EINVAL);
		t->t_cpc_set = newset;
		if (kcpc_bind_thread(newset, t, &err) != 0) {
			t->t_cpc_set = NULL;
			kcpc_free_set(newset);
			return (EINVAL);
		}
	} else
		return (EINVAL);

	return (0);
}
 746 
 747 /*
 748  * Provide PCBEs with a way of obtaining the configs of every counter which will
 749  * be programmed together.
 750  *
 751  * If current is NULL, provide the first config.
 752  *
 753  * If data != NULL, caller wants to know where the data store associated with
 754  * the config we return is located.
 755  */
 756 void *
 757 kcpc_next_config(void *token, void *current, uint64_t **data)
 758 {
 759         int             i;
 760         kcpc_pic_t      *pic;
 761         kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
 762 
 763         if (current == NULL) {
 764                 /*
 765                  * Client would like the first config, which may not be in
 766                  * counter 0; we need to search through the counters for the
 767                  * first config.
 768                  */
 769                 for (i = 0; i < cpc_ncounters; i++)
 770                         if (ctx->kc_pics[i].kp_req != NULL)
 771                                 break;
 772                 /*
 773                  * There are no counters configured for the given context.
 774                  */
 775                 if (i == cpc_ncounters)
 776                         return (NULL);
 777         } else {
 778                 /*
 779                  * There surely is a faster way to do this.
 780                  */
 781                 for (i = 0; i < cpc_ncounters; i++) {
 782                         pic = &ctx->kc_pics[i];
 783 
 784                         if (pic->kp_req != NULL &&
 785                             current == pic->kp_req->kr_config)
 786                                 break;
 787                 }
 788 
 789                 /*
 790                  * We found the current config at picnum i. Now search for the
 791                  * next configured PIC.
 792                  */
 793                 for (i++; i < cpc_ncounters; i++) {
 794                         pic = &ctx->kc_pics[i];
 795                         if (pic->kp_req != NULL)
 796                                 break;
 797                 }
 798 
 799                 if (i == cpc_ncounters)
 800                         return (NULL);
 801         }
 802 
 803         if (data != NULL) {
 804                 *data = ctx->kc_pics[i].kp_req->kr_data;
 805         }
 806 
 807         return (ctx->kc_pics[i].kp_req->kr_config);
 808 }
 809 
 810 
/*
 * Allocate a zeroed context, link it onto its hash bucket, and allocate its
 * per-counter pic array.  kc_cpuid starts at -1 (thread-bound by default).
 *
 * Returns NULL only if the initial allocation fails (possible with
 * KM_NOSLEEP).  NOTE(review): kc_pics is always allocated KM_SLEEP even if
 * the caller passed KM_NOSLEEP — presumably intentional since the ctx is
 * already hashed by then, but verify.
 */
kcpc_ctx_t *
kcpc_ctx_alloc(int kmem_flags)
{
	kcpc_ctx_t	*ctx;
	long		hash;

	ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), kmem_flags);
	if (ctx == NULL)
		return (NULL);

	hash = CPC_HASH_CTX(ctx);
	mutex_enter(&kcpc_ctx_llock[hash]);
	ctx->kc_next = kcpc_ctx_list[hash];
	kcpc_ctx_list[hash] = ctx;
	mutex_exit(&kcpc_ctx_llock[hash]);

	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
	    cpc_ncounters, KM_SLEEP);

	ctx->kc_cpuid = -1;

	return (ctx);
}
 834 
 835 /*
 836  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
 837  * in the flags.
 838  */
/*
 * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
 * in the flags.
 */
static void
kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
{
	kcpc_set_t	*ks = ctx->kc_set, *cks;
	int		i, j;
	int		code;

	ASSERT(ks != NULL);

	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
		return;

	cks = kmem_zalloc(sizeof (*cks), KM_SLEEP);
	cks->ks_state &= ~KCPC_SET_BOUND;
	cctx->kc_set = cks;
	cks->ks_flags = ks->ks_flags;
	cks->ks_nreqs = ks->ks_nreqs;
	cks->ks_req = kmem_alloc(cks->ks_nreqs *
	    sizeof (kcpc_request_t), KM_SLEEP);
	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
	    KM_SLEEP);
	cks->ks_ctx = cctx;

	/* Deep-copy every request, including its attribute array. */
	for (i = 0; i < cks->ks_nreqs; i++) {
		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
		(void) strncpy(cks->ks_req[i].kr_event,
		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
		if (ks->ks_req[i].kr_nattrs > 0) {
			cks->ks_req[i].kr_attr =
			    kmem_alloc(ks->ks_req[i].kr_nattrs *
			    sizeof (kcpc_attr_t), KM_SLEEP);
		}
		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
			    ks->ks_req[i].kr_attr[j].ka_name,
			    CPC_MAX_ATTR_LEN);
			cks->ks_req[i].kr_attr[j].ka_val =
			    ks->ks_req[i].kr_attr[j].ka_val;
		}
	}
	/*
	 * Configure the cloned set against the child context; on failure the
	 * child's config is marked invalid rather than reported to a caller.
	 */
	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
		kcpc_invalidate_config(cctx);

	/* Mark the clone bound and wake any waiter (see kcpc_unbind()). */
	mutex_enter(&cks->ks_lock);
	cks->ks_state |= KCPC_SET_BOUND;
	cv_signal(&cks->ks_condv);
	mutex_exit(&cks->ks_lock);
}
 891 
 892 
 893 void
 894 kcpc_ctx_free(kcpc_ctx_t *ctx)
 895 {
 896         kcpc_ctx_t      **loc;
 897         long            hash = CPC_HASH_CTX(ctx);
 898 
 899         mutex_enter(&kcpc_ctx_llock[hash]);
 900         loc = &kcpc_ctx_list[hash];
 901         ASSERT(*loc != NULL);
 902         while (*loc != ctx)
 903                 loc = &(*loc)->kc_next;
 904         *loc = ctx->kc_next;
 905         mutex_exit(&kcpc_ctx_llock[hash]);
 906 
 907         kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
 908         cv_destroy(&ctx->kc_condv);
 909         mutex_destroy(&ctx->kc_lock);
 910         kmem_free(ctx, sizeof (*ctx));
 911 }
 912 
 913 /*
 914  * Generic interrupt handler used on hardware that generates
 915  * overflow interrupts.
 916  *
 917  * Note: executed at high-level interrupt context!
 918  */
 919 /*ARGSUSED*/
 920 kcpc_ctx_t *
 921 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
 922 {
 923         kcpc_ctx_t      *ctx;
 924         kthread_t       *t = curthread;
 925         int             i;
 926 
 927         /*
 928          * On both x86 and UltraSPARC, we may deliver the high-level
 929          * interrupt in kernel mode, just after we've started to run an
 930          * interrupt thread.  (That's because the hardware helpfully
 931          * delivers the overflow interrupt some random number of cycles
 932          * after the instruction that caused the overflow by which time
 933          * we're in some part of the kernel, not necessarily running on
 934          * the right thread).
 935          *
 936          * Check for this case here -- find the pinned thread
 937          * that was running when the interrupt went off.
 938          */
 939         if (t->t_flag & T_INTR_THREAD) {
 940                 klwp_t *lwp;
 941 
 942                 atomic_inc_32(&kcpc_intrctx_count);
 943 
 944                 /*
 945                  * Note that t_lwp is always set to point at the underlying
 946                  * thread, thus this will work in the presence of nested
 947                  * interrupts.
 948                  */
 949                 ctx = NULL;
 950                 if ((lwp = t->t_lwp) != NULL) {
 951                         t = lwptot(lwp);
 952                         ctx = t->t_cpc_ctx;
 953                 }
 954         } else
 955                 ctx = t->t_cpc_ctx;
 956 
 957         if (ctx == NULL) {
 958                 /*
 959                  * This can easily happen if we're using the counters in
 960                  * "shared" mode, for example, and an overflow interrupt
 961                  * occurs while we are running cpustat.  In that case, the
 962                  * bound thread that has the context that belongs to this
 963                  * CPU is almost certainly sleeping (if it was running on
 964                  * the CPU we'd have found it above), and the actual
 965                  * interrupted thread has no knowledge of performance counters!
 966                  */
 967                 ctx = curthread->t_cpu->cpu_cpc_ctx;
 968                 if (ctx != NULL) {
 969                         /*
 970                          * Return the bound context for this CPU to
 971                          * the interrupt handler so that it can synchronously
 972                          * sample the hardware counters and restart them.
 973                          */
 974                         return (ctx);
 975                 }
 976 
 977                 /*
 978                  * As long as the overflow interrupt really is delivered early
 979                  * enough after trapping into the kernel to avoid switching
 980                  * threads, we must always be able to find the cpc context,
 981                  * or something went terribly wrong i.e. we ended up
 982                  * running a passivated interrupt thread, a kernel
 983                  * thread or we interrupted idle, all of which are Very Bad.
 984                  *
 985                  * We also could end up here owing to an incredibly unlikely
 986                  * race condition that exists on x86 based architectures when
 987                  * the cpc provider is in use; overflow interrupts are directed
 988                  * to the cpc provider if the 'dtrace_cpc_in_use' variable is
 989                  * set when we enter the handler. This variable is unset after
 990                  * overflow interrupts have been disabled on all CPUs and all
 991                  * contexts have been torn down. To stop interrupts, the cpc
 992                  * provider issues a xcall to the remote CPU before it tears
 993                  * down that CPUs context. As high priority xcalls, on an x86
 994                  * architecture, execute at a higher PIL than this handler, it
 995                  * is possible (though extremely unlikely) that the xcall could
 996                  * interrupt the overflow handler before the handler has
 997                  * checked the 'dtrace_cpc_in_use' variable, stop the counters,
 998                  * return to the cpc provider which could then rip down
 999                  * contexts and unset 'dtrace_cpc_in_use' *before* the CPUs
1000                  * overflow handler has had a chance to check the variable. In
1001                  * that case, the handler would direct the overflow into this
1002                  * code and no valid context will be found. The default behavior
1003                  * when no valid context is found is now to shout a warning to
1004                  * the console and bump the 'kcpc_nullctx_count' variable.
1005                  */
1006                 if (kcpc_nullctx_panic)
1007                         panic("null cpc context, thread %p", (void *)t);
1008 #ifdef DEBUG
1009                 cmn_err(CE_NOTE,
1010                     "null cpc context found in overflow handler!\n");
1011 #endif
1012                 atomic_inc_32(&kcpc_nullctx_count);
1013         } else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
1014                 /*
1015                  * Schedule an ast to sample the counters, which will
1016                  * propagate any overflow into the virtualized performance
1017                  * counter(s), and may deliver a signal.
1018                  */
1019                 ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1020                 /*
1021                  * If a counter has overflowed which was counting on behalf of
1022                  * a request which specified CPC_OVF_NOTIFY_EMT, send the
1023                  * process a signal.
1024                  */
1025                 for (i = 0; i < cpc_ncounters; i++) {
1026                         if (ctx->kc_pics[i].kp_req != NULL &&
1027                             bitmap & (1 << i) &&
1028                             ctx->kc_pics[i].kp_req->kr_flags &
1029                             CPC_OVF_NOTIFY_EMT) {
1030                                 /*
1031                                  * A signal has been requested for this PIC, so
1032                                  * so freeze the context. The interrupt handler
1033                                  * has already stopped the counter hardware.
1034                                  */
1035                                 KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
1036                                 atomic_or_uint(&ctx->kc_pics[i].kp_flags,
1037                                     KCPC_PIC_OVERFLOWED);
1038                         }
1039                 }
1040                 aston(t);
1041         } else if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) {
1042                 /*
1043                  * Thread context is no longer valid, but here may be a valid
1044                  * CPU context.
1045                  */
1046                 return (curthread->t_cpu->cpu_cpc_ctx);
1047         }
1048 
1049         return (NULL);
1050 }
1051 
1052 /*
1053  * The current thread context had an overflow interrupt; we're
1054  * executing here in high-level interrupt context.
1055  */
1056 /*ARGSUSED*/
1057 uint_t
1058 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
1059 {
1060         kcpc_ctx_t *ctx;
1061         uint64_t bitmap;
1062         uint8_t *state;
1063         int     save_spl;
1064 
1065         if (pcbe_ops == NULL ||
1066             (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
1067                 return (DDI_INTR_UNCLAIMED);
1068 
1069         /*
1070          * Prevent any further interrupts.
1071          */
1072         pcbe_ops->pcbe_allstop();
1073 
1074         if (dtrace_cpc_in_use) {
1075                 state = &cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state;
1076 
1077                 /*
1078                  * Set the per-CPU state bit to indicate that we are currently
1079                  * processing an interrupt if it is currently free. Drop the
1080                  * interrupt if the state isn't free (i.e. a configuration
1081                  * event is taking place).
1082                  */
1083                 if (atomic_cas_8(state, DCPC_INTR_FREE,
1084                     DCPC_INTR_PROCESSING) == DCPC_INTR_FREE) {
1085                         int i;
1086                         kcpc_request_t req;
1087 
1088                         ASSERT(dtrace_cpc_fire != NULL);
1089 
1090                         (*dtrace_cpc_fire)(bitmap);
1091 
1092                         ctx = curthread->t_cpu->cpu_cpc_ctx;
1093                         if (ctx == NULL) {
1094 #ifdef DEBUG
1095                                 cmn_err(CE_NOTE, "null cpc context in"
1096                                     "hardware overflow handler!\n");
1097 #endif
1098                                 return (DDI_INTR_CLAIMED);
1099                         }
1100 
1101                         /* Reset any counters that have overflowed */
1102                         for (i = 0; i < ctx->kc_set->ks_nreqs; i++) {
1103                                 req = ctx->kc_set->ks_req[i];
1104 
1105                                 if (bitmap & (1 << req.kr_picnum)) {
1106                                         pcbe_ops->pcbe_configure(req.kr_picnum,
1107                                             req.kr_event, req.kr_preset,
1108                                             req.kr_flags, req.kr_nattrs,
1109                                             req.kr_attr, &(req.kr_config),
1110                                             (void *)ctx);
1111                                 }
1112                         }
1113                         pcbe_ops->pcbe_program(ctx);
1114 
1115                         /*
1116                          * We've finished processing the interrupt so set
1117                          * the state back to free.
1118                          */
1119                         cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state =
1120                             DCPC_INTR_FREE;
1121                         membar_producer();
1122                 }
1123                 return (DDI_INTR_CLAIMED);
1124         }
1125 
1126         /*
1127          * DTrace isn't involved so pass on accordingly.
1128          *
1129          * If the interrupt has occurred in the context of an lwp owning
1130          * the counters, then the handler posts an AST to the lwp to
1131          * trigger the actual sampling, and optionally deliver a signal or
1132          * restart the counters, on the way out of the kernel using
1133          * kcpc_hw_overflow_ast() (see below).
1134          *
1135          * On the other hand, if the handler returns the context to us
1136          * directly, then it means that there are no other threads in
1137          * the middle of updating it, no AST has been posted, and so we
1138          * should sample the counters here, and restart them with no
1139          * further fuss.
1140          *
1141          * The CPU's CPC context may disappear as a result of cross-call which
1142          * has higher PIL on x86, so protect the context by raising PIL to the
1143          * cross-call level.
1144          */
1145         save_spl = spl_xcall();
1146         if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
1147                 uint64_t curtick = KCPC_GET_TICK();
1148 
1149                 ctx->kc_hrtime = gethrtime_waitfree();
1150                 ctx->kc_vtick += curtick - ctx->kc_rawtick;
1151                 ctx->kc_rawtick = curtick;
1152                 pcbe_ops->pcbe_sample(ctx);
1153                 pcbe_ops->pcbe_program(ctx);
1154         }
1155         splx(save_spl);
1156 
1157         return (DDI_INTR_CLAIMED);
1158 }
1159 
1160 /*
1161  * Called from trap() when processing the ast posted by the high-level
1162  * interrupt handler.
1163  */
1164 int
1165 kcpc_overflow_ast()
1166 {
1167         kcpc_ctx_t      *ctx = curthread->t_cpc_ctx;
1168         int             i;
1169         int             found = 0;
1170         uint64_t        curtick = KCPC_GET_TICK();
1171 
1172         ASSERT(ctx != NULL);    /* Beware of interrupt skid. */
1173 
1174         /*
1175          * An overflow happened: sample the context to ensure that
1176          * the overflow is propagated into the upper bits of the
1177          * virtualized 64-bit counter(s).
1178          */
1179         kpreempt_disable();
1180         ctx->kc_hrtime = gethrtime_waitfree();
1181         pcbe_ops->pcbe_sample(ctx);
1182         kpreempt_enable();
1183 
1184         ctx->kc_vtick += curtick - ctx->kc_rawtick;
1185 
1186         /*
1187          * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
1188          * if that pic generated an overflow and if the request it was counting
1189          * on behalf of had CPC_OVERFLOW_REQUEST specified. We go through all
1190          * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
1191          * found any overflowed pics, keep the context frozen and return true
1192          * (thus causing a signal to be sent).
1193          */
1194         for (i = 0; i < cpc_ncounters; i++) {
1195                 if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
1196                         atomic_and_uint(&ctx->kc_pics[i].kp_flags,
1197                             ~KCPC_PIC_OVERFLOWED);
1198                         found = 1;
1199                 }
1200         }
1201         if (found)
1202                 return (1);
1203 
1204         /*
1205          * Otherwise, re-enable the counters and continue life as before.
1206          */
1207         kpreempt_disable();
1208         KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
1209         pcbe_ops->pcbe_program(ctx);
1210         kpreempt_enable();
1211         return (0);
1212 }
1213 
1214 /*
1215  * Called when switching away from current thread.
1216  */
1217 static void
1218 kcpc_save(kcpc_ctx_t *ctx)
1219 {
1220         int err;
1221         int save_spl;
1222 
1223         kpreempt_disable();
1224         save_spl = spl_xcall();
1225 
1226         if (ctx->kc_flags & KCPC_CTX_INVALID) {
1227                 if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) {
1228                         splx(save_spl);
1229                         kpreempt_enable();
1230                         return;
1231                 }
1232                 /*
1233                  * This context has been invalidated but the counters have not
1234                  * been stopped. Stop them here and mark the context stopped.
1235                  */
1236                 kcpc_unprogram(ctx, B_TRUE);
1237                 splx(save_spl);
1238                 kpreempt_enable();
1239                 return;
1240         }
1241 
1242         pcbe_ops->pcbe_allstop();
1243         if (ctx->kc_flags & KCPC_CTX_FREEZE) {
1244                 splx(save_spl);
1245                 kpreempt_enable();
1246                 return;
1247         }
1248 
1249         /*
1250          * Need to sample for all reqs into each req's current mpic.
1251          */
1252         ctx->kc_hrtime = gethrtime_waitfree();
1253         ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
1254         pcbe_ops->pcbe_sample(ctx);
1255 
1256         /*
1257          * Program counter for measuring capacity and utilization since user
1258          * thread isn't using counter anymore
1259          */
1260         ASSERT(ctx->kc_cpuid == -1);
1261         cu_cpc_program(CPU, &err);
1262         splx(save_spl);
1263         kpreempt_enable();
1264 }
1265 
/*
 * Called when switching to a thread that owns a CPC context: reprogram the
 * hardware with the incoming thread's counter configuration.  The
 * KCPC_CTX_RESTORE flag protects the context against a concurrent
 * kcpc_free() (see the comment below).
 */
static void
kcpc_restore(kcpc_ctx_t *ctx)
{
	int save_spl;

	mutex_enter(&ctx->kc_lock);

	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
	    KCPC_CTX_INVALID) {
		/*
		 * The context is invalidated but has not been marked stopped.
		 * We mark it as such here because we will not start the
		 * counters during this context switch.
		 */
		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED);
	}

	/* Invalid or frozen contexts are never programmed onto the CPU. */
	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) {
		mutex_exit(&ctx->kc_lock);
		return;
	}

	/*
	 * Set kc_flags to show that a kcpc_restore() is in progress to avoid
	 * ctx & set related memory objects being freed without us knowing.
	 * This can happen if an agent thread is executing a kcpc_unbind(),
	 * with this thread as the target, whilst we're concurrently doing a
	 * restorectx() during, for example, a proc_exit().  Effectively, by
	 * doing this, we're asking kcpc_free() to cv_wait() until
	 * kcpc_restore() has completed.
	 */
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_RESTORE);
	mutex_exit(&ctx->kc_lock);

	/*
	 * While programming the hardware, the counters should be stopped. We
	 * don't do an explicit pcbe_allstop() here because they should have
	 * been stopped already by the last consumer.
	 */
	kpreempt_disable();
	save_spl = spl_xcall();
	kcpc_program(ctx, B_TRUE, B_TRUE);
	splx(save_spl);
	kpreempt_enable();

	/*
	 * Wake the agent thread if it's waiting in kcpc_free().
	 */
	mutex_enter(&ctx->kc_lock);
	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_RESTORE);
	cv_signal(&ctx->kc_condv);
	mutex_exit(&ctx->kc_lock);
}
1319 
1320 /*
1321  * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the the
1322  * following context operators to the idle thread on each CPU. They stop the
1323  * counters when the idle thread is switched on, and they start them again when
1324  * it is switched off.
1325  */
1326 /*ARGSUSED*/
1327 void
1328 kcpc_idle_save(struct cpu *cp)
1329 {
1330         /*
1331          * The idle thread shouldn't be run anywhere else.
1332          */
1333         ASSERT(CPU == cp);
1334 
1335         /*
1336          * We must hold the CPU's context lock to ensure the context isn't freed
1337          * while we're looking at it.
1338          */
1339         mutex_enter(&cp->cpu_cpc_ctxlock);
1340 
1341         if ((cp->cpu_cpc_ctx == NULL) ||
1342             (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1343                 mutex_exit(&cp->cpu_cpc_ctxlock);
1344                 return;
1345         }
1346 
1347         pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
1348         mutex_exit(&cp->cpu_cpc_ctxlock);
1349 }
1350 
1351 void
1352 kcpc_idle_restore(struct cpu *cp)
1353 {
1354         /*
1355          * The idle thread shouldn't be run anywhere else.
1356          */
1357         ASSERT(CPU == cp);
1358 
1359         /*
1360          * We must hold the CPU's context lock to ensure the context isn't freed
1361          * while we're looking at it.
1362          */
1363         mutex_enter(&cp->cpu_cpc_ctxlock);
1364 
1365         if ((cp->cpu_cpc_ctx == NULL) ||
1366             (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1367                 mutex_exit(&cp->cpu_cpc_ctxlock);
1368                 return;
1369         }
1370 
1371         pcbe_ops->pcbe_allstop();
1372         mutex_exit(&cp->cpu_cpc_ctxlock);
1373 }
1374 
/*
 * Context operator run when thread t creates a new lwp ct.  If the parent's
 * context has KCPC_CTX_LWPINHERIT, clone the parent's set into a new context
 * for the child and install the CPC context operators on it.  If SIGOVF was
 * requested, arrange for the child to take an immediate overflow signal.
 */
/*ARGSUSED*/
static void
kcpc_lwp_create(kthread_t *t, kthread_t *ct)
{
	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
	int		i;

	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
		return;

	/*
	 * Hold kcpc_cpuctx_lock as reader across the clone so the parent
	 * context can't be invalidated and torn down concurrently.
	 */
	rw_enter(&kcpc_cpuctx_lock, RW_READER);
	if (ctx->kc_flags & KCPC_CTX_INVALID) {
		rw_exit(&kcpc_cpuctx_lock);
		return;
	}
	cctx = kcpc_ctx_alloc(KM_SLEEP);
	kcpc_ctx_clone(ctx, cctx);
	rw_exit(&kcpc_cpuctx_lock);

	/*
	 * Copy the parent context's kc_flags field, but don't overwrite
	 * the child's in case it was modified during kcpc_ctx_clone.
	 */
	KCPC_CTX_FLAG_SET(cctx,  ctx->kc_flags);
	cctx->kc_thread = ct;
	cctx->kc_cpuid = -1;
	ct->t_cpc_set = cctx->kc_set;
	ct->t_cpc_ctx = cctx;

	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
		kcpc_set_t *ks = cctx->kc_set;
		/*
		 * Our contract with the user requires us to immediately send an
		 * overflow signal to all children if we have the LWPINHERIT
		 * and SIGOVF flags set. In addition, all counters should be
		 * set to UINT64_MAX, and their pic's overflow flag turned on
		 * so that our trap() processing knows to send a signal.
		 */
		/*
		 * NOTE(review): this freezes the *parent's* context (ctx)
		 * while everything else in this clause operates on the
		 * child's set -- confirm 'ctx' rather than 'cctx' is
		 * intended here.
		 */
		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
		for (i = 0; i < ks->ks_nreqs; i++) {
			kcpc_request_t *kr = &ks->ks_req[i];

			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
				*(kr->kr_data) = UINT64_MAX;
				atomic_or_uint(&kr->kr_picp->kp_flags,
				    KCPC_PIC_OVERFLOWED);
			}
		}
		/* Post the overflow AST so the child signals itself. */
		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
		aston(ct);
	}

	/* Install save/restore/fork/free operators on the child. */
	installctx(ct, cctx, kcpc_save, kcpc_restore,
	    NULL, kcpc_lwp_create, NULL, kcpc_free, NULL);
}
1430 
1431 /*
1432  * Counter Stoppage Theory
1433  *
1434  * The counters may need to be stopped properly at the following occasions:
1435  *
1436  * 1) An LWP exits.
1437  * 2) A thread exits.
1438  * 3) An LWP performs an exec().
1439  * 4) A bound set is unbound.
1440  *
1441  * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
1442  * to be freed as well.
1443  *
1444  * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
1445  * when the thread is freed, kcpc_free(), called by freectx(), frees the
1446  * context.
1447  *
1448  * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
1449  *
1450  * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
1451  * been called from exec. It stops the counters _and_ frees the context.
1452  *
1453  * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
1454  *
1455  * CPU-bound counters are always stopped via kcpc_unbind().
1456  */
1457 
1458 /*
1459  * We're being called to delete the context; we ensure that all associated data
1460  * structures are freed, and that the hardware is passivated if this is an exec.
1461  */
1462 
1463 /*ARGSUSED*/
1464 void
1465 kcpc_free(kcpc_ctx_t *ctx, int isexec)
1466 {
1467         int             i;
1468         kcpc_set_t      *set = ctx->kc_set;
1469 
1470         ASSERT(set != NULL);
1471 
1472         /*
1473          * Wait for kcpc_restore() to finish before we tear things down.
1474          */
1475         mutex_enter(&ctx->kc_lock);
1476         while (ctx->kc_flags & KCPC_CTX_RESTORE)
1477                 cv_wait(&ctx->kc_condv, &ctx->kc_lock);
1478         KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1479         mutex_exit(&ctx->kc_lock);
1480 
1481         if (isexec) {
1482                 /*
1483                  * This thread is execing, and after the exec it should not have
1484                  * any performance counter context. Stop the counters properly
1485                  * here so the system isn't surprised by an overflow interrupt
1486                  * later.
1487                  */
1488                 if (ctx->kc_cpuid != -1) {
1489                         cpu_t *cp;
1490                         /*
1491                          * CPU-bound context; stop the appropriate CPU's ctrs.
1492                          * Hold cpu_lock while examining the CPU to ensure it
1493                          * doesn't go away.
1494                          */
1495                         mutex_enter(&cpu_lock);
1496                         cp = cpu_get(ctx->kc_cpuid);
1497                         /*
1498                          * The CPU could have been DR'd out, so only stop the
1499                          * CPU and clear its context pointer if the CPU still
1500                          * exists.
1501                          */
1502                         if (cp != NULL) {
1503                                 mutex_enter(&cp->cpu_cpc_ctxlock);
1504                                 kcpc_stop_hw(ctx);
1505                                 mutex_exit(&cp->cpu_cpc_ctxlock);
1506                         }
1507                         mutex_exit(&cpu_lock);
1508                         ASSERT(curthread->t_cpc_ctx == NULL);
1509                 } else {
1510                         int save_spl;
1511 
1512                         /*
1513                          * Thread-bound context; stop _this_ CPU's counters.
1514                          */
1515                         kpreempt_disable();
1516                         save_spl = spl_xcall();
1517                         kcpc_unprogram(ctx, B_TRUE);
1518                         curthread->t_cpc_ctx = NULL;
1519                         splx(save_spl);
1520                         kpreempt_enable();
1521                 }
1522 
1523                 /*
1524                  * Since we are being called from an exec and we know that
1525                  * exec is not permitted via the agent thread, we should clean
1526                  * up this thread's CPC state completely, and not leave dangling
1527                  * CPC pointers behind.
1528                  */
1529                 ASSERT(ctx->kc_thread == curthread);
1530                 curthread->t_cpc_set = NULL;
1531         }
1532 
1533         /*
1534          * Walk through each request in this context's set and free the PCBE's
1535          * configuration if it exists.
1536          */
1537         for (i = 0; i < set->ks_nreqs; i++) {
1538                 if (set->ks_req[i].kr_config != NULL)
1539                         pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
1540         }
1541 
1542         kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
1543         kcpc_ctx_free(ctx);
1544         kcpc_free_set(set);
1545 }
1546 
1547 /*
1548  * Free the memory associated with a request set.
1549  */
1550 void
1551 kcpc_free_set(kcpc_set_t *set)
1552 {
1553         int             i;
1554         kcpc_request_t  *req;
1555 
1556         ASSERT(set->ks_req != NULL);
1557 
1558         for (i = 0; i < set->ks_nreqs; i++) {
1559                 req = &set->ks_req[i];
1560 
1561                 if (req->kr_nattrs != 0) {
1562                         kmem_free(req->kr_attr,
1563                             req->kr_nattrs * sizeof (kcpc_attr_t));
1564                 }
1565         }
1566 
1567         kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
1568         cv_destroy(&set->ks_condv);
1569         mutex_destroy(&set->ks_lock);
1570         kmem_free(set, sizeof (kcpc_set_t));
1571 }
1572 
1573 /*
1574  * Grab every existing context and mark it as invalid.
1575  */
1576 void
1577 kcpc_invalidate_all(void)
1578 {
1579         kcpc_ctx_t *ctx;
1580         long hash;
1581 
1582         for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
1583                 mutex_enter(&kcpc_ctx_llock[hash]);
1584                 for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
1585                         KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1586                 mutex_exit(&kcpc_ctx_llock[hash]);
1587         }
1588 }
1589 
1590 /*
1591  * Interface for PCBEs to signal that an existing configuration has suddenly
1592  * become invalid.
1593  */
1594 void
1595 kcpc_invalidate_config(void *token)
1596 {
1597         kcpc_ctx_t *ctx = token;
1598 
1599         ASSERT(ctx != NULL);
1600 
1601         KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1602 }
1603 
1604 /*
1605  * Called from lwp_exit() and thread_exit()
1606  */
1607 void
1608 kcpc_passivate(void)
1609 {
1610         kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
1611         kcpc_set_t *set = curthread->t_cpc_set;
1612         int     save_spl;
1613 
1614         if (set == NULL)
1615                 return;
1616 
1617         if (ctx == NULL) {
1618                 /*
1619                  * This thread has a set but no context; it must be a CPU-bound
1620                  * set. The hardware will be stopped via kcpc_unbind() when the
1621                  * process exits and closes its file descriptors with
1622                  * kcpc_close(). Our only job here is to clean up this thread's
1623                  * state; the set will be freed with the unbind().
1624                  */
1625                 (void) kcpc_unbind(set);
1626                 /*
1627                  * Unbinding a set belonging to the current thread should clear
1628                  * its set pointer.
1629                  */
1630                 ASSERT(curthread->t_cpc_set == NULL);
1631                 return;
1632         }
1633 
1634         kpreempt_disable();
1635         save_spl = spl_xcall();
1636         curthread->t_cpc_set = NULL;
1637 
1638         /*
1639          * This thread/LWP is exiting but context switches will continue to
1640          * happen for a bit as the exit proceeds.  Kernel preemption must be
1641          * disabled here to prevent a race between checking or setting the
1642          * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
1643          * a context switch.
1644          */
1645         if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
1646                 kcpc_unprogram(ctx, B_TRUE);
1647                 KCPC_CTX_FLAG_SET(ctx,
1648                     KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
1649         }
1650 
1651         /*
1652          * We're cleaning up after this thread; ensure there are no dangling
1653          * CPC pointers left behind. The context and set will be freed by
1654          * freectx().
1655          */
1656         curthread->t_cpc_ctx = NULL;
1657 
1658         splx(save_spl);
1659         kpreempt_enable();
1660 }
1661 
1662 /*
1663  * Assign the requests in the given set to the PICs in the context.
1664  * Returns 0 if successful, -1 on failure.
1665  */
1666 /*ARGSUSED*/
1667 int
1668 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
1669 {
1670         int i;
1671         int *picnum_save;
1672 
1673         ASSERT(set->ks_nreqs <= cpc_ncounters);
1674 
1675         /*
1676          * Provide kcpc_tryassign() with scratch space to avoid doing an
1677          * alloc/free with every invocation.
1678          */
1679         picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
1680         /*
1681          * kcpc_tryassign() blindly walks through each request in the set,
1682          * seeing if a counter can count its event. If yes, it assigns that
1683          * counter. However, that counter may have been the only capable counter
1684          * for _another_ request's event. The solution is to try every possible
1685          * request first. Note that this does not cover all solutions, as
1686          * that would require all unique orderings of requests, an n^n operation
1687          * which would be unacceptable for architectures with many counters.
1688          */
1689         for (i = 0; i < set->ks_nreqs; i++)
1690                 if (kcpc_tryassign(set, i, picnum_save) == 0)
1691                         break;
1692 
1693         kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
1694         if (i == set->ks_nreqs)
1695                 return (-1);
1696         return (0);
1697 }
1698 
1699 static int
1700 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
1701 {
1702         int             i;
1703         int             j;
1704         uint64_t        bitmap = 0, resmap = 0;
1705         uint64_t        ctrmap;
1706 
1707         /*
1708          * We are attempting to assign the reqs to pics, but we may fail. If we
1709          * fail, we need to restore the state of the requests to what it was
1710          * when we found it, as some reqs may have been explicitly assigned to
1711          * a specific PIC beforehand. We do this by snapshotting the assignments
1712          * now and restoring from it later if we fail.
1713          *
1714          * Also we note here which counters have already been claimed by
1715          * requests with explicit counter assignments.
1716          */
1717         for (i = 0; i < set->ks_nreqs; i++) {
1718                 scratch[i] = set->ks_req[i].kr_picnum;
1719                 if (set->ks_req[i].kr_picnum != -1)
1720                         resmap |= (1 << set->ks_req[i].kr_picnum);
1721         }
1722 
1723         /*
1724          * Walk through requests assigning them to the first PIC that is
1725          * capable.
1726          */
1727         i = starting_req;
1728         do {
1729                 if (set->ks_req[i].kr_picnum != -1) {
1730                         ASSERT((bitmap & (1 << set->ks_req[i].kr_picnum)) == 0);
1731                         bitmap |= (1 << set->ks_req[i].kr_picnum);
1732                         if (++i == set->ks_nreqs)
1733                                 i = 0;
1734                         continue;
1735                 }
1736 
1737                 ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
1738                 for (j = 0; j < cpc_ncounters; j++) {
1739                         if (ctrmap & (1 << j) && (bitmap & (1 << j)) == 0 &&
1740                             (resmap & (1 << j)) == 0) {
1741                                 /*
1742                                  * We can assign this counter because:
1743                                  *
1744                                  * 1. It can count the event (ctrmap)
1745                                  * 2. It hasn't been assigned yet (bitmap)
1746                                  * 3. It wasn't reserved by a request (resmap)
1747                                  */
1748                                 bitmap |= (1 << j);
1749                                 break;
1750                         }
1751                 }
1752                 if (j == cpc_ncounters) {
1753                         for (i = 0; i < set->ks_nreqs; i++)
1754                                 set->ks_req[i].kr_picnum = scratch[i];
1755                         return (-1);
1756                 }
1757                 set->ks_req[i].kr_picnum = j;
1758 
1759                 if (++i == set->ks_nreqs)
1760                         i = 0;
1761         } while (i != starting_req);
1762 
1763         return (0);
1764 }
1765 
1766 kcpc_set_t *
1767 kcpc_dup_set(kcpc_set_t *set)
1768 {
1769         kcpc_set_t      *new;
1770         int             i;
1771         int             j;
1772 
1773         new = kmem_zalloc(sizeof (*new), KM_SLEEP);
1774         new->ks_state &= ~KCPC_SET_BOUND;
1775         new->ks_flags = set->ks_flags;
1776         new->ks_nreqs = set->ks_nreqs;
1777         new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
1778             KM_SLEEP);
1779         new->ks_data = NULL;
1780         new->ks_ctx = NULL;
1781 
1782         for (i = 0; i < new->ks_nreqs; i++) {
1783                 new->ks_req[i].kr_config = NULL;
1784                 new->ks_req[i].kr_index = set->ks_req[i].kr_index;
1785                 new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
1786                 new->ks_req[i].kr_picp = NULL;
1787                 new->ks_req[i].kr_data = NULL;
1788                 (void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
1789                     CPC_MAX_EVENT_LEN);
1790                 new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
1791                 new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
1792                 new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
1793                 new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
1794                     sizeof (kcpc_attr_t), KM_SLEEP);
1795                 for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
1796                         new->ks_req[i].kr_attr[j].ka_val =
1797                             set->ks_req[i].kr_attr[j].ka_val;
1798                         (void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
1799                             set->ks_req[i].kr_attr[j].ka_name,
1800                             CPC_MAX_ATTR_LEN);
1801                 }
1802         }
1803 
1804         return (new);
1805 }
1806 
1807 int
1808 kcpc_allow_nonpriv(void *token)
1809 {
1810         return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
1811 }
1812 
1813 void
1814 kcpc_invalidate(kthread_t *t)
1815 {
1816         kcpc_ctx_t *ctx = t->t_cpc_ctx;
1817 
1818         if (ctx != NULL)
1819                 KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1820 }
1821 
1822 /*
1823  * Given a PCBE ID, attempt to load a matching PCBE module. The strings given
1824  * are used to construct PCBE names, starting with the most specific,
1825  * "pcbe.first.second.third.fourth" and ending with the least specific,
1826  * "pcbe.first".
1827  *
1828  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
1829  */
1830 int
1831 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
1832 {
1833         uint_t s[3];
1834 
1835         s[0] = first;
1836         s[1] = second;
1837         s[2] = third;
1838 
1839         return (modload_qualified("pcbe",
1840             "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0);
1841 }
1842 
1843 /*
1844  * Create one or more CPC context for given CPU with specified counter event
1845  * requests
1846  *
1847  * If number of requested counter events is less than or equal number of
1848  * hardware counters on a CPU and can all be assigned to the counters on a CPU
1849  * at the same time, then make one CPC context.
1850  *
1851  * Otherwise, multiple CPC contexts are created to allow multiplexing more
1852  * counter events than existing counters onto the counters by iterating through
1853  * all of the CPC contexts, programming the counters with each CPC context one
1854  * at a time and measuring the resulting counter values.  Each of the resulting
1855  * CPC contexts contains some number of requested counter events less than or
1856  * equal the number of counters on a CPU depending on whether all the counter
1857  * events can be programmed on all the counters at the same time or not.
1858  *
1859  * Flags to kmem_{,z}alloc() are passed in as an argument to allow specifying
1860  * whether memory allocation should be non-blocking or not.  The code will try
1861  * to allocate *whole* CPC contexts if possible.  If there is any memory
1862  * allocation failure during the allocations needed for a given CPC context, it
1863  * will skip allocating that CPC context because it cannot allocate the whole
1864  * thing.  Thus, the only time that it will end up allocating none (ie. no CPC
1865  * contexts whatsoever) is when it cannot even allocate *one* whole CPC context
1866  * without a memory allocation failure occurring.
1867  */
/*
 * Returns the number of CPC contexts created (> 0), -1 on invalid
 * arguments, or -2 if not even one whole context could be allocated.  On
 * success, *ctx_ptr_array and *ctx_ptr_array_sz describe a kmem-allocated
 * array of context pointers that the caller must eventually free.
 */
int
kcpc_cpu_ctx_create(cpu_t *cp, kcpc_request_list_t *req_list, int kmem_flags,
    kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz)
{
        kcpc_ctx_t      **ctx_ptrs;
        int             nctx;
        int             nctx_ptrs;
        int             nreqs;
        kcpc_request_t  *reqs;

        if (cp == NULL || ctx_ptr_array == NULL || ctx_ptr_array_sz == NULL ||
            req_list == NULL || req_list->krl_cnt < 1)
                return (-1);

        /*
         * Allocate number of sets assuming that each set contains one and only
         * one counter event request for each counter on a CPU
         */
        nreqs = req_list->krl_cnt;
        nctx_ptrs = (nreqs + cpc_ncounters - 1) / cpc_ncounters;
        ctx_ptrs = kmem_zalloc(nctx_ptrs * sizeof (kcpc_ctx_t *), kmem_flags);
        if (ctx_ptrs == NULL)
                return (-2);

        /*
         * Fill in sets of requests
         */
        nctx = 0;
        reqs = req_list->krl_list;
        while (nreqs > 0) {
                kcpc_ctx_t      *ctx;
                kcpc_set_t      *set;
                int             subcode;

                /*
                 * Allocate CPC context and set for requested counter events.
                 * NOTE(review): ctx is not NULL-checked here; presumably
                 * kcpc_ctx_alloc() cannot fail for these flags -- confirm.
                 */
                ctx = kcpc_ctx_alloc(kmem_flags);
                set = kcpc_set_create(reqs, nreqs, 0, kmem_flags);
                if (set == NULL) {
                        kcpc_ctx_free(ctx);
                        break;
                }

                /*
                 * Determine assignment of requested counter events to specific
                 * counters
                 */
                if (kcpc_assign_reqs(set, ctx) != 0) {
                        /*
                         * May not be able to assign requested counter events
                         * to all counters since all counters may not be able
                         * to do all events, so only do one counter event in
                         * set of counter requests when this happens since at
                         * least one of the counters must be able to do the
                         * event.
                         */
                        kcpc_free_set(set);
                        set = kcpc_set_create(reqs, 1, 0, kmem_flags);
                        if (set == NULL) {
                                kcpc_ctx_free(ctx);
                                break;
                        }
                        if (kcpc_assign_reqs(set, ctx) != 0) {
#ifdef DEBUG
                                cmn_err(CE_NOTE, "!kcpc_cpu_ctx_create: can't "
                                    "assign counter event %s!\n",
                                    set->ks_req->kr_event);
#endif
                                /* Skip this unassignable request entirely. */
                                kcpc_free_set(set);
                                kcpc_ctx_free(ctx);
                                reqs++;
                                nreqs--;
                                continue;
                        }
                }

                /*
                 * Allocate memory needed to hold requested counter event data
                 */
                set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t),
                    kmem_flags);
                if (set->ks_data == NULL) {
                        kcpc_free_set(set);
                        kcpc_ctx_free(ctx);
                        break;
                }

                /*
                 * Configure requested counter events
                 */
                if (kcpc_configure_reqs(ctx, set, &subcode) != 0) {
#ifdef DEBUG
                        cmn_err(CE_NOTE,
                            "!kcpc_cpu_ctx_create: can't configure "
                            "set of counter event requests!\n");
#endif
                        /* Skip the whole set; move on to the next batch. */
                        reqs += set->ks_nreqs;
                        nreqs -= set->ks_nreqs;
                        kmem_free(set->ks_data,
                            set->ks_nreqs * sizeof (uint64_t));
                        kcpc_free_set(set);
                        kcpc_ctx_free(ctx);
                        continue;
                }

                /*
                 * Point set of counter event requests at this context and fill
                 * in CPC context
                 */
                set->ks_ctx = ctx;
                ctx->kc_set = set;
                ctx->kc_cpuid = cp->cpu_id;
                ctx->kc_thread = curthread;

                ctx_ptrs[nctx] = ctx;

                /*
                 * Update requests and how many are left to be assigned to sets
                 */
                reqs += set->ks_nreqs;
                nreqs -= set->ks_nreqs;

                /*
                 * Increment number of CPC contexts and allocate bigger array
                 * for context pointers as needed
                 */
                nctx++;
                if (nctx >= nctx_ptrs) {
                        kcpc_ctx_t      **new;
                        int             new_cnt;

                        /*
                         * Allocate more CPC contexts based on how many
                         * contexts allocated so far and how many counter
                         * requests left to assign
                         */
                        new_cnt = nctx_ptrs +
                            ((nreqs + cpc_ncounters - 1) / cpc_ncounters);
                        new = kmem_zalloc(new_cnt * sizeof (kcpc_ctx_t *),
                            kmem_flags);
                        if (new == NULL)
                                break;

                        /*
                         * Copy contents of old sets into new ones
                         */
                        bcopy(ctx_ptrs, new,
                            nctx_ptrs * sizeof (kcpc_ctx_t *));

                        /*
                         * Free old array of context pointers and use newly
                         * allocated one instead now
                         */
                        kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *));
                        ctx_ptrs = new;
                        nctx_ptrs = new_cnt;
                }
        }

        /*
         * Return NULL if no CPC contexts filled in
         */
        if (nctx == 0) {
                kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *));
                *ctx_ptr_array = NULL;
                *ctx_ptr_array_sz = 0;
                return (-2);
        }

        *ctx_ptr_array = ctx_ptrs;
        *ctx_ptr_array_sz = nctx_ptrs * sizeof (kcpc_ctx_t *);
        return (nctx);
}
2042 
2043 /*
2044  * Return whether PCBE supports given counter event
2045  */
2046 boolean_t
2047 kcpc_event_supported(char *event)
2048 {
2049         if (pcbe_ops == NULL || pcbe_ops->pcbe_event_coverage(event) == 0)
2050                 return (B_FALSE);
2051 
2052         return (B_TRUE);
2053 }
2054 
2055 /*
2056  * Program counters on current CPU with given CPC context
2057  *
2058  * If kernel is interposing on counters to measure hardware capacity and
2059  * utilization, then unprogram counters for kernel *before* programming them
2060  * with specified CPC context.
2061  *
2062  * kcpc_{program,unprogram}() may be called either directly by a thread running
2063  * on the target CPU or from a cross-call from another CPU. To protect
2064  * programming and unprogramming from being interrupted by cross-calls, callers
2065  * who execute kcpc_{program,unprogram} should raise PIL to the level used by
2066  * cross-calls.
2067  */
void
kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, boolean_t cu_interpose)
{
        int     error;

        ASSERT(IS_HIPIL());

        /*
         * CPC context shouldn't be NULL, its CPU field should specify current
         * CPU or be -1 to specify any CPU when the context is bound to a
         * thread, and preemption should be disabled
         */
        ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id ||
            ctx->kc_cpuid == -1) && curthread->t_preempt > 0);
        /*
         * Repeat the ASSERT's checks at runtime so non-DEBUG kernels fail
         * safe (do nothing) rather than programming the wrong CPU's counters.
         */
        if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id &&
            ctx->kc_cpuid != -1) || curthread->t_preempt < 1)
                return;

        /*
         * Unprogram counters for kernel measuring hardware capacity and
         * utilization
         */
        if (cu_interpose == B_TRUE) {
                cu_cpc_unprogram(CPU, &error);
        } else {
                kcpc_set_t *set = ctx->kc_set;
                int i;

                ASSERT(set != NULL);

                /*
                 * Since cu_interpose is false, we are programming CU context.
                 * In general, PCBE can continue from the state saved in the
                 * set, but it is not very reliable, so we start again from the
                 * preset value.
                 */
                for (i = 0; i < set->ks_nreqs; i++) {
                        /*
                         * Reset the virtual counter value to the preset value.
                         */
                        *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;

                        /*
                         * Reset PCBE to the preset value.
                         */
                        pcbe_ops->pcbe_configure(0, NULL,
                            set->ks_req[i].kr_preset,
                            0, 0, NULL, &set->ks_req[i].kr_config, NULL);
                }
        }

        /*
         * Program counters with specified CPC context.  The tick snapshot is
         * taken first so it serves as the baseline for this programming run.
         */
        ctx->kc_rawtick = KCPC_GET_TICK();
        pcbe_ops->pcbe_program(ctx);

        /*
         * Denote that counters programmed for thread or CPU CPC context
         * differently
         */
        if (for_thread == B_TRUE)
                KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
        else
                CPU->cpu_cpc_ctx = ctx;
}
2134 
2135 /*
2136  * Unprogram counters with given CPC context on current CPU
2137  *
2138  * If kernel is interposing on counters to measure hardware capacity and
2139  * utilization, then program counters for the kernel capacity and utilization
2140  * *after* unprogramming them for given CPC context.
2141  *
2142  * See the comment for kcpc_program regarding the synchronization with
2143  * cross-calls.
2144  */
void
kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose)
{
        int     error;

        ASSERT(IS_HIPIL());

        /*
         * CPC context shouldn't be NULL, its CPU field should specify current
         * CPU or be -1 to specify any CPU when the context is bound to a
         * thread, and preemption should be disabled
         */
        ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id ||
            ctx->kc_cpuid == -1) && curthread->t_preempt > 0);

        /*
         * Runtime mirror of the ASSERT above for non-DEBUG kernels; also
         * return silently if this context's counters were already stopped
         * (KCPC_CTX_INVALID_STOPPED), making the call idempotent.
         */
        if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id &&
            ctx->kc_cpuid != -1) || curthread->t_preempt < 1 ||
            (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) != 0) {
                return;
        }

        /*
         * Specified CPC context to be unprogrammed should be bound to current
         * CPU or thread
         */
        ASSERT(CPU->cpu_cpc_ctx == ctx || curthread->t_cpc_ctx == ctx);

        /*
         * Stop counters and record that fact in the context flags so a
         * subsequent unprogram of the same context is a no-op.
         */
        pcbe_ops->pcbe_allstop();
        KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED);

        /*
         * Allow kernel to interpose on counters and program them for its own
         * use to measure hardware capacity and utilization if cu_interpose
         * argument is true
         */
        if (cu_interpose == B_TRUE)
                cu_cpc_program(CPU, &error);
}
2186 
2187 /*
2188  * Read CPU Performance Counter (CPC) on current CPU and call specified update
2189  * routine with data for each counter event currently programmed on CPU
2190  */
/*
 * Returns 0 if there is nothing programmed on this CPU; otherwise returns
 * the last negative value returned by update_func, or 0 if every call to
 * update_func succeeded.  update_func receives each request's kr_ptr (the
 * caller-supplied cookie from kcpc_reqs_add()) and its current sampled
 * counter value.
 */
int
kcpc_read(kcpc_update_func_t update_func)
{
        kcpc_ctx_t      *ctx;
        int             i;
        kcpc_request_t  *req;
        int             retval;
        kcpc_set_t      *set;

        ASSERT(IS_HIPIL());

        /*
         * Can't grab locks or block because may be called inside dispatcher
         */
        kpreempt_disable();

        ctx = CPU->cpu_cpc_ctx;
        if (ctx == NULL) {
                kpreempt_enable();
                return (0);
        }

        /*
         * Read counter data from current CPU
         */
        pcbe_ops->pcbe_sample(ctx);

        set = ctx->kc_set;
        if (set == NULL || set->ks_req == NULL) {
                kpreempt_enable();
                return (0);
        }

        /*
         * Call update function with preset pointer and data for each CPC event
         * request currently programmed on current CPU
         */
        req = set->ks_req;
        retval = 0;
        for (i = 0; i < set->ks_nreqs; i++) {
                int     ret;

                /*
                 * A NULL kr_data marks a request that was never configured;
                 * presumably requests are configured in order, so stop at the
                 * first one -- TODO confirm against kcpc_configure_reqs().
                 */
                if (req[i].kr_data == NULL)
                        break;

                ret = update_func(req[i].kr_ptr, *req[i].kr_data);
                if (ret < 0)
                        retval = ret;
        }

        kpreempt_enable();

        return (retval);
}
2245 
2246 /*
2247  * Initialize list of counter event requests
2248  */
2249 kcpc_request_list_t *
2250 kcpc_reqs_init(int nreqs, int kmem_flags)
2251 {
2252         kcpc_request_list_t     *req_list;
2253         kcpc_request_t          *reqs;
2254 
2255         if (nreqs < 1)
2256                 return (NULL);
2257 
2258         req_list = kmem_zalloc(sizeof (kcpc_request_list_t), kmem_flags);
2259         if (req_list == NULL)
2260                 return (NULL);
2261 
2262         reqs = kmem_zalloc(nreqs * sizeof (kcpc_request_t), kmem_flags);
2263         if (reqs == NULL) {
2264                 kmem_free(req_list, sizeof (kcpc_request_list_t));
2265                 return (NULL);
2266         }
2267 
2268         req_list->krl_list = reqs;
2269         req_list->krl_cnt = 0;
2270         req_list->krl_max = nreqs;
2271         return (req_list);
2272 }
2273 
2274 
2275 /*
2276  * Add counter event request to given list of counter event requests
2277  */
2278 int
2279 kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, uint64_t preset,
2280     uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, int kmem_flags)
2281 {
2282         kcpc_request_t  *req;
2283 
2284         if (req_list == NULL || req_list->krl_list == NULL)
2285                 return (-1);
2286 
2287         ASSERT(req_list->krl_max != 0);
2288 
2289         /*
2290          * Allocate more space (if needed)
2291          */
2292         if (req_list->krl_cnt > req_list->krl_max) {
2293                 kcpc_request_t  *new;
2294                 kcpc_request_t  *old;
2295 
2296                 old = req_list->krl_list;
2297                 new = kmem_zalloc((req_list->krl_max +
2298                     cpc_ncounters) * sizeof (kcpc_request_t), kmem_flags);
2299                 if (new == NULL)
2300                         return (-2);
2301 
2302                 req_list->krl_list = new;
2303                 bcopy(old, req_list->krl_list,
2304                     req_list->krl_cnt * sizeof (kcpc_request_t));
2305                 kmem_free(old, req_list->krl_max * sizeof (kcpc_request_t));
2306                 req_list->krl_cnt = 0;
2307                 req_list->krl_max += cpc_ncounters;
2308         }
2309 
2310         /*
2311          * Fill in request as much as possible now, but some fields will need
2312          * to be set when request is assigned to a set.
2313          */
2314         req = &req_list->krl_list[req_list->krl_cnt];
2315         req->kr_config = NULL;
2316         req->kr_picnum = -1; /* have CPC pick this */
2317         req->kr_index = -1;  /* set when assigning request to set */
2318         req->kr_data = NULL; /* set when configuring request */
2319         (void) strcpy(req->kr_event, event);
2320         req->kr_preset = preset;
2321         req->kr_flags = flags;
2322         req->kr_nattrs = nattrs;
2323         req->kr_attr = attr;
2324         /*
2325          * Keep pointer given by caller to give to update function when this
2326          * counter event is sampled/read
2327          */
2328         req->kr_ptr = ptr;
2329 
2330         req_list->krl_cnt++;
2331 
2332         return (0);
2333 }
2334 
2335 /*
2336  * Reset list of CPC event requests so its space can be used for another set
2337  * of requests
2338  */
2339 int
2340 kcpc_reqs_reset(kcpc_request_list_t *req_list)
2341 {
2342         /*
2343          * Return when pointer to request list structure or request is NULL or
2344          * when max requests is less than or equal to 0
2345          */
2346         if (req_list == NULL || req_list->krl_list == NULL ||
2347             req_list->krl_max <= 0)
2348                 return (-1);
2349 
2350         /*
2351          * Zero out requests and number of requests used
2352          */
2353         bzero(req_list->krl_list, req_list->krl_max * sizeof (kcpc_request_t));
2354         req_list->krl_cnt = 0;
2355         return (0);
2356 }
2357 
2358 /*
2359  * Free given list of counter event requests
2360  */
2361 int
2362 kcpc_reqs_fini(kcpc_request_list_t *req_list)
2363 {
2364         kmem_free(req_list->krl_list,
2365             req_list->krl_max * sizeof (kcpc_request_t));
2366         kmem_free(req_list, sizeof (kcpc_request_list_t));
2367         return (0);
2368 }
2369 
2370 /*
2371  * Create set of given counter event requests
2372  */
2373 static kcpc_set_t *
2374 kcpc_set_create(kcpc_request_t *reqs, int nreqs, int set_flags, int kmem_flags)
2375 {
2376         int             i;
2377         kcpc_set_t      *set;
2378 
2379         /*
2380          * Allocate set and assign number of requests in set and flags
2381          */
2382         set = kmem_zalloc(sizeof (kcpc_set_t), kmem_flags);
2383         if (set == NULL)
2384                 return (NULL);
2385 
2386         if (nreqs < cpc_ncounters)
2387                 set->ks_nreqs = nreqs;
2388         else
2389                 set->ks_nreqs = cpc_ncounters;
2390 
2391         set->ks_flags = set_flags;
2392 
2393         /*
2394          * Allocate requests needed, copy requests into set, and set index into
2395          * data for each request (which may change when we assign requested
2396          * counter events to counters)
2397          */
2398         set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
2399             set->ks_nreqs, kmem_flags);
2400         if (set->ks_req == NULL) {
2401                 kmem_free(set, sizeof (kcpc_set_t));
2402                 return (NULL);
2403         }
2404 
2405         bcopy(reqs, set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
2406 
2407         for (i = 0; i < set->ks_nreqs; i++)
2408                 set->ks_req[i].kr_index = i;
2409 
2410         return (set);
2411 }
2412 
2413 
2414 /*
2415  * Stop counters on current CPU.
2416  *
2417  * If preserve_context is true, the caller is interested in the CPU's CPC
2418  * context and wants it to be preserved.
2419  *
2420  * If preserve_context is false, the caller does not need the CPU's CPC context
2421  * to be preserved, so it is set to NULL.
2422  */
/*
 * Cross-call target for kcpc_cpu_stop().
 *
 * arg1 carries the caller's preserve_context flag (a boolean_t smuggled
 * through a uintptr_t); arg2 is unused.
 */
static void
kcpc_cpustop_func(uintptr_t arg1, uintptr_t arg2 __unused)
{
        boolean_t preserve_context;
        kpreempt_disable();

        preserve_context = (boolean_t)arg1;
        /*
         * Someone already stopped this context before us, so there is nothing
         * to do.
         */
        if (CPU->cpu_cpc_ctx == NULL) {
                kpreempt_enable();
                return;
        }

        /*
         * Stop this CPU's counters; cu_interpose is B_TRUE so capacity/
         * utilization counting may take them over afterwards.
         */
        kcpc_unprogram(CPU->cpu_cpc_ctx, B_TRUE);
        /*
         * If CU does not use counters, then clear the CPU's CPC context
         * If the caller requested to preserve context it should disable CU
         * first, so there should be no CU context now.
         */
        ASSERT(!preserve_context || !CU_CPC_ON(CPU));
        if (!preserve_context && CPU->cpu_cpc_ctx != NULL && !CU_CPC_ON(CPU))
                CPU->cpu_cpc_ctx = NULL;

        kpreempt_enable();
}
2451 
2452 /*
2453  * Stop counters on given CPU and set its CPC context to NULL unless
2454  * preserve_context is true.
2455  */
2456 void
2457 kcpc_cpu_stop(cpu_t *cp, boolean_t preserve_context)
2458 {
2459         cpu_call(cp, kcpc_cpustop_func, preserve_context, 0);
2460 }
2461 
2462 /*
2463  * Program the context on the current CPU
2464  */
2465 static void
2466 kcpc_remoteprogram_func(uintptr_t arg1, uintptr_t arg2)
2467 {
2468         kcpc_ctx_t *ctx = (kcpc_ctx_t *)arg1;
2469         boolean_t for_thread = (boolean_t)arg2;
2470 
2471         ASSERT(ctx != NULL);
2472 
2473         kpreempt_disable();
2474         kcpc_program(ctx, for_thread, B_TRUE);
2475         kpreempt_enable();
2476 }
2477 
2478 /*
2479  * Program counters on given CPU
2480  */
2481 void
2482 kcpc_cpu_program(cpu_t *cp, kcpc_ctx_t *ctx)
2483 {
2484         cpu_call(cp, kcpc_remoteprogram_func, (uintptr_t)ctx,
2485             (uintptr_t)B_FALSE);
2486 }
2487 
2488 char *
2489 kcpc_list_attrs(void)
2490 {
2491         ASSERT(pcbe_ops != NULL);
2492 
2493         return (pcbe_ops->pcbe_list_attrs());
2494 }
2495 
2496 char *
2497 kcpc_list_events(uint_t pic)
2498 {
2499         ASSERT(pcbe_ops != NULL);
2500 
2501         return (pcbe_ops->pcbe_list_events(pic));
2502 }
2503 
2504 uint_t
2505 kcpc_pcbe_capabilities(void)
2506 {
2507         ASSERT(pcbe_ops != NULL);
2508 
2509         return (pcbe_ops->pcbe_caps);
2510 }
2511 
2512 int
2513 kcpc_pcbe_loaded(void)
2514 {
2515         return (pcbe_ops == NULL ? -1 : 0);
2516 }