/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008-2009, Intel Corporation.
 * All Rights Reserved.
 */

#define MAX_TAG         8
#define MAX_STACK       64

#pragma D option aggsize=8m
#pragma D option bufsize=16m
#pragma D option dynvarsize=16m
#pragma D option aggrate=0
#pragma D option stackframes=MAX_STACK
/*
 * Our D script needs to compile even if some of the TRANSLATE probes cannot
 * be found. Missing probes can be caused by an older kernel, a different
 * architecture, unloaded modules, etc.
 */
#pragma D option zdefs
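
/*
 * Illustrative invocation (assumption: the script is saved as latencytop.d;
 * the exact defines passed by the latencytop tool may differ). The TRACE_
 * and ENABLE_ macros used below are supplied via the C preprocessor, e.g.:
 *
 *      dtrace -C -D TRACE_PID=1234 -D ENABLE_FILTER -s latencytop.d
 *
 * -C runs the C preprocessor over the script; -D defines a macro for it.
 */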

#if defined(ENABLE_SCHED)
#if defined(TRACE_PID)
#define TRACE_FILTER    / pid == 0 || pid == TRACE_PID /
#define TRACE_FILTER_COND(a)    / (pid == 0 || pid == TRACE_PID) && (a) /
#elif defined(TRACE_PGID)
#define TRACE_FILTER    / pid == 0 || curpsinfo->pr_pgid == TRACE_PGID /
#define TRACE_FILTER_COND(a)    \
    / (pid == 0 || curpsinfo->pr_pgid == TRACE_PGID) && (a) /
#else
#define TRACE_FILTER
#define TRACE_FILTER_COND(a)    / (a) /
#endif
#else   /* ENABLE_SCHED */
#if defined(TRACE_PID)
#define TRACE_FILTER    / pid == TRACE_PID /
#define TRACE_FILTER_COND(a)    / (pid == TRACE_PID) && (a) /
#elif defined(TRACE_PGID)
#define TRACE_FILTER    / curpsinfo->pr_pgid == TRACE_PGID /
#define TRACE_FILTER_COND(a)    / (curpsinfo->pr_pgid == TRACE_PGID) && (a) /
#else
#define TRACE_FILTER    / pid != 0 /
#define TRACE_FILTER_COND(a)    / (pid != 0) && (a) /
#endif
#endif /* ENABLE_SCHED */
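
/*
 * Worked expansion example (assuming -D TRACE_PID=1234 and no ENABLE_SCHED):
 * a clause written as
 *
 *      sched::resume:off-cpu
 *      TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
 *
 * expands to
 *
 *      sched::resume:off-cpu
 *      / (pid == 1234) && (curlwpsinfo->pr_state == SSLEEP) /
 *
 * i.e., the macros simply prepend the chosen process filter to each
 * clause's predicate.
 */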

/* Threshold used to filter WAKEABLE latencies, in nanoseconds (5 ms). */
#define FILTER_THRESHOLD        5000000
/* From thread.h */
#define T_WAKEABLE              2

/*
 * This associative array stores the timestamp at which each thread is
 * enqueued on a dispatch queue, keyed by (pid, lwpid). A global array is
 * required because the enqueue probe fires in the context of the thread
 * doing the wakeup, so self-> variables of the enqueued thread are not
 * accessible there.
 */
unsigned long long lt_timestamps[int, int];

self unsigned int lt_is_block_wakeable;
self unsigned long long lt_sleep_start;
self unsigned long long lt_sleep_duration;
self unsigned long long lt_sch_delay;
self unsigned int lt_counter;           /* only used in low-overhead mode */
self unsigned long long lt_timestamp;   /* only used in low-overhead mode */
self unsigned int lt_stackp;
self unsigned int lt_prio[int];
self string lt_cause[int];

this unsigned int priority;
this string cause;

/*
 * Clean up everything on LWP exit; otherwise we would run out of memory.
 */
proc:::lwp-exit
{
        lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] = 0;

        self->lt_sleep_start = 0;
        self->lt_is_block_wakeable = 0;
        self->lt_counter = 0;
        self->lt_timestamp = 0;

        /*
         * Workaround: there is no way to clear an entire associative array,
         * so we have to clear indices 0 through (MAX_TAG - 1) by hand.
         */

        self->lt_prio[0] = 0;
        self->lt_prio[1] = 0;
        self->lt_prio[2] = 0;
        self->lt_prio[3] = 0;
        self->lt_prio[4] = 0;
        self->lt_prio[5] = 0;
        self->lt_prio[6] = 0;
        self->lt_prio[7] = 0;

        self->lt_cause[0] = 0;
        self->lt_cause[1] = 0;
        self->lt_cause[2] = 0;
        self->lt_cause[3] = 0;
        self->lt_cause[4] = 0;
        self->lt_cause[5] = 0;
        self->lt_cause[6] = 0;
        self->lt_cause[7] = 0;
}

#if !defined(ENABLE_LOW_OVERHEAD)
/*
 * Log timestamp when a thread is taken off the CPU.
 */
sched::resume:off-cpu
TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
{
        self->lt_sleep_start = timestamp;
        self->lt_is_block_wakeable = curthread->t_flag & T_WAKEABLE;

        lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] =
            self->lt_sleep_start;
}

/*
 * Log timestamp when a thread is put on a dispatch queue and becomes
 * runnable.
 */
sched:::enqueue
/lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] != 0/
{
        lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] = timestamp;
}

/*
 * Calculate latency when the thread is actually on the CPU.
 * This is necessary in order to get the right stack.
 */
this unsigned long long end;
this unsigned long long now;
sched::resume:on-cpu
/self->lt_sleep_start != 0/
{
        this->end = lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid];
        this->now = timestamp;
        lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] = 0;
        this->end = (this->end != 0 && this->end != self->lt_sleep_start)
            ? this->end : this->now;

        self->lt_sch_delay = this->now - this->end;
        self->lt_sleep_duration = this->end - self->lt_sleep_start;
        self->lt_sleep_start = 0;
}
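
/*
 * Worked example with hypothetical timestamps: a thread enters SSLEEP and
 * goes off-cpu at t0 = 100 (lt_sleep_start), is woken and enqueued at
 * t1 = 600 (recorded in lt_timestamps[]), and gets back on a CPU at
 * t2 = 650. The clause above then computes
 *
 *      lt_sch_delay      = t2 - t1 = 50    (wait on the dispatch queue)
 *      lt_sleep_duration = t1 - t0 = 500   (time spent blocked)
 *
 * If no enqueue was observed, lt_timestamps[] still holds 0 or
 * lt_sleep_start, so this->end collapses to this->now: lt_sch_delay
 * becomes 0 and the entire off-cpu time is counted as sleep.
 */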

/*
 * Filter: drop all "large" latencies when the thread is interruptible,
 * i.e., blocked in sleep() and the like.
 */
#if defined(ENABLE_FILTER)
sched::resume:on-cpu
/self->lt_sleep_duration > FILTER_THRESHOLD &&
  self->lt_is_block_wakeable != 0/
{
        self->lt_sch_delay = 0;
        self->lt_sleep_duration = 0;
        self->lt_is_block_wakeable = 0;
}
#endif /* defined(ENABLE_FILTER) */

/*
 * Write sleep time to the aggregations.
 * lt_sleep_duration is the interval between the time when a thread is taken
 * off the CPU and the time when it is enqueued again.
 */
sched::resume:on-cpu
/self->lt_sleep_duration != 0/
{
        this->cause = self->lt_stackp > 0 ?
            self->lt_cause[self->lt_stackp - 1] : "";
        this->priority = self->lt_stackp > 0 ?
            self->lt_prio[self->lt_stackp - 1] : 0;

        @lt_call_count[pid, tid, stack(), this->cause,
            this->priority] = count();
        @lt_call_sum[pid, tid, stack(), this->cause,
            this->priority] = sum(self->lt_sleep_duration);
        @lt_call_max[pid, tid, stack(), this->cause,
            this->priority] = max(self->lt_sleep_duration);

        self->lt_is_block_wakeable = 0;      /* Clear the flag to avoid a leak */
        self->lt_sleep_duration = 0;
}

/*
 * Write time spent on the dispatch queue to the aggregations.
 * lt_sch_delay is the interval between the time when a thread becomes
 * runnable and the time when it is actually on the CPU.
 */
sched::resume:on-cpu
/self->lt_sch_delay != 0/
{
        @lt_named_count[pid, tid, "Wait for available CPU"] = count();
        @lt_named_sum[pid, tid, "Wait for available CPU"] =
            sum(self->lt_sch_delay);
        @lt_named_max[pid, tid, "Wait for available CPU"] =
            max(self->lt_sch_delay);

        self->lt_sch_delay = 0;
}

/*
 * Probes to track latency caused by spinning on a lock.
 */
lockstat:::adaptive-spin
TRACE_FILTER
{
        @lt_named_count[pid, tid, "Adapt. lock spin"] = count();
        @lt_named_sum[pid, tid, "Adapt. lock spin"] = sum(arg1);
        @lt_named_max[pid, tid, "Adapt. lock spin"] = max(arg1);
}

lockstat:::spin-spin
TRACE_FILTER
{
        @lt_named_count[pid, tid, "Spinlock spin"] = count();
        @lt_named_sum[pid, tid, "Spinlock spin"] = sum(arg1);
        @lt_named_max[pid, tid, "Spinlock spin"] = max(arg1);
}

/*
 * Probes to track latency caused by blocking on a lock.
 */
lockstat:::adaptive-block
TRACE_FILTER
{
        @lt_named_count[pid, tid, "#Adapt. lock block"] = count();
        @lt_named_sum[pid, tid, "#Adapt. lock block"] = sum(arg1);
        @lt_named_max[pid, tid, "#Adapt. lock block"] = max(arg1);
}

lockstat:::rw-block
TRACE_FILTER
{
        @lt_named_count[pid, tid, "#RW. lock block"] = count();
        @lt_named_sum[pid, tid, "#RW. lock block"] = sum(arg1);
        @lt_named_max[pid, tid, "#RW. lock block"] = max(arg1);
}

#if defined(ENABLE_SYNCOBJ)
/*
 * Probes to track latency caused by synchronization objects.
 */
this int stype;
this unsigned long long wchan;
this unsigned long long wtime;

sched:::wakeup
/*
 * Currently we are unable to track wakeups from sched (pid 0), because all
 * of its LWP IDs are zero when we trace it, which makes lt_timestamps
 * unusable.
 */
/args[1]->pr_pid != 0 &&
    lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] != 0/
{
        this->stype = args[0]->pr_stype;
        this->wchan = args[0]->pr_wchan;
        /*
         * We can use lt_timestamps[] here because wakeup always fires
         * before enqueue; after enqueue, lt_timestamps[] is overwritten.
         */
        this->wtime = timestamp - lt_timestamps[args[1]->pr_pid,
            args[0]->pr_lwpid];

        @lt_sync_count[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
            this->wchan] = count();
        @lt_sync_sum[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
            this->wchan] = sum(this->wtime);
        @lt_sync_max[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
            this->wchan] = max(this->wtime);
}
#endif /* defined(ENABLE_SYNCOBJ) */

#else /* !defined(ENABLE_LOW_OVERHEAD) */

/*
 * This is the low-overhead mode.
 * In order to reduce the number of instructions executed on each off-cpu
 * and on-cpu event, we do the following:
 *
 *      1. Use sampling: update the aggregations only roughly once every
 *         SAMPLE_TIMES (100) occurrences.
 *      2. Do not track anything beyond what is needed for the "main" window.
 *      3. Use as few thread-local variables as possible.
 */

#define SAMPLE_TIMES            100
#define SAMPLE_THRESHOLD        50000000        /* ns (50 ms) */
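
/*
 * Worked example of the sampling math with hypothetical numbers: suppose a
 * thread blocks 300 times for about 1 ms each. The first 99 small wakeups
 * only bump lt_counter; the 100th falls through to the last clause below,
 * which records sum(100) occurrences and sum(100 * ~1 ms) of latency, using
 * the 100th latency as a proxy for the 99 skipped ones. This roughly
 * reconstructs the true totals while updating the aggregations only once
 * per SAMPLE_TIMES wakeups. Any single latency above SAMPLE_THRESHOLD is
 * never sampled away; it is recorded exactly by the large-latency clause.
 */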

/*
 * Log timestamp when a thread is off CPU.
 */
sched::resume:off-cpu
TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
{
        self->lt_timestamp = timestamp;
#if defined(ENABLE_FILTER)
        self->lt_is_block_wakeable = curthread->t_flag & T_WAKEABLE;
#endif /* defined(ENABLE_FILTER) */
}

/*
 * Calculate latency when a thread is actually on the CPU.
 */
this int need_skip;
sched::resume:on-cpu
/self->lt_timestamp != 0/
{
        self->lt_timestamp = timestamp - self->lt_timestamp;

#if defined(ENABLE_FILTER)
        self->lt_timestamp =
            (self->lt_timestamp > FILTER_THRESHOLD &&
            self->lt_is_block_wakeable != 0) ? 0 : self->lt_timestamp;
        self->lt_is_block_wakeable = 0;
#endif /* defined(ENABLE_FILTER) */

        this->need_skip = (self->lt_counter < (SAMPLE_TIMES - 1) &&
            self->lt_timestamp <= SAMPLE_THRESHOLD) ? 1 : 0;
        self->lt_timestamp = this->need_skip ? 0 : self->lt_timestamp;
        self->lt_counter += this->need_skip;
}

/*
 * Track large latencies first.
 */
sched::resume:on-cpu
/self->lt_timestamp > SAMPLE_THRESHOLD/
{
        this->cause = self->lt_stackp > 0 ?
            self->lt_cause[self->lt_stackp - 1] : "";
        this->priority = self->lt_stackp > 0 ?
            self->lt_prio[self->lt_stackp - 1] : 0;

        /*
         * Use sum(1), not count(): the sampled clause below adds
         * sum(lt_counter + 1) to this same aggregation.
         */
        @lt_call_count[pid, tid, stack(), this->cause,
            this->priority] = sum(1);
        @lt_call_sum[pid, tid, stack(), this->cause,
            this->priority] = sum(self->lt_timestamp);
        @lt_call_max[pid, tid, stack(), this->cause,
            this->priority] = max(self->lt_timestamp);

        self->lt_timestamp = 0;
}

/*
 * If we fall through to this probe, the latency is small and the counter
 * has reached SAMPLE_TIMES.
 */
sched::resume:on-cpu
/self->lt_timestamp != 0/
{
        this->cause = self->lt_stackp > 0 ?
            self->lt_cause[self->lt_stackp - 1] : "";
        this->priority = self->lt_stackp > 0 ?
            self->lt_prio[self->lt_stackp - 1] : 0;

        /* Need +1 because lt_counter has not been updated in this cycle. */
        @lt_call_count[pid, tid, stack(), this->cause,
            this->priority] = sum(self->lt_counter + 1);
        @lt_call_sum[pid, tid, stack(), this->cause,
            this->priority] = sum((self->lt_counter + 1) * self->lt_timestamp);
        @lt_call_max[pid, tid, stack(), this->cause,
            this->priority] = max(self->lt_timestamp);

        self->lt_timestamp = 0;
        self->lt_counter = 0;
}

#endif /* !defined(ENABLE_LOW_OVERHEAD) */

#define TRANSLATE(entryprobe, returnprobe, cause, priority)             \
entryprobe                                                              \
TRACE_FILTER_COND(self->lt_stackp == 0 ||                               \
    (self->lt_stackp < MAX_TAG &&                                       \
    self->lt_prio[self->lt_stackp - 1] <= priority))                    \
{                                                                       \
        self->lt_prio[self->lt_stackp] = priority;                      \
        self->lt_cause[self->lt_stackp] = cause;                        \
        ++self->lt_stackp;                                              \
}                                                                       \
returnprobe                                                             \
TRACE_FILTER_COND(self->lt_stackp > 0 &&                                \
    self->lt_cause[self->lt_stackp - 1] == cause)                       \
{                                                                       \
        --self->lt_stackp;                                              \
        self->lt_cause[self->lt_stackp] = NULL;                         \
}

/*
 * Syscalls have a priority of 10. This is to make sure that latency is
 * attributed to one of the syscalls only if nothing more specific matches.
 * We put this special rule here because it uses the "probefunc" variable
 * instead of a constant string as the cause.
 */

TRANSLATE(syscall:::entry, syscall:::return, probefunc, 10)
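
/*
 * Illustrative only (hypothetical probe, cause and priority; the real rules
 * are generated by the latencytop tool from its translation configuration):
 * a more specific cause is given a priority above 10 so that TRANSLATE
 * pushes it on top of the generic syscall tag:
 *
 *      TRANSLATE(fbt:genunix:cv_wait:entry, fbt:genunix:cv_wait:return,
 *          "Waiting for a condition variable", 20)
 *
 * With such a rule, a read() that blocks in cv_wait() would be tagged
 * "Waiting for a condition variable" rather than with the syscall name,
 * since the on-cpu clauses attribute latency to the cause on top of the
 * stack.
 */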