1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2008-2009, Intel Corporation.
  23  * All Rights Reserved.
  24  */
  25 
  26 #include <unistd.h>
  27 #include <stdio.h>
  28 #include <dtrace.h>
  29 #include <string.h>
  30 #include <stdlib.h>
  31 #include <memory.h>
  32 #include <limits.h>
  33 
  34 #include "latencytop.h"
  35 
  36 static dtrace_hdl_t *g_dtp = NULL;      /* dtrace handle */
  37 static pid_t pid_self = -1;             /* PID of our own process */
  38 
  39 /*
  40  * Ignore sched if sched is not tracked.
  41  * Also ignore ourselves (i.e., latencytop).
  42  */
  43 #define SHOULD_IGNORE(pid)              \
  44         ((!g_config.lt_cfg_trace_sched && 0 == (pid)) || pid_self == (pid))
  45 
  46 /*
  47  * Get an integer value from dtrace record.
  48  */
  49 static uint64_t
  50 rec_get_value(void *a, size_t b)
  51 {
  52         uint64_t ret = 0;
  53 
  54         switch (b) {
  55         case sizeof (uint64_t):
  56                 ret = *((uint64_t *)(a));
  57                 break;
  58         case sizeof (uint32_t):
  59                 ret = *((uint32_t *)(a));
  60                 break;
  61         case sizeof (uint16_t):
  62                 ret = *((uint16_t *)(a));
  63                 break;
  64         case sizeof (uint8_t):
  65                 ret = *((uint8_t *)(a));
  66                 break;
  67         default:
  68                 break;
  69         }
  70 
  71         return (ret);
  72 }
  73 
  74 /*
  75  * Callback to process aggregation lt_call_* (related to on/off cpu
  76  * activities) in the snapshot.
  77  */
  78 static int
  79 aggwalk_call(const dtrace_aggdata_t *data, lt_stat_type_t stat_type)
  80 {
  81         dtrace_aggdesc_t *aggdesc = data->dtada_desc;
  82         dtrace_syminfo_t dts;
  83         GElf_Sym sym;
  84         caddr_t addr;
  85         pid_t pid;
  86         id_t tid;
  87         unsigned int stack_depth;
  88         unsigned int pc_size;
  89         uint64_t pc;
  90         uint64_t agg_value;
  91         char *ptr = NULL;
  92         char *buffer = NULL;
  93         int ptrsize;
  94         unsigned int buffersize;
  95         char *tag = NULL;
  96         unsigned int priority;
  97         enum { REC_PID = 1, REC_TID, REC_STACK, REC_TAG, REC_PRIO, REC_AGG,
  98             NREC };
  99 
 100         /* Check action type */
 101         if ((aggdesc->dtagd_nrecs < NREC) ||
 102             (aggdesc->dtagd_rec[REC_PID].dtrd_action != DTRACEACT_DIFEXPR) ||
 103             (aggdesc->dtagd_rec[REC_TID].dtrd_action != DTRACEACT_DIFEXPR) ||
 104             (aggdesc->dtagd_rec[REC_TAG].dtrd_action != DTRACEACT_DIFEXPR) ||
 105             (aggdesc->dtagd_rec[REC_PRIO].dtrd_action != DTRACEACT_DIFEXPR) ||
 106             (!DTRACEACT_ISAGG(aggdesc->dtagd_rec[REC_AGG].dtrd_action)) ||
 107             (aggdesc->dtagd_rec[REC_STACK].dtrd_action != DTRACEACT_STACK)) {
 108 
 109                 return (-1);
 110         }
 111 
 112         pid = rec_get_value(
 113             data->dtada_data + aggdesc->dtagd_rec[REC_PID].dtrd_offset,
 114             aggdesc->dtagd_rec[REC_PID].dtrd_size);
 115 
 116         if (SHOULD_IGNORE(pid)) {
 117                 return (0);
 118         }
 119 
 120         tid = rec_get_value(
 121             data->dtada_data + aggdesc->dtagd_rec[REC_TID].dtrd_offset,
 122             aggdesc->dtagd_rec[REC_TID].dtrd_size);
 123 
 124         /* Parse stack array from dtagd_rec */
 125         stack_depth = aggdesc->dtagd_rec[REC_STACK].dtrd_arg;
 126         pc_size = aggdesc->dtagd_rec[REC_STACK].dtrd_size / stack_depth;
 127         addr = data->dtada_data + aggdesc->dtagd_rec[REC_STACK].dtrd_offset;
 128         buffersize = (stack_depth * (2 * PATH_MAX + 2) + 1) * sizeof (char);
 129         buffer = (char *)lt_malloc(buffersize);
 130         ptr = buffer;
 131         ptrsize = buffersize;
 132 
 133         /* Print the stack */
 134         while (stack_depth > 0) {
 135                 pc = rec_get_value(addr, pc_size);
 136 
 137                 if (pc == 0) {
 138                         break;
 139                 }
 140 
 141                 addr += pc_size;
 142 
 143                 if (dtrace_lookup_by_addr(g_dtp, pc, &sym, &dts) == 0) {
 144                         int len;
 145                         len = snprintf(ptr, ptrsize,
 146                             "%s`%s ", dts.dts_object, dts.dts_name);
 147                         ptrsize -= len;
 148 
 149                         if (ptrsize <= 0) {
 150                                 /*
 151                                  * snprintf returns "desired" length, so
 152                                  * reaching here means our buffer is full.
 153                                  * Move ptr to the last byte of the buffer and
 154                                  * break.
 155                                  */
 156                                 ptr = &buffer[buffersize-1];
 157                                 break;
 158                         } else {
 159                                 ptr += len;
 160                         }
 161                 }
 162         }
 163 
 164         if (ptr != buffer) {
 165                 /*
 166                  * We have printed something, so it is safe to remove
 167                  * the last ' '.
 168                  */
 169                 *(ptr-1) = '\0';
 170         }
 171 
 172         tag = (char *)data->dtada_data +
 173             aggdesc->dtagd_rec[REC_TAG].dtrd_offset;
 174 
 175         priority = rec_get_value(
 176             data->dtada_data + aggdesc->dtagd_rec[REC_PRIO].dtrd_offset,
 177             aggdesc->dtagd_rec[REC_PRIO].dtrd_size);
 178 
 179         agg_value = rec_get_value(
 180             data->dtada_data + aggdesc->dtagd_rec[REC_AGG].dtrd_offset,
 181             aggdesc->dtagd_rec[REC_AGG].dtrd_size);
 182 
 183         lt_stat_update(pid, tid, buffer, tag, priority, stat_type, agg_value);
 184 
 185         if (buffer != NULL)  {
 186                 free(buffer);
 187         }
 188 
 189         return (0);
 190 }
 191 
 192 /*
 193  * Callback to process aggregation lt_named_* (related to lock spinning etc.),
 194  * in the snapshot.
 195  */
 196 static int
 197 aggwalk_named(const dtrace_aggdata_t *data, lt_stat_type_t stat_type)
 198 {
 199         dtrace_aggdesc_t *aggdesc = data->dtada_desc;
 200         pid_t pid;
 201         id_t tid;
 202         uint64_t agg_value;
 203         int cause_id;
 204         char *type = NULL;
 205         enum { REC_PID = 1, REC_TID, REC_TYPE, REC_AGG, NREC };
 206 
 207         /* Check action type */
 208         if ((aggdesc->dtagd_nrecs < NREC) ||
 209             (aggdesc->dtagd_rec[REC_PID].dtrd_action != DTRACEACT_DIFEXPR) ||
 210             (aggdesc->dtagd_rec[REC_TID].dtrd_action != DTRACEACT_DIFEXPR) ||
 211             (aggdesc->dtagd_rec[REC_TYPE].dtrd_action != DTRACEACT_DIFEXPR) ||
 212             (!DTRACEACT_ISAGG(aggdesc->dtagd_rec[REC_AGG].dtrd_action))) {
 213 
 214                 return (-1);
 215         }
 216 
 217         pid = rec_get_value(
 218             data->dtada_data + aggdesc->dtagd_rec[REC_PID].dtrd_offset,
 219             aggdesc->dtagd_rec[REC_PID].dtrd_size);
 220 
 221         if (SHOULD_IGNORE(pid)) {
 222                 return (0);
 223         }
 224 
 225         tid = rec_get_value(
 226             data->dtada_data + aggdesc->dtagd_rec[REC_TID].dtrd_offset,
 227             aggdesc->dtagd_rec[REC_TID].dtrd_size);
 228 
 229         type = (char *)data->dtada_data
 230             + aggdesc->dtagd_rec[REC_TYPE].dtrd_offset;
 231         cause_id = lt_table_cause_from_name(type, 1, CAUSE_FLAG_SPECIAL);
 232 
 233         agg_value = rec_get_value(
 234             data->dtada_data + aggdesc->dtagd_rec[REC_AGG].dtrd_offset,
 235             aggdesc->dtagd_rec[REC_AGG].dtrd_size);
 236 
 237         lt_stat_update_cause(pid, tid, cause_id, stat_type, agg_value);
 238 
 239         return (0);
 240 
 241 }
 242 
 243 /*
 244  * Callback to process aggregation lt_sync_* (related to synchronization
 245  * objects), in the snapshot.
 246  */
 247 static int
 248 aggwalk_sync(const dtrace_aggdata_t *data, lt_stat_type_t stat_type)
 249 {
 250         dtrace_aggdesc_t *aggdesc = data->dtada_desc;
 251         pid_t pid;
 252         id_t tid;
 253         uint64_t agg_value;
 254         int stype;
 255         unsigned long long wchan;
 256         enum { REC_PID = 1, REC_TID, REC_STYPE, REC_WCHAN, REC_AGG, NREC };
 257 
 258         /* Check action type */
 259         if ((aggdesc->dtagd_nrecs < NREC) ||
 260             (aggdesc->dtagd_rec[REC_PID].dtrd_action != DTRACEACT_DIFEXPR) ||
 261             (aggdesc->dtagd_rec[REC_TID].dtrd_action != DTRACEACT_DIFEXPR) ||
 262             (aggdesc->dtagd_rec[REC_STYPE].dtrd_action != DTRACEACT_DIFEXPR) ||
 263             (aggdesc->dtagd_rec[REC_WCHAN].dtrd_action != DTRACEACT_DIFEXPR) ||
 264             (!DTRACEACT_ISAGG(aggdesc->dtagd_rec[REC_AGG].dtrd_action))) {
 265 
 266                 return (-1);
 267         }
 268 
 269         pid = rec_get_value(
 270             data->dtada_data + aggdesc->dtagd_rec[REC_PID].dtrd_offset,
 271             aggdesc->dtagd_rec[REC_PID].dtrd_size);
 272 
 273         if (SHOULD_IGNORE(pid)) {
 274                 return (0);
 275         }
 276 
 277         tid = rec_get_value(
 278             data->dtada_data + aggdesc->dtagd_rec[REC_TID].dtrd_offset,
 279             aggdesc->dtagd_rec[REC_TID].dtrd_size);
 280 
 281         stype = rec_get_value(
 282             data->dtada_data + aggdesc->dtagd_rec[REC_STYPE].dtrd_offset,
 283             aggdesc->dtagd_rec[REC_STYPE].dtrd_size);
 284 
 285         wchan = rec_get_value(
 286             data->dtada_data + aggdesc->dtagd_rec[REC_WCHAN].dtrd_offset,
 287             aggdesc->dtagd_rec[REC_WCHAN].dtrd_size);
 288 
 289         agg_value = rec_get_value(
 290             data->dtada_data + aggdesc->dtagd_rec[REC_AGG].dtrd_offset,
 291             aggdesc->dtagd_rec[REC_AGG].dtrd_size);
 292 
 293         lt_stat_update_sobj(pid, tid, stype, wchan, stat_type, agg_value);
 294 
 295         return (0);
 296 }
 297 
 298 /*
 299  * Callback to process various aggregations in the snapshot. Called by
 300  * different aggwalk_* functions.
 301  */
 302 /* ARGSUSED */
 303 static int
 304 aggwalk(const dtrace_aggdata_t *data, void *arg)
 305 {
 306         char *tmp;
 307         char buffer[32];
 308         lt_stat_type_t stat_type;
 309         int (*func)(const dtrace_aggdata_t *, lt_stat_type_t);
 310 
 311         (void) strncpy(buffer, data->dtada_desc->dtagd_name, sizeof (buffer));
 312         buffer[sizeof (buffer) - 1] = '\0';
 313         tmp = strtok(buffer, "_");
 314 
 315         if (tmp == NULL || strcmp(tmp, "lt") != 0) {
 316                 goto done;
 317         }
 318 
 319         tmp = strtok(NULL, "_");
 320 
 321         if (tmp == NULL) {
 322                 goto done;
 323         } else if (strcmp(tmp, "call") == 0) {
 324                 func = aggwalk_call;
 325         } else if (strcmp(tmp, "named") == 0) {
 326                 func = aggwalk_named;
 327         } else if (strcmp(tmp, "sync") == 0) {
 328                 func = aggwalk_sync;
 329         } else {
 330                 goto done;
 331         }
 332 
 333         tmp = strtok(NULL, "_");
 334 
 335         if (tmp == NULL) {
 336                 goto done;
 337         } else if (strcmp(tmp, "count") == 0) {
 338                 stat_type = LT_STAT_COUNT;
 339         } else if (strcmp(tmp, "sum") == 0) {
 340                 stat_type = LT_STAT_SUM;
 341         } else if (strcmp(tmp, "max") == 0) {
 342                 stat_type = LT_STAT_MAX;
 343         } else {
 344                 goto done;
 345         }
 346 
 347         (void) func(data, stat_type);
 348 
 349 done:
 350         /* We have our data, so remove it from DTrace now */
 351         return (DTRACE_AGGWALK_REMOVE);
 352 }
 353 
 354 /*
 355  * Callback to handle event caused by DTrace dropping data.
 356  */
 357 /*ARGSUSED*/
 358 static int
 359 drop_handler(const dtrace_dropdata_t *data, void *user)
 360 {
 361         lt_display_error("Drop: %s\n", data->dtdda_msg);
 362         lt_drop_detected = B_TRUE;
 363 
 364         /* Pretend nothing happened, so just continue */
 365         return (DTRACE_HANDLE_OK);
 366 }
 367 
 368 #ifndef EMBED_CONFIGS
 369 /*
 370  * Copy the content from a "real" file into a temp file.
 371  */
 372 static int
 373 copy_tmp_file(const char *src, FILE *dst)
 374 {
 375         FILE *tmp = NULL;
 376         char buffer[256];
 377         int bytes;
 378 
 379         if ((tmp = fopen(src, "r")) == NULL) {
 380                 return (-1);
 381         }
 382 
 383         while ((bytes = fread(buffer, 1, sizeof (buffer), tmp)) > 0) {
 384                 if (fwrite(buffer, bytes, 1, dst) != 1) {
 385                         return (-1);
 386                 }
 387         }
 388 
 389         (void) fclose(tmp);
 390 
 391         return (0);
 392 }
 393 #endif
 394 
 395 /*
 396  * DTrace initialization. D script starts running when this function returns.
 397  */
 398 int
 399 lt_dtrace_init(void)
 400 {
 401         dtrace_prog_t *prog;
 402         dtrace_proginfo_t info;
 403         int err;
 404         FILE *fp_script = NULL;
 405         char tmp[64];
 406 
 407         pid_self = getpid();
 408 
 409         if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
 410                 lt_display_error("Cannot open dtrace library: %s\n",
 411                     dtrace_errmsg(NULL, err));
 412                 return (-1);
 413         }
 414 
 415         if (dtrace_handle_drop(g_dtp, &drop_handler, NULL) == -1) {
 416                 lt_display_error("Cannot install DTrace handle: %s\n",
 417                     dtrace_errmsg(NULL, err));
 418                 return (-1);
 419         }
 420 
 421         if (g_config.lt_cfg_enable_filter) {
 422                 if ((err = dtrace_setopt(g_dtp, "define",
 423                     "ENABLE_FILTER")) != 0) {
 424                         lt_display_error(
 425                             "Failed to set option ENABLE_FILTER.\n");
 426                         return (err);
 427                 }
 428         }
 429 
 430         if (g_config.lt_cfg_trace_syncobj) {
 431                 if ((err = dtrace_setopt(g_dtp, "define",
 432                     "ENABLE_SYNCOBJ")) != 0) {
 433                         lt_display_error(
 434                             "Failed to set option ENABLE_SYNCOBJ.\n");
 435                         return (err);
 436                 }
 437         }
 438 
 439         if (g_config.lt_cfg_trace_sched) {
 440                 if ((err = dtrace_setopt(g_dtp, "define",
 441                     "ENABLE_SCHED")) != 0) {
 442                         lt_display_error(
 443                             "Failed to set option ENABLE_SCHED.\n");
 444                         return (err);
 445                 }
 446         }
 447 
 448         if (g_config.lt_cfg_trace_pid != 0) {
 449                 (void) snprintf(tmp, sizeof (tmp), "TRACE_PID=%u",
 450                     g_config.lt_cfg_trace_pid);
 451                 if ((err = dtrace_setopt(g_dtp, "define", tmp)) != 0) {
 452                         lt_display_error(
 453                             "Failed to set option TRACE_PID.\n");
 454                         return (err);
 455                 }
 456         }
 457 
 458         if (g_config.lt_cfg_trace_pgid != 0) {
 459                 (void) snprintf(tmp, sizeof (tmp), "TRACE_PGID=%u",
 460                     g_config.lt_cfg_trace_pgid);
 461                 if ((err = dtrace_setopt(g_dtp, "define", tmp)) != 0) {
 462                         lt_display_error(
 463                             "Failed to set option TRACE_PGID.\n");
 464                         return (err);
 465                 }
 466         }
 467 
 468         if (g_config.lt_cfg_low_overhead_mode) {
 469                 if ((err = dtrace_setopt(g_dtp, "define",
 470                     "ENABLE_LOW_OVERHEAD")) != 0) {
 471                         lt_display_error(
 472                             "Failed to set option ENABLE_LOW_OVERHEAD.\n");
 473                         return (err);
 474                 }
 475         }
 476 
 477         /* Create a temp file; libdtrace needs it for cpp(1) */
 478         if ((fp_script = tmpfile()) == NULL) {
 479                 lt_display_error("Cannot create tmp file\n");
 480                 return (-1);
 481         }
 482 
 483         /* Copy the main D script into the temp file */
 484 #ifdef EMBED_CONFIGS
 485         if (fwrite(&latencytop_d_start,
 486             (size_t)(&latencytop_d_end - &latencytop_d_start), 1, fp_script)
 487             != 1) {
 488                 lt_display_error("Could not copy D script, fwrite() failed\n");
 489                 (void) fclose(fp_script);
 490                 return (-1);
 491         }
 492 #else
 493         if (copy_tmp_file(DEFAULT_D_SCRIPT_NAME, fp_script) != 0) {
 494                 lt_display_error("Cannot open script file %s\n",
 495                     DEFAULT_D_SCRIPT_NAME);
 496                 (void) fclose(fp_script);
 497                 return (-1);
 498         }
 499 #endif  /* EMBED_CONFIGS */
 500 
 501         if (lt_table_append_trans(fp_script) != 0) {
 502                 (void) fclose(fp_script);
 503                 return (-1);
 504         }
 505 
 506         (void) fseek(fp_script, 0, SEEK_SET);
 507 
 508         if ((prog = dtrace_program_fcompile(g_dtp, fp_script,
 509             DTRACE_C_CPP, 0, NULL)) == NULL) {
 510                 lt_display_error("Failed to compile D script.\n");
 511                 (void) fclose(fp_script);
 512                 return (dtrace_errno(g_dtp));
 513         }
 514 
 515         (void) fclose(fp_script);
 516 
 517         /* Execute the D script */
 518         if (dtrace_program_exec(g_dtp, prog, &info) == -1) {
 519                 lt_display_error("Failed to enable probes.\n");
 520                 return (dtrace_errno(g_dtp));
 521         }
 522 
 523         if (dtrace_go(g_dtp) != 0) {
 524                 lt_display_error("Failed to run D script.\n");
 525                 return (dtrace_errno(g_dtp));
 526         }
 527 
 528         return (0);
 529 }
 530 
 531 /*
 532  * Worker function to move aggregate data to user space. Called periodically
 533  * to prevent the kernel from running out of memory.
 534  */
 535 int
 536 lt_dtrace_work(int force)
 537 {
 538         static uint64_t last_snap = 0;
 539         uint64_t now = lt_millisecond();
 540 
 541         if (!force && now - last_snap < g_config.lt_cfg_snap_interval) {
 542                 return (last_snap + g_config.lt_cfg_snap_interval - now);
 543         }
 544 
 545         if (dtrace_status(g_dtp) == -1) {
 546                 lt_display_error("Failed when getting status: %s\n",
 547                     dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
 548                 return (-1);
 549         }
 550 
 551         if (dtrace_aggregate_snap(g_dtp) != 0) {
 552                 lt_display_error("Failed to snap aggregate: %s\n",
 553                     dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
 554                 return (-1);
 555         }
 556 
 557         last_snap = now;
 558         return (0);
 559 }
 560 
 561 /*
 562  * Walk through dtrace aggregator and collect data for latencytop to display.
 563  * Called immediately before UI update.
 564  */
 565 int
 566 lt_dtrace_collect(void)
 567 {
 568         if (lt_dtrace_work(1) != 0) {
 569                 return (-1);
 570         }
 571 
 572         if (dtrace_aggregate_walk(g_dtp, aggwalk, NULL) != 0) {
 573                 lt_display_error("Failed to sort aggregate: %s\n",
 574                     dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
 575                 return (-1);
 576         }
 577 
 578         /*
 579          * Probably we don't need to clear again, because we have removed
 580          * everything. Paranoid ?
 581          */
 582         dtrace_aggregate_clear(g_dtp);
 583 
 584         return (0);
 585 }
 586 
 587 /*
 588  * dtrace clean up.
 589  */
 590 int
 591 lt_dtrace_deinit(void)
 592 {
 593         int ret = 0;
 594 
 595         if (dtrace_stop(g_dtp) != 0) {
 596                 lt_display_error("dtrace_stop failed: %s\n",
 597                     dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
 598                 ret = -1;
 599         }
 600 
 601         dtrace_close(g_dtp);
 602 
 603         return (ret);
 604 }