Print this page
NEX-9338 improve the layout of the crash directory (use sys/uuid.h)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9338 improve the layout of the crash directory (follow-up)
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9338 improve the layout of the crash directory
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Steve Peng <steve.peng@nexenta.com>


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.

  24  */
  25 
  26 /*
  27  * Panic software-diagnosis subsidiary
  28  *
  29  * We model a system panic as a defect diagnosis in FMA. When a system
  30  * panics, savecore publishes events which we subscribe to here.
  31  *
  32  * Our driving events are all raised by savecore, run either from
  33  * startup of the dumpadm service or interactively at the command line.
  34  * The following describes the logic for the handling of these events.
  35  *
  36  * On reboot after panic we will run savecore as part of the dumpadm
  37  * service startup; we run savecore even if savecore is otherwise
  38  * disabled (ie dumpadm -n in effect) - we run savecore -c to check for
  39  * a valid dump and raise the initial event.
  40  *
  41  * If savecore (or savecore -c) observes a valid dump pending on the
  42  * device, it raises a "dump_pending_on_device" event provided this
  43  * was not an FMA-initiated panic (for those we will replay ereports


  67  * some reason then it exits and raises a "savecore_failure" event.
  68  * These two events are raised even for FMA-initiated panics.
  69  *
  70  * We subscribe to both the "dump_available" and "savecore_failed" events,
  71  * and in the handling thereof we will close the case opened earlier (if
  72  * this is not an FMA-initiated panic).  On receipt of the initial
  73  * "dump_available" event we also arm a timer for +10 minutes if
  74  * dumpadm is enabled - if no "dump_available" or "savecore_failed" arrives
  75  * in that time we will solve the case on timeout.
  76  *
  77  * When the timer fires we check whether the initial event for each panic
  78  * case was received more than 30 minutes ago; if it was we solve the case
  79  * with what we have.  If we're still within the waiting period we rearm
  80  * for a further 10 minutes.  The timer is shared by all cases that we
  81  * create, which is why the fire interval is shorter than the maximum time
  82  * we are prepared to wait.
  83  */
  84 
  85 #include <strings.h>
  86 #include <sys/panic.h>
  87 #include <alloca.h>
  88 #include <zone.h>

  89 
  90 #include "../../common/sw.h"
  91 #include "panic.h"
  92 
  93 #define MAX_STRING_LEN 160
  94 
  95 static id_t myid;
  96 
  97 static id_t mytimerid;
  98 
  99 /*
 100  * Our serialization structure type.
 101  */
 102 #define SWDE_PANIC_CASEDATA_VERS        1
 103 
 104 typedef struct swde_panic_casedata {
 105         uint32_t scd_vers;              /* must be first member */
 106         uint64_t scd_receive_time;      /* when we first knew of this panic */
 107         size_t scd_nvlbufsz;            /* size of following buffer */
 108                                         /* packed attr nvlist follows */


 179         err |= nvlist_add_uint8(fmri, FM_VERSION, FM_SW_SCHEME_VERSION);
 180         err |= nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_SW);
 181 
 182         sw_obj = fmd_nvl_alloc(hdl, FMD_SLEEP);
 183         err |= nvlist_add_string(sw_obj, FM_FMRI_SW_OBJ_PATH, object);
 184         err |= nvlist_add_nvlist(fmri, FM_FMRI_SW_OBJ, sw_obj);
 185         nvlist_free(sw_obj);
 186         if (!err)
 187                 return (fmri);
 188         else
 189                 return (0);
 190 }
 191 
 192 static const char *dumpfiles[2] = { "unix.%lld", "vmcore.%lld" };
 193 static const char *dumpfiles_comp[2] = { "vmdump.%lld", NULL};
 194 
 195 static void
 196 swde_panic_solve(fmd_hdl_t *hdl, fmd_case_t *cp,
 197     nvlist_t *attr, fmd_event_t *ep, boolean_t savecore_success)
 198 {
 199         char *dumpdir, *path, *uuid;

 200         nvlist_t *defect, *rsrc;
 201         nvpair_t *nvp;
 202         int i;
 203 
 204         /*
 205          * Attribute members to include in event-specific defect
 206          * payload.  Some attributes will not be present for some
 207          * cases - e.g., if we timed out and solved the case without
 208          * a "dump_available" report.
 209          */
 210         const char *toadd[] = {
 211                 "os-instance-uuid",     /* same as case uuid */
 212                 "panicstr",             /* for initial classification work */
 213                 "panicstack",           /* for initial classification work */
 214                 "crashtime",            /* in epoch time */
 215                 "panic-time",           /* Formatted crash time */
 216         };
 217 
 218         if (ep != NULL)
 219                 fmd_case_add_ereport(hdl, cp, ep);
 220         /*
 221          * As a temporary solution we create an fmri in the sw scheme
 222          * in panic_sw_fmri. This should become a generic fmri constructor
 223          *
 224          * We need to use a resource FMRI which will have a sufficiently
 225          * unique string representation such that fmd will not see
 226          * repeated panic diagnoses (all using the same defect class)
 227          * as duplicates and discard later cases.  We can't actually diagnose
 228          * the panic to anything specific (e.g., a path to a module and
 229          * function/line etc therein).  We could pick on a generic
 230          * representative such as /kernel/genunix but that could lead
 231          * to misunderstanding.  So we choose a path based on <dumpdir>
 232          * and the OS instance UUID - "<dumpdir>/.<os-instance-uuid>".
 233          * There's no file at that path (*) but no matter.  We can't use
 234          * <dumpdir>/vmdump.N or similar because if savecore is disabled
 235          * or failed we don't have any file or instance number.
 236          *
 237          * (*) Some day it would seem tidier to keep all files to do
 238          * with a single crash (unix/vmcore/vmdump, analysis output etc)
 239          * in a distinct directory, and <dumpdir>/.<uuid> seems like a good
 240          * choice.  For compatibility we'd symlink into it.  So that is
 241          * another reason for this choice - some day it may exist!
 242          */
 243         (void) nvlist_lookup_string(attr, "dumpdir", &dumpdir);
 244         (void) nvlist_lookup_string(attr, "os-instance-uuid", &uuid);
 245         path = alloca(strlen(dumpdir) + 1 + 1 + 36 + 1);
 246         /* LINTED: E_SEC_SPRINTF_UNBOUNDED_COPY */
 247         (void) sprintf(path, "%s/.%s", dumpdir, uuid);
 248         rsrc = panic_sw_fmri(hdl, path);
 249 
 250         defect = fmd_nvl_create_defect(hdl, SW_SUNOS_PANIC_DEFECT,
 251             100, rsrc, NULL, rsrc);
 252         nvlist_free(rsrc);
 253 
 254         (void) nvlist_add_boolean_value(defect, "savecore-succcess",
 255             savecore_success);
 256 
 257         if (savecore_success) {
 258                 boolean_t compressed;
 259                 int64_t instance;
 260                 const char **pathfmts;
 261                 char buf[2][32];
 262                 int files = 0;
 263                 char *arr[2];
 264                 int i;
 265 
 266                 (void) nvlist_lookup_int64(attr, "instance", &instance);
 267                 (void) nvlist_lookup_boolean_value(attr, "compressed",




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
  25  */
  26 
  27 /*
  28  * Panic software-diagnosis subsidiary
  29  *
  30  * We model a system panic as a defect diagnosis in FMA. When a system
  31  * panics, savecore publishes events which we subscribe to here.
  32  *
  33  * Our driving events are all raised by savecore, run either from
  34  * startup of the dumpadm service or interactively at the command line.
  35  * The following describes the logic for the handling of these events.
  36  *
  37  * On reboot after panic we will run savecore as part of the dumpadm
  38  * service startup; we run savecore even if savecore is otherwise
  39  * disabled (ie dumpadm -n in effect) - we run savecore -c to check for
  40  * a valid dump and raise the initial event.
  41  *
  42  * If savecore (or savecore -c) observes a valid dump pending on the
  43  * device, it raises a "dump_pending_on_device" event provided this
  44  * was not an FMA-initiated panic (for those we will replay ereports


  68  * some reason then it exits and raises a "savecore_failure" event.
  69  * These two events are raised even for FMA-initiated panics.
  70  *
  71  * We subscribe to both the "dump_available" and "savecore_failed" events,
  72  * and in the handling thereof we will close the case opened earlier (if
  73  * this is not an FMA-initiated panic).  On receipt of the initial
  74  * "dump_available" event we also arm a timer for +10 minutes if
  75  * dumpadm is enabled - if no "dump_available" or "savecore_failed" arrives
  76  * in that time we will solve the case on timeout.
  77  *
  78  * When the timer fires we check whether the initial event for each panic
  79  * case was received more than 30 minutes ago; if it was we solve the case
  80  * with what we have.  If we're still within the waiting period we rearm
  81  * for a further 10 minutes.  The timer is shared by all cases that we
  82  * create, which is why the fire interval is shorter than the maximum time
  83  * we are prepared to wait.
  84  */
  85 
  86 #include <strings.h>
  87 #include <sys/panic.h>

  88 #include <zone.h>
  89 #include <uuid/uuid.h>
  90 
  91 #include "../../common/sw.h"
  92 #include "panic.h"
  93 
  94 #define MAX_STRING_LEN 160
  95 
  96 static id_t myid;
  97 
  98 static id_t mytimerid;
  99 
 100 /*
 101  * Our serialization structure type.
 102  */
 103 #define SWDE_PANIC_CASEDATA_VERS        1
 104 
 105 typedef struct swde_panic_casedata {
 106         uint32_t scd_vers;              /* must be first member */
 107         uint64_t scd_receive_time;      /* when we first knew of this panic */
 108         size_t scd_nvlbufsz;            /* size of following buffer */
 109                                         /* packed attr nvlist follows */


 180         err |= nvlist_add_uint8(fmri, FM_VERSION, FM_SW_SCHEME_VERSION);
 181         err |= nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_SW);
 182 
 183         sw_obj = fmd_nvl_alloc(hdl, FMD_SLEEP);
 184         err |= nvlist_add_string(sw_obj, FM_FMRI_SW_OBJ_PATH, object);
 185         err |= nvlist_add_nvlist(fmri, FM_FMRI_SW_OBJ, sw_obj);
 186         nvlist_free(sw_obj);
 187         if (!err)
 188                 return (fmri);
 189         else
 190                 return (0);
 191 }
 192 
 193 static const char *dumpfiles[2] = { "unix.%lld", "vmcore.%lld" };
 194 static const char *dumpfiles_comp[2] = { "vmdump.%lld", NULL};
 195 
 196 static void
 197 swde_panic_solve(fmd_hdl_t *hdl, fmd_case_t *cp,
 198     nvlist_t *attr, fmd_event_t *ep, boolean_t savecore_success)
 199 {
 200         char path[MAXPATHLEN];
 201         char *dumpdir, *uuid;
 202         nvlist_t *defect, *rsrc;
 203         nvpair_t *nvp;
 204         int i;
 205 
 206         /*
 207          * Attribute members to include in event-specific defect
 208          * payload.  Some attributes will not be present for some
 209          * cases - e.g., if we timed out and solved the case without
 210          * a "dump_available" report.
 211          */
 212         const char *toadd[] = {
 213                 "os-instance-uuid",     /* same as case uuid */
 214                 "panicstr",             /* for initial classification work */
 215                 "panicstack",           /* for initial classification work */
 216                 "crashtime",            /* in epoch time */
 217                 "panic-time",           /* Formatted crash time */
 218         };
 219 
 220         if (ep != NULL)
 221                 fmd_case_add_ereport(hdl, cp, ep);
 222         /*
 223          * As a temporary solution we create an fmri in the sw scheme
 224          * in panic_sw_fmri. This should become a generic fmri constructor
 225          *
 226          * We need to use a resource FMRI which will have a sufficiently
 227          * unique string representation such that fmd will not see
 228          * repeated panic diagnoses (all using the same defect class)
 229          * as duplicates and discard later cases.  We can't actually diagnose
 230          * the panic to anything specific (e.g., a path to a module and
 231          * function/line etc therein).  We could pick on a generic
 232          * representative such as /kernel/genunix but that could lead
 233          * to misunderstanding.  So we choose a path based on <dumpdir>
 234          * and the OS instance UUID - "<dumpdir>/data/<uuid>".









 235          */
 236         (void) nvlist_lookup_string(attr, "dumpdir", &dumpdir);
 237         (void) nvlist_lookup_string(attr, "os-instance-uuid", &uuid);
 238         (void) snprintf(path, sizeof (path), "%s/data/%s", dumpdir, uuid);


 239         rsrc = panic_sw_fmri(hdl, path);
 240 
 241         defect = fmd_nvl_create_defect(hdl, SW_SUNOS_PANIC_DEFECT,
 242             100, rsrc, NULL, rsrc);
 243         nvlist_free(rsrc);
 244 
 245         (void) nvlist_add_boolean_value(defect, "savecore-succcess",
 246             savecore_success);
 247 
 248         if (savecore_success) {
 249                 boolean_t compressed;
 250                 int64_t instance;
 251                 const char **pathfmts;
 252                 char buf[2][32];
 253                 int files = 0;
 254                 char *arr[2];
 255                 int i;
 256 
 257                 (void) nvlist_lookup_int64(attr, "instance", &instance);
 258                 (void) nvlist_lookup_boolean_value(attr, "compressed",