4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Panic software-diagnosis subsidiary
28 *
29 * We model a system panic as a defect diagnosis in FMA. When a system
30 * panics, savecore publishes events which we subscribe to here.
31 *
32 * Our driving events are all raised by savecore, run either from
33 * startup of the dumpadm service or interactively at the command line.
34 * The following describes the logic for the handling of these events.
35 *
36 * On reboot after panic we will run savecore as part of the dumpadm
37 * service startup; we run savecore even if savecore is otherwise
38 * disabled (ie dumpadm -n in effect) - we run savecore -c to check for
39 * a valid dump and raise the initial event.
40 *
41 * If savecore (or savecore -c) observes a valid dump pending on the
42 * device, it raises a "dump_pending_on_device" event provided this
43 * was not an FMA-initiated panic (for those we will replay ereports
67 * some reason then it exits and raises a "savecore_failure" event.
68 * These two events are raised even for FMA-initiated panics.
69 *
70 * We subscribe to both the "dump_available" and "savecore_failed" events,
71 * and in the handling thereof we will close the case opened earlier (if
72 * this is not an FMA-initiated panic). On receipt of the initial
73 * "dump_pending_on_device" event we also arm a timer for +10 minutes
74 * if dumpadm is enabled - if no "dump_available" or "savecore_failed"
75 * arrives in that time we will solve the case on timeout.
76 *
77 * When the timer fires we check whether the initial event for each panic
78 * case was received more than 30 minutes ago; if it was we solve the case
79 * with what we have. If we're still within the waiting period we rearm
80 * for a further 10 minutes. The timer is shared by all cases that we
81 * create, which is why the fire interval is shorter than the maximum time
82 * we are prepared to wait.
83 */
84
85 #include <strings.h>
86 #include <sys/panic.h>
87 #include <alloca.h>
88 #include <zone.h>
89
90 #include "../../common/sw.h"
91 #include "panic.h"
92
/*
 * NOTE(review): no use of MAX_STRING_LEN is visible in this part of the
 * file; presumably it bounds a string buffer - confirm against its users.
 */
93 #define MAX_STRING_LEN 160
94
/* NOTE(review): presumably the id this subsidiary is registered under - confirm. */
95 static id_t myid;
96
/*
 * NOTE(review): presumably the id of the single timer that the file header
 * describes as shared by all cases - confirm where it is armed.
 */
97 static id_t mytimerid;
98
99 /*
100 * Our serialization structure type.
101 */
/*
 * Version number for struct swde_panic_casedata.  NOTE(review): presumably
 * the value stored in its scd_vers member - confirm.
 */
102 #define SWDE_PANIC_CASEDATA_VERS 1
103
104 typedef struct swde_panic_casedata {
105 uint32_t scd_vers; /* must be first member */
106 uint64_t scd_receive_time; /* when we first knew of this panic */
107 size_t scd_nvlbufsz; /* size of following buffer */
108 /* packed attr nvlist follows */
179 err |= nvlist_add_uint8(fmri, FM_VERSION, FM_SW_SCHEME_VERSION);
180 err |= nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_SW);
181
182 sw_obj = fmd_nvl_alloc(hdl, FMD_SLEEP);
183 err |= nvlist_add_string(sw_obj, FM_FMRI_SW_OBJ_PATH, object);
184 err |= nvlist_add_nvlist(fmri, FM_FMRI_SW_OBJ, sw_obj);
185 nvlist_free(sw_obj);
186 if (!err)
187 return (fmri);
188 else
189 return (0);
190 }
191
/*
 * printf-style name templates for saved crash dump files; each template
 * carries a single %lld conversion.  dumpfiles_comp is padded with NULL so
 * that both tables have two slots.
 *
 * NOTE(review): presumably swde_panic_solve selects dumpfiles_comp when the
 * "compressed" attribute is set and dumpfiles otherwise, formatting each
 * template with the dump "instance" number - the code doing so is outside
 * the visible part of the file; confirm there.
 */
192 static const char *dumpfiles[2] = { "unix.%lld", "vmcore.%lld" };
193 static const char *dumpfiles_comp[2] = { "vmdump.%lld", NULL};
194
195 static void
196 swde_panic_solve(fmd_hdl_t *hdl, fmd_case_t *cp,
197 nvlist_t *attr, fmd_event_t *ep, boolean_t savecore_success)
198 {
199 char *dumpdir, *path, *uuid;
200 nvlist_t *defect, *rsrc;
201 nvpair_t *nvp;
202 int i;
203
204 /*
205 * Attribute members to include in event-specific defect
206 * payload. Some attributes will not be present for some
207 * cases - e.g., if we timed out and solved the case without
208 * a "dump_available" report.
209 */
210 const char *toadd[] = {
211 "os-instance-uuid", /* same as case uuid */
212 "panicstr", /* for initial classification work */
213 "panicstack", /* for initial classification work */
214 "crashtime", /* in epoch time */
215 "panic-time", /* Formatted crash time */
216 };
217
218 if (ep != NULL)
219 fmd_case_add_ereport(hdl, cp, ep);
220 /*
221 * As a temporary solution we create an fmri in the sw scheme
222 * in panic_sw_fmri. This should become a generic fmri constructor
223 *
224 * We need to use a resource FMRI which will have a sufficiently
225 * unique string representation such that fmd will not see
226 * repeated panic diagnoses (all using the same defect class)
227 * as duplicates and discard later cases. We can't actually diagnose
228 * the panic to anything specific (e.g., a path to a module and
229 * function/line etc therein). We could pick on a generic
230 * representative such as /kernel/genunix but that could lead
231 * to misunderstanding. So we choose a path based on <dumpdir>
232 * and the OS instance UUID - "<dumpdir>/.<os-instance-uuid>".
233 * There's no file at that path (*) but no matter. We can't use
234 * <dumpdir>/vmdump.N or similar because if savecore is disabled
235 * or failed we don't have any file or instance number.
236 *
237 * (*) Some day it would seem tidier to keep all files to do
238 * with a single crash (unix/vmcore/vmdump, analysis output etc)
239 * in a distinct directory, and <dumpdir>/.<uuid> seems like a good
240 * choice. For compatibility we'd symlink into it. So that is
241 * another reason for this choice - some day it may exist!
242 */
243 (void) nvlist_lookup_string(attr, "dumpdir", &dumpdir);
244 (void) nvlist_lookup_string(attr, "os-instance-uuid", &uuid);
245 path = alloca(strlen(dumpdir) + 1 + 1 + 36 + 1);
246 /* LINTED: E_SEC_SPRINTF_UNBOUNDED_COPY */
247 (void) sprintf(path, "%s/.%s", dumpdir, uuid);
248 rsrc = panic_sw_fmri(hdl, path);
249
250 defect = fmd_nvl_create_defect(hdl, SW_SUNOS_PANIC_DEFECT,
251 100, rsrc, NULL, rsrc);
252 nvlist_free(rsrc);
253
254 (void) nvlist_add_boolean_value(defect, "savecore-succcess",
255 savecore_success);
256
257 if (savecore_success) {
258 boolean_t compressed;
259 int64_t instance;
260 const char **pathfmts;
261 char buf[2][32];
262 int files = 0;
263 char *arr[2];
264 int i;
265
266 (void) nvlist_lookup_int64(attr, "instance", &instance);
267 (void) nvlist_lookup_boolean_value(attr, "compressed",
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
25 */
26
27 /*
28 * Panic software-diagnosis subsidiary
29 *
30 * We model a system panic as a defect diagnosis in FMA. When a system
31 * panics, savecore publishes events which we subscribe to here.
32 *
33 * Our driving events are all raised by savecore, run either from
34 * startup of the dumpadm service or interactively at the command line.
35 * The following describes the logic for the handling of these events.
36 *
37 * On reboot after panic we will run savecore as part of the dumpadm
38 * service startup; we run savecore even if savecore is otherwise
39 * disabled (ie dumpadm -n in effect) - we run savecore -c to check for
40 * a valid dump and raise the initial event.
41 *
42 * If savecore (or savecore -c) observes a valid dump pending on the
43 * device, it raises a "dump_pending_on_device" event provided this
44 * was not an FMA-initiated panic (for those we will replay ereports
68 * some reason then it exits and raises a "savecore_failure" event.
69 * These two events are raised even for FMA-initiated panics.
70 *
71 * We subscribe to both the "dump_available" and "savecore_failed" events,
72 * and in the handling thereof we will close the case opened earlier (if
73 * this is not an FMA-initiated panic). On receipt of the initial
74 * "dump_pending_on_device" event we also arm a timer for +10 minutes
75 * if dumpadm is enabled - if no "dump_available" or "savecore_failed"
76 * arrives in that time we will solve the case on timeout.
77 *
78 * When the timer fires we check whether the initial event for each panic
79 * case was received more than 30 minutes ago; if it was we solve the case
80 * with what we have. If we're still within the waiting period we rearm
81 * for a further 10 minutes. The timer is shared by all cases that we
82 * create, which is why the fire interval is shorter than the maximum time
83 * we are prepared to wait.
84 */
85
86 #include <strings.h>
87 #include <sys/panic.h>
88 #include <zone.h>
89 #include <uuid/uuid.h>
90
91 #include "../../common/sw.h"
92 #include "panic.h"
93
/*
 * NOTE(review): no use of MAX_STRING_LEN is visible in this part of the
 * file; presumably it bounds a string buffer - confirm against its users.
 */
94 #define MAX_STRING_LEN 160
95
/* NOTE(review): presumably the id this subsidiary is registered under - confirm. */
96 static id_t myid;
97
/*
 * NOTE(review): presumably the id of the single timer that the file header
 * describes as shared by all cases - confirm where it is armed.
 */
98 static id_t mytimerid;
99
100 /*
101 * Our serialization structure type.
102 */
/*
 * Version number for struct swde_panic_casedata.  NOTE(review): presumably
 * the value stored in its scd_vers member - confirm.
 */
103 #define SWDE_PANIC_CASEDATA_VERS 1
104
105 typedef struct swde_panic_casedata {
106 uint32_t scd_vers; /* must be first member */
107 uint64_t scd_receive_time; /* when we first knew of this panic */
108 size_t scd_nvlbufsz; /* size of following buffer */
109 /* packed attr nvlist follows */
180 err |= nvlist_add_uint8(fmri, FM_VERSION, FM_SW_SCHEME_VERSION);
181 err |= nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_SW);
182
183 sw_obj = fmd_nvl_alloc(hdl, FMD_SLEEP);
184 err |= nvlist_add_string(sw_obj, FM_FMRI_SW_OBJ_PATH, object);
185 err |= nvlist_add_nvlist(fmri, FM_FMRI_SW_OBJ, sw_obj);
186 nvlist_free(sw_obj);
187 if (!err)
188 return (fmri);
189 else
190 return (0);
191 }
192
/*
 * printf-style name templates for saved crash dump files; each template
 * carries a single %lld conversion.  dumpfiles_comp is padded with NULL so
 * that both tables have two slots.
 *
 * NOTE(review): presumably swde_panic_solve selects dumpfiles_comp when the
 * "compressed" attribute is set and dumpfiles otherwise, formatting each
 * template with the dump "instance" number - the code doing so is outside
 * the visible part of the file; confirm there.
 */
193 static const char *dumpfiles[2] = { "unix.%lld", "vmcore.%lld" };
194 static const char *dumpfiles_comp[2] = { "vmdump.%lld", NULL};
195
196 static void
197 swde_panic_solve(fmd_hdl_t *hdl, fmd_case_t *cp,
198 nvlist_t *attr, fmd_event_t *ep, boolean_t savecore_success)
199 {
200 char path[MAXPATHLEN];
201 char *dumpdir, *uuid;
202 nvlist_t *defect, *rsrc;
203 nvpair_t *nvp;
204 int i;
205
206 /*
207 * Attribute members to include in event-specific defect
208 * payload. Some attributes will not be present for some
209 * cases - e.g., if we timed out and solved the case without
210 * a "dump_available" report.
211 */
212 const char *toadd[] = {
213 "os-instance-uuid", /* same as case uuid */
214 "panicstr", /* for initial classification work */
215 "panicstack", /* for initial classification work */
216 "crashtime", /* in epoch time */
217 "panic-time", /* Formatted crash time */
218 };
219
220 if (ep != NULL)
221 fmd_case_add_ereport(hdl, cp, ep);
222 /*
223 * As a temporary solution we create an fmri in the sw scheme
224 * in panic_sw_fmri. This should become a generic fmri constructor
225 *
226 * We need to use a resource FMRI which will have a sufficiently
227 * unique string representation such that fmd will not see
228 * repeated panic diagnoses (all using the same defect class)
229 * as duplicates and discard later cases. We can't actually diagnose
230 * the panic to anything specific (e.g., a path to a module and
231 * function/line etc therein). We could pick on a generic
232 * representative such as /kernel/genunix but that could lead
233 * to misunderstanding. So we choose a path based on <dumpdir>
234 * and the OS instance UUID - "<dumpdir>/data/<uuid>".
235 */
236 (void) nvlist_lookup_string(attr, "dumpdir", &dumpdir);
237 (void) nvlist_lookup_string(attr, "os-instance-uuid", &uuid);
238 (void) snprintf(path, sizeof (path), "%s/data/%s", dumpdir, uuid);
239 rsrc = panic_sw_fmri(hdl, path);
240
241 defect = fmd_nvl_create_defect(hdl, SW_SUNOS_PANIC_DEFECT,
242 100, rsrc, NULL, rsrc);
243 nvlist_free(rsrc);
244
245 (void) nvlist_add_boolean_value(defect, "savecore-succcess",
246 savecore_success);
247
248 if (savecore_success) {
249 boolean_t compressed;
250 int64_t instance;
251 const char **pathfmts;
252 char buf[2][32];
253 int files = 0;
254 char *arr[2];
255 int i;
256
257 (void) nvlist_lookup_int64(attr, "instance", &instance);
258 (void) nvlist_lookup_boolean_value(attr, "compressed",
|