Print this page
NEX-9338 improve the layout of the crash directory (use sys/uuid.h)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9338 improve the layout of the crash directory (follow-up)
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9338 improve the layout of the crash directory
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Steve Peng <steve.peng@nexenta.com>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/cmd/fm/modules/common/sw-diag-response/subsidiary/panic/panic_diag.c
+++ new/usr/src/cmd/fm/modules/common/sw-diag-response/subsidiary/panic/panic_diag.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 + * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
24 25 */
25 26
26 27 /*
27 28 * Panic software-diagnosis subsidiary
28 29 *
29 30 * We model a system panic as a defect diagnosis in FMA. When a system
 30  31  * panics, savecore publishes events which we subscribe to here.
31 32 *
32 33 * Our driving events are all raised by savecore, run either from
33 34 * startup of the dumpadm service or interactively at the command line.
34 35 * The following describes the logic for the handling of these events.
35 36 *
36 37 * On reboot after panic we will run savecore as part of the dumpadm
37 38 * service startup; we run savecore even if savecore is otherwise
38 39 * disabled (ie dumpadm -n in effect) - we run savecore -c to check for
39 40 * a valid dump and raise the initial event.
40 41 *
41 42 * If savecore (or savecore -c) observes a valid dump pending on the
42 43 * device, it raises a "dump_pending_on_device" event provided this
43 44 * was not an FMA-initiated panic (for those we will replay ereports
44 45 * from the dump device as usual and make a diagnosis from those; we do
45 46 * not need to open a case for the panic). We subscribe to the
46 47 * "dump_pending_on_device" event and use that to open a case; we
47 48 * open a case requesting the same case uuid as the panic dump image
48 49 * has for the OS instance uuid - if that fails because of a duplicate
49 50 * uuid then we have already opened a case for this panic so no need
50 51 * to open another.
51 52 *
52 53 * Included in the "dump_pending_on_device" event is an indication of
53 54 * whether or not dumpadm is enabled. If not (dumpadm -n in effect)
54 55 * then we do not expect any further events regarding this panic
55 56 * until such time as the admin runs savecore manually (if ever).
56 57 * So in this case we solve the case immediately after open. If/when
57 58 * subsequent events arrive when savecore is run manually, we will toss
58 59 * them.
59 60 *
60 61 * If dumpadm is enabled then savecore, run from dumpadm service startup,
61 62 * will attempt to process the dump - either to copy it off the dump
62 63 * device (if saving compressed) or to uncompress it off the dump device.
63 64 * If this succeeds savecore raises a "dump_available" event which
64 65 * includes information on the directory it was saved in, the instance
65 66 * number, image uuid, compressed form or not, and whether the dump
66 67 * was complete (as per the dumphdr). If the savecore fails for
67 68 * some reason then it exits and raises a "savecore_failure" event.
68 69 * These two events are raised even for FMA-initiated panics.
69 70 *
70 71 * We subscribe to both the "dump_available" and "savecore_failed" events,
71 72 * and in the handling thereof we will close the case opened earlier (if
72 73 * this is not an FMA-initiated panic). On receipt of the initial
73 74 * "dump_available" event we also arm a timer for +10 minutes if
74 75 * dumpadm is enabled - if no "dump_available" or "savecore_failed" arrives
75 76 * in that time we will solve the case on timeout.
76 77 *
|
↓ open down ↓ |
43 lines elided |
↑ open up ↑ |
77 78 * When the timer fires we check whether the initial event for each panic
78 79 * case was received more than 30 minutes ago; if it was we solve the case
79 80 * with what we have. If we're still within the waiting period we rearm
80 81 * for a further 10 minutes. The timer is shared by all cases that we
81 82 * create, which is why the fire interval is shorter than the maximum time
82 83 * we are prepared to wait.
83 84 */
84 85
85 86 #include <strings.h>
86 87 #include <sys/panic.h>
87 -#include <alloca.h>
88 88 #include <zone.h>
89 +#include <uuid/uuid.h>
89 90
90 91 #include "../../common/sw.h"
91 92 #include "panic.h"
92 93
93 94 #define MAX_STRING_LEN 160
94 95
95 96 static id_t myid;
96 97
97 98 static id_t mytimerid;
98 99
99 100 /*
100 101 * Our serialization structure type.
101 102 */
102 103 #define SWDE_PANIC_CASEDATA_VERS 1
103 104
104 105 typedef struct swde_panic_casedata {
105 106 uint32_t scd_vers; /* must be first member */
106 107 uint64_t scd_receive_time; /* when we first knew of this panic */
107 108 size_t scd_nvlbufsz; /* size of following buffer */
108 109 /* packed attr nvlist follows */
109 110 } swde_panic_casedata_t;
110 111
111 112 static struct {
112 113 fmd_stat_t swde_panic_diagnosed;
113 114 fmd_stat_t swde_panic_badclass;
114 115 fmd_stat_t swde_panic_noattr;
115 116 fmd_stat_t swde_panic_unexpected_fm_panic;
116 117 fmd_stat_t swde_panic_badattr;
117 118 fmd_stat_t swde_panic_badfmri;
118 119 fmd_stat_t swde_panic_noinstance;
119 120 fmd_stat_t swde_panic_nouuid;
120 121 fmd_stat_t swde_panic_dupuuid;
121 122 fmd_stat_t swde_panic_nocase;
122 123 fmd_stat_t swde_panic_notime;
123 124 fmd_stat_t swde_panic_nopanicstr;
124 125 fmd_stat_t swde_panic_nodumpdir;
125 126 fmd_stat_t swde_panic_nostack;
126 127 fmd_stat_t swde_panic_incomplete;
127 128 fmd_stat_t swde_panic_failed;
128 129 fmd_stat_t swde_panic_basecasedata;
129 130 fmd_stat_t swde_panic_failsrlz;
130 131 } swde_panic_stats = {
131 132 { "swde_panic_diagnosed", FMD_TYPE_UINT64,
132 133 "panic defects published" },
133 134 { "swde_panic_badclass", FMD_TYPE_UINT64,
134 135 "incorrect event class received" },
135 136 { "swde_panic_noattr", FMD_TYPE_UINT64,
136 137 "malformed event - missing attr nvlist" },
137 138 { "swde_panic_unexpected_fm_panic", FMD_TYPE_UINT64,
138 139 "dump available for an fm_panic()" },
139 140 { "swde_panic_badattr", FMD_TYPE_UINT64,
140 141 "malformed event - invalid attr list" },
141 142 { "swde_panic_badfmri", FMD_TYPE_UINT64,
142 143 "malformed event - fmri2str fails" },
143 144 { "swde_panic_noinstance", FMD_TYPE_UINT64,
144 145 "malformed event - no instance number" },
145 146 { "swde_panic_nouuid", FMD_TYPE_UINT64,
146 147 "malformed event - missing uuid" },
147 148 { "swde_panic_dupuuid", FMD_TYPE_UINT64,
148 149 "duplicate events received" },
149 150 { "swde_panic_nocase", FMD_TYPE_UINT64,
150 151 "case missing for uuid" },
151 152 { "swde_panic_notime", FMD_TYPE_UINT64,
152 153 "missing crash dump time" },
153 154 { "swde_panic_nopanicstr", FMD_TYPE_UINT64,
154 155 "missing panic string" },
155 156 { "swde_panic_nodumpdir", FMD_TYPE_UINT64,
156 157 "missing crashdump save directory" },
157 158 { "swde_panic_nostack", FMD_TYPE_UINT64,
158 159 "missing panic stack" },
159 160 { "swde_panic_incomplete", FMD_TYPE_UINT64,
160 161 "missing panic incomplete" },
161 162 { "swde_panic_failed", FMD_TYPE_UINT64,
162 163 "missing panic failed" },
163 164 { "swde_panic_badcasedata", FMD_TYPE_UINT64,
164 165 "bad case data during timeout" },
165 166 { "swde_panic_failsrlz", FMD_TYPE_UINT64,
166 167 "failures to serialize case data" },
167 168 };
168 169
169 170 #define BUMPSTAT(stat) swde_panic_stats.stat.fmds_value.ui64++
170 171
171 172 static nvlist_t *
172 173 panic_sw_fmri(fmd_hdl_t *hdl, char *object)
173 174 {
174 175 nvlist_t *fmri;
175 176 nvlist_t *sw_obj;
176 177 int err = 0;
177 178
178 179 fmri = fmd_nvl_alloc(hdl, FMD_SLEEP);
179 180 err |= nvlist_add_uint8(fmri, FM_VERSION, FM_SW_SCHEME_VERSION);
180 181 err |= nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_SW);
181 182
182 183 sw_obj = fmd_nvl_alloc(hdl, FMD_SLEEP);
183 184 err |= nvlist_add_string(sw_obj, FM_FMRI_SW_OBJ_PATH, object);
184 185 err |= nvlist_add_nvlist(fmri, FM_FMRI_SW_OBJ, sw_obj);
185 186 nvlist_free(sw_obj);
186 187 if (!err)
187 188 return (fmri);
188 189 else
|
↓ open down ↓ |
90 lines elided |
↑ open up ↑ |
189 190 return (0);
190 191 }
191 192
192 193 static const char *dumpfiles[2] = { "unix.%lld", "vmcore.%lld" };
193 194 static const char *dumpfiles_comp[2] = { "vmdump.%lld", NULL};
194 195
195 196 static void
196 197 swde_panic_solve(fmd_hdl_t *hdl, fmd_case_t *cp,
197 198 nvlist_t *attr, fmd_event_t *ep, boolean_t savecore_success)
198 199 {
199 - char *dumpdir, *path, *uuid;
200 + char path[MAXPATHLEN];
201 + char *dumpdir, *uuid;
200 202 nvlist_t *defect, *rsrc;
201 203 nvpair_t *nvp;
202 204 int i;
203 205
204 206 /*
205 207 * Attribute members to include in event-specific defect
206 208 * payload. Some attributes will not be present for some
207 209 * cases - e.g., if we timed out and solved the case without
208 210 * a "dump_available" report.
209 211 */
210 212 const char *toadd[] = {
211 213 "os-instance-uuid", /* same as case uuid */
212 214 "panicstr", /* for initial classification work */
213 215 "panicstack", /* for initial classification work */
214 216 "crashtime", /* in epoch time */
215 217 "panic-time", /* Formatted crash time */
216 218 };
217 219
218 220 if (ep != NULL)
219 221 fmd_case_add_ereport(hdl, cp, ep);
220 222 /*
 221  223  * As a temporary solution we create an fmri in the sw scheme
|
↓ open down ↓ |
12 lines elided |
↑ open up ↑ |
222 224 * in panic_sw_fmri. This should become a generic fmri constructor
223 225 *
 224  226  * We need to use a resource FMRI which will have a sufficiently
225 227 * unique string representation such that fmd will not see
226 228 * repeated panic diagnoses (all using the same defect class)
227 229 * as duplicates and discard later cases. We can't actually diagnose
228 230 * the panic to anything specific (e.g., a path to a module and
229 231 * function/line etc therein). We could pick on a generic
230 232 * representative such as /kernel/genunix but that could lead
231 233 * to misunderstanding. So we choose a path based on <dumpdir>
232 - * and the OS instance UUID - "<dumpdir>/.<os-instance-uuid>".
233 - * There's no file at that path (*) but no matter. We can't use
234 - * <dumpdir>/vmdump.N or similar because if savecore is disabled
235 - * or failed we don't have any file or instance number.
236 - *
237 - * (*) Some day it would seem tidier to keep all files to do
238 - * with a single crash (unix/vmcore/vmdump, analysis output etc)
239 - * in a distinct directory, and <dumpdir>/.<uuid> seems like a good
240 - * choice. For compatability we'd symlink into it. So that is
241 - * another reason for this choice - some day it may exist!
234 + * and the OS instance UUID - "<dumpdir>/data/<uuid>".
242 235 */
243 236 (void) nvlist_lookup_string(attr, "dumpdir", &dumpdir);
244 237 (void) nvlist_lookup_string(attr, "os-instance-uuid", &uuid);
245 - path = alloca(strlen(dumpdir) + 1 + 1 + 36 + 1);
246 - /* LINTED: E_SEC_SPRINTF_UNBOUNDED_COPY */
247 - (void) sprintf(path, "%s/.%s", dumpdir, uuid);
238 + (void) snprintf(path, sizeof (path), "%s/data/%s", dumpdir, uuid);
248 239 rsrc = panic_sw_fmri(hdl, path);
249 240
250 241 defect = fmd_nvl_create_defect(hdl, SW_SUNOS_PANIC_DEFECT,
251 242 100, rsrc, NULL, rsrc);
252 243 nvlist_free(rsrc);
253 244
254 245 (void) nvlist_add_boolean_value(defect, "savecore-succcess",
255 246 savecore_success);
256 247
257 248 if (savecore_success) {
258 249 boolean_t compressed;
259 250 int64_t instance;
260 251 const char **pathfmts;
261 252 char buf[2][32];
262 253 int files = 0;
263 254 char *arr[2];
264 255 int i;
265 256
266 257 (void) nvlist_lookup_int64(attr, "instance", &instance);
267 258 (void) nvlist_lookup_boolean_value(attr, "compressed",
268 259 &compressed);
269 260
270 261 pathfmts = compressed ? &dumpfiles_comp[0] : &dumpfiles[0];
271 262
272 263 for (i = 0; i < 2; i++) {
273 264 if (pathfmts[i] == NULL) {
274 265 arr[i] = NULL;
275 266 continue;
276 267 }
277 268
278 269 (void) snprintf(buf[i], 32, pathfmts[i], instance);
279 270 arr[i] = buf[i];
280 271 files++;
281 272 }
282 273
283 274 (void) nvlist_add_string(defect, "dump-dir", dumpdir);
284 275 (void) nvlist_add_string_array(defect, "dump-files", arr,
285 276 files);
286 277 } else {
287 278 char *rsn;
288 279
289 280 if (nvlist_lookup_string(attr, "failure-reason", &rsn) == 0)
290 281 (void) nvlist_add_string(defect, "failure-reason", rsn);
291 282 }
292 283
293 284 /*
294 285 * Not all attributes will necessarily be available - eg if
295 286 * dumpadm was not enabled there'll be no instance and dumpdir.
296 287 */
297 288 for (i = 0; i < sizeof (toadd) / sizeof (toadd[0]); i++) {
298 289 if (nvlist_lookup_nvpair(attr, toadd[i], &nvp) == 0)
299 290 (void) nvlist_add_nvpair(defect, nvp);
300 291 }
301 292
302 293 fmd_case_add_suspect(hdl, cp, defect);
303 294 fmd_case_solve(hdl, cp);
304 295
305 296 /*
306 297 * Close the case. Do no free casedata - framework does that for us
307 298 * on closure callback.
308 299 */
309 300 fmd_case_close(hdl, cp);
310 301 BUMPSTAT(swde_panic_diagnosed);
311 302 }
312 303
313 304 /*ARGSUSED*/
314 305 static void
315 306 swde_panic_timeout(fmd_hdl_t *hdl, id_t timerid, void *data)
316 307 {
317 308 fmd_case_t *cp = swde_case_first(hdl, myid);
318 309 swde_panic_casedata_t *cdp;
319 310 time_t now = time(NULL);
320 311 nvlist_t *attr;
321 312 int remain = 0;
322 313 uint32_t vers;
323 314
324 315 while (cp != NULL) {
325 316 cdp = swde_case_data(hdl, cp, &vers);
326 317 if (vers != SWDE_PANIC_CASEDATA_VERS)
327 318 fmd_hdl_abort(hdl, "case data version confused\n");
328 319
329 320 if (now > cdp->scd_receive_time + 30 * 60) {
330 321 if (nvlist_unpack((char *)cdp + sizeof (*cdp),
331 322 cdp->scd_nvlbufsz, &attr, 0) == 0) {
332 323 swde_panic_solve(hdl, cp, attr, NULL, B_FALSE);
333 324 nvlist_free(attr);
334 325 } else {
335 326 BUMPSTAT(swde_panic_basecasedata);
336 327 fmd_case_close(hdl, cp);
337 328 }
338 329 } else {
339 330 remain++;
340 331 }
341 332
342 333
343 334 cp = swde_case_next(hdl, cp);
344 335 }
345 336
346 337 if (remain) {
347 338 mytimerid = sw_timer_install(hdl, myid, NULL, NULL,
348 339 10ULL * NANOSEC * 60);
349 340 }
350 341 }
351 342
352 343 /*
353 344 * Our verify entry point is called for each of our open cases during
354 345 * module load. We must return 0 for the case to be closed by our caller,
355 346 * or 1 to keep it (or if we have already closed it during this call).
356 347 */
357 348 static int
358 349 swde_panic_vrfy(fmd_hdl_t *hdl, fmd_case_t *cp)
359 350 {
360 351 swde_panic_casedata_t *cdp;
361 352 time_t now = time(NULL);
362 353 nvlist_t *attr;
363 354 uint32_t vers;
364 355
365 356 cdp = swde_case_data(hdl, cp, &vers);
366 357
367 358 if (vers != SWDE_PANIC_CASEDATA_VERS)
368 359 return (0); /* case will be closed */
369 360
370 361 if (now > cdp->scd_receive_time + 30 * 60) {
371 362 if (nvlist_unpack((char *)cdp + sizeof (*cdp),
372 363 cdp->scd_nvlbufsz, &attr, 0) == 0) {
373 364 swde_panic_solve(hdl, cp, attr, NULL, B_FALSE);
374 365 nvlist_free(attr);
375 366 return (1); /* case already closed */
376 367 } else {
377 368 return (0); /* close case */
378 369 }
379 370 }
380 371
381 372 if (mytimerid != 0)
382 373 mytimerid = sw_timer_install(hdl, myid,
383 374 NULL, NULL, 10ULL * NANOSEC * 60);
384 375
385 376 return (1); /* retain case */
386 377 }
387 378
388 379 /*
389 380 * Handler for ireport.os.sunos.panic.dump_pending_on_device.
390 381 *
391 382 * A future RFE should try adding a means of avoiding diagnosing repeated
392 383 * defects on panic loops, which would just add to the mayhem and potentially
393 384 * log lots of calls through ASR. Panics with similar enough panic
394 385 * strings and/or stacks should not diagnose to new defects with some
395 386 * period of time, for example.
396 387 */
397 388
398 389 /*ARGSUSED*/
399 390 void
400 391 swde_panic_detected(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
401 392 const char *class, void *arg)
402 393 {
403 394 boolean_t fm_panic, expect_savecore;
404 395 swde_panic_casedata_t *cdp;
405 396 nvlist_t *attr;
406 397 fmd_case_t *cp;
407 398 char *fmribuf;
408 399 char *uuid;
409 400 size_t sz;
410 401
411 402 fmd_hdl_debug(hdl, "swde_panic_detected\n");
412 403
413 404 if (nvlist_lookup_nvlist(nvl, FM_IREPORT_ATTRIBUTES, &attr) != 0) {
414 405 BUMPSTAT(swde_panic_noattr);
415 406 return;
416 407 }
417 408
418 409 if (nvlist_lookup_string(attr, "os-instance-uuid", &uuid) != 0) {
419 410 BUMPSTAT(swde_panic_nouuid);
420 411 return;
421 412 }
422 413
423 414 fmd_hdl_debug(hdl, "swde_panic_detected: OS instance %s\n", uuid);
424 415
425 416 if (nvlist_lookup_boolean_value(attr, "fm-panic", &fm_panic) != 0 ||
426 417 fm_panic == B_TRUE) {
427 418 BUMPSTAT(swde_panic_unexpected_fm_panic);
428 419 return;
429 420 }
430 421
431 422 /*
432 423 * Prepare serialization data to be associated with a new
433 424 * case. Our serialization data consists of a swde_panic_casedata_t
434 425 * structure followed by a packed nvlist of the attributes of
435 426 * the initial event.
436 427 */
437 428 if (nvlist_size(attr, &sz, NV_ENCODE_NATIVE) != 0) {
438 429 BUMPSTAT(swde_panic_failsrlz);
439 430 return;
440 431 }
441 432
442 433 cdp = fmd_hdl_zalloc(hdl, sizeof (*cdp) + sz, FMD_SLEEP);
443 434 fmribuf = (char *)cdp + sizeof (*cdp);
444 435 cdp->scd_vers = SWDE_PANIC_CASEDATA_VERS;
445 436 cdp->scd_receive_time = time(NULL);
446 437 cdp->scd_nvlbufsz = sz;
447 438
448 439 /*
 449  440  * Open a case with UUID matching the panicking kernel, add this
450 441 * event to the case.
451 442 */
452 443 if ((cp = swde_case_open(hdl, myid, uuid, SWDE_PANIC_CASEDATA_VERS,
453 444 cdp, sizeof (*cdp) + sz)) == NULL) {
454 445 BUMPSTAT(swde_panic_dupuuid);
455 446 fmd_hdl_debug(hdl, "swde_case_open returned NULL - dup?\n");
456 447 fmd_hdl_free(hdl, cdp, sizeof (*cdp) + sz);
457 448 return;
458 449 }
459 450
460 451 fmd_case_setprincipal(hdl, cp, ep);
461 452
462 453 if (nvlist_lookup_boolean_value(attr, "will-attempt-savecore",
463 454 &expect_savecore) != 0 || expect_savecore == B_FALSE) {
464 455 fmd_hdl_debug(hdl, "savecore not being attempted - "
465 456 "solve now\n");
466 457 swde_panic_solve(hdl, cp, attr, ep, B_FALSE);
467 458 return;
468 459 }
469 460
470 461 /*
471 462 * We expect to see either a "dump_available" or a "savecore_failed"
472 463 * event before too long. In case that never shows up, for whatever
473 464 * reason, we want to be able to solve the case anyway.
474 465 */
475 466 fmd_case_add_ereport(hdl, cp, ep);
476 467 (void) nvlist_pack(attr, &fmribuf, &sz, NV_ENCODE_NATIVE, 0);
477 468 swde_case_data_write(hdl, cp);
478 469
479 470 if (mytimerid == 0) {
480 471 mytimerid = sw_timer_install(hdl, myid, NULL, ep,
481 472 10ULL * NANOSEC * 60);
482 473 fmd_hdl_debug(hdl, "armed timer\n");
483 474 } else {
484 475 fmd_hdl_debug(hdl, "timer already armed\n");
485 476 }
486 477 }
487 478
488 479 /*
489 480 * savecore has now run and saved a crash dump to the filesystem. It is
490 481 * either a compressed dump (vmdump.n) or uncompressed {unix.n, vmcore.n}
491 482 * Savecore has raised an ireport to say the dump is there.
492 483 */
493 484
494 485 /*ARGSUSED*/
495 486 void
496 487 swde_panic_savecore_done(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
497 488 const char *class, void *arg)
498 489 {
499 490 boolean_t savecore_success = (arg != NULL);
500 491 boolean_t fm_panic;
501 492 nvlist_t *attr;
502 493 fmd_case_t *cp;
503 494 char *uuid;
504 495
505 496 fmd_hdl_debug(hdl, "savecore_done (%s)\n", savecore_success ?
506 497 "success" : "fail");
507 498
508 499 if (nvlist_lookup_nvlist(nvl, FM_IREPORT_ATTRIBUTES, &attr) != 0) {
509 500 BUMPSTAT(swde_panic_noattr);
510 501 return;
511 502 }
512 503
513 504 if (nvlist_lookup_boolean_value(attr, "fm-panic", &fm_panic) != 0 ||
514 505 fm_panic == B_TRUE) {
515 506 return; /* not expected, but just in case */
516 507 }
517 508
518 509 if (nvlist_lookup_string(attr, "os-instance-uuid", &uuid) != 0) {
519 510 BUMPSTAT(swde_panic_nouuid);
520 511 return;
521 512 }
522 513
523 514 /*
524 515 * Find the case related to the panicking kernel; our cases have
525 516 * the same uuid as the crashed OS image.
526 517 */
527 518 cp = fmd_case_uulookup(hdl, uuid);
528 519 if (!cp) {
529 520 /* Unable to find the case. */
530 521 fmd_hdl_debug(hdl, "savecore_done: can't find case for "
531 522 "image %s\n", uuid);
532 523 BUMPSTAT(swde_panic_nocase);
533 524 return;
534 525 }
535 526
536 527 fmd_hdl_debug(hdl, "savecore_done: solving case %s\n", uuid);
537 528 swde_panic_solve(hdl, cp, attr, ep, savecore_success);
538 529 }
539 530
540 531 const struct sw_disp swde_panic_disp[] = {
541 532 { SW_SUNOS_PANIC_DETECTED, swde_panic_detected, NULL },
542 533 { SW_SUNOS_PANIC_AVAIL, swde_panic_savecore_done, (void *)1 },
543 534 { SW_SUNOS_PANIC_FAILURE, swde_panic_savecore_done, NULL },
544 535 /*
545 536 * Something has to subscribe to every fault
546 537 * or defect diagnosed in fmd. We do that here, but throw it away.
547 538 */
548 539 { SW_SUNOS_PANIC_DEFECT, NULL, NULL },
549 540 { NULL, NULL, NULL }
550 541 };
551 542
552 543 /*ARGSUSED*/
553 544 int
554 545 swde_panic_init(fmd_hdl_t *hdl, id_t id, const struct sw_disp **dpp,
555 546 int *nelemp)
556 547 {
557 548 myid = id;
558 549
559 550 if (getzoneid() != GLOBAL_ZONEID)
560 551 return (SW_SUB_INIT_FAIL_VOLUNTARY);
561 552
562 553 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC,
563 554 sizeof (swde_panic_stats) / sizeof (fmd_stat_t),
564 555 (fmd_stat_t *)&swde_panic_stats);
565 556
566 557 fmd_hdl_subscribe(hdl, SW_SUNOS_PANIC_DETECTED);
567 558 fmd_hdl_subscribe(hdl, SW_SUNOS_PANIC_FAILURE);
568 559 fmd_hdl_subscribe(hdl, SW_SUNOS_PANIC_AVAIL);
569 560
570 561 *dpp = &swde_panic_disp[0];
571 562 *nelemp = sizeof (swde_panic_disp) / sizeof (swde_panic_disp[0]);
572 563 return (SW_SUB_INIT_SUCCESS);
573 564 }
574 565
575 566 void
576 567 swde_panic_fini(fmd_hdl_t *hdl)
577 568 {
578 569 if (mytimerid)
579 570 sw_timer_remove(hdl, myid, mytimerid);
580 571 }
581 572
582 573 const struct sw_subinfo panic_diag_info = {
583 574 "panic diagnosis", /* swsub_name */
584 575 SW_CASE_PANIC, /* swsub_casetype */
585 576 swde_panic_init, /* swsub_init */
586 577 swde_panic_fini, /* swsub_fini */
587 578 swde_panic_timeout, /* swsub_timeout */
588 579 NULL, /* swsub_case_close */
589 580 swde_panic_vrfy, /* swsub_case_vrfy */
590 581 };
|
↓ open down ↓ |
333 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX