/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2016 Joyent, Inc.
 * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/mem.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/memlist.h>
#include <sys/dumphdr.h>
#include <sys/dumpadm.h>
#include <sys/ksyms.h>
#include <sys/compress.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/modctl.h>
#include <sys/utsname.h>
#include <sys/systeminfo.h>
#include <sys/vmem.h>
#include <sys/log.h>
#include <sys/var.h>
#include <sys/debug.h>
#include <sys/sunddi.h>
#include <fs/fs_subr.h>
#include <sys/fs/snode.h>
#include <sys/ontrap.h>
#include <sys/panic.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/errorq.h>
#include <sys/fm/util.h>
#include <sys/fs/zfs.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <sys/clock_impl.h>
#include <sys/hold_page.h>
#include <sys/cpu.h>

#include <sys/uuid.h>

/*
 * Parallel Dump:
 * CPUs that are otherwise idle during panic are employed to parallelize
 * the compression task. I/O and compression are performed by different
 * CPUs, and are hence overlapped in time, unlike the older serial code.
 */

/*
 * exported vars
 */
kmutex_t	dump_lock;		/* lock for dump configuration */
dumphdr_t	*dumphdr;		/* dump header */
int		dump_conflags = DUMP_KERNEL;	/* dump configuration flags */
vnode_t		*dumpvp;		/* dump device vnode pointer */
u_offset_t	dumpvp_size;		/* size of dump device, in bytes */
char		*dumppath;		/* pathname of dump device */
int		dump_timeout = 120;	/* timeout for dumping pages */
int		dump_timeleft;		/* portion of dump_timeout remaining */
int		dump_ioerr;		/* dump i/o error */
int		dump_check_used;	/* enable check for used pages */
char		*dump_stack_scratch;	/* scratch area for saving stack summary */

/*
 * Tunables for dump compression and parallelism.
 * These can be set via /etc/system.
 *
 * dump_ncpu_low:
 *	This is the minimum configuration for parallel lzjb.
 *	A special value of 0 means that parallel dump will not be used.
 *
 * dump_metrics_on:
 *	If set, metrics are collected in the kernel, passed to savecore
 *	via the dump file, and recorded by savecore in METRICS.txt.
 */
uint_t dump_ncpu_low = 4;	/* minimum config for parallel lzjb */

/* tunables for pre-reserved heap */
uint_t dump_kmem_permap = 1024;
uint_t dump_kmem_pages = 8;
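
/*
 * For illustration (not from the original source): these tunables are
 * normally set from /etc/system, e.g.:
 *
 *	set dump_ncpu_low = 0
 *	set dump_metrics_on = 1
 *
 * The first line forces a single-threaded (serial) dump; the second
 * asks that compression metrics be recorded in METRICS.txt.
 */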

/* Define multiple buffers per helper to avoid stalling */
#define	NCBUF_PER_HELPER	2
#define	NCMAP_PER_HELPER	4

/* minimum number of helpers configured */
#define	MINHELPERS	(MAX(dump_ncpu_low, 1))
#define	MINCBUFS	(MINHELPERS * NCBUF_PER_HELPER)

/*
 * Define constant parameters.
 *
 * CBUF_SIZE		size of an output buffer
 *
 * CBUF_MAPSIZE		size of virtual range for mapping pages
 *
 * CBUF_MAPNP		size of virtual range in pages
 *
 */
#define	DUMP_1KB	((size_t)1 << 10)
#define	DUMP_1MB	((size_t)1 << 20)
#define	CBUF_SIZE	((size_t)1 << 17)
#define	CBUF_MAPSHIFT	(22)
#define	CBUF_MAPSIZE	((size_t)1 << CBUF_MAPSHIFT)
#define	CBUF_MAPNP	((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))
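
/*
 * Worked example (illustrative, assuming 4 KB pages, PAGESHIFT == 12):
 * CBUF_SIZE is 1 << 17 = 128 KB per output buffer, CBUF_MAPSIZE is
 * 1 << 22 = 4 MB per input mapping window, and CBUF_MAPNP is
 * 1 << (22 - 12) = 1024 pages per window.
 */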

/*
 * Compression metrics are accumulated as nanosecond subtotals. The
 * results are normalized by the number of pages dumped. A report is
 * generated when dumpsys() completes and is saved in the dump image
 * after the trailing dump header.
 *
 * Metrics are always collected. Set the variable dump_metrics_on to
 * cause metrics to be saved in the crash file, where savecore will
 * save them in the file METRICS.txt.
 */
#define	PERPAGES \
	PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
	PERPAGE(copy) PERPAGE(compress) \
	PERPAGE(write) \
	PERPAGE(inwait) PERPAGE(outwait)

typedef struct perpage {
#define	PERPAGE(x) hrtime_t x;
	PERPAGES
#undef PERPAGE
} perpage_t;
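
/*
 * For reference (expansion sketch, not in the original source): the
 * PERPAGE X-macro generates one hrtime_t accumulator per phase, so
 * perpage_t is equivalent to:
 *
 *	typedef struct perpage {
 *		hrtime_t bitmap, map, unmap;
 *		hrtime_t copy, compress;
 *		hrtime_t write;
 *		hrtime_t inwait, outwait;
 *	} perpage_t;
 */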

/*
 * This macro controls the code generation for collecting dump
 * performance information. By default, the code is generated, but
 * automatic saving of the information is disabled. If dump_metrics_on
 * is set to 1, the timing information is passed to savecore via the
 * crash file, where it is appended to the file dump-dir/METRICS.txt.
 */
#define	COLLECT_METRICS

#ifdef COLLECT_METRICS
uint_t dump_metrics_on = 0;	/* set to 1 to enable recording metrics */

#define	HRSTART(v, m)		v##ts.m = gethrtime()
#define	HRSTOP(v, m)		v.m += gethrtime() - v##ts.m
#define	HRBEGIN(v, m, s)	v##ts.m = gethrtime(); v.size += s
#define	HREND(v, m)		v.m += gethrtime() - v##ts.m
#define	HRNORM(v, m, n)		v.m /= (n)

#else
#define	HRSTART(v, m)
#define	HRSTOP(v, m)
#define	HRBEGIN(v, m, s)
#define	HREND(v, m)
#define	HRNORM(v, m, n)
#endif	/* COLLECT_METRICS */
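
/*
 * Usage sketch (illustrative): HRSTART/HRSTOP bracket a phase and
 * accumulate its elapsed time. The "ts" suffix pairs a timestamp
 * struct with its accumulator via token pasting, so given
 * "perpage_t perpage, perpagets;" the sequence
 *
 *	HRSTART(perpage, compress);
 *	csize = compress(src, dst, PAGESIZE);
 *	HRSTOP(perpage, compress);
 *
 * expands to
 *
 *	perpagets.compress = gethrtime();
 *	csize = compress(src, dst, PAGESIZE);
 *	perpage.compress += gethrtime() - perpagets.compress;
 */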

/*
 * Buffers for copying and compressing memory pages.
 *
 * cbuf_t buffer controllers: used for both input and output.
 *
 * The buffer state indicates how it is being used:
 *
 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
 * mapping input pages.
 *
 * CBUF_INREADY: input pages are mapped and ready for compression by a
 * helper.
 *
 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
 *
 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
 *
 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
 * ready to write out.
 *
 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
 * (reports UE errors).
 */

typedef enum cbufstate {
	CBUF_FREEMAP,
	CBUF_INREADY,
	CBUF_USEDMAP,
	CBUF_FREEBUF,
	CBUF_WRITE,
	CBUF_ERRMSG
} cbufstate_t;

typedef struct cbuf cbuf_t;

struct cbuf {
	cbuf_t *next;			/* next in list */
	cbufstate_t state;		/* processing state */
	size_t used;			/* amount used */
	size_t size;			/* mem size */
	char *buf;			/* kmem or vmem */
	pgcnt_t pagenum;		/* index to pfn map */
	pgcnt_t bitnum;			/* first set bitnum */
	pfn_t pfn;			/* first pfn in mapped range */
	int off;			/* byte offset to first pfn */
};

static char dump_osimage_uuid[UUID_PRINTABLE_STRING_LENGTH];

#define	isdigit(ch)	((ch) >= '0' && (ch) <= '9')
#define	isxdigit(ch)	(isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
	((ch) >= 'A' && (ch) <= 'F'))

/*
 * cqueue_t queues: a uni-directional channel for communication
 * from the master to helper tasks or vice-versa using put and
 * get primitives. Both mappings and data buffers are passed via
 * queues. Producers close a queue when done. The number of
 * active producers is reference counted so the consumer can
 * detect end of data. Concurrent access is mediated by atomic
 * operations for panic dump, or mutex/cv for live dump.
 *
 * There are four queues, used as follows:
 *
 * Queue			Dataflow		NewState
 * --------------------------------------------------
 * mainq			master -> master	FREEMAP
 * master has initialized or unmapped an input buffer
 * --------------------------------------------------
 * helperq			master -> helper	INREADY
 * master has mapped input for use by helper
 * --------------------------------------------------
 * mainq			master <- helper	USEDMAP
 * helper is done with input
 * --------------------------------------------------
 * freebufq			master -> helper	FREEBUF
 * master has initialized or written an output buffer
 * --------------------------------------------------
 * mainq			master <- helper	WRITE
 * block of compressed pages from a helper
 * --------------------------------------------------
 * mainq			master <- helper	ERRMSG
 * error messages from a helper (memory error case)
 * --------------------------------------------------
 * writerq			master <- master	WRITE
 * non-blocking queue of blocks to write
 * --------------------------------------------------
 */
typedef struct cqueue {
	cbuf_t *volatile first;		/* first in list */
	cbuf_t *last;			/* last in list */
	hrtime_t ts;			/* timestamp */
	hrtime_t empty;			/* total time empty */
	kmutex_t mutex;			/* live state lock */
	kcondvar_t cv;			/* live wait var */
	lock_t spinlock;		/* panic mode spin lock */
	volatile uint_t open;		/* producer ref count */
} cqueue_t;

/*
 * Convenience macros for using the cqueue functions
 * Note that the caller must have defined "dumpsync_t *ds"
 */
#define	CQ_IS_EMPTY(q) \
	(ds->q.first == NULL)

#define	CQ_OPEN(q) \
	atomic_inc_uint(&ds->q.open)

#define	CQ_CLOSE(q) \
	dumpsys_close_cq(&ds->q, ds->live)

#define	CQ_PUT(q, cp, st) \
	dumpsys_put_cq(&ds->q, cp, st, ds->live)

#define	CQ_GET(q) \
	dumpsys_get_cq(&ds->q, ds->live)
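
/*
 * Usage sketch (illustrative, not in the original source): a producer
 * opens a queue, puts buffers, and closes it when done; a consumer
 * gets buffers until NULL, which means the queue is closed and empty:
 *
 *	dumpsync_t *ds = &dumpsync;
 *	cbuf_t *cp;
 *
 *	CQ_OPEN(helperq);			(producer side)
 *	CQ_PUT(helperq, cp, CBUF_INREADY);
 *	CQ_CLOSE(helperq);
 *
 *	while ((cp = CQ_GET(helperq)) != NULL)	(consumer side)
 *		...process cp...
 */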

/*
 * Dynamic state when dumpsys() is running.
 */
typedef struct dumpsync {
	pgcnt_t npages;			/* subtotal of pages dumped */
	pgcnt_t pages_mapped;		/* subtotal of pages mapped */
	pgcnt_t pages_used;		/* subtotal of pages used per map */
	size_t nwrite;			/* subtotal of bytes written */
	uint_t live;			/* running live dump */
	uint_t neednl;			/* will need to print a newline */
	uint_t percent;			/* dump progress */
	uint_t percent_done;		/* dump progress reported */
	int sec_done;			/* dump progress last report time */
	cqueue_t freebufq;		/* free kmem bufs for writing */
	cqueue_t mainq;			/* input for main task */
	cqueue_t helperq;		/* input for helpers */
	cqueue_t writerq;		/* input for writer */
	hrtime_t start;			/* start time */
	hrtime_t elapsed;		/* elapsed time when completed */
	hrtime_t iotime;		/* time spent writing nwrite bytes */
	hrtime_t iowait;		/* time spent waiting for output */
	hrtime_t iowaitts;		/* iowait timestamp */
	perpage_t perpage;		/* metrics */
	perpage_t perpagets;
	int dumpcpu;			/* master cpu */
} dumpsync_t;

static dumpsync_t dumpsync;		/* synchronization vars */

/*
 * helper_t helpers: contains the context for a stream. CPUs run in
 * parallel at dump time; each CPU creates a single stream of
 * compression data. Stream data is divided into CBUF_SIZE blocks.
 * The blocks are written in order within a stream. But, blocks from
 * multiple streams can be interleaved. Each stream is identified by a
 * unique tag.
 */
typedef struct helper {
	int helper;			/* bound helper id */
	int tag;			/* compression stream tag */
	perpage_t perpage;		/* per page metrics */
	perpage_t perpagets;		/* per page metrics (timestamps) */
	taskqid_t taskqid;		/* live dump task ptr */
	int in, out;			/* buffer offsets */
	cbuf_t *cpin, *cpout, *cperr;	/* cbuf objects in process */
	dumpsync_t *ds;			/* pointer to sync vars */
	size_t used;			/* counts input consumed */
	char *page;			/* buffer for page copy */
	char *lzbuf;			/* lzjb output */
} helper_t;

#define	MAINHELPER	(-1)		/* helper is also the main task */
#define	FREEHELPER	(-2)		/* unbound helper */
#define	DONEHELPER	(-3)		/* helper finished */

/*
 * configuration vars for dumpsys
 */
typedef struct dumpcfg {
	int nhelper;		/* number of helpers */
	int nhelper_used;	/* actual number of helpers used */
	int ncmap;		/* number of VA pages for compression */
	int ncbuf;		/* number of bufs for compression */
	int ncbuf_used;		/* number of bufs in use */
	uint_t clevel;		/* dump compression level */
	helper_t *helper;	/* array of helpers */
	cbuf_t *cmap;		/* array of input (map) buffers */
	cbuf_t *cbuf;		/* array of output buffers */
	ulong_t *helpermap;	/* set of dumpsys helper CPU ids */
	ulong_t *bitmap;	/* bitmap for marking pages to dump */
	ulong_t *rbitmap;	/* bitmap for used CBUF_MAPSIZE ranges */
	pgcnt_t bitmapsize;	/* size of bitmap */
	pgcnt_t rbitmapsize;	/* size of bitmap for ranges */
	pgcnt_t found4m;	/* number of ranges allocated by dump */
	pgcnt_t foundsm;	/* number of small pages allocated by dump */
	pid_t *pids;		/* list of process IDs at dump time */
	size_t maxsize;		/* memory size needed at dump time */
	size_t maxvmsize;	/* size of reserved VM */
	char *maxvm;		/* reserved VM for spare pages */
	lock_t helper_lock;	/* protect helper state */
	char helpers_wanted;	/* flag to enable parallelism */
} dumpcfg_t;

static dumpcfg_t dumpcfg;	/* config vars */

/*
 * The dump I/O buffer.
 *
 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
 * sized according to the optimum device transfer speed.
 */
typedef struct dumpbuf {
	vnode_t	*cdev_vp;	/* VCHR open of the dump device */
	len_t	vp_limit;	/* maximum write offset */
	offset_t vp_off;	/* current dump device offset */
	char	*cur;		/* dump write pointer */
	char	*start;		/* dump buffer address */
	char	*end;		/* dump buffer end */
	size_t	size;		/* size of dumpbuf in bytes */
	size_t	iosize;		/* best transfer size for device */
} dumpbuf_t;

dumpbuf_t dumpbuf;		/* I/O buffer */

/*
 * DUMP_HELPER_MAX_WAIT
 * For parallel dump, the maximum time the main task will wait for at
 * least one helper to register in dumpcfg.helpermap before assuming
 * there are no helpers and falling back to serial mode.
 */
#define	DUMP_HELPER_MAX_WAIT	1000	/* millisec */

/*
 * The dump I/O buffer must be at least one page, at most xfer_size
 * bytes, and should scale with physmem in between. The transfer size
 * passed in will either represent a global default (maxphys) or the
 * best size for the device. The size of the dumpbuf I/O buffer is
 * limited by dumpbuf_limit (8MB by default) because the dump
 * performance saturates beyond a certain size. The default is to
 * select 1/4096 of the memory.
 */
static int	dumpbuf_fraction = 12;	/* memory size scale factor */
static size_t	dumpbuf_limit = 8 * DUMP_1MB;	/* max I/O buf size */

static size_t
dumpbuf_iosize(size_t xfer_size)
{
	size_t iosize = ptob(physmem >> dumpbuf_fraction);

	if (iosize < PAGESIZE)
		iosize = PAGESIZE;
	else if (iosize > xfer_size)
		iosize = xfer_size;
	if (iosize > dumpbuf_limit)
		iosize = dumpbuf_limit;
	return (iosize & PAGEMASK);
}
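
/*
 * Worked example (illustrative, assuming 4 KB pages): on a 32 GB
 * system (8M pages), ptob(8M >> 12) = ptob(2048) = 8 MB, which
 * already hits dumpbuf_limit; a 1 GB system (256K pages) yields
 * ptob(256K >> 12) = ptob(64) = 256 KB, clamped between PAGESIZE
 * and xfer_size.
 */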

/*
 * resize the I/O buffer
 */
static void
dumpbuf_resize(void)
{
	char *old_buf = dumpbuf.start;
	size_t old_size = dumpbuf.size;
	char *new_buf;
	size_t new_size;

	ASSERT(MUTEX_HELD(&dump_lock));

	new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
	if (new_size <= old_size)
		return; /* no need to reallocate buffer */

	new_buf = kmem_alloc(new_size, KM_SLEEP);
	dumpbuf.size = new_size;
	dumpbuf.start = new_buf;
	dumpbuf.end = new_buf + new_size;
	kmem_free(old_buf, old_size);
}

/*
 * dump_update_clevel is called when dumpadm configures the dump device.
 * Determine the compression level / type
 * - DUMP_CLEVEL_SERIAL is single threaded lzjb
 * - DUMP_CLEVEL_LZJB is parallel lzjb
 * Calculate number of helpers and buffers.
 * Allocate the minimum configuration for now.
 *
 * When the dump file is configured we reserve a minimum amount of
 * memory for use at crash time. But we reserve VA for all the memory
 * we really want in order to do the fastest dump possible. The VA is
 * backed by pages not being dumped, according to the bitmap. If
 * there is insufficient spare memory, however, we fall back to the
 * minimum.
 *
 * Live dump (savecore -L) always uses the minimum config.
 *
 * For parallel dumps, the number of helpers is ncpus - 1. The CPU
 * running panic runs the main task. For single-threaded dumps, the
 * panic CPU does lzjb compression (it is tagged as MAINHELPER).
 *
 * Need multiple buffers per helper so that they do not block waiting
 * for the main task.
 *				parallel	single-threaded
 * Number of output buffers:	nhelper*2	1
 * Number of mapping buffers:	nhelper*4	1
 *
 */
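
/*
 * Worked example (illustrative): on a 16-CPU system the parallel
 * configuration is nhelper = 15, ncbuf = NCBUF_PER_HELPER * 15 = 30
 * output buffers, and ncmap = NCMAP_PER_HELPER * 15 = 60 mapping
 * windows; a serial configuration uses one of each.
 */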
static void
dump_update_clevel(void)
{
	int tag;
	helper_t *hp, *hpend;
	cbuf_t *cp, *cpend;
	dumpcfg_t *old = &dumpcfg;
	dumpcfg_t newcfg = *old;
	dumpcfg_t *new = &newcfg;

	ASSERT(MUTEX_HELD(&dump_lock));

	/*
	 * Free the previously allocated bufs and VM.
	 */
	if (old->helper != NULL) {

		/* helpers */
		hpend = &old->helper[old->nhelper];
		for (hp = old->helper; hp != hpend; hp++) {
			if (hp->lzbuf != NULL)
				kmem_free(hp->lzbuf, PAGESIZE);
			if (hp->page != NULL)
				kmem_free(hp->page, PAGESIZE);
		}
		kmem_free(old->helper, old->nhelper * sizeof (helper_t));

		/* VM space for mapping pages */
		cpend = &old->cmap[old->ncmap];
		for (cp = old->cmap; cp != cpend; cp++)
			vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
		kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));

		/* output bufs */
		cpend = &old->cbuf[old->ncbuf];
		for (cp = old->cbuf; cp != cpend; cp++)
			if (cp->buf != NULL)
				kmem_free(cp->buf, cp->size);
		kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));

		/* reserved VM for dumpsys_get_maxmem */
		if (old->maxvmsize > 0)
			vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
	}

	/*
	 * Allocate memory and VM.
	 * One CPU runs dumpsys, the rest are helpers.
	 */
	new->nhelper = ncpus - 1;
	if (new->nhelper < 1)
		new->nhelper = 1;

	if (new->nhelper > DUMP_MAX_NHELPER)
		new->nhelper = DUMP_MAX_NHELPER;

	/* If dump_ncpu_low is 0 or greater than ncpus, do serial dump */
	if (dump_ncpu_low == 0 || dump_ncpu_low > ncpus || new->nhelper < 2) {
		new->clevel = DUMP_CLEVEL_SERIAL;
		new->nhelper = 1;
		new->ncbuf = 1;
		new->ncmap = 1;
	} else {
		new->clevel = DUMP_CLEVEL_LZJB;
		new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
		new->ncmap = NCMAP_PER_HELPER * new->nhelper;
	}

	/*
	 * Allocate new data structures and buffers for MINHELPERS,
	 * and also figure the max desired size.
	 */
	new->maxsize = 0;
	new->maxvmsize = 0;
	new->maxvm = NULL;
	tag = 1;
	new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
	hpend = &new->helper[new->nhelper];
	for (hp = new->helper; hp != hpend; hp++) {
		hp->tag = tag++;
		if (hp < &new->helper[MINHELPERS]) {
			hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
			hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
		} else {
			new->maxsize += 2 * PAGESIZE;
		}
	}

	new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
	cpend = &new->cbuf[new->ncbuf];
	for (cp = new->cbuf; cp != cpend; cp++) {
		cp->state = CBUF_FREEBUF;
		cp->size = CBUF_SIZE;
		if (cp < &new->cbuf[MINCBUFS])
			cp->buf = kmem_alloc(cp->size, KM_SLEEP);
		else
			new->maxsize += cp->size;
	}

	new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
	cpend = &new->cmap[new->ncmap];
	for (cp = new->cmap; cp != cpend; cp++) {
		cp->state = CBUF_FREEMAP;
		cp->size = CBUF_MAPSIZE;
		cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
		    0, 0, NULL, NULL, VM_SLEEP);
	}

	/* reserve VA to be backed with spare pages at crash time */
	if (new->maxsize > 0) {
		new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
		new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
		new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
		    CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
	}

	/*
	 * Reserve memory for kmem allocation calls made during crash
	 * dump. The hat layer allocates memory for each mapping
	 * created, and the I/O path allocates buffers and data structs.
	 * Add a few pages for safety.
	 */
	kmem_dump_init((new->ncmap * dump_kmem_permap) +
	    (dump_kmem_pages * PAGESIZE));

	/* set new config pointers */
	*old = *new;
}

/*
 * Define a struct memlist walker to optimize bitnum to pfn
 * lookup. The walker maintains the state of the list traversal.
 */
typedef struct dumpmlw {
	struct memlist	*mp;		/* current memlist */
	pgcnt_t		basenum;	/* bitnum base offset */
	pgcnt_t		mppages;	/* current memlist size */
	pgcnt_t		mpleft;		/* size to end of current memlist */
	pfn_t		mpaddr;		/* first pfn in memlist */
} dumpmlw_t;

/* initialize the walker */
static inline void
dump_init_memlist_walker(dumpmlw_t *pw)
{
	pw->mp = phys_install;
	pw->basenum = 0;
	pw->mppages = pw->mp->ml_size >> PAGESHIFT;
	pw->mpleft = pw->mppages;
	pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
}

/*
 * Lookup pfn given bitnum. The memlist can be quite long on some
 * systems (e.g.: one per board). To optimize sequential lookups, the
 * caller initializes and presents a memlist walker.
 */
static pfn_t
dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
{
	bitnum -= pw->basenum;
	while (pw->mp != NULL) {
		if (bitnum < pw->mppages) {
			pw->mpleft = pw->mppages - bitnum;
			return (pw->mpaddr + bitnum);
		}
		bitnum -= pw->mppages;
		pw->basenum += pw->mppages;
		pw->mp = pw->mp->ml_next;
		if (pw->mp != NULL) {
			pw->mppages = pw->mp->ml_size >> PAGESHIFT;
			pw->mpleft = pw->mppages;
			pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
		}
	}
	return (PFN_INVALID);
}
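
/*
 * Typical traversal (sketch, mirroring the loops later in this file):
 *
 *	dumpmlw_t mlw;
 *	pgcnt_t bitnum;
 *	pfn_t pfn;
 *
 *	dump_init_memlist_walker(&mlw);
 *	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++)
 *		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
 *
 * Sequential calls advance the walker instead of rescanning the
 * memlist from the head each time.
 */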

static pgcnt_t
dump_pfn_to_bitnum(pfn_t pfn)
{
	struct memlist *mp;
	pgcnt_t bitnum = 0;

	for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
		if (pfn >= (mp->ml_address >> PAGESHIFT) &&
		    pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
			return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
		bitnum += mp->ml_size >> PAGESHIFT;
	}
	return ((pgcnt_t)-1);
}

/*
 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
 * mapping of pfn to range index is imperfect because pfn and bitnum
 * do not have the same phase. To make sure a CBUF_MAPSIZE range is
 * covered, call this for both ends:
 *	dump_set_used(base)
 *	dump_set_used(base+CBUF_MAPNP-1)
 *
 * This is used during a panic dump to mark pages allocated by
 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
 * page_get_mnode_freelist() to make sure pages used by dump are never
 * allocated.
 */
#define	CBUF_MAPP2R(pfn)	((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))
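
/*
 * Arithmetic note (illustrative, assuming 4 KB pages): CBUF_MAPP2R
 * shifts by 22 - 12 = 10, so each rbitmap bit covers 1024 consecutive
 * bitnums. A 1024-page range that is not 4 MB aligned straddles two
 * rbitmap bits, which is why both ends of the range must be marked.
 */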

static void
dump_set_used(pfn_t pfn)
{
	pgcnt_t bitnum, rbitnum;

	bitnum = dump_pfn_to_bitnum(pfn);
	ASSERT(bitnum != (pgcnt_t)-1);

	rbitnum = CBUF_MAPP2R(bitnum);
	ASSERT(rbitnum < dumpcfg.rbitmapsize);

	BT_SET(dumpcfg.rbitmap, rbitnum);
}

int
dump_test_used(pfn_t pfn)
{
	pgcnt_t bitnum, rbitnum;

	bitnum = dump_pfn_to_bitnum(pfn);
	ASSERT(bitnum != (pgcnt_t)-1);

	rbitnum = CBUF_MAPP2R(bitnum);
	ASSERT(rbitnum < dumpcfg.rbitmapsize);

	return (BT_TEST(dumpcfg.rbitmap, rbitnum));
}

/*
 * Perform additional checks on the page to see if we can really use
 * it. The kernel (kas) pages are always set in the bitmap. However,
 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
 * bitmap. So we check for them.
 */
static inline int
dump_pfn_check(pfn_t pfn)
{
	page_t *pp = page_numtopp_nolock(pfn);
	if (pp == NULL || pp->p_pagenum != pfn ||
#if defined(__sparc)
	    pp->p_vnode == &promvp ||
#else
	    PP_ISBOOTPAGES(pp) ||
#endif
	    pp->p_toxic != 0)
		return (0);
	return (1);
}

/*
 * Check a range to see if all contained pages are available and
 * return non-zero if the range can be used.
 */
static inline int
dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
{
	for (; start < end; start++, pfn++) {
		if (BT_TEST(dumpcfg.bitmap, start))
			return (0);
		if (!dump_pfn_check(pfn))
			return (0);
	}
	return (1);
}

/*
 * dumpsys_get_maxmem() is called during panic. Find unused ranges
 * and use them for buffers.
 * It searches the dump bitmap in 2 passes. The first time it looks
 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
 */
static void
dumpsys_get_maxmem(void)
{
	dumpcfg_t *cfg = &dumpcfg;
	cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
	helper_t *endhp = &cfg->helper[cfg->nhelper];
	pgcnt_t bitnum, end;
	size_t sz, endsz;
	pfn_t pfn, off;
	cbuf_t *cp;
	helper_t *hp;
	dumpmlw_t mlw;
	int k;

	/*
	 * Setting dump_ncpu_low to 0 forces a single threaded dump.
	 */
	if (dump_ncpu_low == 0) {
		cfg->clevel = DUMP_CLEVEL_SERIAL;
		return;
	}

	/*
	 * There may be no point in looking for spare memory. If
	 * dumping all memory, then none is spare. If doing a serial
	 * dump, then we already have buffers.
	 */
	if (cfg->maxsize == 0 || cfg->clevel == DUMP_CLEVEL_SERIAL ||
	    (dump_conflags & DUMP_ALL) != 0) {
		return;
	}

	sz = 0;
	cfg->found4m = 0;
	cfg->foundsm = 0;

	/* bitmap of ranges used to estimate which pfns are being used */
	bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));

	/* find ranges that are not being dumped to use for buffers */
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
		dump_timeleft = dump_timeout;
		end = bitnum + CBUF_MAPNP;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);

		/* skip partial range at end of mem segment */
		if (mlw.mpleft < CBUF_MAPNP) {
			end = bitnum + mlw.mpleft;
			continue;
		}

		/* skip non-aligned pages */
		off = P2PHASE(pfn, CBUF_MAPNP);
		if (off != 0) {
			end -= off;
			continue;
		}

		if (!dump_range_check(bitnum, end, pfn))
			continue;

		ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
		hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
		    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
		sz += CBUF_MAPSIZE;
		cfg->found4m++;

		/* set the bitmap for both ends to be sure to cover the range */
		dump_set_used(pfn);
		dump_set_used(pfn + CBUF_MAPNP - 1);

		if (sz >= cfg->maxsize)
			goto foundmax;
	}

	/* Add small pages if we can't find enough large pages. */
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
		dump_timeleft = dump_timeout;
		end = bitnum + CBUF_MAPNP;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);

		/* Find any non-aligned pages at start and end of segment. */
		off = P2PHASE(pfn, CBUF_MAPNP);
		if (mlw.mpleft < CBUF_MAPNP) {
			end = bitnum + mlw.mpleft;
		} else if (off != 0) {
			end -= off;
		} else if (cfg->found4m && dump_test_used(pfn)) {
			continue;
		}

		for (; bitnum < end; bitnum++, pfn++) {
			dump_timeleft = dump_timeout;
			if (BT_TEST(dumpcfg.bitmap, bitnum))
				continue;
			if (!dump_pfn_check(pfn))
				continue;
			ASSERT((sz + PAGESIZE) <= cfg->maxvmsize);
			hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn,
			    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
			sz += PAGESIZE;
			cfg->foundsm++;
			dump_set_used(pfn);
			if (sz >= cfg->maxsize)
				goto foundmax;
		}
	}

	/* Allocate memory for as many helpers as we can. */
foundmax:

	/* Byte offsets into memory found and mapped above */
	endsz = sz;
	sz = 0;

	/* Skip the preallocated output buffers. */
	cp = &cfg->cbuf[MINCBUFS];

	/* Loop over all helpers and allocate memory. */
	for (hp = cfg->helper; hp < endhp; hp++) {

		/* Skip preallocated helpers by checking hp->page. */
		if (hp->page == NULL) {
			/* lzjb needs 2 1-page buffers */
			if ((sz + (2 * PAGESIZE)) > endsz)
				break;
			hp->page = cfg->maxvm + sz;
			sz += PAGESIZE;
			hp->lzbuf = cfg->maxvm + sz;
			sz += PAGESIZE;
		}

		/*
		 * Add output buffers per helper. The number of
		 * buffers per helper is determined by the ratio of
		 * ncbuf to nhelper.
		 */
		for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz &&
		    k < NCBUF_PER_HELPER; k++) {
			cp->state = CBUF_FREEBUF;
			cp->size = CBUF_SIZE;
			cp->buf = cfg->maxvm + sz;
			sz += CBUF_SIZE;
			++cp;
		}
	}

	/* Finish allocating output buffers */
	for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) {
		cp->state = CBUF_FREEBUF;
		cp->size = CBUF_SIZE;
		cp->buf = cfg->maxvm + sz;
		sz += CBUF_SIZE;
	}

	/* Enable IS_DUMP_PAGE macro, which checks for pages we took. */
	if (cfg->found4m || cfg->foundsm)
		dump_check_used = 1;

	ASSERT(sz <= endsz);
}

static void
dumphdr_init(void)
{
	pgcnt_t npages = 0;

	ASSERT(MUTEX_HELD(&dump_lock));

	if (dumphdr == NULL) {
		dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP);
		dumphdr->dump_magic = DUMP_MAGIC;
		dumphdr->dump_version = DUMP_VERSION;
		dumphdr->dump_wordsize = DUMP_WORDSIZE;
		dumphdr->dump_pageshift = PAGESHIFT;
		dumphdr->dump_pagesize = PAGESIZE;
		dumphdr->dump_utsname = utsname;
		(void) strcpy(dumphdr->dump_platform, platform);
		dumpbuf.size = dumpbuf_iosize(maxphys);
		dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP);
		dumpbuf.end = dumpbuf.start + dumpbuf.size;
		dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP);
		dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP);
		LOCK_INIT_HELD(&dumpcfg.helper_lock);
		dump_stack_scratch = kmem_alloc(STACK_BUF_SIZE, KM_SLEEP);
		(void) strncpy(dumphdr->dump_uuid, dump_get_uuid(),
		    sizeof (dumphdr->dump_uuid));
	}

	npages = num_phys_pages();

	if (dumpcfg.bitmapsize != npages) {
		size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP));
		void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP);
		void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP);

		if (dumpcfg.bitmap != NULL)
			kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.
			    bitmapsize));
		if (dumpcfg.rbitmap != NULL)
			kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.
			    rbitmapsize));
		dumpcfg.bitmap = map;
		dumpcfg.bitmapsize = npages;
		dumpcfg.rbitmap = rmap;
		dumpcfg.rbitmapsize = rlen;
	}
}

/*
 * Establish a new dump device.
 */
int
dumpinit(vnode_t *vp, char *name, int justchecking)
{
	vnode_t *cvp;
	vattr_t vattr;
	vnode_t *cdev_vp;
	int error = 0;

	ASSERT(MUTEX_HELD(&dump_lock));

	dumphdr_init();

	cvp = common_specvp(vp);
	if (cvp == dumpvp)
		return (0);

	/*
	 * Determine whether this is a plausible dump device. We want either:
	 * (1) a real device that's not mounted and has a cb_dump routine, or
	 * (2) a swapfile on some filesystem that has a vop_dump routine.
	 */
	if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0)
		return (error);

	vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV;
	if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) {
		if (vattr.va_type == VBLK || vattr.va_type == VCHR) {
			if (devopsp[getmajor(vattr.va_rdev)]->
			    devo_cb_ops->cb_dump == nodev)
				error = ENOTSUP;
			else if (vfs_devismounted(vattr.va_rdev))
				error = EBUSY;
			if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
			    ZFS_DRIVER) == 0 &&
			    IS_SWAPVP(common_specvp(cvp)))
				error = EBUSY;
		} else {
			if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) ||
			    !IS_SWAPVP(cvp))
				error = ENOTSUP;
		}
	}

	if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE)
		error = ENOSPC;

	if (error || justchecking) {
		(void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0,
		    kcred, NULL);
		return (error);
	}

	VN_HOLD(cvp);

	if (dumpvp != NULL)
		dumpfini();	/* unconfigure the old dump device */

	dumpvp = cvp;
	dumpvp_size = vattr.va_size & -DUMP_OFFSET;
	dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP);
	(void) strcpy(dumppath, name);
	dumpbuf.iosize = 0;

	/*
	 * If the dump device is a block device, attempt to open up the
	 * corresponding character device and determine its maximum transfer
	 * size. We use this information to potentially resize dumpbuf to a
	 * larger and more optimal size for performing i/o to the dump device.
	 */
	if (cvp->v_type == VBLK &&
	    (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) {
		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			size_t blk_size;
			struct dk_cinfo dki;
			struct dk_minfo minf;

			/* zero dki so dki_dname is valid if DKIOCINFO fails */
			bzero(&dki, sizeof (dki));

			if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO,
			    (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL)
			    == 0 && minf.dki_lbsize != 0)
				blk_size = minf.dki_lbsize;
			else
				blk_size = DEV_BSIZE;

			if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki,
			    FKIOCTL, kcred, NULL, NULL) == 0) {
				dumpbuf.iosize = dki.dki_maxtransfer * blk_size;
				dumpbuf_resize();
			}
			/*
			 * If we are working with a zvol then dumpify it
			 * if it's not being used as swap.
			 */
			if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
				if (IS_SWAPVP(common_specvp(cvp)))
					error = EBUSY;
				else if ((error = VOP_IOCTL(cdev_vp,
				    DKIOCDUMPINIT, NULL, FKIOCTL, kcred,
				    NULL, NULL)) != 0)
					dumpfini();
			}

			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
		}

		VN_RELE(cdev_vp);
	}

	cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20);

	dump_update_clevel();

	return (error);
}

void
dumpfini(void)
{
	vattr_t vattr;
	boolean_t is_zfs = B_FALSE;
	vnode_t *cdev_vp;
	ASSERT(MUTEX_HELD(&dump_lock));

	kmem_free(dumppath, strlen(dumppath) + 1);

	/*
	 * Determine if we are using zvols for our dump device
	 */
	vattr.va_mask = AT_RDEV;
	if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) {
		is_zfs = (getmajor(vattr.va_rdev) ==
		    ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE;
	}

	/*
	 * If we have a zvol dump device then we call into zfs so
	 * that it may have a chance to cleanup.
	 */
	if (is_zfs &&
	    (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) {
		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			(void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL,
			    kcred, NULL, NULL);
			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
		}
		VN_RELE(cdev_vp);
	}

	(void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL);

	VN_RELE(dumpvp);

	dumpvp = NULL;
	dumpvp_size = 0;
	dumppath = NULL;
}

static offset_t
dumpvp_flush(void)
{
	size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE);
	hrtime_t iotime;
	int err;

	if (dumpbuf.vp_off + size > dumpbuf.vp_limit) {
		dump_ioerr = ENOSPC;
		dumpbuf.vp_off = dumpbuf.vp_limit;
	} else if (size != 0) {
		iotime = gethrtime();
		dumpsync.iowait += iotime - dumpsync.iowaitts;
		if (panicstr)
			err = VOP_DUMP(dumpvp, dumpbuf.start,
			    lbtodb(dumpbuf.vp_off), btod(size), NULL);
		else
			err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ?
			    dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size,
			    dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit,
			    kcred, 0);
		if (err && dump_ioerr == 0)
			dump_ioerr = err;
		dumpsync.iowaitts = gethrtime();
		dumpsync.iotime += dumpsync.iowaitts - iotime;
		dumpsync.nwrite += size;
		dumpbuf.vp_off += size;
	}
	dumpbuf.cur = dumpbuf.start;
	dump_timeleft = dump_timeout;
	return (dumpbuf.vp_off);
}

/* maximize write speed by keeping seek offset aligned with size */
void
dumpvp_write(const void *va, size_t size)
{
	size_t len, off, sz;

	while (size != 0) {
		len = MIN(size, dumpbuf.end - dumpbuf.cur);
		if (len == 0) {
			off = P2PHASE(dumpbuf.vp_off, dumpbuf.size);
			if (off == 0 || !ISP2(dumpbuf.size)) {
				(void) dumpvp_flush();
			} else {
				/*
				 * Flush only the first sz bytes, which
				 * brings vp_off to a dumpbuf.size
				 * boundary, then slide the remaining
				 * off bytes to the front of the buffer.
				 */
				sz = dumpbuf.size - off;
				dumpbuf.cur = dumpbuf.start + sz;
				(void) dumpvp_flush();
				ovbcopy(dumpbuf.start + sz, dumpbuf.start, off);
				dumpbuf.cur += off;
			}
		} else {
			bcopy(va, dumpbuf.cur, len);
			va = (char *)va + len;
			dumpbuf.cur += len;
			size -= len;
		}
	}
}

/*ARGSUSED*/
static void
dumpvp_ksyms_write(const void *src, void *dst, size_t size)
{
	dumpvp_write(src, size);
}

/*
 * Mark 'pfn' in the bitmap and dump its translation table entry.
 */
void
dump_addpage(struct as *as, void *va, pfn_t pfn)
{
	mem_vtop_t mem_vtop;
	pgcnt_t bitnum;

	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
			dumphdr->dump_npages++;
			BT_SET(dumpcfg.bitmap, bitnum);
		}
		dumphdr->dump_nvtop++;
		mem_vtop.m_as = as;
		mem_vtop.m_va = va;
		mem_vtop.m_pfn = pfn;
		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
	}
	dump_timeleft = dump_timeout;
}

/*
 * Mark 'pfn' in the bitmap
 */
void
dump_page(pfn_t pfn)
{
	pgcnt_t bitnum;

	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
			dumphdr->dump_npages++;
			BT_SET(dumpcfg.bitmap, bitnum);
		}
	}
	dump_timeleft = dump_timeout;
}

/*
 * Dump the <as, va, pfn> information for a given address space.
 * SEGOP_DUMP() will call dump_addpage() for each page in the segment.
 */
static void
dump_as(struct as *as)
{
	struct seg *seg;

	AS_LOCK_ENTER(as, RW_READER);
	for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
		if (seg->s_as != as)
			break;
		if (seg->s_ops == NULL)
			continue;
		SEGOP_DUMP(seg);
	}
	AS_LOCK_EXIT(as);

	if (seg != NULL)
		cmn_err(CE_WARN, "invalid segment %p in address space %p",
		    (void *)seg, (void *)as);
}

static int
dump_process(pid_t pid)
{
	proc_t *p = sprlock(pid);

	if (p == NULL)
		return (-1);
	if (p->p_as != &kas) {
		mutex_exit(&p->p_lock);
		dump_as(p->p_as);
		mutex_enter(&p->p_lock);
	}

	sprunlock(p);

	return (0);
}

/*
 * The following functions (dump_summary(), dump_ereports(), and
 * dump_messages()), write data to an uncompressed area within the
 * crashdump. The layout of these is
 *
 * +------------------------------------------------------------+
 * |  compressed pages  |  summary  |  ereports  |  messages  |
 * +------------------------------------------------------------+
 *
 * With the advent of saving a compressed crash dump by default, we
 * need to save a little more data to describe the failure mode in
 * an uncompressed buffer available before savecore uncompresses
 * the dump. Initially this is a copy of the stack trace. Additional
 * summary information should be added here.
 */

void
dump_summary(void)
{
	u_offset_t dumpvp_start;
	summary_dump_t sd;

	if (dumpvp == NULL || dumphdr == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;

	dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE +
	    DUMP_ERPTSIZE);
	dumpvp_start = dumpbuf.vp_limit - DUMP_SUMMARYSIZE;
	dumpbuf.vp_off = dumpvp_start;

	sd.sd_magic = SUMMARY_MAGIC;
	sd.sd_ssum = checksum32(dump_stack_scratch, STACK_BUF_SIZE);
	dumpvp_write(&sd, sizeof (sd));
	dumpvp_write(dump_stack_scratch, STACK_BUF_SIZE);

	sd.sd_magic = 0;	/* indicate end of summary */
	dumpvp_write(&sd, sizeof (sd));
	(void) dumpvp_flush();
}

void
dump_ereports(void)
{
	u_offset_t dumpvp_start;
	erpt_dump_t ed;

	if (dumpvp == NULL || dumphdr == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;
	dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE);
	dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE;
	dumpbuf.vp_off = dumpvp_start;

	fm_ereport_dump();
	if (panicstr)
		errorq_dump();

	bzero(&ed, sizeof (ed));	/* indicate end of ereports */
	dumpvp_write(&ed, sizeof (ed));
	(void) dumpvp_flush();

	if (!panicstr) {
		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
		    (size_t)(dumpbuf.vp_off - dumpvp_start),
		    B_INVAL | B_FORCE, kcred, NULL);
	}
}

void
dump_messages(void)
{
	log_dump_t ld;
	mblk_t *mctl, *mdata;
	queue_t *q, *qlast;
	u_offset_t dumpvp_start;

	if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;
	dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET;
	dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE;
	dumpbuf.vp_off = dumpvp_start;

	qlast = NULL;
	do {
		for (q = log_consq; q->q_next != qlast; q = q->q_next)
			continue;
		for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) {
			dump_timeleft = dump_timeout;
			mdata = mctl->b_cont;
			ld.ld_magic = LOG_MAGIC;
			ld.ld_msgsize = MBLKL(mctl->b_cont);
			ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl));
			ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata));
			dumpvp_write(&ld, sizeof (ld));
			dumpvp_write(mctl->b_rptr, MBLKL(mctl));
			dumpvp_write(mdata->b_rptr, MBLKL(mdata));
		}
	} while ((qlast = q) != log_consq);

	ld.ld_magic = 0;		/* indicate end of messages */
	dumpvp_write(&ld, sizeof (ld));
	(void) dumpvp_flush();
	if (!panicstr) {
		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
		    (size_t)(dumpbuf.vp_off - dumpvp_start),
		    B_INVAL | B_FORCE, kcred, NULL);
	}
}

/*
 * The following functions are called on multiple CPUs during dump.
 * They must not use most kernel services, because all cross-calls are
 * disabled during panic. Therefore, blocking locks and cache flushes
 * will not work.
 */

/*
 * Copy pages, trapping ECC errors. Also, for robustness, trap data
 * access in case something goes wrong in the hat layer and the
 * mapping is broken.
 */
static int
dump_pagecopy(void *src, void *dst)
{
	long *wsrc = (long *)src;
	long *wdst = (long *)dst;
	const ulong_t ncopies = PAGESIZE / sizeof (long);
	volatile int w = 0;
	volatile int ueoff = -1;
	on_trap_data_t otd;

	if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) {
		if (ueoff == -1)
			ueoff = w * sizeof (long);
		/* report "bad ECC" or "bad address" */
#ifdef _LP64
		if (otd.ot_trap & OT_DATA_EC)
			wdst[w++] = 0x00badecc00badecc;
		else
			wdst[w++] = 0x00badadd00badadd;
#else
		if (otd.ot_trap & OT_DATA_EC)
			wdst[w++] = 0x00badecc;
		else
			wdst[w++] = 0x00badadd;
#endif
	}
	while (w < ncopies) {
		wdst[w] = wsrc[w];
		w++;
	}
	no_trap();
	return (ueoff);
}

static void
dumpsys_close_cq(cqueue_t *cq, int live)
{
	if (live) {
		mutex_enter(&cq->mutex);
		atomic_dec_uint(&cq->open);
		cv_signal(&cq->cv);
		mutex_exit(&cq->mutex);
	} else {
		atomic_dec_uint(&cq->open);
	}
}

static inline void
dumpsys_spinlock(lock_t *lp)
{
	uint_t backoff = 0;
	int loop_count = 0;

	while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
		if (++loop_count >= ncpus) {
			backoff = mutex_lock_backoff(0);
			loop_count = 0;
		} else {
			backoff = mutex_lock_backoff(backoff);
		}
		mutex_lock_delay(backoff);
	}
}

static inline void
dumpsys_spinunlock(lock_t *lp)
{
	lock_clear(lp);
}

static inline void
dumpsys_lock(cqueue_t *cq, int live)
{
	if (live)
		mutex_enter(&cq->mutex);
	else
		dumpsys_spinlock(&cq->spinlock);
}

static inline void
dumpsys_unlock(cqueue_t *cq, int live, int signal)
{
	if (live) {
		if (signal)
			cv_signal(&cq->cv);
		mutex_exit(&cq->mutex);
	} else {
		dumpsys_spinunlock(&cq->spinlock);
	}
}

static void
dumpsys_wait_cq(cqueue_t *cq, int live)
{
	if (live) {
		cv_wait(&cq->cv, &cq->mutex);
	} else {
		dumpsys_spinunlock(&cq->spinlock);
		while (cq->open)
			if (cq->first)
				break;
		dumpsys_spinlock(&cq->spinlock);
	}
}

static void
dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live)
{
	if (cp == NULL)
		return;

	dumpsys_lock(cq, live);

	if (cq->ts != 0) {
		cq->empty += gethrtime() - cq->ts;
		cq->ts = 0;
	}

	cp->state = newstate;
	cp->next = NULL;
	if (cq->last == NULL)
		cq->first = cp;
	else
		cq->last->next = cp;
	cq->last = cp;

	dumpsys_unlock(cq, live, 1);
}

static cbuf_t *
dumpsys_get_cq(cqueue_t *cq, int live)
{
	cbuf_t *cp;
	hrtime_t now = gethrtime();

	dumpsys_lock(cq, live);

	/* CONSTCOND */
	while (1) {
		cp = (cbuf_t *)cq->first;
		if (cp == NULL) {
			if (cq->open == 0)
				break;
			dumpsys_wait_cq(cq, live);
			continue;
		}
		cq->first = cp->next;
		if (cq->first == NULL) {
			cq->last = NULL;
			cq->ts = now;
		}
		break;
	}

	dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0);
	return (cp);
}

/*
 * Send an error message to the console. If the main task is running
 * just write the message via uprintf. If a helper is running the
 * message has to be put on a queue for the main task. Setting fmt to
 * NULL means flush the error message buffer. If fmt is not NULL, just
 * add the text to the existing buffer.
 */
static void
dumpsys_errmsg(helper_t *hp, const char *fmt, ...)
{
	dumpsync_t *ds = hp->ds;
	cbuf_t *cp = hp->cperr;
	va_list adx;

	if (hp->helper == MAINHELPER) {
		if (fmt != NULL) {
			if (ds->neednl) {
				uprintf("\n");
				ds->neednl = 0;
			}
			va_start(adx, fmt);
			vuprintf(fmt, adx);
			va_end(adx);
		}
	} else if (fmt == NULL) {
		if (cp != NULL) {
			CQ_PUT(mainq, cp, CBUF_ERRMSG);
			hp->cperr = NULL;
		}
	} else {
		if (hp->cperr == NULL) {
			cp = CQ_GET(freebufq);
			hp->cperr = cp;
			cp->used = 0;
		}
		va_start(adx, fmt);
		cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used,
		    fmt, adx);
		va_end(adx);
		if ((cp->used + LOG_MSGSIZE) > cp->size) {
			CQ_PUT(mainq, cp, CBUF_ERRMSG);
			hp->cperr = NULL;
		}
	}
}

/*
 * Write an output buffer to the dump file. If the main task is
 * running just write the data. If a helper is running the output is
 * placed on a queue for the main task.
 */
static void
dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used)
{
	dumpsync_t *ds = hp->ds;

	if (hp->helper == MAINHELPER) {
		HRSTART(ds->perpage, write);
		dumpvp_write(cp->buf, used);
		HRSTOP(ds->perpage, write);
		CQ_PUT(freebufq, cp, CBUF_FREEBUF);
	} else {
		cp->used = used;
		CQ_PUT(mainq, cp, CBUF_WRITE);
	}
}

/*
 * Copy one page within the mapped range. The offset starts at 0 and
 * is relative to the first pfn. cp->buf + cp->off is the address of
 * the first pfn. If dump_pagecopy returns a UE offset, create an
 * error message. Returns the offset to the next pfn in the range
 * selected by the bitmap.
 */
static int
dumpsys_copy_page(helper_t *hp, int offset)
{
	cbuf_t *cp = hp->cpin;
	int ueoff;

	ASSERT(cp->off + offset + PAGESIZE <= cp->size);
	ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum));

	ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page);

	/* ueoff is the offset in the page to a UE error */
	if (ueoff != -1) {
		uint64_t pa = ptob(cp->pfn) + offset + ueoff;

		dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n",
		    CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa);
	}

	/*
	 * Advance bitnum and offset to the next input page for the
	 * next call to this function.
	 */
	offset += PAGESIZE;
	cp->bitnum++;
	while (cp->off + offset < cp->size) {
		if (BT_TEST(dumpcfg.bitmap, cp->bitnum))
			break;
		offset += PAGESIZE;
		cp->bitnum++;
	}

	return (offset);
}

/*
 * Read the helper queue, and copy one mapped page. Return 0 when
 * done. Return 1 when a page has been copied into hp->page.
 */
static int
dumpsys_sread(helper_t *hp)
{
	dumpsync_t *ds = hp->ds;

	/* CONSTCOND */
	while (1) {

		/* Find the next input buffer. */
		if (hp->cpin == NULL) {
			HRSTART(hp->perpage, inwait);

			/* CONSTCOND */
			while (1) {
				hp->cpin = CQ_GET(helperq);
				dump_timeleft = dump_timeout;

				/*
				 * NULL return means the helper queue
				 * is closed and empty.
				 */
				if (hp->cpin == NULL)
					break;

				/* Have input, check for dump I/O error. */
				if (!dump_ioerr)
					break;

				/*
				 * If an I/O error occurs, stay in the
				 * loop in order to empty the helper
				 * queue. Return the buffers to the
				 * main task to be unmapped and freed.
				 */
				hp->cpin->used = 0;
				CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
			}
			HRSTOP(hp->perpage, inwait);

			/* Stop here when the helper queue is closed. */
			if (hp->cpin == NULL)
				break;

			/* Set the offset=0 to get the first pfn. */
			hp->in = 0;

			/* Set the total processed to 0 */
			hp->used = 0;
		}

		/* Process the next page. */
		if (hp->used < hp->cpin->used) {

			/*
			 * Get the next page from the input buffer and
			 * return a copy.
			 */
			ASSERT(hp->in != -1);
			HRSTART(hp->perpage, copy);
			hp->in = dumpsys_copy_page(hp, hp->in);
			hp->used += PAGESIZE;
			HRSTOP(hp->perpage, copy);
			break;

		} else {

			/*
			 * Done with the input. Flush the VM and
			 * return the buffer to the main task.
			 */
			if (panicstr && hp->helper != MAINHELPER)
				hat_flush();
			dumpsys_errmsg(hp, NULL);
			CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
			hp->cpin = NULL;
		}
	}

	return (hp->cpin != NULL);
}

/*
 * Compress with lzjb.
 * Write out the current stream block if it is full or if size == 0.
 * If csize == 0, write a stream header; else write <csize, data>.
 * size == 0 is a call to flush a buffer.
 * hp->cpout is the buffer we are flushing or filling.
 * hp->out is the next index to fill with data.
 * osize is either csize+data, or the size of a stream header.
 */
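
/*
 * Stream block layout sketch (illustrative): each CBUF_SIZE block
 * begins with a tag+size word covering the rest of the block,
 * followed by records that are either a raw stream header or a
 * <csize, data> pair:
 *
 *	+----------+--------+-------+------+-------+------+-- ... --+
 *	| tag+size | header | csize | data | csize | data |         |
 *	+----------+--------+-------+------+-------+------+-- ... --+
 *
 * The per-helper tag lets interleaved blocks be demultiplexed back
 * into ordered streams when the dump is decompressed.
 */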
static void
dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size)
{
	dumpsync_t *ds = hp->ds;
	const int CSIZE = sizeof (dumpcsize_t);
	dumpcsize_t cs;
	size_t osize = csize > 0 ? CSIZE + size : size;

	/* If flush, and there is no buffer, just return */
	if (size == 0 && hp->cpout == NULL)
		return;

	/* If flush, or cpout is full, write it out */
	if (size == 0 ||
	    (hp->cpout != NULL && hp->out + osize > hp->cpout->size)) {

		/* Set tag+size word at the front of the stream block. */
		cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag);
		(void) memcpy(hp->cpout->buf, &cs, CSIZE);

		/* Write block to dump file. */
		dumpsys_swrite(hp, hp->cpout, hp->out);

		/* Clear pointer to indicate we need a new buffer */
		hp->cpout = NULL;

		/* flushing, we are done */
		if (size == 0)
			return;
	}

	/* Get an output buffer if we don't have one. */
	if (hp->cpout == NULL) {
		HRSTART(hp->perpage, outwait);
		hp->cpout = CQ_GET(freebufq);
		HRSTOP(hp->perpage, outwait);
		hp->out = CSIZE;
	}

	/* Store csize word. This is the size of compressed data. */
	if (csize > 0) {
		cs = DUMP_SET_TAG(csize, 0);
		(void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE);
		hp->out += CSIZE;
	}

	/* Store the data. */
	(void) memcpy(hp->cpout->buf + hp->out, buf, size);
	hp->out += size;
}

static void
dumpsys_lzjbcompress(helper_t *hp)
{
	dumpsync_t *ds = hp->ds;
	size_t csize;
	dumpstreamhdr_t sh;

	(void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
	sh.stream_pagenum = (pfn_t)-1;
	sh.stream_npages = 0;
	hp->cpin = NULL;
	hp->cpout = NULL;
	hp->cperr = NULL;
	hp->in = 0;
	hp->out = 0;

	/* Bump reference to mainq while we are running */
	CQ_OPEN(mainq);

	/* Get one page at a time */
	while (dumpsys_sread(hp)) {

		/* Create a stream header for each new input map */
		if (sh.stream_pagenum != hp->cpin->pagenum) {
			sh.stream_pagenum = hp->cpin->pagenum;
			sh.stream_npages = btop(hp->cpin->used);
			dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh));
		}

		/* Compress one page */
		HRSTART(hp->perpage, compress);
		csize = compress(hp->page, hp->lzbuf, PAGESIZE);
		HRSTOP(hp->perpage, compress);

		/* Add csize+data to output block */
		ASSERT(csize > 0 && csize <= PAGESIZE);
		dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize);
	}

	/* Done with input, flush any partial buffer */
	if (sh.stream_pagenum != (pfn_t)-1) {
		dumpsys_lzjbrun(hp, 0, NULL, 0);
		dumpsys_errmsg(hp, NULL);
	}

	ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);

	/* Decrement main queue count, we are done */
	CQ_CLOSE(mainq);
}

/*
 * Dump helper called from panic_idle() to compress pages. CPUs in
 * this path must not call most kernel services.
 *
 * During a panic, all but one of the CPUs is idle. These CPUs are
 * used as helpers, working in parallel to copy and compress memory
 * pages. They cannot, however, call any kernel services: mutexes
 * become no-ops during panic, and cross-call interrupts are
 * inhibited. The helper CPUs therefore communicate with the panic
 * CPU using memory variables. All memory mapping and I/O is
 * performed by the panic CPU.
 *
 * At dump configuration time, helper_lock is set and helpers_wanted
 * is 0. dumpsys() decides whether to set helpers_wanted before
 * clearing helper_lock.
 *
 * At panic time, idle CPUs spin-wait on helper_lock, then alternately
 * take the lock and become a helper, or return.
 */
void
dumpsys_helper()
{
	dumpsys_spinlock(&dumpcfg.helper_lock);
	if (dumpcfg.helpers_wanted) {
		helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];

		for (hp = dumpcfg.helper; hp != hpend; hp++) {
			if (hp->helper == FREEHELPER) {
				hp->helper = CPU->cpu_id;
				BT_SET(dumpcfg.helpermap, CPU->cpu_seqid);
				dumpsys_spinunlock(&dumpcfg.helper_lock);
				dumpsys_lzjbcompress(hp);
				hp->helper = DONEHELPER;
				return;
			}
		}

		/* No more helpers are needed. */
		dumpcfg.helpers_wanted = 0;
	}
	dumpsys_spinunlock(&dumpcfg.helper_lock);
}
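
/*
 * For context, a minimal sketch of how an idled CPU reaches this
 * code. This is hypothetical, not the actual panic_idle() loop,
 * which lives in machine-dependent code:
 *
 *	static void
 *	example_panic_idle(void)
 *	{
 *		dumpsys_helper();
 *		for (;;)
 *			ht_pause();
 *	}
 *
 * dumpsys_helper() returns either after this CPU has finished
 * helping with the dump, or immediately if no helper slot is
 * available.
 */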

/*
 * No-wait helper callable in spin loops.
 *
 * Do not wait for helper_lock. Just check helpers_wanted. The caller
 * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s"
 * case.
 */
void
dumpsys_helper_nw()
{
	if (dumpcfg.helpers_wanted)
		dumpsys_helper();
}

/*
 * Dump helper for live dumps.
 * These run as a system task.
 */
static void
dumpsys_live_helper(void *arg)
{
	helper_t *hp = arg;

	BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid);
	dumpsys_lzjbcompress(hp);
}

/*
 * Compress one page with lzjb (single threaded case)
 */
static void
dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp)
{
	dumpsync_t *ds = hp->ds;
	uint32_t csize;

	hp->helper = MAINHELPER;
	hp->in = 0;
	hp->used = 0;
	hp->cpin = cp;
	while (hp->used < cp->used) {
		HRSTART(hp->perpage, copy);
		hp->in = dumpsys_copy_page(hp, hp->in);
		hp->used += PAGESIZE;
		HRSTOP(hp->perpage, copy);

		HRSTART(hp->perpage, compress);
		csize = compress(hp->page, hp->lzbuf, PAGESIZE);
		HRSTOP(hp->perpage, compress);

		HRSTART(hp->perpage, write);
		dumpvp_write(&csize, sizeof (csize));
		dumpvp_write(hp->lzbuf, csize);
		HRSTOP(hp->perpage, write);
	}
	CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
	hp->cpin = NULL;
}
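
/*
 * In the serial format, each page is thus stored as a uint32_t csize
 * followed by csize bytes of lzjb output (a csize equal to PAGESIZE
 * indicates a page that did not compress). A consumer reads the
 * stream back roughly as follows; this is an illustrative sketch,
 * not the savecore source:
 *
 *	uint32_t csize;
 *	char cbuf[PAGESIZE], page[PAGESIZE];
 *
 *	read csize, then csize bytes into cbuf;
 *	if (csize < PAGESIZE)
 *		(void) decompress(cbuf, page, csize, PAGESIZE);
 *	else
 *		bcopy(cbuf, page, PAGESIZE);
 */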

/*
 * Main task to dump pages. This is called on the dump CPU.
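 *
 * Buffers circulate between the queues: CBUF_FREEMAP map buffers are
 * filled with mapped page ranges and handed to the helpers via
 * helperq; the helpers return compressed output (CBUF_WRITE),
 * consumed maps (CBUF_USEDMAP) and error text (CBUF_ERRMSG) via
 * mainq; write buffers are drained to the dump device through
 * writerq and then recycled on freebufq.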
 */
static void
dumpsys_main_task(void *arg)
{
	dumpsync_t *ds = arg;
	pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
	dumpmlw_t mlw;
	cbuf_t *cp;
	pgcnt_t baseoff, pfnoff;
	pfn_t base, pfn;
	int i;

	/*
	 * Fall back to serial mode if no helpers register: parallel
	 * dump requires at least one bit set in dumpcfg.helpermap
	 * (note that dump_ncpu_low can be set to 0 at any time to
	 * disable parallel dump entirely). The helpers may simply not
	 * have registered yet, so wait up to DUMP_HELPER_MAX_WAIT for
	 * one to appear before falling back.
	 */
	if (dump_ncpu_low != 0 && dumpcfg.clevel != DUMP_CLEVEL_SERIAL) {
		boolean_t dumpserial = B_TRUE;
		hrtime_t hrtmax = MSEC2NSEC(DUMP_HELPER_MAX_WAIT);
		hrtime_t hrtstart = gethrtime();

		for (;;) {
			for (i = 0; i < BT_BITOUL(NCPU); ++i) {
				if (dumpcfg.helpermap[i] != 0) {
					dumpserial = B_FALSE;
					break;
				}
			}

			if ((!dumpserial) ||
			    ((gethrtime() - hrtstart) >= hrtmax)) {
				break;
			}

			ht_pause();
		}

		if (dumpserial) {
			dumpcfg.clevel = DUMP_CLEVEL_SERIAL;
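			/* helper 0 may borrow helper 1's page buffer */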
			if (dumpcfg.helper[0].lzbuf == NULL) {
				dumpcfg.helper[0].lzbuf =
				    dumpcfg.helper[1].page;
			}
		}
	}

	dump_init_memlist_walker(&mlw);

	for (;;) {
		int sec = (gethrtime() - ds->start) / NANOSEC;

		/*
		 * Render a simple progress display on the system console to
		 * make clear to the operator that the system has not hung.
		 * Emit an update when dump progress has advanced by one
		 * percent, or when no update has been drawn in the last
		 * second.
		 */
		if (ds->percent > ds->percent_done || sec > ds->sec_done) {
			ds->sec_done = sec;
			ds->percent_done = ds->percent;
			uprintf("^\rdumping: %2d:%02d %3d%% done",
			    sec / 60, sec % 60, ds->percent);
			ds->neednl = 1;
		}

		while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) {

			/* the writerq never blocks */
			cp = CQ_GET(writerq);
			if (cp == NULL)
				break;

			dump_timeleft = dump_timeout;

			HRSTART(ds->perpage, write);
			dumpvp_write(cp->buf, cp->used);
			HRSTOP(ds->perpage, write);

			CQ_PUT(freebufq, cp, CBUF_FREEBUF);
		}

		/*
		 * Wait here for some buffers to process. Returns NULL
		 * when all helpers have terminated and all buffers
		 * have been processed.
		 */
		cp = CQ_GET(mainq);

		if (cp == NULL) {

			/* Drain the write queue. */
			if (!CQ_IS_EMPTY(writerq))
				continue;

			/* Main task exits here. */
			break;
		}

		dump_timeleft = dump_timeout;

		switch (cp->state) {

		case CBUF_FREEMAP:

			/*
			 * Note that we drop CBUF_FREEMAP buffers on
			 * the floor (they will not be on any cqueue)
			 * when we no longer need them.
			 */
			if (bitnum >= dumpcfg.bitmapsize)
				break;

			if (dump_ioerr) {
				bitnum = dumpcfg.bitmapsize;
				CQ_CLOSE(helperq);
				break;
			}

			HRSTART(ds->perpage, bitmap);
			for (; bitnum < dumpcfg.bitmapsize; bitnum++)
				if (BT_TEST(dumpcfg.bitmap, bitnum))
					break;
			HRSTOP(ds->perpage, bitmap);
			dump_timeleft = dump_timeout;

			if (bitnum >= dumpcfg.bitmapsize) {
				CQ_CLOSE(helperq);
				break;
			}

			/*
			 * Try to map CBUF_MAPSIZE ranges. Can't
			 * assume that memory segment size is a
			 * multiple of CBUF_MAPSIZE. Can't assume that
			 * the segment starts on a CBUF_MAPSIZE
			 * boundary.
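			 *
			 * For example, with 4K pages and a 4M
			 * CBUF_MAPSIZE (CBUF_MAPNP == 0x400), a
			 * segment starting at pfn 0x10100 and a
			 * current pfn of 0x10180 work out as follows
			 * (illustrative numbers only): P2ALIGN gives
			 * a base of 0x10000, which precedes the
			 * segment, so base becomes 0x10100 with
			 * baseoff 0x100; pfnoff is then 0x80, and, if
			 * the segment is large enough, the mapping
			 * covers 0x300 pages up to the next 4M
			 * boundary at pfn 0x10400, so cp->size is 3M.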
			 */
			pfn = dump_bitnum_to_pfn(bitnum, &mlw);
			ASSERT(pfn != PFN_INVALID);
			ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize);

			base = P2ALIGN(pfn, CBUF_MAPNP);
			if (base < mlw.mpaddr) {
				base = mlw.mpaddr;
				baseoff = P2PHASE(base, CBUF_MAPNP);
			} else {
				baseoff = 0;
			}

			pfnoff = pfn - base;
			if (pfnoff + mlw.mpleft < CBUF_MAPNP) {
				hibitnum = bitnum + mlw.mpleft;
				cp->size = ptob(pfnoff + mlw.mpleft);
			} else {
				hibitnum = bitnum - pfnoff + CBUF_MAPNP -
				    baseoff;
				cp->size = CBUF_MAPSIZE - ptob(baseoff);
			}

			cp->pfn = pfn;
			cp->bitnum = bitnum++;
			cp->pagenum = pagenum++;
			cp->off = ptob(pfnoff);

			for (; bitnum < hibitnum; bitnum++)
				if (BT_TEST(dumpcfg.bitmap, bitnum))
					pagenum++;

			dump_timeleft = dump_timeout;
			cp->used = ptob(pagenum - cp->pagenum);

			HRSTART(ds->perpage, map);
			hat_devload(kas.a_hat, cp->buf, cp->size, base,
			    PROT_READ, HAT_LOAD_NOCONSIST);
			HRSTOP(ds->perpage, map);

			ds->pages_mapped += btop(cp->size);
			ds->pages_used += pagenum - cp->pagenum;

			CQ_OPEN(mainq);

			/*
			 * If there are no helpers the main task does
			 * non-streams lzjb compress.
			 */
			if (dumpcfg.clevel == DUMP_CLEVEL_SERIAL) {
				dumpsys_lzjb_page(dumpcfg.helper, cp);
			} else {
				/* pass mapped pages to a helper */
				CQ_PUT(helperq, cp, CBUF_INREADY);
			}

			/* if that was the last page, close the input queue */
			if (bitnum >= dumpcfg.bitmapsize)
				CQ_CLOSE(helperq);

			break;

		case CBUF_USEDMAP:

			ds->npages += btop(cp->used);

			HRSTART(ds->perpage, unmap);
			hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
			HRSTOP(ds->perpage, unmap);

			if (bitnum < dumpcfg.bitmapsize)
				CQ_PUT(mainq, cp, CBUF_FREEMAP);
			CQ_CLOSE(mainq);

			ASSERT(ds->npages <= dumphdr->dump_npages);
			ds->percent = ds->npages * 100LL / dumphdr->dump_npages;
			break;

		case CBUF_WRITE:

			CQ_PUT(writerq, cp, CBUF_WRITE);
			break;

		case CBUF_ERRMSG:

			if (cp->used > 0) {
				cp->buf[cp->size - 2] = '\n';
				cp->buf[cp->size - 1] = '\0';
				if (ds->neednl) {
					uprintf("\n%s", cp->buf);
					ds->neednl = 0;
				} else {
					uprintf("%s", cp->buf);
				}
				/* wait for console output */
				drv_usecwait(200000);
				dump_timeleft = dump_timeout;
			}
			CQ_PUT(freebufq, cp, CBUF_FREEBUF);
			break;

		default:
			uprintf("dump: unexpected buffer state %d, "
			    "buffer will be lost\n", cp->state);
			break;

		} /* end switch */
	}
}

#ifdef COLLECT_METRICS
size_t
dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size)
{
	dumpcfg_t *cfg = &dumpcfg;
	int myid = CPU->cpu_seqid;
	int i, compress_ratio;
	int sec, iorate;
	helper_t *hp, *hpend = &cfg->helper[cfg->nhelper];
	char *e = buf + size;
	char *p = buf;

	sec = ds->elapsed / (1000 * 1000 * 1000ULL);
	if (sec < 1)
		sec = 1;

	if (ds->iotime < 1)
		ds->iotime = 1;
	iorate = (ds->nwrite * 100000ULL) / ds->iotime;

	compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1);

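/*
 * Append formatted text at p without writing past e. Note that
 * snprintf() returns the length the output would have had, so after a
 * truncated write p can point past e; the p < e guard then turns any
 * further P() invocations into no-ops.
 */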
#define	P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0)

	P("Master cpu_seqid,%d\n", CPU->cpu_seqid);
	P("Master cpu_id,%d\n", CPU->cpu_id);
	P("dump_flags,0x%x\n", dumphdr->dump_flags);
	P("dump_ioerr,%d\n", dump_ioerr);

	P("Helpers:\n");
	for (i = 0; i < ncpus; i++) {
		if ((i & 15) == 0)
			P(",,%03d,", i);
		if (i == myid)
			P("   M");
		else if (BT_TEST(cfg->helpermap, i))
			P("%4d", cpu_seq[i]->cpu_id);
		else
			P("   *");
		if ((i & 15) == 15)
			P("\n");
	}

	P("ncbuf_used,%d\n", cfg->ncbuf_used);
	P("ncmap,%d\n", cfg->ncmap);

	P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
	P("Found small pages,%ld\n", cfg->foundsm);

	P("Compression level,%d\n", cfg->clevel);
	P("Compression type,%s lzjb\n",
	    cfg->clevel == DUMP_CLEVEL_SERIAL ? "serial" : "parallel");
	P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
	    100);
	P("nhelper_used,%d\n", cfg->nhelper_used);

	P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
	P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
	P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
	P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
	P("dumpbuf.size,%ld\n", dumpbuf.size);

	P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
	P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
	P("Dump time,%d\n", sec);

	if (ds->pages_mapped > 0)
		P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
		    / ds->pages_mapped));

	P("\nPer-page metrics:\n");
	if (ds->npages > 0) {
		for (hp = cfg->helper; hp != hpend; hp++) {
#define	PERPAGE(x) ds->perpage.x += hp->perpage.x;
			PERPAGES;
#undef PERPAGE
		}
#define	PERPAGE(x) \
		P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages));
		PERPAGES;
#undef PERPAGE
		P("freebufq.empty,%d\n", (int)(ds->freebufq.empty /
		    ds->npages));
		P("helperq.empty,%d\n", (int)(ds->helperq.empty /
		    ds->npages));
		P("writerq.empty,%d\n", (int)(ds->writerq.empty /
		    ds->npages));
		P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages));

		P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait /
		    ds->npages));
	}
#undef P
	if (p < e)
		bzero(p, e - p);
	return (p - buf);
}
#endif /* COLLECT_METRICS */

/*
 * Dump the system.
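 *
 * The dump is laid out on the device as follows: the first page at
 * dump_start is skipped (it may be a disk label), then come the
 * kernel symbol table, the address translation map, the pfn table
 * and the compressed page data; the terminal dump header, the data
 * header and metrics, and the summary, ereport and message save
 * areas occupy the reserved space at the end of the device. Both
 * copies of the dump header are written only after all of the data.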
 */
void
dumpsys(void)
{
	dumpsync_t *ds = &dumpsync;
	taskq_t *livetaskq = NULL;
	pfn_t pfn;
	pgcnt_t bitnum;
	proc_t *p;
	helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
	cbuf_t *cp;
	pid_t npids, pidx;
	char *content;
	char *buf;
	size_t size;
	int save_dump_clevel;
	dumpmlw_t mlw;
	dumpcsize_t datatag;
	dumpdatahdr_t datahdr;

	if (dumpvp == NULL || dumphdr == NULL) {
		uprintf("skipping system dump - no dump device configured\n");
		if (panicstr) {
			dumpcfg.helpers_wanted = 0;
			dumpsys_spinunlock(&dumpcfg.helper_lock);
		}
		return;
	}
	dumpbuf.cur = dumpbuf.start;

	/* clear the sync variables */
	ASSERT(dumpcfg.nhelper > 0);
	bzero(ds, sizeof (*ds));
	ds->dumpcpu = CPU->cpu_id;

	/*
	 * Calculate the starting block for dump. If we're dumping on a
	 * swap device, start 1/5 of the way in; otherwise, start at the
	 * beginning. And never use the first page -- it may be a disk label.
	 */
	if (dumpvp->v_flag & VISSWAP)
		dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET);
	else
		dumphdr->dump_start = DUMP_OFFSET;

	dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED;
	dumphdr->dump_crashtime = gethrestime_sec();
	dumphdr->dump_npages = 0;
	dumphdr->dump_nvtop = 0;
	bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize));
	dump_timeleft = dump_timeout;

	if (panicstr) {
		dumphdr->dump_flags &= ~DF_LIVE;
		(void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL);
		(void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL);
		(void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE,
		    panicstr, panicargs);
		(void) strncpy(dumphdr->dump_uuid, dump_get_uuid(),
		    sizeof (dumphdr->dump_uuid));
	}

	if (dump_conflags & DUMP_ALL)
		content = "all";
	else if (dump_conflags & DUMP_CURPROC)
		content = "kernel + curproc";
	else
		content = "kernel";
	uprintf("dumping to %s, offset %lld, content: %s\n", dumppath,
	    dumphdr->dump_start, content);

	/* Make sure nodename is current */
	bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN);

	/*
	 * If this is a live dump, try to open a VCHR vnode for better
	 * performance. We must take care to flush the buffer cache
	 * first.
	 */
	if (!panicstr) {
		vnode_t *cdev_vp, *cmn_cdev_vp;

		ASSERT(dumpbuf.cdev_vp == NULL);
		cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR);
		if (cdev_vp != NULL) {
			cmn_cdev_vp = common_specvp(cdev_vp);
			if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL)
			    == 0) {
				if (vn_has_cached_data(dumpvp))
					(void) pvn_vplist_dirty(dumpvp, 0, NULL,
					    B_INVAL | B_TRUNC, kcred);
				dumpbuf.cdev_vp = cmn_cdev_vp;
			} else {
				VN_RELE(cdev_vp);
			}
		}
	}

	/*
	 * Store a hires timestamp so we can look it up during debugging.
	 */
	lbolt_debug_entry();

	/*
	 * Leave room for the summary, message and ereport save areas
	 * and terminal dump header.
	 */
	dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET -
	    DUMP_ERPTSIZE - DUMP_SUMMARYSIZE;

	/*
	 * Write out the symbol table. It's no longer compressed,
	 * so its 'size' and 'csize' are equal.
	 */
	dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE;
	dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize =
	    ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX);

	/*
	 * Write out the translation map.
	 */
	dumphdr->dump_map = dumpvp_flush();
	dump_as(&kas);
	dumphdr->dump_nvtop += dump_plat_addr();

	/*
	 * call into hat, which may have unmapped pages that also need to
	 * be in the dump
	 */
	hat_dump();

	if (dump_conflags & DUMP_ALL) {
		mutex_enter(&pidlock);

		for (npids = 0, p = practive; p != NULL; p = p->p_next)
			dumpcfg.pids[npids++] = p->p_pid;

		mutex_exit(&pidlock);

		for (pidx = 0; pidx < npids; pidx++)
			(void) dump_process(dumpcfg.pids[pidx]);

		dump_init_memlist_walker(&mlw);
		for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
			dump_timeleft = dump_timeout;
			pfn = dump_bitnum_to_pfn(bitnum, &mlw);
			/*
			 * Some hypervisors do not have all pages available to
			 * be accessed by the guest OS. Check for page
			 * accessibility.
			 */
			if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) !=
			    PLAT_HOLD_OK)
				continue;
			BT_SET(dumpcfg.bitmap, bitnum);
		}
		dumphdr->dump_npages = dumpcfg.bitmapsize;
		dumphdr->dump_flags |= DF_ALL;

	} else if (dump_conflags & DUMP_CURPROC) {
		/*
		 * Determine which pid is to be dumped. If we're panicking, we
		 * dump the process associated with panic_thread (if any). If
		 * this is a live dump, we dump the process associated with
		 * curthread.
		 */
		npids = 0;
		if (panicstr) {
			if (panic_thread != NULL &&
			    panic_thread->t_procp != NULL &&
			    panic_thread->t_procp != &p0) {
				dumpcfg.pids[npids++] =
				    panic_thread->t_procp->p_pid;
			}
		} else {
			dumpcfg.pids[npids++] = curthread->t_procp->p_pid;
		}

		if (npids && dump_process(dumpcfg.pids[0]) == 0)
			dumphdr->dump_flags |= DF_CURPROC;
		else
			dumphdr->dump_flags |= DF_KERNEL;

	} else {
		dumphdr->dump_flags |= DF_KERNEL;
	}

	dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1;

	/*
	 * Write out the pfn table.
	 */
	dumphdr->dump_pfn = dumpvp_flush();
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
		dump_timeleft = dump_timeout;
		if (!BT_TEST(dumpcfg.bitmap, bitnum))
			continue;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);
		dumpvp_write(&pfn, sizeof (pfn_t));
	}
	dump_plat_pfn();

	/*
	 * Write out all the pages.
	 * Map pages, copy them handling UEs, compress, and write them out.
	 * Cooperate with any helpers running on CPUs in panic_idle().
	 */
	dumphdr->dump_data = dumpvp_flush();

	bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU));
	ds->live = dumpcfg.clevel > DUMP_CLEVEL_SERIAL &&
	    (dumphdr->dump_flags & DF_LIVE) != 0;

	save_dump_clevel = dumpcfg.clevel;
	if (panicstr)
		dumpsys_get_maxmem();

	dumpcfg.nhelper_used = 0;
	for (hp = dumpcfg.helper; hp != hpend; hp++) {
		if (hp->page == NULL) {
			hp->helper = DONEHELPER;
			continue;
		}
		++dumpcfg.nhelper_used;
		hp->helper = FREEHELPER;
		hp->taskqid = NULL;
		hp->ds = ds;
		bzero(&hp->perpage, sizeof (hp->perpage));
	}

	CQ_OPEN(freebufq);
	CQ_OPEN(helperq);

	dumpcfg.ncbuf_used = 0;
	for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) {
		if (cp->buf != NULL) {
			CQ_PUT(freebufq, cp, CBUF_FREEBUF);
			++dumpcfg.ncbuf_used;
		}
	}

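	/* Prime the main queue with all of the map buffers. */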
	for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++)
		CQ_PUT(mainq, cp, CBUF_FREEMAP);

	ds->start = gethrtime();
	ds->iowaitts = ds->start;

	/* start helpers */
	if (ds->live) {
		int n = dumpcfg.nhelper_used;
		int pri = MINCLSYSPRI - 25;

		livetaskq = taskq_create("LiveDump", n, pri, n, n,
		    TASKQ_PREPOPULATE);
		for (hp = dumpcfg.helper; hp != hpend; hp++) {
			if (hp->page == NULL)
				continue;
			hp->helper = hp - dumpcfg.helper;
			hp->taskqid = taskq_dispatch(livetaskq,
			    dumpsys_live_helper, (void *)hp, TQ_NOSLEEP);
		}

	} else {
		if (panicstr)
			kmem_dump_begin();
		dumpcfg.helpers_wanted = dumpcfg.clevel > DUMP_CLEVEL_SERIAL;
		dumpsys_spinunlock(&dumpcfg.helper_lock);
	}

	/* run main task */
	dumpsys_main_task(ds);

	ds->elapsed = gethrtime() - ds->start;
	if (ds->elapsed < 1)
		ds->elapsed = 1;

	if (livetaskq != NULL)
		taskq_destroy(livetaskq);

	if (ds->neednl) {
		uprintf("\n");
		ds->neednl = 0;
	}

	/* record actual pages dumped */
	dumphdr->dump_npages = ds->npages;

	/* platform-specific data */
	dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf);

	/* note any errors by clearing DF_COMPLETE */
	if (dump_ioerr || ds->npages < dumphdr->dump_npages)
		dumphdr->dump_flags &= ~DF_COMPLETE;

	/* write a zero tag to mark the end of the stream blocks */
	datatag = 0;
	dumpvp_write(&datatag, sizeof (datatag));

	bzero(&datahdr, sizeof (datahdr));

	/* buffer for metrics */
	buf = dumpcfg.cbuf[0].buf;
	size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) -
	    sizeof (dumpdatahdr_t));

	/* finish the kmem intercepts, collect kmem verbose info */
	if (panicstr) {
		datahdr.dump_metrics = kmem_dump_finish(buf, size);
		buf += datahdr.dump_metrics;
		size -= datahdr.dump_metrics;
	}

	/* record in the header whether this is a fault-management panic */
	if (panicstr)
		dumphdr->dump_fm_panic = is_fm_panic();

	/* compression info in data header */
	datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC;
	datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION;
	datahdr.dump_maxcsize = CBUF_SIZE;
	datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE;
	datahdr.dump_nstreams = dumpcfg.nhelper_used;
	datahdr.dump_clevel = dumpcfg.clevel;
#ifdef COLLECT_METRICS
	if (dump_metrics_on)
		datahdr.dump_metrics += dumpsys_metrics(ds, buf, size);
#endif
	datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data;

	/*
	 * Write out the initial and terminal dump headers.
	 */
	dumpbuf.vp_off = dumphdr->dump_start;
	dumpvp_write(dumphdr, sizeof (dumphdr_t));
	(void) dumpvp_flush();

	dumpbuf.vp_limit = dumpvp_size;
	dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET;
	dumpvp_write(dumphdr, sizeof (dumphdr_t));
	dumpvp_write(&datahdr, sizeof (dumpdatahdr_t));
	dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics);

	(void) dumpvp_flush();

	uprintf("\r%3d%% done: %llu pages dumped, ",
	    ds->percent_done, (u_longlong_t)ds->npages);

	if (dump_ioerr == 0) {
		uprintf("dump succeeded\n");
	} else {
		uprintf("dump failed: error %d\n", dump_ioerr);
#ifdef DEBUG
		if (panicstr)
			debug_enter("dump failed");
#endif
	}

	/*
	 * Write out all undelivered messages. This has to be the *last*
	 * thing we do because the dump process itself emits messages.
	 */
	if (panicstr) {
		dump_summary();
		dump_ereports();
		dump_messages();
	}

	delay(2 * hz);	/* let people see the 'done' message */
	dump_timeleft = 0;
	dump_ioerr = 0;

	/* restore settings after live dump completes */
	if (!panicstr) {
		dumpcfg.clevel = save_dump_clevel;

		/* release any VCHR open of the dump device */
		if (dumpbuf.cdev_vp != NULL) {
			(void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
			VN_RELE(dumpbuf.cdev_vp);
			dumpbuf.cdev_vp = NULL;
		}
	}
}

/*
 * This function is called whenever the memory size, as represented
 * by the phys_install list, changes.
 */
void
dump_resize()
{
	mutex_enter(&dump_lock);
	dumphdr_init();
	dumpbuf_resize();
	dump_update_clevel();
	mutex_exit(&dump_lock);
}

/*
 * This function allows for dynamic resizing of a dump area. It assumes
 * that the underlying device has updated its size(9P) property
 * accordingly.
 */
int
dumpvp_resize()
{
	int error;
	vattr_t vattr;

	mutex_enter(&dump_lock);
	vattr.va_mask = AT_SIZE;
	if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) {
		mutex_exit(&dump_lock);
		return (error);
	}

	if (vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) {
		mutex_exit(&dump_lock);
		return (ENOSPC);
	}

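	/* Round the new size down to a DUMP_OFFSET boundary. */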
	dumpvp_size = vattr.va_size & -DUMP_OFFSET;
	mutex_exit(&dump_lock);
	return (0);
}

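/*
 * Check that uuidstr is in the canonical printable UUID form:
 * 36 characters, with hyphens at offsets 8, 13, 18 and 23 and
 * hexadecimal digits everywhere else.
 */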
static int
dump_validate_uuid(const char *uuidstr)
{
	const char *ptr;
	int i;

	if (uuidstr == NULL || strlen(uuidstr) !=
	    UUID_PRINTABLE_STRING_LENGTH - 1)
		return (EINVAL);

	/* uuid_parse is not common code so check manually */
	for (i = 0, ptr = uuidstr; i < UUID_PRINTABLE_STRING_LENGTH - 1;
	    i++, ptr++) {
		switch (i) {
		case 8:
		case 13:
		case 18:
		case 23:
			if (*ptr != '-')
				return (EINVAL);
			break;

		default:
			if (!isxdigit(*ptr))
				return (EINVAL);
			break;
		}
	}

	return (0);
}

int
dump_update_uuid(const char *uuidstr)
{
	if (dump_validate_uuid(uuidstr) != 0 || dumphdr == NULL)
		return (EINVAL);

	bzero(dumphdr->dump_uuid, sizeof (dumphdr->dump_uuid));
	(void) strncpy(dumphdr->dump_uuid, uuidstr,
	    sizeof (dumphdr->dump_uuid));

	return (0);
}

int
dump_set_uuid(const char *uuidstr)
{
	if (dump_validate_uuid(uuidstr) != 0)
		return (EINVAL);

	if (dump_osimage_uuid[0] != '\0')
		return (EALREADY);

	(void) strncpy(dump_osimage_uuid, uuidstr,
	    UUID_PRINTABLE_STRING_LENGTH);

	cmn_err(CE_CONT, "?This Solaris instance has UUID %s\n",
	    dump_osimage_uuid);

	return (0);
}

const char *
dump_get_uuid(void)
{
	return (dump_osimage_uuid[0] != '\0' ? dump_osimage_uuid : "");
}