Print this page
NEX-3166 need to add FMA events for SSD lifespan
Reviewed by: Jeffry Molanus <jeffry.molanus@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
OS-104 handle attach-failure ereport
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/cmd/fm/eversholt/files/common/disk.esc
+++ new/usr/src/cmd/fm/eversholt/files/common/disk.esc
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25
26 26 #pragma dictionary "DISK"
27 27
28 28 #define P disk
29 29
30 30 fru P;
31 31 asru P;
32 32
33 33 /*
34 34 * Overall comments for this file:
35 35 * <disk-as-detector> The disk-as-detector DE provides the mapping between
36 36 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
37 37 */
38 38
39 39 /*
40 40 * SERD engine for media error fault propagation:
41 41 *
42 42 * This strategy is designed to give a file system, like ZFS, the
43 43 * ability to attempt data recovery/relocation without faulting a disk.
44 44 * This implementation depends on a file system retry to the same lba
45 45 * to trigger a fault when recovery/relocation is not possible.
46 46 *
47 47 * We let the engine propagate an error only once per minute, and if we
48 48 * still get 2 or more errors within 24 hours for the same LBA,
49 49 * a fault is diagnosed.
50 50 */
51 51 engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;
52 52
53 53 /*
54 54 * disk-as-detector: fault events.
55 55 */
56 56 event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
57 57 event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
58 58 engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
59 59
60 60 /*
61 61 * The uderr fault will be defined at some future time.
62 62 * event fault.io.scsi.cmd.disk.dev.uderr@P;
63 63 */
64 64
65 65 /*
66 66 * disk-as-detector: upset events.
67 67 * NOTE: For now we define an upset to implement discard.
68 68 */
69 69 event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
70 70 event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
71 71 event upset.io.scsi.cmd.disk.dev.uderr@P;
72 72 event upset.io.scsi.cmd.disk.dev.serr@P;
73 73 event upset.io.scsi.cmd.disk.tran@P;
74 74 event upset.io.scsi.cmd.disk.recovered@P;
75 75
76 76 /*
77 77 * disk-as-detector: ereports from the kernel.
78 78 *
79 79 * We don't know the topology for all scsi disks, but the kernel will always
80 80 * generate ereport telemetry assuming that we do. We define these ereports
81 81 * with 'discard_if_config_unknown=1', which permits ereports against things
82 82 * with unknown topology to be silently discarded. The ereport data is logged
83 83 * in either case, and can be viewed via 'fmdump -eV'.
84 84 */
85 85 event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
86 86 event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
87 87 event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
88 88 event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
89 89 event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
90 90 event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;
91 91
92 92 /*
93 93 * For some ereports we let the 'driver-assessment', communicated as part of
94 94 * the ereport payload, determine fault vs. upset via propagation constraints.
95 95 */
96 96 #define DRIVER_ASSESSMENT_FATAL \
97 97 (payloadprop_contains("driver-assessment", "fatal"))
98 98 #define DRIVER_ASSESSMENT_NONFATAL (!DRIVER_ASSESSMENT_FATAL)
99 99
100 100 /*
101 101 * disk-as-detector: propagations from faults (based on
102 102 * DRIVER_ASSESSMENT_FATAL).
103 103 * We need to set additional fault payloads to indicate fault details.
104 104 * The payload properties we may need are listed below:
105 105 * fault.io.scsi.cmd.disk.dev.rqs.derr
106 106 * op_code, key, asc, ascq
107 107 * fault.io.scsi.cmd.disk.dev.rqs.merr
108 108 * op_code, key, asc, ascq, lba
109 109 */
110 110 prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
111 111 ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
112 112 setpayloadprop("key", payloadprop("key")) &&
113 113 setpayloadprop("asc", payloadprop("asc")) &&
114 114 setpayloadprop("ascq", payloadprop("ascq"))};
115 115
116 116 /*
117 117 * Use setserdsuffix with the specific LBA, so that
118 118 * the SERD engine only triggers if the fault recurs on the same LBA.
119 119 */
120 120 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
121 121 ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
122 122 setserdsuffix(payloadprop("lba")) &&
123 123 setpayloadprop("key", payloadprop("key")) &&
124 124 setpayloadprop("asc", payloadprop("asc")) &&
125 125 setpayloadprop("ascq", payloadprop("ascq")) &&
126 126 setpayloadprop("lba", payloadprop("lba"))};
127 127
128 128 /*
129 129 * NOTE: this propagation uses the "may" propagation of eversholt.
130 130 * The ereport need never exist. It's just a way of making
131 131 * the diagnosis wait for the within time on that ereport
132 132 * to complete. Once it has completed the diagnosis continues
133 133 * even though the dummy ereport didn't occur.
134 134 */
135 135 event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
136 136 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
137 137 ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
138 138
139 139 /*
140 140 * The uderr fault will be propagated at some future time.
141 141 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
142 142 * ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
143 143 */
144 144
145 145 /*
146 146 * disk-as-detector: propagations from upsets (based on
147 147 * DRIVER_ASSESSMENT_NONFATAL).
148 148 */
149 149 prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
150 150 ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };
151 151
152 152 prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
153 153 ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };
154 154
155 155 /*
156 156 * disk-as-detector: propagations from upsets (independent of
157 157 * driver-assessment)
158 158 */
159 159
160 160 prop upset.io.scsi.cmd.disk.dev.serr@P->
161 161 ereport.io.scsi.cmd.disk.dev.serr@P;
162 162
163 163 prop upset.io.scsi.cmd.disk.dev.uderr@P->
164 164 ereport.io.scsi.cmd.disk.dev.uderr@P;
165 165
|
↓ open down ↓ |
165 lines elided |
↑ open up ↑ |
166 166 prop upset.io.scsi.cmd.disk.recovered@P->
167 167 ereport.io.scsi.cmd.disk.recovered@P;
168 168
169 169 prop upset.io.scsi.cmd.disk.tran@P->
170 170 ereport.io.scsi.cmd.disk.tran@P;
171 171
172 172 /*
173 173 * --------------------------------------
174 174 * The remainder of this file contains rules associated with the operation of
175 175 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
176 - *
176 + *
177 177 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
178 178 * generated by the disk-transport fmd module, and the resulting faults.
179 179 */
180 180
181 181 /*
182 182 * Fault events.
183 183 */
184 184 event fault.io.disk.over-temperature@P,
185 185 FITrate=10, FRU=P, ASRU=P;
186 187 event fault.io.disk.predictive-failure@P,
187 188 FITrate=10, FRU=P, ASRU=P;
188 189 event fault.io.disk.self-test-failure@P,
189 190 FITrate=10, FRU=P, ASRU=P;
190 +event fault.io.disk.attach-failure@P;
190 191 event fault.io.disk.ssm-wearout@P;
191 192
192 193 /*
193 194 * ereports.
194 195 */
195 196 event ereport.io.scsi.disk.over-temperature@P;
196 197 event ereport.io.scsi.disk.predictive-failure@P;
197 198 event ereport.io.scsi.disk.self-test-failure@P;
199 +event ereport.io.scsi.disk.attach-failure@P;
198 200 event ereport.io.scsi.disk.ssm-wearout@P;
199 201
200 202 /*
201 203 * Propagations.
202 204 */
203 205 prop fault.io.disk.over-temperature@P ->
204 206 ereport.io.scsi.disk.over-temperature@P;
205 207
206 208 prop fault.io.disk.self-test-failure@P ->
207 209 ereport.io.scsi.disk.self-test-failure@P;
208 210
209 211 prop fault.io.disk.predictive-failure@P ->
210 212 ereport.io.scsi.disk.predictive-failure@P {
211 213 setpayloadprop("asc", payloadprop("additional-sense-code")) &&
212 214 setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };
213 215
216 +prop fault.io.disk.attach-failure@P ->
217 + ereport.io.scsi.disk.attach-failure@P;
218 +
214 219 prop fault.io.disk.ssm-wearout@P ->
215 220 ereport.io.scsi.disk.ssm-wearout@P {
216 221 setpayloadprop("current-wearout-percentage",
217 222 payloadprop("current-ssm-wearout"))
218 223 && setpayloadprop("threshold-wearout-percentage",
219 224 payloadprop("threshold-ssm-wearout")) };
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX