big-one Wdiff usr/src/cmd/fm/eversholt/files/common/disk.esc

Print this page

NEX-3166 need to add FMA events for SSD lifespan
Reviewed by: Jeffry Molanus <jeffry.molanus@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
OS-104 handle attach-failure ereport

Split	Close
Expand all
Collapse all

          --- old/usr/src/cmd/fm/eversholt/files/common/disk.esc
          +++ new/usr/src/cmd/fm/eversholt/files/common/disk.esc

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  24   24   */
  25   25  
  26   26  #pragma dictionary "DISK"
  27   27  
  28   28  #define P                       disk
  29   29  
  30   30  fru P;
  31   31  asru P;
  32   32  
  33   33  /*
  34   34   * Overall comments for this file:
  35   35   * <disk-as-detector> The disk-as-detector DE provides the mapping between
  36   36   * ereports generated by a kernel disk driver sd(7D) and resulting faults.
  37   37   */
  38   38  
  39   39  /*
  40   40   * SERD engine for media error fault propagation:
  41   41   *
  42   42   * This strategy is designed to give a file system, like ZFS, the
  43   43   * ability to attempt data recovery/relocation without faulting a disk.
  44   44   * This implementation depends on a file system retry to the same lba
  45   45   * to trigger a fault when recovery/relocation is not possible.
  46   46   *
  47   47   * We let the engine propagate an error at most once per minute; if we
  48   48   * then still get 2 or more errors within 24 hours for the same LBA,
  49   49   * the disk is faulted.
  50   50   */
  51   51  engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;
  52   52  
  53   53  /*
  54   54   * disk-as-detector: fault events.
  55   55   */
  56   56  event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
  57   57  event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
  58   58      engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
  59   59  
  60   60  /*
  61   61   * The uderr fault will be defined at some future time.
  62   62   * event fault.io.scsi.cmd.disk.dev.uderr@P;
  63   63   */
  64   64  
  65   65  /*
  66   66   * disk-as-detector: upset events.
  67   67   * NOTE: For now we define an upset to implement discard.
  68   68   */
  69   69  event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
  70   70  event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
  71   71  event upset.io.scsi.cmd.disk.dev.uderr@P;
  72   72  event upset.io.scsi.cmd.disk.dev.serr@P;
  73   73  event upset.io.scsi.cmd.disk.tran@P;
  74   74  event upset.io.scsi.cmd.disk.recovered@P;
  75   75  
  76   76  /*
  77   77   * disk-as-detector: ereports from the kernel.
  78   78   *
  79   79   * We don't know the topology for all scsi disks, but the kernel will always
  80   80   * generate ereport telemetry assuming that we do. We define these ereports
  81   81   * with 'discard_if_config_unknown=1', which permits ereports against things
  82   82   * with unknown topology to be silently discarded.  The ereport data is logged
  83   83   * in either case, and can be viewed via 'fmdump -eV'.
  84   84   */
  85   85  event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
  86   86  event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
  87   87  event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
  88   88  event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
  89   89  event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
  90   90  event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;
  91   91  
  92   92  /*
  93   93   * For some ereports we let the 'driver-assessment', communicated as part of
  94   94   * the ereport payload, determine fault vs. upset via propagation constraints.
  95   95   */
  96   96  #define DRIVER_ASSESSMENT_FATAL         \
  97   97              (payloadprop_contains("driver-assessment", "fatal"))
  98   98  #define DRIVER_ASSESSMENT_NONFATAL      (!DRIVER_ASSESSMENT_FATAL)
  99   99  
 100  100  /*
 101  101   * disk-as-detector: propagations from faults (based on
 102  102   * DRIVER_ASSESSMENT_FATAL).
 103  103   * We need to set additional fault payloads to indicate fault details.
 104  104   * The payload properties we may need are listed below:
 105  105   * fault.io.scsi.cmd.disk.dev.rqs.derr
 106  106   *     op_code, key, asc, ascq
 107  107   * fault.io.scsi.cmd.disk.dev.rqs.merr
 108  108   *     op_code, key, asc, ascq, lba
 109  109   */
 110  110  prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
 111  111      ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
 112  112      setpayloadprop("key", payloadprop("key")) &&
 113  113      setpayloadprop("asc", payloadprop("asc")) &&
 114  114      setpayloadprop("ascq", payloadprop("ascq"))};
 115  115  
 116  116  /*
 117  117   * Use setserdsuffix with the specific LBA, so that
 118  118   * the serd engine only triggers if the fault recurs on the same LBA.
 119  119   */
 120  120  prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
 121  121      ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
 122  122      setserdsuffix(payloadprop("lba")) &&
 123  123      setpayloadprop("key", payloadprop("key")) &&
 124  124      setpayloadprop("asc", payloadprop("asc")) &&
 125  125      setpayloadprop("ascq", payloadprop("ascq")) &&
 126  126      setpayloadprop("lba", payloadprop("lba"))};
 127  127  
 128  128  /*
 129  129   * NOTE: this propagation uses the "may" propagation of eversholt.
 130  130   * The ereport need never exist. It's just a way of making
 131  131   * the diagnosis wait for the within time on that ereport
 132  132   * to complete. Once it has completed the diagnosis continues
 133  133   * even though the dummy ereport didn't occur.
 134  134   */
 135  135  event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
 136  136  prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
 137  137          ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
 138  138  
 139  139  /*
 140  140   * The uderr fault will be propagated at some future time.
 141  141   * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 142  142   *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 143  143   */
 144  144  
 145  145  /*
 146  146   * disk-as-detector: propagations from upsets (based on
 147  147   * DRIVER_ASSESSMENT_NONFATAL).
 148  148   */
 149  149  prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
 150  150      ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };
 151  151  
 152  152  prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
 153  153      ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };
 154  154  
 155  155  /*
 156  156   * disk-as-detector: propagations from upsets (independent of
 157  157   * driver-assessment)
 158  158   */
 159  159  
 160  160  prop upset.io.scsi.cmd.disk.dev.serr@P->
 161  161      ereport.io.scsi.cmd.disk.dev.serr@P;
 162  162  
 163  163  prop upset.io.scsi.cmd.disk.dev.uderr@P->
 164  164      ereport.io.scsi.cmd.disk.dev.uderr@P;
 165  165

↓ open down ↓

165 lines elided

↑ open up ↑

 166  166  prop upset.io.scsi.cmd.disk.recovered@P->
 167  167      ereport.io.scsi.cmd.disk.recovered@P;
 168  168  
 169  169  prop upset.io.scsi.cmd.disk.tran@P->
 170  170      ereport.io.scsi.cmd.disk.tran@P;
 171  171  
 172  172  /*
 173  173   * --------------------------------------
 174  174   * The remainder of this file contains rules associated with the operation of
 175  175   * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 176      - * 
      176 + *
 177  177   * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 178  178   * generated by the disk-transport fmd module, and the resulting faults.
 179  179   */
 180  180  
 181  181  /*
 182  182   * Fault events.
 183  183   */
 184  184  event fault.io.disk.over-temperature@P,
 185  185      FITrate=10, FRU=P, ASRU=P;
 186  187  event fault.io.disk.predictive-failure@P,
 187  188      FITrate=10, FRU=P, ASRU=P;
 188  189  event fault.io.disk.self-test-failure@P,
 189  190      FITrate=10, FRU=P, ASRU=P;
      190 +event fault.io.disk.attach-failure@P;
 190  191  event fault.io.disk.ssm-wearout@P;
 191  192  
 192  193  /*
 193  194   * ereports.
 194  195   */
 195  196  event ereport.io.scsi.disk.over-temperature@P;
 196  197  event ereport.io.scsi.disk.predictive-failure@P;
 197  198  event ereport.io.scsi.disk.self-test-failure@P;
      199 +event ereport.io.scsi.disk.attach-failure@P;
 198  200  event ereport.io.scsi.disk.ssm-wearout@P;
 199  201  
 200  202  /*
 201  203   * Propagations.
 202  204   */
 203  205  prop fault.io.disk.over-temperature@P ->
 204  206      ereport.io.scsi.disk.over-temperature@P;
 205  207  
 206  208  prop fault.io.disk.self-test-failure@P ->
 207  209      ereport.io.scsi.disk.self-test-failure@P;
 208  210  
 209  211  prop fault.io.disk.predictive-failure@P ->
 210  212      ereport.io.scsi.disk.predictive-failure@P {
 211  213      setpayloadprop("asc", payloadprop("additional-sense-code")) &&
 212  214      setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };
 213  215  
      216 +prop fault.io.disk.attach-failure@P ->
      217 +    ereport.io.scsi.disk.attach-failure@P;
      218 +
 214  219  prop fault.io.disk.ssm-wearout@P ->
 215  220      ereport.io.scsi.disk.ssm-wearout@P {
 216  221      setpayloadprop("current-wearout-percentage",
 217  222      payloadprop("current-ssm-wearout"))
 218  223      && setpayloadprop("threshold-wearout-percentage",
 219  224      payloadprop("threshold-ssm-wearout")) };

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX