1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
  14  * Copyright 2016 Tegile Systems, Inc. All rights reserved.
  15  * Copyright (c) 2016 The MathWorks, Inc.  All rights reserved.
  16  * Copyright 2017 Joyent, Inc.
  17  */
  18 
  19 /*
  20  * blkdev driver for NVMe compliant storage devices
  21  *
  22  * This driver was written to conform to version 1.2.1 of the NVMe
  23  * specification.  It may work with newer versions, but that is completely
  24  * untested and disabled by default.
  25  *
  26  * The driver has only been tested on x86 systems and will not work on big-
  27  * endian systems without changes to the code accessing registers and data
  28  * structures used by the hardware.
  29  *
  30  *
  31  * Interrupt Usage:
  32  *
  33  * The driver will use a single interrupt while configuring the device as the
  34  * specification requires, but contrary to the specification it will try to use
  35  * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
  36  * will switch to multiple-message MSI(-X) if supported. The driver wants to
 
  66  * wraps around in that time a submission may find the next array slot to still
  67  * be used by a long-running command. In this case the array is sequentially
  68  * searched for the next free slot. The length of the command array is the same
  69  * as the configured queue length. Queue overrun is prevented by the semaphore,
  70  * so a command submission may block if the queue is full.
  71  *
  72  *
  73  * Polled I/O Support:
  74  *
  75  * For kernel core dump support the driver can do polled I/O. As interrupts are
  76  * turned off while dumping the driver will just submit a command in the regular
  77  * way, and then repeatedly attempt a command retrieval until it gets the
  78  * command back.
  79  *
  80  *
  81  * Namespace Support:
  82  *
  83  * NVMe devices can have multiple namespaces, each being an independent data
  84  * store. The driver supports multiple namespaces and creates a blkdev interface
  85  * for each namespace found. Namespaces can have various attributes to support
  86  * thin provisioning and protection information. This driver does not support
  87  * any of this and ignores namespaces that have these attributes.
  88  *
  89  * As of NVMe 1.1 namespaces can have a 64-bit Extended Unique Identifier
  90  * (EUI64). This driver uses the EUI64 if present to generate the devid and
  91  * passes it to blkdev to use it in the device node names. As this is currently
  92  * untested namespaces with EUI64 are ignored by default.
  93  *
  94  * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
  95  * single controller. This is an artificial limit imposed by the driver to be
  96  * able to address a reasonable number of controllers and namespaces using a
  97  * 32bit minor node number.
  98  *
  99  *
 100  * Minor nodes:
 101  *
 102  * For each NVMe device the driver exposes one minor node for the controller and
 103  * one minor node for each namespace. The only operations supported by those
 104  * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
 105  * interface for the nvmeadm(1M) utility.
 106  *
 107  *
 
 
 179  * the nc_mutex of the command to be aborted must be held across the call to
 180  * nvme_abort_cmd() to prevent the command from completing while the abort is in
 181  * progress.
 182  *
 183  * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
 184  * and exclusive-open flag nm_oexcl.
 185  *
 186  *
 187  * Quiesce / Fast Reboot:
 188  *
 189  * The driver currently does not support fast reboot. A quiesce(9E) entry point
 190  * is still provided which is used to send a shutdown notification to the
 191  * device.
 192  *
 193  *
 194  * Driver Configuration:
 195  *
 196  * The following driver properties can be changed to control some aspects of the
 197  * driver's operation:
 198  * - strict-version: can be set to 0 to allow devices conforming to newer
 199  *   versions or namespaces with EUI64 to be used
 200  * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
 201  *   specific command status as a fatal error leading to device faulting
 202  * - admin-queue-len: the maximum length of the admin queue (16-4096)
 203  * - io-queue-len: the maximum length of the I/O queues (16-65536)
 204  * - async-event-limit: the maximum number of asynchronous event requests to be
 205  *   posted by the driver
 206  * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 207  *   cache
 208  * - min-phys-block-size: the minimum physical block size to report to blkdev,
 209  *   which is among other things the basis for ZFS vdev ashift
 210  *
 211  *
 212  * TODO:
 213  * - figure out sane default for I/O queue depth reported to blkdev
 214  * - FMA handling of media errors
 215  * - support for devices supporting very large I/O requests using chained PRPs
 216  * - support for configuring hardware parameters like interrupt coalescing
 217  * - support for media formatting and hard partitioning into namespaces
 218  * - support for big-endian systems
 219  * - support for fast reboot
 
 
 241 #include <sys/varargs.h>
 242 #include <sys/cpuvar.h>
 243 #include <sys/disp.h>
 244 #include <sys/blkdev.h>
 245 #include <sys/atomic.h>
 246 #include <sys/archsystm.h>
 247 #include <sys/sata/sata_hba.h>
 248 #include <sys/stat.h>
 249 #include <sys/policy.h>
 250 #include <sys/list.h>
 251 
 252 #include <sys/nvme.h>
 253 
 254 #ifdef __x86
 255 #include <sys/x86_archext.h>
 256 #endif
 257 
 258 #include "nvme_reg.h"
 259 #include "nvme_var.h"
 260 
 261 
 262 /* NVMe spec version supported */
 263 static const int nvme_version_major = 1;
 264 static const int nvme_version_minor = 2;
 265 
 266 /* tunable for admin command timeout in seconds, default is 1s */
 267 int nvme_admin_cmd_timeout = 1;
 268 
 269 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
 270 int nvme_format_cmd_timeout = 600;
 271 
 272 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
 273 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
 274 static int nvme_quiesce(dev_info_t *);
 275 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
 276 static int nvme_setup_interrupts(nvme_t *, int, int);
 277 static void nvme_release_interrupts(nvme_t *);
 278 static uint_t nvme_intr(caddr_t, caddr_t);
 279 
 280 static void nvme_shutdown(nvme_t *, int, boolean_t);
 281 static boolean_t nvme_reset(nvme_t *, boolean_t);
 282 static int nvme_init(nvme_t *);
 283 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 284 static void nvme_free_cmd(nvme_cmd_t *);
 
 
1067                 return (EINVAL);
1068 
1069         case NVME_CQE_SC_GEN_INV_FLD:
1070                 /* Invalid Field in Command */
1071                 if (!cmd->nc_dontpanic)
1072                         dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1073                             "programming error: invalid field in cmd %p",
1074                             (void *)cmd);
1075                 return (EIO);
1076 
1077         case NVME_CQE_SC_GEN_ID_CNFL:
1078                 /* Command ID Conflict */
1079                 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1080                     "cmd ID conflict in cmd %p", (void *)cmd);
1081                 return (0);
1082 
1083         case NVME_CQE_SC_GEN_INV_NS:
1084                 /* Invalid Namespace or Format */
1085                 if (!cmd->nc_dontpanic)
1086                         dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1087                             "programming error: " "invalid NS/format in cmd %p",
1088                             (void *)cmd);
1089                 return (EINVAL);
1090 
1091         case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
1092                 /* LBA Out Of Range */
1093                 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1094                     "LBA out of range in cmd %p", (void *)cmd);
1095                 return (0);
1096 
1097         /*
1098          * Non-fatal errors, handle gracefully.
1099          */
1100         case NVME_CQE_SC_GEN_DATA_XFR_ERR:
1101                 /* Data Transfer Error (DMA) */
1102                 /* TODO: post ereport */
1103                 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
1104                 if (cmd->nc_xfer != NULL)
1105                         bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1106                 return (EIO);
1107 
 
 
1838 }
1839 
1840 static int
1841 nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res,
1842     void **buf, size_t *bufsize)
1843 {
1844         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1845         int ret = EINVAL;
1846 
1847         ASSERT(res != NULL);
1848 
1849         if (bufsize != NULL)
1850                 *bufsize = 0;
1851 
1852         cmd->nc_sqid = 0;
1853         cmd->nc_callback = nvme_wakeup_cmd;
1854         cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
1855         cmd->nc_sqe.sqe_cdw10 = feature;
1856         cmd->nc_sqe.sqe_cdw11 = *res;
1857 
1858         switch (feature) {
1859         case NVME_FEAT_ARBITRATION:
1860         case NVME_FEAT_POWER_MGMT:
1861         case NVME_FEAT_TEMPERATURE:
1862         case NVME_FEAT_ERROR:
1863         case NVME_FEAT_NQUEUES:
1864         case NVME_FEAT_INTR_COAL:
1865         case NVME_FEAT_INTR_VECT:
1866         case NVME_FEAT_WRITE_ATOM:
1867         case NVME_FEAT_ASYNC_EVENT:
1868         case NVME_FEAT_PROGRESS:
1869                 break;
1870 
1871         case NVME_FEAT_WRITE_CACHE:
1872                 if (!nvme->n_write_cache_present)
1873                         goto fail;
1874                 break;
1875 
1876         case NVME_FEAT_LBA_RANGE:
1877                 if (!nvme->n_lba_range_supported)
1878                         goto fail;
1879 
1880                 /*
1881                  * The LBA Range Type feature is optional. There doesn't seem
1882                  * be a method of detecting whether it is supported other than
1883                  * using it. This will cause a "invalid field in command" error,
1884                  * which is normally considered a programming error and causes
1885                  * panic in nvme_check_generic_cmd_status().
1886                  */
1887                 cmd->nc_dontpanic = B_TRUE;
1888                 cmd->nc_sqe.sqe_nsid = nsid;
1889                 ASSERT(bufsize != NULL);
1890                 *bufsize = NVME_LBA_RANGE_BUFSIZE;
1891 
1892                 break;
1893 
1894         case NVME_FEAT_AUTO_PST:
1895                 if (!nvme->n_auto_pst_supported)
1896                         goto fail;
1897 
1898                 ASSERT(bufsize != NULL);
1899                 *bufsize = NVME_AUTO_PST_BUFSIZE;
1900                 break;
1901 
1902         default:
1903                 goto fail;
1904         }
1905 
1906         if (bufsize != NULL && *bufsize != 0) {
1907                 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
1908                     &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1909                         dev_err(nvme->n_dip, CE_WARN,
1910                             "!nvme_zalloc_dma failed for GET FEATURES");
1911                         ret = ENOMEM;
1912                         goto fail;
1913                 }
1914 
1915                 if (cmd->nc_dma->nd_ncookie > 2) {
1916                         dev_err(nvme->n_dip, CE_WARN,
1917                             "!too many DMA cookies for GET FEATURES");
1918                         atomic_inc_32(&nvme->n_too_many_cookies);
1919                         ret = ENOMEM;
1920                         goto fail;
1921                 }
1922 
1923                 cmd->nc_sqe.sqe_dptr.d_prp[0] =
1924                     cmd->nc_dma->nd_cookie.dmac_laddress;
1925                 if (cmd->nc_dma->nd_ncookie > 1) {
1926                         ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
1927                             &cmd->nc_dma->nd_cookie);
1928                         cmd->nc_sqe.sqe_dptr.d_prp[1] =
1929                             cmd->nc_dma->nd_cookie.dmac_laddress;
1930                 }
1931         }
1932 
1933         nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
1934 
1935         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1936                 if (feature == NVME_FEAT_LBA_RANGE &&
1937                     cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1938                     cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD)
1939                         nvme->n_lba_range_supported = B_FALSE;
1940                 else
1941                         dev_err(nvme->n_dip, CE_WARN,
1942                             "!GET FEATURES %d failed with sct = %x, sc = %x",
1943                             feature, cmd->nc_cqe.cqe_sf.sf_sct,
1944                             cmd->nc_cqe.cqe_sf.sf_sc);
1945                 goto fail;
1946         }
1947 
1948         if (bufsize != NULL && *bufsize != 0) {
1949                 ASSERT(buf != NULL);
1950                 *buf = kmem_alloc(*bufsize, KM_SLEEP);
1951                 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
1952         }
1953 
1954         *res = cmd->nc_cqe.cqe_dw0;
1955 
1956 fail:
1957         nvme_free_cmd(cmd);
1958         return (ret);
1959 }
1960 
1961 static int
1962 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
1963 {
1964         nvme_write_cache_t nwc = { 0 };
 
2182          * performance. A value of 3 means "degraded", 0 is best.
2183          */
2184         last_rp = 3;
2185         for (int j = 0; j <= idns->id_nlbaf; j++) {
2186                 if (idns->id_lbaf[j].lbaf_lbads == 0)
2187                         break;
2188                 if (idns->id_lbaf[j].lbaf_ms != 0)
2189                         continue;
2190                 if (idns->id_lbaf[j].lbaf_rp >= last_rp)
2191                         continue;
2192                 last_rp = idns->id_lbaf[j].lbaf_rp;
2193                 ns->ns_best_block_size =
2194                     1 << idns->id_lbaf[j].lbaf_lbads;
2195         }
2196 
2197         if (ns->ns_best_block_size < nvme->n_min_block_size)
2198                 ns->ns_best_block_size = nvme->n_min_block_size;
2199 
2200         /*
2201          * We currently don't support namespaces that use either:
2202          * - thin provisioning
2203          * - protection information
2204          * - illegal block size (< 512)
2205          */
2206         if (idns->id_nsfeat.f_thin ||
2207             idns->id_dps.dp_pinfo) {
2208                 dev_err(nvme->n_dip, CE_WARN,
2209                     "!ignoring namespace %d, unsupported features: "
2210                     "thin = %d, pinfo = %d", nsid,
2211                     idns->id_nsfeat.f_thin, idns->id_dps.dp_pinfo);
2212                 ns->ns_ignore = B_TRUE;
2213         } else if (ns->ns_block_size < 512) {
2214                 dev_err(nvme->n_dip, CE_WARN,
2215                     "!ignoring namespace %d, unsupported block size %"PRIu64,
2216                     nsid, (uint64_t)ns->ns_block_size);
2217                 ns->ns_ignore = B_TRUE;
2218         } else {
2219                 ns->ns_ignore = B_FALSE;
2220         }
2221 
2222         return (DDI_SUCCESS);
2223 }
2224 
2225 static int
2226 nvme_init(nvme_t *nvme)
2227 {
2228         nvme_reg_cc_t cc = { 0 };
2229         nvme_reg_aqa_t aqa = { 0 };
2230         nvme_reg_asq_t asq = { 0 };
2231         nvme_reg_acq_t acq = { 0 };
2232         nvme_reg_cap_t cap;
2233         nvme_reg_vs_t vs;
2234         nvme_reg_csts_t csts;
2235         int i = 0;
2236         uint16_t nqueues;
2237         char model[sizeof (nvme->n_idctl->id_model) + 1];
2238         char *vendor, *product;
2239 
2240         /* Check controller version */
2241         vs.r = nvme_get32(nvme, NVME_REG_VS);
2242         nvme->n_version.v_major = vs.b.vs_mjr;
2243         nvme->n_version.v_minor = vs.b.vs_mnr;
2244         dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
2245             nvme->n_version.v_major, nvme->n_version.v_minor);
2246 
2247         if (NVME_VERSION_HIGHER(&nvme->n_version,
2248             nvme_version_major, nvme_version_minor)) {
2249                 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.%d",
2250                     nvme_version_major, nvme_version_minor);
2251                 if (nvme->n_strict_version)
2252                         goto fail;
2253         }
2254 
2255         /* retrieve controller configuration */
2256         cap.r = nvme_get64(nvme, NVME_REG_CAP);
2257 
2258         if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
2259                 dev_err(nvme->n_dip, CE_WARN,
2260                     "!NVM command set not supported by hardware");
2261                 goto fail;
2262         }
2263 
2264         nvme->n_nssr_supported = cap.b.cap_nssrs;
2265         nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
2266         nvme->n_timeout = cap.b.cap_to;
2267         nvme->n_arbitration_mechanisms = cap.b.cap_ams;
2268         nvme->n_cont_queues_reqd = cap.b.cap_cqr;
2269         nvme->n_max_queue_entries = cap.b.cap_mqes + 1;
2270 
 
2504         }
2505 
2506         (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2507             "volatile-write-cache-enable",
2508             nvme->n_write_cache_enabled ? 1 : 0);
2509 
2510         /*
2511          * Assume LBA Range Type feature is supported. If it isn't this
2512          * will be set to B_FALSE by nvme_get_features().
2513          */
2514         nvme->n_lba_range_supported = B_TRUE;
2515 
2516         /*
2517          * Check support for Autonomous Power State Transition.
2518          */
2519         if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
2520                 nvme->n_auto_pst_supported =
2521                     nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE;
2522 
2523         /*
2524          * Identify Namespaces
2525          */
2526         nvme->n_namespace_count = nvme->n_idctl->id_nn;
2527         if (nvme->n_namespace_count > NVME_MINOR_MAX) {
2528                 dev_err(nvme->n_dip, CE_WARN,
2529                     "!too many namespaces: %d, limiting to %d\n",
2530                     nvme->n_namespace_count, NVME_MINOR_MAX);
2531                 nvme->n_namespace_count = NVME_MINOR_MAX;
2532         }
2533 
2534         nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
2535             nvme->n_namespace_count, KM_SLEEP);
2536 
2537         for (i = 0; i != nvme->n_namespace_count; i++) {
2538                 mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER,
2539                     NULL);
2540                 if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS)
2541                         goto fail;
2542         }
2543 
2544         /*
2545          * Try to set up MSI/MSI-X interrupts.
2546          */
 
 | 
   1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2018 Nexenta Systems, Inc.
  14  * Copyright 2016 Tegile Systems, Inc. All rights reserved.
  15  * Copyright (c) 2016 The MathWorks, Inc.  All rights reserved.
  16  * Copyright 2018 Joyent, Inc.
  17  */
  18 
  19 /*
  20  * blkdev driver for NVMe compliant storage devices
  21  *
  22  * This driver was written to conform to version 1.2.1 of the NVMe
  23  * specification.  It may work with newer versions, but that is completely
  24  * untested and disabled by default.
  25  *
  26  * The driver has only been tested on x86 systems and will not work on big-
  27  * endian systems without changes to the code accessing registers and data
  28  * structures used by the hardware.
  29  *
  30  *
  31  * Interrupt Usage:
  32  *
  33  * The driver will use a single interrupt while configuring the device as the
  34  * specification requires, but contrary to the specification it will try to use
  35  * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
  36  * will switch to multiple-message MSI(-X) if supported. The driver wants to
 
  66  * wraps around in that time a submission may find the next array slot to still
  67  * be used by a long-running command. In this case the array is sequentially
  68  * searched for the next free slot. The length of the command array is the same
  69  * as the configured queue length. Queue overrun is prevented by the semaphore,
  70  * so a command submission may block if the queue is full.
  71  *
  72  *
  73  * Polled I/O Support:
  74  *
  75  * For kernel core dump support the driver can do polled I/O. As interrupts are
  76  * turned off while dumping the driver will just submit a command in the regular
  77  * way, and then repeatedly attempt a command retrieval until it gets the
  78  * command back.
  79  *
  80  *
  81  * Namespace Support:
  82  *
  83  * NVMe devices can have multiple namespaces, each being an independent data
  84  * store. The driver supports multiple namespaces and creates a blkdev interface
  85  * for each namespace found. Namespaces can have various attributes to support
  86  * protection information. This driver does not support any of this and ignores
  87  * namespaces that have these attributes.
  88  *
  89  * As of NVMe 1.1 namespaces can have a 64-bit Extended Unique Identifier
  90  * (EUI64). This driver uses the EUI64 if present to generate the devid and
  91  * passes it to blkdev to use it in the device node names. As this is currently
  92  * untested namespaces with EUI64 are ignored by default.
  93  *
  94  * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
  95  * single controller. This is an artificial limit imposed by the driver to be
  96  * able to address a reasonable number of controllers and namespaces using a
  97  * 32bit minor node number.
  98  *
  99  *
 100  * Minor nodes:
 101  *
 102  * For each NVMe device the driver exposes one minor node for the controller and
 103  * one minor node for each namespace. The only operations supported by those
 104  * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
 105  * interface for the nvmeadm(1M) utility.
 106  *
 107  *
 
 
 179  * the nc_mutex of the command to be aborted must be held across the call to
 180  * nvme_abort_cmd() to prevent the command from completing while the abort is in
 181  * progress.
 182  *
 183  * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
 184  * and exclusive-open flag nm_oexcl.
 185  *
 186  *
 187  * Quiesce / Fast Reboot:
 188  *
 189  * The driver currently does not support fast reboot. A quiesce(9E) entry point
 190  * is still provided which is used to send a shutdown notification to the
 191  * device.
 192  *
 193  *
 194  * Driver Configuration:
 195  *
 196  * The following driver properties can be changed to control some aspects of the
 197  * driver's operation:
 198  * - strict-version: can be set to 0 to allow devices conforming to newer
 199  *   major versions to be used
 200  * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
 201  *   specific command status as a fatal error leading to device faulting
 202  * - admin-queue-len: the maximum length of the admin queue (16-4096)
 203  * - io-queue-len: the maximum length of the I/O queues (16-65536)
 204  * - async-event-limit: the maximum number of asynchronous event requests to be
 205  *   posted by the driver
 206  * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 207  *   cache
 208  * - min-phys-block-size: the minimum physical block size to report to blkdev,
 209  *   which is among other things the basis for ZFS vdev ashift
 210  *
 211  *
 212  * TODO:
 213  * - figure out sane default for I/O queue depth reported to blkdev
 214  * - FMA handling of media errors
 215  * - support for devices supporting very large I/O requests using chained PRPs
 216  * - support for configuring hardware parameters like interrupt coalescing
 217  * - support for media formatting and hard partitioning into namespaces
 218  * - support for big-endian systems
 219  * - support for fast reboot
 
 
 241 #include <sys/varargs.h>
 242 #include <sys/cpuvar.h>
 243 #include <sys/disp.h>
 244 #include <sys/blkdev.h>
 245 #include <sys/atomic.h>
 246 #include <sys/archsystm.h>
 247 #include <sys/sata/sata_hba.h>
 248 #include <sys/stat.h>
 249 #include <sys/policy.h>
 250 #include <sys/list.h>
 251 
 252 #include <sys/nvme.h>
 253 
 254 #ifdef __x86
 255 #include <sys/x86_archext.h>
 256 #endif
 257 
 258 #include "nvme_reg.h"
 259 #include "nvme_var.h"
 260 
 261 /*
 262  * Assertions to make sure that we've properly captured various aspects of the
 263  * packed structures and haven't broken them during updates.
 264  */
 265 CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000);
 266 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
 267 CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
 268 CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
 269 CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
 270 CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
 271 CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);
 272 
 273 CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000);
 274 CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
 275 CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
 276 CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
 277 CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);
 278 
 279 CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000);
 280 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
 281 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);
 282 
 283 
 284 /* NVMe spec version supported */
 285 static const int nvme_version_major = 1;
 286 
 287 /* tunable for admin command timeout in seconds, default is 1s */
 288 int nvme_admin_cmd_timeout = 1;
 289 
 290 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
 291 int nvme_format_cmd_timeout = 600;
 292 
 293 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
 294 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
 295 static int nvme_quiesce(dev_info_t *);
 296 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
 297 static int nvme_setup_interrupts(nvme_t *, int, int);
 298 static void nvme_release_interrupts(nvme_t *);
 299 static uint_t nvme_intr(caddr_t, caddr_t);
 300 
 301 static void nvme_shutdown(nvme_t *, int, boolean_t);
 302 static boolean_t nvme_reset(nvme_t *, boolean_t);
 303 static int nvme_init(nvme_t *);
 304 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 305 static void nvme_free_cmd(nvme_cmd_t *);
 
 
1088                 return (EINVAL);
1089 
1090         case NVME_CQE_SC_GEN_INV_FLD:
1091                 /* Invalid Field in Command */
1092                 if (!cmd->nc_dontpanic)
1093                         dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1094                             "programming error: invalid field in cmd %p",
1095                             (void *)cmd);
1096                 return (EIO);
1097 
1098         case NVME_CQE_SC_GEN_ID_CNFL:
1099                 /* Command ID Conflict */
1100                 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1101                     "cmd ID conflict in cmd %p", (void *)cmd);
1102                 return (0);
1103 
1104         case NVME_CQE_SC_GEN_INV_NS:
1105                 /* Invalid Namespace or Format */
1106                 if (!cmd->nc_dontpanic)
1107                         dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1108                             "programming error: invalid NS/format in cmd %p",
1109                             (void *)cmd);
1110                 return (EINVAL);
1111 
1112         case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
1113                 /* LBA Out Of Range */
1114                 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1115                     "LBA out of range in cmd %p", (void *)cmd);
1116                 return (0);
1117 
1118         /*
1119          * Non-fatal errors, handle gracefully.
1120          */
1121         case NVME_CQE_SC_GEN_DATA_XFR_ERR:
1122                 /* Data Transfer Error (DMA) */
1123                 /* TODO: post ereport */
1124                 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
1125                 if (cmd->nc_xfer != NULL)
1126                         bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1127                 return (EIO);
1128 
 
 
1859 }
1860 
1861 static int
1862 nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res,
1863     void **buf, size_t *bufsize)
1864 {
1865         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1866         int ret = EINVAL;
1867 
1868         ASSERT(res != NULL);
1869 
1870         if (bufsize != NULL)
1871                 *bufsize = 0;
1872 
1873         cmd->nc_sqid = 0;
1874         cmd->nc_callback = nvme_wakeup_cmd;
1875         cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
1876         cmd->nc_sqe.sqe_cdw10 = feature;
1877         cmd->nc_sqe.sqe_cdw11 = *res;
1878 
1879         /*
1880          * For some of the optional features there doesn't seem to be a method
1881          * of detecting whether it is supported other than using it.  This will
1882          * cause "Invalid Field in Command" error, which is normally considered
1883          * a programming error.  Set the nc_dontpanic flag to override the panic
1884          * in nvme_check_generic_cmd_status().
1885          */
1886         switch (feature) {
1887         case NVME_FEAT_ARBITRATION:
1888         case NVME_FEAT_POWER_MGMT:
1889         case NVME_FEAT_TEMPERATURE:
1890         case NVME_FEAT_NQUEUES:
1891         case NVME_FEAT_INTR_COAL:
1892         case NVME_FEAT_INTR_VECT:
1893         case NVME_FEAT_WRITE_ATOM:
1894         case NVME_FEAT_ASYNC_EVENT:
1895                 break;
1896 
1897         case NVME_FEAT_ERROR:
1898                 /*
1899                  * Per-namespace Deallocated or Unwritten Logical Block
1900                  * Error Enable (DULBE) feature was added after initial NVMe
1901                  * specification, but we currently only check this feature with
1902                  * NS ID of 0 (the controller itself), and some controllers get
1903                  * upset, reporting the error.  For the moment, override the
1904                  * panic by setting the nc_dontpanic flag.
1905                  */
1906                 cmd->nc_dontpanic = B_TRUE;
1907                 break;
1908 
1909         case NVME_FEAT_WRITE_CACHE:
1910                 if (!nvme->n_write_cache_present)
1911                         goto fail;
1912                 break;
1913 
1914         case NVME_FEAT_LBA_RANGE:
1915                 if (!nvme->n_lba_range_supported)
1916                         goto fail;
1917 
1918                 cmd->nc_dontpanic = B_TRUE;
1919                 cmd->nc_sqe.sqe_nsid = nsid;
1920                 ASSERT(bufsize != NULL);
1921                 *bufsize = NVME_LBA_RANGE_BUFSIZE;
1922                 break;
1923 
1924         case NVME_FEAT_AUTO_PST:
1925                 if (!nvme->n_auto_pst_supported)
1926                         goto fail;
1927 
1928                 ASSERT(bufsize != NULL);
1929                 *bufsize = NVME_AUTO_PST_BUFSIZE;
1930                 break;
1931 
1932         case NVME_FEAT_PROGRESS:
1933                 if (!nvme->n_progress_supported)
1934                         goto fail;
1935 
1936                 cmd->nc_dontpanic = B_TRUE;
1937                 break;
1938 
1939         default:
1940                 goto fail;
1941         }
1942 
1943         if (bufsize != NULL && *bufsize != 0) {
1944                 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
1945                     &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1946                         dev_err(nvme->n_dip, CE_WARN,
1947                             "!nvme_zalloc_dma failed for GET FEATURES");
1948                         ret = ENOMEM;
1949                         goto fail;
1950                 }
1951 
1952                 if (cmd->nc_dma->nd_ncookie > 2) {
1953                         dev_err(nvme->n_dip, CE_WARN,
1954                             "!too many DMA cookies for GET FEATURES");
1955                         atomic_inc_32(&nvme->n_too_many_cookies);
1956                         ret = ENOMEM;
1957                         goto fail;
1958                 }
1959 
1960                 cmd->nc_sqe.sqe_dptr.d_prp[0] =
1961                     cmd->nc_dma->nd_cookie.dmac_laddress;
1962                 if (cmd->nc_dma->nd_ncookie > 1) {
1963                         ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
1964                             &cmd->nc_dma->nd_cookie);
1965                         cmd->nc_sqe.sqe_dptr.d_prp[1] =
1966                             cmd->nc_dma->nd_cookie.dmac_laddress;
1967                 }
1968         }
1969 
1970         nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
1971 
1972         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1973                 boolean_t known = B_TRUE;
1974 
1975                 /* Check if this is unsupported optional feature */
1976                 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1977                     cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) {
1978                         switch (feature) {
1979                         case NVME_FEAT_LBA_RANGE:
1980                                 nvme->n_lba_range_supported = B_FALSE;
1981                                 break;
1982                         case NVME_FEAT_PROGRESS:
1983                                 nvme->n_progress_supported = B_FALSE;
1984                                 break;
1985                         default:
1986                                 known = B_FALSE;
1987                                 break;
1988                         }
1989                 } else {
1990                         known = B_FALSE;
1991                 }
1992 
1993                 /* Report the error otherwise */
1994                 if (!known) {
1995                         dev_err(nvme->n_dip, CE_WARN,
1996                             "!GET FEATURES %d failed with sct = %x, sc = %x",
1997                             feature, cmd->nc_cqe.cqe_sf.sf_sct,
1998                             cmd->nc_cqe.cqe_sf.sf_sc);
1999                 }
2000 
2001                 goto fail;
2002         }
2003 
2004         if (bufsize != NULL && *bufsize != 0) {
2005                 ASSERT(buf != NULL);
2006                 *buf = kmem_alloc(*bufsize, KM_SLEEP);
2007                 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
2008         }
2009 
2010         *res = cmd->nc_cqe.cqe_dw0;
2011 
2012 fail:
2013         nvme_free_cmd(cmd);
2014         return (ret);
2015 }
2016 
2017 static int
2018 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
2019 {
2020         nvme_write_cache_t nwc = { 0 };
 
2238          * performance. A value of 3 means "degraded", 0 is best.
2239          */
2240         last_rp = 3;
2241         for (int j = 0; j <= idns->id_nlbaf; j++) {
2242                 if (idns->id_lbaf[j].lbaf_lbads == 0)
2243                         break;
2244                 if (idns->id_lbaf[j].lbaf_ms != 0)
2245                         continue;
2246                 if (idns->id_lbaf[j].lbaf_rp >= last_rp)
2247                         continue;
2248                 last_rp = idns->id_lbaf[j].lbaf_rp;
2249                 ns->ns_best_block_size =
2250                     1 << idns->id_lbaf[j].lbaf_lbads;
2251         }
2252 
2253         if (ns->ns_best_block_size < nvme->n_min_block_size)
2254                 ns->ns_best_block_size = nvme->n_min_block_size;
2255 
2256         /*
2257          * We currently don't support namespaces that use either:
2258          * - protection information
2259          * - illegal block size (< 512)
2260          */
2261         if (idns->id_dps.dp_pinfo) {
2262                 dev_err(nvme->n_dip, CE_WARN,
2263                     "!ignoring namespace %d, unsupported feature: "
2264                     "pinfo = %d", nsid, idns->id_dps.dp_pinfo);
2265                 ns->ns_ignore = B_TRUE;
2266         } else if (ns->ns_block_size < 512) {
2267                 dev_err(nvme->n_dip, CE_WARN,
2268                     "!ignoring namespace %d, unsupported block size %"PRIu64,
2269                     nsid, (uint64_t)ns->ns_block_size);
2270                 ns->ns_ignore = B_TRUE;
2271         } else {
2272                 ns->ns_ignore = B_FALSE;
2273         }
2274 
2275         return (DDI_SUCCESS);
2276 }
2277 
2278 static int
2279 nvme_init(nvme_t *nvme)
2280 {
2281         nvme_reg_cc_t cc = { 0 };
2282         nvme_reg_aqa_t aqa = { 0 };
2283         nvme_reg_asq_t asq = { 0 };
2284         nvme_reg_acq_t acq = { 0 };
2285         nvme_reg_cap_t cap;
2286         nvme_reg_vs_t vs;
2287         nvme_reg_csts_t csts;
2288         int i = 0;
2289         uint16_t nqueues;
2290         char model[sizeof (nvme->n_idctl->id_model) + 1];
2291         char *vendor, *product;
2292 
2293         /* Check controller version */
2294         vs.r = nvme_get32(nvme, NVME_REG_VS);
2295         nvme->n_version.v_major = vs.b.vs_mjr;
2296         nvme->n_version.v_minor = vs.b.vs_mnr;
2297         dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
2298             nvme->n_version.v_major, nvme->n_version.v_minor);
2299 
2300         if (nvme->n_version.v_major > nvme_version_major) {
2301                 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x",
2302                     nvme_version_major);
2303                 if (nvme->n_strict_version)
2304                         goto fail;
2305         }
2306 
2307         /* retrieve controller configuration */
2308         cap.r = nvme_get64(nvme, NVME_REG_CAP);
2309 
2310         if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
2311                 dev_err(nvme->n_dip, CE_WARN,
2312                     "!NVM command set not supported by hardware");
2313                 goto fail;
2314         }
2315 
2316         nvme->n_nssr_supported = cap.b.cap_nssrs;
2317         nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
2318         nvme->n_timeout = cap.b.cap_to;
2319         nvme->n_arbitration_mechanisms = cap.b.cap_ams;
2320         nvme->n_cont_queues_reqd = cap.b.cap_cqr;
2321         nvme->n_max_queue_entries = cap.b.cap_mqes + 1;
2322 
 
2556         }
2557 
2558         (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2559             "volatile-write-cache-enable",
2560             nvme->n_write_cache_enabled ? 1 : 0);
2561 
2562         /*
2563          * Assume the LBA Range Type feature is supported. If it isn't, this
2564          * will be set to B_FALSE by nvme_get_features().
2565          */
2566         nvme->n_lba_range_supported = B_TRUE;
2567 
2568         /*
2569          * Check support for Autonomous Power State Transition.
2570          */
2571         if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
2572                 nvme->n_auto_pst_supported =
2573                     nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE;
2574 
2575         /*
2576          * Assume the Software Progress Marker feature is supported.  If it
2577          * isn't, this will be set to B_FALSE by nvme_get_features().
2578          */
2579         nvme->n_progress_supported = B_TRUE;
2580 
2581         /*
2582          * Identify Namespaces
2583          */
2584         nvme->n_namespace_count = nvme->n_idctl->id_nn;
2585 
2586         if (nvme->n_namespace_count == 0) {
2587                 dev_err(nvme->n_dip, CE_WARN,
2588                     "!controllers without namespaces are not supported");
2589                 goto fail;
2590         }
2591 
2592         if (nvme->n_namespace_count > NVME_MINOR_MAX) {
2593                 dev_err(nvme->n_dip, CE_WARN,
2594                     "!too many namespaces: %d, limiting to %d\n",
2595                     nvme->n_namespace_count, NVME_MINOR_MAX);
2596                 nvme->n_namespace_count = NVME_MINOR_MAX;
2597         }
2598 
2599         nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
2600             nvme->n_namespace_count, KM_SLEEP);
2601 
2602         for (i = 0; i != nvme->n_namespace_count; i++) {
2603                 mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER,
2604                     NULL);
2605                 if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS)
2606                         goto fail;
2607         }
2608 
2609         /*
2610          * Try to set up MSI/MSI-X interrupts.
2611          */
 
 |