1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
14 * Copyright 2016 Tegile Systems, Inc. All rights reserved.
15 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
16 * Copyright 2017 Joyent, Inc.
17 */
18
19 /*
20 * blkdev driver for NVMe compliant storage devices
21 *
22 * This driver was written to conform to version 1.2.1 of the NVMe
23 * specification. It may work with newer versions, but that is completely
24 * untested and disabled by default.
25 *
26 * The driver has only been tested on x86 systems and will not work on big-
27 * endian systems without changes to the code accessing registers and data
28 * structures used by the hardware.
29 *
30 *
31 * Interrupt Usage:
32 *
33 * The driver will use a single interrupt while configuring the device as the
34 * specification requires, but contrary to the specification it will try to use
35 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
36 * will switch to multiple-message MSI(-X) if supported. The driver wants to
66 * wraps around in that time a submission may find the next array slot to still
67 * be used by a long-running command. In this case the array is sequentially
68 * searched for the next free slot. The length of the command array is the same
69 * as the configured queue length. Queue overrun is prevented by the semaphore,
70 * so a command submission may block if the queue is full.
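 *
 * A rough sketch of that slot allocation (the field names follow the
 * nvme_qpair_t layout in nvme_var.h, but treat this as illustrative
 * rather than authoritative):
 *
 *	sema_p(&qp->nq_sema);
 *	mutex_enter(&qp->nq_mutex);
 *	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 *		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 *	qp->nq_cmd[qp->nq_next_cmd] = cmd;
 *	mutex_exit(&qp->nq_mutex);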
71 *
72 *
73 * Polled I/O Support:
74 *
75 * For kernel core dump support the driver can do polled I/O. As interrupts are
76 * turned off while dumping, the driver will just submit a command in the regular
77 * way, and then repeatedly attempt a command retrieval until it gets the
78 * command back.
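 *
 * A minimal sketch of that polled path (the helper names are modeled on
 * the driver's internal submit/retrieve routines; the exact signatures
 * are an assumption here):
 *
 *	nvme_submit_cmd(qp, cmd);
 *	while (nvme_retrieve_cmd(nvme, qp) == NULL)
 *		drv_usecwait(10);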
79 *
80 *
81 * Namespace Support:
82 *
83 * NVMe devices can have multiple namespaces, each being an independent data
84 * store. The driver supports multiple namespaces and creates a blkdev interface
85 * for each namespace found. Namespaces can have various attributes to support
86 * thin provisioning and protection information. This driver does not support
87 * any of this and ignores namespaces that have these attributes.
88 *
89 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
90 * (EUI64). This driver uses the EUI64 if present to generate the devid and
91 * passes it to blkdev to use it in the device node names. As this is currently
92 * untested, namespaces with EUI64 are ignored by default.
93 *
94 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
95 * single controller. This is an artificial limit imposed by the driver to be
96 * able to address a reasonable number of controllers and namespaces using a
97 * 32bit minor node number.
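 *
 * As a worked example: if NVME_MINOR_INST_SHIFT were 5 (a value chosen
 * purely for illustration; see nvme_var.h for the real one), this limit
 * would be (2 << 5) - 2 = 62 namespaces per controller.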
98 *
99 *
100 * Minor nodes:
101 *
102 * For each NVMe device the driver exposes one minor node for the controller and
103 * one minor node for each namespace. The only operations supported by those
104 * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
105 * interface for the nvmeadm(1M) utility.
106 *
107 *
179 * the nc_mutex of the command to be aborted must be held across the call to
180 * nvme_abort_cmd() to prevent the command from completing while the abort is in
181 * progress.
182 *
183 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
184 * and exclusive-open flag nm_oexcl.
185 *
186 *
187 * Quiesce / Fast Reboot:
188 *
189 * The driver currently does not support fast reboot. A quiesce(9E) entry point
190 * is still provided, which is used to send a shutdown notification to the
191 * device.
192 *
193 *
194 * Driver Configuration:
195 *
196 * The following driver properties can be changed to control some aspects of the
197 * driver's operation (a hypothetical driver.conf fragment follows the list):
198 * - strict-version: can be set to 0 to allow devices conforming to newer
199 * versions or namespaces with EUI64 to be used
200 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
201 * specific command status as a fatal error leading to device faulting
202 * - admin-queue-len: the maximum length of the admin queue (16-4096)
203 * - io-queue-len: the maximum length of the I/O queues (16-65536)
204 * - async-event-limit: the maximum number of asynchronous event requests to be
205 * posted by the driver
206 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
207 * cache
208 * - min-phys-block-size: the minimum physical block size to report to blkdev,
209 * which is among other things the basis for ZFS vdev ashift
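 *
 * These are ordinary driver.conf(4) properties, so a hypothetical
 * nvme.conf fragment overriding a few of them (the values below are
 * made up for illustration) might look like:
 *
 *	strict-version=0;
 *	io-queue-len=1024;
 *	volatile-write-cache-enable=0;
 *	min-phys-block-size=4096;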
210 *
211 *
212 * TODO:
213 * - figure out sane default for I/O queue depth reported to blkdev
214 * - FMA handling of media errors
215 * - support for devices supporting very large I/O requests using chained PRPs
216 * - support for configuring hardware parameters like interrupt coalescing
217 * - support for media formatting and hard partitioning into namespaces
218 * - support for big-endian systems
219 * - support for fast reboot
241 #include <sys/varargs.h>
242 #include <sys/cpuvar.h>
243 #include <sys/disp.h>
244 #include <sys/blkdev.h>
245 #include <sys/atomic.h>
246 #include <sys/archsystm.h>
247 #include <sys/sata/sata_hba.h>
248 #include <sys/stat.h>
249 #include <sys/policy.h>
250 #include <sys/list.h>
251
252 #include <sys/nvme.h>
253
254 #ifdef __x86
255 #include <sys/x86_archext.h>
256 #endif
257
258 #include "nvme_reg.h"
259 #include "nvme_var.h"
260
261
262 /* NVMe spec version supported */
263 static const int nvme_version_major = 1;
264 static const int nvme_version_minor = 2;
265
266 /* tunable for admin command timeout in seconds, default is 1s */
267 int nvme_admin_cmd_timeout = 1;
268
269 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
270 int nvme_format_cmd_timeout = 600;
271
272 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
273 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
274 static int nvme_quiesce(dev_info_t *);
275 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
276 static int nvme_setup_interrupts(nvme_t *, int, int);
277 static void nvme_release_interrupts(nvme_t *);
278 static uint_t nvme_intr(caddr_t, caddr_t);
279
280 static void nvme_shutdown(nvme_t *, int, boolean_t);
281 static boolean_t nvme_reset(nvme_t *, boolean_t);
282 static int nvme_init(nvme_t *);
283 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
284 static void nvme_free_cmd(nvme_cmd_t *);
1067 return (EINVAL);
1068
1069 case NVME_CQE_SC_GEN_INV_FLD:
1070 /* Invalid Field in Command */
1071 if (!cmd->nc_dontpanic)
1072 dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1073 "programming error: invalid field in cmd %p",
1074 (void *)cmd);
1075 return (EIO);
1076
1077 case NVME_CQE_SC_GEN_ID_CNFL:
1078 /* Command ID Conflict */
1079 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1080 "cmd ID conflict in cmd %p", (void *)cmd);
1081 return (0);
1082
1083 case NVME_CQE_SC_GEN_INV_NS:
1084 /* Invalid Namespace or Format */
1085 if (!cmd->nc_dontpanic)
1086 dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1087 "programming error: " "invalid NS/format in cmd %p",
1088 (void *)cmd);
1089 return (EINVAL);
1090
1091 case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
1092 /* LBA Out Of Range */
1093 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1094 "LBA out of range in cmd %p", (void *)cmd);
1095 return (0);
1096
1097 /*
1098 * Non-fatal errors, handle gracefully.
1099 */
1100 case NVME_CQE_SC_GEN_DATA_XFR_ERR:
1101 /* Data Transfer Error (DMA) */
1102 /* TODO: post ereport */
1103 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
1104 if (cmd->nc_xfer != NULL)
1105 bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1106 return (EIO);
1107
1838 }
1839
1840 static int
1841 nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res,
1842 void **buf, size_t *bufsize)
1843 {
1844 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1845 int ret = EINVAL;
1846
1847 ASSERT(res != NULL);
1848
1849 if (bufsize != NULL)
1850 *bufsize = 0;
1851
1852 cmd->nc_sqid = 0;
1853 cmd->nc_callback = nvme_wakeup_cmd;
1854 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
1855 cmd->nc_sqe.sqe_cdw10 = feature;
1856 cmd->nc_sqe.sqe_cdw11 = *res;
1857
1858 switch (feature) {
1859 case NVME_FEAT_ARBITRATION:
1860 case NVME_FEAT_POWER_MGMT:
1861 case NVME_FEAT_TEMPERATURE:
1862 case NVME_FEAT_ERROR:
1863 case NVME_FEAT_NQUEUES:
1864 case NVME_FEAT_INTR_COAL:
1865 case NVME_FEAT_INTR_VECT:
1866 case NVME_FEAT_WRITE_ATOM:
1867 case NVME_FEAT_ASYNC_EVENT:
1868 case NVME_FEAT_PROGRESS:
1869 break;
1870
1871 case NVME_FEAT_WRITE_CACHE:
1872 if (!nvme->n_write_cache_present)
1873 goto fail;
1874 break;
1875
1876 case NVME_FEAT_LBA_RANGE:
1877 if (!nvme->n_lba_range_supported)
1878 goto fail;
1879
1880 /*
1881 * The LBA Range Type feature is optional. There doesn't seem to
1882 * be a method of detecting whether it is supported other than
1883 * using it. This will cause an "invalid field in command" error,
1884 * which is normally considered a programming error and causes a
1885 * panic in nvme_check_generic_cmd_status().
1886 */
1887 cmd->nc_dontpanic = B_TRUE;
1888 cmd->nc_sqe.sqe_nsid = nsid;
1889 ASSERT(bufsize != NULL);
1890 *bufsize = NVME_LBA_RANGE_BUFSIZE;
1891
1892 break;
1893
1894 case NVME_FEAT_AUTO_PST:
1895 if (!nvme->n_auto_pst_supported)
1896 goto fail;
1897
1898 ASSERT(bufsize != NULL);
1899 *bufsize = NVME_AUTO_PST_BUFSIZE;
1900 break;
1901
1902 default:
1903 goto fail;
1904 }
1905
1906 if (bufsize != NULL && *bufsize != 0) {
1907 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
1908 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1909 dev_err(nvme->n_dip, CE_WARN,
1910 "!nvme_zalloc_dma failed for GET FEATURES");
1911 ret = ENOMEM;
1912 goto fail;
1913 }
1914
1915 if (cmd->nc_dma->nd_ncookie > 2) {
1916 dev_err(nvme->n_dip, CE_WARN,
1917 "!too many DMA cookies for GET FEATURES");
1918 atomic_inc_32(&nvme->n_too_many_cookies);
1919 ret = ENOMEM;
1920 goto fail;
1921 }
1922
1923 cmd->nc_sqe.sqe_dptr.d_prp[0] =
1924 cmd->nc_dma->nd_cookie.dmac_laddress;
1925 if (cmd->nc_dma->nd_ncookie > 1) {
1926 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
1927 &cmd->nc_dma->nd_cookie);
1928 cmd->nc_sqe.sqe_dptr.d_prp[1] =
1929 cmd->nc_dma->nd_cookie.dmac_laddress;
1930 }
1931 }
1932
1933 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
1934
1935 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1936 if (feature == NVME_FEAT_LBA_RANGE &&
1937 cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1938 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD)
1939 nvme->n_lba_range_supported = B_FALSE;
1940 else
1941 dev_err(nvme->n_dip, CE_WARN,
1942 "!GET FEATURES %d failed with sct = %x, sc = %x",
1943 feature, cmd->nc_cqe.cqe_sf.sf_sct,
1944 cmd->nc_cqe.cqe_sf.sf_sc);
1945 goto fail;
1946 }
1947
1948 if (bufsize != NULL && *bufsize != 0) {
1949 ASSERT(buf != NULL);
1950 *buf = kmem_alloc(*bufsize, KM_SLEEP);
1951 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
1952 }
1953
1954 *res = cmd->nc_cqe.cqe_dw0;
1955
1956 fail:
1957 nvme_free_cmd(cmd);
1958 return (ret);
1959 }
1960
1961 static int
1962 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
1963 {
1964 nvme_write_cache_t nwc = { 0 };
2182 * performance. A value of 3 means "degraded", 0 is best.
2183 */
2184 last_rp = 3;
2185 for (int j = 0; j <= idns->id_nlbaf; j++) {
2186 if (idns->id_lbaf[j].lbaf_lbads == 0)
2187 break;
2188 if (idns->id_lbaf[j].lbaf_ms != 0)
2189 continue;
2190 if (idns->id_lbaf[j].lbaf_rp >= last_rp)
2191 continue;
2192 last_rp = idns->id_lbaf[j].lbaf_rp;
2193 ns->ns_best_block_size =
2194 1 << idns->id_lbaf[j].lbaf_lbads;
2195 }
2196
2197 if (ns->ns_best_block_size < nvme->n_min_block_size)
2198 ns->ns_best_block_size = nvme->n_min_block_size;
2199
2200 /*
2201 * We currently don't support namespaces that use either:
2202 * - thin provisioning
2203 * - protection information
2204 * - illegal block size (< 512)
2205 */
2206 if (idns->id_nsfeat.f_thin ||
2207 idns->id_dps.dp_pinfo) {
2208 dev_err(nvme->n_dip, CE_WARN,
2209 "!ignoring namespace %d, unsupported features: "
2210 "thin = %d, pinfo = %d", nsid,
2211 idns->id_nsfeat.f_thin, idns->id_dps.dp_pinfo);
2212 ns->ns_ignore = B_TRUE;
2213 } else if (ns->ns_block_size < 512) {
2214 dev_err(nvme->n_dip, CE_WARN,
2215 "!ignoring namespace %d, unsupported block size %"PRIu64,
2216 nsid, (uint64_t)ns->ns_block_size);
2217 ns->ns_ignore = B_TRUE;
2218 } else {
2219 ns->ns_ignore = B_FALSE;
2220 }
2221
2222 return (DDI_SUCCESS);
2223 }
2224
2225 static int
2226 nvme_init(nvme_t *nvme)
2227 {
2228 nvme_reg_cc_t cc = { 0 };
2229 nvme_reg_aqa_t aqa = { 0 };
2230 nvme_reg_asq_t asq = { 0 };
2231 nvme_reg_acq_t acq = { 0 };
2232 nvme_reg_cap_t cap;
2233 nvme_reg_vs_t vs;
2234 nvme_reg_csts_t csts;
2235 int i = 0;
2236 uint16_t nqueues;
2237 char model[sizeof (nvme->n_idctl->id_model) + 1];
2238 char *vendor, *product;
2239
2240 /* Check controller version */
2241 vs.r = nvme_get32(nvme, NVME_REG_VS);
2242 nvme->n_version.v_major = vs.b.vs_mjr;
2243 nvme->n_version.v_minor = vs.b.vs_mnr;
2244 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
2245 nvme->n_version.v_major, nvme->n_version.v_minor);
2246
2247 if (NVME_VERSION_HIGHER(&nvme->n_version,
2248 nvme_version_major, nvme_version_minor)) {
2249 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.%d",
2250 nvme_version_major, nvme_version_minor);
2251 if (nvme->n_strict_version)
2252 goto fail;
2253 }
2254
2255 /* retrieve controller capabilities */
2256 cap.r = nvme_get64(nvme, NVME_REG_CAP);
2257
2258 if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
2259 dev_err(nvme->n_dip, CE_WARN,
2260 "!NVM command set not supported by hardware");
2261 goto fail;
2262 }
2263
2264 nvme->n_nssr_supported = cap.b.cap_nssrs;
2265 nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
2266 nvme->n_timeout = cap.b.cap_to;
2267 nvme->n_arbitration_mechanisms = cap.b.cap_ams;
2268 nvme->n_cont_queues_reqd = cap.b.cap_cqr;
2269 nvme->n_max_queue_entries = cap.b.cap_mqes + 1;
2270
2504 }
2505
2506 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2507 "volatile-write-cache-enable",
2508 nvme->n_write_cache_enabled ? 1 : 0);
2509
2510 /*
2511 * Assume LBA Range Type feature is supported. If it isn't this
2512 * will be set to B_FALSE by nvme_get_features().
2513 */
2514 nvme->n_lba_range_supported = B_TRUE;
2515
2516 /*
2517 * Check support for Autonomous Power State Transition.
2518 */
2519 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
2520 nvme->n_auto_pst_supported =
2521 nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE;
2522
2523 /*
2524 * Identify Namespaces
2525 */
2526 nvme->n_namespace_count = nvme->n_idctl->id_nn;
2527 if (nvme->n_namespace_count > NVME_MINOR_MAX) {
2528 dev_err(nvme->n_dip, CE_WARN,
2529 "!too many namespaces: %d, limiting to %d\n",
2530 nvme->n_namespace_count, NVME_MINOR_MAX);
2531 nvme->n_namespace_count = NVME_MINOR_MAX;
2532 }
2533
2534 nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
2535 nvme->n_namespace_count, KM_SLEEP);
2536
2537 for (i = 0; i != nvme->n_namespace_count; i++) {
2538 mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER,
2539 NULL);
2540 if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS)
2541 goto fail;
2542 }
2543
2544 /*
2545 * Try to set up MSI/MSI-X interrupts.
2546 */
1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018 Nexenta Systems, Inc.
14 * Copyright 2016 Tegile Systems, Inc. All rights reserved.
15 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
16 * Copyright 2018 Joyent, Inc.
17 */
18
19 /*
20 * blkdev driver for NVMe compliant storage devices
21 *
22 * This driver was written to conform to version 1.2.1 of the NVMe
23 * specification. It may work with newer versions, but that is completely
24 * untested and disabled by default.
25 *
26 * The driver has only been tested on x86 systems and will not work on big-
27 * endian systems without changes to the code accessing registers and data
28 * structures used by the hardware.
29 *
30 *
31 * Interrupt Usage:
32 *
33 * The driver will use a single interrupt while configuring the device as the
34 * specification requires, but contrary to the specification it will try to use
35 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
36 * will switch to multiple-message MSI(-X) if supported. The driver wants to
66 * wraps around in that time a submission may find the next array slot to still
67 * be used by a long-running command. In this case the array is sequentially
68 * searched for the next free slot. The length of the command array is the same
69 * as the configured queue length. Queue overrun is prevented by the semaphore,
70 * so a command submission may block if the queue is full.
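 *
 * A rough sketch of that slot allocation (the field names follow the
 * nvme_qpair_t layout in nvme_var.h, but treat this as illustrative
 * rather than authoritative):
 *
 *	sema_p(&qp->nq_sema);
 *	mutex_enter(&qp->nq_mutex);
 *	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 *		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 *	qp->nq_cmd[qp->nq_next_cmd] = cmd;
 *	mutex_exit(&qp->nq_mutex);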
71 *
72 *
73 * Polled I/O Support:
74 *
75 * For kernel core dump support the driver can do polled I/O. As interrupts are
76 * turned off while dumping, the driver will just submit a command in the regular
77 * way, and then repeatedly attempt a command retrieval until it gets the
78 * command back.
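 *
 * A minimal sketch of that polled path (the helper names are modeled on
 * the driver's internal submit/retrieve routines; the exact signatures
 * are an assumption here):
 *
 *	nvme_submit_cmd(qp, cmd);
 *	while (nvme_retrieve_cmd(nvme, qp) == NULL)
 *		drv_usecwait(10);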
79 *
80 *
81 * Namespace Support:
82 *
83 * NVMe devices can have multiple namespaces, each being an independent data
84 * store. The driver supports multiple namespaces and creates a blkdev interface
85 * for each namespace found. Namespaces can have various attributes to support
86 * protection information. This driver does not support any of this and ignores
87 * namespaces that have these attributes.
88 *
89 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
90 * (EUI64). This driver uses the EUI64 if present to generate the devid and
91 * passes it to blkdev to use it in the device node names. As this is currently
92 * untested, namespaces with EUI64 are ignored by default.
93 *
94 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
95 * single controller. This is an artificial limit imposed by the driver to be
96 * able to address a reasonable number of controllers and namespaces using a
97 * 32bit minor node number.
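 *
 * As a worked example: if NVME_MINOR_INST_SHIFT were 5 (a value chosen
 * purely for illustration; see nvme_var.h for the real one), this limit
 * would be (2 << 5) - 2 = 62 namespaces per controller.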
98 *
99 *
100 * Minor nodes:
101 *
102 * For each NVMe device the driver exposes one minor node for the controller and
103 * one minor node for each namespace. The only operations supported by those
104 * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
105 * interface for the nvmeadm(1M) utility.
106 *
107 *
179 * the nc_mutex of the command to be aborted must be held across the call to
180 * nvme_abort_cmd() to prevent the command from completing while the abort is in
181 * progress.
182 *
183 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
184 * and exclusive-open flag nm_oexcl.
185 *
186 *
187 * Quiesce / Fast Reboot:
188 *
189 * The driver currently does not support fast reboot. A quiesce(9E) entry point
190 * is still provided, which is used to send a shutdown notification to the
191 * device.
192 *
193 *
194 * Driver Configuration:
195 *
196 * The following driver properties can be changed to control some aspects of the
197 * driver's operation (a hypothetical driver.conf fragment follows the list):
198 * - strict-version: can be set to 0 to allow devices conforming to newer
199 * major versions to be used
200 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
201 * specific command status as a fatal error leading to device faulting
202 * - admin-queue-len: the maximum length of the admin queue (16-4096)
203 * - io-queue-len: the maximum length of the I/O queues (16-65536)
204 * - async-event-limit: the maximum number of asynchronous event requests to be
205 * posted by the driver
206 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
207 * cache
208 * - min-phys-block-size: the minimum physical block size to report to blkdev,
209 * which is among other things the basis for ZFS vdev ashift
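 *
 * These are ordinary driver.conf(4) properties, so a hypothetical
 * nvme.conf fragment overriding a few of them (the values below are
 * made up for illustration) might look like:
 *
 *	strict-version=0;
 *	io-queue-len=1024;
 *	volatile-write-cache-enable=0;
 *	min-phys-block-size=4096;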
210 *
211 *
212 * TODO:
213 * - figure out sane default for I/O queue depth reported to blkdev
214 * - FMA handling of media errors
215 * - support for devices supporting very large I/O requests using chained PRPs
216 * - support for configuring hardware parameters like interrupt coalescing
217 * - support for media formatting and hard partitioning into namespaces
218 * - support for big-endian systems
219 * - support for fast reboot
241 #include <sys/varargs.h>
242 #include <sys/cpuvar.h>
243 #include <sys/disp.h>
244 #include <sys/blkdev.h>
245 #include <sys/atomic.h>
246 #include <sys/archsystm.h>
247 #include <sys/sata/sata_hba.h>
248 #include <sys/stat.h>
249 #include <sys/policy.h>
250 #include <sys/list.h>
251
252 #include <sys/nvme.h>
253
254 #ifdef __x86
255 #include <sys/x86_archext.h>
256 #endif
257
258 #include "nvme_reg.h"
259 #include "nvme_var.h"
260
261 /*
262 * Assertions to make sure that we've properly captured various aspects of the
263 * packed structures and haven't broken them during updates.
264 */
265 CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000);
266 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
267 CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
268 CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
269 CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
270 CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
271 CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);
272
273 CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000);
274 CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
275 CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
276 CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
277 CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);
278
279 CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000);
280 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
281 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);
282
283
284 /* NVMe spec version supported */
285 static const int nvme_version_major = 1;
286
287 /* tunable for admin command timeout in seconds, default is 1s */
288 int nvme_admin_cmd_timeout = 1;
289
290 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
291 int nvme_format_cmd_timeout = 600;
292
293 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
294 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
295 static int nvme_quiesce(dev_info_t *);
296 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
297 static int nvme_setup_interrupts(nvme_t *, int, int);
298 static void nvme_release_interrupts(nvme_t *);
299 static uint_t nvme_intr(caddr_t, caddr_t);
300
301 static void nvme_shutdown(nvme_t *, int, boolean_t);
302 static boolean_t nvme_reset(nvme_t *, boolean_t);
303 static int nvme_init(nvme_t *);
304 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
305 static void nvme_free_cmd(nvme_cmd_t *);
1088 return (EINVAL);
1089
1090 case NVME_CQE_SC_GEN_INV_FLD:
1091 /* Invalid Field in Command */
1092 if (!cmd->nc_dontpanic)
1093 dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1094 "programming error: invalid field in cmd %p",
1095 (void *)cmd);
1096 return (EIO);
1097
1098 case NVME_CQE_SC_GEN_ID_CNFL:
1099 /* Command ID Conflict */
1100 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1101 "cmd ID conflict in cmd %p", (void *)cmd);
1102 return (0);
1103
1104 case NVME_CQE_SC_GEN_INV_NS:
1105 /* Invalid Namespace or Format */
1106 if (!cmd->nc_dontpanic)
1107 dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1108 "programming error: invalid NS/format in cmd %p",
1109 (void *)cmd);
1110 return (EINVAL);
1111
1112 case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
1113 /* LBA Out Of Range */
1114 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1115 "LBA out of range in cmd %p", (void *)cmd);
1116 return (0);
1117
1118 /*
1119 * Non-fatal errors, handle gracefully.
1120 */
1121 case NVME_CQE_SC_GEN_DATA_XFR_ERR:
1122 /* Data Transfer Error (DMA) */
1123 /* TODO: post ereport */
1124 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
1125 if (cmd->nc_xfer != NULL)
1126 bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1127 return (EIO);
1128
1859 }
1860
1861 static int
1862 nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res,
1863 void **buf, size_t *bufsize)
1864 {
1865 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1866 int ret = EINVAL;
1867
1868 ASSERT(res != NULL);
1869
1870 if (bufsize != NULL)
1871 *bufsize = 0;
1872
1873 cmd->nc_sqid = 0;
1874 cmd->nc_callback = nvme_wakeup_cmd;
1875 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
1876 cmd->nc_sqe.sqe_cdw10 = feature;
1877 cmd->nc_sqe.sqe_cdw11 = *res;
1878
1879 /*
1880 * For some of the optional features there doesn't seem to be a method
1881 * of detecting whether they are supported other than using them. This will
1882 * cause an "Invalid Field in Command" error, which is normally considered
1883 * a programming error. Set the nc_dontpanic flag to override the panic
1884 * in nvme_check_generic_cmd_status().
1885 */
1886 switch (feature) {
1887 case NVME_FEAT_ARBITRATION:
1888 case NVME_FEAT_POWER_MGMT:
1889 case NVME_FEAT_TEMPERATURE:
1890 case NVME_FEAT_NQUEUES:
1891 case NVME_FEAT_INTR_COAL:
1892 case NVME_FEAT_INTR_VECT:
1893 case NVME_FEAT_WRITE_ATOM:
1894 case NVME_FEAT_ASYNC_EVENT:
1895 break;
1896
1897 case NVME_FEAT_ERROR:
1898 /*
1899 * The per-namespace Deallocated or Unwritten Logical Block
1900 * Error Enable (DULBE) feature was added after the initial NVMe
1901 * specification, but we currently only check this feature with an
1902 * NS ID of 0 (the controller itself), and some controllers report
1903 * an error for that. For the moment, override the
1904 * panic by setting the nc_dontpanic flag.
1905 */
1906 cmd->nc_dontpanic = B_TRUE;
1907 break;
1908
1909 case NVME_FEAT_WRITE_CACHE:
1910 if (!nvme->n_write_cache_present)
1911 goto fail;
1912 break;
1913
1914 case NVME_FEAT_LBA_RANGE:
1915 if (!nvme->n_lba_range_supported)
1916 goto fail;
1917
1918 cmd->nc_dontpanic = B_TRUE;
1919 cmd->nc_sqe.sqe_nsid = nsid;
1920 ASSERT(bufsize != NULL);
1921 *bufsize = NVME_LBA_RANGE_BUFSIZE;
1922 break;
1923
1924 case NVME_FEAT_AUTO_PST:
1925 if (!nvme->n_auto_pst_supported)
1926 goto fail;
1927
1928 ASSERT(bufsize != NULL);
1929 *bufsize = NVME_AUTO_PST_BUFSIZE;
1930 break;
1931
1932 case NVME_FEAT_PROGRESS:
1933 if (!nvme->n_progress_supported)
1934 goto fail;
1935
1936 cmd->nc_dontpanic = B_TRUE;
1937 break;
1938
1939 default:
1940 goto fail;
1941 }
1942
1943 if (bufsize != NULL && *bufsize != 0) {
1944 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
1945 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1946 dev_err(nvme->n_dip, CE_WARN,
1947 "!nvme_zalloc_dma failed for GET FEATURES");
1948 ret = ENOMEM;
1949 goto fail;
1950 }
1951
1952 if (cmd->nc_dma->nd_ncookie > 2) {
1953 dev_err(nvme->n_dip, CE_WARN,
1954 "!too many DMA cookies for GET FEATURES");
1955 atomic_inc_32(&nvme->n_too_many_cookies);
1956 ret = ENOMEM;
1957 goto fail;
1958 }
1959
1960 cmd->nc_sqe.sqe_dptr.d_prp[0] =
1961 cmd->nc_dma->nd_cookie.dmac_laddress;
1962 if (cmd->nc_dma->nd_ncookie > 1) {
1963 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
1964 &cmd->nc_dma->nd_cookie);
1965 cmd->nc_sqe.sqe_dptr.d_prp[1] =
1966 cmd->nc_dma->nd_cookie.dmac_laddress;
1967 }
1968 }
1969
1970 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
1971
1972 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1973 boolean_t known = B_TRUE;
1974
1975 /* Check if this is an unsupported optional feature */
1976 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1977 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) {
1978 switch (feature) {
1979 case NVME_FEAT_LBA_RANGE:
1980 nvme->n_lba_range_supported = B_FALSE;
1981 break;
1982 case NVME_FEAT_PROGRESS:
1983 nvme->n_progress_supported = B_FALSE;
1984 break;
1985 default:
1986 known = B_FALSE;
1987 break;
1988 }
1989 } else {
1990 known = B_FALSE;
1991 }
1992
1993 /* Report the error otherwise */
1994 if (!known) {
1995 dev_err(nvme->n_dip, CE_WARN,
1996 "!GET FEATURES %d failed with sct = %x, sc = %x",
1997 feature, cmd->nc_cqe.cqe_sf.sf_sct,
1998 cmd->nc_cqe.cqe_sf.sf_sc);
1999 }
2000
2001 goto fail;
2002 }
2003
2004 if (bufsize != NULL && *bufsize != 0) {
2005 ASSERT(buf != NULL);
2006 *buf = kmem_alloc(*bufsize, KM_SLEEP);
2007 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
2008 }
2009
2010 *res = cmd->nc_cqe.cqe_dw0;
2011
2012 fail:
2013 nvme_free_cmd(cmd);
2014 return (ret);
2015 }
2016
2017 static int
2018 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
2019 {
2020 nvme_write_cache_t nwc = { 0 };
2238 * performance. A value of 3 means "degraded", 0 is best.
2239 */
2240 last_rp = 3;
2241 for (int j = 0; j <= idns->id_nlbaf; j++) {
2242 if (idns->id_lbaf[j].lbaf_lbads == 0)
2243 break;
2244 if (idns->id_lbaf[j].lbaf_ms != 0)
2245 continue;
2246 if (idns->id_lbaf[j].lbaf_rp >= last_rp)
2247 continue;
2248 last_rp = idns->id_lbaf[j].lbaf_rp;
2249 ns->ns_best_block_size =
2250 1 << idns->id_lbaf[j].lbaf_lbads;
2251 }
2252
2253 if (ns->ns_best_block_size < nvme->n_min_block_size)
2254 ns->ns_best_block_size = nvme->n_min_block_size;
2255
2256 /*
2257 * We currently don't support namespaces that use either:
2258 * - protection information
2259 * - illegal block size (< 512)
2260 */
2261 if (idns->id_dps.dp_pinfo) {
2262 dev_err(nvme->n_dip, CE_WARN,
2263 "!ignoring namespace %d, unsupported feature: "
2264 "pinfo = %d", nsid, idns->id_dps.dp_pinfo);
2265 ns->ns_ignore = B_TRUE;
2266 } else if (ns->ns_block_size < 512) {
2267 dev_err(nvme->n_dip, CE_WARN,
2268 "!ignoring namespace %d, unsupported block size %"PRIu64,
2269 nsid, (uint64_t)ns->ns_block_size);
2270 ns->ns_ignore = B_TRUE;
2271 } else {
2272 ns->ns_ignore = B_FALSE;
2273 }
2274
2275 return (DDI_SUCCESS);
2276 }
2277
2278 static int
2279 nvme_init(nvme_t *nvme)
2280 {
2281 nvme_reg_cc_t cc = { 0 };
2282 nvme_reg_aqa_t aqa = { 0 };
2283 nvme_reg_asq_t asq = { 0 };
2284 nvme_reg_acq_t acq = { 0 };
2285 nvme_reg_cap_t cap;
2286 nvme_reg_vs_t vs;
2287 nvme_reg_csts_t csts;
2288 int i = 0;
2289 uint16_t nqueues;
2290 char model[sizeof (nvme->n_idctl->id_model) + 1];
2291 char *vendor, *product;
2292
2293 /* Check controller version */
2294 vs.r = nvme_get32(nvme, NVME_REG_VS);
2295 nvme->n_version.v_major = vs.b.vs_mjr;
2296 nvme->n_version.v_minor = vs.b.vs_mnr;
2297 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
2298 nvme->n_version.v_major, nvme->n_version.v_minor);
2299
2300 if (nvme->n_version.v_major > nvme_version_major) {
2301 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x",
2302 nvme_version_major);
2303 if (nvme->n_strict_version)
2304 goto fail;
2305 }
2306
2307 /* retrieve controller capabilities */
2308 cap.r = nvme_get64(nvme, NVME_REG_CAP);
2309
2310 if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
2311 dev_err(nvme->n_dip, CE_WARN,
2312 "!NVM command set not supported by hardware");
2313 goto fail;
2314 }
2315
2316 nvme->n_nssr_supported = cap.b.cap_nssrs;
2317 nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
2318 nvme->n_timeout = cap.b.cap_to;
2319 nvme->n_arbitration_mechanisms = cap.b.cap_ams;
2320 nvme->n_cont_queues_reqd = cap.b.cap_cqr;
2321 nvme->n_max_queue_entries = cap.b.cap_mqes + 1;
2322
2556 }
2557
2558 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2559 "volatile-write-cache-enable",
2560 nvme->n_write_cache_enabled ? 1 : 0);
2561
2562 /*
2563 * Assume LBA Range Type feature is supported. If it isn't this
2564 * will be set to B_FALSE by nvme_get_features().
2565 */
2566 nvme->n_lba_range_supported = B_TRUE;
2567
2568 /*
2569 * Check support for Autonomous Power State Transition.
2570 */
2571 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
2572 nvme->n_auto_pst_supported =
2573 nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE;
2574
2575 /*
2576 * Assume Software Progress Marker feature is supported. If it isn't
2577 * this will be set to B_FALSE by nvme_get_features().
2578 */
2579 nvme->n_progress_supported = B_TRUE;
2580
2581 /*
2582 * Identify Namespaces
2583 */
2584 nvme->n_namespace_count = nvme->n_idctl->id_nn;
2585
2586 if (nvme->n_namespace_count == 0) {
2587 dev_err(nvme->n_dip, CE_WARN,
2588 "!controllers without namespaces are not supported");
2589 goto fail;
2590 }
2591
2592 if (nvme->n_namespace_count > NVME_MINOR_MAX) {
2593 dev_err(nvme->n_dip, CE_WARN,
2594 "!too many namespaces: %d, limiting to %d\n",
2595 nvme->n_namespace_count, NVME_MINOR_MAX);
2596 nvme->n_namespace_count = NVME_MINOR_MAX;
2597 }
2598
2599 nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
2600 nvme->n_namespace_count, KM_SLEEP);
2601
2602 for (i = 0; i != nvme->n_namespace_count; i++) {
2603 mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER,
2604 NULL);
2605 if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS)
2606 goto fail;
2607 }
2608
2609 /*
2610 * Try to set up MSI/MSI-X interrupts.
2611 */