1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2018 Nexenta Systems, Inc.
  14  * Copyright 2016 Tegile Systems, Inc. All rights reserved.
  15  * Copyright (c) 2016 The MathWorks, Inc.  All rights reserved.
  16  * Copyright 2020 Joyent, Inc.
  17  * Copyright 2019 Western Digital Corporation.
  18  * Copyright 2020 Racktop Systems.
  19  */
  20 
  21 /*
  22  * blkdev driver for NVMe compliant storage devices
  23  *
  24  * This driver was written to conform to version 1.2.1 of the NVMe
  25  * specification.  It may work with newer versions, but that is completely
  26  * untested and disabled by default.
  27  *
  28  * The driver has only been tested on x86 systems and will not work on big-
  29  * endian systems without changes to the code accessing registers and data
  30  * structures used by the hardware.
  31  *
  32  *
  33  * Interrupt Usage:
  34  *
  35  * The driver will use a single interrupt while configuring the device as the
  36  * specification requires, but contrary to the specification it will try to use
  37  * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
  38  * will switch to multiple-message MSI(-X) if supported. The driver wants to
  39  * have one interrupt vector per CPU, but it will work correctly if fewer are
  40  * available. Interrupts can be shared by queues; the interrupt handler will
  41  * iterate through the I/O queue array in steps of n_intr_cnt. Usually only
  42  * the admin queue will share an interrupt with one I/O queue. The interrupt
  43  * handler will retrieve completed commands from all queues sharing an interrupt
  44  * vector and will post them to a taskq for completion processing.
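 *
 * For example (illustrative only): with n_intr_cnt == 4, the handler for
 * interrupt vector 1 services queue array entries 1, 5, 9, and so on.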
  45  *
  46  *
  47  * Command Processing:
  48  *
  49  * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
  50  * to 65536 I/O commands. The driver will configure one I/O queue pair per
  51  * available interrupt vector, with the queue length usually much smaller than
  52  * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
  53  * interrupt vectors will be used.
  54  *
  55  * Additionally the hardware provides a single special admin queue pair that can
  56  * hold up to 4096 admin commands.
  57  *
  58  * From the hardware perspective both queues of a queue pair are independent,
  59  * but they share some driver state: the command array (holding pointers to
  60  * commands currently being processed by the hardware) and the active command
  61  * counter. Access to a submission queue and the shared state is protected by
  62  * nq_mutex; the completion queue is protected by ncq_mutex.
  63  *
  64  * When a command is submitted to a queue pair the active command counter is
  65  * incremented and a pointer to the command is stored in the command array. The
  66  * array index is used as command identifier (CID) in the submission queue
  67  * entry. Some commands may take a very long time to complete, and if the queue
  68  * wraps around in that time a submission may find the next array slot to still
  69  * be used by a long-running command. In this case the array is sequentially
  70  * searched for the next free slot. The length of the command array is the same
  71  * as the configured queue length. Queue overrun is prevented by the per-queue
  72  * semaphore (nq_sema), so a command submission may block if the queue is full.
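 *
 * As an illustration: if the queue has wrapped around and slot 3 is still
 * occupied by a long-running command, a new submission skips ahead to the next
 * free slot (4, 5, ...), and that slot's index becomes the new command's CID.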
  73  *
  74  *
  75  * Polled I/O Support:
  76  *
  77  * For kernel core dump support the driver can do polled I/O. As interrupts are
  78  * turned off while dumping the driver will just submit a command in the regular
  79  * way, and then repeatedly attempt a command retrieval until it gets the
  80  * command back.
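 *
 * Sketch of the polled path (conceptual, not literal code): submit the command
 * via nvme_submit_io_cmd(), then repeatedly call nvme_retrieve_cmd() on the
 * same queue pair until it returns the submitted command.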
  81  *
  82  *
  83  * Namespace Support:
  84  *
  85  * NVMe devices can have multiple namespaces, each being an independent data
  86  * store. The driver supports multiple namespaces and creates a blkdev interface
  87  * for each namespace found. Namespaces can have various attributes to support
  88  * protection information. This driver does not support any of this and ignores
  89  * namespaces that have these attributes.
  90  *
  91  * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
  92  * (EUI64). This driver uses the EUI64 if present to generate the devid and
  93  * passes it to blkdev to use it in the device node names. As this is currently
  94  * untested, namespaces with EUI64 are ignored by default.
  95  *
  96  * We currently support only (1 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
  97  * single controller. This is an artificial limit imposed by the driver to be
  98  * able to address a reasonable number of controllers and namespaces using a
  99  * 32bit minor node number.
 100  *
 101  *
 102  * Minor nodes:
 103  *
 104  * For each NVMe device the driver exposes one minor node for the controller and
 105  * one minor node for each namespace. The only operations supported by those
 106  * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
 107  * interface for the nvmeadm(1M) utility.
 108  *
 109  *
 110  * Blkdev Interface:
 111  *
 112  * This driver uses blkdev to do all the heavy lifting involved with presenting
 113  * a disk device to the system. As a result, the processing of I/O requests is
 114  * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
 115  * setup, and splitting of transfers into manageable chunks.
 116  *
 117  * I/O requests coming in from blkdev are turned into NVM commands and posted to
 118  * an I/O queue. The queue is selected by taking the CPU id modulo the number of
 119  * queues. There is currently no timeout handling of I/O commands.
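 *
 * For example (illustrative only): with 8 I/O queues, a request issued from
 * CPU 13 is placed on I/O queue 13 % 8 = 5.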
 120  *
 121  * Blkdev also supports querying device/media information and generating a
 122  * devid. The driver reports the best block size as determined by the namespace
 123  * format back to blkdev as physical block size to support partition and block
 124  * alignment. The devid is either based on the namespace EUI64, if present, or
 125  * composed using the device vendor ID, model number, serial number, and the
 126  * namespace ID.
 127  *
 128  *
 129  * Error Handling:
 130  *
 131  * Error handling is currently limited to detecting fatal hardware errors,
 132  * either by asynchronous events, or synchronously through command status or
 133  * admin command timeouts. In case of severe errors the device is fenced off and
 134  * all further requests will return EIO. FMA is then called to fault the device.
 135  *
 136  * The hardware has a limit for outstanding asynchronous event requests. Before
 137  * this limit is known the driver assumes it is at least 1 and posts a single
 138  * asynchronous request. Later when the limit is known more asynchronous event
 139  * requests are posted to allow quicker reception of error information. When an
 140  * asynchronous event is posted by the hardware the driver will parse the error
 141  * status fields and log information or fault the device, depending on the
 142  * severity of the asynchronous event. The asynchronous event request is then
 143  * reused and posted to the admin queue again.
 144  *
 145  * On command completion the command status is checked for errors. In case of
 146  * errors indicating a driver bug the driver panics. Almost all other error
 147  * status values just cause EIO to be returned.
 148  *
 149  * Command timeouts are currently detected for all admin commands except
 150  * asynchronous event requests. If a command times out and the hardware appears
 151  * to be healthy the driver attempts to abort the command. The original command
 152  * timeout is also applied to the abort command. If the abort also times out,
 153  * the driver assumes the device is dead, fences it off, and calls FMA to retire
 154  * it. In all other cases the aborted command should return immediately with a
 155  * status indicating it was aborted, and the driver will wait indefinitely for
 156  * that to happen. No timeout handling of normal I/O commands is presently done.
 157  *
 158  * Any command that times out due to the controller dropping dead will be put on
 159  * the nvme_lost_cmds list if it references DMA memory. This prevents the DMA
 160  * memory from being reused by the system and later being written to by a "dead"
 161  * NVMe controller.
 162  *
 163  *
 164  * Locking:
 165  *
 166  * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held
 167  * when accessing shared state and submission queue registers, ncq_mutex
 168  * is held when accessing completion queue state and registers.
 169  * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
 170  * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
 171  * mutexes themselves.
 172  *
 173  * Each command also has its own nc_mutex, which is associated with the
 174  * condition variable nc_cv. It is only used on admin commands which are run
 175  * synchronously. In that case it must be held across calls to
 176  * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
 177  * nvme_admin_cmd(). It must also be held whenever the completion state of the
 178  * command is changed or while an admin command timeout is handled.
 179  *
 180  * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
 181  * More than one nc_mutex may only be held when aborting commands. In this case,
 182  * the nc_mutex of the command to be aborted must be held across the call to
 183  * nvme_abort_cmd() to prevent the command from completing while the abort is in
 184  * progress.
 185  *
 186  * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
 187  * acquired first. More than one nq_mutex is never held by a single thread.
 188  * The ncq_mutex is only held by nvme_retrieve_cmd() and
 189  * nvme_process_iocq(). nvme_process_iocq() is only called from the
 190  * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
 191  * mutex is non-contentious but is required for implementation completeness
 192  * and safety.
 193  *
 194  * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
 195  * and exclusive-open flag nm_oexcl.
 196  *
 197  *
 198  * Quiesce / Fast Reboot:
 199  *
 200  * The driver currently does not support fast reboot. A quiesce(9E) entry point
 201  * is still provided which is used to send a shutdown notification to the
 202  * device.
 203  *
 204  *
 205  * DDI UFM Support:
 206  *
 207  * The driver supports the DDI UFM framework for reporting information about
 208  * the device's firmware image and slot configuration. This data can be
 209  * queried by userland software via ioctls to the ufm driver. For more
 210  * information, see ddi_ufm(9E).
 211  *
 212  *
 213  * Driver Configuration:
 214  *
 215  * The following driver properties can be changed to control some aspects of the
 216  * driver's operation (an illustrative configuration example follows the list):
 217  * - strict-version: can be set to 0 to allow devices conforming to newer
 218  *   major versions to be used
 219  * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
 220  *   specific command status as a fatal error leading to device faulting
 221  * - admin-queue-len: the maximum length of the admin queue (16-4096)
 222  * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
 223  * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
 224  * - async-event-limit: the maximum number of asynchronous event requests to be
 225  *   posted by the driver
 226  * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 227  *   cache
 228  * - min-phys-block-size: the minimum physical block size to report to blkdev,
 229  *   which is among other things the basis for ZFS vdev ashift
 230  * - max-submission-queues: the maximum number of I/O submission queues.
 231  * - max-completion-queues: the maximum number of I/O completion queues,
 232  *   can be less than max-submission-queues, in which case the completion
 233  *   queues are shared.
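 *
 * For illustration only, a hypothetical driver.conf fragment using some of
 * these properties might look like this (example values, not recommendations):
 *
 *     volatile-write-cache-enable=0;
 *     min-phys-block-size=4096;
 *     max-submission-queues=8;
 *     max-completion-queues=4;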
 234  *
 235  *
 236  * TODO:
 237  * - figure out sane default for I/O queue depth reported to blkdev
 238  * - FMA handling of media errors
 239  * - support for devices supporting very large I/O requests using chained PRPs
 240  * - support for configuring hardware parameters like interrupt coalescing
 241  * - support for media formatting and hard partitioning into namespaces
 242  * - support for big-endian systems
 243  * - support for fast reboot
 244  * - support for NVMe Subsystem Reset (1.1)
 245  * - support for Scatter/Gather lists (1.1)
 246  * - support for Reservations (1.1)
 247  * - support for power management
 248  */
 249 
 250 #include <sys/byteorder.h>
 251 #ifdef _BIG_ENDIAN
 252 #error nvme driver needs porting for big-endian platforms
 253 #endif
 254 
 255 #include <sys/modctl.h>
 256 #include <sys/conf.h>
 257 #include <sys/devops.h>
 258 #include <sys/ddi.h>
 259 #include <sys/ddi_ufm.h>
 260 #include <sys/sunddi.h>
 261 #include <sys/sunndi.h>
 262 #include <sys/bitmap.h>
 263 #include <sys/sysmacros.h>
 264 #include <sys/param.h>
 265 #include <sys/varargs.h>
 266 #include <sys/cpuvar.h>
 267 #include <sys/disp.h>
 268 #include <sys/blkdev.h>
 269 #include <sys/atomic.h>
 270 #include <sys/archsystm.h>
 271 #include <sys/sata/sata_hba.h>
 272 #include <sys/stat.h>
 273 #include <sys/policy.h>
 274 #include <sys/list.h>
 275 #include <sys/dkio.h>
 276 
 277 #include <sys/nvme.h>
 278 
 279 #ifdef __x86
 280 #include <sys/x86_archext.h>
 281 #endif
 282 
 283 #include "nvme_reg.h"
 284 #include "nvme_var.h"
 285 
 286 /*
 287  * Assertions to make sure that we've properly captured various aspects of the
 288  * packed structures and haven't broken them during updates.
 289  */
 290 CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000);
 291 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
 292 CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
 293 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520);
 294 CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
 295 CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
 296 CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
 297 CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);
 298 
 299 CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000);
 300 CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
 301 CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92);
 302 CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
 303 CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
 304 CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);
 305 
 306 CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000);
 307 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
 308 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);
 309 
 310 
 311 /* NVMe spec version supported */
 312 static const int nvme_version_major = 1;
 313 
 314 /* tunable for admin command timeout in seconds, default is 1s */
 315 int nvme_admin_cmd_timeout = 1;
 316 
 317 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
 318 int nvme_format_cmd_timeout = 600;
 319 
 320 /* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
 321 int nvme_commit_save_cmd_timeout = 15;
 322 
 323 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
 324 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
 325 static int nvme_quiesce(dev_info_t *);
 326 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
 327 static int nvme_setup_interrupts(nvme_t *, int, int);
 328 static void nvme_release_interrupts(nvme_t *);
 329 static uint_t nvme_intr(caddr_t, caddr_t);
 330 
 331 static void nvme_shutdown(nvme_t *, int, boolean_t);
 332 static boolean_t nvme_reset(nvme_t *, boolean_t);
 333 static int nvme_init(nvme_t *);
 334 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 335 static void nvme_free_cmd(nvme_cmd_t *);
 336 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
 337     bd_xfer_t *);
 338 static void nvme_admin_cmd(nvme_cmd_t *, int);
 339 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
 340 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
 341 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
 342 static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
 343 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
 344 static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
 345 static void nvme_wakeup_cmd(void *);
 346 static void nvme_async_event_task(void *);
 347 
 348 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
 349 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
 350 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
 351 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
 352 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
 353 static inline int nvme_check_cmd_status(nvme_cmd_t *);
 354 
 355 static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
 356 static void nvme_async_event(nvme_t *);
 357 static int nvme_format_nvm(nvme_t *, boolean_t, uint32_t, uint8_t, boolean_t,
 358     uint8_t, boolean_t, uint8_t);
 359 static int nvme_get_logpage(nvme_t *, boolean_t, void **, size_t *, uint8_t,
 360     ...);
 361 static int nvme_identify(nvme_t *, boolean_t, uint32_t, void **);
 362 static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
 363     uint32_t *);
 364 static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t *,
 365     void **, size_t *);
 366 static int nvme_write_cache_set(nvme_t *, boolean_t);
 367 static int nvme_set_nqueues(nvme_t *);
 368 
 369 static void nvme_free_dma(nvme_dma_t *);
 370 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
 371     nvme_dma_t **);
 372 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
 373     nvme_dma_t **);
 374 static void nvme_free_qpair(nvme_qpair_t *);
 375 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
 376 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
 377 
 378 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
 379 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
 380 static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
 381 static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
 382 
 383 static boolean_t nvme_check_regs_hdl(nvme_t *);
 384 static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
 385 
 386 static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);
 387 
 388 static void nvme_bd_xfer_done(void *);
 389 static void nvme_bd_driveinfo(void *, bd_drive_t *);
 390 static int nvme_bd_mediainfo(void *, bd_media_t *);
 391 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
 392 static int nvme_bd_read(void *, bd_xfer_t *);
 393 static int nvme_bd_write(void *, bd_xfer_t *);
 394 static int nvme_bd_sync(void *, bd_xfer_t *);
 395 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
 396 static int nvme_bd_free_space(void *, bd_xfer_t *);
 397 
 398 static int nvme_prp_dma_constructor(void *, void *, int);
 399 static void nvme_prp_dma_destructor(void *, void *);
 400 
 401 static void nvme_prepare_devid(nvme_t *, uint32_t);
 402 
 403 /* DDI UFM callbacks */
 404 static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
 405     ddi_ufm_image_t *);
 406 static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
 407     ddi_ufm_slot_t *);
 408 static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);
 409 
 410 static int nvme_open(dev_t *, int, int, cred_t *);
 411 static int nvme_close(dev_t, int, int, cred_t *);
 412 static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
 413 
 414 static ddi_ufm_ops_t nvme_ufm_ops = {
 415         NULL,
 416         nvme_ufm_fill_image,
 417         nvme_ufm_fill_slot,
 418         nvme_ufm_getcaps
 419 };
 420 
 421 #define NVME_MINOR_INST_SHIFT   9
 422 #define NVME_MINOR(inst, nsid)  (((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
 423 #define NVME_MINOR_INST(minor)  ((minor) >> NVME_MINOR_INST_SHIFT)
 424 #define NVME_MINOR_NSID(minor)  ((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
 425 #define NVME_MINOR_MAX          (NVME_MINOR(1, 0) - 2)
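
/*
 * Worked example (for illustration): with NVME_MINOR_INST_SHIFT == 9,
 * NVME_MINOR(2, 1) == (2 << 9) | 1 == 1025, which decodes back to
 * NVME_MINOR_INST(1025) == 2 and NVME_MINOR_NSID(1025) == 1.
 */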
 426 
 427 static void *nvme_state;
 428 static kmem_cache_t *nvme_cmd_cache;
 429 
 430 /*
 431  * DMA attributes for queue DMA memory
 432  *
 433  * Queue DMA memory must be page aligned. The maximum length of a queue is
 434  * 65536 entries, and an entry can be 64 bytes long.
 435  */
 436 static ddi_dma_attr_t nvme_queue_dma_attr = {
 437         .dma_attr_version       = DMA_ATTR_V0,
 438         .dma_attr_addr_lo       = 0,
 439         .dma_attr_addr_hi       = 0xffffffffffffffffULL,
 440         .dma_attr_count_max     = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
 441         .dma_attr_align         = 0x1000,
 442         .dma_attr_burstsizes    = 0x7ff,
 443         .dma_attr_minxfer       = 0x1000,
 444         .dma_attr_maxxfer       = (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
 445         .dma_attr_seg           = 0xffffffffffffffffULL,
 446         .dma_attr_sgllen        = 1,
 447         .dma_attr_granular      = 1,
 448         .dma_attr_flags         = 0,
 449 };
 450 
 451 /*
 452  * DMA attributes for transfers using Physical Region Page (PRP) entries
 453  *
 454  * A PRP entry describes one page of DMA memory using the page size specified
 455  * in the controller configuration's memory page size register (CC.MPS). It uses
 456  * a 64bit base address aligned to this page size. There is no limitation on
 457  * chaining PRPs together for arbitrarily large DMA transfers.
 458  */
 459 static ddi_dma_attr_t nvme_prp_dma_attr = {
 460         .dma_attr_version       = DMA_ATTR_V0,
 461         .dma_attr_addr_lo       = 0,
 462         .dma_attr_addr_hi       = 0xffffffffffffffffULL,
 463         .dma_attr_count_max     = 0xfff,
 464         .dma_attr_align         = 0x1000,
 465         .dma_attr_burstsizes    = 0x7ff,
 466         .dma_attr_minxfer       = 0x1000,
 467         .dma_attr_maxxfer       = 0x1000,
 468         .dma_attr_seg           = 0xfff,
 469         .dma_attr_sgllen        = -1,
 470         .dma_attr_granular      = 1,
 471         .dma_attr_flags         = 0,
 472 };
 473 
 474 /*
 475  * DMA attributes for transfers using scatter/gather lists
 476  *
 477  * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
 478  * 32bit length field. SGL Segment and SGL Last Segment entries require the
 479  * length to be a multiple of 16 bytes.
 480  */
 481 static ddi_dma_attr_t nvme_sgl_dma_attr = {
 482         .dma_attr_version       = DMA_ATTR_V0,
 483         .dma_attr_addr_lo       = 0,
 484         .dma_attr_addr_hi       = 0xffffffffffffffffULL,
 485         .dma_attr_count_max     = 0xffffffffUL,
 486         .dma_attr_align         = 1,
 487         .dma_attr_burstsizes    = 0x7ff,
 488         .dma_attr_minxfer       = 0x10,
 489         .dma_attr_maxxfer       = 0xfffffffffULL,
 490         .dma_attr_seg           = 0xffffffffffffffffULL,
 491         .dma_attr_sgllen        = -1,
 492         .dma_attr_granular      = 0x10,
 493         .dma_attr_flags         = 0
 494 };
 495 
 496 static ddi_device_acc_attr_t nvme_reg_acc_attr = {
 497         .devacc_attr_version    = DDI_DEVICE_ATTR_V0,
 498         .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
 499         .devacc_attr_dataorder  = DDI_STRICTORDER_ACC
 500 };
 501 
 502 static struct cb_ops nvme_cb_ops = {
 503         .cb_open        = nvme_open,
 504         .cb_close       = nvme_close,
 505         .cb_strategy    = nodev,
 506         .cb_print       = nodev,
 507         .cb_dump        = nodev,
 508         .cb_read        = nodev,
 509         .cb_write       = nodev,
 510         .cb_ioctl       = nvme_ioctl,
 511         .cb_devmap      = nodev,
 512         .cb_mmap        = nodev,
 513         .cb_segmap      = nodev,
 514         .cb_chpoll      = nochpoll,
 515         .cb_prop_op     = ddi_prop_op,
 516         .cb_str         = 0,
 517         .cb_flag        = D_NEW | D_MP,
 518         .cb_rev         = CB_REV,
 519         .cb_aread       = nodev,
 520         .cb_awrite      = nodev
 521 };
 522 
 523 static struct dev_ops nvme_dev_ops = {
 524         .devo_rev       = DEVO_REV,
 525         .devo_refcnt    = 0,
 526         .devo_getinfo   = ddi_no_info,
 527         .devo_identify  = nulldev,
 528         .devo_probe     = nulldev,
 529         .devo_attach    = nvme_attach,
 530         .devo_detach    = nvme_detach,
 531         .devo_reset     = nodev,
 532         .devo_cb_ops    = &nvme_cb_ops,
 533         .devo_bus_ops   = NULL,
 534         .devo_power     = NULL,
 535         .devo_quiesce   = nvme_quiesce,
 536 };
 537 
 538 static struct modldrv nvme_modldrv = {
 539         .drv_modops     = &mod_driverops,
 540         .drv_linkinfo   = "NVMe v1.1b",
 541         .drv_dev_ops    = &nvme_dev_ops
 542 };
 543 
 544 static struct modlinkage nvme_modlinkage = {
 545         .ml_rev         = MODREV_1,
 546         .ml_linkage     = { &nvme_modldrv, NULL }
 547 };
 548 
 549 static bd_ops_t nvme_bd_ops = {
 550         .o_version      = BD_OPS_CURRENT_VERSION,
 551         .o_drive_info   = nvme_bd_driveinfo,
 552         .o_media_info   = nvme_bd_mediainfo,
 553         .o_devid_init   = nvme_bd_devid,
 554         .o_sync_cache   = nvme_bd_sync,
 555         .o_read         = nvme_bd_read,
 556         .o_write        = nvme_bd_write,
 557         .o_free_space   = nvme_bd_free_space,
 558 };
 559 
 560 /*
 561  * This list will hold commands that have timed out and couldn't be aborted.
 562  * As we don't know what the hardware may still do with the DMA memory we can't
 563  * free them, so we'll keep them forever on this list where we can easily look
 564  * at them with mdb.
 565  */
 566 static struct list nvme_lost_cmds;
 567 static kmutex_t nvme_lc_mutex;
 568 
 569 int
 570 _init(void)
 571 {
 572         int error;
 573 
 574         error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
 575         if (error != DDI_SUCCESS)
 576                 return (error);
 577 
 578         nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
 579             sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
 580 
 581         mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
 582         list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
 583             offsetof(nvme_cmd_t, nc_list));
 584 
 585         bd_mod_init(&nvme_dev_ops);
 586 
 587         error = mod_install(&nvme_modlinkage);
 588         if (error != DDI_SUCCESS) {
 589                 ddi_soft_state_fini(&nvme_state);
                kmem_cache_destroy(nvme_cmd_cache);
 590                 mutex_destroy(&nvme_lc_mutex);
 591                 list_destroy(&nvme_lost_cmds);
 592                 bd_mod_fini(&nvme_dev_ops);
 593         }
 594 
 595         return (error);
 596 }
 597 
 598 int
 599 _fini(void)
 600 {
 601         int error;
 602 
 603         if (!list_is_empty(&nvme_lost_cmds))
 604                 return (DDI_FAILURE);
 605 
 606         error = mod_remove(&nvme_modlinkage);
 607         if (error == DDI_SUCCESS) {
 608                 ddi_soft_state_fini(&nvme_state);
 609                 kmem_cache_destroy(nvme_cmd_cache);
 610                 mutex_destroy(&nvme_lc_mutex);
 611                 list_destroy(&nvme_lost_cmds);
 612                 bd_mod_fini(&nvme_dev_ops);
 613         }
 614 
 615         return (error);
 616 }
 617 
 618 int
 619 _info(struct modinfo *modinfop)
 620 {
 621         return (mod_info(&nvme_modlinkage, modinfop));
 622 }
 623 
 624 static inline void
 625 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
 626 {
 627         ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
 628 
 629         /*LINTED: E_BAD_PTR_CAST_ALIGN*/
 630         ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
 631 }
 632 
 633 static inline void
 634 nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
 635 {
 636         ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
 637 
 638         /*LINTED: E_BAD_PTR_CAST_ALIGN*/
 639         ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
 640 }
 641 
 642 static inline uint64_t
 643 nvme_get64(nvme_t *nvme, uintptr_t reg)
 644 {
 645         uint64_t val;
 646 
 647         ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
 648 
 649         /*LINTED: E_BAD_PTR_CAST_ALIGN*/
 650         val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));
 651 
 652         return (val);
 653 }
 654 
 655 static inline uint32_t
 656 nvme_get32(nvme_t *nvme, uintptr_t reg)
 657 {
 658         uint32_t val;
 659 
 660         ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
 661 
 662         /*LINTED: E_BAD_PTR_CAST_ALIGN*/
 663         val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));
 664 
 665         return (val);
 666 }
 667 
 668 static boolean_t
 669 nvme_check_regs_hdl(nvme_t *nvme)
 670 {
 671         ddi_fm_error_t error;
 672 
 673         ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);
 674 
 675         if (error.fme_status != DDI_FM_OK)
 676                 return (B_TRUE);
 677 
 678         return (B_FALSE);
 679 }
 680 
 681 static boolean_t
 682 nvme_check_dma_hdl(nvme_dma_t *dma)
 683 {
 684         ddi_fm_error_t error;
 685 
 686         if (dma == NULL)
 687                 return (B_FALSE);
 688 
 689         ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);
 690 
 691         if (error.fme_status != DDI_FM_OK)
 692                 return (B_TRUE);
 693 
 694         return (B_FALSE);
 695 }
 696 
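/*
 * Common DMA teardown: unbind the DMA handle (if bound), free the DMA memory,
 * and finally free the handle itself. Each step is guarded by a NULL check, so
 * this is safe to call on a partially initialized nvme_dma_t.
 */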
 697 static void
 698 nvme_free_dma_common(nvme_dma_t *dma)
 699 {
 700         if (dma->nd_dmah != NULL)
 701                 (void) ddi_dma_unbind_handle(dma->nd_dmah);
 702         if (dma->nd_acch != NULL)
 703                 ddi_dma_mem_free(&dma->nd_acch);
 704         if (dma->nd_dmah != NULL)
 705                 ddi_dma_free_handle(&dma->nd_dmah);
 706 }
 707 
 708 static void
 709 nvme_free_dma(nvme_dma_t *dma)
 710 {
 711         nvme_free_dma_common(dma);
 712         kmem_free(dma, sizeof (*dma));
 713 }
 714 
 715 /* ARGSUSED */
 716 static void
 717 nvme_prp_dma_destructor(void *buf, void *private)
 718 {
 719         nvme_dma_t *dma = (nvme_dma_t *)buf;
 720 
 721         nvme_free_dma_common(dma);
 722 }
 723 
 724 static int
 725 nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
 726     size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
 727 {
 728         if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
 729             &dma->nd_dmah) != DDI_SUCCESS) {
 730                 /*
 731                  * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
 732                  * the only other possible error is DDI_DMA_BADATTR which
 733                  * indicates a driver bug which should cause a panic.
 734                  */
 735                 dev_err(nvme->n_dip, CE_PANIC,
 736                     "!failed to get DMA handle, check DMA attributes");
 737                 return (DDI_FAILURE);
 738         }
 739 
 740         /*
 741          * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
 742          * or the flags are conflicting, which isn't the case here.
 743          */
 744         (void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
 745             DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
 746             &dma->nd_len, &dma->nd_acch);
 747 
 748         if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
 749             dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
 750             &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
 751                 dev_err(nvme->n_dip, CE_WARN,
 752                     "!failed to bind DMA memory");
 753                 atomic_inc_32(&nvme->n_dma_bind_err);
 754                 nvme_free_dma_common(dma);
 755                 return (DDI_FAILURE);
 756         }
 757 
 758         return (DDI_SUCCESS);
 759 }
 760 
 761 static int
 762 nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
 763     ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
 764 {
 765         nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);
 766 
 767         if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
 768             DDI_SUCCESS) {
 769                 *ret = NULL;
 770                 kmem_free(dma, sizeof (nvme_dma_t));
 771                 return (DDI_FAILURE);
 772         }
 773 
 774         bzero(dma->nd_memp, dma->nd_len);
 775 
 776         *ret = dma;
 777         return (DDI_SUCCESS);
 778 }
 779 
 780 /* ARGSUSED */
 781 static int
 782 nvme_prp_dma_constructor(void *buf, void *private, int flags)
 783 {
 784         nvme_dma_t *dma = (nvme_dma_t *)buf;
 785         nvme_t *nvme = (nvme_t *)private;
 786 
 787         dma->nd_dmah = NULL;
 788         dma->nd_acch = NULL;
 789 
 790         if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
 791             DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
 792                 return (-1);
 793         }
 794 
 795         ASSERT(dma->nd_ncookie == 1);
 796 
 797         dma->nd_cached = B_TRUE;
 798 
 799         return (0);
 800 }
 801 
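/*
 * Allocate zeroed DMA memory for a queue of nentry entries of qe_len bytes
 * each, rounded up to the page size (n_pagesize). The memory must bind to a
 * single DMA cookie; otherwise the allocation is released and DDI_FAILURE is
 * returned.
 */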
 802 static int
 803 nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
 804     uint_t flags, nvme_dma_t **dma)
 805 {
 806         uint32_t len = nentry * qe_len;
 807         ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;
 808 
 809         len = roundup(len, nvme->n_pagesize);
 810 
 811         if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
 812             != DDI_SUCCESS) {
 813                 dev_err(nvme->n_dip, CE_WARN,
 814                     "!failed to get DMA memory for queue");
 815                 goto fail;
 816         }
 817 
 818         if ((*dma)->nd_ncookie != 1) {
 819                 dev_err(nvme->n_dip, CE_WARN,
 820                     "!got too many cookies for queue DMA");
 821                 goto fail;
 822         }
 823 
 824         return (DDI_SUCCESS);
 825 
 826 fail:
 827         if (*dma) {
 828                 nvme_free_dma(*dma);
 829                 *dma = NULL;
 830         }
 831 
 832         return (DDI_FAILURE);
 833 }
 834 
 835 static void
 836 nvme_free_cq(nvme_cq_t *cq)
 837 {
 838         mutex_destroy(&cq->ncq_mutex);
 839 
 840         if (cq->ncq_cmd_taskq != NULL)
 841                 taskq_destroy(cq->ncq_cmd_taskq);
 842 
 843         if (cq->ncq_dma != NULL)
 844                 nvme_free_dma(cq->ncq_dma);
 845 
 846         kmem_free(cq, sizeof (*cq));
 847 }
 848 
 849 static void
 850 nvme_free_qpair(nvme_qpair_t *qp)
 851 {
 852         int i;
 853 
 854         mutex_destroy(&qp->nq_mutex);
 855         sema_destroy(&qp->nq_sema);
 856 
 857         if (qp->nq_sqdma != NULL)
 858                 nvme_free_dma(qp->nq_sqdma);
 859 
 860         if (qp->nq_active_cmds > 0)
 861                 for (i = 0; i != qp->nq_nentry; i++)
 862                         if (qp->nq_cmd[i] != NULL)
 863                                 nvme_free_cmd(qp->nq_cmd[i]);
 864 
 865         if (qp->nq_cmd != NULL)
 866                 kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);
 867 
 868         kmem_free(qp, sizeof (nvme_qpair_t));
 869 }
 870 
 871 /*
 872  * Destroy the pre-allocated cq array, but only free individual completion
 873  * queues from the given starting index.
 874  */
 875 static void
 876 nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
 877 {
 878         uint_t i;
 879 
 880         for (i = start; i < nvme->n_cq_count; i++)
 881                 if (nvme->n_cq[i] != NULL)
 882                         nvme_free_cq(nvme->n_cq[i]);
 883 
 884         kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count);
 885 }
 886 
 887 static int
 888 nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx,
 889     uint_t nthr)
 890 {
 891         nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);
 892         char name[64];          /* large enough for the taskq name */
 893 
 894         mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
 895             DDI_INTR_PRI(nvme->n_intr_pri));
 896 
 897         if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
 898             DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS)
 899                 goto fail;
 900 
 901         cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp;
 902         cq->ncq_nentry = nentry;
 903         cq->ncq_id = idx;
 904         cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);
 905 
 906         /*
 907          * Each completion queue has its own command taskq.
 908          */
 909         (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u",
 910             ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx);
 911 
 912         cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX,
 913             TASKQ_PREPOPULATE);
 914 
 915         if (cq->ncq_cmd_taskq == NULL) {
 916                 dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd "
 917                     "taskq for cq %u", idx);
 918                 goto fail;
 919         }
 920 
 921         *cqp = cq;
 922         return (DDI_SUCCESS);
 923 
 924 fail:
 925         nvme_free_cq(cq);
 926         *cqp = NULL;
 927 
 928         return (DDI_FAILURE);
 929 }
 930 
 931 /*
 932  * Create the n_cq array big enough to hold "ncq" completion queues.
 933  * If the array already exists it will be re-sized (but only larger).
 934  * The admin queue is included in this array, which boosts the
 935  * max number of entries to UINT16_MAX + 1.
 936  */
 937 static int
 938 nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr)
 939 {
 940         nvme_cq_t **cq;
 941         uint_t i, cq_count;
 942 
 943         ASSERT3U(ncq, >, nvme->n_cq_count);
 944 
 945         cq = nvme->n_cq;
 946         cq_count = nvme->n_cq_count;
 947 
 948         nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP);
 949         nvme->n_cq_count = ncq;
 950 
 951         for (i = 0; i < cq_count; i++)
 952                 nvme->n_cq[i] = cq[i];
 953 
 954         for (; i < nvme->n_cq_count; i++)
 955                 if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) !=
 956                     DDI_SUCCESS)
 957                         goto fail;
 958 
 959         if (cq != NULL)
 960                 kmem_free(cq, sizeof (*cq) * cq_count);
 961 
 962         return (DDI_SUCCESS);
 963 
 964 fail:
 965         nvme_destroy_cq_array(nvme, cq_count);
 966         /*
 967          * Restore the original array
 968          */
 969         nvme->n_cq_count = cq_count;
 970         nvme->n_cq = cq;
 971 
 972         return (DDI_FAILURE);
 973 }
 974 
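/*
 * Allocate a queue pair: set up the submission queue DMA memory and the
 * command array, and hook the pair up to one of the pre-allocated completion
 * queues (index 0 is the admin queue; higher indexes are spread across the
 * shared I/O completion queues).
 */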
 975 static int
 976 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
 977     uint_t idx)
 978 {
 979         nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
 980         uint_t cq_idx;
 981 
 982         mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
 983             DDI_INTR_PRI(nvme->n_intr_pri));
 984 
 985         /*
 986          * The NVMe spec defines that a full queue has one empty (unused) slot;
 987          * initialize the semaphore accordingly.
 988          */
 989         sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL);
 990 
 991         if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
 992             DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
 993                 goto fail;
 994 
 995         /*
 996          * idx == 0 is the admin queue; those above 0 use shared I/O completion queues.
 997          */
 998         cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1);
 999         qp->nq_cq = nvme->n_cq[cq_idx];
1000         qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
1001         qp->nq_nentry = nentry;
1002 
1003         qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
1004 
1005         qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
1006         qp->nq_next_cmd = 0;
1007 
1008         *nqp = qp;
1009         return (DDI_SUCCESS);
1010 
1011 fail:
1012         nvme_free_qpair(qp);
1013         *nqp = NULL;
1014 
1015         return (DDI_FAILURE);
1016 }
1017 
1018 static nvme_cmd_t *
1019 nvme_alloc_cmd(nvme_t *nvme, int kmflag)
1020 {
1021         nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);
1022 
1023         if (cmd == NULL)
1024                 return (cmd);
1025 
1026         bzero(cmd, sizeof (nvme_cmd_t));
1027 
1028         cmd->nc_nvme = nvme;
1029 
1030         mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
1031             DDI_INTR_PRI(nvme->n_intr_pri));
1032         cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);
1033 
1034         return (cmd);
1035 }
1036 
1037 static void
1038 nvme_free_cmd(nvme_cmd_t *cmd)
1039 {
1040         /* Don't free commands on the lost commands list. */
1041         if (list_link_active(&cmd->nc_list))
1042                 return;
1043 
1044         if (cmd->nc_dma) {
1045                 if (cmd->nc_dma->nd_cached)
1046                         kmem_cache_free(cmd->nc_nvme->n_prp_cache,
1047                             cmd->nc_dma);
1048                 else
1049                         nvme_free_dma(cmd->nc_dma);
1050                 cmd->nc_dma = NULL;
1051         }
1052 
1053         cv_destroy(&cmd->nc_cv);
1054         mutex_destroy(&cmd->nc_mutex);
1055 
1056         kmem_cache_free(nvme_cmd_cache, cmd);
1057 }
1058 
1059 static void
1060 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1061 {
1062         sema_p(&qp->nq_sema);
1063         nvme_submit_cmd_common(qp, cmd);
1064 }
1065 
1066 static int
1067 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1068 {
1069         if (sema_tryp(&qp->nq_sema) == 0)
1070                 return (EAGAIN);
1071 
1072         nvme_submit_cmd_common(qp, cmd);
1073         return (0);
1074 }
1075 
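/*
 * Common submission path: record the command in the per-queue command array
 * (the array index becomes the CID), copy the SQE into the submission queue,
 * and ring the submission queue tail doorbell. Callers must have acquired the
 * per-queue semaphore (nq_sema) first.
 */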
1076 static void
1077 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1078 {
1079         nvme_reg_sqtdbl_t tail = { 0 };
1080 
1081         mutex_enter(&qp->nq_mutex);
1082         cmd->nc_completed = B_FALSE;
1083 
1084         /*
1085          * Try to insert the cmd into the active cmd array at the nq_next_cmd
1086          * slot. If the slot is already occupied advance to the next slot and
1087          * try again. This can happen for long running commands like async event
1088          * requests.
1089          */
1090         while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
1091                 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1092         qp->nq_cmd[qp->nq_next_cmd] = cmd;
1093 
1094         qp->nq_active_cmds++;
1095 
1096         cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
1097         bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
1098         (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
1099             sizeof (nvme_sqe_t) * qp->nq_sqtail,
1100             sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
1101         qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1102 
1103         tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
1104         nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
1105 
1106         mutex_exit(&qp->nq_mutex);
1107 }
1108 
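/*
 * Remove the command with the given CID from the per-queue command array,
 * decrement the active command count, and release the submission queue slot
 * by posting the semaphore. The caller must hold nq_mutex.
 */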
1109 static nvme_cmd_t *
1110 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
1111 {
1112         nvme_cmd_t *cmd;
1113 
1114         ASSERT(mutex_owned(&qp->nq_mutex));
1115         ASSERT3S(cid, <, qp->nq_nentry);
1116 
1117         cmd = qp->nq_cmd[cid];
1118         qp->nq_cmd[cid] = NULL;
1119         ASSERT3U(qp->nq_active_cmds, >, 0);
1120         qp->nq_active_cmds--;
1121         sema_v(&qp->nq_sema);
1122 
1123         ASSERT3P(cmd, !=, NULL);
1124         ASSERT3P(cmd->nc_nvme, ==, nvme);
1125         ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);
1126 
1127         return (cmd);
1128 }
1129 
1130 /*
1131  * Get the command tied to the next completed cqe and advance the completion
1132  * queue head counter.
1133  */
1134 static nvme_cmd_t *
1135 nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq)
1136 {
1137         nvme_qpair_t *qp;
1138         nvme_cqe_t *cqe;
1139         nvme_cmd_t *cmd;
1140 
1141         ASSERT(mutex_owned(&cq->ncq_mutex));
1142 
1143         cqe = &cq->ncq_cq[cq->ncq_head];
1144 
1145         /* Check phase tag of CQE. Hardware inverts it for new entries. */
1146         if (cqe->cqe_sf.sf_p == cq->ncq_phase)
1147                 return (NULL);
1148 
1149         qp = nvme->n_ioq[cqe->cqe_sqid];
1150 
1151         mutex_enter(&qp->nq_mutex);
1152         cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
1153         mutex_exit(&qp->nq_mutex);
1154 
1155         ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
1156         bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
1157 
1158         qp->nq_sqhead = cqe->cqe_sqhd;
1159 
1160         cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry;
1161 
1162         /* Toggle phase on wrap-around. */
1163         if (cq->ncq_head == 0)
1164                 cq->ncq_phase = cq->ncq_phase ? 0 : 1;
1165 
1166         return (cmd);
1167 }
1168 
1169 /*
1170  * Process all completed commands on the io completion queue.
1171  */
1172 static uint_t
1173 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq)
1174 {
1175         nvme_reg_cqhdbl_t head = { 0 };
1176         nvme_cmd_t *cmd;
1177         uint_t completed = 0;
1178 
1179         if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
1180             DDI_SUCCESS)
1181                 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
1182                     __func__);
1183 
1184         mutex_enter(&cq->ncq_mutex);
1185 
1186         while ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
1187                 taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd,
1188                     TQ_NOSLEEP, &cmd->nc_tqent);
1189 
1190                 completed++;
1191         }
1192 
1193         if (completed > 0) {
1194                 /*
1195                  * Update the completion queue head doorbell.
1196                  */
1197                 head.b.cqhdbl_cqh = cq->ncq_head;
1198                 nvme_put32(nvme, cq->ncq_hdbl, head.r);
1199         }
1200 
1201         mutex_exit(&cq->ncq_mutex);
1202 
1203         return (completed);
1204 }
1205 
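/*
 * Retrieve a single completed command from the queue pair's completion queue
 * and update the completion queue head doorbell if one was found. This is the
 * retrieval path used for polled I/O (see "Polled I/O Support" above).
 */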
1206 static nvme_cmd_t *
1207 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
1208 {
1209         nvme_cq_t *cq = qp->nq_cq;
1210         nvme_reg_cqhdbl_t head = { 0 };
1211         nvme_cmd_t *cmd;
1212 
1213         if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
1214             DDI_SUCCESS)
1215                 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
1216                     __func__);
1217 
1218         mutex_enter(&cq->ncq_mutex);
1219 
1220         if ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
1221                 head.b.cqhdbl_cqh = cq->ncq_head;
1222                 nvme_put32(nvme, cq->ncq_hdbl, head.r);
1223         }
1224 
1225         mutex_exit(&cq->ncq_mutex);
1226 
1227         return (cmd);
1228 }
1229 
1230 static int
1231 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
1232 {
1233         nvme_cqe_t *cqe = &cmd->nc_cqe;
1234 
1235         dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1236             "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
1237             "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
1238             cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
1239             cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
1240 
1241         if (cmd->nc_xfer != NULL)
1242                 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1243 
1244         if (cmd->nc_nvme->n_strict_version) {
1245                 cmd->nc_nvme->n_dead = B_TRUE;
1246                 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
1247         }
1248 
1249         return (EIO);
1250 }
1251 
1252 static int
1253 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
1254 {
1255         nvme_cqe_t *cqe = &cmd->nc_cqe;
1256 
1257         dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1258             "!unknown vendor specific command status received: opc = %x, sqid = %d, cid = %d, "
1259             "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
1260             cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
1261             cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
1262         if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
1263                 cmd->nc_nvme->n_dead = B_TRUE;
1264                 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
1265         }
1266 
1267         return (EIO);
1268 }
1269 
1270 static int
1271 nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
1272 {
1273         nvme_cqe_t *cqe = &cmd->nc_cqe;
1274 
1275         switch (cqe->cqe_sf.sf_sc) {
1276         case NVME_CQE_SC_INT_NVM_WRITE:
1277                 /* write fail */
1278                 /* TODO: post ereport */
1279                 if (cmd->nc_xfer != NULL)
1280                         bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1281                 return (EIO);
1282 
1283         case NVME_CQE_SC_INT_NVM_READ:
1284                 /* read fail */
1285                 /* TODO: post ereport */
1286                 if (cmd->nc_xfer != NULL)
1287                         bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1288                 return (EIO);
1289 
1290         default:
1291                 return (nvme_check_unknown_cmd_status(cmd));
1292         }
1293 }
1294 
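/*
 * Map generic command status codes to errnos. Status codes indicating a driver
 * bug cause a panic (in most cases suppressible via nc_dontpanic); the others
 * bump a per-controller error counter and map to EIO, EINVAL, or ECANCELED.
 */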
1295 static int
1296 nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
1297 {
1298         nvme_cqe_t *cqe = &cmd->nc_cqe;
1299 
1300         switch (cqe->cqe_sf.sf_sc) {
1301         case NVME_CQE_SC_GEN_SUCCESS:
1302                 return (0);
1303 
1304         /*
1305          * Errors indicating a bug in the driver should cause a panic.
1306          */
1307         case NVME_CQE_SC_GEN_INV_OPC:
1308                 /* Invalid Command Opcode */
1309                 if (!cmd->nc_dontpanic)
1310                         dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1311                             "programming error: invalid opcode in cmd %p",
1312                             (void *)cmd);
1313                 return (EINVAL);
1314 
1315         case NVME_CQE_SC_GEN_INV_FLD:
1316                 /* Invalid Field in Command */
1317                 if (!cmd->nc_dontpanic)
1318                         dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1319                             "programming error: invalid field in cmd %p",
1320                             (void *)cmd);
1321                 return (EIO);
1322 
1323         case NVME_CQE_SC_GEN_ID_CNFL:
1324                 /* Command ID Conflict */
1325                 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1326                     "cmd ID conflict in cmd %p", (void *)cmd);
1327                 return (0);
1328 
1329         case NVME_CQE_SC_GEN_INV_NS:
1330                 /* Invalid Namespace or Format */
1331                 if (!cmd->nc_dontpanic)
1332                         dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1333                             "programming error: invalid NS/format in cmd %p",
1334                             (void *)cmd);
1335                 return (EINVAL);
1336 
1337         case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
1338                 /* LBA Out Of Range */
1339                 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1340                     "LBA out of range in cmd %p", (void *)cmd);
1341                 return (0);
1342 
1343         /*
1344          * Non-fatal errors, handle gracefully.
1345          */
1346         case NVME_CQE_SC_GEN_DATA_XFR_ERR:
1347                 /* Data Transfer Error (DMA) */
1348                 /* TODO: post ereport */
1349                 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
1350                 if (cmd->nc_xfer != NULL)
1351                         bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1352                 return (EIO);
1353 
1354         case NVME_CQE_SC_GEN_INTERNAL_ERR:
1355                 /*
1356                  * Internal Error. The spec (v1.0, section 4.5.1.2) says
1357          * detailed error information is returned as an async event,
1358                  * so we pretty much ignore the error here and handle it
1359                  * in the async event handler.
1360                  */
1361                 atomic_inc_32(&cmd->nc_nvme->n_internal_err);
1362                 if (cmd->nc_xfer != NULL)
1363                         bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1364                 return (EIO);
1365 
1366         case NVME_CQE_SC_GEN_ABORT_REQUEST:
1367                 /*
1368                  * Command Abort Requested. This normally happens only when a
1369                  * command times out.
1370                  */
1371                 /* TODO: post ereport or change blkdev to handle this? */
1372                 atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err);
1373                 return (ECANCELED);
1374 
1375         case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
1376                 /* Command Aborted due to Power Loss Notification */
1377                 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
1378                 cmd->nc_nvme->n_dead = B_TRUE;
1379                 return (EIO);
1380 
1381         case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
1382                 /* Command Aborted due to SQ Deletion */
1383                 atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del);
1384                 return (EIO);
1385 
1386         case NVME_CQE_SC_GEN_NVM_CAP_EXC:
1387                 /* Capacity Exceeded */
1388                 atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc);
1389                 if (cmd->nc_xfer != NULL)
1390                         bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1391                 return (EIO);
1392 
1393         case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
1394                 /* Namespace Not Ready */
1395                 atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy);
1396                 if (cmd->nc_xfer != NULL)
1397                         bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1398                 return (EIO);
1399 
1400         default:
1401                 return (nvme_check_unknown_cmd_status(cmd));
1402         }
1403 }
1404 
1405 static int
1406 nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
1407 {
1408         nvme_cqe_t *cqe = &cmd->nc_cqe;
1409 
1410         switch (cqe->cqe_sf.sf_sc) {
1411         case NVME_CQE_SC_SPC_INV_CQ:
1412                 /* Completion Queue Invalid */
1413                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
1414                 atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err);
1415                 return (EINVAL);
1416 
1417         case NVME_CQE_SC_SPC_INV_QID:
1418                 /* Invalid Queue Identifier */
1419                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
1420                     cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
1421                     cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
1422                     cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
1423                 atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err);
1424                 return (EINVAL);
1425 
1426         case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
1427                 /* Max Queue Size Exceeded */
1428                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
1429                     cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
1430                 atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc);
1431                 return (EINVAL);
1432 
1433         case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
1434                 /* Abort Command Limit Exceeded */
1435                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
1436                 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1437                     "abort command limit exceeded in cmd %p", (void *)cmd);
1438                 return (0);
1439 
1440         case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
1441                 /* Async Event Request Limit Exceeded */
1442                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
1443                 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1444                     "async event request limit exceeded in cmd %p",
1445                     (void *)cmd);
1446                 return (0);
1447 
1448         case NVME_CQE_SC_SPC_INV_INT_VECT:
1449                 /* Invalid Interrupt Vector */
1450                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
1451                 atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect);
1452                 return (EINVAL);
1453 
1454         case NVME_CQE_SC_SPC_INV_LOG_PAGE:
1455                 /* Invalid Log Page */
1456                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
1457                 atomic_inc_32(&cmd->nc_nvme->n_inv_log_page);
1458                 return (EINVAL);
1459 
1460         case NVME_CQE_SC_SPC_INV_FORMAT:
1461                 /* Invalid Format */
1462                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT);
1463                 atomic_inc_32(&cmd->nc_nvme->n_inv_format);
1464                 if (cmd->nc_xfer != NULL)
1465                         bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1466                 return (EINVAL);
1467 
1468         case NVME_CQE_SC_SPC_INV_Q_DEL:
1469                 /* Invalid Queue Deletion */
1470                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
1471                 atomic_inc_32(&cmd->nc_nvme->n_inv_q_del);
1472                 return (EINVAL);
1473 
1474         case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
1475                 /* Conflicting Attributes */
1476                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
1477                     cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
1478                     cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1479                 atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr);
1480                 if (cmd->nc_xfer != NULL)
1481                         bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1482                 return (EINVAL);
1483 
1484         case NVME_CQE_SC_SPC_NVM_INV_PROT:
1485                 /* Invalid Protection Information */
1486                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
1487                     cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
1488                     cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1489                 atomic_inc_32(&cmd->nc_nvme->n_inv_prot);
1490                 if (cmd->nc_xfer != NULL)
1491                         bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1492                 return (EINVAL);
1493 
1494         case NVME_CQE_SC_SPC_NVM_READONLY:
1495                 /* Write to Read Only Range */
1496                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1497                 atomic_inc_32(&cmd->nc_nvme->n_readonly);
1498                 if (cmd->nc_xfer != NULL)
1499                         bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1500                 return (EROFS);
1501 
1502         case NVME_CQE_SC_SPC_INV_FW_SLOT:
1503                 /* Invalid Firmware Slot */
1504                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1505                 return (EINVAL);
1506 
1507         case NVME_CQE_SC_SPC_INV_FW_IMG:
1508                 /* Invalid Firmware Image */
1509                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1510                 return (EINVAL);
1511 
1512         case NVME_CQE_SC_SPC_FW_RESET:
1513                 /* Conventional Reset Required */
1514                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1515                 return (0);
1516 
1517         case NVME_CQE_SC_SPC_FW_NSSR:
1518                 /* NVMe Subsystem Reset Required */
1519                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1520                 return (0);
1521 
1522         case NVME_CQE_SC_SPC_FW_NEXT_RESET:
1523                 /* Activation Requires Reset */
1524                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1525                 return (0);
1526 
1527         case NVME_CQE_SC_SPC_FW_MTFA:
1528                 /* Activation Requires Maximum Time Violation */
1529                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1530                 return (EAGAIN);
1531 
1532         case NVME_CQE_SC_SPC_FW_PROHIBITED:
1533                 /* Activation Prohibited */
1534                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1535                 return (EINVAL);
1536 
1537         case NVME_CQE_SC_SPC_FW_OVERLAP:
1538                 /* Overlapping Firmware Ranges */
1539                 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD);
1540                 return (EINVAL);
1541 
1542         default:
1543                 return (nvme_check_unknown_cmd_status(cmd));
1544         }
1545 }
1546 
1547 static inline int
1548 nvme_check_cmd_status(nvme_cmd_t *cmd)
1549 {
1550         nvme_cqe_t *cqe = &cmd->nc_cqe;
1551 
1552         /*
1553          * Take a shortcut if the controller is dead, or if
1554          * command status indicates no error.
1555          */
1556         if (cmd->nc_nvme->n_dead)
1557                 return (EIO);
1558 
1559         if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1560             cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
1561                 return (0);
1562 
1563         if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
1564                 return (nvme_check_generic_cmd_status(cmd));
1565         else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
1566                 return (nvme_check_specific_cmd_status(cmd));
1567         else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
1568                 return (nvme_check_integrity_cmd_status(cmd));
1569         else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
1570                 return (nvme_check_vendor_cmd_status(cmd));
1571 
1572         return (nvme_check_unknown_cmd_status(cmd));
1573 }
1574 
1575 static int
1576 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec)
1577 {
1578         nvme_t *nvme = abort_cmd->nc_nvme;
1579         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1580         nvme_abort_cmd_t ac = { 0 };
1581         int ret = 0;
1582 
1583         sema_p(&nvme->n_abort_sema);
1584 
1585         ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
1586         ac.b.ac_sqid = abort_cmd->nc_sqid;
1587 
1588         cmd->nc_sqid = 0;
1589         cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
1590         cmd->nc_callback = nvme_wakeup_cmd;
1591         cmd->nc_sqe.sqe_cdw10 = ac.r;
1592 
1593         /*
1594          * Send the ABORT to the hardware. The ABORT command will return _after_
1595          * the aborted command has completed (aborted or otherwise), but since
1596          * we still hold the aborted command's mutex, its callback hasn't been
1597          * processed yet.
1598          */
1599         nvme_admin_cmd(cmd, sec);
1600         sema_v(&nvme->n_abort_sema);
1601 
1602         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1603                 dev_err(nvme->n_dip, CE_WARN,
1604                     "!ABORT failed with sct = %x, sc = %x",
1605                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1606                 atomic_inc_32(&nvme->n_abort_failed);
1607         } else {
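                     /*
                      * Bit 0 of dword 0 is set if the command was not aborted and
                      * clear if it was.
                      */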
1608                 dev_err(nvme->n_dip, CE_WARN,
1609                     "!ABORT of command %d/%d %ssuccessful",
1610                     abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid,
1611                     cmd->nc_cqe.cqe_dw0 & 1 ? "un" : "");
1612                 if ((cmd->nc_cqe.cqe_dw0 & 1) == 0)
1613                         atomic_inc_32(&nvme->n_cmd_aborted);
1614         }
1615 
1616         nvme_free_cmd(cmd);
1617         return (ret);
1618 }
1619 
1620 /*
1621  * nvme_wait_cmd -- wait for command completion or timeout
1622  *
1623  * In case of a serious error or a timeout of the abort command the hardware
1624  * will be declared dead and FMA will be notified.
1625  */
1626 static void
1627 nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec)
1628 {
1629         clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC);
1630         nvme_t *nvme = cmd->nc_nvme;
1631         nvme_reg_csts_t csts;
1632         nvme_qpair_t *qp;
1633 
1634         ASSERT(mutex_owned(&cmd->nc_mutex));
1635 
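             /*
              * cv_timedwait() takes an absolute timeout in ticks, hence the
              * deadline computed from ddi_get_lbolt() above.
              */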
1636         while (!cmd->nc_completed) {
1637                 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
1638                         break;
1639         }
1640 
1641         if (cmd->nc_completed)
1642                 return;
1643 
1644         /*
1645          * The command timed out.
1646          *
1647          * Check the controller for fatal status, for any errors associated
1648          * with the register or DMA handle, and for a double timeout (the
1649          * abort command timed out). If necessary, log a warning and call FMA.
1650          */
1651         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
1652         dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, "
1653             "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
1654             cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
1655         atomic_inc_32(&nvme->n_cmd_timeout);
1656 
1657         if (csts.b.csts_cfs ||
1658             nvme_check_regs_hdl(nvme) ||
1659             nvme_check_dma_hdl(cmd->nc_dma) ||
1660             cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
1661                 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1662                 nvme->n_dead = B_TRUE;
1663         } else if (nvme_abort_cmd(cmd, sec) == 0) {
1664                 /*
1665                  * If the abort succeeded the command should complete
1666                  * immediately with an appropriate status.
1667                  */
1668                 while (!cmd->nc_completed)
1669                         cv_wait(&cmd->nc_cv, &cmd->nc_mutex);
1670 
1671                 return;
1672         }
1673 
1674         qp = nvme->n_ioq[cmd->nc_sqid];
1675 
1676         mutex_enter(&qp->nq_mutex);
1677         (void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
1678         mutex_exit(&qp->nq_mutex);
1679 
1680         /*
1681          * As we don't know what the presumed dead hardware might still do with
1682          * the DMA memory, we'll put the command on the lost commands list if it
1683          * has any DMA memory.
1684          */
1685         if (cmd->nc_dma != NULL) {
1686                 mutex_enter(&nvme_lc_mutex);
1687                 list_insert_head(&nvme_lost_cmds, cmd);
1688                 mutex_exit(&nvme_lc_mutex);
1689         }
1690 }
1691 
1692 static void
1693 nvme_wakeup_cmd(void *arg)
1694 {
1695         nvme_cmd_t *cmd = arg;
1696 
1697         mutex_enter(&cmd->nc_mutex);
1698         cmd->nc_completed = B_TRUE;
1699         cv_signal(&cmd->nc_cv);
1700         mutex_exit(&cmd->nc_mutex);
1701 }
1702 
1703 static void
1704 nvme_async_event_task(void *arg)
1705 {
1706         nvme_cmd_t *cmd = arg;
1707         nvme_t *nvme = cmd->nc_nvme;
1708         nvme_error_log_entry_t *error_log = NULL;
1709         nvme_health_log_t *health_log = NULL;
1710         size_t logsize = 0;
1711         nvme_async_event_t event;
1712 
1713         /*
1714          * Check for errors associated with the async request itself. The only
1715          * command-specific error is "async event limit exceeded", which
1716          * indicates a programming error in the driver and causes a panic in
1717          * nvme_check_cmd_status().
1718          *
1719          * Other possible errors are various scenarios where the async request
1720          * was aborted, or internal errors in the device. Internal errors are
1721          * reported to FMA, the command aborts need no special handling here.
1722          *
1723          * Finally, at least qemu's NVMe emulation does not support async
1724          * event requests and will return NVME_CQE_SC_GEN_INV_OPC with DNR
1725          * set. In that case we stop posting async event requests.
1726          */
1727 
1728         if (nvme_check_cmd_status(cmd) != 0) {
1729                 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1730                     "!async event request returned failure, sct = %x, "
1731                     "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
1732                     cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
1733                     cmd->nc_cqe.cqe_sf.sf_m);
1734 
1735                 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1736                     cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
1737                         cmd->nc_nvme->n_dead = B_TRUE;
1738                         ddi_fm_service_impact(cmd->nc_nvme->n_dip,
1739                             DDI_SERVICE_LOST);
1740                 }
1741 
1742                 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1743                     cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC &&
1744                     cmd->nc_cqe.cqe_sf.sf_dnr == 1) {
1745                         nvme->n_async_event_supported = B_FALSE;
1746                 }
1747 
1748                 nvme_free_cmd(cmd);
1749                 return;
1750         }
1751 
1752 
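             /*
              * Completion dword 0 of an async event request encodes the event
              * type, event info, and the associated log page identifier.
              */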
1753         event.r = cmd->nc_cqe.cqe_dw0;
1754 
1755         /* Clear CQE and re-submit the async request. */
1756         bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
1757         nvme_submit_admin_cmd(nvme->n_adminq, cmd);
1758 
1759         switch (event.b.ae_type) {
1760         case NVME_ASYNC_TYPE_ERROR:
1761                 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
1762                         (void) nvme_get_logpage(nvme, B_FALSE,
1763                             (void **)&error_log, &logsize, event.b.ae_logpage);
1764                 } else {
1765                         dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
1766                             "async event reply: %d", event.b.ae_logpage);
1767                         atomic_inc_32(&nvme->n_wrong_logpage);
1768                 }
1769 
1770                 switch (event.b.ae_info) {
1771                 case NVME_ASYNC_ERROR_INV_SQ:
1772                         dev_err(nvme->n_dip, CE_PANIC, "programming error: "
1773                             "invalid submission queue");
1774                         return;
1775 
1776                 case NVME_ASYNC_ERROR_INV_DBL:
1777                         dev_err(nvme->n_dip, CE_PANIC, "programming error: "
1778                             "invalid doorbell write value");
1779                         return;
1780 
1781                 case NVME_ASYNC_ERROR_DIAGFAIL:
1782                         dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
1783                         ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1784                         nvme->n_dead = B_TRUE;
1785                         atomic_inc_32(&nvme->n_diagfail_event);
1786                         break;
1787 
1788                 case NVME_ASYNC_ERROR_PERSISTENT:
1789                         dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
1790                             "device error");
1791                         ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1792                         nvme->n_dead = B_TRUE;
1793                         atomic_inc_32(&nvme->n_persistent_event);
1794                         break;
1795 
1796                 case NVME_ASYNC_ERROR_TRANSIENT:
1797                         dev_err(nvme->n_dip, CE_WARN, "!transient internal "
1798                             "device error");
1799                         /* TODO: send ereport */
1800                         atomic_inc_32(&nvme->n_transient_event);
1801                         break;
1802 
1803                 case NVME_ASYNC_ERROR_FW_LOAD:
1804                         dev_err(nvme->n_dip, CE_WARN,
1805                             "!firmware image load error");
1806                         atomic_inc_32(&nvme->n_fw_load_event);
1807                         break;
1808                 }
1809                 break;
1810 
1811         case NVME_ASYNC_TYPE_HEALTH:
1812                 if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
1813                         (void) nvme_get_logpage(nvme, B_FALSE,
1814                             (void **)&health_log, &logsize, event.b.ae_logpage,
1815                             -1);
1816                 } else {
1817                         dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
1818                             "async event reply: %d", event.b.ae_logpage);
1819                         atomic_inc_32(&nvme->n_wrong_logpage);
1820                 }
1821 
1822                 switch (event.b.ae_info) {
1823                 case NVME_ASYNC_HEALTH_RELIABILITY:
1824                         dev_err(nvme->n_dip, CE_WARN,
1825                             "!device reliability compromised");
1826                         /* TODO: send ereport */
1827                         atomic_inc_32(&nvme->n_reliability_event);
1828                         break;
1829 
1830                 case NVME_ASYNC_HEALTH_TEMPERATURE:
1831                         dev_err(nvme->n_dip, CE_WARN,
1832                             "!temperature above threshold");
1833                         /* TODO: send ereport */
1834                         atomic_inc_32(&nvme->n_temperature_event);
1835                         break;
1836 
1837                 case NVME_ASYNC_HEALTH_SPARE:
1838                         dev_err(nvme->n_dip, CE_WARN,
1839                             "!spare space below threshold");
1840                         /* TODO: send ereport */
1841                         atomic_inc_32(&nvme->n_spare_event);
1842                         break;
1843                 }
1844                 break;
1845 
1846         case NVME_ASYNC_TYPE_VENDOR:
1847                 dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
1848                     "received, info = %x, logpage = %x", event.b.ae_info,
1849                     event.b.ae_logpage);
1850                 atomic_inc_32(&nvme->n_vendor_event);
1851                 break;
1852 
1853         default:
1854                 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
1855                     "type = %x, info = %x, logpage = %x", event.b.ae_type,
1856                     event.b.ae_info, event.b.ae_logpage);
1857                 atomic_inc_32(&nvme->n_unknown_event);
1858                 break;
1859         }
1860 
1861         if (error_log)
1862                 kmem_free(error_log, logsize);
1863 
1864         if (health_log)
1865                 kmem_free(health_log, logsize);
1866 }
1867 
1868 static void
1869 nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
1870 {
1871         mutex_enter(&cmd->nc_mutex);
1872         nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
1873         nvme_wait_cmd(cmd, sec);
1874         mutex_exit(&cmd->nc_mutex);
1875 }
1876 
1877 static void
1878 nvme_async_event(nvme_t *nvme)
1879 {
1880         nvme_cmd_t *cmd;
1881 
1882         cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1883         cmd->nc_sqid = 0;
1884         cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
1885         cmd->nc_callback = nvme_async_event_task;
1886         cmd->nc_dontpanic = B_TRUE;
1887 
1888         nvme_submit_admin_cmd(nvme->n_adminq, cmd);
1889 }
1890 
1891 static int
1892 nvme_format_nvm(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t lbaf,
1893     boolean_t ms, uint8_t pi, boolean_t pil, uint8_t ses)
1894 {
1895         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1896         nvme_format_nvm_t format_nvm = { 0 };
1897         int ret;
1898 
1899         format_nvm.b.fm_lbaf = lbaf & 0xf;
1900         format_nvm.b.fm_ms = ms ? 1 : 0;
1901         format_nvm.b.fm_pi = pi & 0x7;
1902         format_nvm.b.fm_pil = pil ? 1 : 0;
1903         format_nvm.b.fm_ses = ses & 0x7;
1904 
1905         cmd->nc_sqid = 0;
1906         cmd->nc_callback = nvme_wakeup_cmd;
1907         cmd->nc_sqe.sqe_nsid = nsid;
1908         cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
1909         cmd->nc_sqe.sqe_cdw10 = format_nvm.r;
1910 
1911         /*
1912          * Some devices, such as the Samsung SM951, don't allow formatting of
1913          * all namespaces in one command. Handle that gracefully.
1914          */
1915         if (nsid == (uint32_t)-1)
1916                 cmd->nc_dontpanic = B_TRUE;
1917         /*
1918          * If this format request was initiated by the user, then don't allow a
1919          * programmer error to panic the system.
1920          */
1921         if (user)
1922                 cmd->nc_dontpanic = B_TRUE;
1923 
1924         nvme_admin_cmd(cmd, nvme_format_cmd_timeout);
1925 
1926         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1927                 dev_err(nvme->n_dip, CE_WARN,
1928                     "!FORMAT failed with sct = %x, sc = %x",
1929                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1930         }
1931 
1932         nvme_free_cmd(cmd);
1933         return (ret);
1934 }
1935 
1936 static int
1937 nvme_get_logpage(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize,
1938     uint8_t logpage, ...)
1939 {
1940         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1941         nvme_getlogpage_t getlogpage = { 0 };
1942         va_list ap;
1943         int ret;
1944 
1945         va_start(ap, logpage);
1946 
1947         cmd->nc_sqid = 0;
1948         cmd->nc_callback = nvme_wakeup_cmd;
1949         cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;
1950 
1951         if (user)
1952                 cmd->nc_dontpanic = B_TRUE;
1953 
1954         getlogpage.b.lp_lid = logpage;
1955 
1956         switch (logpage) {
1957         case NVME_LOGPAGE_ERROR:
1958                 cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
1959                 /*
1960                  * The GET LOG PAGE command can use at most 2 pages to return
1961          * data; PRP lists are not supported.
1962                  */
1963                 *bufsize = MIN(2 * nvme->n_pagesize,
1964                     nvme->n_error_log_len * sizeof (nvme_error_log_entry_t));
1965                 break;
1966 
1967         case NVME_LOGPAGE_HEALTH:
1968                 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t);
1969                 *bufsize = sizeof (nvme_health_log_t);
1970                 break;
1971 
1972         case NVME_LOGPAGE_FWSLOT:
1973                 cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
1974                 *bufsize = sizeof (nvme_fwslot_log_t);
1975                 break;
1976 
1977         default:
1978                 dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d",
1979                     logpage);
1980                 atomic_inc_32(&nvme->n_unknown_logpage);
1981                 ret = EINVAL;
1982                 goto fail;
1983         }
1984 
1985         va_end(ap);
1986 
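             /*
              * The number of dwords (NUMD) is a 0-based value, hence the
              * subtraction of one below.
              */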
1987         getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1;
1988 
1989         cmd->nc_sqe.sqe_cdw10 = getlogpage.r;
1990 
1991         if (nvme_zalloc_dma(nvme, *bufsize,
1992             DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1993                 dev_err(nvme->n_dip, CE_WARN,
1994                     "!nvme_zalloc_dma failed for GET LOG PAGE");
1995                 ret = ENOMEM;
1996                 goto fail;
1997         }
1998 
1999         if (cmd->nc_dma->nd_ncookie > 2) {
2000                 dev_err(nvme->n_dip, CE_WARN,
2001                     "!too many DMA cookies for GET LOG PAGE");
2002                 atomic_inc_32(&nvme->n_too_many_cookies);
2003                 ret = ENOMEM;
2004                 goto fail;
2005         }
2006 
2007         cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
2008         if (cmd->nc_dma->nd_ncookie > 1) {
2009                 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
2010                     &cmd->nc_dma->nd_cookie);
2011                 cmd->nc_sqe.sqe_dptr.d_prp[1] =
2012                     cmd->nc_dma->nd_cookie.dmac_laddress;
2013         }
2014 
2015         nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2016 
2017         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2018                 dev_err(nvme->n_dip, CE_WARN,
2019                     "!GET LOG PAGE failed with sct = %x, sc = %x",
2020                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2021                 goto fail;
2022         }
2023 
2024         *buf = kmem_alloc(*bufsize, KM_SLEEP);
2025         bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
2026 
2027 fail:
2028         nvme_free_cmd(cmd);
2029 
2030         return (ret);
2031 }
2032 
2033 static int
2034 nvme_identify(nvme_t *nvme, boolean_t user, uint32_t nsid, void **buf)
2035 {
2036         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2037         int ret;
2038 
2039         if (buf == NULL)
2040                 return (EINVAL);
2041 
2042         cmd->nc_sqid = 0;
2043         cmd->nc_callback = nvme_wakeup_cmd;
2044         cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
2045         cmd->nc_sqe.sqe_nsid = nsid;
2046         cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL;
2047 
2048         if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
2049             &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2050                 dev_err(nvme->n_dip, CE_WARN,
2051                     "!nvme_zalloc_dma failed for IDENTIFY");
2052                 ret = ENOMEM;
2053                 goto fail;
2054         }
2055 
2056         if (cmd->nc_dma->nd_ncookie > 2) {
2057                 dev_err(nvme->n_dip, CE_WARN,
2058                     "!too many DMA cookies for IDENTIFY");
2059                 atomic_inc_32(&nvme->n_too_many_cookies);
2060                 ret = ENOMEM;
2061                 goto fail;
2062         }
2063 
2064         cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
2065         if (cmd->nc_dma->nd_ncookie > 1) {
2066                 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
2067                     &cmd->nc_dma->nd_cookie);
2068                 cmd->nc_sqe.sqe_dptr.d_prp[1] =
2069                     cmd->nc_dma->nd_cookie.dmac_laddress;
2070         }
2071 
2072         if (user)
2073                 cmd->nc_dontpanic = B_TRUE;
2074 
2075         nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2076 
2077         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2078                 dev_err(nvme->n_dip, CE_WARN,
2079                     "!IDENTIFY failed with sct = %x, sc = %x",
2080                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2081                 goto fail;
2082         }
2083 
2084         *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
2085         bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE);
2086 
2087 fail:
2088         nvme_free_cmd(cmd);
2089 
2090         return (ret);
2091 }
2092 
2093 static int
2094 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature,
2095     uint32_t val, uint32_t *res)
2096 {
2097         _NOTE(ARGUNUSED(nsid));
2098         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2099         int ret = EINVAL;
2100 
2101         ASSERT(res != NULL);
2102 
2103         cmd->nc_sqid = 0;
2104         cmd->nc_callback = nvme_wakeup_cmd;
2105         cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
2106         cmd->nc_sqe.sqe_cdw10 = feature;
2107         cmd->nc_sqe.sqe_cdw11 = val;
2108 
2109         if (user)
2110                 cmd->nc_dontpanic = B_TRUE;
2111 
2112         switch (feature) {
2113         case NVME_FEAT_WRITE_CACHE:
2114                 if (!nvme->n_write_cache_present)
2115                         goto fail;
2116                 break;
2117 
2118         case NVME_FEAT_NQUEUES:
2119                 break;
2120 
2121         default:
2122                 goto fail;
2123         }
2124 
2125         nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2126 
2127         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2128                 dev_err(nvme->n_dip, CE_WARN,
2129                     "!SET FEATURES %d failed with sct = %x, sc = %x",
2130                     feature, cmd->nc_cqe.cqe_sf.sf_sct,
2131                     cmd->nc_cqe.cqe_sf.sf_sc);
2132                 goto fail;
2133         }
2134 
2135         *res = cmd->nc_cqe.cqe_dw0;
2136 
2137 fail:
2138         nvme_free_cmd(cmd);
2139         return (ret);
2140 }
2141 
2142 static int
2143 nvme_get_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature,
2144     uint32_t *res, void **buf, size_t *bufsize)
2145 {
2146         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2147         int ret = EINVAL;
2148 
2149         ASSERT(res != NULL);
2150 
2151         if (bufsize != NULL)
2152                 *bufsize = 0;
2153 
2154         cmd->nc_sqid = 0;
2155         cmd->nc_callback = nvme_wakeup_cmd;
2156         cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
2157         cmd->nc_sqe.sqe_cdw10 = feature;
2158         cmd->nc_sqe.sqe_cdw11 = *res;
2159 
2160         /*
2161          * For some of the optional features there doesn't seem to be a method
2162          * of detecting whether they are supported other than using them.  This
2163          * will cause an "Invalid Field in Command" error, which is normally
2164          * considered a programming error.  Set the nc_dontpanic flag to
2165          * override the panic in nvme_check_generic_cmd_status().
2166          */
2167         switch (feature) {
2168         case NVME_FEAT_ARBITRATION:
2169         case NVME_FEAT_POWER_MGMT:
2170         case NVME_FEAT_TEMPERATURE:
2171         case NVME_FEAT_ERROR:
2172         case NVME_FEAT_NQUEUES:
2173         case NVME_FEAT_INTR_COAL:
2174         case NVME_FEAT_INTR_VECT:
2175         case NVME_FEAT_WRITE_ATOM:
2176         case NVME_FEAT_ASYNC_EVENT:
2177                 break;
2178 
2179         case NVME_FEAT_WRITE_CACHE:
2180                 if (!nvme->n_write_cache_present)
2181                         goto fail;
2182                 break;
2183 
2184         case NVME_FEAT_LBA_RANGE:
2185                 if (!nvme->n_lba_range_supported)
2186                         goto fail;
2187 
2188                 cmd->nc_dontpanic = B_TRUE;
2189                 cmd->nc_sqe.sqe_nsid = nsid;
2190                 ASSERT(bufsize != NULL);
2191                 *bufsize = NVME_LBA_RANGE_BUFSIZE;
2192                 break;
2193 
2194         case NVME_FEAT_AUTO_PST:
2195                 if (!nvme->n_auto_pst_supported)
2196                         goto fail;
2197 
2198                 ASSERT(bufsize != NULL);
2199                 *bufsize = NVME_AUTO_PST_BUFSIZE;
2200                 break;
2201 
2202         case NVME_FEAT_PROGRESS:
2203                 if (!nvme->n_progress_supported)
2204                         goto fail;
2205 
2206                 cmd->nc_dontpanic = B_TRUE;
2207                 break;
2208 
2209         default:
2210                 goto fail;
2211         }
2212 
2213         if (user)
2214                 cmd->nc_dontpanic = B_TRUE;
2215 
2216         if (bufsize != NULL && *bufsize != 0) {
2217                 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
2218                     &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2219                         dev_err(nvme->n_dip, CE_WARN,
2220                             "!nvme_zalloc_dma failed for GET FEATURES");
2221                         ret = ENOMEM;
2222                         goto fail;
2223                 }
2224 
2225                 if (cmd->nc_dma->nd_ncookie > 2) {
2226                         dev_err(nvme->n_dip, CE_WARN,
2227                             "!too many DMA cookies for GET FEATURES");
2228                         atomic_inc_32(&nvme->n_too_many_cookies);
2229                         ret = ENOMEM;
2230                         goto fail;
2231                 }
2232 
2233                 cmd->nc_sqe.sqe_dptr.d_prp[0] =
2234                     cmd->nc_dma->nd_cookie.dmac_laddress;
2235                 if (cmd->nc_dma->nd_ncookie > 1) {
2236                         ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
2237                             &cmd->nc_dma->nd_cookie);
2238                         cmd->nc_sqe.sqe_dptr.d_prp[1] =
2239                             cmd->nc_dma->nd_cookie.dmac_laddress;
2240                 }
2241         }
2242 
2243         nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2244 
2245         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2246                 boolean_t known = B_TRUE;
2247 
2248                 /* Check if this is an unsupported optional feature */
2249                 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2250                     cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) {
2251                         switch (feature) {
2252                         case NVME_FEAT_LBA_RANGE:
2253                                 nvme->n_lba_range_supported = B_FALSE;
2254                                 break;
2255                         case NVME_FEAT_PROGRESS:
2256                                 nvme->n_progress_supported = B_FALSE;
2257                                 break;
2258                         default:
2259                                 known = B_FALSE;
2260                                 break;
2261                         }
2262                 } else {
2263                         known = B_FALSE;
2264                 }
2265 
2266                 /* Report the error otherwise */
2267                 if (!known) {
2268                         dev_err(nvme->n_dip, CE_WARN,
2269                             "!GET FEATURES %d failed with sct = %x, sc = %x",
2270                             feature, cmd->nc_cqe.cqe_sf.sf_sct,
2271                             cmd->nc_cqe.cqe_sf.sf_sc);
2272                 }
2273 
2274                 goto fail;
2275         }
2276 
2277         if (bufsize != NULL && *bufsize != 0) {
2278                 ASSERT(buf != NULL);
2279                 *buf = kmem_alloc(*bufsize, KM_SLEEP);
2280                 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
2281         }
2282 
2283         *res = cmd->nc_cqe.cqe_dw0;
2284 
2285 fail:
2286         nvme_free_cmd(cmd);
2287         return (ret);
2288 }
2289 
2290 static int
2291 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
2292 {
2293         nvme_write_cache_t nwc = { 0 };
2294 
2295         if (enable)
2296                 nwc.b.wc_wce = 1;
2297 
2298         return (nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_WRITE_CACHE,
2299             nwc.r, &nwc.r));
2300 }
2301 
2302 static int
2303 nvme_set_nqueues(nvme_t *nvme)
2304 {
2305         nvme_nqueues_t nq = { 0 };
2306         int ret;
2307 
2308         /*
2309          * The default is to allocate one completion queue per vector.
2310          */
2311         if (nvme->n_completion_queues == -1)
2312                 nvme->n_completion_queues = nvme->n_intr_cnt;
2313 
2314         /*
2315          * There is no point in having more completion queues than
2316          * interrupt vectors.
2317          */
2318         nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2319             nvme->n_intr_cnt);
2320 
2321         /*
2322          * The default is to use one submission queue per completion queue.
2323          */
2324         if (nvme->n_submission_queues == -1)
2325                 nvme->n_submission_queues = nvme->n_completion_queues;
2326 
2327         /*
2328          * There is no point in having more completion queues than
2329          * submission queues.
2330          */
2331         nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2332             nvme->n_submission_queues);
2333 
2334         ASSERT(nvme->n_submission_queues > 0);
2335         ASSERT(nvme->n_completion_queues > 0);
2336 
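             /*
              * The queue counts requested in cdw11 and the allocated counts
              * returned by the controller in completion dword 0 are 0-based.
              */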
2337         nq.b.nq_nsq = nvme->n_submission_queues - 1;
2338         nq.b.nq_ncq = nvme->n_completion_queues - 1;
2339 
2340         ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
2341             &nq.r);
2342 
2343         if (ret == 0) {
2344                 /*
2345                  * Never use more than the requested number of queues.
2346                  */
2347                 nvme->n_submission_queues = MIN(nvme->n_submission_queues,
2348                     nq.b.nq_nsq + 1);
2349                 nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2350                     nq.b.nq_ncq + 1);
2351         }
2352 
2353         return (ret);
2354 }
2355 
2356 static int
2357 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq)
2358 {
2359         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2360         nvme_create_queue_dw10_t dw10 = { 0 };
2361         nvme_create_cq_dw11_t c_dw11 = { 0 };
2362         int ret;
2363 
2364         dw10.b.q_qid = cq->ncq_id;
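             /* The queue size (QSIZE) is a 0-based value. */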
2365         dw10.b.q_qsize = cq->ncq_nentry - 1;
2366 
2367         c_dw11.b.cq_pc = 1;
2368         c_dw11.b.cq_ien = 1;
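             /*
              * Spread the completion queues round-robin over the available
              * interrupt vectors.
              */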
2369         c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt;
2370 
2371         cmd->nc_sqid = 0;
2372         cmd->nc_callback = nvme_wakeup_cmd;
2373         cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
2374         cmd->nc_sqe.sqe_cdw10 = dw10.r;
2375         cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
2376         cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress;
2377 
2378         nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2379 
2380         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2381                 dev_err(nvme->n_dip, CE_WARN,
2382                     "!CREATE CQUEUE failed with sct = %x, sc = %x",
2383                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2384         }
2385 
2386         nvme_free_cmd(cmd);
2387 
2388         return (ret);
2389 }
2390 
2391 static int
2392 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
2393 {
2394         nvme_cq_t *cq = qp->nq_cq;
2395         nvme_cmd_t *cmd;
2396         nvme_create_queue_dw10_t dw10 = { 0 };
2397         nvme_create_sq_dw11_t s_dw11 = { 0 };
2398         int ret;
2399 
2400         /*
2401          * It is possible to have more qpairs than completion queues,
2402          * and when the idx > ncq_id, that completion queue is shared
2403          * and has already been created.
2404          */
2405         if (idx <= cq->ncq_id &&
2406             nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS)
2407                 return (DDI_FAILURE);
2408 
2409         dw10.b.q_qid = idx;
2410         dw10.b.q_qsize = qp->nq_nentry - 1;
2411 
2412         s_dw11.b.sq_pc = 1;
2413         s_dw11.b.sq_cqid = cq->ncq_id;
2414 
2415         cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2416         cmd->nc_sqid = 0;
2417         cmd->nc_callback = nvme_wakeup_cmd;
2418         cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
2419         cmd->nc_sqe.sqe_cdw10 = dw10.r;
2420         cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
2421         cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
2422 
2423         nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2424 
2425         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2426                 dev_err(nvme->n_dip, CE_WARN,
2427                     "!CREATE SQUEUE failed with sct = %x, sc = %x",
2428                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2429         }
2430 
2431         nvme_free_cmd(cmd);
2432 
2433         return (ret);
2434 }
2435 
2436 static boolean_t
2437 nvme_reset(nvme_t *nvme, boolean_t quiesce)
2438 {
2439         nvme_reg_csts_t csts;
2440         int i;
2441 
2442         nvme_put32(nvme, NVME_REG_CC, 0);
2443 
2444         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2445         if (csts.b.csts_rdy == 1) {
2446                 nvme_put32(nvme, NVME_REG_CC, 0);
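                     /*
                      * Wait for CSTS.RDY to clear, polling every 50ms for up to
                      * CAP.TO (n_timeout, which is in 500ms units).
                      */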
2447                 for (i = 0; i != nvme->n_timeout * 10; i++) {
2448                         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2449                         if (csts.b.csts_rdy == 0)
2450                                 break;
2451 
2452                         if (quiesce)
2453                                 drv_usecwait(50000);
2454                         else
2455                                 delay(drv_usectohz(50000));
2456                 }
2457         }
2458 
2459         nvme_put32(nvme, NVME_REG_AQA, 0);
2460         nvme_put32(nvme, NVME_REG_ASQ, 0);
2461         nvme_put32(nvme, NVME_REG_ACQ, 0);
2462 
2463         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2464         return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE);
2465 }
2466 
2467 static void
2468 nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce)
2469 {
2470         nvme_reg_cc_t cc;
2471         nvme_reg_csts_t csts;
2472         int i;
2473 
2474         ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT);
2475 
2476         cc.r = nvme_get32(nvme, NVME_REG_CC);
2477         cc.b.cc_shn = mode & 0x3;
2478         nvme_put32(nvme, NVME_REG_CC, cc.r);
2479 
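             /*
              * Poll every 100ms, for up to 1 second, for shutdown processing
              * to complete.
              */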
2480         for (i = 0; i != 10; i++) {
2481                 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2482                 if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE)
2483                         break;
2484 
2485                 if (quiesce)
2486                         drv_usecwait(100000);
2487                 else
2488                         delay(drv_usectohz(100000));
2489         }
2490 }
2491 
2492 
2493 static void
2494 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid)
2495 {
2496         /*
2497          * Section 7.7 of the spec describes how to get a unique ID for
2498          * the controller: the vendor ID, the model name and the serial
2499          * number shall be unique when combined.
2500          *
2501          * If a namespace has no EUI64 we use the above and add the hex
2502          * namespace ID to get a unique ID for the namespace.
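              *
              * The resulting devid has the form <vid>-<model>-<serial>-<nsid>.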
2503          */
2504         char model[sizeof (nvme->n_idctl->id_model) + 1];
2505         char serial[sizeof (nvme->n_idctl->id_serial) + 1];
2506 
2507         bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
2508         bcopy(nvme->n_idctl->id_serial, serial,
2509             sizeof (nvme->n_idctl->id_serial));
2510 
2511         model[sizeof (nvme->n_idctl->id_model)] = '\0';
2512         serial[sizeof (nvme->n_idctl->id_serial)] = '\0';
2513 
2514         nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X",
2515             nvme->n_idctl->id_vid, model, serial, nsid);
2516 }
2517 
2518 static int
2519 nvme_init_ns(nvme_t *nvme, int nsid)
2520 {
2521         nvme_namespace_t *ns = &nvme->n_ns[nsid - 1];
2522         nvme_identify_nsid_t *idns;
2523         boolean_t was_ignored;
2524         int last_rp;
2525 
2526         ns->ns_nvme = nvme;
2527 
2528         if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) {
2529                 dev_err(nvme->n_dip, CE_WARN,
2530                     "!failed to identify namespace %d", nsid);
2531                 return (DDI_FAILURE);
2532         }
2533 
2534         ns->ns_idns = idns;
2535         ns->ns_id = nsid;
2536         ns->ns_block_count = idns->id_nsize;
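             /* The LBA data size (LBADS) is reported as a power of two. */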
2537         ns->ns_block_size =
2538             1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
2539         ns->ns_best_block_size = ns->ns_block_size;
2540 
2541         /*
2542          * Get the EUI64 if present. Use it for devid and device node names.
2543          */
2544         if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
2545                 bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64));
2546 
2547         /*LINTED: E_BAD_PTR_CAST_ALIGN*/
2548         if (*(uint64_t *)ns->ns_eui64 != 0) {
2549                 uint8_t *eui64 = ns->ns_eui64;
2550 
2551                 (void) snprintf(ns->ns_name, sizeof (ns->ns_name),
2552                     "%02x%02x%02x%02x%02x%02x%02x%02x",
2553                     eui64[0], eui64[1], eui64[2], eui64[3],
2554                     eui64[4], eui64[5], eui64[6], eui64[7]);
2555         } else {
2556                 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d",
2557                     ns->ns_id);
2558 
2559                 nvme_prepare_devid(nvme, ns->ns_id);
2560         }
2561 
2562         /*
2563          * Find the LBA format with no metadata and the best relative
2564          * performance. A value of 3 means "degraded", 0 is best.
2565          */
2566         last_rp = 3;
2567         for (int j = 0; j <= idns->id_nlbaf; j++) {
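                     /* An LBADS of 0 marks an unused LBA format entry. */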
2568                 if (idns->id_lbaf[j].lbaf_lbads == 0)
2569                         break;
2570                 if (idns->id_lbaf[j].lbaf_ms != 0)
2571                         continue;
2572                 if (idns->id_lbaf[j].lbaf_rp >= last_rp)
2573                         continue;
2574                 last_rp = idns->id_lbaf[j].lbaf_rp;
2575                 ns->ns_best_block_size =
2576                     1 << idns->id_lbaf[j].lbaf_lbads;
2577         }
2578 
2579         if (ns->ns_best_block_size < nvme->n_min_block_size)
2580                 ns->ns_best_block_size = nvme->n_min_block_size;
2581 
2582         was_ignored = ns->ns_ignore;
2583 
2584         /*
2585          * We currently don't support namespaces that either:
2586          * - use protection information
2587          * - have a block size smaller than 512 bytes
2588          */
2589         if (idns->id_dps.dp_pinfo) {
2590                 dev_err(nvme->n_dip, CE_WARN,
2591                     "!ignoring namespace %d, unsupported feature: "
2592                     "pinfo = %d", nsid, idns->id_dps.dp_pinfo);
2593                 ns->ns_ignore = B_TRUE;
2594         } else if (ns->ns_block_size < 512) {
2595                 dev_err(nvme->n_dip, CE_WARN,
2596                     "!ignoring namespace %d, unsupported block size %"PRIu64,
2597                     nsid, (uint64_t)ns->ns_block_size);
2598                 ns->ns_ignore = B_TRUE;
2599         } else {
2600                 ns->ns_ignore = B_FALSE;
2601         }
2602 
2603         /*
2604          * Keep a count of namespaces which are attachable.
2605          * See comments in nvme_bd_driveinfo() to understand its effect.
2606          */
2607         if (was_ignored) {
2608                 /*
2609                  * Previously ignored, but now not. Count it.
2610                  */
2611                 if (!ns->ns_ignore)
2612                         nvme->n_namespaces_attachable++;
2613         } else {
2614                 /*
2615                  * Wasn't ignored previously, but now needs to be.
2616                  * Discount it.
2617                  */
2618                 if (ns->ns_ignore)
2619                         nvme->n_namespaces_attachable--;
2620         }
2621 
2622         return (DDI_SUCCESS);
2623 }
2624 
2625 static int
2626 nvme_init(nvme_t *nvme)
2627 {
2628         nvme_reg_cc_t cc = { 0 };
2629         nvme_reg_aqa_t aqa = { 0 };
2630         nvme_reg_asq_t asq = { 0 };
2631         nvme_reg_acq_t acq = { 0 };
2632         nvme_reg_cap_t cap;
2633         nvme_reg_vs_t vs;
2634         nvme_reg_csts_t csts;
2635         int i = 0;
2636         uint16_t nqueues;
2637         uint_t tq_threads;
2638         char model[sizeof (nvme->n_idctl->id_model) + 1];
2639         char *vendor, *product;
2640 
2641         /* Check controller version */
2642         vs.r = nvme_get32(nvme, NVME_REG_VS);
2643         nvme->n_version.v_major = vs.b.vs_mjr;
2644         nvme->n_version.v_minor = vs.b.vs_mnr;
2645         dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
2646             nvme->n_version.v_major, nvme->n_version.v_minor);
2647 
2648         if (nvme->n_version.v_major > nvme_version_major) {
2649                 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x",
2650                     nvme_version_major);
2651                 if (nvme->n_strict_version)
2652                         goto fail;
2653         }
2654 
2655         /* retrieve controller configuration */
2656         cap.r = nvme_get64(nvme, NVME_REG_CAP);
2657 
2658         if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
2659                 dev_err(nvme->n_dip, CE_WARN,
2660                     "!NVM command set not supported by hardware");
2661                 goto fail;
2662         }
2663 
2664         nvme->n_nssr_supported = cap.b.cap_nssrs;
2665         nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
2666         nvme->n_timeout = cap.b.cap_to;
2667         nvme->n_arbitration_mechanisms = cap.b.cap_ams;
2668         nvme->n_cont_queues_reqd = cap.b.cap_cqr;
2669         nvme->n_max_queue_entries = cap.b.cap_mqes + 1;
2670 
2671         /*
2672          * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
2673          * the base page size of 4k (1<<12), so add 12 here to get the real
2674          * page size value.
2675          */
2676         nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT),
2677             cap.b.cap_mpsmax + 12);
2678         nvme->n_pagesize = 1UL << (nvme->n_pageshift);
2679 
2680         /*
2681          * Set up Queue DMA to transfer at least 1 page-aligned page at a time.
2682          */
2683         nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize;
2684         nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
2685 
2686         /*
2687          * Set up PRP DMA to transfer 1 page-aligned page at a time.
2688          * Maxxfer may be increased once the controller limits are known.
2689          */
2690         nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize;
2691         nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
2692         nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize;
2693         nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1;
2694 
2695         /*
2696          * Reset controller if it's still in ready state.
2697          */
2698         if (nvme_reset(nvme, B_FALSE) == B_FALSE) {
2699                 dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller");
2700                 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
2701                 nvme->n_dead = B_TRUE;
2702                 goto fail;
2703         }
2704 
2705         /*
2706          * Create the cq array with one completion queue to be assigned
2707          * to the admin queue pair and a limited number of taskqs (4).
2708          */
2709         if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) !=
2710             DDI_SUCCESS) {
2711                 dev_err(nvme->n_dip, CE_WARN,
2712                     "!failed to pre-allocate admin completion queue");
2713                 goto fail;
2714         }
2715         /*
2716          * Create the admin queue pair.
2717          */
2718         if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
2719             != DDI_SUCCESS) {
2720                 dev_err(nvme->n_dip, CE_WARN,
2721                     "!unable to allocate admin qpair");
2722                 goto fail;
2723         }
2724         nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP);
2725         nvme->n_ioq[0] = nvme->n_adminq;
2726 
2727         nvme->n_progress |= NVME_ADMIN_QUEUE;
2728 
2729         (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2730             "admin-queue-len", nvme->n_admin_queue_len);
2731 
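             /* The admin queue sizes in AQA are 0-based values. */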
2732         aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
2733         asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
2734         acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress;
2735 
2736         ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
2737         ASSERT((acq & (nvme->n_pagesize - 1)) == 0);
2738 
2739         nvme_put32(nvme, NVME_REG_AQA, aqa.r);
2740         nvme_put64(nvme, NVME_REG_ASQ, asq);
2741         nvme_put64(nvme, NVME_REG_ACQ, acq);
2742 
2743         cc.b.cc_ams = 0;        /* use Round-Robin arbitration */
2744         cc.b.cc_css = 0;        /* use NVM command set */
2745         cc.b.cc_mps = nvme->n_pageshift - 12;
2746         cc.b.cc_shn = 0;        /* no shutdown in progress */
2747         cc.b.cc_en = 1;         /* enable controller */
2748         cc.b.cc_iosqes = 6;     /* submission queue entry is 2^6 bytes long */
2749         cc.b.cc_iocqes = 4;     /* completion queue entry is 2^4 bytes long */
2750 
2751         nvme_put32(nvme, NVME_REG_CC, cc.r);
2752 
2753         /*
2754          * Wait for the controller to become ready.
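              * We poll every 50ms for up to CAP.TO (n_timeout, in 500ms units).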
2755          */
2756         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2757         if (csts.b.csts_rdy == 0) {
2758                 for (i = 0; i != nvme->n_timeout * 10; i++) {
2759                         delay(drv_usectohz(50000));
2760                         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2761 
2762                         if (csts.b.csts_cfs == 1) {
2763                                 dev_err(nvme->n_dip, CE_WARN,
2764                                     "!controller fatal status at init");
2765                                 ddi_fm_service_impact(nvme->n_dip,
2766                                     DDI_SERVICE_LOST);
2767                                 nvme->n_dead = B_TRUE;
2768                                 goto fail;
2769                         }
2770 
2771                         if (csts.b.csts_rdy == 1)
2772                                 break;
2773                 }
2774         }
2775 
2776         if (csts.b.csts_rdy == 0) {
2777                 dev_err(nvme->n_dip, CE_WARN, "!controller not ready");
2778                 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
2779                 nvme->n_dead = B_TRUE;
2780                 goto fail;
2781         }
2782 
2783         /*
2784          * Assume an abort command limit of 1. We'll destroy and re-initialize
2785          * the semaphore once we know the true abort command limit.
2786          */
2787         sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);
2788 
2789         /*
2790          * Setup initial interrupt for admin queue.
2791          */
2792         if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
2793             != DDI_SUCCESS) &&
2794             (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
2795             != DDI_SUCCESS) &&
2796             (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
2797             != DDI_SUCCESS)) {
2798                 dev_err(nvme->n_dip, CE_WARN,
2799                     "!failed to setup initial interrupt");
2800                 goto fail;
2801         }
2802 
2803         /*
2804          * Post an asynchronous event command to catch errors.
2805          * We assume asynchronous events are supported as required by the
2806          * specification (Figure 40 in section 5 of NVMe 1.2).
2807          * However, since at least qemu does not follow the specification,
2808          * we need a mechanism to protect ourselves.
2809          */
2810         nvme->n_async_event_supported = B_TRUE;
2811         nvme_async_event(nvme);
2812 
2813         /*
2814          * Identify Controller
2815          */
2816         if (nvme_identify(nvme, B_FALSE, 0, (void **)&nvme->n_idctl) != 0) {
2817                 dev_err(nvme->n_dip, CE_WARN,
2818                     "!failed to identify controller");
2819                 goto fail;
2820         }
2821 
2822         /*
2823          * Get Vendor & Product ID
2824          */
2825         bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
2826         model[sizeof (nvme->n_idctl->id_model)] = '\0';
2827         sata_split_model(model, &vendor, &product);
2828 
2829         if (vendor == NULL)
2830                 nvme->n_vendor = strdup("NVMe");
2831         else
2832                 nvme->n_vendor = strdup(vendor);
2833 
2834         nvme->n_product = strdup(product);
2835 
2836         /*
2837          * Get controller limits.
2838          */
2839         nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT,
2840             MIN(nvme->n_admin_queue_len / 10,
2841             MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit)));
2842 
2843         (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2844             "async-event-limit", nvme->n_async_event_limit);
2845 
2846         nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1;
2847 
2848         /*
2849          * Reinitialize the semaphore with the true abort command limit
2850          * supported by the hardware. It's not necessary to disable interrupts
2851          * as only command aborts use the semaphore, and no commands are
2852          * executed or aborted while we're here.
2853          */
2854         sema_destroy(&nvme->n_abort_sema);
2855         sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL,
2856             SEMA_DRIVER, NULL);
2857 
2858         nvme->n_progress |= NVME_CTRL_LIMITS;
2859 
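             /*
              * MDTS is reported as a power of two multiple of the page size. A
              * value of 0 means the controller reports no limit, so use a large
              * value in that case.
              */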
2860         if (nvme->n_idctl->id_mdts == 0)
2861                 nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536;
2862         else
2863                 nvme->n_max_data_transfer_size =
2864                     1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts);
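             /*
              * Example with illustrative values: for 4k pages
              * (n_pageshift = 12) and id_mdts = 5 this works out to
              *     1ull << (12 + 5) = 128k.
              * An id_mdts of 0 means the controller reports no limit, so the
              * large default computed above is used instead.
              */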
2865 
2866         nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1;
2867 
2868         /*
2869          * Limit n_max_data_transfer_size to what we can handle in one PRP.
2870          * Chained PRPs are currently unsupported.
2871          *
2872          * This is a no-op on hardware which doesn't support a transfer size
2873          * big enough to require chained PRPs.
2874          */
2875         nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size,
2876             (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize));
2877 
2878         nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size;
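             /*
              * A sketch of the math above, assuming 4k pages: one PRP list
              * page holds 4096 / 8 = 512 entries, each describing one 4k
              * page, so a single unchained PRP list covers at most
              *     512 * 4096 = 2MB per transfer.
              */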
2879 
2880         /*
2881          * Make sure the minimum/maximum queue entry sizes are not
2882          * larger/smaller than the default.
2883          */
2884 
2885         if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
2886             ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
2887             ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
2888             ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
2889                 goto fail;
2890 
2891         /*
2892          * Check for the presence of a Volatile Write Cache. If present,
2893          * enable or disable based on the value of the property
2894          * volatile-write-cache-enable (default is enabled).
2895          */
2896         nvme->n_write_cache_present =
2897             nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE;
2898 
2899         (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2900             "volatile-write-cache-present",
2901             nvme->n_write_cache_present ? 1 : 0);
2902 
2903         if (!nvme->n_write_cache_present) {
2904                 nvme->n_write_cache_enabled = B_FALSE;
2905         } else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)
2906             != 0) {
2907                 dev_err(nvme->n_dip, CE_WARN,
2908                     "!failed to %sable volatile write cache",
2909                     nvme->n_write_cache_enabled ? "en" : "dis");
2910                 /*
2911                  * Assume the cache is (still) enabled.
2912                  */
2913                 nvme->n_write_cache_enabled = B_TRUE;
2914         }
2915 
2916         (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2917             "volatile-write-cache-enable",
2918             nvme->n_write_cache_enabled ? 1 : 0);
2919 
2920         /*
2921          * Assume LBA Range Type feature is supported. If it isn't this
2922          * will be set to B_FALSE by nvme_get_features().
2923          */
2924         nvme->n_lba_range_supported = B_TRUE;
2925 
2926         /*
2927          * Check support for Autonomous Power State Transition.
2928          */
2929         if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
2930                 nvme->n_auto_pst_supported =
2931                     nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE;
2932 
2933         /*
2934          * Assume Software Progress Marker feature is supported.  If it isn't
2935          * this will be set to B_FALSE by nvme_get_features().
2936          */
2937         nvme->n_progress_supported = B_TRUE;
2938 
2939         /*
2940          * Identify Namespaces
2941          */
2942         nvme->n_namespace_count = nvme->n_idctl->id_nn;
2943 
2944         if (nvme->n_namespace_count == 0) {
2945                 dev_err(nvme->n_dip, CE_WARN,
2946                     "!controllers without namespaces are not supported");
2947                 goto fail;
2948         }
2949 
2950         if (nvme->n_namespace_count > NVME_MINOR_MAX) {
2951                 dev_err(nvme->n_dip, CE_WARN,
2952                     "!too many namespaces: %d, limiting to %d\n",
2953                     nvme->n_namespace_count, NVME_MINOR_MAX);
2954                 nvme->n_namespace_count = NVME_MINOR_MAX;
2955         }
2956 
2957         nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
2958             nvme->n_namespace_count, KM_SLEEP);
2959 
2960         for (i = 0; i != nvme->n_namespace_count; i++) {
2961                 mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER,
2962                     NULL);
2963                 nvme->n_ns[i].ns_ignore = B_TRUE;
2964                 if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS)
2965                         goto fail;
2966         }
2967 
2968         /*
2969          * Try to set up MSI/MSI-X interrupts.
2970          */
2971         if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX))
2972             != 0) {
2973                 nvme_release_interrupts(nvme);
2974 
2975                 nqueues = MIN(UINT16_MAX, ncpus);
2976 
2977                 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
2978                     nqueues) != DDI_SUCCESS) &&
2979                     (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
2980                     nqueues) != DDI_SUCCESS)) {
2981                         dev_err(nvme->n_dip, CE_WARN,
2982                             "!failed to setup MSI/MSI-X interrupts");
2983                         goto fail;
2984                 }
2985         }
2986 
2987         /*
2988          * Create I/O queue pairs.
2989          */
2990 
2991         if (nvme_set_nqueues(nvme) != 0) {
2992                 dev_err(nvme->n_dip, CE_WARN,
2993                     "!failed to set number of I/O queues to %d",
2994                     nvme->n_intr_cnt);
2995                 goto fail;
2996         }
2997 
2998         /*
2999          * Reallocate I/O queue array
3000          */
3001         kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
3002         nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
3003             (nvme->n_submission_queues + 1), KM_SLEEP);
3004         nvme->n_ioq[0] = nvme->n_adminq;
3005 
3006         /*
3007          * There should always be at least as many submission queues
3008          * as completion queues.
3009          */
3010         ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues);
3011 
3012         nvme->n_ioq_count = nvme->n_submission_queues;
3013 
3014         nvme->n_io_squeue_len =
3015             MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries);
3016 
3017         (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len",
3018             nvme->n_io_squeue_len);
3019 
3020         /*
3021          * Pre-allocate completion queues.
3022          * When the number of submission and completion queues is the
3023          * same, there is no value in having a larger completion
3024          * queue length.
3025          */
3026         if (nvme->n_submission_queues == nvme->n_completion_queues)
3027                 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
3028                     nvme->n_io_squeue_len);
3029 
3030         nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
3031             nvme->n_max_queue_entries);
3032 
3033         (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
3034             nvme->n_io_cqueue_len);
3035 
3036         /*
3037          * Assign an equal number of taskq threads to each completion
3038          * queue, capping the total number of threads at the number
3039          * of CPUs.
3040          */
3041         tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues;
3042 
3043         /*
3044          * In case the calculation above is zero, we need at least one
3045          * thread per completion queue.
3046          */
3047         tq_threads = MAX(1, tq_threads);
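             /*
              * For example (illustrative numbers): with 32 CPUs and 8
              * completion queues each queue gets 4 taskq threads; with
              * 4 CPUs and 8 completion queues the division yields 0, which
              * the MAX above bumps back up to 1 thread per queue.
              */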
3048 
3049         if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
3050             nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) {
3051                 dev_err(nvme->n_dip, CE_WARN,
3052                     "!failed to pre-allocate completion queues");
3053                 goto fail;
3054         }
3055 
3056         /*
3057          * If we use fewer completion queues than interrupt vectors,
3058          * return some of the interrupt vectors back to the system.
3059          */
3060         if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) {
3061                 nvme_release_interrupts(nvme);
3062 
3063                 if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
3064                     nvme->n_completion_queues + 1) != DDI_SUCCESS) {
3065                         dev_err(nvme->n_dip, CE_WARN,
3066                             "!failed to reduce number of interrupts");
3067                         goto fail;
3068                 }
3069         }
3070 
3071         /*
3072          * Alloc & register I/O queue pairs
3073          */
3074 
3075         for (i = 1; i != nvme->n_ioq_count + 1; i++) {
3076                 if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len,
3077                     &nvme->n_ioq[i], i) != DDI_SUCCESS) {
3078                         dev_err(nvme->n_dip, CE_WARN,
3079                             "!unable to allocate I/O qpair %d", i);
3080                         goto fail;
3081                 }
3082 
3083                 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
3084                         dev_err(nvme->n_dip, CE_WARN,
3085                             "!unable to create I/O qpair %d", i);
3086                         goto fail;
3087                 }
3088         }
3089 
3090         /*
3091          * Post more asynchronous event commands to reduce event reporting
3092          * latency, as suggested by the spec.
3093          */
3094         if (nvme->n_async_event_supported) {
3095                 for (i = 1; i != nvme->n_async_event_limit; i++)
3096                         nvme_async_event(nvme);
3097         }
3098 
3099         return (DDI_SUCCESS);
3100 
3101 fail:
3102         (void) nvme_reset(nvme, B_FALSE);
3103         return (DDI_FAILURE);
3104 }
3105 
3106 static uint_t
3107 nvme_intr(caddr_t arg1, caddr_t arg2)
3108 {
3109         /*LINTED: E_PTR_BAD_CAST_ALIGN*/
3110         nvme_t *nvme = (nvme_t *)arg1;
3111         int inum = (int)(uintptr_t)arg2;
3112         int ccnt = 0;
3113         int qnum;
3114 
3115         if (inum >= nvme->n_intr_cnt)
3116                 return (DDI_INTR_UNCLAIMED);
3117 
3118         if (nvme->n_dead)
3119                 return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
3120                     DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);
3121 
3122         /*
3123          * The interrupt vector a queue uses is calculated as queue_idx %
3124          * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
3125          * in steps of n_intr_cnt to process all queues using this vector.
3126          */
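             /*
              * E.g. (hypothetical numbers) with n_intr_cnt = 4 and 9
              * completion queues (index 0 being the admin queue), vector 1
              * services queues 1 and 5, vector 2 services queues 2 and 6,
              * and so on.
              */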
3127         for (qnum = inum;
3128             qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL;
3129             qnum += nvme->n_intr_cnt) {
3130                 ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
3131         }
3132 
3133         return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
3134 }
3135 
3136 static void
3137 nvme_release_interrupts(nvme_t *nvme)
3138 {
3139         int i;
3140 
3141         for (i = 0; i < nvme->n_intr_cnt; i++) {
3142                 if (nvme->n_inth[i] == NULL)
3143                         break;
3144 
3145                 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
3146                         (void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
3147                 else
3148                         (void) ddi_intr_disable(nvme->n_inth[i]);
3149 
3150                 (void) ddi_intr_remove_handler(nvme->n_inth[i]);
3151                 (void) ddi_intr_free(nvme->n_inth[i]);
3152         }
3153 
3154         kmem_free(nvme->n_inth, nvme->n_inth_sz);
3155         nvme->n_inth = NULL;
3156         nvme->n_inth_sz = 0;
3157 
3158         nvme->n_progress &= ~NVME_INTERRUPTS;
3159 }
3160 
3161 static int
3162 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs)
3163 {
3164         int nintrs, navail, count;
3165         int ret;
3166         int i;
3167 
3168         if (nvme->n_intr_types == 0) {
3169                 ret = ddi_intr_get_supported_types(nvme->n_dip,
3170                     &nvme->n_intr_types);
3171                 if (ret != DDI_SUCCESS) {
3172                         dev_err(nvme->n_dip, CE_WARN,
3173                             "!%s: ddi_intr_get_supported types failed",
3174                             __func__);
3175                         return (ret);
3176                 }
3177 #ifdef __x86
3178                 if (get_hwenv() == HW_VMWARE)
3179                         nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX;
3180 #endif
3181         }
3182 
3183         if ((nvme->n_intr_types & intr_type) == 0)
3184                 return (DDI_FAILURE);
3185 
3186         ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs);
3187         if (ret != DDI_SUCCESS) {
3188                 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed",
3189                     __func__);
3190                 return (ret);
3191         }
3192 
3193         ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail);
3194         if (ret != DDI_SUCCESS) {
3195                 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed",
3196                     __func__);
3197                 return (ret);
3198         }
3199 
3200         /* We want at most one interrupt per queue pair. */
3201         if (navail > nqpairs)
3202                 navail = nqpairs;
3203 
3204         nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail;
3205         nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP);
3206 
3207         ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail,
3208             &count, 0);
3209         if (ret != DDI_SUCCESS) {
3210                 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed",
3211                     __func__);
3212                 goto fail;
3213         }
3214 
3215         nvme->n_intr_cnt = count;
3216 
3217         ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri);
3218         if (ret != DDI_SUCCESS) {
3219                 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed",
3220                     __func__);
3221                 goto fail;
3222         }
3223 
3224         for (i = 0; i < count; i++) {
3225                 ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr,
3226                     (void *)nvme, (void *)(uintptr_t)i);
3227                 if (ret != DDI_SUCCESS) {
3228                         dev_err(nvme->n_dip, CE_WARN,
3229                             "!%s: ddi_intr_add_handler failed", __func__);
3230                         goto fail;
3231                 }
3232         }
3233 
3234         (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap);
3235 
3236         for (i = 0; i < count; i++) {
3237                 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
3238                         ret = ddi_intr_block_enable(&nvme->n_inth[i], 1);
3239                 else
3240                         ret = ddi_intr_enable(nvme->n_inth[i]);
3241 
3242                 if (ret != DDI_SUCCESS) {
3243                         dev_err(nvme->n_dip, CE_WARN,
3244                             "!%s: enabling interrupt %d failed", __func__, i);
3245                         goto fail;
3246                 }
3247         }
3248 
3249         nvme->n_intr_type = intr_type;
3250 
3251         nvme->n_progress |= NVME_INTERRUPTS;
3252 
3253         return (DDI_SUCCESS);
3254 
3255 fail:
3256         nvme_release_interrupts(nvme);
3257 
3258         return (ret);
3259 }
3260 
3261 static int
3262 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
3263 {
3264         _NOTE(ARGUNUSED(arg));
3265 
3266         pci_ereport_post(dip, fm_error, NULL);
3267         return (fm_error->fme_status);
3268 }
3269 
3270 static int
3271 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3272 {
3273         nvme_t *nvme;
3274         int instance;
3275         int nregs;
3276         off_t regsize;
3277         int i;
3278         char name[32];
3279         bd_ops_t ops = nvme_bd_ops;
3280 
3281         if (cmd != DDI_ATTACH)
3282                 return (DDI_FAILURE);
3283 
3284         instance = ddi_get_instance(dip);
3285 
3286         if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS)
3287                 return (DDI_FAILURE);
3288 
3289         nvme = ddi_get_soft_state(nvme_state, instance);
3290         ddi_set_driver_private(dip, nvme);
3291         nvme->n_dip = dip;
3292 
3293         mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL);
3294 
3295         nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3296             DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE;
3297         nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY,
3298             dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ?
3299             B_TRUE : B_FALSE;
3300         nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3301             DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
3302         nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3303             DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN);
3304         /*
3305          * Double up the default for completion queues in case of
3306          * queue sharing.
3307          */
3308         nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3309             DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN);
3310         nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3311             DDI_PROP_DONTPASS, "async-event-limit",
3312             NVME_DEFAULT_ASYNC_EVENT_LIMIT);
3313         nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3314             DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ?
3315             B_TRUE : B_FALSE;
3316         nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3317             DDI_PROP_DONTPASS, "min-phys-block-size",
3318             NVME_DEFAULT_MIN_BLOCK_SIZE);
3319         nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3320             DDI_PROP_DONTPASS, "max-submission-queues", -1);
3321         nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3322             DDI_PROP_DONTPASS, "max-completion-queues", -1);
3323 
3324         if (!ISP2(nvme->n_min_block_size) ||
3325             (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) {
3326                 dev_err(dip, CE_WARN, "!min-phys-block-size %s, "
3327                     "using default %d", ISP2(nvme->n_min_block_size) ?
3328                     "too low" : "not a power of 2",
3329                     NVME_DEFAULT_MIN_BLOCK_SIZE);
3330                 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
3331         }
3332 
3333         if (nvme->n_submission_queues != -1 &&
3334             (nvme->n_submission_queues < 1 ||
3335             nvme->n_submission_queues > UINT16_MAX)) {
3336                 dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not "
3337                     "valid. Must be [1..%d]", nvme->n_submission_queues,
3338                     UINT16_MAX);
3339                 nvme->n_submission_queues = -1;
3340         }
3341 
3342         if (nvme->n_completion_queues != -1 &&
3343             (nvme->n_completion_queues < 1 ||
3344             nvme->n_completion_queues > UINT16_MAX)) {
3345                 dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not "
3346                     "valid. Must be [1..%d]", nvme->n_completion_queues,
3347                     UINT16_MAX);
3348                 nvme->n_completion_queues = -1;
3349         }
3350 
3351         if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
3352                 nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
3353         else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
3354                 nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;
3355 
3356         if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN)
3357                 nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN;
3358         if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN)
3359                 nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN;
3360 
3361         if (nvme->n_async_event_limit < 1)
3362                 nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;
3363 
3364         nvme->n_reg_acc_attr = nvme_reg_acc_attr;
3365         nvme->n_queue_dma_attr = nvme_queue_dma_attr;
3366         nvme->n_prp_dma_attr = nvme_prp_dma_attr;
3367         nvme->n_sgl_dma_attr = nvme_sgl_dma_attr;
3368 
3369         /*
3370          * Setup FMA support.
3371          */
3372         nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip,
3373             DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable",
3374             DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
3375             DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE);
3376 
3377         ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc);
3378 
3379         if (nvme->n_fm_cap) {
3380                 if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE)
3381                         nvme->n_reg_acc_attr.devacc_attr_access =
3382                             DDI_FLAGERR_ACC;
3383 
3384                 if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) {
3385                         nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
3386                         nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
3387                 }
3388 
3389                 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
3390                     DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
3391                         pci_ereport_setup(dip);
3392 
3393                 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
3394                         ddi_fm_handler_register(dip, nvme_fm_errcb,
3395                             (void *)nvme);
3396         }
3397 
3398         nvme->n_progress |= NVME_FMA_INIT;
3399 
3400         /*
3401          * The spec defines several register sets. Only the controller
3402          * registers (set 1) are currently used.
3403          */
3404         if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE ||
3405             nregs < 2 ||
3406             ddi_dev_regsize(dip, 1, &regsize) == DDI_FAILURE)
3407                 goto fail;
3408 
3409         if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize,
3410             &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) {
3411                 dev_err(dip, CE_WARN, "!failed to map regset 1");
3412                 goto fail;
3413         }
3414 
3415         nvme->n_progress |= NVME_REGS_MAPPED;
3416 
3417         /*
3418          * Create PRP DMA cache
3419          */
3420         (void) snprintf(name, sizeof (name), "%s%d_prp_cache",
3421             ddi_driver_name(dip), ddi_get_instance(dip));
3422         nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t),
3423             0, nvme_prp_dma_constructor, nvme_prp_dma_destructor,
3424             NULL, (void *)nvme, NULL, 0);
3425 
3426         if (nvme_init(nvme) != DDI_SUCCESS)
3427                 goto fail;
3428 
3429         if (!nvme->n_idctl->id_oncs.on_dset_mgmt)
3430                 ops.o_free_space = NULL;
3431 
3432         /*
3433          * Initialize the driver with the UFM subsystem
3434          */
3435         if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops,
3436             &nvme->n_ufmh, nvme) != 0) {
3437                 dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem");
3438                 goto fail;
3439         }
3440         mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL);
3441         ddi_ufm_update(nvme->n_ufmh);
3442         nvme->n_progress |= NVME_UFM_INIT;
3443 
3444         /*
3445          * Attach the blkdev driver for each namespace.
3446          */
3447         for (i = 0; i != nvme->n_namespace_count; i++) {
3448                 if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name,
3449                     S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1),
3450                     DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) {
3451                         dev_err(dip, CE_WARN,
3452                             "!failed to create minor node for namespace %d", i);
3453                         goto fail;
3454                 }
3455 
3456                 if (nvme->n_ns[i].ns_ignore)
3457                         continue;
3458 
3459                 nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i],
3460                     &ops, &nvme->n_prp_dma_attr, KM_SLEEP);
3461 
3462                 if (nvme->n_ns[i].ns_bd_hdl == NULL) {
3463                         dev_err(dip, CE_WARN,
3464                             "!failed to get blkdev handle for namespace %d", i);
3465                         goto fail;
3466                 }
3467 
3468                 if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl)
3469                     != DDI_SUCCESS) {
3470                         dev_err(dip, CE_WARN,
3471                             "!failed to attach blkdev handle for namespace %d",
3472                             i);
3473                         goto fail;
3474                 }
3475         }
3476 
3477         if (ddi_create_minor_node(dip, "devctl", S_IFCHR,
3478             NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0)
3479             != DDI_SUCCESS) {
3480                 dev_err(dip, CE_WARN, "nvme_attach: "
3481                     "cannot create devctl minor node");
3482                 goto fail;
3483         }
3484 
3485         return (DDI_SUCCESS);
3486 
3487 fail:
3488         /* Attach successfully anyway so that FMA can retire the device. */
3489         if (nvme->n_dead)
3490                 return (DDI_SUCCESS);
3491 
3492         (void) nvme_detach(dip, DDI_DETACH);
3493 
3494         return (DDI_FAILURE);
3495 }
3496 
3497 static int
3498 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3499 {
3500         int instance, i;
3501         nvme_t *nvme;
3502 
3503         if (cmd != DDI_DETACH)
3504                 return (DDI_FAILURE);
3505 
3506         instance = ddi_get_instance(dip);
3507 
3508         nvme = ddi_get_soft_state(nvme_state, instance);
3509 
3510         if (nvme == NULL)
3511                 return (DDI_FAILURE);
3512 
3513         ddi_remove_minor_node(dip, "devctl");
3514         mutex_destroy(&nvme->n_minor.nm_mutex);
3515 
3516         if (nvme->n_ns) {
3517                 for (i = 0; i != nvme->n_namespace_count; i++) {
3518                         ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name);
3519                         mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex);
3520 
3521                         if (nvme->n_ns[i].ns_bd_hdl) {
3522                                 (void) bd_detach_handle(
3523                                     nvme->n_ns[i].ns_bd_hdl);
3524                                 bd_free_handle(nvme->n_ns[i].ns_bd_hdl);
3525                         }
3526 
3527                         if (nvme->n_ns[i].ns_idns)
3528                                 kmem_free(nvme->n_ns[i].ns_idns,
3529                                     sizeof (nvme_identify_nsid_t));
3530                         if (nvme->n_ns[i].ns_devid)
3531                                 strfree(nvme->n_ns[i].ns_devid);
3532                 }
3533 
3534                 kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
3535                     nvme->n_namespace_count);
3536         }
3537         if (nvme->n_progress & NVME_UFM_INIT) {
3538                 ddi_ufm_fini(nvme->n_ufmh);
3539                 mutex_destroy(&nvme->n_fwslot_mutex);
3540         }
3541 
3542         if (nvme->n_progress & NVME_INTERRUPTS)
3543                 nvme_release_interrupts(nvme);
3544 
3545         for (i = 0; i < nvme->n_cq_count; i++) {
3546                 if (nvme->n_cq[i]->ncq_cmd_taskq != NULL)
3547                         taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq);
3548         }
3549 
3550         if (nvme->n_ioq_count > 0) {
3551                 for (i = 1; i != nvme->n_ioq_count + 1; i++) {
3552                         if (nvme->n_ioq[i] != NULL) {
3553                                 /* TODO: send destroy queue commands */
3554                                 nvme_free_qpair(nvme->n_ioq[i]);
3555                         }
3556                 }
3557 
3558                 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
3559                     (nvme->n_ioq_count + 1));
3560         }
3561 
3562         if (nvme->n_prp_cache != NULL) {
3563                 kmem_cache_destroy(nvme->n_prp_cache);
3564         }
3565 
3566         if (nvme->n_progress & NVME_REGS_MAPPED) {
3567                 nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE);
3568                 (void) nvme_reset(nvme, B_FALSE);
3569         }
3570 
3571         if (nvme->n_progress & NVME_CTRL_LIMITS)
3572                 sema_destroy(&nvme->n_abort_sema);
3573 
3574         if (nvme->n_progress & NVME_ADMIN_QUEUE)
3575                 nvme_free_qpair(nvme->n_adminq);
3576 
3577         if (nvme->n_cq_count > 0) {
3578                 nvme_destroy_cq_array(nvme, 0);
3579                 nvme->n_cq = NULL;
3580                 nvme->n_cq_count = 0;
3581         }
3582 
3583         if (nvme->n_idctl)
3584                 kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);
3585 
3586         if (nvme->n_progress & NVME_REGS_MAPPED)
3587                 ddi_regs_map_free(&nvme->n_regh);
3588 
3589         if (nvme->n_progress & NVME_FMA_INIT) {
3590                 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
3591                         ddi_fm_handler_unregister(nvme->n_dip);
3592 
3593                 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
3594                     DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
3595                         pci_ereport_teardown(nvme->n_dip);
3596 
3597                 ddi_fm_fini(nvme->n_dip);
3598         }
3599 
3600         if (nvme->n_vendor != NULL)
3601                 strfree(nvme->n_vendor);
3602 
3603         if (nvme->n_product != NULL)
3604                 strfree(nvme->n_product);
3605 
3606         ddi_soft_state_free(nvme_state, instance);
3607 
3608         return (DDI_SUCCESS);
3609 }
3610 
3611 static int
3612 nvme_quiesce(dev_info_t *dip)
3613 {
3614         int instance;
3615         nvme_t *nvme;
3616 
3617         instance = ddi_get_instance(dip);
3618 
3619         nvme = ddi_get_soft_state(nvme_state, instance);
3620 
3621         if (nvme == NULL)
3622                 return (DDI_FAILURE);
3623 
3624         nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE);
3625 
3626         (void) nvme_reset(nvme, B_TRUE);
3627 
3628         return (DDI_SUCCESS);
3629 }
3630 
3631 static int
3632 nvme_fill_prp(nvme_cmd_t *cmd, bd_xfer_t *xfer)
3633 {
3634         nvme_t *nvme = cmd->nc_nvme;
3635         int nprp_page, nprp;
3636         uint64_t *prp;
3637 
3638         if (xfer->x_ndmac == 0)
3639                 return (DDI_FAILURE);
3640 
3641         cmd->nc_sqe.sqe_dptr.d_prp[0] = xfer->x_dmac.dmac_laddress;
3642 
3643         if (xfer->x_ndmac == 1) {
3644                 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
3645                 return (DDI_SUCCESS);
3646         } else if (xfer->x_ndmac == 2) {
3647                 ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac);
3648                 cmd->nc_sqe.sqe_dptr.d_prp[1] = xfer->x_dmac.dmac_laddress;
3649                 return (DDI_SUCCESS);
3650         }
3651 
3652         xfer->x_ndmac--;
3653 
3654         nprp_page = nvme->n_pagesize / sizeof (uint64_t);
3655         ASSERT(nprp_page > 0);
3656         nprp = (xfer->x_ndmac + nprp_page - 1) / nprp_page;
3657 
3658         /*
3659          * We currently don't support chained PRPs and set up our DMA
3660          * attributes to reflect that. If we still get an I/O request
3661          * that needs a chained PRP, something is very wrong.
3662          */
3663         VERIFY(nprp == 1);
3664 
3665         cmd->nc_dma = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP);
3666         bzero(cmd->nc_dma->nd_memp, cmd->nc_dma->nd_len);
3667 
3668         cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_dma->nd_cookie.dmac_laddress;
3669 
3670         /*LINTED: E_PTR_BAD_CAST_ALIGN*/
3671         for (prp = (uint64_t *)cmd->nc_dma->nd_memp;
3672             xfer->x_ndmac > 0;
3673             prp++, xfer->x_ndmac--) {
3674                 ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac);
3675                 *prp = xfer->x_dmac.dmac_laddress;
3676         }
3677 
3678         (void) ddi_dma_sync(cmd->nc_dma->nd_dmah, 0, cmd->nc_dma->nd_len,
3679             DDI_DMA_SYNC_FORDEV);
3680         return (DDI_SUCCESS);
3681 }
3682 
3683 /*
3684  * The maximum number of requests supported for a deallocate request is
3685  * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and
3686  * unchanged through at least 1.4a). The definition of nvme_range_t is also
3687  * from the NVMe 1.1 spec. Together, the result is that all of the ranges for
3688  * a deallocate request will fit into the smallest supported namespace page
3689  * (4k).
3690  */
3691 CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096);
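     /*
      * As a sanity check on the assertion above, using the range format from
      * the NVMe 1.1 spec: each entry is 4 bytes of context attributes, 4 bytes
      * of length and 8 bytes of starting LBA, i.e. 16 bytes, and
      * 16 * 256 = 4096.
      */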
3692 
3693 static int
3694 nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize,
3695     int allocflag)
3696 {
3697         const dkioc_free_list_t *dfl = xfer->x_dfl;
3698         const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
3699         nvme_t *nvme = cmd->nc_nvme;
3700         nvme_range_t *ranges = NULL;
3701         uint_t i;
3702 
3703         /*
3704          * The number of ranges in the request is zero-based (that is
3705          * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ...,
3706          * word10 == 255 -> 256 ranges). Therefore the allowed values are
3707          * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request,
3708          * we either provided bad info in nvme_bd_driveinfo() or there is a bug
3709          * in blkdev.
3710          */
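             /*
              * E.g. a request with dfl_num_exts = 1 is encoded below as
              * cdw10 = 0, and one with the maximum of 256 extents as
              * cdw10 = 255.
              */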
3711         VERIFY3U(dfl->dfl_num_exts, >, 0);
3712         VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES);
3713         cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff;
3714 
3715         cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE;
3716 
3717         cmd->nc_dma = kmem_cache_alloc(nvme->n_prp_cache, allocflag);
3718         if (cmd->nc_dma == NULL)
3719                 return (DDI_FAILURE);
3720 
3721         bzero(cmd->nc_dma->nd_memp, cmd->nc_dma->nd_len);
3722         ranges = (nvme_range_t *)cmd->nc_dma->nd_memp;
3723 
3724         cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
3725         cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
3726 
3727         for (i = 0; i < dfl->dfl_num_exts; i++) {
3728                 uint64_t lba, len;
3729 
3730                 lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize;
3731                 len = exts[i].dfle_length / blocksize;
3732 
3733                 VERIFY3U(len, <=, UINT32_MAX);
3734 
3735                 /* No context attributes for a deallocate request */
3736                 ranges[i].nr_ctxattr = 0;
3737                 ranges[i].nr_len = len;
3738                 ranges[i].nr_lba = lba;
3739         }
3740 
3741         (void) ddi_dma_sync(cmd->nc_dma->nd_dmah, 0, cmd->nc_dma->nd_len,
3742             DDI_DMA_SYNC_FORDEV);
3743 
3744         return (DDI_SUCCESS);
3745 }
3746 
3747 static nvme_cmd_t *
3748 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer)
3749 {
3750         nvme_t *nvme = ns->ns_nvme;
3751         nvme_cmd_t *cmd;
3752         int allocflag;
3753 
3754         /*
3755          * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
3756          */
3757         allocflag = (xfer->x_flags & BD_XFER_POLL) ? KM_NOSLEEP : KM_SLEEP;
3758         cmd = nvme_alloc_cmd(nvme, allocflag);
3759 
3760         if (cmd == NULL)
3761                 return (NULL);
3762 
3763         cmd->nc_sqe.sqe_opc = opc;
3764         cmd->nc_callback = nvme_bd_xfer_done;
3765         cmd->nc_xfer = xfer;
3766 
3767         switch (opc) {
3768         case NVME_OPC_NVM_WRITE:
3769         case NVME_OPC_NVM_READ:
3770                 VERIFY(xfer->x_nblks <= 0x10000);
3771 
3772                 cmd->nc_sqe.sqe_nsid = ns->ns_id;
3773 
3774                 cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu;
3775                 cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32);
3776                 cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1);
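                     /*
                      * For instance (illustrative values): a read of 8 blocks
                      * starting at LBA 0x100000000 is encoded above as
                      *     cdw10 = 0 (low 32 bits of the LBA),
                      *     cdw11 = 1 (high 32 bits of the LBA),
                      *     cdw12 = 7 (zero-based block count).
                      */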
3777 
3778                 if (nvme_fill_prp(cmd, xfer) != DDI_SUCCESS)
3779                         goto fail;
3780                 break;
3781 
3782         case NVME_OPC_NVM_FLUSH:
3783                 cmd->nc_sqe.sqe_nsid = ns->ns_id;
3784                 break;
3785 
3786         case NVME_OPC_NVM_DSET_MGMT:
3787                 cmd->nc_sqe.sqe_nsid = ns->ns_id;
3788 
3789                 if (nvme_fill_ranges(cmd, xfer,
3790                     (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS)
3791                         goto fail;
3792                 break;
3793 
3794         default:
3795                 goto fail;
3796         }
3797 
3798         return (cmd);
3799 
3800 fail:
3801         nvme_free_cmd(cmd);
3802         return (NULL);
3803 }
3804 
3805 static void
3806 nvme_bd_xfer_done(void *arg)
3807 {
3808         nvme_cmd_t *cmd = arg;
3809         bd_xfer_t *xfer = cmd->nc_xfer;
3810         int error = 0;
3811 
3812         error = nvme_check_cmd_status(cmd);
3813         nvme_free_cmd(cmd);
3814 
3815         bd_xfer_done(xfer, error);
3816 }
3817 
3818 static void
3819 nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
3820 {
3821         nvme_namespace_t *ns = arg;
3822         nvme_t *nvme = ns->ns_nvme;
3823         uint_t ns_count = MAX(1, nvme->n_namespaces_attachable);
3824 
3825         /*
3826          * Set the blkdev qcount to the number of submission queues.
3827          * It will then create one waitq/runq pair for each submission
3828          * queue and spread I/O requests across the queues.
3829          */
3830         drive->d_qcount = nvme->n_ioq_count;
3831 
3832         /*
3833          * I/O activity to individual namespaces is distributed across
3834          * each of the d_qcount blkdev queues (which has been set to
3835          * the number of nvme submission queues). d_qsize is the number
3836          * of submitted and not completed I/Os within each queue that blkdev
3837          * will allow before it starts holding them in the waitq.
3838          *
3839          * Each namespace will create a child blkdev instance; for each one
3840          * we try to set the d_qsize so that each namespace gets an
3841          * equal portion of the submission queue.
3842          *
3843          * If, after instantiation of the nvme drive, n_namespaces_attachable
3844          * changes and a namespace is attached, it could calculate a
3845          * different d_qsize. It may even be that the sum of the d_qsizes is
3846          * now beyond the submission queue size. Should that be the case
3847          * and the I/O rate is such that blkdev attempts to submit more
3848          * I/Os than the size of the submission queue, the excess I/Os
3849          * will be held behind the semaphore nq_sema.
3850          */
3851         drive->d_qsize = nvme->n_io_squeue_len / ns_count;
3852 
3853         /*
3854          * Don't let the queue size drop below the minimum, though.
3855          */
3856         drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN);
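             /*
              * Worked example (numbers purely illustrative): with
              * n_io_squeue_len = 1024 and 4 attachable namespaces each child
              * blkdev instance gets d_qsize = 256; if the division ever fell
              * below NVME_MIN_IO_QUEUE_LEN the MAX above would raise it to
              * that minimum.
              */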
3857 
3858         /*
3859          * d_maxxfer is not set, which means the value is taken from the DMA
3860          * attributes specified to bd_alloc_handle.
3861          */
3862 
3863         drive->d_removable = B_FALSE;
3864         drive->d_hotpluggable = B_FALSE;
3865 
3866         bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64));
3867         drive->d_target = ns->ns_id;
3868         drive->d_lun = 0;
3869 
3870         drive->d_model = nvme->n_idctl->id_model;
3871         drive->d_model_len = sizeof (nvme->n_idctl->id_model);
3872         drive->d_vendor = nvme->n_vendor;
3873         drive->d_vendor_len = strlen(nvme->n_vendor);
3874         drive->d_product = nvme->n_product;
3875         drive->d_product_len = strlen(nvme->n_product);
3876         drive->d_serial = nvme->n_idctl->id_serial;
3877         drive->d_serial_len = sizeof (nvme->n_idctl->id_serial);
3878         drive->d_revision = nvme->n_idctl->id_fwrev;
3879         drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev);
3880 
3881         /*
3882          * If we support the dataset management command, the only
3883          * restriction on a discard request is the maximum number of
3884          * ranges (segments) in a single request.
3885          */
3886         if (nvme->n_idctl->id_oncs.on_dset_mgmt)
3887                 drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES;
3888 }
3889 
3890 static int
3891 nvme_bd_mediainfo(void *arg, bd_media_t *media)
3892 {
3893         nvme_namespace_t *ns = arg;
3894 
3895         media->m_nblks = ns->ns_block_count;
3896         media->m_blksize = ns->ns_block_size;
3897         media->m_readonly = B_FALSE;
3898         media->m_solidstate = B_TRUE;
3899 
3900         media->m_pblksize = ns->ns_best_block_size;
3901 
3902         return (0);
3903 }
3904 
3905 static int
3906 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
3907 {
3908         nvme_t *nvme = ns->ns_nvme;
3909         nvme_cmd_t *cmd;
3910         nvme_qpair_t *ioq;
3911         boolean_t poll;
3912         int ret;
3913 
3914         if (nvme->n_dead)
3915                 return (EIO);
3916 
3917         cmd = nvme_create_nvm_cmd(ns, opc, xfer);
3918         if (cmd == NULL)
3919                 return (ENOMEM);
3920 
3921         cmd->nc_sqid = xfer->x_qnum + 1;
3922         ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
3923         ioq = nvme->n_ioq[cmd->nc_sqid];
3924 
3925         /*
3926          * Get the polling flag before submitting the command. The command
3927          * may complete immediately after submission, which means we must
3928          * treat both cmd and xfer as if they had already been freed.
3929          */
3930         poll = (xfer->x_flags & BD_XFER_POLL) != 0;
3931 
3932         ret = nvme_submit_io_cmd(ioq, cmd);
3933 
3934         if (ret != 0)
3935                 return (ret);
3936 
3937         if (!poll)
3938                 return (0);
3939 
3940         do {
3941                 cmd = nvme_retrieve_cmd(nvme, ioq);
3942                 if (cmd != NULL)
3943                         cmd->nc_callback(cmd);
3944                 else
3945                         drv_usecwait(10);
3946         } while (ioq->nq_active_cmds != 0);
3947 
3948         return (0);
3949 }
3950 
3951 static int
3952 nvme_bd_read(void *arg, bd_xfer_t *xfer)
3953 {
3954         nvme_namespace_t *ns = arg;
3955 
3956         return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
3957 }
3958 
3959 static int
3960 nvme_bd_write(void *arg, bd_xfer_t *xfer)
3961 {
3962         nvme_namespace_t *ns = arg;
3963 
3964         return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
3965 }
3966 
3967 static int
3968 nvme_bd_sync(void *arg, bd_xfer_t *xfer)
3969 {
3970         nvme_namespace_t *ns = arg;
3971 
3972         if (ns->ns_nvme->n_dead)
3973                 return (EIO);
3974 
3975         /*
3976          * If the volatile write cache is not present or not enabled, the
3977          * FLUSH command is a no-op, so we can take a shortcut here.
3978          */
3979         if (!ns->ns_nvme->n_write_cache_present) {
3980                 bd_xfer_done(xfer, ENOTSUP);
3981                 return (0);
3982         }
3983 
3984         if (!ns->ns_nvme->n_write_cache_enabled) {
3985                 bd_xfer_done(xfer, 0);
3986                 return (0);
3987         }
3988 
3989         return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
3990 }
3991 
3992 static int
3993 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
3994 {
3995         nvme_namespace_t *ns = arg;
3996 
3997         /*LINTED: E_BAD_PTR_CAST_ALIGN*/
3998         if (*(uint64_t *)ns->ns_eui64 != 0) {
3999                 return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN,
4000                     sizeof (ns->ns_eui64), ns->ns_eui64, devid));
4001         } else {
4002                 return (ddi_devid_init(devinfo, DEVID_ENCAP,
4003                     strlen(ns->ns_devid), ns->ns_devid, devid));
4004         }
4005 }
4006 
4007 static int
4008 nvme_bd_free_space(void *arg, bd_xfer_t *xfer)
4009 {
4010         nvme_namespace_t *ns = arg;
4011 
4012         if (xfer->x_dfl == NULL)
4013                 return (EINVAL);
4014 
4015         if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt)
4016                 return (ENOTSUP);
4017 
4018         return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT));
4019 }
4020 
4021 static int
4022 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
4023 {
4024 #ifndef __lock_lint
4025         _NOTE(ARGUNUSED(cred_p));
4026 #endif
4027         minor_t minor = getminor(*devp);
4028         nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
4029         int nsid = NVME_MINOR_NSID(minor);
4030         nvme_minor_state_t *nm;
4031         int rv = 0;
4032 
4033         if (otyp != OTYP_CHR)
4034                 return (EINVAL);
4035 
4036         if (nvme == NULL)
4037                 return (ENXIO);
4038 
4039         if (nsid > nvme->n_namespace_count)
4040                 return (ENXIO);
4041 
4042         if (nvme->n_dead)
4043                 return (EIO);
4044 
4045         nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;
4046 
4047         mutex_enter(&nm->nm_mutex);
4048         if (nm->nm_oexcl) {
4049                 rv = EBUSY;
4050                 goto out;
4051         }
4052 
4053         if (flag & FEXCL) {
4054                 if (nm->nm_ocnt != 0) {
4055                         rv = EBUSY;
4056                         goto out;
4057                 }
4058                 nm->nm_oexcl = B_TRUE;
4059         }
4060 
4061         nm->nm_ocnt++;
4062 
4063 out:
4064         mutex_exit(&nm->nm_mutex);
4065         return (rv);
4066 
4067 }
4068 
4069 static int
4070 nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
4071 {
4072 #ifndef __lock_lint
4073         _NOTE(ARGUNUSED(cred_p));
4074         _NOTE(ARGUNUSED(flag));
4075 #endif
4076         minor_t minor = getminor(dev);
4077         nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
4078         int nsid = NVME_MINOR_NSID(minor);
4079         nvme_minor_state_t *nm;
4080 
4081         if (otyp != OTYP_CHR)
4082                 return (ENXIO);
4083 
4084         if (nvme == NULL)
4085                 return (ENXIO);
4086 
4087         if (nsid > nvme->n_namespace_count)
4088                 return (ENXIO);
4089 
4090         nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;
4091 
4092         mutex_enter(&nm->nm_mutex);
4093         if (nm->nm_oexcl)
4094                 nm->nm_oexcl = B_FALSE;
4095 
4096         ASSERT(nm->nm_ocnt > 0);
4097         nm->nm_ocnt--;
4098         mutex_exit(&nm->nm_mutex);
4099 
4100         return (0);
4101 }
4102 
4103 static int
4104 nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4105     cred_t *cred_p)
4106 {
4107         _NOTE(ARGUNUSED(cred_p));
4108         int rv = 0;
4109         void *idctl;
4110 
4111         if ((mode & FREAD) == 0)
4112                 return (EPERM);
4113 
4114         if (nioc->n_len < NVME_IDENTIFY_BUFSIZE)
4115                 return (EINVAL);
4116 
4117         if ((rv = nvme_identify(nvme, B_TRUE, nsid, (void **)&idctl)) != 0)
4118                 return (rv);
4119 
4120         if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode)
4121             != 0)
4122                 rv = EFAULT;
4123 
4124         kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);
4125 
4126         return (rv);
4127 }
4128 
4129 /*
4130  * Execute commands on behalf of the various ioctls.
4131  */
4132 static int
4133 nvme_ioc_cmd(nvme_t *nvme, nvme_sqe_t *sqe, boolean_t is_admin, void *data_addr,
4134     uint32_t data_len, int rwk, nvme_cqe_t *cqe, uint_t timeout)
4135 {
4136         nvme_cmd_t *cmd;
4137         nvme_qpair_t *ioq;
4138         int rv = 0;
4139 
4140         cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
4141         if (is_admin) {
4142                 cmd->nc_sqid = 0;
4143                 ioq = nvme->n_adminq;
4144         } else {
4145                 cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
4146                 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
4147                 ioq = nvme->n_ioq[cmd->nc_sqid];
4148         }
4149 
4150         cmd->nc_callback = nvme_wakeup_cmd;
4151         cmd->nc_sqe = *sqe;
4152 
4153         if ((rwk & (FREAD | FWRITE)) != 0) {
4154                 if (data_addr == NULL) {
4155                         rv = EINVAL;
4156                         goto free_cmd;
4157                 }
4158 
4159                 /*
4160                  * Because we use PRPs and haven't implemented PRP
4161                  * lists here, the maximum data size is restricted to
4162                  * 2 pages.
4163                  */
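                     /*
                      * E.g. with 4k pages this caps an ioctl transfer at 8k:
                      * d_prp[0] covers the first page and d_prp[1] the
                      * second, with no PRP list page in between.
                      */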
4164                 if (data_len > 2 * nvme->n_pagesize) {
4165                         dev_err(nvme->n_dip, CE_WARN, "!Data size %u is too "
4166                             "large for nvme_ioc_cmd(). Limit is 2 pages "
4167                             "(%u bytes)", data_len,  2 * nvme->n_pagesize);
4168 
4169                         rv = EINVAL;
4170                         goto free_cmd;
4171                 }
4172 
4173                 if (nvme_zalloc_dma(nvme, data_len, DDI_DMA_READ,
4174                     &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
4175                         dev_err(nvme->n_dip, CE_WARN,
4176                             "!nvme_zalloc_dma failed for nvme_ioc_cmd()");
4177 
4178                         rv = ENOMEM;
4179                         goto free_cmd;
4180                 }
4181 
4182                 if (cmd->nc_dma->nd_ncookie > 2) {
4183                         dev_err(nvme->n_dip, CE_WARN,
4184                             "!too many DMA cookies for nvme_ioc_cmd()");
4185                         atomic_inc_32(&nvme->n_too_many_cookies);
4186 
4187                         rv = E2BIG;
4188                         goto free_cmd;
4189                 }
4190 
4191                 cmd->nc_sqe.sqe_dptr.d_prp[0] =
4192                     cmd->nc_dma->nd_cookie.dmac_laddress;
4193 
4194                 if (cmd->nc_dma->nd_ncookie > 1) {
4195                         ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
4196                             &cmd->nc_dma->nd_cookie);
4197                         cmd->nc_sqe.sqe_dptr.d_prp[1] =
4198                             cmd->nc_dma->nd_cookie.dmac_laddress;
4199                 }
4200 
4201                 if ((rwk & FWRITE) != 0) {
4202                         if (ddi_copyin(data_addr, cmd->nc_dma->nd_memp,
4203                             data_len, rwk & FKIOCTL) != 0) {
4204                                 rv = EFAULT;
4205                                 goto free_cmd;
4206                         }
4207                 }
4208         }
4209 
4210         if (is_admin) {
4211                 nvme_admin_cmd(cmd, timeout);
4212         } else {
4213                 mutex_enter(&cmd->nc_mutex);
4214 
4215                 rv = nvme_submit_io_cmd(ioq, cmd);
4216 
4217                 if (rv == EAGAIN) {
4218                         mutex_exit(&cmd->nc_mutex);
4219                         dev_err(cmd->nc_nvme->n_dip, CE_WARN,
4220                             "!nvme_ioc_cmd() failed, I/O Q full");
4221                         goto free_cmd;
4222                 }
4223 
4224                 nvme_wait_cmd(cmd, timeout);
4225 
4226                 mutex_exit(&cmd->nc_mutex);
4227         }
4228 
4229         if (cqe != NULL)
4230                 *cqe = cmd->nc_cqe;
4231 
4232         if ((rv = nvme_check_cmd_status(cmd)) != 0) {
4233                 dev_err(nvme->n_dip, CE_WARN,
4234                     "!nvme_ioc_cmd() failed with sct = %x, sc = %x",
4235                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
4236 
4237                 goto free_cmd;
4238         }
4239 
4240         if ((rwk & FREAD) != 0) {
4241                 if (ddi_copyout(cmd->nc_dma->nd_memp,
4242                     data_addr, data_len, rwk & FKIOCTL) != 0)
4243                         rv = EFAULT;
4244         }
4245 
4246 free_cmd:
4247         nvme_free_cmd(cmd);
4248 
4249         return (rv);
4250 }
4251 
4252 static int
4253 nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
4254     int mode, cred_t *cred_p)
4255 {
4256         _NOTE(ARGUNUSED(nsid, cred_p));
4257         int rv = 0;
4258         nvme_reg_cap_t cap = { 0 };
4259         nvme_capabilities_t nc;
4260 
4261         if ((mode & FREAD) == 0)
4262                 return (EPERM);
4263 
4264         if (nioc->n_len < sizeof (nc))
4265                 return (EINVAL);
4266 
4267         cap.r = nvme_get64(nvme, NVME_REG_CAP);
4268 
4269         /*
4270          * The MPSMIN and MPSMAX fields in the CAP register use 0 to
4271          * specify the base page size of 4k (1<<12), so add 12 here to
4272          * get the real page size value.
4273          */
4274         nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax);
4275         nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin);
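             /*
              * For example (illustrative register values): cap_mpsmin = 0
              * yields a minimum page size of 1 << 12 = 4k, and
              * cap_mpsmax = 4 yields a maximum of 1 << 16 = 64k.
              */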
4276 
4277         if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0)
4278                 rv = EFAULT;
4279 
4280         return (rv);
4281 }
4282 
4283 static int
4284 nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
4285     int mode, cred_t *cred_p)
4286 {
4287         _NOTE(ARGUNUSED(cred_p));
4288         void *log = NULL;
4289         size_t bufsize = 0;
4290         int rv = 0;
4291 
4292         if ((mode & FREAD) == 0)
4293                 return (EPERM);
4294 
4295         switch (nioc->n_arg) {
4296         case NVME_LOGPAGE_ERROR:
4297                 if (nsid != 0)
4298                         return (EINVAL);
4299                 break;
4300         case NVME_LOGPAGE_HEALTH:
4301                 if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0)
4302                         return (EINVAL);
4303 
4304                 if (nsid == 0)
4305                         nsid = (uint32_t)-1;
4306 
4307                 break;
4308         case NVME_LOGPAGE_FWSLOT:
4309                 if (nsid != 0)
4310                         return (EINVAL);
4311                 break;
4312         default:
4313                 return (EINVAL);
4314         }
4315 
4316         if (nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, nioc->n_arg, nsid)
4317             != DDI_SUCCESS)
4318                 return (EIO);
4319 
4320         if (nioc->n_len < bufsize) {
4321                 kmem_free(log, bufsize);
4322                 return (EINVAL);
4323         }
4324 
4325         if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0)
4326                 rv = EFAULT;
4327 
4328         nioc->n_len = bufsize;
4329         kmem_free(log, bufsize);
4330 
4331         return (rv);
4332 }
4333 
4334 static int
4335 nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
4336     int mode, cred_t *cred_p)
4337 {
4338         _NOTE(ARGUNUSED(cred_p));
4339         void *buf = NULL;
4340         size_t bufsize = 0;
4341         uint32_t res = 0;
4342         uint8_t feature;
4343         int rv = 0;
4344 
4345         if ((mode & FREAD) == 0)
4346                 return (EPERM);
4347 
4348         if ((nioc->n_arg >> 32) > 0xff)
4349                 return (EINVAL);
4350 
4351         feature = (uint8_t)(nioc->n_arg >> 32);
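             /*
              * A sketch of the n_arg encoding as used here: the feature
              * identifier lives in the upper 32 bits (checked above to fit
              * into 8 bits), while any feature-specific argument is passed in
              * the lower 32 bits and extracted below where needed.
              */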
4352 
4353         switch (feature) {
4354         case NVME_FEAT_ARBITRATION:
4355         case NVME_FEAT_POWER_MGMT:
4356         case NVME_FEAT_ERROR:
4357         case NVME_FEAT_NQUEUES:
4358         case NVME_FEAT_INTR_COAL:
4359         case NVME_FEAT_WRITE_ATOM:
4360         case NVME_FEAT_ASYNC_EVENT:
4361         case NVME_FEAT_PROGRESS:
4362                 if (nsid != 0)
4363                         return (EINVAL);
4364                 break;
4365 
4366         case NVME_FEAT_TEMPERATURE:
4367                 if (nsid != 0)
4368                         return (EINVAL);
4369                 res = nioc->n_arg & 0xffffffffUL;
4370                 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2)) {
4371                         nvme_temp_threshold_t tt;
4372 
4373                         tt.r = res;
4374                         if (tt.b.tt_thsel != NVME_TEMP_THRESH_OVER &&
4375                             tt.b.tt_thsel != NVME_TEMP_THRESH_UNDER) {
4376                                 return (EINVAL);
4377                         }
4378 
4379                         if (tt.b.tt_tmpsel > NVME_TEMP_THRESH_MAX_SENSOR) {
4380                                 return (EINVAL);
4381                         }
4382                 } else if (res != 0) {
4383                         return (EINVAL);
4384                 }
4385                 break;
4386 
4387         case NVME_FEAT_INTR_VECT:
4388                 if (nsid != 0)
4389                         return (EINVAL);
4390 
4391                 res = nioc->n_arg & 0xffffffffUL;
4392                 if (res >= nvme->n_intr_cnt)
4393                         return (EINVAL);
4394                 break;
4395 
4396         case NVME_FEAT_LBA_RANGE:
4397                 if (nvme->n_lba_range_supported == B_FALSE)
4398                         return (EINVAL);
4399 
4400                 if (nsid == 0 ||
4401                     nsid > nvme->n_namespace_count)
4402                         return (EINVAL);
4403 
4404                 break;
4405 
4406         case NVME_FEAT_WRITE_CACHE:
4407                 if (nsid != 0)
4408                         return (EINVAL);
4409 
4410                 if (!nvme->n_write_cache_present)
4411                         return (EINVAL);
4412 
4413                 break;
4414 
4415         case NVME_FEAT_AUTO_PST:
4416                 if (nsid != 0)
4417                         return (EINVAL);
4418 
4419                 if (!nvme->n_auto_pst_supported)
4420                         return (EINVAL);
4421 
4422                 break;
4423 
4424         default:
4425                 return (EINVAL);
4426         }
4427 
4428         rv = nvme_get_features(nvme, B_TRUE, nsid, feature, &res, &buf,
4429             &bufsize);
4430         if (rv != 0)
4431                 return (rv);
4432 
4433         if (nioc->n_len < bufsize) {
4434                 kmem_free(buf, bufsize);
4435                 return (EINVAL);
4436         }
4437 
4438         if (buf && ddi_copyout(buf, (void *)nioc->n_buf, bufsize, mode) != 0)
4439                 rv = EFAULT;
4440 
4441         kmem_free(buf, bufsize);
4442         nioc->n_arg = res;
4443         nioc->n_len = bufsize;
4444 
4445         return (rv);
4446 }
4447 
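     /*
      * NVME_IOC_INTR_CNT: return the number of interrupt vectors in use in
      * n_arg.
      */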
4448 static int
4449 nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4450     cred_t *cred_p)
4451 {
4452         _NOTE(ARGUNUSED(nsid, cred_p));
4453 
4454         if ((mode & FREAD) == 0)
4455                 return (EPERM);
4456 
4457         nioc->n_arg = nvme->n_intr_cnt;
4458         return (0);
4459 }
4460 
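     /*
      * NVME_IOC_VERSION: copy out the NVMe version reported by the controller.
      */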
4461 static int
4462 nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4463     cred_t *cred_p)
4464 {
4465         _NOTE(ARGUNUSED(nsid, cred_p));
4466         int rv = 0;
4467 
4468         if ((mode & FREAD) == 0)
4469                 return (EPERM);
4470 
4471         if (nioc->n_len < sizeof (nvme->n_version))
4472                 return (ENOMEM);
4473 
4474         if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf,
4475             sizeof (nvme->n_version), mode) != 0)
4476                 rv = EFAULT;
4477 
4478         return (rv);
4479 }
4480 
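     /*
      * NVME_IOC_FORMAT: issue a FORMAT NVM command for the given namespace,
      * or for all namespaces if nsid is 0.  The LBA format and secure erase
      * settings are taken from n_arg.  Formats that would use Protection
      * Information or metadata are rejected.
      */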
4481 static int
4482 nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4483     cred_t *cred_p)
4484 {
4486         nvme_format_nvm_t frmt = { 0 };
4487         int c_nsid = nsid != 0 ? nsid - 1 : 0;
4488 
4489         if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4490                 return (EPERM);
4491 
4492         frmt.r = nioc->n_arg & 0xffffffff;
4493 
4494         /*
4495          * Check whether the FORMAT NVM command is supported.
4496          */
4497         if (nvme->n_idctl->id_oacs.oa_format == 0)
4498                 return (EINVAL);
4499 
4500         /*
4501          * Don't allow format or secure erase of an individual namespace if
4502          * that would cause a format or secure erase of all namespaces.
4503          */
4504         if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0)
4505                 return (EINVAL);
4506 
4507         if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE &&
4508             nvme->n_idctl->id_fna.fn_sec_erase != 0)
4509                 return (EINVAL);
4510 
4511         /*
4512          * Don't allow formatting with Protection Information.
4513          */
4514         if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0)
4515                 return (EINVAL);
4516 
4517         /*
4518          * Don't allow formatting using an illegal LBA format, or any LBA format
4519          * that uses metadata.
4520          */
4521         if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf ||
4522             nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0)
4523                 return (EINVAL);
4524 
4525         /*
4526          * Don't allow formatting using an illegal Secure Erase setting.
4527          */
4528         if (frmt.b.fm_ses > NVME_FRMT_MAX_SES ||
4529             (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO &&
4530             nvme->n_idctl->id_fna.fn_crypt_erase == 0))
4531                 return (EINVAL);
4532 
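             /* An nsid of 0 selects all namespaces via the broadcast NSID. */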
4533         if (nsid == 0)
4534                 nsid = (uint32_t)-1;
4535 
4536         return (nvme_format_nvm(nvme, B_TRUE, nsid, frmt.b.fm_lbaf, B_FALSE, 0,
4537             B_FALSE, frmt.b.fm_ses));
4538 }
4539 
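     /*
      * NVME_IOC_DETACH: detach the blkdev instance of the given namespace.
      * Namespaces ignored by the driver have no blkdev instance and are left
      * alone; if the blkdev instance cannot be detached, EBUSY is returned.
      */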
4540 static int
4541 nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4542     cred_t *cred_p)
4543 {
4544         _NOTE(ARGUNUSED(nioc));
4545         int rv = 0;
4546 
4547         if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4548                 return (EPERM);
4549 
4550         if (nsid == 0)
4551                 return (EINVAL);
4552 
4553         if (nvme->n_ns[nsid - 1].ns_ignore)
4554                 return (0);
4555 
4556         rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl);
4557         if (rv != DDI_SUCCESS)
4558                 rv = EBUSY;
4559 
4560         return (rv);
4561 }
4562 
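     /*
      * NVME_IOC_ATTACH: re-identify the given namespace and attach its blkdev
      * instance, allocating a blkdev handle first if none exists yet.
      */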
4563 static int
4564 nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4565     cred_t *cred_p)
4566 {
4567         _NOTE(ARGUNUSED(nioc));
4568         nvme_identify_nsid_t *idns;
4569         int rv = 0;
4570 
4571         if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4572                 return (EPERM);
4573 
4574         if (nsid == 0)
4575                 return (EINVAL);
4576 
4577         /*
4578          * Identify the namespace again, then free the old identify data.
4579          */
4580         idns = nvme->n_ns[nsid - 1].ns_idns;
4581         if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS)
4582                 return (EIO);
4583 
4584         kmem_free(idns, sizeof (nvme_identify_nsid_t));
4585 
4586         if (nvme->n_ns[nsid - 1].ns_ignore)
4587                 return (ENOTSUP);
4588 
4589         if (nvme->n_ns[nsid - 1].ns_bd_hdl == NULL)
4590                 nvme->n_ns[nsid - 1].ns_bd_hdl = bd_alloc_handle(
4591                     &nvme->n_ns[nsid - 1], &nvme_bd_ops, &nvme->n_prp_dma_attr,
4592                     KM_SLEEP);
4593 
4594         rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl);
4595         if (rv != DDI_SUCCESS)
4596                 rv = EBUSY;
4597 
4598         return (rv);
4599 }
4600 
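     /*
      * Notify the DDI UFM subsystem that the firmware image(s) on the device
      * may have changed, and drop the cached firmware slot log page so it is
      * re-read on the next UFM callback.
      */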
4601 static void
4602 nvme_ufm_update(nvme_t *nvme)
4603 {
4604         mutex_enter(&nvme->n_fwslot_mutex);
4605         ddi_ufm_update(nvme->n_ufmh);
4606         if (nvme->n_fwslot != NULL) {
4607                 kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t));
4608                 nvme->n_fwslot = NULL;
4609         }
4610         mutex_exit(&nvme->n_fwslot_mutex);
4611 }
4612 
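     /*
      * NVME_IOC_FIRMWARE_DOWNLOAD: transfer a firmware image to the
      * controller with a series of FIRMWARE IMAGE DOWNLOAD admin commands.
      * The image offset is passed in n_arg and its length in n_len; both
      * must be multiples of the DWORD size.
      */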
4613 static int
4614 nvme_ioctl_firmware_download(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
4615     int mode, cred_t *cred_p)
4616 {
4617         int rv = 0;
4618         size_t len, copylen;
4619         offset_t offset;
4620         uintptr_t buf;
4621         nvme_sqe_t sqe = {
4622             .sqe_opc    = NVME_OPC_FW_IMAGE_LOAD
4623         };
4624 
4625         if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4626                 return (EPERM);
4627 
4628         if (nsid != 0)
4629                 return (EINVAL);
4630 
4631         /*
4632          * The offset sent to the device is a 32-bit count of DWORDs, so
4633          * limit the length (in n_len) to the maximum expressible byte offset.
4634          */
4635         if (nioc->n_len > NVME_FW_OFFSETB_MAX)
4636                 return (EINVAL);
4637 
4638         /* The offset and length must both be multiples of the DWORD size. */
4639         if ((nioc->n_len & NVME_DWORD_MASK) != 0 ||
4640             (nioc->n_arg & NVME_DWORD_MASK) != 0)
4641                 return (EINVAL);
4642 
4643         len = nioc->n_len;
4644         offset = nioc->n_arg;
4645         buf = (uintptr_t)nioc->n_buf;
4646         while (len > 0 && rv == 0) {
4647                 /*
4648                  * nvme_ioc_cmd() does not use SGLs or PRP lists.
4649                  * It is limited to 2 PRPs per NVM command, so limit
4650                  * the size of the data to 2 pages.
4651                  */
4652                 copylen = MIN(2 * nvme->n_pagesize, len);
4653 
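                     /*
                      * cdw10 holds the 0-based number of DWORDs to transfer,
                      * cdw11 the DWORD offset of this chunk within the image.
                      */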
4654                 sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1;
4655                 sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT);
4656 
4657                 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void *)buf, copylen,
4658                     FWRITE, NULL, nvme_admin_cmd_timeout);
4659 
4660                 buf += copylen;
4661                 offset += copylen;
4662                 len -= copylen;
4663         }
4664 
4665         /*
4666          * Let the DDI UFM subsystem know that the firmware information for
4667          * this device has changed.
4668          */
4669         nvme_ufm_update(nvme);
4670 
4671         return (rv);
4672 }
4673 
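     /*
      * NVME_IOC_FIRMWARE_COMMIT: issue a FIRMWARE COMMIT (formerly FIRMWARE
      * ACTIVATE) admin command.  The slot number is passed in the lower 32
      * bits of n_arg and the commit action in the upper 32 bits; the
      * resulting status code and status code type are returned in n_arg.
      */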
4674 static int
4675 nvme_ioctl_firmware_commit(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
4676     int mode, cred_t *cred_p)
4677 {
4678         nvme_firmware_commit_dw10_t fc_dw10 = { 0 };
4679         uint32_t slot = nioc->n_arg & 0xffffffff;
4680         uint32_t action = nioc->n_arg >> 32;
4681         nvme_cqe_t cqe = { 0 };
4682         nvme_sqe_t sqe = {
4683             .sqe_opc    = NVME_OPC_FW_ACTIVATE
4684         };
4685         int timeout;
4686         int rv;
4687 
4688         if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4689                 return (EPERM);
4690 
4691         if (nsid != 0)
4692                 return (EINVAL);
4693 
4694         /* Validate slot is in range. */
4695         if (slot < NVME_FW_SLOT_MIN || slot > NVME_FW_SLOT_MAX)
4696                 return (EINVAL);
4697 
4698         switch (action) {
4699         case NVME_FWC_SAVE:
4700         case NVME_FWC_SAVE_ACTIVATE:
4701                 timeout = nvme_commit_save_cmd_timeout;
4702                 break;
4703         case NVME_FWC_ACTIVATE:
4704         case NVME_FWC_ACTIVATE_IMMED:
4705                 timeout = nvme_admin_cmd_timeout;
4706                 break;
4707         default:
4708                 return (EINVAL);
4709         }
4710 
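             /* Build command dword 10 from the requested slot and action. */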
4711         fc_dw10.b.fc_slot = slot;
4712         fc_dw10.b.fc_action = action;
4713         sqe.sqe_cdw10 = fc_dw10.r;
4714 
4715         rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, NULL, 0, 0, &cqe, timeout);
4716 
4717         nioc->n_arg = ((uint64_t)cqe.cqe_sf.sf_sct << 16) | cqe.cqe_sf.sf_sc;
4718 
4719         /*
4720          * Let the DDI UFM subsystem know that the firmware information for
4721          * this device has changed.
4722          */
4723         nvme_ufm_update(nvme);
4724 
4725         return (rv);
4726 }
4727 
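     /*
      * The ioctl entry point, shared by the devctl node and the per-namespace
      * attachment point nodes.  The namespace ID is encoded in the minor
      * number, with nsid 0 addressing the controller itself.  The nvme_ioctl_t
      * argument is copied in, the command is dispatched to the matching
      * handler, and the (possibly updated) argument is copied back out,
      * converting for 32-bit callers where necessary.
      */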
4728 static int
4729 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p,
4730     int *rval_p)
4731 {
4732 #ifndef __lock_lint
4733         _NOTE(ARGUNUSED(rval_p));
4734 #endif
4735         minor_t minor = getminor(dev);
4736         nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
4737         int nsid = NVME_MINOR_NSID(minor);
4738         int rv = 0;
4739         nvme_ioctl_t nioc;
4740 
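             /*
              * Handler table indexed by NVME_IOC_CMD(cmd); index 0 is unused.
              */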
4741         int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = {
4742                 NULL,
4743                 nvme_ioctl_identify,
4744                 nvme_ioctl_identify,
4745                 nvme_ioctl_capabilities,
4746                 nvme_ioctl_get_logpage,
4747                 nvme_ioctl_get_features,
4748                 nvme_ioctl_intr_cnt,
4749                 nvme_ioctl_version,
4750                 nvme_ioctl_format,
4751                 nvme_ioctl_detach,
4752                 nvme_ioctl_attach,
4753                 nvme_ioctl_firmware_download,
4754                 nvme_ioctl_firmware_commit
4755         };
4756 
4757         if (nvme == NULL)
4758                 return (ENXIO);
4759 
4760         if (nsid > nvme->n_namespace_count)
4761                 return (ENXIO);
4762 
4763         if (IS_DEVCTL(cmd))
4764                 return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0));
4765 
4766 #ifdef _MULTI_DATAMODEL
4767         switch (ddi_model_convert_from(mode & FMODELS)) {
4768         case DDI_MODEL_ILP32: {
4769                 nvme_ioctl32_t nioc32;
4770                 if (ddi_copyin((void *)arg, &nioc32, sizeof (nvme_ioctl32_t),
4771                     mode) != 0)
4772                         return (EFAULT);
4773                 nioc.n_len = nioc32.n_len;
4774                 nioc.n_buf = nioc32.n_buf;
4775                 nioc.n_arg = nioc32.n_arg;
4776                 break;
4777         }
4778         case DDI_MODEL_NONE:
4779 #endif
4780                 if (ddi_copyin((void *)arg, &nioc, sizeof (nvme_ioctl_t), mode)
4781                     != 0)
4782                         return (EFAULT);
4783 #ifdef _MULTI_DATAMODEL
4784                 break;
4785         }
4786 #endif
4787 
4788         if (nvme->n_dead && cmd != NVME_IOC_DETACH)
4789                 return (EIO);
4790 
4792         if (cmd == NVME_IOC_IDENTIFY_CTRL) {
4793                 /*
4794                  * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and
4795                  * attachment point nodes.
4796                  */
4797                 nsid = 0;
4798         } else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) {
4799                 /*
4800                  * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node; it
4801                  * will always return identify data for namespace 1.
4802                  */
4803                 nsid = 1;
4804         }
4805 
4806         if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL)
4807                 rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode,
4808                     cred_p);
4809         else
4810                 rv = EINVAL;
4811 
4812 #ifdef _MULTI_DATAMODEL
4813         switch (ddi_model_convert_from(mode & FMODELS)) {
4814         case DDI_MODEL_ILP32: {
4815                 nvme_ioctl32_t nioc32;
4816 
4817                 nioc32.n_len = (size32_t)nioc.n_len;
4818                 nioc32.n_buf = (uintptr32_t)nioc.n_buf;
4819                 nioc32.n_arg = nioc.n_arg;
4820 
4821                 if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t),
4822                     mode) != 0)
4823                         return (EFAULT);
4824                 break;
4825         }
4826         case DDI_MODEL_NONE:
4827 #endif
4828                 if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode)
4829                     != 0)
4830                         return (EFAULT);
4831 #ifdef _MULTI_DATAMODEL
4832                 break;
4833         }
4834 #endif
4835 
4836         return (rv);
4837 }
4838 
4839 /*
4840  * DDI UFM Callbacks
4841  */
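     /*
      * The controller exposes a single firmware image; describe it and report
      * the number of firmware slots it implements.
      */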
4842 static int
4843 nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
4844     ddi_ufm_image_t *img)
4845 {
4846         nvme_t *nvme = arg;
4847 
4848         if (imgno != 0)
4849                 return (EINVAL);
4850 
4851         ddi_ufm_image_set_desc(img, "Firmware");
4852         ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot);
4853 
4854         return (0);
4855 }
4856 
4857 /*
4858  * Fill out firmware slot information for the requested slot.  The firmware
4859  * slot information is gathered by requesting the Firmware Slot Information log
4860  * page.  The format of the page is described in section 5.10.1.3.
4861  *
4862  * We lazily cache the log page on the first call and then invalidate the cache
4863  * data after a successful firmware download or firmware commit command.
4864  * The cached data is protected by a mutex as the state can change
4865  * asynchronously to this callback.
4866  */
4867 static int
4868 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
4869     uint_t slotno, ddi_ufm_slot_t *slot)
4870 {
4871         nvme_t *nvme = arg;
4872         void *log = NULL;
4873         size_t bufsize;
4874         ddi_ufm_attr_t attr = 0;
4875         char fw_ver[NVME_FWVER_SZ + 1];
4876         int ret;
4877 
4878         if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1))
4879                 return (EINVAL);
4880 
4881         mutex_enter(&nvme->n_fwslot_mutex);
4882         if (nvme->n_fwslot == NULL) {
4883                 ret = nvme_get_logpage(nvme, B_TRUE, &log, &bufsize,
4884                     NVME_LOGPAGE_FWSLOT, 0);
4885                 if (ret != DDI_SUCCESS ||
4886                     bufsize != sizeof (nvme_fwslot_log_t)) {
4887                         if (log != NULL)
4888                                 kmem_free(log, bufsize);
4889                         mutex_exit(&nvme->n_fwslot_mutex);
4890                         return (EIO);
4891                 }
4892                 nvme->n_fwslot = (nvme_fwslot_log_t *)log;
4893         }
4894 
4895         /*
4896          * NVMe numbers firmware slots starting at 1
4897          */
4898         if (slotno == (nvme->n_fwslot->fw_afi - 1))
4899                 attr |= DDI_UFM_ATTR_ACTIVE;
4900 
4901         if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0)
4902                 attr |= DDI_UFM_ATTR_WRITEABLE;
4903 
4904         if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') {
4905                 attr |= DDI_UFM_ATTR_EMPTY;
4906         } else {
4907                 (void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno],
4908                     NVME_FWVER_SZ);
4909                 fw_ver[NVME_FWVER_SZ] = '\0';
4910                 ddi_ufm_slot_set_version(slot, fw_ver);
4911         }
4912         mutex_exit(&nvme->n_fwslot_mutex);
4913 
4914         ddi_ufm_slot_set_attrs(slot, attr);
4915 
4916         return (0);
4917 }
4918 
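     /*
      * Report the driver's UFM capabilities; only reporting of firmware
      * information is supported.
      */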
4919 static int
4920 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps)
4921 {
4922         *caps = DDI_UFM_CAP_REPORT;
4923         return (0);
4924 }