Revert "OS-8005 bhyve memory pressure needs to target ARC better (#354)"
This reverts commit a6033573eedd94118d2b9e65f45deca0bf4b42f7.
--- old/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
+++ new/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 12
13 13 /*
14 14 * Copyright 2015 Pluribus Networks Inc.
15 - * Copyright 2020 Joyent, Inc.
15 + * Copyright 2019 Joyent, Inc.
16 16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17 17 * Copyright 2021 Oxide Computer Company
18 18 */
19 19
20 20 #include <sys/types.h>
21 21 #include <sys/conf.h>
22 22 #include <sys/cpuvar.h>
23 23 #include <sys/ioccom.h>
24 24 #include <sys/stat.h>
25 25 #include <sys/vmsystm.h>
26 26 #include <sys/ddi.h>
27 27 #include <sys/mkdev.h>
28 28 #include <sys/sunddi.h>
29 29 #include <sys/fs/dv_node.h>
30 30 #include <sys/cpuset.h>
31 31 #include <sys/id_space.h>
32 32 #include <sys/fs/sdev_plugin.h>
33 33 #include <sys/smt.h>
34 34 #include <sys/kstat.h>
35 35
36 36 #include <sys/kernel.h>
37 37 #include <sys/hma.h>
38 38 #include <sys/x86_archext.h>
39 39 #include <x86/apicreg.h>
40 40
41 41 #include <sys/vmm.h>
42 42 #include <sys/vmm_kernel.h>
43 43 #include <sys/vmm_instruction_emul.h>
44 44 #include <sys/vmm_dev.h>
45 45 #include <sys/vmm_impl.h>
46 46 #include <sys/vmm_drv.h>
47 47 #include <sys/vmm_vm.h>
48 48
49 49 #include <vm/seg_dev.h>
50 50
51 51 #include "io/ppt.h"
52 52 #include "io/vatpic.h"
53 53 #include "io/vioapic.h"
54 54 #include "io/vrtc.h"
55 55 #include "io/vhpet.h"
56 56 #include "io/vpmtmr.h"
57 57 #include "vmm_lapic.h"
58 58 #include "vmm_stat.h"
59 59 #include "vmm_util.h"
60 60
61 61 /*
62 62 * Locking details:
63 63 *
64 64 * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is
65 65 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
66 66 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
67 67 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
68 68 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
69 69 */
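As a minimal sketch of that ordering rule (example_locked_update() is hypothetical and not part of this change), a path that needs both the driver-wide and the instance-list state would look like:

	static void
	example_locked_update(void)
	{
		mutex_enter(&vmmdev_mtx);	/* driver-wide (HMA, sdev) state */
		mutex_enter(&vmm_mtx);		/* vmm_softc_t list state */

		/* ... inspect or modify vmmdev_* and vmm_* data ... */

		mutex_exit(&vmm_mtx);
		mutex_exit(&vmmdev_mtx);
	}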
70 70
71 71 static kmutex_t vmmdev_mtx;
72 72 static dev_info_t *vmmdev_dip;
73 73 static hma_reg_t *vmmdev_hma_reg;
74 74 static uint_t vmmdev_hma_ref;
75 75 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
76 76
77 77 static kmutex_t vmm_mtx;
78 78 static list_t vmm_list;
79 79 static list_t vmm_destroy_list;
80 80 static id_space_t *vmm_minors;
81 81 static void *vmm_statep;
82 82
83 83 static const char *vmmdev_hvm_name = "bhyve";
84 84
85 85 /* For sdev plugin (/dev) */
86 86 #define VMM_SDEV_ROOT "/dev/vmm"
87 87
88 88 /* From uts/i86pc/io/vmm/intel/vmx.c */
89 89 extern int vmx_x86_supported(const char **);
90 90
91 91 /* Holds and hooks from drivers external to vmm */
92 92 struct vmm_hold {
93 93 list_node_t vmh_node;
94 94 vmm_softc_t *vmh_sc;
95 95 boolean_t vmh_release_req;
96 96 uint_t vmh_ioport_hook_cnt;
97 97 };
98 98
99 99 struct vmm_lease {
100 100 list_node_t vml_node;
101 101 struct vm *vml_vm;
102 102 boolean_t vml_expired;
103 103 boolean_t (*vml_expire_func)(void *);
104 104 void *vml_expire_arg;
105 105 list_node_t vml_expire_node;
106 106 struct vmm_hold *vml_hold;
107 107 };
108 108
109 109 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
110 110 static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);
111 111 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
112 112 static void vmm_kstat_init(vmm_softc_t *);
113 113 static void vmm_kstat_fini(vmm_softc_t *);
114 114
115 115 static int
116 116 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
117 117 {
118 118 int error;
119 119 bool sysmem;
120 120
121 121 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
122 122 NULL);
123 123 if (error || mseg->len == 0)
124 124 return (error);
125 125
126 126 if (!sysmem) {
127 127 vmm_devmem_entry_t *de;
128 128 list_t *dl = &sc->vmm_devmem_list;
129 129
130 130 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
131 131 if (de->vde_segid == mseg->segid) {
132 132 break;
133 133 }
134 134 }
135 135 if (de != NULL) {
136 136 (void) strlcpy(mseg->name, de->vde_name,
137 137 sizeof (mseg->name));
138 138 }
139 139 } else {
140 140 bzero(mseg->name, sizeof (mseg->name));
141 141 }
142 142
143 143 return (error);
144 144 }
145 145
146 146 /*
147 147 * The 'devmem' hack:
148 148 *
149 149 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
150 150 * in the vm which appear with their own name related to the vm under /dev.
151 151 * Since this would be a hassle from an sdev perspective and would require a
152 152 * new cdev interface (or complicate the existing one), we choose to implement
153 153 * this in a different manner. When 'devmem' mappings are created, an
154 154 * identifying off_t is communicated back out to userspace. That off_t,
155 155 * residing above the normal guest memory space, can be used to mmap the
156 156 * 'devmem' mapping from the already-open vm device.
157 157 */
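A hedged userspace sketch of that flow (the helper name, header set, and segment id are illustrative; VM_ALLOC_MEMSEG and VM_DEVMEM_GETOFFSET are the ioctls handled later in this file):

	#include <sys/types.h>
	#include <sys/mman.h>
	#include <sys/vmm.h>
	#include <sys/vmm_dev.h>
	#include <string.h>
	#include <unistd.h>
	#include <err.h>

	/*
	 * Illustrative only: 'vmfd' is an already-open /dev/vmm/<name>
	 * descriptor and 'segid' is whatever id the consumer chose for its
	 * devmem segment.
	 */
	static void *
	example_map_devmem(int vmfd, int segid, size_t len)
	{
		struct vm_memseg mseg = { 0 };
		struct vm_devmem_offset vdo = { 0 };

		mseg.segid = segid;
		mseg.len = len;
		(void) strlcpy(mseg.name, "example", sizeof (mseg.name));
		if (ioctl(vmfd, VM_ALLOC_MEMSEG, &mseg) != 0)
			err(1, "VM_ALLOC_MEMSEG");

		vdo.segid = segid;
		if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) != 0)
			err(1, "VM_DEVMEM_GETOFFSET");

		/* The offset handed back sits above guest memory (>= VM_DEVMEM_START). */
		return (mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
		    vmfd, vdo.offset));
	}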
158 158
159 159 static int
160 160 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
161 161 {
162 162 off_t map_offset;
163 163 vmm_devmem_entry_t *entry;
164 164
165 165 if (list_is_empty(&sc->vmm_devmem_list)) {
166 166 map_offset = VM_DEVMEM_START;
167 167 } else {
168 168 entry = list_tail(&sc->vmm_devmem_list);
169 169 map_offset = entry->vde_off + entry->vde_len;
170 170 if (map_offset < entry->vde_off) {
171 171 /* Do not tolerate overflow */
172 172 return (ERANGE);
173 173 }
174 174 /*
175 175 * XXXJOY: We could choose to search the list for duplicate
176 176 * names and toss an error. Since we're using the offset
177 177 * method for now, it does not make much of a difference.
178 178 */
179 179 }
180 180
181 181 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
182 182 entry->vde_segid = mseg->segid;
183 183 entry->vde_len = mseg->len;
184 184 entry->vde_off = map_offset;
185 185 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
186 186 list_insert_tail(&sc->vmm_devmem_list, entry);
187 187
188 188 return (0);
189 189 }
190 190
191 191 static boolean_t
192 192 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
193 193 off_t *map_offp)
194 194 {
195 195 list_t *dl = &sc->vmm_devmem_list;
196 196 vmm_devmem_entry_t *de = NULL;
197 197 const off_t map_end = off + len;
198 198
199 199 VERIFY(off >= VM_DEVMEM_START);
200 200
201 201 if (map_end < off) {
202 202 /* No match on overflow */
203 203 return (B_FALSE);
204 204 }
205 205
206 206 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
207 207 const off_t item_end = de->vde_off + de->vde_len;
208 208
209 209 if (de->vde_off <= off && item_end >= map_end) {
210 210 *segidp = de->vde_segid;
211 211 *map_offp = off - de->vde_off;
212 212 return (B_TRUE);
213 213 }
214 214 }
215 215 return (B_FALSE);
216 216 }
217 217
218 218 static void
219 219 vmmdev_devmem_purge(vmm_softc_t *sc)
220 220 {
221 221 vmm_devmem_entry_t *entry;
222 222
223 223 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
224 224 kmem_free(entry, sizeof (*entry));
225 225 }
226 226 }
227 227
228 228 static int
229 229 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
230 230 {
231 231 int error;
232 232 bool sysmem = true;
233 233
234 234 if (VM_MEMSEG_NAME(mseg)) {
235 235 sysmem = false;
236 236 }
237 237 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
238 238
239 239 if (error == 0 && VM_MEMSEG_NAME(mseg)) {
240 240 /*
241 241 * Rather than create a whole fresh device from which userspace
242 242 * can mmap this segment, instead make it available at an
243 243 * offset above where the main guest memory resides.
244 244 */
245 245 error = vmmdev_devmem_create(sc, mseg, mseg->name);
246 246 if (error != 0) {
247 247 vm_free_memseg(sc->vmm_vm, mseg->segid);
248 248 }
249 249 }
250 250 return (error);
251 251 }
252 252
253 253 /*
254 254 * Resource Locking and Exclusion
255 255 *
256 256 * Much of bhyve depends on key portions of VM state, such as the guest memory
257 257 * map, to remain unchanged while the guest is running. As ported from
258 258 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
259 259 * access to the instance vCPUs. Threads acting on a single vCPU, like those
260 260 * performing the work of actually running the guest in VMX/SVM, would lock
261 261 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
262 262 * state, all of the vCPUs would be first locked, ensuring that the
263 263 * operation(s) could complete without any other threads stumbling into
264 264 * intermediate states.
265 265 *
266 266 * This approach is largely effective for bhyve. Common operations, such as
267 267 * running the vCPUs, steer clear of lock contention. The model begins to
268 268 * break down for operations which do not occur in the context of a specific
269 269 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
270 270 * thread in the bhyve process. In order to properly protect those vCPU-less
271 271 * operations from encountering invalid states, additional locking is required.
272 272 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
273 273 * It does mean that class of operations will be serialized on locking the
274 274 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
275 275 * undue contention on the VM_MAXCPU-1 vCPU.
276 276 *
277 277 * In order to address the shortcomings of this model, the concept of a
278 278 * read/write lock has been added to bhyve. Operations which change
279 279 * fundamental aspects of a VM (such as the memory map) must acquire the write
280 280 * lock, which also implies locking all of the vCPUs and waiting for all read
281 281 * lock holders to release. While it increases the cost and waiting time for
282 282 * those few operations, it allows most hot-path operations on the VM (which
283 283 * depend on its configuration remaining stable) to occur with minimal locking.
284 284 *
285 285 * Consumers of the Driver API (see below) are a special case when it comes to
286 286 * this locking, since they may hold a read lock via the drv_lease mechanism
287 287 * for an extended period of time. Rather than forcing those consumers to
288 288 * continuously poll for a write lock attempt, the lease system forces them to
289 289 * provide a release callback to trigger their clean-up (and potential later
290 290 * reacquisition) of the read lock.
291 291 */
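The lease contract described in the last paragraph can be sketched as follows; example_lease_expired_cb() and the esc consumer state are hypothetical, while vmm_drv_lease_sign() and vmm_drv_lease_break() are defined later in this file:

	static boolean_t
	example_lease_expired_cb(void *arg)
	{
		example_softc_t *esc = arg;	/* hypothetical consumer state */

		/* Tell the consumer's worker to stop touching lease-backed state. */
		esc->esc_lease_gone = B_TRUE;

		/*
		 * B_FALSE: the consumer will break the lease itself (via
		 * vmm_drv_lease_break) once it has quiesced.  B_TRUE would
		 * permit vmm to break the lease synchronously from here.
		 */
		return (B_FALSE);
	}

A consumer installs the callback with vmm_drv_lease_sign(hold, example_lease_expired_cb, esc) and, after quiescing, breaks and later re-signs the lease to reacquire the read lock.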
292 292
293 293 static void
294 294 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
295 295 {
296 296 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
297 297
298 298 /*
299 299 * Since this state transition is utilizing from_idle=true, it should
300 300 * not fail, but rather block until it can be successful.
301 301 */
302 302 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
303 303 }
304 304
305 305 static void
306 306 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
307 307 {
308 308 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
309 309
310 310 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
311 311 vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
312 312 }
313 313
314 314 static void
315 315 vmm_read_lock(vmm_softc_t *sc)
316 316 {
317 317 rw_enter(&sc->vmm_rwlock, RW_READER);
318 318 }
319 319
320 320 static void
321 321 vmm_read_unlock(vmm_softc_t *sc)
322 322 {
323 323 rw_exit(&sc->vmm_rwlock);
324 324 }
325 325
326 326 static void
327 327 vmm_write_lock(vmm_softc_t *sc)
328 328 {
329 329 int maxcpus;
330 330
331 331 /* First lock all the vCPUs */
332 332 maxcpus = vm_get_maxcpus(sc->vmm_vm);
333 333 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
334 334 vcpu_lock_one(sc, vcpu);
335 335 }
336 336
337 337 mutex_enter(&sc->vmm_lease_lock);
338 338 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
339 339 sc->vmm_lease_blocker++;
340 340 if (sc->vmm_lease_blocker == 1) {
341 341 list_t *list = &sc->vmm_lease_list;
342 342 vmm_lease_t *lease = list_head(list);
343 343
344 344 while (lease != NULL) {
345 345 boolean_t sync_break = B_FALSE;
346 346
347 347 if (!lease->vml_expired) {
348 348 void *arg = lease->vml_expire_arg;
349 349 lease->vml_expired = B_TRUE;
350 350 sync_break = lease->vml_expire_func(arg);
351 351 }
352 352
353 353 if (sync_break) {
354 354 vmm_lease_t *next;
355 355
356 356 /*
357 357 * These leases which are synchronously broken
358 358 * result in vmm_read_unlock() calls from a
359 359 * different thread than the corresponding
360 360 * vmm_read_lock(). This is acceptable, given
361 361 * that the rwlock underpinning the whole
362 362 * mechanism tolerates the behavior. This
363 363 * flexibility is _only_ afforded to VM read
364 364 * lock (RW_READER) holders.
365 365 */
366 366 next = list_next(list, lease);
367 367 vmm_lease_break_locked(sc, lease);
368 368 lease = next;
369 369 } else {
370 370 lease = list_next(list, lease);
371 371 }
372 372 }
373 373 }
374 374 mutex_exit(&sc->vmm_lease_lock);
375 375
376 376 rw_enter(&sc->vmm_rwlock, RW_WRITER);
377 377 /*
378 378 * For now, the 'maxcpus' value for an instance is fixed at the
379 379 * compile-time constant of VM_MAXCPU at creation. If this changes in
380 380 * the future, allowing for dynamic vCPU resource sizing, acquisition
381 381 * of the write lock will need to be wary of such changes.
382 382 */
383 383 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
384 384 }
385 385
386 386 static void
387 387 vmm_write_unlock(vmm_softc_t *sc)
388 388 {
389 389 int maxcpus;
390 390
391 391 mutex_enter(&sc->vmm_lease_lock);
392 392 VERIFY3U(sc->vmm_lease_blocker, !=, 0);
393 393 sc->vmm_lease_blocker--;
394 394 if (sc->vmm_lease_blocker == 0) {
395 395 cv_broadcast(&sc->vmm_lease_cv);
396 396 }
397 397 mutex_exit(&sc->vmm_lease_lock);
398 398
399 399 /*
400 400 * The VM write lock _must_ be released from the same thread it was
401 401 * acquired in, unlike the read lock.
402 402 */
403 403 VERIFY(rw_write_held(&sc->vmm_rwlock));
404 404 rw_exit(&sc->vmm_rwlock);
405 405
406 406 /* Unlock all the vCPUs */
407 407 maxcpus = vm_get_maxcpus(sc->vmm_vm);
408 408 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
409 409 vcpu_unlock_one(sc, vcpu);
410 410 }
411 411 }
412 412
413 413 static int
414 414 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
415 415 cred_t *credp, int *rvalp)
416 416 {
417 417 int error = 0, vcpu = -1;
418 418 void *datap = (void *)arg;
419 419 enum vm_lock_type {
420 420 LOCK_NONE = 0,
421 421 LOCK_VCPU,
422 422 LOCK_READ_HOLD,
423 423 LOCK_WRITE_HOLD
424 424 } lock_type = LOCK_NONE;
425 425
426 426 /* Acquire any exclusion resources needed for the operation. */
427 427 switch (cmd) {
428 428 case VM_RUN:
429 429 case VM_GET_REGISTER:
430 430 case VM_SET_REGISTER:
431 431 case VM_GET_SEGMENT_DESCRIPTOR:
432 432 case VM_SET_SEGMENT_DESCRIPTOR:
433 433 case VM_GET_REGISTER_SET:
434 434 case VM_SET_REGISTER_SET:
435 435 case VM_INJECT_EXCEPTION:
436 436 case VM_GET_CAPABILITY:
437 437 case VM_SET_CAPABILITY:
438 438 case VM_PPTDEV_MSI:
439 439 case VM_PPTDEV_MSIX:
440 440 case VM_SET_X2APIC_STATE:
441 441 case VM_GLA2GPA:
442 442 case VM_GLA2GPA_NOFAULT:
443 443 case VM_ACTIVATE_CPU:
444 444 case VM_SET_INTINFO:
445 445 case VM_GET_INTINFO:
446 446 case VM_RESTART_INSTRUCTION:
447 447 case VM_SET_KERNEMU_DEV:
448 448 case VM_GET_KERNEMU_DEV:
449 449 case VM_RESET_CPU:
450 450 case VM_GET_RUN_STATE:
451 451 case VM_SET_RUN_STATE:
452 452 /*
453 453 * Copy in the ID of the vCPU chosen for this operation.
454 454 * Since a nefarious caller could update their struct between
455 455 * this locking and when the rest of the ioctl data is copied
456 456 * in, it is _critical_ that this local 'vcpu' variable be used
457 457 * rather than the in-struct one when performing the ioctl.
458 458 */
459 459 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
460 460 return (EFAULT);
461 461 }
462 462 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) {
463 463 return (EINVAL);
464 464 }
465 465 vcpu_lock_one(sc, vcpu);
466 466 lock_type = LOCK_VCPU;
467 467 break;
468 468
469 469 case VM_REINIT:
470 470 case VM_BIND_PPTDEV:
471 471 case VM_UNBIND_PPTDEV:
472 472 case VM_MAP_PPTDEV_MMIO:
473 473 case VM_UNMAP_PPTDEV_MMIO:
474 474 case VM_ALLOC_MEMSEG:
475 475 case VM_MMAP_MEMSEG:
476 476 case VM_MUNMAP_MEMSEG:
477 477 case VM_WRLOCK_CYCLE:
478 478 case VM_PMTMR_LOCATE:
479 - case VM_ARC_RESV:
480 479 vmm_write_lock(sc);
481 480 lock_type = LOCK_WRITE_HOLD;
482 481 break;
483 482
484 483 case VM_GET_GPA_PMAP:
485 484 case VM_GET_MEMSEG:
486 485 case VM_MMAP_GETNEXT:
487 486 case VM_LAPIC_IRQ:
488 487 case VM_INJECT_NMI:
489 488 case VM_IOAPIC_ASSERT_IRQ:
490 489 case VM_IOAPIC_DEASSERT_IRQ:
491 490 case VM_IOAPIC_PULSE_IRQ:
492 491 case VM_LAPIC_MSI:
493 492 case VM_LAPIC_LOCAL_IRQ:
494 493 case VM_GET_X2APIC_STATE:
495 494 case VM_RTC_READ:
496 495 case VM_RTC_WRITE:
497 496 case VM_RTC_SETTIME:
498 497 case VM_RTC_GETTIME:
499 498 case VM_PPTDEV_DISABLE_MSIX:
500 499 case VM_DEVMEM_GETOFFSET:
501 500 vmm_read_lock(sc);
502 501 lock_type = LOCK_READ_HOLD;
503 502 break;
504 503
505 504 case VM_IOAPIC_PINCOUNT:
506 505 default:
507 506 break;
508 507 }
509 508
510 509 /* Execute the primary logic for the ioctl. */
511 510 switch (cmd) {
512 511 case VM_RUN: {
513 512 struct vm_entry entry;
514 513
515 514 if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
516 515 error = EFAULT;
517 516 break;
518 517 }
519 518
520 519 if (!(curthread->t_schedflag & TS_VCPU))
521 520 smt_mark_as_vcpu();
522 521
523 522 error = vm_run(sc->vmm_vm, vcpu, &entry);
524 523
525 524 /*
526 525 * Unexpected states in vm_run() are expressed through positive
527 526 * errno-oriented return values. VM states which expect further
528 527 * processing in userspace (necessary context via exitinfo) are
529 528 * expressed through negative return values. For the time being
530 529 * a return value of 0 is not expected from vm_run().
531 530 */
532 531 ASSERT(error != 0);
533 532 if (error < 0) {
534 533 const struct vm_exit *vme;
535 534 void *outp = entry.exit_data;
536 535
537 536 error = 0;
538 537 vme = vm_exitinfo(sc->vmm_vm, vcpu);
539 538 if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
540 539 error = EFAULT;
541 540 }
542 541 }
543 542 break;
544 543 }
545 544 case VM_SUSPEND: {
546 545 struct vm_suspend vmsuspend;
547 546
548 547 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
549 548 error = EFAULT;
550 549 break;
551 550 }
552 551 error = vm_suspend(sc->vmm_vm, vmsuspend.how);
553 552 break;
554 553 }
555 554 case VM_REINIT:
556 555 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
557 556 /*
558 557 * The VM instance should be free of driver-attached
559 558 * hooks during the reinitialization process.
560 559 */
561 560 break;
562 561 }
563 562 error = vm_reinit(sc->vmm_vm);
564 563 (void) vmm_drv_block_hook(sc, B_FALSE);
565 564 break;
566 565 case VM_STAT_DESC: {
567 566 struct vm_stat_desc statdesc;
568 567
569 568 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
570 569 error = EFAULT;
571 570 break;
572 571 }
573 572 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
574 573 sizeof (statdesc.desc));
575 574 if (error == 0 &&
576 575 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
577 576 error = EFAULT;
578 577 break;
579 578 }
580 579 break;
581 580 }
582 581 case VM_STATS_IOC: {
583 582 struct vm_stats vmstats;
584 583
585 584 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
586 585 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
587 586 error = EFAULT;
588 587 break;
589 588 }
590 589 hrt2tv(gethrtime(), &vmstats.tv);
591 590 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
592 591 &vmstats.num_entries, vmstats.statbuf);
593 592 if (error == 0 &&
594 593 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
595 594 error = EFAULT;
596 595 break;
597 596 }
598 597 break;
599 598 }
600 599
601 600 case VM_PPTDEV_MSI: {
602 601 struct vm_pptdev_msi pptmsi;
603 602
604 603 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
605 604 error = EFAULT;
606 605 break;
607 606 }
608 607 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
609 608 pptmsi.addr, pptmsi.msg, pptmsi.numvec);
610 609 break;
611 610 }
612 611 case VM_PPTDEV_MSIX: {
613 612 struct vm_pptdev_msix pptmsix;
614 613
615 614 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
616 615 error = EFAULT;
617 616 break;
618 617 }
619 618 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
620 619 pptmsix.idx, pptmsix.addr, pptmsix.msg,
621 620 pptmsix.vector_control);
622 621 break;
623 622 }
624 623 case VM_PPTDEV_DISABLE_MSIX: {
625 624 struct vm_pptdev pptdev;
626 625
627 626 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
628 627 error = EFAULT;
629 628 break;
630 629 }
631 630 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
632 631 break;
633 632 }
634 633 case VM_MAP_PPTDEV_MMIO: {
635 634 struct vm_pptdev_mmio pptmmio;
636 635
637 636 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
638 637 error = EFAULT;
639 638 break;
640 639 }
641 640 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
642 641 pptmmio.len, pptmmio.hpa);
643 642 break;
644 643 }
645 644 case VM_UNMAP_PPTDEV_MMIO: {
646 645 struct vm_pptdev_mmio pptmmio;
647 646
648 647 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
649 648 error = EFAULT;
650 649 break;
651 650 }
652 651 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
653 652 pptmmio.len);
654 653 break;
655 654 }
656 655 case VM_BIND_PPTDEV: {
657 656 struct vm_pptdev pptdev;
658 657
659 658 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
660 659 error = EFAULT;
661 660 break;
662 661 }
663 662 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
664 663 break;
665 664 }
666 665 case VM_UNBIND_PPTDEV: {
667 666 struct vm_pptdev pptdev;
668 667
669 668 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
670 669 error = EFAULT;
671 670 break;
672 671 }
673 672 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
674 673 break;
675 674 }
676 675 case VM_GET_PPTDEV_LIMITS: {
677 676 struct vm_pptdev_limits pptlimits;
678 677
679 678 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
680 679 error = EFAULT;
681 680 break;
682 681 }
683 682 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
684 683 &pptlimits.msi_limit, &pptlimits.msix_limit);
685 684 if (error == 0 &&
686 685 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
687 686 error = EFAULT;
688 687 break;
689 688 }
690 689 break;
691 690 }
692 691 case VM_INJECT_EXCEPTION: {
693 692 struct vm_exception vmexc;
694 693 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
695 694 error = EFAULT;
696 695 break;
697 696 }
698 697 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
699 698 vmexc.error_code_valid, vmexc.error_code,
700 699 vmexc.restart_instruction);
701 700 break;
702 701 }
703 702 case VM_INJECT_NMI: {
704 703 struct vm_nmi vmnmi;
705 704
706 705 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
707 706 error = EFAULT;
708 707 break;
709 708 }
710 709 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
711 710 break;
712 711 }
713 712 case VM_LAPIC_IRQ: {
714 713 struct vm_lapic_irq vmirq;
715 714
716 715 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
717 716 error = EFAULT;
718 717 break;
719 718 }
720 719 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
721 720 break;
722 721 }
723 722 case VM_LAPIC_LOCAL_IRQ: {
724 723 struct vm_lapic_irq vmirq;
725 724
726 725 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
727 726 error = EFAULT;
728 727 break;
729 728 }
730 729 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
731 730 vmirq.vector);
732 731 break;
733 732 }
734 733 case VM_LAPIC_MSI: {
735 734 struct vm_lapic_msi vmmsi;
736 735
737 736 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
738 737 error = EFAULT;
739 738 break;
740 739 }
741 740 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
742 741 break;
743 742 }
744 743
745 744 case VM_IOAPIC_ASSERT_IRQ: {
746 745 struct vm_ioapic_irq ioapic_irq;
747 746
748 747 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
749 748 error = EFAULT;
750 749 break;
751 750 }
752 751 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
753 752 break;
754 753 }
755 754 case VM_IOAPIC_DEASSERT_IRQ: {
756 755 struct vm_ioapic_irq ioapic_irq;
757 756
758 757 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
759 758 error = EFAULT;
760 759 break;
761 760 }
762 761 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
763 762 break;
764 763 }
765 764 case VM_IOAPIC_PULSE_IRQ: {
766 765 struct vm_ioapic_irq ioapic_irq;
767 766
768 767 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
769 768 error = EFAULT;
770 769 break;
771 770 }
772 771 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
773 772 break;
774 773 }
775 774 case VM_IOAPIC_PINCOUNT: {
776 775 int pincount;
777 776
778 777 pincount = vioapic_pincount(sc->vmm_vm);
779 778 if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
780 779 error = EFAULT;
781 780 break;
782 781 }
783 782 break;
784 783 }
785 784
786 785 case VM_ISA_ASSERT_IRQ: {
787 786 struct vm_isa_irq isa_irq;
788 787
789 788 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
790 789 error = EFAULT;
791 790 break;
792 791 }
793 792 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
794 793 if (error == 0 && isa_irq.ioapic_irq != -1) {
795 794 error = vioapic_assert_irq(sc->vmm_vm,
796 795 isa_irq.ioapic_irq);
797 796 }
798 797 break;
799 798 }
800 799 case VM_ISA_DEASSERT_IRQ: {
801 800 struct vm_isa_irq isa_irq;
802 801
803 802 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
804 803 error = EFAULT;
805 804 break;
806 805 }
807 806 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
808 807 if (error == 0 && isa_irq.ioapic_irq != -1) {
809 808 error = vioapic_deassert_irq(sc->vmm_vm,
810 809 isa_irq.ioapic_irq);
811 810 }
812 811 break;
813 812 }
814 813 case VM_ISA_PULSE_IRQ: {
815 814 struct vm_isa_irq isa_irq;
816 815
817 816 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
818 817 error = EFAULT;
819 818 break;
820 819 }
821 820 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
822 821 if (error == 0 && isa_irq.ioapic_irq != -1) {
823 822 error = vioapic_pulse_irq(sc->vmm_vm,
824 823 isa_irq.ioapic_irq);
825 824 }
826 825 break;
827 826 }
828 827 case VM_ISA_SET_IRQ_TRIGGER: {
829 828 struct vm_isa_irq_trigger isa_irq_trigger;
830 829
831 830 if (ddi_copyin(datap, &isa_irq_trigger,
832 831 sizeof (isa_irq_trigger), md)) {
833 832 error = EFAULT;
834 833 break;
835 834 }
836 835 error = vatpic_set_irq_trigger(sc->vmm_vm,
837 836 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
838 837 break;
839 838 }
840 839
841 840 case VM_MMAP_GETNEXT: {
842 841 struct vm_memmap mm;
843 842
844 843 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
845 844 error = EFAULT;
846 845 break;
847 846 }
848 847 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
849 848 &mm.segoff, &mm.len, &mm.prot, &mm.flags);
850 849 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
851 850 error = EFAULT;
852 851 break;
853 852 }
854 853 break;
855 854 }
856 855 case VM_MMAP_MEMSEG: {
857 856 struct vm_memmap mm;
858 857
859 858 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
860 859 error = EFAULT;
861 860 break;
862 861 }
863 862 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
864 863 mm.len, mm.prot, mm.flags);
865 864 break;
866 865 }
867 866 case VM_MUNMAP_MEMSEG: {
868 867 struct vm_munmap mu;
869 868
870 869 if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
871 870 error = EFAULT;
872 871 break;
873 872 }
874 873 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
875 874 break;
876 875 }
877 876 case VM_ALLOC_MEMSEG: {
878 877 struct vm_memseg vmseg;
879 878
880 879 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
881 880 error = EFAULT;
882 881 break;
883 882 }
884 883 error = vmmdev_alloc_memseg(sc, &vmseg);
885 884 break;
886 885 }
887 886 case VM_GET_MEMSEG: {
888 887 struct vm_memseg vmseg;
889 888
890 889 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
891 890 error = EFAULT;
892 891 break;
893 892 }
894 893 error = vmmdev_get_memseg(sc, &vmseg);
895 894 if (error == 0 &&
896 895 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
897 896 error = EFAULT;
898 897 break;
899 898 }
900 899 break;
901 900 }
902 901 case VM_GET_REGISTER: {
903 902 struct vm_register vmreg;
904 903
905 904 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
906 905 error = EFAULT;
907 906 break;
908 907 }
909 908 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
910 909 &vmreg.regval);
911 910 if (error == 0 &&
912 911 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
913 912 error = EFAULT;
914 913 break;
915 914 }
916 915 break;
917 916 }
918 917 case VM_SET_REGISTER: {
919 918 struct vm_register vmreg;
920 919
921 920 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
922 921 error = EFAULT;
923 922 break;
924 923 }
925 924 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
926 925 vmreg.regval);
927 926 break;
928 927 }
929 928 case VM_SET_SEGMENT_DESCRIPTOR: {
930 929 struct vm_seg_desc vmsegd;
931 930
932 931 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
933 932 error = EFAULT;
934 933 break;
935 934 }
936 935 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
937 936 &vmsegd.desc);
938 937 break;
939 938 }
940 939 case VM_GET_SEGMENT_DESCRIPTOR: {
941 940 struct vm_seg_desc vmsegd;
942 941
943 942 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
944 943 error = EFAULT;
945 944 break;
946 945 }
947 946 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
948 947 &vmsegd.desc);
949 948 if (error == 0 &&
950 949 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
951 950 error = EFAULT;
952 951 break;
953 952 }
954 953 break;
955 954 }
956 955 case VM_GET_REGISTER_SET: {
957 956 struct vm_register_set vrs;
958 957 int regnums[VM_REG_LAST];
959 958 uint64_t regvals[VM_REG_LAST];
960 959
961 960 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
962 961 error = EFAULT;
963 962 break;
964 963 }
965 964 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
966 965 error = EINVAL;
967 966 break;
968 967 }
969 968 if (ddi_copyin(vrs.regnums, regnums,
970 969 sizeof (int) * vrs.count, md)) {
971 970 error = EFAULT;
972 971 break;
973 972 }
974 973
975 974 error = 0;
976 975 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
977 976 if (regnums[i] < 0) {
978 977 error = EINVAL;
979 978 break;
980 979 }
981 980 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
 982  981 			    &regvals[i]);
983 982 }
984 983 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
985 984 sizeof (uint64_t) * vrs.count, md)) {
986 985 error = EFAULT;
987 986 }
988 987 break;
989 988 }
990 989 case VM_SET_REGISTER_SET: {
991 990 struct vm_register_set vrs;
992 991 int regnums[VM_REG_LAST];
993 992 uint64_t regvals[VM_REG_LAST];
994 993
995 994 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
996 995 error = EFAULT;
997 996 break;
998 997 }
999 998 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1000 999 error = EINVAL;
1001 1000 break;
1002 1001 }
1003 1002 if (ddi_copyin(vrs.regnums, regnums,
1004 1003 sizeof (int) * vrs.count, md)) {
1005 1004 error = EFAULT;
1006 1005 break;
1007 1006 }
1008 1007 if (ddi_copyin(vrs.regvals, regvals,
1009 1008 sizeof (uint64_t) * vrs.count, md)) {
1010 1009 error = EFAULT;
1011 1010 break;
1012 1011 }
1013 1012
1014 1013 error = 0;
1015 1014 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1016 1015 /*
1017 1016 * Setting registers in a set is not atomic, since a
1018 1017 * failure in the middle of the set will cause a
1019 1018 * bail-out and inconsistent register state. Callers
1020 1019 * should be wary of this.
1021 1020 */
1022 1021 if (regnums[i] < 0) {
1023 1022 error = EINVAL;
1024 1023 break;
1025 1024 }
1026 1025 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1027 1026 regvals[i]);
1028 1027 }
1029 1028 break;
1030 1029 }
1031 1030 case VM_RESET_CPU: {
1032 1031 struct vm_vcpu_reset vvr;
1033 1032
1034 1033 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1035 1034 error = EFAULT;
1036 1035 break;
1037 1036 }
1038 1037 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1039 1038 error = EINVAL;
1040 1039 }
1041 1040
1042 1041 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1043 1042 break;
1044 1043 }
1045 1044 case VM_GET_RUN_STATE: {
1046 1045 struct vm_run_state vrs;
1047 1046
1048 1047 bzero(&vrs, sizeof (vrs));
1049 1048 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1050 1049 &vrs.sipi_vector);
1051 1050 if (error == 0) {
1052 1051 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1053 1052 error = EFAULT;
1054 1053 break;
1055 1054 }
1056 1055 }
1057 1056 break;
1058 1057 }
1059 1058 case VM_SET_RUN_STATE: {
1060 1059 struct vm_run_state vrs;
1061 1060
1062 1061 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1063 1062 error = EFAULT;
1064 1063 break;
1065 1064 }
1066 1065 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1067 1066 vrs.sipi_vector);
1068 1067 break;
1069 1068 }
1070 1069
1071 1070 case VM_SET_KERNEMU_DEV:
1072 1071 case VM_GET_KERNEMU_DEV: {
1073 1072 struct vm_readwrite_kernemu_device kemu;
1074 1073 size_t size = 0;
1075 1074
1076 1075 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1077 1076 error = EFAULT;
1078 1077 break;
1079 1078 }
1080 1079
1081 1080 if (kemu.access_width > 3) {
1082 1081 error = EINVAL;
1083 1082 break;
1084 1083 }
1085 1084 size = (1 << kemu.access_width);
1086 1085 ASSERT(size >= 1 && size <= 8);
1087 1086
1088 1087 if (cmd == VM_SET_KERNEMU_DEV) {
1089 1088 error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1090 1089 kemu.gpa, kemu.value, size);
1091 1090 } else {
1092 1091 error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1093 1092 kemu.gpa, &kemu.value, size);
1094 1093 }
1095 1094
1096 1095 if (error == 0) {
1097 1096 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1098 1097 error = EFAULT;
1099 1098 break;
1100 1099 }
1101 1100 }
1102 1101 break;
1103 1102 }
1104 1103
1105 1104 case VM_GET_CAPABILITY: {
1106 1105 struct vm_capability vmcap;
1107 1106
1108 1107 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1109 1108 error = EFAULT;
1110 1109 break;
1111 1110 }
1112 1111 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1113 1112 &vmcap.capval);
1114 1113 if (error == 0 &&
1115 1114 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1116 1115 error = EFAULT;
1117 1116 break;
1118 1117 }
1119 1118 break;
1120 1119 }
1121 1120 case VM_SET_CAPABILITY: {
1122 1121 struct vm_capability vmcap;
1123 1122
1124 1123 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1125 1124 error = EFAULT;
1126 1125 break;
1127 1126 }
1128 1127 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1129 1128 vmcap.capval);
1130 1129 break;
1131 1130 }
1132 1131 case VM_SET_X2APIC_STATE: {
1133 1132 struct vm_x2apic x2apic;
1134 1133
1135 1134 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1136 1135 error = EFAULT;
1137 1136 break;
1138 1137 }
1139 1138 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1140 1139 break;
1141 1140 }
1142 1141 case VM_GET_X2APIC_STATE: {
1143 1142 struct vm_x2apic x2apic;
1144 1143
1145 1144 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1146 1145 error = EFAULT;
1147 1146 break;
1148 1147 }
1149 1148 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1150 1149 &x2apic.state);
1151 1150 if (error == 0 &&
1152 1151 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1153 1152 error = EFAULT;
1154 1153 break;
1155 1154 }
1156 1155 break;
1157 1156 }
1158 1157 case VM_GET_GPA_PMAP: {
1159 1158 struct vm_gpa_pte gpapte;
1160 1159
1161 1160 if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
1162 1161 error = EFAULT;
1163 1162 break;
1164 1163 }
1165 1164 #ifdef __FreeBSD__
1166 1165 /* XXXJOY: add function? */
1167 1166 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
1168 1167 gpapte.gpa, gpapte.pte, &gpapte.ptenum);
1169 1168 #endif
1170 1169 error = 0;
1171 1170 break;
1172 1171 }
1173 1172 case VM_GET_HPET_CAPABILITIES: {
1174 1173 struct vm_hpet_cap hpetcap;
1175 1174
1176 1175 error = vhpet_getcap(&hpetcap);
1177 1176 if (error == 0 &&
1178 1177 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1179 1178 error = EFAULT;
1180 1179 break;
1181 1180 }
1182 1181 break;
1183 1182 }
1184 1183 case VM_GLA2GPA: {
1185 1184 struct vm_gla2gpa gg;
1186 1185
1187 1186 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1188 1187 error = EFAULT;
1189 1188 break;
1190 1189 }
1191 1190 gg.vcpuid = vcpu;
1192 1191 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1193 1192 gg.prot, &gg.gpa, &gg.fault);
1194 1193 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1195 1194 error = EFAULT;
1196 1195 break;
1197 1196 }
1198 1197 break;
1199 1198 }
1200 1199 case VM_GLA2GPA_NOFAULT: {
1201 1200 struct vm_gla2gpa gg;
1202 1201
1203 1202 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1204 1203 error = EFAULT;
1205 1204 break;
1206 1205 }
1207 1206 gg.vcpuid = vcpu;
1208 1207 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1209 1208 gg.gla, gg.prot, &gg.gpa, &gg.fault);
1210 1209 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1211 1210 error = EFAULT;
1212 1211 break;
1213 1212 }
1214 1213 break;
1215 1214 }
1216 1215
1217 1216 case VM_ACTIVATE_CPU:
1218 1217 error = vm_activate_cpu(sc->vmm_vm, vcpu);
1219 1218 break;
1220 1219
1221 1220 case VM_SUSPEND_CPU:
1222 1221 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1223 1222 error = EFAULT;
1224 1223 } else {
1225 1224 error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1226 1225 }
1227 1226 break;
1228 1227
1229 1228 case VM_RESUME_CPU:
1230 1229 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1231 1230 error = EFAULT;
1232 1231 } else {
1233 1232 error = vm_resume_cpu(sc->vmm_vm, vcpu);
1234 1233 }
1235 1234 break;
1236 1235
1237 1236 case VM_GET_CPUS: {
1238 1237 struct vm_cpuset vm_cpuset;
1239 1238 cpuset_t tempset;
1240 1239 void *srcp = &tempset;
1241 1240 int size;
1242 1241
1243 1242 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1244 1243 error = EFAULT;
1245 1244 break;
1246 1245 }
1247 1246
1248 1247 /* Be more generous about sizing since our cpuset_t is large. */
1249 1248 size = vm_cpuset.cpusetsize;
1250 1249 if (size <= 0 || size > sizeof (cpuset_t)) {
1251 1250 error = ERANGE;
1252 1251 }
1253 1252 /*
1254 1253 * If they want a ulong_t or less, make sure they receive the
1255 1254 * low bits with all the useful information.
1256 1255 */
1257 1256 if (size <= sizeof (tempset.cpub[0])) {
1258 1257 srcp = &tempset.cpub[0];
1259 1258 }
1260 1259
1261 1260 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1262 1261 tempset = vm_active_cpus(sc->vmm_vm);
1263 1262 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1264 1263 tempset = vm_suspended_cpus(sc->vmm_vm);
1265 1264 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1266 1265 tempset = vm_debug_cpus(sc->vmm_vm);
1267 1266 } else {
1268 1267 error = EINVAL;
1269 1268 }
1270 1269
1271 1270 ASSERT(size > 0 && size <= sizeof (tempset));
1272 1271 if (error == 0 &&
1273 1272 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1274 1273 error = EFAULT;
1275 1274 break;
1276 1275 }
1277 1276 break;
1278 1277 }
1279 1278 case VM_SET_INTINFO: {
1280 1279 struct vm_intinfo vmii;
1281 1280
1282 1281 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1283 1282 error = EFAULT;
1284 1283 break;
1285 1284 }
1286 1285 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1287 1286 break;
1288 1287 }
1289 1288 case VM_GET_INTINFO: {
1290 1289 struct vm_intinfo vmii;
1291 1290
1292 1291 vmii.vcpuid = vcpu;
1293 1292 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1294 1293 &vmii.info2);
1295 1294 if (error == 0 &&
1296 1295 ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1297 1296 error = EFAULT;
1298 1297 break;
1299 1298 }
1300 1299 break;
1301 1300 }
1302 1301 case VM_RTC_WRITE: {
1303 1302 struct vm_rtc_data rtcdata;
1304 1303
1305 1304 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1306 1305 error = EFAULT;
1307 1306 break;
1308 1307 }
1309 1308 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1310 1309 rtcdata.value);
1311 1310 break;
1312 1311 }
1313 1312 case VM_RTC_READ: {
1314 1313 struct vm_rtc_data rtcdata;
1315 1314
1316 1315 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1317 1316 error = EFAULT;
1318 1317 break;
1319 1318 }
1320 1319 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1321 1320 &rtcdata.value);
1322 1321 if (error == 0 &&
1323 1322 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1324 1323 error = EFAULT;
1325 1324 break;
1326 1325 }
1327 1326 break;
1328 1327 }
1329 1328 case VM_RTC_SETTIME: {
1330 1329 struct vm_rtc_time rtctime;
1331 1330
1332 1331 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1333 1332 error = EFAULT;
1334 1333 break;
1335 1334 }
1336 1335 error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1337 1336 break;
1338 1337 }
1339 1338 case VM_RTC_GETTIME: {
1340 1339 struct vm_rtc_time rtctime;
1341 1340
1342 1341 rtctime.secs = vrtc_get_time(sc->vmm_vm);
1343 1342 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1344 1343 error = EFAULT;
1345 1344 break;
1346 1345 }
1347 1346 break;
1348 1347 }
1349 1348
1350 1349 case VM_PMTMR_LOCATE: {
1351 1350 uint16_t port = arg;
1352 1351 error = vpmtmr_set_location(sc->vmm_vm, port);
1353 1352 break;
1354 1353 }
1355 1354
1356 1355 case VM_RESTART_INSTRUCTION:
1357 1356 error = vm_restart_instruction(sc->vmm_vm, vcpu);
1358 1357 break;
1359 1358
1360 1359 case VM_SET_TOPOLOGY: {
1361 1360 struct vm_cpu_topology topo;
1362 1361
1363 1362 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1364 1363 error = EFAULT;
1365 1364 break;
1366 1365 }
1367 1366 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1368 1367 topo.threads, topo.maxcpus);
1369 1368 break;
1370 1369 }
1371 1370 case VM_GET_TOPOLOGY: {
1372 1371 struct vm_cpu_topology topo;
1373 1372
1374 1373 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1375 1374 &topo.threads, &topo.maxcpus);
1376 1375 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1377 1376 error = EFAULT;
1378 1377 break;
1379 1378 }
1380 1379 break;
1381 1380 }
1382 1381
1383 1382 case VM_DEVMEM_GETOFFSET: {
1384 1383 struct vm_devmem_offset vdo;
1385 1384 list_t *dl = &sc->vmm_devmem_list;
1386 1385 vmm_devmem_entry_t *de = NULL;
1387 1386
1388 1387 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1389 1388 error = EFAULT;
1390 1389 break;
1391 1390 }
1392 1391
1393 1392 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
1394 1393 if (de->vde_segid == vdo.segid) {
1395 1394 break;
1396 1395 }
1397 1396 }
1398 1397 if (de != NULL) {
1399 1398 vdo.offset = de->vde_off;
1400 1399 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1401 1400 error = EFAULT;
1402 1401 }
1403 1402 } else {
1404 1403 error = ENOENT;
1405 1404 }
1406 1405 break;
1407 1406 }
1408 1407 case VM_WRLOCK_CYCLE: {
1409 1408 /*
1410 1409 * Present a test mechanism to acquire/release the write lock
1411 1410 * on the VM without any other effects.
1412 1411 */
1413 1412 break;
1414 1413 }
1415 - case VM_ARC_RESV:
1416 - error = vm_arc_resv(sc->vmm_vm, (uint64_t)arg);
1417 - break;
1414 +
1418 1415 default:
1419 1416 error = ENOTTY;
1420 1417 break;
1421 1418 }
1422 1419
1423 1420 /* Release exclusion resources */
1424 1421 switch (lock_type) {
1425 1422 case LOCK_NONE:
1426 1423 break;
1427 1424 case LOCK_VCPU:
1428 1425 vcpu_unlock_one(sc, vcpu);
1429 1426 break;
1430 1427 case LOCK_READ_HOLD:
1431 1428 vmm_read_unlock(sc);
1432 1429 break;
1433 1430 case LOCK_WRITE_HOLD:
1434 1431 vmm_write_unlock(sc);
1435 1432 break;
1436 1433 default:
1437 1434 panic("unexpected lock type");
1438 1435 break;
1439 1436 }
1440 1437
1441 1438 return (error);
1442 1439 }
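For reference, a minimal userspace sketch of a vCPU-scoped ioctl built on the convention above (the helper name and header paths are assumptions; the struct layout follows the bhyve vm_register definition, with the vCPU id as the leading int that the kernel copies in and locks first):

	#include <sys/types.h>
	#include <sys/vmm.h>
	#include <sys/vmm_dev.h>
	#include <unistd.h>
	#include <err.h>

	/* Illustrative only: read %rip of vCPU 0 via an open /dev/vmm/<name> fd. */
	static uint64_t
	example_read_rip(int vmfd)
	{
		struct vm_register vmreg = { 0 };

		vmreg.cpuid = 0;	/* must stay the first member */
		vmreg.regnum = VM_REG_GUEST_RIP;
		if (ioctl(vmfd, VM_GET_REGISTER, &vmreg) != 0)
			err(1, "VM_GET_REGISTER");
		return (vmreg.regval);
	}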
1443 1440
1444 1441 static vmm_softc_t *
1445 1442 vmm_lookup(const char *name)
1446 1443 {
1447 1444 list_t *vml = &vmm_list;
1448 1445 vmm_softc_t *sc;
1449 1446
1450 1447 ASSERT(MUTEX_HELD(&vmm_mtx));
1451 1448
1452 1449 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1453 1450 if (strcmp(sc->vmm_name, name) == 0) {
1454 1451 break;
1455 1452 }
1456 1453 }
1457 1454
1458 1455 return (sc);
1459 1456 }
1460 1457
1461 1458 /*
1462 1459 * Acquire an HMA registration if not already held.
1463 1460 */
1464 1461 static boolean_t
1465 1462 vmm_hma_acquire(void)
1466 1463 {
1467 1464 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1468 1465
1469 1466 mutex_enter(&vmmdev_mtx);
1470 1467
1471 1468 if (vmmdev_hma_reg == NULL) {
1472 1469 VERIFY3U(vmmdev_hma_ref, ==, 0);
1473 1470 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1474 1471 if (vmmdev_hma_reg == NULL) {
1475 1472 cmn_err(CE_WARN, "%s HMA registration failed.",
1476 1473 vmmdev_hvm_name);
1477 1474 mutex_exit(&vmmdev_mtx);
1478 1475 return (B_FALSE);
1479 1476 }
1480 1477 }
1481 1478
1482 1479 vmmdev_hma_ref++;
1483 1480
1484 1481 mutex_exit(&vmmdev_mtx);
1485 1482
1486 1483 return (B_TRUE);
1487 1484 }
1488 1485
1489 1486 /*
1490 1487 * Release the HMA registration if held and there are no remaining VMs.
1491 1488 */
1492 1489 static void
1493 1490 vmm_hma_release(void)
1494 1491 {
1495 1492 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1496 1493
1497 1494 mutex_enter(&vmmdev_mtx);
1498 1495
1499 1496 VERIFY3U(vmmdev_hma_ref, !=, 0);
1500 1497
1501 1498 vmmdev_hma_ref--;
1502 1499
1503 1500 if (vmmdev_hma_ref == 0) {
1504 1501 VERIFY(vmmdev_hma_reg != NULL);
1505 1502 hma_unregister(vmmdev_hma_reg);
1506 1503 vmmdev_hma_reg = NULL;
1507 1504 }
1508 1505 mutex_exit(&vmmdev_mtx);
1509 1506 }
1510 1507
1511 1508 static int
1512 1509 vmmdev_do_vm_create(char *name, cred_t *cr)
1513 1510 {
1514 1511 vmm_softc_t *sc = NULL;
1515 1512 minor_t minor;
1516 1513 int error = ENOMEM;
1517 1514
1518 1515 if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
1519 1516 return (EINVAL);
1520 1517 }
1521 1518
1522 1519 if (!vmm_hma_acquire())
1523 1520 return (ENXIO);
1524 1521
1525 1522 mutex_enter(&vmm_mtx);
1526 1523
1527 1524 /* Look for duplicate names */
1528 1525 if (vmm_lookup(name) != NULL) {
1529 1526 mutex_exit(&vmm_mtx);
1530 1527 vmm_hma_release();
1531 1528 return (EEXIST);
1532 1529 }
1533 1530
1534 1531 /* Allow only one instance per non-global zone. */
1535 1532 if (!INGLOBALZONE(curproc)) {
1536 1533 for (sc = list_head(&vmm_list); sc != NULL;
1537 1534 sc = list_next(&vmm_list, sc)) {
1538 1535 if (sc->vmm_zone == curzone) {
1539 1536 mutex_exit(&vmm_mtx);
1540 1537 vmm_hma_release();
1541 1538 return (EINVAL);
1542 1539 }
1543 1540 }
1544 1541 }
1545 1542
1546 1543 minor = id_alloc(vmm_minors);
1547 1544 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1548 1545 goto fail;
1549 1546 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1550 1547 ddi_soft_state_free(vmm_statep, minor);
1551 1548 goto fail;
1552 1549 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1553 1550 DDI_PSEUDO, 0) != DDI_SUCCESS) {
1554 1551 goto fail;
1555 1552 }
1556 1553
1557 1554 if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1558 1555 goto fail;
1559 1556 }
1560 1557
1561 1558 error = vm_create(name, &sc->vmm_vm);
1562 1559 if (error == 0) {
1563 1560 		/* Complete VM initialization and report success. */
1564 1561 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1565 1562 sc->vmm_minor = minor;
1566 1563 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1567 1564 offsetof(vmm_devmem_entry_t, vde_node));
1568 1565
1569 1566 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1570 1567 offsetof(vmm_hold_t, vmh_node));
1571 1568 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1572 1569
1573 1570 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1574 1571 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1575 1572 offsetof(vmm_lease_t, vml_node));
1576 1573 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1577 1574 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1578 1575
1579 1576 sc->vmm_zone = crgetzone(cr);
1580 1577 zone_hold(sc->vmm_zone);
1581 1578 vmm_zsd_add_vm(sc);
1582 1579 vmm_kstat_init(sc);
1583 1580
1584 1581 list_insert_tail(&vmm_list, sc);
1585 1582 mutex_exit(&vmm_mtx);
1586 1583 return (0);
1587 1584 }
1588 1585
1589 1586 vmm_kstat_fini(sc);
1590 1587 ddi_remove_minor_node(vmmdev_dip, name);
1591 1588 fail:
1592 1589 id_free(vmm_minors, minor);
1593 1590 if (sc != NULL) {
1594 1591 ddi_soft_state_free(vmm_statep, minor);
1595 1592 }
1596 1593 mutex_exit(&vmm_mtx);
1597 1594 vmm_hma_release();
1598 1595
1599 1596 return (error);
1600 1597 }
1601 1598
1602 1599 /*
1603 1600 * Bhyve 'Driver' Interface
1604 1601 *
1605 1602 * While many devices are emulated in the bhyve userspace process, there are
1606 1603 * others with performance constraints which require that they run mostly or
1607 1604 * entirely in-kernel. For those not integrated directly into bhyve, an API is
1608 1605 * needed so they can query/manipulate the portions of VM state needed to
1609 1606 * fulfill their purpose.
1610 1607 *
1611 1608 * This includes:
1612 1609 * - Translating guest-physical addresses to host-virtual pointers
1613 1610 * - Injecting MSIs
1614 1611 * - Hooking IO port addresses
1615 1612 *
1616 1613 * The vmm_drv interface exists to provide that functionality to its consumers.
1617 1614 * (At this time, 'viona' is the only user)
1618 1615 */
1619 1616 int
1620 1617 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1621 1618 {
1622 1619 vnode_t *vp = fp->f_vnode;
1623 1620 const dev_t dev = vp->v_rdev;
1624 1621 vmm_softc_t *sc;
1625 1622 vmm_hold_t *hold;
1626 1623 int err = 0;
1627 1624
1628 1625 if (vp->v_type != VCHR) {
1629 1626 return (ENXIO);
1630 1627 }
1631 1628 const major_t major = getmajor(dev);
1632 1629 const minor_t minor = getminor(dev);
1633 1630
1634 1631 mutex_enter(&vmmdev_mtx);
1635 1632 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1636 1633 mutex_exit(&vmmdev_mtx);
1637 1634 return (ENOENT);
1638 1635 }
1639 1636 mutex_enter(&vmm_mtx);
1640 1637 mutex_exit(&vmmdev_mtx);
1641 1638
1642 1639 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1643 1640 err = ENOENT;
1644 1641 goto out;
1645 1642 }
1646 1643 /* XXXJOY: check cred permissions against instance */
1647 1644
1648 1645 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1649 1646 err = EBUSY;
1650 1647 goto out;
1651 1648 }
1652 1649
1653 1650 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1654 1651 hold->vmh_sc = sc;
1655 1652 hold->vmh_release_req = B_FALSE;
1656 1653
1657 1654 list_insert_tail(&sc->vmm_holds, hold);
1658 1655 sc->vmm_flags |= VMM_HELD;
1659 1656 *holdp = hold;
1660 1657
1661 1658 out:
1662 1659 mutex_exit(&vmm_mtx);
1663 1660 return (err);
1664 1661 }
1665 1662
1666 1663 void
1667 1664 vmm_drv_rele(vmm_hold_t *hold)
1668 1665 {
1669 1666 vmm_softc_t *sc;
1670 1667
1671 1668 ASSERT(hold != NULL);
1672 1669 ASSERT(hold->vmh_sc != NULL);
1673 1670 VERIFY(hold->vmh_ioport_hook_cnt == 0);
1674 1671
1675 1672 mutex_enter(&vmm_mtx);
1676 1673 sc = hold->vmh_sc;
1677 1674 list_remove(&sc->vmm_holds, hold);
1678 1675 if (list_is_empty(&sc->vmm_holds)) {
1679 1676 sc->vmm_flags &= ~VMM_HELD;
1680 1677 cv_broadcast(&sc->vmm_cv);
1681 1678 }
1682 1679 mutex_exit(&vmm_mtx);
1683 1680 kmem_free(hold, sizeof (*hold));
1684 1681 }
1685 1682
1686 1683 boolean_t
1687 1684 vmm_drv_release_reqd(vmm_hold_t *hold)
1688 1685 {
1689 1686 ASSERT(hold != NULL);
1690 1687
1691 1688 return (hold->vmh_release_req);
1692 1689 }
1693 1690
1694 1691 vmm_lease_t *
1695 1692 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1696 1693 {
1697 1694 vmm_softc_t *sc = hold->vmh_sc;
1698 1695 vmm_lease_t *lease;
1699 1696
1700 1697 ASSERT3P(expiref, !=, NULL);
1701 1698
1702 1699 if (hold->vmh_release_req) {
1703 1700 return (NULL);
1704 1701 }
1705 1702
1706 1703 lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1707 1704 list_link_init(&lease->vml_node);
1708 1705 lease->vml_expire_func = expiref;
1709 1706 lease->vml_expire_arg = arg;
1710 1707 lease->vml_expired = B_FALSE;
1711 1708 lease->vml_hold = hold;
1712 1709 /* cache the VM pointer for one less pointer chase */
1713 1710 lease->vml_vm = sc->vmm_vm;
1714 1711
1715 1712 mutex_enter(&sc->vmm_lease_lock);
1716 1713 while (sc->vmm_lease_blocker != 0) {
1717 1714 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1718 1715 }
1719 1716 list_insert_tail(&sc->vmm_lease_list, lease);
1720 1717 vmm_read_lock(sc);
1721 1718 mutex_exit(&sc->vmm_lease_lock);
1722 1719
1723 1720 return (lease);
1724 1721 }
1725 1722
1726 1723 static void
1727 1724 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1728 1725 {
1729 1726 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1730 1727
1731 1728 list_remove(&sc->vmm_lease_list, lease);
1732 1729 vmm_read_unlock(sc);
1733 1730 kmem_free(lease, sizeof (*lease));
1734 1731 }
1735 1732
1736 1733 void
1737 1734 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1738 1735 {
1739 1736 vmm_softc_t *sc = hold->vmh_sc;
1740 1737
1741 1738 VERIFY3P(hold, ==, lease->vml_hold);
1742 1739
1743 1740 mutex_enter(&sc->vmm_lease_lock);
1744 1741 vmm_lease_break_locked(sc, lease);
1745 1742 mutex_exit(&sc->vmm_lease_lock);
1746 1743 }
1747 1744
1748 1745 boolean_t
1749 1746 vmm_drv_lease_expired(vmm_lease_t *lease)
1750 1747 {
1751 1748 return (lease->vml_expired);
1752 1749 }
1753 1750
1754 1751 void *
1755 1752 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1756 1753 {
1757 1754 ASSERT(lease != NULL);
1758 1755
1759 1756 return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
1760 1757 }
1761 1758
1762 1759 int
1763 1760 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1764 1761 {
1765 1762 ASSERT(lease != NULL);
1766 1763
1767 1764 return (lapic_intr_msi(lease->vml_vm, addr, msg));
1768 1765 }
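Putting the driver API together, a hypothetical viona-style consumer might drive it roughly as follows (example_attach_vm(), example_lease_expired_cb(), example_softc_t, and the esc fields are illustrative; the vmm_drv_* calls are the ones defined in this file):

	/* Hypothetical in-kernel consumer flow; error handling abbreviated. */
	static int
	example_attach_vm(file_t *fp, cred_t *credp, example_softc_t *esc)
	{
		vmm_hold_t *hold;
		vmm_lease_t *lease;
		int err;

		if ((err = vmm_drv_hold(fp, credp, &hold)) != 0)
			return (err);

		/* The callback fires when a writer needs the VM write lock. */
		lease = vmm_drv_lease_sign(hold, example_lease_expired_cb, esc);
		if (lease == NULL) {
			vmm_drv_rele(hold);
			return (EBUSY);
		}

		/* Translate a guest-physical ring and post an MSI while leased. */
		esc->esc_ring = vmm_drv_gpa2kva(lease, esc->esc_ring_gpa,
		    esc->esc_ring_len);
		(void) vmm_drv_msi(lease, esc->esc_msi_addr, esc->esc_msi_msg);

		/* Teardown: drop the lease, then the hold. */
		vmm_drv_lease_break(hold, lease);
		vmm_drv_rele(hold);
		return (0);
	}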
1769 1766
1770 1767 int
1771 1768 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1772 1769 void *arg, void **cookie)
1773 1770 {
1774 1771 vmm_softc_t *sc;
1775 1772 int err;
1776 1773
1777 1774 ASSERT(hold != NULL);
1778 1775 ASSERT(cookie != NULL);
1779 1776
1780 1777 sc = hold->vmh_sc;
1781 1778 mutex_enter(&vmm_mtx);
1782 1779 /* Confirm that hook installation is not blocked */
1783 1780 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1784 1781 mutex_exit(&vmm_mtx);
1785 1782 return (EBUSY);
1786 1783 }
1787 1784 /*
1788 1785 * Optimistically record an installed hook which will prevent a block
1789 1786 * from being asserted while the mutex is dropped.
1790 1787 */
1791 1788 hold->vmh_ioport_hook_cnt++;
1792 1789 mutex_exit(&vmm_mtx);
1793 1790
1794 1791 vmm_write_lock(sc);
1795 1792 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1796 1793 arg, cookie);
1797 1794 vmm_write_unlock(sc);
1798 1795
1799 1796 if (err != 0) {
1800 1797 mutex_enter(&vmm_mtx);
1801 1798 /* Walk back optimism about the hook installation */
1802 1799 hold->vmh_ioport_hook_cnt--;
1803 1800 mutex_exit(&vmm_mtx);
1804 1801 }
1805 1802 return (err);
1806 1803 }
1807 1804
1808 1805 void
1809 1806 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1810 1807 {
1811 1808 vmm_softc_t *sc;
1812 1809
1813 1810 ASSERT(hold != NULL);
1814 1811 ASSERT(cookie != NULL);
1815 1812 ASSERT(hold->vmh_ioport_hook_cnt != 0);
1816 1813
1817 1814 sc = hold->vmh_sc;
1818 1815 vmm_write_lock(sc);
1819 1816 vm_ioport_unhook(sc->vmm_vm, cookie);
1820 1817 vmm_write_unlock(sc);
1821 1818
1822 1819 mutex_enter(&vmm_mtx);
1823 1820 hold->vmh_ioport_hook_cnt--;
1824 1821 mutex_exit(&vmm_mtx);
1825 1822 }
1826 1823
1827 1824 static int
1828 1825 vmm_drv_purge(vmm_softc_t *sc)
1829 1826 {
1830 1827 ASSERT(MUTEX_HELD(&vmm_mtx));
1831 1828
1832 1829 if ((sc->vmm_flags & VMM_HELD) != 0) {
1833 1830 vmm_hold_t *hold;
1834 1831
1835 1832 sc->vmm_flags |= VMM_CLEANUP;
1836 1833 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1837 1834 hold = list_next(&sc->vmm_holds, hold)) {
1838 1835 hold->vmh_release_req = B_TRUE;
1839 1836 }
1840 1837 while ((sc->vmm_flags & VMM_HELD) != 0) {
1841 1838 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1842 1839 return (EINTR);
1843 1840 }
1844 1841 }
1845 1842 sc->vmm_flags &= ~VMM_CLEANUP;
1846 1843 }
1847 1844
1848 1845 VERIFY(list_is_empty(&sc->vmm_holds));
1849 1846 sc->vmm_flags |= VMM_PURGED;
1850 1847 return (0);
1851 1848 }
1852 1849
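vmm_drv_purge() flags every hold with vmh_release_req and then sleeps on vmm_cv until the VMM_HELD flag clears, returning EINTR if the interruptible wait is broken by a signal. The following is a small userland analogue of that drain-and-wait pattern with POSIX condition variables (the signal-interruptible behaviour of cv_wait_sig() is omitted); res_t, res_purge() and res_release() are illustrative names only.

	#include <pthread.h>
	#include <stdbool.h>

	typedef struct res {
		pthread_mutex_t	lock;
		pthread_cond_t	cv;
		unsigned int	holds;		/* outstanding references */
		bool		release_req;	/* holders asked to let go */
	} res_t;

	/* Ask holders to release and wait until the last one has done so. */
	void
	res_purge(res_t *r)
	{
		pthread_mutex_lock(&r->lock);
		r->release_req = true;
		while (r->holds != 0)
			pthread_cond_wait(&r->cv, &r->lock);
		r->release_req = false;
		pthread_mutex_unlock(&r->lock);
	}

	/* Called by a holder when it drops its reference. */
	void
	res_release(res_t *r)
	{
		pthread_mutex_lock(&r->lock);
		if (--r->holds == 0)
			pthread_cond_broadcast(&r->cv);
		pthread_mutex_unlock(&r->lock);
	}
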
1853 1850 static int
1854 1851 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1855 1852 {
1856 1853 int err = 0;
1857 1854
1858 1855 mutex_enter(&vmm_mtx);
1859 1856 if (!enable_block) {
1860 1857 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1861 1858
1862 1859 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1863 1860 goto done;
1864 1861 }
1865 1862
1866 1863 /* If any holds have hooks installed, the block is a failure */
1867 1864 if (!list_is_empty(&sc->vmm_holds)) {
1868 1865 vmm_hold_t *hold;
1869 1866
1870 1867 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1871 1868 hold = list_next(&sc->vmm_holds, hold)) {
1872 1869 if (hold->vmh_ioport_hook_cnt != 0) {
1873 1870 err = EBUSY;
1874 1871 goto done;
1875 1872 }
1876 1873 }
1877 1874 }
1878 1875 sc->vmm_flags |= VMM_BLOCK_HOOK;
1879 1876
1880 1877 done:
1881 1878 mutex_exit(&vmm_mtx);
1882 1879 return (err);
1883 1880 }
1884 1881
1885 1882 static int
1886 1883 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1887 1884 boolean_t *hma_release)
1888 1885 {
1889 1886 dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
1890 1887 minor_t minor;
1891 1888
1892 1889 ASSERT(MUTEX_HELD(&vmm_mtx));
1893 1890
1894 1891 *hma_release = B_FALSE;
1895 1892
1896 1893 if (vmm_drv_purge(sc) != 0) {
1897 1894 return (EINTR);
1898 1895 }
1899 1896
1900 1897 if (clean_zsd) {
1901 1898 vmm_zsd_rem_vm(sc);
1902 1899 }
1903 1900
1904 1901 /* Clean up devmem entries */
1905 1902 vmmdev_devmem_purge(sc);
1906 1903
1907 1904 list_remove(&vmm_list, sc);
1908 1905 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
1909 1906 minor = sc->vmm_minor;
1910 1907 zone_rele(sc->vmm_zone);
1911 1908 if (sc->vmm_is_open) {
1912 1909 list_insert_tail(&vmm_destroy_list, sc);
1913 1910 sc->vmm_flags |= VMM_DESTROY;
1914 1911 } else {
1915 1912 vm_destroy(sc->vmm_vm);
1916 1913 vmm_kstat_fini(sc);
1917 1914 ddi_soft_state_free(vmm_statep, minor);
1918 1915 id_free(vmm_minors, minor);
1919 1916 *hma_release = B_TRUE;
1920 1917 }
1921 1918 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
1922 1919
1923 1920 return (0);
1924 1921 }
1925 1922
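vmm_do_vm_destroy_locked() either frees the instance immediately or, when a descriptor is still open, moves it to vmm_destroy_list and sets VMM_DESTROY so the final vmm_close() completes the teardown. A compact userland sketch of that destroy-now-or-defer-to-last-close shape follows; obj_t and its helpers are invented for illustration.

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdlib.h>

	typedef struct obj {
		pthread_mutex_t	lock;
		bool		is_open;	/* a descriptor still references us */
		bool		destroy_pending;
	} obj_t;

	static void
	obj_free(obj_t *o)
	{
		pthread_mutex_destroy(&o->lock);
		free(o);
	}

	/* Request destruction: finish now, or defer to the final close. */
	void
	obj_destroy(obj_t *o)
	{
		pthread_mutex_lock(&o->lock);
		if (o->is_open) {
			o->destroy_pending = true;
			pthread_mutex_unlock(&o->lock);
			return;
		}
		pthread_mutex_unlock(&o->lock);
		obj_free(o);
	}

	/* Close path: complete a deferred destroy once the last user is gone. */
	void
	obj_close(obj_t *o)
	{
		bool do_free;

		pthread_mutex_lock(&o->lock);
		o->is_open = false;
		do_free = o->destroy_pending;
		pthread_mutex_unlock(&o->lock);

		if (do_free)
			obj_free(o);
	}
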
1926 1923 int
1927 1924 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
1928 1925 {
1929 1926 boolean_t hma_release = B_FALSE;
1930 1927 int err;
1931 1928
1932 1929 mutex_enter(&vmm_mtx);
1933 1930 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
1934 1931 mutex_exit(&vmm_mtx);
1935 1932
1936 1933 if (hma_release)
1937 1934 vmm_hma_release();
1938 1935
1939 1936 return (err);
1940 1937 }
1941 1938
1942 1939 /* ARGSUSED */
1943 1940 static int
1944 1941 vmmdev_do_vm_destroy(const char *name, cred_t *cr)
1945 1942 {
1946 1943 boolean_t hma_release = B_FALSE;
1947 1944 vmm_softc_t *sc;
1948 1945 int err;
1949 1946
1950 1947 if (crgetuid(cr) != 0)
1951 1948 return (EPERM);
1952 1949
1953 1950 mutex_enter(&vmm_mtx);
1954 1951
1955 1952 if ((sc = vmm_lookup(name)) == NULL) {
1956 1953 mutex_exit(&vmm_mtx);
1957 1954 return (ENOENT);
1958 1955 }
1959 1956 /*
1960 1957 * We don't check this in vmm_lookup() since that function is also used
1961 1958 * for validation during create and currently vmm names must be unique.
1962 1959 */
1963 1960 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
1964 1961 mutex_exit(&vmm_mtx);
1965 1962 return (EPERM);
1966 1963 }
1967 1964 err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
1968 1965
1969 1966 mutex_exit(&vmm_mtx);
1970 1967
1971 1968 if (hma_release)
1972 1969 vmm_hma_release();
1973 1970
1974 1971 return (err);
1975 1972 }
1976 1973
1977 1974 #define VCPU_NAME_BUFLEN 32
1978 1975
1979 1976 static int
1980 1977 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
1981 1978 {
1982 1979 zoneid_t zid = crgetzoneid(cr);
1983 1980 int instance = minor;
1984 1981 kstat_t *ksp;
1985 1982
1986 1983 ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
1987 1984
1988 1985 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
1989 1986 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
1990 1987 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
1991 1988
1992 1989 if (ksp == NULL) {
1993 1990 return (-1);
1994 1991 }
1995 1992 sc->vmm_kstat_vm = ksp;
1996 1993
1997 1994 for (uint_t i = 0; i < VM_MAXCPU; i++) {
1998 1995 char namebuf[VCPU_NAME_BUFLEN];
1999 1996
2000 1997 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2001 1998
2002 1999 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2003 2000 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2004 2001 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2005 2002 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2006 2003 0, zid);
2007 2004 if (ksp == NULL) {
2008 2005 goto fail;
2009 2006 }
2010 2007
2011 2008 sc->vmm_kstat_vcpu[i] = ksp;
2012 2009 }
2013 2010
2014 2011 /*
2015 2012 * If this instance is associated with a non-global zone, make its
2016 2013 * kstats visible from the GZ.
2017 2014 */
2018 2015 if (zid != GLOBAL_ZONEID) {
2019 2016 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2020 2017 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2021 2018 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2022 2019 }
2023 2020 }
2024 2021
2025 2022 return (0);
2026 2023
2027 2024 fail:
2028 2025 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2029 2026 if (sc->vmm_kstat_vcpu[i] != NULL) {
2030 2027 kstat_delete(sc->vmm_kstat_vcpu[i]);
2031 2028 sc->vmm_kstat_vcpu[i] = NULL;
2032 2029 } else {
2033 2030 break;
2034 2031 }
2035 2032 }
2036 2033 kstat_delete(sc->vmm_kstat_vm);
2037 2034 sc->vmm_kstat_vm = NULL;
2038 2035 return (-1);
2039 2036 }
2040 2037
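vmm_kstat_alloc() creates one kstat per possible vcpu and, if any kstat_create_zone() call fails, deletes the ones already created before reporting failure. The same unwind-on-partial-failure shape in miniature, with malloc()/free() standing in for kstat creation and deletion; NITEMS and alloc_all() are illustrative.

	#include <stdlib.h>

	#define	NITEMS	8	/* stand-in for VM_MAXCPU */

	int
	alloc_all(void *items[NITEMS], size_t sz)
	{
		unsigned int i;

		for (i = 0; i < NITEMS; i++) {
			items[i] = malloc(sz);
			if (items[i] == NULL)
				break;
		}
		if (i == NITEMS)
			return (0);

		/* Unwind: free only the slots that were successfully filled. */
		while (i-- > 0) {
			free(items[i]);
			items[i] = NULL;
		}
		return (-1);
	}
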
2041 2038 static void
2042 2039 vmm_kstat_init(vmm_softc_t *sc)
2043 2040 {
2044 2041 kstat_t *ksp;
2045 2042
2046 2043 ASSERT3P(sc->vmm_vm, !=, NULL);
2047 2044 ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2048 2045
2049 2046 ksp = sc->vmm_kstat_vm;
2050 2047 vmm_kstats_t *vk = ksp->ks_data;
2051 2048 ksp->ks_private = sc->vmm_vm;
2052 2049 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2053 2050 kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2054 2051
2055 2052 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2056 2053 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2057 2054
2058 2055 ksp = sc->vmm_kstat_vcpu[i];
2059 2056 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2060 2057
2061 2058 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2062 2059 vvk->vvk_vcpu.value.ui32 = i;
2063 2060 kstat_named_init(&vvk->vvk_time_init, "time_init",
2064 2061 KSTAT_DATA_UINT64);
2065 2062 kstat_named_init(&vvk->vvk_time_run, "time_run",
2066 2063 KSTAT_DATA_UINT64);
2067 2064 kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2068 2065 KSTAT_DATA_UINT64);
2069 2066 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2070 2067 KSTAT_DATA_UINT64);
2071 2068 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2072 2069 KSTAT_DATA_UINT64);
2073 2070 kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2074 2071 KSTAT_DATA_UINT64);
2075 2072 ksp->ks_private = sc->vmm_vm;
2076 2073 ksp->ks_update = vmm_kstat_update_vcpu;
2077 2074 }
2078 2075
2079 2076 kstat_install(sc->vmm_kstat_vm);
2080 2077 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2081 2078 kstat_install(sc->vmm_kstat_vcpu[i]);
2082 2079 }
2083 2080 }
2084 2081
2085 2082 static void
2086 2083 vmm_kstat_fini(vmm_softc_t *sc)
2087 2084 {
2088 2085 ASSERT(sc->vmm_kstat_vm != NULL);
2089 2086
2090 2087 kstat_delete(sc->vmm_kstat_vm);
2091 2088 sc->vmm_kstat_vm = NULL;
2092 2089
2093 2090 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2094 2091 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2095 2092
2096 2093 kstat_delete(sc->vmm_kstat_vcpu[i]);
2097 2094 sc->vmm_kstat_vcpu[i] = NULL;
2098 2095 }
2099 2096 }
2100 2097
2101 2098 static int
2102 2099 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2103 2100 {
2104 2101 minor_t minor;
2105 2102 vmm_softc_t *sc;
2106 2103
2107 2104 minor = getminor(*devp);
2108 2105 if (minor == VMM_CTL_MINOR) {
2109 2106 /*
2110 2107 * Master control device must be opened exclusively.
2111 2108 */
2112 2109 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2113 2110 return (EINVAL);
2114 2111 }
2115 2112
2116 2113 return (0);
2117 2114 }
2118 2115
2119 2116 mutex_enter(&vmm_mtx);
2120 2117 sc = ddi_get_soft_state(vmm_statep, minor);
2121 2118 if (sc == NULL) {
2122 2119 mutex_exit(&vmm_mtx);
2123 2120 return (ENXIO);
2124 2121 }
2125 2122
2126 2123 sc->vmm_is_open = B_TRUE;
2127 2124 mutex_exit(&vmm_mtx);
2128 2125
2129 2126 return (0);
2130 2127 }
2131 2128
2132 2129 static int
2133 2130 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2134 2131 {
2135 2132 minor_t minor;
2136 2133 vmm_softc_t *sc;
2137 2134 boolean_t hma_release = B_FALSE;
2138 2135
2139 2136 minor = getminor(dev);
2140 2137 if (minor == VMM_CTL_MINOR)
2141 2138 return (0);
2142 2139
2143 2140 mutex_enter(&vmm_mtx);
2144 2141 sc = ddi_get_soft_state(vmm_statep, minor);
2145 2142 if (sc == NULL) {
2146 2143 mutex_exit(&vmm_mtx);
2147 2144 return (ENXIO);
2148 2145 }
2149 2146
2150 2147 VERIFY(sc->vmm_is_open);
2151 2148 sc->vmm_is_open = B_FALSE;
2152 2149
2153 2150 /*
2154 2151 * If this VM was destroyed while the vmm device was open, then
2155 2152 * clean it up now that it is closed.
2156 2153 */
2157 2154 if (sc->vmm_flags & VMM_DESTROY) {
2158 2155 list_remove(&vmm_destroy_list, sc);
2159 2156 vm_destroy(sc->vmm_vm);
2160 2157 ddi_soft_state_free(vmm_statep, minor);
2161 2158 id_free(vmm_minors, minor);
2162 2159 hma_release = B_TRUE;
2163 2160 }
2164 2161 mutex_exit(&vmm_mtx);
2165 2162
2166 2163 if (hma_release)
2167 2164 vmm_hma_release();
2168 2165
2169 2166 return (0);
2170 2167 }
2171 2168
2172 2169 static int
2173 2170 vmm_is_supported(intptr_t arg)
2174 2171 {
2175 2172 int r;
2176 2173 const char *msg;
2177 2174
2178 2175 if (vmm_is_intel()) {
2179 2176 r = vmx_x86_supported(&msg);
2180 2177 } else if (vmm_is_svm()) {
2181 2178 /*
2182 2179 * HMA already ensured that the features necessary for SVM
2183 2180 * operation were present and online during vmm_attach().
2184 2181 */
2185 2182 r = 0;
2186 2183 } else {
2187 2184 r = ENXIO;
2188 2185 msg = "Unsupported CPU vendor";
2189 2186 }
2190 2187
2191 2188 if (r != 0 && arg != (intptr_t)NULL) {
2192 - if (copyoutstr(msg, (char *)arg, strlen(msg), NULL) != 0)
2189 + if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2193 2190 return (EFAULT);
2194 2191 }
2195 2192 return (r);
2196 2193 }
2197 2194
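The one-byte change in the hunk above matters because the length handed to copyoutstr() must leave room for the terminating NUL, which strlen() does not count, so the old call could not copy out the full, terminated message. A loose userland illustration of the same accounting, with memcpy() standing in for the copy-out:

	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		const char *msg = "Unsupported CPU vendor";
		char out[64];

		memset(out, 'X', sizeof (out));

		/* Too short: strlen() excludes the NUL, so out[] is unterminated. */
		memcpy(out, msg, strlen(msg));

		/* Correct: one extra byte carries the terminator across. */
		memcpy(out, msg, strlen(msg) + 1);

		(void) printf("%s\n", out);
		return (0);
	}
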
2198 2195 static int
2199 2196 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2200 2197 int *rvalp)
2201 2198 {
2202 2199 vmm_softc_t *sc;
2203 2200 minor_t minor;
2204 2201
2205 2202 /* The structs in bhyve ioctls assume a 64-bit datamodel */
2206 2203 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2207 2204 return (ENOTSUP);
2208 2205 }
2209 2206
2210 2207 minor = getminor(dev);
2211 2208
2212 2209 if (minor == VMM_CTL_MINOR) {
2213 2210 void *argp = (void *)arg;
2214 2211 char name[VM_MAX_NAMELEN] = { 0 };
2215 2212 size_t len = 0;
2216 2213
2217 2214 if ((mode & FKIOCTL) != 0) {
2218 2215 len = strlcpy(name, argp, sizeof (name));
2219 2216 } else {
2220 2217 if (copyinstr(argp, name, sizeof (name), &len) != 0) {
2221 2218 return (EFAULT);
2222 2219 }
2223 2220 }
2224 2221 if (len >= VM_MAX_NAMELEN) {
2225 2222 return (ENAMETOOLONG);
2226 2223 }
2227 2224
2228 2225 switch (cmd) {
2229 2226 case VMM_CREATE_VM:
2230 2227 if ((mode & FWRITE) == 0)
2231 2228 return (EPERM);
2232 2229 return (vmmdev_do_vm_create(name, credp));
2233 2230 case VMM_DESTROY_VM:
2234 2231 if ((mode & FWRITE) == 0)
2235 2232 return (EPERM);
2236 2233 return (vmmdev_do_vm_destroy(name, credp));
2237 2234 case VMM_VM_SUPPORTED:
2238 2235 return (vmm_is_supported(arg));
2239 2236 default:
2240 2237 /* No other actions are legal on ctl device */
2241 2238 return (ENOTTY);
2242 2239 }
2243 2240 }
2244 2241
2245 2242 sc = ddi_get_soft_state(vmm_statep, minor);
2246 2243 ASSERT(sc);
2247 2244
2248 2245 if (sc->vmm_flags & VMM_DESTROY)
2249 2246 return (ENXIO);
2250 2247
2251 2248 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2252 2249 }
2253 2250
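The control-device path above copies the VM name into a fixed VM_MAX_NAMELEN buffer (strlcpy() for in-kernel callers, copyinstr() otherwise) and rejects names that would not fit with ENAMETOOLONG rather than truncating them silently. A tiny userland sketch of that bounded copy with truncation detection; NAME_BUFLEN and copy_name() are invented names.

	#include <errno.h>
	#include <stdio.h>

	#define	NAME_BUFLEN	64	/* stand-in for VM_MAX_NAMELEN */

	int
	copy_name(char dst[NAME_BUFLEN], const char *src)
	{
		/* snprintf() reports the length src needed, excluding the NUL. */
		int n = snprintf(dst, NAME_BUFLEN, "%s", src);

		if (n < 0)
			return (EINVAL);
		if (n >= NAME_BUFLEN)
			return (ENAMETOOLONG);
		return (0);
	}
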
2254 2251 static int
2255 2252 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2256 2253 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2257 2254 {
2258 2255 vmm_softc_t *sc;
2259 2256 const minor_t minor = getminor(dev);
2260 2257 struct vm *vm;
2261 2258 int err;
2262 2259 vm_object_t vmo = NULL;
2263 2260 struct vmspace *vms;
2264 2261
2265 2262 if (minor == VMM_CTL_MINOR) {
2266 2263 return (ENODEV);
2267 2264 }
2268 2265 if (off < 0 || (off + len) <= 0) {
2269 2266 return (EINVAL);
2270 2267 }
2271 2268 if ((prot & PROT_USER) == 0) {
2272 2269 return (EACCES);
2273 2270 }
2274 2271
2275 2272 sc = ddi_get_soft_state(vmm_statep, minor);
2276 2273 ASSERT(sc);
2277 2274
2278 2275 if (sc->vmm_flags & VMM_DESTROY)
2279 2276 return (ENXIO);
2280 2277
2281 2278 /* Grab read lock on the VM to prevent any changes to the memory map */
2282 2279 vmm_read_lock(sc);
2283 2280
2284 2281 vm = sc->vmm_vm;
2285 2282 vms = vm_get_vmspace(vm);
2286 2283 if (off >= VM_DEVMEM_START) {
2287 2284 int segid;
2288 2285 off_t map_off = 0;
2289 2286
2290 2287 /* Mapping a devmem "device" */
2291 2288 if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
2292 2289 err = ENODEV;
2293 2290 goto out;
2294 2291 }
2295 2292 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
2296 2293 if (err != 0) {
2297 2294 goto out;
2298 2295 }
2299 2296 err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
2300 2297 flags);
2301 2298 } else {
2302 2299 /* Mapping a part of the guest physical space */
2303 2300 err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
2304 2301 flags);
2305 2302 }
2306 2303
2307 2304
2308 2305 out:
2309 2306 vmm_read_unlock(sc);
2310 2307 return (err);
2311 2308 }
2312 2309
2313 2310 static sdev_plugin_validate_t
2314 2311 vmm_sdev_validate(sdev_ctx_t ctx)
2315 2312 {
2316 2313 const char *name = sdev_ctx_name(ctx);
2317 2314 vmm_softc_t *sc;
2318 2315 sdev_plugin_validate_t ret;
2319 2316 minor_t minor;
2320 2317
2321 2318 if (sdev_ctx_vtype(ctx) != VCHR)
2322 2319 return (SDEV_VTOR_INVALID);
2323 2320
2324 2321 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2325 2322
2326 2323 mutex_enter(&vmm_mtx);
2327 2324 if ((sc = vmm_lookup(name)) == NULL)
2328 2325 ret = SDEV_VTOR_INVALID;
2329 2326 else if (sc->vmm_minor != minor)
2330 2327 ret = SDEV_VTOR_STALE;
2331 2328 else
2332 2329 ret = SDEV_VTOR_VALID;
2333 2330 mutex_exit(&vmm_mtx);
2334 2331
2335 2332 return (ret);
2336 2333 }
2337 2334
2338 2335 static int
2339 2336 vmm_sdev_filldir(sdev_ctx_t ctx)
2340 2337 {
2341 2338 vmm_softc_t *sc;
2342 2339 int ret;
2343 2340
2344 2341 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2345 2342 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2346 2343 sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2347 2344 return (EINVAL);
2348 2345 }
2349 2346
2350 2347 mutex_enter(&vmm_mtx);
2351 2348 ASSERT(vmmdev_dip != NULL);
2352 2349 for (sc = list_head(&vmm_list); sc != NULL;
2353 2350 sc = list_next(&vmm_list, sc)) {
2354 2351 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2355 2352 ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2356 2353 S_IFCHR | 0600,
2357 2354 makedevice(ddi_driver_major(vmmdev_dip),
2358 2355 sc->vmm_minor));
2359 2356 } else {
2360 2357 continue;
2361 2358 }
2362 2359 if (ret != 0 && ret != EEXIST)
2363 2360 goto out;
2364 2361 }
2365 2362
2366 2363 ret = 0;
2367 2364
2368 2365 out:
2369 2366 mutex_exit(&vmm_mtx);
2370 2367 return (ret);
2371 2368 }
2372 2369
2373 2370 /* ARGSUSED */
2374 2371 static void
2375 2372 vmm_sdev_inactive(sdev_ctx_t ctx)
2376 2373 {
2377 2374 }
2378 2375
2379 2376 static sdev_plugin_ops_t vmm_sdev_ops = {
2380 2377 .spo_version = SDEV_PLUGIN_VERSION,
2381 2378 .spo_flags = SDEV_PLUGIN_SUBDIR,
2382 2379 .spo_validate = vmm_sdev_validate,
2383 2380 .spo_filldir = vmm_sdev_filldir,
2384 2381 .spo_inactive = vmm_sdev_inactive
2385 2382 };
2386 2383
2387 2384 /* ARGSUSED */
2388 2385 static int
2389 2386 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2390 2387 {
2391 2388 int error;
2392 2389
2393 2390 switch (cmd) {
2394 2391 case DDI_INFO_DEVT2DEVINFO:
2395 2392 *result = (void *)vmmdev_dip;
2396 2393 error = DDI_SUCCESS;
2397 2394 break;
2398 2395 case DDI_INFO_DEVT2INSTANCE:
2399 2396 *result = (void *)0;
2400 2397 error = DDI_SUCCESS;
2401 2398 break;
2402 2399 default:
2403 2400 error = DDI_FAILURE;
2404 2401 break;
2405 2402 }
2406 2403 return (error);
2407 2404 }
2408 2405
2409 2406 static int
2410 2407 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2411 2408 {
2412 2409 sdev_plugin_hdl_t sph;
2413 2410 hma_reg_t *reg = NULL;
2414 2411 boolean_t vmm_loaded = B_FALSE;
2415 2412
2416 2413 if (cmd != DDI_ATTACH) {
2417 2414 return (DDI_FAILURE);
2418 2415 }
2419 2416
2420 2417 mutex_enter(&vmmdev_mtx);
2421 2418 /* Ensure we are not already attached. */
2422 2419 if (vmmdev_dip != NULL) {
2423 2420 mutex_exit(&vmmdev_mtx);
2424 2421 return (DDI_FAILURE);
2425 2422 }
2426 2423
2427 2424 vmm_sol_glue_init();
2428 2425 vmm_arena_init();
2429 2426
2430 2427 /*
2431 2428 * Perform temporary HMA registration to determine if the system
2432 2429 * is capable.
2433 2430 */
2434 2431 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2435 2432 goto fail;
2436 2433 } else if (vmm_mod_load() != 0) {
2437 2434 goto fail;
2438 2435 }
2439 2436 vmm_loaded = B_TRUE;
2440 2437 hma_unregister(reg);
2441 2438 reg = NULL;
2442 2439
2443 2440 /* Create control node. Other nodes will be created on demand. */
2444 2441 if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2445 2442 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2446 2443 goto fail;
2447 2444 }
2448 2445
2449 2446 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
2450 2447 if (sph == (sdev_plugin_hdl_t)NULL) {
2451 2448 ddi_remove_minor_node(dip, NULL);
2452 2449 goto fail;
2453 2450 }
2454 2451
2455 2452 ddi_report_dev(dip);
2456 2453 vmmdev_sdev_hdl = sph;
2457 2454 vmmdev_dip = dip;
2458 2455 mutex_exit(&vmmdev_mtx);
2459 2456 return (DDI_SUCCESS);
2460 2457
2461 2458 fail:
2462 2459 if (vmm_loaded) {
2463 2460 VERIFY0(vmm_mod_unload());
2464 2461 }
2465 2462 if (reg != NULL) {
2466 2463 hma_unregister(reg);
2467 2464 }
2468 2465 vmm_arena_fini();
2469 2466 vmm_sol_glue_cleanup();
2470 2467 mutex_exit(&vmmdev_mtx);
2471 2468 return (DDI_FAILURE);
2472 2469 }
2473 2470
2474 2471 static int
2475 2472 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2476 2473 {
2477 2474 if (cmd != DDI_DETACH) {
2478 2475 return (DDI_FAILURE);
2479 2476 }
2480 2477
2481 2478 /*
2482 2479 * Ensure that all resources have been cleaned up.
2483 2480 *
2484 2481 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2485 2482 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2486 2483 * devinfo locked as iommu_cleanup() tries to recursively lock each
2487 2484 * devinfo, including our own, while holding vmmdev_mtx.
2488 2485 */
2489 2486 if (mutex_tryenter(&vmmdev_mtx) == 0)
2490 2487 return (DDI_FAILURE);
2491 2488
2492 2489 mutex_enter(&vmm_mtx);
2493 2490 if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2494 2491 mutex_exit(&vmm_mtx);
2495 2492 mutex_exit(&vmmdev_mtx);
2496 2493 return (DDI_FAILURE);
2497 2494 }
2498 2495 mutex_exit(&vmm_mtx);
2499 2496
2500 2497 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2501 2498 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2502 2499 mutex_exit(&vmmdev_mtx);
2503 2500 return (DDI_FAILURE);
2504 2501 }
2505 2502 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2506 2503
2507 2504 /* Remove the control node. */
2508 2505 ddi_remove_minor_node(dip, "ctl");
2509 2506 vmmdev_dip = NULL;
2510 2507
2511 2508 VERIFY0(vmm_mod_unload());
2512 2509 VERIFY3U(vmmdev_hma_reg, ==, NULL);
2513 2510 vmm_arena_fini();
2514 2511 vmm_sol_glue_cleanup();
2515 2512
2516 2513 mutex_exit(&vmmdev_mtx);
2517 2514
2518 2515 return (DDI_SUCCESS);
2519 2516 }
2520 2517
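vmm_detach() cannot block waiting for vmmdev_mtx while its devinfo node is locked, since iommu_cleanup() takes the locks in the opposite order, so it uses mutex_tryenter() and simply fails the detach when the mutex is busy. A userland sketch of the same back-off-instead-of-blocking idea with pthread_mutex_trylock(); big_lock and teardown_try() are illustrative.

	#include <errno.h>
	#include <pthread.h>

	static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

	int
	teardown_try(void)
	{
		/*
		 * A holder of big_lock may be waiting on a resource we already
		 * own; blocking here could deadlock, so back off and let the
		 * caller retry the teardown later.
		 */
		if (pthread_mutex_trylock(&big_lock) != 0)
			return (EBUSY);

		/* ... perform the teardown work ... */

		pthread_mutex_unlock(&big_lock);
		return (0);
	}
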
2521 2518 static struct cb_ops vmm_cb_ops = {
2522 2519 vmm_open,
2523 2520 vmm_close,
2524 2521 nodev, /* strategy */
2525 2522 nodev, /* print */
2526 2523 nodev, /* dump */
2527 2524 nodev, /* read */
2528 2525 nodev, /* write */
2529 2526 vmm_ioctl,
2530 2527 nodev, /* devmap */
2531 2528 nodev, /* mmap */
2532 2529 vmm_segmap,
2533 2530 nochpoll, /* poll */
2534 2531 ddi_prop_op,
2535 2532 NULL,
2536 2533 D_NEW | D_MP | D_DEVMAP
2537 2534 };
2538 2535
2539 2536 static struct dev_ops vmm_ops = {
2540 2537 DEVO_REV,
2541 2538 0,
2542 2539 vmm_info,
2543 2540 nulldev, /* identify */
2544 2541 nulldev, /* probe */
2545 2542 vmm_attach,
2546 2543 vmm_detach,
2547 2544 nodev, /* reset */
2548 2545 &vmm_cb_ops,
2549 2546 (struct bus_ops *)NULL
2550 2547 };
2551 2548
2552 2549 static struct modldrv modldrv = {
2553 2550 &mod_driverops,
2554 2551 "bhyve vmm",
2555 2552 &vmm_ops
2556 2553 };
2557 2554
2558 2555 static struct modlinkage modlinkage = {
2559 2556 MODREV_1,
2560 2557 &modldrv,
2561 2558 NULL
2562 2559 };
2563 2560
2564 2561 int
2565 2562 _init(void)
2566 2563 {
2567 2564 int error;
2568 2565
2569 2566 sysinit();
2570 2567
2571 2568 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2572 2569 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2573 2570 list_create(&vmm_list, sizeof (vmm_softc_t),
2574 2571 offsetof(vmm_softc_t, vmm_node));
2575 2572 list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2576 2573 offsetof(vmm_softc_t, vmm_node));
2577 2574 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2578 2575
2579 2576 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2580 2577 if (error) {
2581 2578 return (error);
2582 2579 }
2583 2580
2584 2581 vmm_zsd_init();
2585 2582
2586 2583 error = mod_install(&modlinkage);
2587 2584 if (error) {
2588 2585 ddi_soft_state_fini(&vmm_statep);
2589 2586 vmm_zsd_fini();
2590 2587 }
2591 2588
2592 2589 return (error);
2593 2590 }
2594 2591
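_init() builds up several pieces of global state and, if the final mod_install() fails, tears back down the pieces that had already been set up. A compact userland sketch of that initialize-or-roll-back shape; the subsystem names below are invented for illustration.

	#include <stdbool.h>

	static bool state_a_up, state_b_up;

	static int setup_a(void) { state_a_up = true; return (0); }
	static int setup_b(void) { state_b_up = true; return (0); }
	static void fini_a(void) { state_a_up = false; }
	static void fini_b(void) { state_b_up = false; }
	static int register_driver(void) { return (0); }	/* may fail */

	int
	module_init(void)
	{
		int err;

		if ((err = setup_a()) != 0)
			return (err);
		if ((err = setup_b()) != 0) {
			fini_a();
			return (err);
		}
		if ((err = register_driver()) != 0) {
			/* Roll back in reverse order of setup. */
			fini_b();
			fini_a();
			return (err);
		}
		return (0);
	}
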
2595 2592 int
2596 2593 _fini(void)
2597 2594 {
2598 2595 int error;
2599 2596
2600 2597 error = mod_remove(&modlinkage);
2601 2598 if (error) {
2602 2599 return (error);
2603 2600 }
2604 2601
2605 2602 vmm_zsd_fini();
2606 2603
2607 2604 ddi_soft_state_fini(&vmm_statep);
2608 2605
2609 2606 return (0);
2610 2607 }
2611 2608
2612 2609 int
2613 2610 _info(struct modinfo *modinfop)
2614 2611 {
2615 2612 return (mod_info(&modlinkage, modinfop));
2616 2613 }