/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/memlist.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a pagecount. The DDI btop takes a
 * 32-bit size on 32-bit machines; this handles 64-bit sizes for
 * large physical-memory 32-bit machines.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define	MM_KMEMLOG_NENTRIES	64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allkmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem. Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write. And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way: disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
	ent->mle_len = uio->uio_resid;
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}
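
/*
 * An illustrative debugging sketch (not part of the driver): because each
 * entry stores the written-to address in mle_vaddr, searching the kernel
 * address space for a suspect address from mdb should land in the
 * mm_kmemlog ring buffer, e.g. (all addresses and the 0x10 offset of
 * mle_vaddr are hypothetical):
 *
 *	> 0xfffffffffbc30210::kgrep
 *	fffffe85d1a23010
 *	> fffffe85d1a23010-0x10::print mm_logentry_t
 */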

/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem",	M_MEM,		0,		NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,		NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,		"all",	"all",	0600 },
		{ "null",	M_NULL,		PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO,		PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "full",	M_FULL,		PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock;	/* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	register int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	/* must be character device */
	if (typ != OTYP_CHR)
		return (EINVAL);
	return (0);
}

struct pollhead	mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer should be returned in case
		 * the user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * Implement zero size to reduce overhead (avoid two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;
	int is_memory = pf_is_memory(pfn);
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	mutex_enter(&mm_lock);

	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	if (va == NULL) {
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}

static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as);

	return (i);
}

#ifdef	__sparc

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
		{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure. Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context so we avoid it
			 * using the try_lock set check above. Some day when
			 * the kernel page locking gets redesigned all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
		}

			break;

		case M_FULL:
			if (rw == UIO_WRITE) {
				error = ENOSPC;
				break;
			}
			/* else it's a read, fall through to zero case */
			/*FALLTHROUGH*/

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}
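
/*
 * An illustrative userland sketch (not part of the driver): libkvm is the
 * intended consumer, but the ioctl can be exercised directly. The command
 * and field names come from <sys/mem.h>; everything else, including
 * some_mapped_address, is hypothetical, and error handling is omitted.
 *
 *	#include <sys/mem.h>
 *	#include <fcntl.h>
 *
 *	mem_vtop_t vtop;
 *	int fd = open("/dev/kmem", O_RDONLY);	(MEM_VTOP is kmem-only)
 *
 *	vtop.m_as = NULL;			(NULL: use my address space)
 *	vtop.m_va = some_mapped_address;
 *	if (ioctl(fd, MEM_VTOP, &vtop) == 0)
 *		(void) printf("pfn = %lu\n", (ulong_t)vtop.m_pfn);
 */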

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
		{
			uint64_t page_errors;
			int rc = page_retire_check(pa, &page_errors);
			if (copyout(&page_errors, (void *)data,
			    sizeof (uint64_t))) {
				return (EFAULT);
			}
			return (rc);
		}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());

	}

	return (EINVAL);
}
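
/*
 * An illustrative userland sketch (not part of the driver): the retire
 * commands are normally issued on behalf of FMA, but each is simply an
 * ioctl on /dev/mem carrying a 64-bit physical address. Only the
 * MEM_PAGE_* names come from <sys/mem.h>; the rest is hypothetical and
 * unchecked.
 *
 *	#include <sys/mem.h>
 *	#include <fcntl.h>
 *
 *	uint64_t pa = suspect_physaddr;
 *	int fd = open("/dev/mem", O_RDONLY);
 *
 *	ret = ioctl(fd, MEM_PAGE_ISRETIRED, &pa);	(query current state)
 *	ret = ioctl(fd, MEM_PAGE_RETIRE, &pa);		(request a PR_FMA retire)
 */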

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address, return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENOSPC);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address, return information about the associated
 * memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024;	/* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for
 *	libkvm to support kvm_physaddr().
 *	FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_FULL:
	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;

	}
	return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	/*
	 * No need to worry about vac alignment on /dev/zero
	 * since this is a "clone" object that doesn't yet exist.
	 */
	error = choose_addr(as, addrp, len, off,
	    (minor == M_MEM) || (minor == M_KMEM), flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use seg_dev segment driver for /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages. If someone mmap()s a kernel
		 * stack page and if we give them a tte with cv, a line from
		 * that page can get into both pages of the spitfire d$.
		 * But snoop from another processor will only invalidate
		 * the first page. This later caused kernel (xc_attention)
		 * to go into an infinite loop at pil 13 and no interrupts
		 * could come in. See 1203630.
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use seg_vn segment driver for /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use seg_dev segment driver for /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}
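
/*
 * An illustrative userland sketch (not part of the driver): because M_ZERO
 * mappings are handed to seg_vn with a NULL amp, mmap'ing /dev/zero is the
 * classic way to obtain zero-filled anonymous memory, equivalent in effect
 * to a MAP_ANON mapping. Variable names below are hypothetical and error
 * handling is omitted.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/zero", O_RDWR);
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE, fd, 0);		(each page reads as zeroes)
 */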

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}
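
/*
 * An illustrative userland sketch (not part of the driver): the
 * mm:0:phys_installed raw kstat exposed above is a variable-length array of
 * { address, size } uint64_t pairs, one per phys_install entry, and can be
 * read with libkstat. The local struct name is hypothetical and error
 * handling is omitted.
 *
 *	#include <kstat.h>
 *
 *	struct memunit { uint64_t address, size; } *mu;
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "mm", 0, "phys_installed");
 *	uint_t i;
 *
 *	(void) kstat_read(kc, ksp, NULL);
 *	mu = ksp->ks_data;
 *	for (i = 0; i < ksp->ks_ndata; i++)
 *		(void) printf("0x%llx 0x%llx\n",
 *		    (u_longlong_t)mu[i].address, (u_longlong_t)mu[i].size);
 */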

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}