/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Copyright 2017 Joyent, Inc.
 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif  /* __sparc */

/*
 * Turn a byte length into a pagecount.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this handles 64-bit sizes on
 * 32-bit machines with large physical memory.
 */
#define BTOP(x) ((pgcnt_t)((x) >> _pageshift))

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;      /* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define MM_KMEMLOG_NENTRIES     64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allkmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem.  Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write.  And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way: disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
        mm_logentry_t *ent;
        proc_t *p = curthread->t_procp;

        mutex_enter(&mm_lock);

        ent = &mm_kmemlog[mm_kmemlogent++];

        if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
                mm_kmemlogent = 0;

        ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
        ent->mle_len = uio->uio_resid;
        gethrestime(&ent->mle_hrestime);
        ent->mle_hrtime = gethrtime();
        ent->mle_pid = p->p_pidp->pid_id;

        (void) strncpy(ent->mle_psargs,
            p->p_user.u_psargs, sizeof (ent->mle_psargs));

        mutex_exit(&mm_lock);
}

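/*
 * Attach: initialize mm_lock and the transient mapping page (mm_map),
 * create the minor nodes for the memory devices, install the
 * "phys_installed" kstat, and fetch the optional kmem_io_access
 * property, which permits /dev/kmem I/O to non-memory pages.
 */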
/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
        int i;
        struct mem_minor {
                char *name;
                minor_t minor;
                int privonly;
                const char *rdpriv;
                const char *wrpriv;
                mode_t priv_mode;
        } mm[] = {
                { "mem",        M_MEM,          0,      NULL,   "all",  0640 },
                { "kmem",       M_KMEM,         0,      NULL,   "all",  0640 },
                { "allkmem",    M_ALLKMEM,      0,      "all",  "all",  0600 },
                { "null",       M_NULL, PRIVONLY_DEV,   NULL,   NULL,   0666 },
                { "zero",       M_ZERO, PRIVONLY_DEV,   NULL,   NULL,   0666 },
                { "full",       M_FULL, PRIVONLY_DEV,   NULL,   NULL,   0666 },
        };
        kstat_t *ksp;

        mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
        mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

        for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
                if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
                    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
                    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
                    DDI_FAILURE) {
                        ddi_remove_minor_node(devi, NULL);
                        return (DDI_FAILURE);
                }
        }

        mm_dip = devi;

        ksp = kstat_create("mm", 0, "phys_installed", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
        if (ksp != NULL) {
                ksp->ks_update = mm_kstat_update;
                ksp->ks_snapshot = mm_kstat_snapshot;
                ksp->ks_lock = &mm_lock; /* XXX - not really needed */
                kstat_install(ksp);
        }

        mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
            "kmem_io_access", 0);

        return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int error;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
                *result = (void *)mm_dip;
                error = DDI_SUCCESS;
                break;
        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)0;
                error = DDI_SUCCESS;
                break;
        default:
                error = DDI_FAILURE;
        }
        return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
        switch (getminor(*devp)) {
        case M_NULL:
        case M_ZERO:
        case M_FULL:
        case M_MEM:
        case M_KMEM:
        case M_ALLKMEM:
                /* standard devices */
                break;

        default:
                /* Unsupported or unknown type */
                return (EINVAL);
        }
        /* must be character device */
        if (typ != OTYP_CHR)
                return (EINVAL);
        return (0);
}

struct pollhead mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
        switch (getminor(dev)) {
        case M_NULL:
        case M_ZERO:
        case M_FULL:
        case M_MEM:
        case M_KMEM:
        case M_ALLKMEM:
                *reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
                    POLLWRNORM | POLLRDBAND | POLLWRBAND);
                /*
                 * A non-NULL pollhead pointer should be returned in case
                 * the user polls for 0 events or is doing an edge-triggered
                 * poll.
                 */
                if ((!*reventsp && !anyyet) || (events & POLLET)) {
                        *phpp = &mm_pollhd;
                }
                return (0);
        default:
                /* no other devices currently support polling */
                return (ENXIO);
        }
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
        /*
         * Implement a size of zero to reduce overhead (avoids two
         * failing property lookups per stat).
         */
        return (ddi_prop_op_size(dev, dip, prop_op,
            flags, name, valuep, lengthp, 0));
}

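/*
 * Transfer at most one page between the uio and the physical page
 * identified by pfn.  The page is mapped through kpm when possible;
 * otherwise it is transiently loaded at mm_map (under mm_lock).  I/O
 * to non-memory pages goes through ddi_peekpokeio() and is permitted
 * only when allowio is set.
 */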
static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
        int error = 0;
        int devload = 0;
        int is_memory = pf_is_memory(pfn);
        size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
            (size_t)uio->uio_iov->iov_len);
        caddr_t va = NULL;

        mutex_enter(&mm_lock);

        if (is_memory && kpm_enable) {
                if (pp)
                        va = hat_kpm_mapin(pp, NULL);
                else
                        va = hat_kpm_mapin_pfn(pfn);
        }

        if (va == NULL) {
                hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
                    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
                    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
                va = mm_map;
                devload = 1;
        }

        if (!is_memory) {
                if (allowio) {
                        size_t c = uio->uio_iov->iov_len;

                        if (ddi_peekpokeio(NULL, uio, rw,
                            (caddr_t)(uintptr_t)uio->uio_loffset, c,
                            sizeof (int32_t)) != DDI_SUCCESS)
                                error = EFAULT;
                } else
                        error = EIO;
        } else
                error = uiomove(va + pageoff, nbytes, rw, uio);

        if (devload)
                hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
        else if (pp)
                hat_kpm_mapout(pp, NULL, va);
        else
                hat_kpm_mapout_pfn(pfn);

        mutex_exit(&mm_lock);
        return (error);
}

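/*
 * Determine, under the address-space lock, whether va lies within a
 * segment that reports S_CAPABILITY_NOMINFLT; on sparc only such
 * segments are safe targets for as_pagelock() from mmrw().
 */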
static int
mmpagelock(struct as *as, caddr_t va)
{
        struct seg *seg;
        int i;

        AS_LOCK_ENTER(as, RW_READER);
        seg = as_segat(as, va);
        i = (seg != NULL) ? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
        AS_LOCK_EXIT(as);

        return (i);
}

#ifdef  __sparc

#define NEED_LOCK_KVADDR(kva)   mmpagelock(&kas, kva)

#else   /* __i386, __amd64 */

#define NEED_LOCK_KVADDR(va)    0

#endif  /* __sparc */

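/*
 * Common read/write service routine for all of the memory minor
 * devices; a transfer that moved any bytes at all is reported as a
 * success.
 */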
/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
        pfn_t v;
        struct iovec *iov;
        int error = 0;
        size_t c;
        ssize_t oresid = uio->uio_resid;
        minor_t minor = getminor(dev);

        while (uio->uio_resid > 0 && error == 0) {
                iov = uio->uio_iov;
                if (iov->iov_len == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        if (uio->uio_iovcnt < 0)
                                panic("mmrw");
                        continue;
                }
                switch (minor) {

                case M_MEM:
                        memlist_read_lock();
                        if (!address_in_memlist(phys_install,
                            (uint64_t)uio->uio_loffset, 1)) {
                                memlist_read_unlock();
                                error = EFAULT;
                                break;
                        }
                        memlist_read_unlock();

                        v = BTOP((u_offset_t)uio->uio_loffset);
                        error = mmio(uio, rw, v,
                            uio->uio_loffset & PAGEOFFSET, 0, NULL);
                        break;

                case M_KMEM:
                case M_ALLKMEM:
                        {
                        page_t **ppp = NULL;
                        caddr_t vaddr = (caddr_t)uio->uio_offset;
                        int try_lock = NEED_LOCK_KVADDR(vaddr);
                        int locked = 0;

                        if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
                                break;

                        if (rw == UIO_WRITE)
                                mm_logkmem(uio);

                        /*
                         * If vaddr does not map a valid page, as_pagelock()
                         * will return failure. Hence we can't check the
                         * return value and return EFAULT here as we'd like.
                         * seg_kp and seg_kpm do not properly support
                         * as_pagelock() for this context, so we avoid it
                         * using the try_lock check above.  Some day, when
                         * kernel page locking gets redesigned, all this
                         * muck can be cleaned up.
                         */
                        if (try_lock)
                                locked = (as_pagelock(&kas, &ppp, vaddr,
                                    PAGESIZE, S_WRITE) == 0);

                        v = hat_getpfnum(kas.a_hat,
                            (caddr_t)(uintptr_t)uio->uio_loffset);
                        if (v == PFN_INVALID) {
                                if (locked)
                                        as_pageunlock(&kas, ppp, vaddr,
                                            PAGESIZE, S_WRITE);
                                error = EFAULT;
                                break;
                        }

                        error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
                            minor == M_ALLKMEM || mm_kmem_io_access,
                            (locked && ppp) ? *ppp : NULL);
                        if (locked)
                                as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
                                    S_WRITE);
                        }

                        break;

                case M_FULL:
                        if (rw == UIO_WRITE) {
                                error = ENOSPC;
                                break;
                        }
                        /* else it's a read, fall through to zero case */
                        /*FALLTHROUGH*/

                case M_ZERO:
                        if (rw == UIO_READ) {
                                label_t ljb;

                                if (on_fault(&ljb)) {
                                        no_fault();
                                        error = EFAULT;
                                        break;
                                }
                                uzero(iov->iov_base, iov->iov_len);
                                no_fault();
                                uio->uio_resid -= iov->iov_len;
                                uio->uio_loffset += iov->iov_len;
                                break;
                        }
                        /* else it's a write, fall through to NULL case */
                        /*FALLTHROUGH*/

                case M_NULL:
                        if (rw == UIO_READ)
                                return (0);
                        c = iov->iov_len;
                        iov->iov_base += c;
                        iov->iov_len -= c;
                        uio->uio_loffset += c;
                        uio->uio_resid -= c;
                        break;

                }
        }
        return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
        return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
        return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
        mem_vtop32_t vtop32;
#endif
        mem_vtop_t mem_vtop;
        proc_t *p;
        pfn_t pfn = (pfn_t)PFN_INVALID;
        pid_t pid = 0;
        struct as *as;
        struct seg *seg;

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
                        return (EFAULT);
        }
#ifdef _SYSCALL32
        else {
                if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
                        return (EFAULT);
                mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
                mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

                if (mem_vtop.m_as != NULL)
                        return (EINVAL);
        }
#endif

        if (mem_vtop.m_as == &kas) {
                pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
        } else {
                if (mem_vtop.m_as == NULL) {
                        /*
                         * Assume the calling process's address space if the
                         * caller didn't specify one.
                         */
                        p = curthread->t_procp;
                        if (p == NULL)
                                return (EIO);
                        mem_vtop.m_as = p->p_as;
                }

                mutex_enter(&pidlock);
                for (p = practive; p != NULL; p = p->p_next) {
                        if (p->p_as == mem_vtop.m_as) {
                                pid = p->p_pid;
                                break;
                        }
                }
                mutex_exit(&pidlock);
                if (p == NULL)
                        return (EIO);
                p = sprlock(pid);
                if (p == NULL)
                        return (EIO);
                as = p->p_as;
                if (as == mem_vtop.m_as) {
                        mutex_exit(&p->p_lock);
                        AS_LOCK_ENTER(as, RW_READER);
                        for (seg = AS_SEGFIRST(as); seg != NULL;
                            seg = AS_SEGNEXT(as, seg))
                                if ((uintptr_t)mem_vtop.m_va -
                                    (uintptr_t)seg->s_base < seg->s_size)
                                        break;
                        if (seg != NULL)
                                pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
                        AS_LOCK_EXIT(as);
                        mutex_enter(&p->p_lock);
                }
                sprunlock(p);
        }
        mem_vtop.m_pfn = pfn;
        if (pfn == PFN_INVALID)
                return (EIO);

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
                        return (EFAULT);
        }
#ifdef _SYSCALL32
        else {
                vtop32.m_pfn = mem_vtop.m_pfn;
                if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
                        return (EFAULT);
        }
#endif

        return (0);
}
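
/*
 * For illustration only -- a hypothetical userland caller of MEM_VTOP
 * (in practice the ioctl is issued by libkvm on behalf of
 * kvm_physaddr()):
 *
 *	mem_vtop_t vtop;
 *	int fd = open("/dev/kmem", O_RDONLY);
 *
 *	vtop.m_as = NULL;	(translate within the caller's own as)
 *	vtop.m_va = va;
 *	if (fd != -1 && ioctl(fd, MEM_VTOP, &vtop) == 0)
 *		pfn = vtop.m_pfn;
 */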

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
        extern int page_retire_test(void);
        uint64_t pa;

        if (copyin((void *)data, &pa, sizeof (uint64_t))) {
                return (EFAULT);
        }

        switch (cmd) {
        case MEM_PAGE_ISRETIRED:
                return (page_retire_check(pa, NULL));

        case MEM_PAGE_UNRETIRE:
                return (page_unretire(pa));

        case MEM_PAGE_RETIRE:
                return (page_retire(pa, PR_FMA));

        case MEM_PAGE_RETIRE_MCE:
                return (page_retire(pa, PR_MCE));

        case MEM_PAGE_RETIRE_UE:
                return (page_retire(pa, PR_UE));

        case MEM_PAGE_GETERRORS:
                {
                        uint64_t page_errors;
                        int rc = page_retire_check(pa, &page_errors);
                        if (copyout(&page_errors, (void *)data,
                            sizeof (uint64_t))) {
                                return (EFAULT);
                        }
                        return (rc);
                }

        case MEM_PAGE_RETIRE_TEST:
                return (page_retire_test());

        }

        return (EINVAL);
}
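
/*
 * Hypothetical usage sketch (not part of this driver): an FMA-style
 * consumer could retire the page backing physical address pa with
 *
 *	uint64_t pa = ...;
 *	int fd = open("/dev/mem", O_RDONLY);
 *
 *	if (fd != -1)
 *		(void) ioctl(fd, MEM_PAGE_RETIRE, &pa);
 */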

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address, return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
        mem_name_t mem_name;
        void *buf;
        size_t bufsize;
        int len, err;

        if ((bufsize = cpu_get_name_bufsize()) == 0)
                return (ENOTSUP);

        if ((err = mm_read_mem_name(data, &mem_name)) < 0)
                return (err);

        buf = kmem_alloc(bufsize, KM_SLEEP);

        /*
         * Call into CPU-specific code to do the lookup.
         */
        if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
            mem_name.m_addr, buf, bufsize, &len)) != 0) {
                kmem_free(buf, bufsize);
                return (err);
        }

        if (len >= mem_name.m_namelen) {
                kmem_free(buf, bufsize);
                return (ENOSPC);
        }

        if (copyoutstr(buf, (char *)mem_name.m_name,
            mem_name.m_namelen, NULL) != 0) {
                kmem_free(buf, bufsize);
                return (EFAULT);
        }

        kmem_free(buf, bufsize);
        return (0);
}

/*
 * Given a syndrome and address, return information about the
 * associated memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
        mem_info_t mem_info;
        int err;

        if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
                return (EFAULT);

        if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
            &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
            &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
                return (err);

        if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
                return (EFAULT);

        return (0);
}

/*
 * Given a memory name, return its associated serial ID.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
        mem_name_t mem_name;
        void *buf;
        void *name;
        size_t  name_len;
        size_t bufsize;
        int len, err;

        if ((bufsize = cpu_get_name_bufsize()) == 0)
                return (ENOTSUP);

        if ((err = mm_read_mem_name(data, &mem_name)) < 0)
                return (err);

        buf = kmem_alloc(bufsize, KM_SLEEP);

        if (mem_name.m_namelen > 1024)
                mem_name.m_namelen = 1024; /* cap at 1024 bytes */

        name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

        if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
            mem_name.m_namelen, &name_len)) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (err);
        }

        /*
         * Call into CPU-specific code to do the lookup.
         */
        if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (err);
        }

        if (len > mem_name.m_sidlen) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (ENAMETOOLONG);
        }

        if (copyoutstr(buf, (char *)mem_name.m_sid,
            mem_name.m_sidlen, NULL) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (EFAULT);
        }

        kmem_free(buf, bufsize);
        kmem_free(name, mem_name.m_namelen);
        return (0);
}
#endif  /* __sparc */

/*
 * Private ioctls for
 *      libkvm to support kvm_physaddr().
 *      FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
        if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
            (cmd != MEM_VTOP && getminor(dev) != M_MEM))
                return (ENXIO);

        switch (cmd) {
        case MEM_VTOP:
                return (mmioctl_vtop(data));

        case MEM_PAGE_RETIRE:
        case MEM_PAGE_ISRETIRED:
        case MEM_PAGE_UNRETIRE:
        case MEM_PAGE_RETIRE_MCE:
        case MEM_PAGE_RETIRE_UE:
        case MEM_PAGE_GETERRORS:
        case MEM_PAGE_RETIRE_TEST:
                return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
        case MEM_NAME:
                return (mmioctl_get_mem_name(data));

        case MEM_INFO:
                return (mmioctl_get_mem_info(data));

        case MEM_SID:
                return (mmioctl_get_mem_sid(data));
#else
        case MEM_NAME:
        case MEM_INFO:
        case MEM_SID:
                return (ENOTSUP);
#endif  /* __sparc */
        }
        return (ENXIO);
}

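/*
 * Translate a device offset into a mappable page frame number for
 * mmap(2); returns -1 when the offset is not mappable on the given
 * minor device.
 */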
/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
        pfn_t pf;
        struct memlist *pmem;
        minor_t minor = getminor(dev);

        switch (minor) {
        case M_MEM:
                pf = btop(off);
                memlist_read_lock();
                for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
                        if (pf >= BTOP(pmem->ml_address) &&
                            pf < BTOP(pmem->ml_address + pmem->ml_size)) {
                                memlist_read_unlock();
                                return (impl_obmem_pfnum(pf));
                        }
                }
                memlist_read_unlock();
                break;

        case M_KMEM:
        case M_ALLKMEM:
                /* no longer supported with KPR */
                return (-1);

        case M_FULL:
        case M_ZERO:
                /*
                 * We shouldn't be mmap'ing to /dev/zero here as
                 * mmsegmap() should have already converted
                 * a mapping request for this device to a mapping
                 * using seg_vn for anonymous memory.
                 */
                break;

        }
        return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping with the appropriate segment driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
        struct segvn_crargs vn_a;
        struct segdev_crargs dev_a;
        int error;
        minor_t minor;
        off_t i;

        minor = getminor(dev);

        as_rangelock(as);
        /*
         * No need to worry about vac alignment on /dev/zero
         * since this is a "clone" object that doesn't yet exist.
         */
        error = choose_addr(as, addrp, len, off,
            (minor == M_MEM) || (minor == M_KMEM), flags);
        if (error != 0) {
                as_rangeunlock(as);
                return (error);
        }

        switch (minor) {
        case M_MEM:
                /* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
                if ((flags & MAP_TYPE) != MAP_SHARED) {
                        as_rangeunlock(as);
                        return (EINVAL);
                }

                /*
                 * Check to ensure that the entire range is
                 * legal and we are not trying to map in
                 * more than the device will let us.
                 */
                for (i = 0; i < len; i += PAGESIZE) {
                        if (mmmmap(dev, off + i, maxprot) == -1) {
                                as_rangeunlock(as);
                                return (ENXIO);
                        }
                }

                /*
                 * Use seg_dev segment driver for /dev/mem mapping.
                 */
                dev_a.mapfunc = mmmmap;
                dev_a.dev = dev;
                dev_a.offset = off;
                dev_a.type = (flags & MAP_TYPE);
                dev_a.prot = (uchar_t)prot;
                dev_a.maxprot = (uchar_t)maxprot;
                dev_a.hat_attr = 0;

                /*
                 * Make /dev/mem mappings non-consistent since we can't
                 * alias pages that don't have page structs behind them,
                 * such as kernel stack pages. If someone mmap()s a kernel
                 * stack page and we give them a TTE with the cv bit set,
                 * a line from that page can get into both pages of the
                 * spitfire d$, but a snoop from another processor will
                 * invalidate only the first page. This once caused the
                 * kernel (xc_attention) to go into an infinite loop at
                 * pil 13 with no interrupts able to come in. See 1203630.
                 */
                dev_a.hat_flags = HAT_LOAD_NOCONSIST;
                dev_a.devmap_data = NULL;

                error = as_map(as, *addrp, len, segdev_create, &dev_a);
                break;

        case M_ZERO:
                /*
                 * Use seg_vn segment driver for /dev/zero mapping.
                 * Passing in a NULL amp gives us the "cloning" effect.
                 */
                vn_a.vp = NULL;
                vn_a.offset = 0;
                vn_a.type = (flags & MAP_TYPE);
                vn_a.prot = prot;
                vn_a.maxprot = maxprot;
                vn_a.flags = flags & ~MAP_TYPE;
                vn_a.cred = cred;
                vn_a.amp = NULL;
                vn_a.szc = 0;
                vn_a.lgrp_mem_policy_flags = 0;
                error = as_map(as, *addrp, len, segvn_create, &vn_a);
                break;

        case M_KMEM:
        case M_ALLKMEM:
                /* No longer supported with KPR. */
                error = ENXIO;
                break;

        case M_NULL:
                /*
                 * Use seg_dev segment driver for /dev/null mapping.
                 */
                dev_a.mapfunc = mmmmap;
                dev_a.dev = dev;
                dev_a.offset = off;
                dev_a.type = 0;         /* neither PRIVATE nor SHARED */
                dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
                dev_a.hat_attr = 0;
                dev_a.hat_flags = 0;
                error = as_map(as, *addrp, len, segdev_create, &dev_a);
                break;

        default:
                error = ENXIO;
        }

        as_rangeunlock(as);
        return (error);
}

static struct cb_ops mm_cb_ops = {
        mmopen,                 /* open */
        nulldev,                /* close */
        nodev,                  /* strategy */
        nodev,                  /* print */
        nodev,                  /* dump */
        mmread,                 /* read */
        mmwrite,                /* write */
        mmioctl,                /* ioctl */
        nodev,                  /* devmap */
        mmmmap,                 /* mmap */
        mmsegmap,               /* segmap */
        mmchpoll,               /* poll */
        mmpropop,               /* prop_op */
        0,                      /* streamtab */
        D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        mm_info,                /* get_dev_info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        mm_attach,              /* attach */
        nodev,                  /* detach */
        nodev,                  /* reset */
        &mm_cb_ops,             /* driver operations */
        (struct bus_ops *)0,    /* bus operations */
        NULL,                   /* power */
        ddi_quiesce_not_needed, /* quiesce */
};

static struct modldrv modldrv = {
        &mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
        MODREV_1, &modldrv, NULL
};

int
_init(void)
{
        return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        return (mod_remove(&modlinkage));
}

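/*
 * Size the phys_installed kstat (mm_kstat_update()) and fill it
 * (mm_kstat_snapshot()) with one { address, size } pair for each
 * entry in the phys_install memlist.
 */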
static int
mm_kstat_update(kstat_t *ksp, int rw)
{
        struct memlist *pmem;
        uint_t count;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        count = 0;
        memlist_read_lock();
        for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
                count++;
        }
        memlist_read_unlock();

        ksp->ks_ndata = count;
        ksp->ks_data_size = count * 2 * sizeof (uint64_t);

        return (0);
}

static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
        struct memlist *pmem;
        struct memunit {
                uint64_t address;
                uint64_t size;
        } *kspmem;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        ksp->ks_snaptime = gethrtime();

        kspmem = (struct memunit *)buf;
        memlist_read_lock();
        for (pmem = phys_install; pmem != NULL;
            pmem = pmem->ml_next, kspmem++) {
                if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
                        break;
                kspmem->address = pmem->ml_address;
                kspmem->size = pmem->ml_size;
        }
        memlist_read_unlock();

        return (0);
}

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
                        return (EFAULT);
        }
#ifdef  _SYSCALL32
        else {
                mem_name32_t mem_name32;

                if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
                        return (EFAULT);
                mem_name->m_addr = mem_name32.m_addr;
                mem_name->m_synd = mem_name32.m_synd;
                mem_name->m_type[0] = mem_name32.m_type[0];
                mem_name->m_type[1] = mem_name32.m_type[1];
                mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
                mem_name->m_namelen = (size_t)mem_name32.m_namelen;
                mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
                mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
        }
#endif  /* _SYSCALL32 */

        return (0);
}