/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif  /* __sparc */

/*
 * Turn a byte length into a pagecount.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this macro handles 64-bit sizes for
 * large physical-memory 32-bit machines.
 */
#define BTOP(x) ((pgcnt_t)((x) >> _pageshift))

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;      /* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define MM_KMEMLOG_NENTRIES     64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allkmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem.  Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write.  And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way: disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
        mm_logentry_t *ent;
        proc_t *p = curthread->t_procp;

        mutex_enter(&mm_lock);

        ent = &mm_kmemlog[mm_kmemlogent++];

        if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
                mm_kmemlogent = 0;

        ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
        ent->mle_len = uio->uio_resid;
        gethrestime(&ent->mle_hrestime);
        ent->mle_hrtime = gethrtime();
        ent->mle_pid = p->p_pidp->pid_id;

        (void) strncpy(ent->mle_psargs,
            p->p_user.u_psargs, sizeof (ent->mle_psargs));

        mutex_exit(&mm_lock);
}
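
/*
 * Illustrative only: because each entry records the written-to address
 * in mle_vaddr, the log entry for a suspect address can be recovered
 * from the debugger by searching the kernel address space for that
 * value -- e.g., with mdb's ::kgrep dcmd (the invocation shown is an
 * assumption, not something this file provides):
 *
 *      > <suspect-address>::kgrep
 *
 * Among the resulting hits will be the mle_vaddr field of the
 * mm_kmemlog entry that recorded the errant write.
 */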

/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
        int i;
        struct mem_minor {
                char *name;
                minor_t minor;
                int privonly;
                const char *rdpriv;
                const char *wrpriv;
                mode_t priv_mode;
        } mm[] = {
                { "mem",        M_MEM,          0,      NULL,   "all",  0640 },
                { "kmem",       M_KMEM,         0,      NULL,   "all",  0640 },
                { "allkmem",    M_ALLKMEM,      0,      "all",  "all",  0600 },
                { "null",       M_NULL, PRIVONLY_DEV,   NULL,   NULL,   0666 },
                { "zero",       M_ZERO, PRIVONLY_DEV,   NULL,   NULL,   0666 },
                { "full",       M_FULL, PRIVONLY_DEV,   NULL,   NULL,   0666 },
        };
        kstat_t *ksp;

        mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
        mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

        for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
                if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
                    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
                    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
                    DDI_FAILURE) {
                        ddi_remove_minor_node(devi, NULL);
                        return (DDI_FAILURE);
                }
        }

        mm_dip = devi;

        ksp = kstat_create("mm", 0, "phys_installed", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
        if (ksp != NULL) {
                ksp->ks_update = mm_kstat_update;
                ksp->ks_snapshot = mm_kstat_snapshot;
                ksp->ks_lock = &mm_lock; /* XXX - not really needed */
                kstat_install(ksp);
        }

        mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
            "kmem_io_access", 0);

        return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int error;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
                *result = (void *)mm_dip;
                error = DDI_SUCCESS;
                break;
        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)0;
                error = DDI_SUCCESS;
                break;
        default:
                error = DDI_FAILURE;
        }
        return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
        switch (getminor(*devp)) {
        case M_NULL:
        case M_ZERO:
        case M_FULL:
        case M_MEM:
        case M_KMEM:
        case M_ALLKMEM:
                /* standard devices */
                break;

        default:
                /* Unsupported or unknown type */
                return (EINVAL);
        }
        /* must be character device */
        if (typ != OTYP_CHR)
                return (EINVAL);
        return (0);
}

struct pollhead mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
        switch (getminor(dev)) {
        case M_NULL:
        case M_ZERO:
        case M_FULL:
        case M_MEM:
        case M_KMEM:
        case M_ALLKMEM:
                *reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
                    POLLWRNORM | POLLRDBAND | POLLWRBAND);
                /*
                 * A non-NULL pollhead pointer must be returned in case
                 * the user polls for 0 events.
                 */
                *phpp = !anyyet && !*reventsp ?
                    &mm_pollhd : (struct pollhead *)NULL;
                return (0);
        default:
                /* no other devices currently support polling */
                return (ENXIO);
        }
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
        /*
         * Implement zero size to reduce overhead (avoids two failing
         * property lookups per stat).
         */
        return (ddi_prop_op_size(dev, dip, prop_op,
            flags, name, valuep, lengthp, 0));
}

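/*
 * Common single-page I/O path for the memory minors: map the frame at
 * pfn into kernel address space (through kpm when the frame is regular
 * memory and kpm is enabled, otherwise through a transient
 * hat_devload() of the shared mm_map page) and move at most one page of
 * data to or from the caller's uio.  Frames that are not memory are
 * touched only when allowio is set, via ddi_peekpokeio() so that any
 * fault is contained.
 */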
static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
        int error = 0;
        int devload = 0;
        int is_memory = pf_is_memory(pfn);
        size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
            (size_t)uio->uio_iov->iov_len);
        caddr_t va = NULL;

        mutex_enter(&mm_lock);

        if (is_memory && kpm_enable) {
                if (pp)
                        va = hat_kpm_mapin(pp, NULL);
                else
                        va = hat_kpm_mapin_pfn(pfn);
        }

        if (va == NULL) {
                hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
                    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
                    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
                va = mm_map;
                devload = 1;
        }

        if (!is_memory) {
                if (allowio) {
                        size_t c = uio->uio_iov->iov_len;

                        if (ddi_peekpokeio(NULL, uio, rw,
                            (caddr_t)(uintptr_t)uio->uio_loffset, c,
                            sizeof (int32_t)) != DDI_SUCCESS)
                                error = EFAULT;
                } else
                        error = EIO;
        } else
                error = uiomove(va + pageoff, nbytes, rw, uio);

        if (devload)
                hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
        else if (pp)
                hat_kpm_mapout(pp, NULL, va);
        else
                hat_kpm_mapout_pfn(pfn);

        mutex_exit(&mm_lock);
        return (error);
}

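/*
 * Determine, under the address space lock, whether the segment mapping
 * va advertises the S_CAPABILITY_NOMINFLT capability.  On sparc this
 * gates whether mmrw() attempts as_pagelock() on a kernel virtual
 * address before doing kmem I/O.
 */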
static int
mmpagelock(struct as *as, caddr_t va)
{
        struct seg *seg;
        int i;

        AS_LOCK_ENTER(as, RW_READER);
        seg = as_segat(as, va);
        i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
        AS_LOCK_EXIT(as);

        return (i);
}

#ifdef  __sparc

#define NEED_LOCK_KVADDR(kva)   mmpagelock(&kas, kva)

#else   /* __i386, __amd64 */

#define NEED_LOCK_KVADDR(va)    0

#endif  /* __sparc */

/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
        pfn_t v;
        struct iovec *iov;
        int error = 0;
        size_t c;
        ssize_t oresid = uio->uio_resid;
        minor_t minor = getminor(dev);

        while (uio->uio_resid > 0 && error == 0) {
                iov = uio->uio_iov;
                if (iov->iov_len == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        if (uio->uio_iovcnt < 0)
                                panic("mmrw");
                        continue;
                }
                switch (minor) {

                case M_MEM:
                        memlist_read_lock();
                        if (!address_in_memlist(phys_install,
                            (uint64_t)uio->uio_loffset, 1)) {
                                memlist_read_unlock();
                                error = EFAULT;
                                break;
                        }
                        memlist_read_unlock();

                        v = BTOP((u_offset_t)uio->uio_loffset);
                        error = mmio(uio, rw, v,
                            uio->uio_loffset & PAGEOFFSET, 0, NULL);
                        break;

                case M_KMEM:
                case M_ALLKMEM:
                        {
                        page_t **ppp = NULL;
                        caddr_t vaddr = (caddr_t)uio->uio_offset;
                        int try_lock = NEED_LOCK_KVADDR(vaddr);
                        int locked = 0;

                        if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
                                break;

                        if (rw == UIO_WRITE)
                                mm_logkmem(uio);

                        /*
                         * If vaddr does not map a valid page, as_pagelock()
                         * will return failure.  Hence we can't check the
                         * return value and return EFAULT here as we'd like.
                         * seg_kp and seg_kpm do not properly support
                         * as_pagelock() for this context, so we avoid it by
                         * using the try_lock check set up above.  Some day,
                         * when kernel page locking gets redesigned, all this
                         * muck can be cleaned up.
                         */
                        if (try_lock)
                                locked = (as_pagelock(&kas, &ppp, vaddr,
                                    PAGESIZE, S_WRITE) == 0);

                        v = hat_getpfnum(kas.a_hat,
                            (caddr_t)(uintptr_t)uio->uio_loffset);
                        if (v == PFN_INVALID) {
                                if (locked)
                                        as_pageunlock(&kas, ppp, vaddr,
                                            PAGESIZE, S_WRITE);
                                error = EFAULT;
                                break;
                        }

                        error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
                            minor == M_ALLKMEM || mm_kmem_io_access,
                            (locked && ppp) ? *ppp : NULL);
                        if (locked)
                                as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
                                    S_WRITE);
                        }

                        break;

                case M_FULL:
                        if (rw == UIO_WRITE) {
                                error = ENOSPC;
                                break;
                        }
                        /* else it's a read, fall through to zero case */
                        /*FALLTHROUGH*/

                case M_ZERO:
                        if (rw == UIO_READ) {
                                label_t ljb;

                                if (on_fault(&ljb)) {
                                        no_fault();
                                        error = EFAULT;
                                        break;
                                }
                                uzero(iov->iov_base, iov->iov_len);
                                no_fault();
                                uio->uio_resid -= iov->iov_len;
                                uio->uio_loffset += iov->iov_len;
                                break;
                        }
                        /* else it's a write, fall through to NULL case */
                        /*FALLTHROUGH*/

                case M_NULL:
                        if (rw == UIO_READ)
                                return (0);
                        c = iov->iov_len;
                        iov->iov_base += c;
                        iov->iov_len -= c;
                        uio->uio_loffset += c;
                        uio->uio_resid -= c;
                        break;
                }
        }
        return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
        return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
        return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
        mem_vtop32_t vtop32;
#endif
        mem_vtop_t mem_vtop;
        proc_t *p;
        pfn_t pfn = (pfn_t)PFN_INVALID;
        pid_t pid = 0;
        struct as *as;
        struct seg *seg;

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
                        return (EFAULT);
        }
#ifdef _SYSCALL32
        else {
                if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
                        return (EFAULT);
                mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
                mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

                if (mem_vtop.m_as != NULL)
                        return (EINVAL);
        }
#endif

        if (mem_vtop.m_as == &kas) {
                pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
        } else {
                if (mem_vtop.m_as == NULL) {
                        /*
                         * Assume the calling process's address space if the
                         * caller didn't specify one.
                         */
                        p = curthread->t_procp;
                        if (p == NULL)
                                return (EIO);
                        mem_vtop.m_as = p->p_as;
                }

                mutex_enter(&pidlock);
                for (p = practive; p != NULL; p = p->p_next) {
                        if (p->p_as == mem_vtop.m_as) {
                                pid = p->p_pid;
                                break;
                        }
                }
                mutex_exit(&pidlock);
                if (p == NULL)
                        return (EIO);
                p = sprlock(pid);
                if (p == NULL)
                        return (EIO);
                as = p->p_as;
                if (as == mem_vtop.m_as) {
                        mutex_exit(&p->p_lock);
                        AS_LOCK_ENTER(as, RW_READER);
                        for (seg = AS_SEGFIRST(as); seg != NULL;
                            seg = AS_SEGNEXT(as, seg))
                                if ((uintptr_t)mem_vtop.m_va -
                                    (uintptr_t)seg->s_base < seg->s_size)
                                        break;
                        if (seg != NULL)
                                pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
                        AS_LOCK_EXIT(as);
                        mutex_enter(&p->p_lock);
                }
                sprunlock(p);
        }
        mem_vtop.m_pfn = pfn;
        if (pfn == PFN_INVALID)
                return (EIO);

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
                        return (EFAULT);
        }
#ifdef _SYSCALL32
        else {
                vtop32.m_pfn = mem_vtop.m_pfn;
                if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
                        return (EFAULT);
        }
#endif

        return (0);
}
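
/*
 * Illustrative only (not part of this driver): a userland consumer such
 * as libkvm would drive MEM_VTOP against /dev/kmem roughly as follows;
 * descriptor setup and error handling are elided, and "pageshift" is an
 * assumed variable holding the system page shift.
 *
 *      mem_vtop_t vtop;
 *
 *      vtop.m_as = NULL;       (NULL: use the calling process's as)
 *      vtop.m_va = (void *)va;
 *      if (ioctl(kmem_fd, MEM_VTOP, &vtop) == 0)
 *              pa = ((uint64_t)vtop.m_pfn << pageshift) |
 *                  ((uintptr_t)va & PAGEOFFSET);
 */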

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
        extern int page_retire_test(void);
        uint64_t pa;

        if (copyin((void *)data, &pa, sizeof (uint64_t))) {
                return (EFAULT);
        }

        switch (cmd) {
        case MEM_PAGE_ISRETIRED:
                return (page_retire_check(pa, NULL));

        case MEM_PAGE_UNRETIRE:
                return (page_unretire(pa));

        case MEM_PAGE_RETIRE:
                return (page_retire(pa, PR_FMA));

        case MEM_PAGE_RETIRE_MCE:
                return (page_retire(pa, PR_MCE));

        case MEM_PAGE_RETIRE_UE:
                return (page_retire(pa, PR_UE));

        case MEM_PAGE_GETERRORS:
                {
                        uint64_t page_errors;
                        int rc = page_retire_check(pa, &page_errors);
                        if (copyout(&page_errors, (void *)data,
                            sizeof (uint64_t))) {
                                return (EFAULT);
                        }
                        return (rc);
                }

        case MEM_PAGE_RETIRE_TEST:
                return (page_retire_test());
        }

        return (EINVAL);
}
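
/*
 * Illustrative only: FMA-oriented consumers issue these commands
 * against /dev/mem (see the minor check in mmioctl() below), passing
 * the PA by reference.  For example, to ask whether a page has been
 * retired, assuming mem_fd is an open descriptor for /dev/mem:
 *
 *      uint64_t pa = ...;
 *      int err = ioctl(mem_fd, MEM_PAGE_ISRETIRED, &pa);
 */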

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address, return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
        mem_name_t mem_name;
        void *buf;
        size_t bufsize;
        int len, err;

        if ((bufsize = cpu_get_name_bufsize()) == 0)
                return (ENOTSUP);

        if ((err = mm_read_mem_name(data, &mem_name)) < 0)
                return (err);

        buf = kmem_alloc(bufsize, KM_SLEEP);

        /*
         * Call into cpu specific code to do the lookup.
         */
        if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
            mem_name.m_addr, buf, bufsize, &len)) != 0) {
                kmem_free(buf, bufsize);
                return (err);
        }

        if (len >= mem_name.m_namelen) {
                kmem_free(buf, bufsize);
                return (ENOSPC);
        }

        if (copyoutstr(buf, (char *)mem_name.m_name,
            mem_name.m_namelen, NULL) != 0) {
                kmem_free(buf, bufsize);
                return (EFAULT);
        }

        kmem_free(buf, bufsize);
        return (0);
}

/*
 * Given a syndrome and address, return information about the
 * associated memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
        mem_info_t mem_info;
        int err;

        if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
                return (EFAULT);

        if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
            &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
            &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
                return (err);

        if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
                return (EFAULT);

        return (0);
}

/*
 * Given a memory name, return its associated serial ID.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
        mem_name_t mem_name;
        void *buf;
        void *name;
        size_t  name_len;
        size_t bufsize;
        int len, err;

        if ((bufsize = cpu_get_name_bufsize()) == 0)
                return (ENOTSUP);

        if ((err = mm_read_mem_name(data, &mem_name)) < 0)
                return (err);

        buf = kmem_alloc(bufsize, KM_SLEEP);

        if (mem_name.m_namelen > 1024)
                mem_name.m_namelen = 1024; /* cap at 1024 bytes */

        name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

        if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
            mem_name.m_namelen, &name_len)) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (err);
        }

        /*
         * Call into cpu specific code to do the lookup.
         */
        if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (err);
        }

        if (len > mem_name.m_sidlen) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (ENAMETOOLONG);
        }

        if (copyoutstr(buf, (char *)mem_name.m_sid,
            mem_name.m_sidlen, NULL) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (EFAULT);
        }

        kmem_free(buf, bufsize);
        kmem_free(name, mem_name.m_namelen);
        return (0);
}
#endif  /* __sparc */

/*
 * Private ioctls for
 *      libkvm to support kvm_physaddr().
 *      FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
        if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
            (cmd != MEM_VTOP && getminor(dev) != M_MEM))
                return (ENXIO);

        switch (cmd) {
        case MEM_VTOP:
                return (mmioctl_vtop(data));

        case MEM_PAGE_RETIRE:
        case MEM_PAGE_ISRETIRED:
        case MEM_PAGE_UNRETIRE:
        case MEM_PAGE_RETIRE_MCE:
        case MEM_PAGE_RETIRE_UE:
        case MEM_PAGE_GETERRORS:
        case MEM_PAGE_RETIRE_TEST:
                return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
        case MEM_NAME:
                return (mmioctl_get_mem_name(data));

        case MEM_INFO:
                return (mmioctl_get_mem_info(data));

        case MEM_SID:
                return (mmioctl_get_mem_sid(data));
#else
        case MEM_NAME:
        case MEM_INFO:
        case MEM_SID:
                return (ENOTSUP);
#endif  /* __sparc */
        }
        return (ENXIO);
}

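/*
 * mmap(2) entry point: translate an offset on one of the memory minors
 * into a page frame cookie.  Only /dev/mem offsets that fall within
 * physically installed memory are mappable; everything else, including
 * kmem/allkmem ("no longer supported with KPR" below), yields -1.
 */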
/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
        pfn_t pf;
        struct memlist *pmem;
        minor_t minor = getminor(dev);

        switch (minor) {
        case M_MEM:
                pf = btop(off);
                memlist_read_lock();
                for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
                        if (pf >= BTOP(pmem->ml_address) &&
                            pf < BTOP(pmem->ml_address + pmem->ml_size)) {
                                memlist_read_unlock();
                                return (impl_obmem_pfnum(pf));
                        }
                }
                memlist_read_unlock();
                break;

        case M_KMEM:
        case M_ALLKMEM:
                /* no longer supported with KPR */
                return (-1);

        case M_FULL:
        case M_ZERO:
                /*
                 * We shouldn't be mmap'ing to /dev/zero here as
                 * mmsegmap() should have already converted
                 * a mapping request for this device to a mapping
                 * using seg_vn for anonymous memory.
                 */
                break;
        }
        return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
        struct segvn_crargs vn_a;
        struct segdev_crargs dev_a;
        int error;
        minor_t minor;
        off_t i;

        minor = getminor(dev);

        as_rangelock(as);
        /*
         * No need to worry about vac alignment on /dev/zero
         * since this is a "clone" object that doesn't yet exist.
         */
        error = choose_addr(as, addrp, len, off,
            (minor == M_MEM) || (minor == M_KMEM), flags);
        if (error != 0) {
                as_rangeunlock(as);
                return (error);
        }

        switch (minor) {
        case M_MEM:
                /* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
                if ((flags & MAP_TYPE) != MAP_SHARED) {
                        as_rangeunlock(as);
                        return (EINVAL);
                }

                /*
                 * Check to ensure that the entire range is
                 * legal and we are not trying to map in
                 * more than the device will let us.
                 */
                for (i = 0; i < len; i += PAGESIZE) {
                        if (mmmmap(dev, off + i, maxprot) == -1) {
                                as_rangeunlock(as);
                                return (ENXIO);
                        }
                }

                /*
                 * Use seg_dev segment driver for /dev/mem mapping.
                 */
                dev_a.mapfunc = mmmmap;
                dev_a.dev = dev;
                dev_a.offset = off;
                dev_a.type = (flags & MAP_TYPE);
                dev_a.prot = (uchar_t)prot;
                dev_a.maxprot = (uchar_t)maxprot;
                dev_a.hat_attr = 0;

                /*
                 * Make /dev/mem mappings non-consistent since we can't
                 * alias pages that don't have page structs behind them,
                 * such as kernel stack pages. If someone mmap()s a kernel
                 * stack page and if we give them a tte with cv, a line from
                 * that page can get into both pages of the spitfire d$.
                 * But snoop from another processor will only invalidate
                 * the first page. This later caused kernel (xc_attention)
                 * to go into an infinite loop at pil 13 and no interrupts
                 * could come in. See 1203630.
                 */
                dev_a.hat_flags = HAT_LOAD_NOCONSIST;
                dev_a.devmap_data = NULL;

                error = as_map(as, *addrp, len, segdev_create, &dev_a);
                break;

        case M_ZERO:
                /*
                 * Use seg_vn segment driver for /dev/zero mapping.
                 * Passing in a NULL amp gives us the "cloning" effect.
                 */
                vn_a.vp = NULL;
                vn_a.offset = 0;
                vn_a.type = (flags & MAP_TYPE);
                vn_a.prot = prot;
                vn_a.maxprot = maxprot;
                vn_a.flags = flags & ~MAP_TYPE;
                vn_a.cred = cred;
                vn_a.amp = NULL;
                vn_a.szc = 0;
                vn_a.lgrp_mem_policy_flags = 0;
                error = as_map(as, *addrp, len, segvn_create, &vn_a);
                break;

        case M_KMEM:
        case M_ALLKMEM:
                /* No longer supported with KPR. */
                error = ENXIO;
                break;

        case M_NULL:
                /*
                 * Use seg_dev segment driver for /dev/null mapping.
                 */
                dev_a.mapfunc = mmmmap;
                dev_a.dev = dev;
                dev_a.offset = off;
                dev_a.type = 0;         /* neither PRIVATE nor SHARED */
                dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
                dev_a.hat_attr = 0;
                dev_a.hat_flags = 0;
                error = as_map(as, *addrp, len, segdev_create, &dev_a);
                break;

        default:
                error = ENXIO;
        }

        as_rangeunlock(as);
        return (error);
}

static struct cb_ops mm_cb_ops = {
        mmopen,                 /* open */
        nulldev,                /* close */
        nodev,                  /* strategy */
        nodev,                  /* print */
        nodev,                  /* dump */
        mmread,                 /* read */
        mmwrite,                /* write */
        mmioctl,                /* ioctl */
        nodev,                  /* devmap */
        mmmmap,                 /* mmap */
        mmsegmap,               /* segmap */
        mmchpoll,               /* poll */
        mmpropop,               /* prop_op */
        0,                      /* streamtab */
        D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        mm_info,                /* get_dev_info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        mm_attach,              /* attach */
        nodev,                  /* detach */
        nodev,                  /* reset */
        &mm_cb_ops,             /* driver operations */
        (struct bus_ops *)0,    /* bus operations */
        NULL,                   /* power */
        ddi_quiesce_not_needed, /* quiesce */
};

static struct modldrv modldrv = {
        &mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
        MODREV_1, &modldrv, NULL
};

int
_init(void)
{
        return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        return (mod_remove(&modlinkage));
}

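/*
 * kstat update callback: size the "phys_installed" kstat as one
 * (address, size) pair of uint64_ts per phys_install memlist entry.
 */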
static int
mm_kstat_update(kstat_t *ksp, int rw)
{
        struct memlist *pmem;
        uint_t count;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        count = 0;
        memlist_read_lock();
        for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
                count++;
        }
        memlist_read_unlock();

        ksp->ks_ndata = count;
        ksp->ks_data_size = count * 2 * sizeof (uint64_t);

        return (0);
}

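/*
 * kstat snapshot callback: copy an (address, size) pair for each
 * phys_install memlist entry into the caller's buffer, stopping at the
 * ks_data_size computed by mm_kstat_update().
 */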
static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
        struct memlist *pmem;
        struct memunit {
                uint64_t address;
                uint64_t size;
        } *kspmem;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        ksp->ks_snaptime = gethrtime();

        kspmem = (struct memunit *)buf;
        memlist_read_lock();
        for (pmem = phys_install; pmem != NULL;
            pmem = pmem->ml_next, kspmem++) {
                if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
                        break;
                kspmem->address = pmem->ml_address;
                kspmem->size = pmem->ml_size;
        }
        memlist_read_unlock();

        return (0);
}
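
/*
 * Illustrative only: from userland, the installed-memory list published
 * above can be read with libkstat, using the names passed to
 * kstat_create() in mm_attach():
 *
 *      kstat_ctl_t *kc = kstat_open();
 *      kstat_t *ksp = kstat_lookup(kc, "mm", 0, "phys_installed");
 *
 *      if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *              (ksp->ks_data now holds ks_ndata (address, size) pairs)
 *      }
 */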

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
                        return (EFAULT);
        }
#ifdef  _SYSCALL32
        else {
                mem_name32_t mem_name32;

                if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
                        return (EFAULT);
                mem_name->m_addr = mem_name32.m_addr;
                mem_name->m_synd = mem_name32.m_synd;
                mem_name->m_type[0] = mem_name32.m_type[0];
                mem_name->m_type[1] = mem_name32.m_type[1];
                mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
                mem_name->m_namelen = (size_t)mem_name32.m_namelen;
                mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
                mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
        }
#endif  /* _SYSCALL32 */

        return (0);
}