8634 epoll fails to wake on certain edge-triggered conditions
8635 epoll should not emit POLLNVAL
8636 recursive epoll should emit EPOLLRDNORM
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
--- old/usr/src/uts/common/io/mem.c
+++ new/usr/src/uts/common/io/mem.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright (c) 2016 by Delphix. All rights reserved.
25 25 */
26 26
27 27 /*
28 - * Copyright (c) 2015, Joyent, Inc. All rights reserved.
28 + * Copyright 2017 Joyent, Inc.
29 29 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
30 30 */
31 31
32 32 /*
33 33 * Memory special file
34 34 */
35 35
36 36 #include <sys/types.h>
37 37 #include <sys/param.h>
38 38 #include <sys/user.h>
39 39 #include <sys/buf.h>
40 40 #include <sys/systm.h>
41 41 #include <sys/cred.h>
42 42 #include <sys/vm.h>
43 43 #include <sys/uio.h>
44 44 #include <sys/mman.h>
45 45 #include <sys/kmem.h>
46 46 #include <vm/seg.h>
47 47 #include <vm/page.h>
48 48 #include <sys/stat.h>
49 49 #include <sys/vmem.h>
50 50 #include <sys/memlist.h>
51 51 #include <sys/bootconf.h>
52 52
53 53 #include <vm/seg_vn.h>
54 54 #include <vm/seg_dev.h>
55 55 #include <vm/seg_kmem.h>
56 56 #include <vm/seg_kp.h>
57 57 #include <vm/seg_kpm.h>
58 58 #include <vm/hat.h>
59 59
60 60 #include <sys/conf.h>
61 61 #include <sys/mem.h>
62 62 #include <sys/types.h>
63 63 #include <sys/conf.h>
64 64 #include <sys/param.h>
65 65 #include <sys/systm.h>
66 66 #include <sys/errno.h>
67 67 #include <sys/modctl.h>
68 68 #include <sys/memlist.h>
69 69 #include <sys/ddi.h>
70 70 #include <sys/sunddi.h>
71 71 #include <sys/debug.h>
72 72 #include <sys/fm/protocol.h>
73 73
74 74 #if defined(__sparc)
75 75 extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
76 76 extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
77 77 uint64_t *, int *, int *, int *);
78 78 extern size_t cpu_get_name_bufsize(void);
79 79 extern int cpu_get_mem_sid(char *, char *, int, int *);
80 80 extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
81 81 #elif defined(__x86)
82 82 #include <sys/cpu_module.h>
83 83 #endif /* __sparc */
84 84
85 85 /*
86 86 * Turn a byte length into a pagecount. The DDI btop takes a
87 87 * 32-bit size on 32-bit machines, this handles 64-bit sizes for
88 88 * large physical-memory 32-bit machines.
89 89 */
90 90 #define BTOP(x) ((pgcnt_t)((x) >> _pageshift))
91 91
92 92 static kmutex_t mm_lock;
93 93 static caddr_t mm_map;
94 94
95 95 static dev_info_t *mm_dip; /* private copy of devinfo pointer */
96 96
97 97 static int mm_kmem_io_access;
98 98
99 99 static int mm_kstat_update(kstat_t *ksp, int rw);
100 100 static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
101 101
102 102 static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
103 103
104 104 #define MM_KMEMLOG_NENTRIES 64
105 105
106 106 static int mm_kmemlogent;
107 107 static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];
108 108
109 109 /*
110 110 * On kmem/allmem writes, we log information that might be useful in the event
111 111 * that a write is errant (that is, due to operator error) and induces a later
112 112 * problem. Note that (in particular) in the event of such operator-induced
113 113 * corruption, a search over the kernel address space for the corrupted
114 114 * address will yield the ring buffer entry that recorded the write. And
115 115 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
116 116 * auditing facility and yes, we learned that the hard way: disturbingly,
117 117 * there exist recommendations for "tuning" the system that involve writing to
118 118 * kernel memory addresses via the kernel debugger, and -- as we discovered --
119 119 * these can easily be applied incorrectly or unsafely, yielding an entirely
120 120 * undebuggable "can't happen" kind of panic.
121 121 */
122 122 static void
123 123 mm_logkmem(struct uio *uio)
124 124 {
125 125 mm_logentry_t *ent;
126 126 proc_t *p = curthread->t_procp;
127 127
128 128 mutex_enter(&mm_lock);
129 129
130 130 ent = &mm_kmemlog[mm_kmemlogent++];
131 131
132 132 if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
133 133 mm_kmemlogent = 0;
134 134
135 135 ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
136 136 ent->mle_len = uio->uio_resid;
137 137 gethrestime(&ent->mle_hrestime);
138 138 ent->mle_hrtime = gethrtime();
139 139 ent->mle_pid = p->p_pidp->pid_id;
140 140
141 141 (void) strncpy(ent->mle_psargs,
142 142 p->p_user.u_psargs, sizeof (ent->mle_psargs));
143 143
144 144 mutex_exit(&mm_lock);
145 145 }
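
For illustration only (mm_logkmem_find() is hypothetical, not part of this change): matching a suspect address against the audit ring amounts to the scan below; in practice one searches the crash dump for the address, as the block comment above describes.

    static mm_logentry_t *
    mm_logkmem_find(uintptr_t va)
    {
            int i;

            for (i = 0; i < MM_KMEMLOG_NENTRIES; i++) {
                    mm_logentry_t *ent = &mm_kmemlog[i];

                    /* unsigned wraparound turns this into one range check */
                    if (va - ent->mle_vaddr < ent->mle_len)
                            return (ent);
            }
            return (NULL);
    }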
146 146
147 147 /*ARGSUSED1*/
148 148 static int
149 149 mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
150 150 {
151 151 int i;
152 152 struct mem_minor {
153 153 char *name;
154 154 minor_t minor;
155 155 int privonly;
156 156 const char *rdpriv;
157 157 const char *wrpriv;
158 158 mode_t priv_mode;
159 159 } mm[] = {
160 160 { "mem", M_MEM, 0, NULL, "all", 0640 },
161 161 { "kmem", M_KMEM, 0, NULL, "all", 0640 },
162 162 { "allkmem", M_ALLKMEM, 0, "all", "all", 0600 },
163 163 { "null", M_NULL, PRIVONLY_DEV, NULL, NULL, 0666 },
164 164 { "zero", M_ZERO, PRIVONLY_DEV, NULL, NULL, 0666 },
165 165 { "full", M_FULL, PRIVONLY_DEV, NULL, NULL, 0666 },
166 166 };
167 167 kstat_t *ksp;
168 168
169 169 mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
170 170 mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
171 171
172 172 for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
173 173 if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
174 174 mm[i].minor, DDI_PSEUDO, mm[i].privonly,
175 175 mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
176 176 DDI_FAILURE) {
177 177 ddi_remove_minor_node(devi, NULL);
178 178 return (DDI_FAILURE);
179 179 }
180 180 }
181 181
182 182 mm_dip = devi;
183 183
184 184 ksp = kstat_create("mm", 0, "phys_installed", "misc",
185 185 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
186 186 if (ksp != NULL) {
187 187 ksp->ks_update = mm_kstat_update;
188 188 ksp->ks_snapshot = mm_kstat_snapshot;
189 189 ksp->ks_lock = &mm_lock; /* XXX - not really needed */
190 190 kstat_install(ksp);
191 191 }
192 192
193 193 mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
194 194 "kmem_io_access", 0);
195 195
196 196 return (DDI_SUCCESS);
197 197 }
198 198
199 199 /*ARGSUSED*/
200 200 static int
201 201 mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
202 202 {
203 203 register int error;
204 204
205 205 switch (infocmd) {
206 206 case DDI_INFO_DEVT2DEVINFO:
207 207 *result = (void *)mm_dip;
208 208 error = DDI_SUCCESS;
209 209 break;
210 210 case DDI_INFO_DEVT2INSTANCE:
211 211 *result = (void *)0;
212 212 error = DDI_SUCCESS;
213 213 break;
214 214 default:
215 215 error = DDI_FAILURE;
216 216 }
217 217 return (error);
218 218 }
219 219
220 220 /*ARGSUSED1*/
221 221 static int
222 222 mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
223 223 {
224 224 switch (getminor(*devp)) {
225 225 case M_NULL:
226 226 case M_ZERO:
227 227 case M_FULL:
228 228 case M_MEM:
229 229 case M_KMEM:
230 230 case M_ALLKMEM:
231 231 /* standard devices */
232 232 break;
233 233
234 234 default:
235 235 /* Unsupported or unknown type */
236 236 return (EINVAL);
237 237 }
238 238 /* must be character device */
239 239 if (typ != OTYP_CHR)
240 240 return (EINVAL);
241 241 return (0);
242 242 }
243 243
244 244 struct pollhead mm_pollhd;
245 245
246 246 /*ARGSUSED*/
247 247 static int
248 248 mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
249 249 struct pollhead **phpp)
250 250 {
251 251 switch (getminor(dev)) {
252 252 case M_NULL:
253 253 case M_ZERO:
254 254 case M_FULL:
255 255 case M_MEM:
256 256 case M_KMEM:
257 257 case M_ALLKMEM:
258 258 *reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
259 259 POLLWRNORM | POLLRDBAND | POLLWRBAND);
260 260 /*
 261  261  	 * A non-NULL pollhead pointer should be returned in case
262 - * user polls for 0 events.
      262  +	 * user polls for 0 events or is doing an edge-triggered poll.
263 263 */
264 - *phpp = !anyyet && !*reventsp ?
265 - &mm_pollhd : (struct pollhead *)NULL;
264 + if ((!*reventsp && !anyyet) || (events & POLLET)) {
265 + *phpp = &mm_pollhd;
266 + }
266 267 return (0);
267 268 default:
268 269 /* no other devices currently support polling */
269 270 return (ENXIO);
270 271 }
271 272 }
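
This is the substantive change in this file: mmchpoll() now hands back its pollhead not only when a poll asserts no events, but for any edge-triggered (POLLET) request, so an epoll consumer stays registered for future pollwakeup() calls even when events are already asserted (see 8634 above). A minimal userland sketch of the consumer-visible behavior, assuming illumos's Linux-compatible <sys/epoll.h>; the device and timeout are arbitrary:

    #include <sys/epoll.h>
    #include <fcntl.h>
    #include <stdio.h>

    int
    main(void)
    {
            struct epoll_event ev = { 0 };
            int epfd = epoll_create1(0);
            int fd = open("/dev/null", O_RDWR);

            if (epfd < 0 || fd < 0)
                    return (1);

            ev.events = EPOLLOUT | EPOLLET;         /* edge-triggered */
            ev.data.fd = fd;
            if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) != 0)
                    return (1);

            /* /dev/null is always writable: the first wait should see the edge. */
            if (epoll_wait(epfd, &ev, 1, 1000) == 1)
                    (void) printf("events 0x%x\n", ev.events);
            return (0);
    }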
272 273
273 274 static int
274 275 mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
275 276 char *name, caddr_t valuep, int *lengthp)
276 277 {
277 278 /*
278 279 * implement zero size to reduce overhead (avoid two failing
279 280 * property lookups per stat).
280 281 */
281 282 return (ddi_prop_op_size(dev, dip, prop_op,
282 283 flags, name, valuep, lengthp, 0));
283 284 }
284 285
285 286 static int
286 287 mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
287 288 page_t *pp)
288 289 {
289 290 int error = 0;
290 291 int devload = 0;
291 292 int is_memory = pf_is_memory(pfn);
292 293 size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
293 294 (size_t)uio->uio_iov->iov_len);
294 295 caddr_t va = NULL;
295 296
296 297 mutex_enter(&mm_lock);
297 298
298 299 if (is_memory && kpm_enable) {
299 300 if (pp)
300 301 va = hat_kpm_mapin(pp, NULL);
301 302 else
302 303 va = hat_kpm_mapin_pfn(pfn);
303 304 }
304 305
305 306 if (va == NULL) {
306 307 hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
307 308 (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
308 309 HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
309 310 va = mm_map;
310 311 devload = 1;
311 312 }
312 313
313 314 if (!is_memory) {
314 315 if (allowio) {
315 316 size_t c = uio->uio_iov->iov_len;
316 317
317 318 if (ddi_peekpokeio(NULL, uio, rw,
318 319 (caddr_t)(uintptr_t)uio->uio_loffset, c,
319 320 sizeof (int32_t)) != DDI_SUCCESS)
320 321 error = EFAULT;
321 322 } else
322 323 error = EIO;
323 324 } else
324 325 error = uiomove(va + pageoff, nbytes, rw, uio);
325 326
326 327 if (devload)
327 328 hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
328 329 else if (pp)
329 330 hat_kpm_mapout(pp, NULL, va);
330 331 else
331 332 hat_kpm_mapout_pfn(pfn);
332 333
333 334 mutex_exit(&mm_lock);
334 335 return (error);
335 336 }
336 337
337 338 static int
338 339 mmpagelock(struct as *as, caddr_t va)
339 340 {
340 341 struct seg *seg;
341 342 int i;
342 343
343 344 AS_LOCK_ENTER(as, RW_READER);
344 345 seg = as_segat(as, va);
345 346 i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
346 347 AS_LOCK_EXIT(as);
347 348
348 349 return (i);
349 350 }
350 351
351 352 #ifdef __sparc
352 353
353 354 #define NEED_LOCK_KVADDR(kva) mmpagelock(&kas, kva)
354 355
355 356 #else /* __i386, __amd64 */
356 357
357 358 #define NEED_LOCK_KVADDR(va) 0
358 359
359 360 #endif /* __sparc */
360 361
361 362 /*ARGSUSED3*/
362 363 static int
363 364 mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
364 365 {
365 366 pfn_t v;
366 367 struct iovec *iov;
367 368 int error = 0;
368 369 size_t c;
369 370 ssize_t oresid = uio->uio_resid;
370 371 minor_t minor = getminor(dev);
371 372
372 373 while (uio->uio_resid > 0 && error == 0) {
373 374 iov = uio->uio_iov;
374 375 if (iov->iov_len == 0) {
375 376 uio->uio_iov++;
376 377 uio->uio_iovcnt--;
377 378 if (uio->uio_iovcnt < 0)
378 379 panic("mmrw");
379 380 continue;
380 381 }
381 382 switch (minor) {
382 383
383 384 case M_MEM:
384 385 memlist_read_lock();
385 386 if (!address_in_memlist(phys_install,
386 387 (uint64_t)uio->uio_loffset, 1)) {
387 388 memlist_read_unlock();
388 389 error = EFAULT;
389 390 break;
390 391 }
391 392 memlist_read_unlock();
392 393
393 394 v = BTOP((u_offset_t)uio->uio_loffset);
394 395 error = mmio(uio, rw, v,
395 396 uio->uio_loffset & PAGEOFFSET, 0, NULL);
396 397 break;
397 398
398 399 case M_KMEM:
399 400 case M_ALLKMEM:
400 401 {
401 402 page_t **ppp = NULL;
402 403 caddr_t vaddr = (caddr_t)uio->uio_offset;
403 404 int try_lock = NEED_LOCK_KVADDR(vaddr);
404 405 int locked = 0;
405 406
406 407 if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
407 408 break;
408 409
409 410 if (rw == UIO_WRITE)
410 411 mm_logkmem(uio);
411 412
412 413 /*
413 414 * If vaddr does not map a valid page, as_pagelock()
414 415 * will return failure. Hence we can't check the
415 416 * return value and return EFAULT here as we'd like.
416 417 * seg_kp and seg_kpm do not properly support
417 418 * as_pagelock() for this context so we avoid it
418 419 * using the try_lock set check above. Some day when
419 420 * the kernel page locking gets redesigned all this
420 421 * muck can be cleaned up.
421 422 */
422 423 if (try_lock)
423 424 locked = (as_pagelock(&kas, &ppp, vaddr,
424 425 PAGESIZE, S_WRITE) == 0);
425 426
426 427 v = hat_getpfnum(kas.a_hat,
427 428 (caddr_t)(uintptr_t)uio->uio_loffset);
428 429 if (v == PFN_INVALID) {
429 430 if (locked)
430 431 as_pageunlock(&kas, ppp, vaddr,
431 432 PAGESIZE, S_WRITE);
432 433 error = EFAULT;
433 434 break;
434 435 }
435 436
436 437 error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
437 438 minor == M_ALLKMEM || mm_kmem_io_access,
438 439 (locked && ppp) ? *ppp : NULL);
439 440 if (locked)
440 441 as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
441 442 S_WRITE);
442 443 }
443 444
444 445 break;
445 446
446 447 case M_FULL:
447 448 if (rw == UIO_WRITE) {
448 449 error = ENOSPC;
449 450 break;
450 451 }
451 452 /* else it's a read, fall through to zero case */
452 453 /*FALLTHROUGH*/
453 454
454 455 case M_ZERO:
455 456 if (rw == UIO_READ) {
456 457 label_t ljb;
457 458
458 459 if (on_fault(&ljb)) {
459 460 no_fault();
460 461 error = EFAULT;
461 462 break;
462 463 }
463 464 uzero(iov->iov_base, iov->iov_len);
464 465 no_fault();
465 466 uio->uio_resid -= iov->iov_len;
466 467 uio->uio_loffset += iov->iov_len;
467 468 break;
468 469 }
469 470 /* else it's a write, fall through to NULL case */
470 471 /*FALLTHROUGH*/
471 472
472 473 case M_NULL:
473 474 if (rw == UIO_READ)
474 475 return (0);
475 476 c = iov->iov_len;
476 477 iov->iov_base += c;
477 478 iov->iov_len -= c;
478 479 uio->uio_loffset += c;
479 480 uio->uio_resid -= c;
480 481 break;
481 482
482 483 }
483 484 }
484 485 return (uio->uio_resid == oresid ? error : 0);
485 486 }
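
The switch above encodes the classic device semantics: M_ZERO reads as zeros, M_NULL discards writes and reads as EOF, and M_FULL reads as zeros but refuses every write with ENOSPC. A small sketch of the M_FULL write path:

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            char c = 'x';
            int fd = open("/dev/full", O_WRONLY);

            if (fd < 0)
                    return (1);
            if (write(fd, &c, 1) == -1 && errno == ENOSPC)
                    (void) printf("write to /dev/full: ENOSPC\n");
            (void) close(fd);
            return (0);
    }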
486 487
487 488 static int
488 489 mmread(dev_t dev, struct uio *uio, cred_t *cred)
489 490 {
490 491 return (mmrw(dev, uio, UIO_READ, cred));
491 492 }
492 493
493 494 static int
494 495 mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
495 496 {
496 497 return (mmrw(dev, uio, UIO_WRITE, cred));
497 498 }
498 499
499 500 /*
500 501 * Private ioctl for libkvm to support kvm_physaddr().
501 502 * Given an address space and a VA, compute the PA.
502 503 */
503 504 static int
504 505 mmioctl_vtop(intptr_t data)
505 506 {
506 507 #ifdef _SYSCALL32
507 508 mem_vtop32_t vtop32;
508 509 #endif
509 510 mem_vtop_t mem_vtop;
510 511 proc_t *p;
511 512 pfn_t pfn = (pfn_t)PFN_INVALID;
512 513 pid_t pid = 0;
513 514 struct as *as;
514 515 struct seg *seg;
515 516
516 517 if (get_udatamodel() == DATAMODEL_NATIVE) {
517 518 if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
518 519 return (EFAULT);
519 520 }
520 521 #ifdef _SYSCALL32
521 522 else {
522 523 if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
523 524 return (EFAULT);
524 525 mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
525 526 mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;
526 527
527 528 if (mem_vtop.m_as != NULL)
528 529 return (EINVAL);
529 530 }
530 531 #endif
531 532
532 533 if (mem_vtop.m_as == &kas) {
533 534 pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
534 535 } else {
535 536 if (mem_vtop.m_as == NULL) {
536 537 /*
537 538 * Assume the calling process's address space if the
538 539 * caller didn't specify one.
539 540 */
540 541 p = curthread->t_procp;
541 542 if (p == NULL)
542 543 return (EIO);
543 544 mem_vtop.m_as = p->p_as;
544 545 }
545 546
546 547 mutex_enter(&pidlock);
547 548 for (p = practive; p != NULL; p = p->p_next) {
548 549 if (p->p_as == mem_vtop.m_as) {
549 550 pid = p->p_pid;
550 551 break;
551 552 }
552 553 }
553 554 mutex_exit(&pidlock);
554 555 if (p == NULL)
555 556 return (EIO);
556 557 p = sprlock(pid);
557 558 if (p == NULL)
558 559 return (EIO);
559 560 as = p->p_as;
560 561 if (as == mem_vtop.m_as) {
561 562 mutex_exit(&p->p_lock);
562 563 AS_LOCK_ENTER(as, RW_READER);
563 564 for (seg = AS_SEGFIRST(as); seg != NULL;
564 565 seg = AS_SEGNEXT(as, seg))
565 566 if ((uintptr_t)mem_vtop.m_va -
566 567 (uintptr_t)seg->s_base < seg->s_size)
567 568 break;
568 569 if (seg != NULL)
569 570 pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
570 571 AS_LOCK_EXIT(as);
571 572 mutex_enter(&p->p_lock);
572 573 }
573 574 sprunlock(p);
574 575 }
575 576 mem_vtop.m_pfn = pfn;
576 577 if (pfn == PFN_INVALID)
577 578 return (EIO);
578 579
579 580 if (get_udatamodel() == DATAMODEL_NATIVE) {
580 581 if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
581 582 return (EFAULT);
582 583 }
583 584 #ifdef _SYSCALL32
584 585 else {
585 586 vtop32.m_pfn = mem_vtop.m_pfn;
586 587 if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
587 588 return (EFAULT);
588 589 }
589 590 #endif
590 591
591 592 return (0);
592 593 }
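
A sketch of how a privileged consumer might invoke this private interface (libkvm's kvm_physaddr() is the intended one). mem_vtop_t and MEM_VTOP come from <sys/mem.h>, and per the dispatch in mmioctl() below the request must go to /dev/kmem; the buffer is touched first so the page is resident:

    #include <sys/types.h>
    #include <sys/mem.h>
    #include <sys/ioctl.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            static char buf[4096];
            mem_vtop_t vtop;
            int fd = open("/dev/kmem", O_RDONLY);

            if (fd < 0)
                    return (1);

            buf[0] = 1;             /* fault the page in */
            vtop.m_as = NULL;       /* NULL: the caller's address space */
            vtop.m_va = buf;
            if (ioctl(fd, MEM_VTOP, &vtop) == 0)
                    (void) printf("pfn %lu\n", (ulong_t)vtop.m_pfn);
            (void) close(fd);
            return (0);
    }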
593 594
594 595 /*
595 596 * Given a PA, execute the given page retire command on it.
596 597 */
597 598 static int
598 599 mmioctl_page_retire(int cmd, intptr_t data)
599 600 {
600 601 extern int page_retire_test(void);
601 602 uint64_t pa;
602 603
603 604 if (copyin((void *)data, &pa, sizeof (uint64_t))) {
604 605 return (EFAULT);
605 606 }
606 607
607 608 switch (cmd) {
608 609 case MEM_PAGE_ISRETIRED:
609 610 return (page_retire_check(pa, NULL));
610 611
611 612 case MEM_PAGE_UNRETIRE:
612 613 return (page_unretire(pa));
613 614
614 615 case MEM_PAGE_RETIRE:
615 616 return (page_retire(pa, PR_FMA));
616 617
617 618 case MEM_PAGE_RETIRE_MCE:
618 619 return (page_retire(pa, PR_MCE));
619 620
620 621 case MEM_PAGE_RETIRE_UE:
621 622 return (page_retire(pa, PR_UE));
622 623
623 624 case MEM_PAGE_GETERRORS:
624 625 {
625 626 uint64_t page_errors;
626 627 int rc = page_retire_check(pa, &page_errors);
627 628 if (copyout(&page_errors, (void *)data,
628 629 sizeof (uint64_t))) {
629 630 return (EFAULT);
630 631 }
631 632 return (rc);
632 633 }
633 634
634 635 case MEM_PAGE_RETIRE_TEST:
635 636 return (page_retire_test());
636 637
637 638 }
638 639
639 640 return (EINVAL);
640 641 }
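
For context, these commands back FMA's page-retire machinery over /dev/mem (per the dispatch in mmioctl() below). A sketch with a hypothetical physical address, reading a zero return from MEM_PAGE_ISRETIRED as "page is retired":

    #include <sys/types.h>
    #include <sys/mem.h>
    #include <sys/ioctl.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            uint64_t pa = 0x200000;         /* hypothetical PA */
            int fd = open("/dev/mem", O_RDONLY);

            if (fd < 0)
                    return (1);
            if (ioctl(fd, MEM_PAGE_ISRETIRED, &pa) == 0)
                    (void) printf("0x%llx is retired\n", (u_longlong_t)pa);
            else
                    (void) printf("not retired (errno %d)\n", errno);
            (void) close(fd);
            return (0);
    }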
641 642
642 643 #ifdef __sparc
643 644 /*
644 645 * Given a syndrome, syndrome type, and address return the
645 646 * associated memory name in the provided data buffer.
646 647 */
647 648 static int
648 649 mmioctl_get_mem_name(intptr_t data)
649 650 {
650 651 mem_name_t mem_name;
651 652 void *buf;
652 653 size_t bufsize;
653 654 int len, err;
654 655
655 656 if ((bufsize = cpu_get_name_bufsize()) == 0)
656 657 return (ENOTSUP);
657 658
658 659 if ((err = mm_read_mem_name(data, &mem_name)) < 0)
659 660 return (err);
660 661
661 662 buf = kmem_alloc(bufsize, KM_SLEEP);
662 663
663 664 /*
664 665 * Call into cpu specific code to do the lookup.
665 666 */
666 667 if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
667 668 mem_name.m_addr, buf, bufsize, &len)) != 0) {
668 669 kmem_free(buf, bufsize);
669 670 return (err);
670 671 }
671 672
672 673 if (len >= mem_name.m_namelen) {
673 674 kmem_free(buf, bufsize);
674 675 return (ENOSPC);
675 676 }
676 677
677 678 if (copyoutstr(buf, (char *)mem_name.m_name,
678 679 mem_name.m_namelen, NULL) != 0) {
679 680 kmem_free(buf, bufsize);
680 681 return (EFAULT);
681 682 }
682 683
683 684 kmem_free(buf, bufsize);
684 685 return (0);
685 686 }
686 687
687 688 /*
688 689 * Given a syndrome and address return information about the associated memory.
689 690 */
690 691 static int
691 692 mmioctl_get_mem_info(intptr_t data)
692 693 {
693 694 mem_info_t mem_info;
694 695 int err;
695 696
696 697 if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
697 698 return (EFAULT);
698 699
699 700 if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
700 701 &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
701 702 &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
702 703 return (err);
703 704
704 705 if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
705 706 return (EFAULT);
706 707
707 708 return (0);
708 709 }
709 710
710 711 /*
711 712 * Given a memory name, return its associated serial id
712 713 */
713 714 static int
714 715 mmioctl_get_mem_sid(intptr_t data)
715 716 {
716 717 mem_name_t mem_name;
717 718 void *buf;
718 719 void *name;
719 720 size_t name_len;
720 721 size_t bufsize;
721 722 int len, err;
722 723
723 724 if ((bufsize = cpu_get_name_bufsize()) == 0)
724 725 return (ENOTSUP);
725 726
726 727 if ((err = mm_read_mem_name(data, &mem_name)) < 0)
727 728 return (err);
728 729
729 730 buf = kmem_alloc(bufsize, KM_SLEEP);
730 731
731 732 if (mem_name.m_namelen > 1024)
732 733 mem_name.m_namelen = 1024; /* cap at 1024 bytes */
733 734
734 735 name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);
735 736
736 737 if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
737 738 mem_name.m_namelen, &name_len)) != 0) {
738 739 kmem_free(buf, bufsize);
739 740 kmem_free(name, mem_name.m_namelen);
740 741 return (err);
741 742 }
742 743
743 744 /*
744 745 * Call into cpu specific code to do the lookup.
745 746 */
746 747 if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
747 748 kmem_free(buf, bufsize);
748 749 kmem_free(name, mem_name.m_namelen);
749 750 return (err);
750 751 }
751 752
752 753 if (len > mem_name.m_sidlen) {
753 754 kmem_free(buf, bufsize);
754 755 kmem_free(name, mem_name.m_namelen);
755 756 return (ENAMETOOLONG);
756 757 }
757 758
758 759 if (copyoutstr(buf, (char *)mem_name.m_sid,
759 760 mem_name.m_sidlen, NULL) != 0) {
760 761 kmem_free(buf, bufsize);
761 762 kmem_free(name, mem_name.m_namelen);
762 763 return (EFAULT);
763 764 }
764 765
765 766 kmem_free(buf, bufsize);
766 767 kmem_free(name, mem_name.m_namelen);
767 768 return (0);
768 769 }
769 770 #endif /* __sparc */
770 771
771 772 /*
772 773 * Private ioctls for
773 774 * libkvm to support kvm_physaddr().
774 775 * FMA support for page_retire() and memory attribute information.
775 776 */
776 777 /*ARGSUSED*/
777 778 static int
778 779 mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
779 780 {
780 781 if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
781 782 (cmd != MEM_VTOP && getminor(dev) != M_MEM))
782 783 return (ENXIO);
783 784
784 785 switch (cmd) {
785 786 case MEM_VTOP:
786 787 return (mmioctl_vtop(data));
787 788
788 789 case MEM_PAGE_RETIRE:
789 790 case MEM_PAGE_ISRETIRED:
790 791 case MEM_PAGE_UNRETIRE:
791 792 case MEM_PAGE_RETIRE_MCE:
792 793 case MEM_PAGE_RETIRE_UE:
793 794 case MEM_PAGE_GETERRORS:
794 795 case MEM_PAGE_RETIRE_TEST:
795 796 return (mmioctl_page_retire(cmd, data));
796 797
797 798 #ifdef __sparc
798 799 case MEM_NAME:
799 800 return (mmioctl_get_mem_name(data));
800 801
801 802 case MEM_INFO:
802 803 return (mmioctl_get_mem_info(data));
803 804
804 805 case MEM_SID:
805 806 return (mmioctl_get_mem_sid(data));
806 807 #else
807 808 case MEM_NAME:
808 809 case MEM_INFO:
809 810 case MEM_SID:
810 811 return (ENOTSUP);
811 812 #endif /* __sparc */
812 813 }
813 814 return (ENXIO);
814 815 }
815 816
816 817 /*ARGSUSED2*/
817 818 static int
818 819 mmmmap(dev_t dev, off_t off, int prot)
819 820 {
820 821 pfn_t pf;
821 822 struct memlist *pmem;
822 823 minor_t minor = getminor(dev);
823 824
824 825 switch (minor) {
825 826 case M_MEM:
826 827 pf = btop(off);
827 828 memlist_read_lock();
828 829 for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
829 830 if (pf >= BTOP(pmem->ml_address) &&
830 831 pf < BTOP(pmem->ml_address + pmem->ml_size)) {
831 832 memlist_read_unlock();
832 833 return (impl_obmem_pfnum(pf));
833 834 }
834 835 }
835 836 memlist_read_unlock();
836 837 break;
837 838
838 839 case M_KMEM:
839 840 case M_ALLKMEM:
840 841 /* no longer supported with KPR */
841 842 return (-1);
842 843
843 844 case M_FULL:
844 845 case M_ZERO:
845 846 /*
846 847 * We shouldn't be mmap'ing to /dev/zero here as
847 848 * mmsegmap() should have already converted
848 849 * a mapping request for this device to a mapping
849 850 * using seg_vn for anonymous memory.
850 851 */
851 852 break;
852 853
853 854 }
854 855 return (-1);
855 856 }
856 857
857 858 /*
858 859 * This function is called when a memory device is mmap'ed.
859 860 * Set up the mapping to the correct device driver.
860 861 */
861 862 static int
862 863 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
863 864 uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
864 865 {
865 866 struct segvn_crargs vn_a;
866 867 struct segdev_crargs dev_a;
867 868 int error;
868 869 minor_t minor;
869 870 off_t i;
870 871
871 872 minor = getminor(dev);
872 873
873 874 as_rangelock(as);
874 875 /*
875 876 * No need to worry about vac alignment on /dev/zero
876 877 * since this is a "clone" object that doesn't yet exist.
877 878 */
878 879 error = choose_addr(as, addrp, len, off,
879 880 (minor == M_MEM) || (minor == M_KMEM), flags);
880 881 if (error != 0) {
881 882 as_rangeunlock(as);
882 883 return (error);
883 884 }
884 885
885 886 switch (minor) {
886 887 case M_MEM:
887 888 /* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
888 889 if ((flags & MAP_TYPE) != MAP_SHARED) {
889 890 as_rangeunlock(as);
890 891 return (EINVAL);
891 892 }
892 893
893 894 /*
894 895 * Check to ensure that the entire range is
895 896 * legal and we are not trying to map in
896 897 * more than the device will let us.
897 898 */
898 899 for (i = 0; i < len; i += PAGESIZE) {
899 900 if (mmmmap(dev, off + i, maxprot) == -1) {
900 901 as_rangeunlock(as);
901 902 return (ENXIO);
902 903 }
903 904 }
904 905
905 906 /*
906 907 * Use seg_dev segment driver for /dev/mem mapping.
907 908 */
908 909 dev_a.mapfunc = mmmmap;
909 910 dev_a.dev = dev;
910 911 dev_a.offset = off;
911 912 dev_a.type = (flags & MAP_TYPE);
912 913 dev_a.prot = (uchar_t)prot;
913 914 dev_a.maxprot = (uchar_t)maxprot;
914 915 dev_a.hat_attr = 0;
915 916
916 917 /*
917 918 * Make /dev/mem mappings non-consistent since we can't
918 919 * alias pages that don't have page structs behind them,
919 920 * such as kernel stack pages. If someone mmap()s a kernel
920 921 * stack page and if we give them a tte with cv, a line from
921 922 * that page can get into both pages of the spitfire d$.
922 923 * But snoop from another processor will only invalidate
923 924 * the first page. This later caused kernel (xc_attention)
924 925 * to go into an infinite loop at pil 13 and no interrupts
925 926 * could come in. See 1203630.
926 927 *
927 928 */
928 929 dev_a.hat_flags = HAT_LOAD_NOCONSIST;
929 930 dev_a.devmap_data = NULL;
930 931
931 932 error = as_map(as, *addrp, len, segdev_create, &dev_a);
932 933 break;
933 934
934 935 case M_ZERO:
935 936 /*
936 937 * Use seg_vn segment driver for /dev/zero mapping.
937 938 * Passing in a NULL amp gives us the "cloning" effect.
938 939 */
939 940 vn_a.vp = NULL;
940 941 vn_a.offset = 0;
941 942 vn_a.type = (flags & MAP_TYPE);
942 943 vn_a.prot = prot;
943 944 vn_a.maxprot = maxprot;
944 945 vn_a.flags = flags & ~MAP_TYPE;
945 946 vn_a.cred = cred;
946 947 vn_a.amp = NULL;
947 948 vn_a.szc = 0;
948 949 vn_a.lgrp_mem_policy_flags = 0;
949 950 error = as_map(as, *addrp, len, segvn_create, &vn_a);
950 951 break;
951 952
952 953 case M_KMEM:
953 954 case M_ALLKMEM:
954 955 /* No longer supported with KPR. */
955 956 error = ENXIO;
956 957 break;
957 958
958 959 case M_NULL:
959 960 /*
960 961 * Use seg_dev segment driver for /dev/null mapping.
961 962 */
962 963 dev_a.mapfunc = mmmmap;
963 964 dev_a.dev = dev;
964 965 dev_a.offset = off;
965 966 dev_a.type = 0; /* neither PRIVATE nor SHARED */
966 967 dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
967 968 dev_a.hat_attr = 0;
968 969 dev_a.hat_flags = 0;
969 970 error = as_map(as, *addrp, len, segdev_create, &dev_a);
970 971 break;
971 972
972 973 default:
973 974 error = ENXIO;
974 975 }
975 976
976 977 as_rangeunlock(as);
977 978 return (error);
978 979 }
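
The M_ZERO arm above is what makes mmap(2) of /dev/zero yield zero-filled, copy-on-write anonymous memory through segvn_create(). A minimal sketch:

    #include <sys/mman.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            int fd = open("/dev/zero", O_RDWR);
            char *p;

            if (fd < 0)
                    return (1);
            p = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
            if (p == MAP_FAILED)
                    return (1);
            p[0] = 1;       /* private page; copy-on-write from zero */
            (void) printf("p[0] = %d, p[4096] = %d\n", p[0], p[4096]);
            (void) munmap(p, 8192);
            (void) close(fd);
            return (0);
    }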
979 980
980 981 static struct cb_ops mm_cb_ops = {
981 982 mmopen, /* open */
982 983 nulldev, /* close */
983 984 nodev, /* strategy */
984 985 nodev, /* print */
985 986 nodev, /* dump */
986 987 mmread, /* read */
987 988 mmwrite, /* write */
988 989 mmioctl, /* ioctl */
989 990 nodev, /* devmap */
990 991 mmmmap, /* mmap */
991 992 mmsegmap, /* segmap */
992 993 mmchpoll, /* poll */
993 994 mmpropop, /* prop_op */
994 995 0, /* streamtab */
995 996 D_NEW | D_MP | D_64BIT | D_U64BIT
996 997 };
997 998
998 999 static struct dev_ops mm_ops = {
999 1000 DEVO_REV, /* devo_rev, */
1000 1001 0, /* refcnt */
1001 1002 mm_info, /* get_dev_info */
1002 1003 nulldev, /* identify */
1003 1004 nulldev, /* probe */
1004 1005 mm_attach, /* attach */
1005 1006 nodev, /* detach */
1006 1007 nodev, /* reset */
1007 1008 &mm_cb_ops, /* driver operations */
1008 1009 (struct bus_ops *)0, /* bus operations */
1009 1010 NULL, /* power */
1010 1011 ddi_quiesce_not_needed, /* quiesce */
1011 1012 };
1012 1013
1013 1014 static struct modldrv modldrv = {
1014 1015 &mod_driverops, "memory driver", &mm_ops,
1015 1016 };
1016 1017
1017 1018 static struct modlinkage modlinkage = {
1018 1019 MODREV_1, &modldrv, NULL
1019 1020 };
1020 1021
1021 1022 int
1022 1023 _init(void)
1023 1024 {
1024 1025 return (mod_install(&modlinkage));
1025 1026 }
1026 1027
1027 1028 int
1028 1029 _info(struct modinfo *modinfop)
1029 1030 {
1030 1031 return (mod_info(&modlinkage, modinfop));
1031 1032 }
1032 1033
1033 1034 int
1034 1035 _fini(void)
1035 1036 {
1036 1037 return (mod_remove(&modlinkage));
1037 1038 }
1038 1039
1039 1040 static int
1040 1041 mm_kstat_update(kstat_t *ksp, int rw)
1041 1042 {
1042 1043 struct memlist *pmem;
1043 1044 uint_t count;
1044 1045
1045 1046 if (rw == KSTAT_WRITE)
1046 1047 return (EACCES);
1047 1048
1048 1049 count = 0;
1049 1050 memlist_read_lock();
1050 1051 for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
1051 1052 count++;
1052 1053 }
1053 1054 memlist_read_unlock();
1054 1055
1055 1056 ksp->ks_ndata = count;
1056 1057 ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1057 1058
1058 1059 return (0);
1059 1060 }
1060 1061
1061 1062 static int
1062 1063 mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
1063 1064 {
1064 1065 struct memlist *pmem;
1065 1066 struct memunit {
1066 1067 uint64_t address;
1067 1068 uint64_t size;
1068 1069 } *kspmem;
1069 1070
1070 1071 if (rw == KSTAT_WRITE)
1071 1072 return (EACCES);
1072 1073
1073 1074 ksp->ks_snaptime = gethrtime();
1074 1075
1075 1076 kspmem = (struct memunit *)buf;
1076 1077 memlist_read_lock();
1077 1078 for (pmem = phys_install; pmem != NULL;
1078 1079 pmem = pmem->ml_next, kspmem++) {
1079 1080 if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1080 1081 break;
1081 1082 kspmem->address = pmem->ml_address;
1082 1083 kspmem->size = pmem->ml_size;
1083 1084 }
1084 1085 memlist_read_unlock();
1085 1086
1086 1087 return (0);
1087 1088 }
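
These two routines back the mm:0:phys_installed raw kstat, whose payload is one {address, size} pair of uint64_ts per phys_install memlist entry, with ks_ndata counting the pairs. A sketch of reading it with libkstat (link with -lkstat):

    #include <sys/types.h>
    #include <kstat.h>
    #include <stdio.h>

    int
    main(void)
    {
            kstat_ctl_t *kc = kstat_open();
            kstat_t *ksp;
            uint64_t *d;
            uint_t i;

            if (kc == NULL)
                    return (1);
            ksp = kstat_lookup(kc, "mm", 0, "phys_installed");
            if (ksp == NULL || kstat_read(kc, ksp, NULL) == -1)
                    return (1);
            d = ksp->ks_data;
            for (i = 0; i < ksp->ks_ndata; i++)
                    (void) printf("segment %u: addr 0x%llx size 0x%llx\n", i,
                        (u_longlong_t)d[2 * i], (u_longlong_t)d[2 * i + 1]);
            (void) kstat_close(kc);
            return (0);
    }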
1088 1089
1089 1090 /*
1090 1091 * Read a mem_name_t from user-space and store it in the mem_name_t
1091 1092 * pointed to by the mem_name argument.
1092 1093 */
1093 1094 static int
1094 1095 mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
1095 1096 {
1096 1097 if (get_udatamodel() == DATAMODEL_NATIVE) {
1097 1098 if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
1098 1099 return (EFAULT);
1099 1100 }
1100 1101 #ifdef _SYSCALL32
1101 1102 else {
1102 1103 mem_name32_t mem_name32;
1103 1104
1104 1105 if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
1105 1106 return (EFAULT);
1106 1107 mem_name->m_addr = mem_name32.m_addr;
1107 1108 mem_name->m_synd = mem_name32.m_synd;
1108 1109 mem_name->m_type[0] = mem_name32.m_type[0];
1109 1110 mem_name->m_type[1] = mem_name32.m_type[1];
1110 1111 mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
1111 1112 mem_name->m_namelen = (size_t)mem_name32.m_namelen;
1112 1113 mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
1113 1114 mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
1114 1115 }
1115 1116 #endif /* _SYSCALL32 */
1116 1117
1117 1118 return (0);
1118 1119 }