1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 /*
29 * Copyright 2019 Joyent, Inc.
30 * Copyright 2021 Oxide Computer Company
31 */
32
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/thread.h>
36 #include <sys/sysmacros.h>
37 #include <sys/signal.h>
38 #include <sys/cred.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/vnode.h>
42 #include <sys/mman.h>
43 #include <sys/kmem.h>
44 #include <sys/proc.h>
45 #include <sys/pathname.h>
46 #include <sys/policy.h>
47 #include <sys/cmn_err.h>
48 #include <sys/systm.h>
49 #include <sys/elf.h>
50 #include <sys/vmsystm.h>
51 #include <sys/debug.h>
52 #include <sys/auxv.h>
53 #include <sys/exec.h>
54 #include <sys/prsystm.h>
55 #include <vm/as.h>
56 #include <vm/rm.h>
57 #include <vm/seg.h>
58 #include <vm/seg_vn.h>
59 #include <sys/modctl.h>
60 #include <sys/systeminfo.h>
61 #include <sys/vmparam.h>
62 #include <sys/machelf.h>
63 #include <sys/shm_impl.h>
64 #include <sys/archsystm.h>
65 #include <sys/fasttrap.h>
66 #include <sys/brand.h>
67 #include "elf_impl.h"
68 #include <sys/sdt.h>
69 #include <sys/siginfo.h>
70 #include <sys/random.h>
71
72 #include <core_shstrtab.h>
73
74 #if defined(__x86)
75 #include <sys/comm_page_util.h>
76 #include <sys/fp.h>
77 #endif /* defined(__x86) */
78
79
80 extern int at_flags;
81 extern volatile size_t aslr_max_brk_skew;
82
83 #define ORIGIN_STR "ORIGIN"
84 #define ORIGIN_STR_SIZE 6
85
86 static int getelfhead(vnode_t *, cred_t *, Ehdr *, uint_t *, uint_t *,
87 uint_t *);
88 static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, uint_t, caddr_t *,
89 size_t *);
90 static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, uint_t, uint_t,
91 caddr_t *, size_t *, caddr_t *, size_t *);
92 static size_t elfsize(const Ehdr *, uint_t, const caddr_t, uintptr_t *);
93 static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **,
94 Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *,
95 size_t, size_t *, size_t *);
96
97 #ifdef _ELF32_COMPAT
98 /* Link against the non-compat instances when compiling the 32-bit version. */
99 extern size_t elf_datasz_max;
100 extern size_t elf_zeropg_sz;
101 extern void elf_ctx_resize_scratch(elf_core_ctx_t *, size_t);
102 extern uint_t elf_nphdr_max;
103 extern uint_t elf_nshdr_max;
104 extern size_t elf_shstrtab_max;
105 #else
106 size_t elf_datasz_max = 1 * 1024 * 1024;
107 size_t elf_zeropg_sz = 4 * 1024;
108 uint_t elf_nphdr_max = 1000;
109 uint_t elf_nshdr_max = 10000;
110 size_t elf_shstrtab_max = 100 * 1024;
111 #endif
112
113 static int
114 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
115 {
116 ASSERT(phdrp->p_type == PT_SUNWDTRACE);
117
118 /*
119 * See the comment in fasttrap.h for information on how to safely
120 * update this program header.
121 */
122 if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
123 (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
124 return (-1);
125
126 args->thrptr = phdrp->p_vaddr + base;
127
128 return (0);
129 }
130
131 static int
132 handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
133 {
134 uint_t flag;
135
136 switch (dt) {
137 case DT_SUNW_ASLR:
138 flag = PROC_SEC_ASLR;
139 break;
140 default:
141 return (EINVAL);
142 }
143
144 if (val == 0) {
145 if (secflag_isset(p->p_secflags.psf_lower, flag))
146 return (EPERM);
147 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
148 secflag_isset(p->p_secflags.psf_inherit, flag))
149 return (EPERM);
150
151 secflag_clear(&p->p_secflags.psf_effective, flag);
152 } else {
153 if (!secflag_isset(p->p_secflags.psf_upper, flag))
154 return (EPERM);
155
156 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
157 !secflag_isset(p->p_secflags.psf_inherit, flag))
158 return (EPERM);
159
160 secflag_set(&p->p_secflags.psf_effective, flag);
161 }
162
163 return (0);
164 }
165
166
167 #ifndef _ELF32_COMPAT
168 void
169 elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz)
170 {
171 size_t target = MIN(sz, elf_datasz_max);
172
173 if (target > ctx->ecc_bufsz) {
174 if (ctx->ecc_buf != NULL) {
175 kmem_free(ctx->ecc_buf, ctx->ecc_bufsz);
176 }
177 ctx->ecc_buf = kmem_alloc(target, KM_SLEEP);
178 ctx->ecc_bufsz = target;
179 }
180 }
181 #endif /* _ELF32_COMPAT */
182
183 /*
184 * Map in the executable pointed to by vp. Returns 0 on success. Note that
185 * this function currently has the maximum number of arguments allowed by
186 * modstubs on x86 (MAXNARG)! Do _not_ add to this function signature without
187 * adding to MAXNARG. (Better yet, do not add to this monster of a function
188 * signature!)
189 */
190 int
191 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
192 intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
193 caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
194 {
195 size_t len, phdrsize;
196 struct vattr vat;
197 caddr_t phdrbase = NULL;
198 uint_t nshdrs, shstrndx, nphdrs;
199 int error = 0;
200 Phdr *uphdr = NULL;
201 Phdr *junk = NULL;
202 Phdr *dynphdr = NULL;
203 Phdr *dtrphdr = NULL;
204 char *interp = NULL;
205 uintptr_t lddata, minaddr;
206 size_t execsz;
207
208 if (lddatap != NULL)
209 *lddatap = 0;
210
211 if (minaddrp != NULL)
212 *minaddrp = (uintptr_t)NULL;
213
214 if (error = execpermissions(vp, &vat, args)) {
215 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
216 return (error);
217 }
218
219 if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
220 &nphdrs)) != 0 ||
221 (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
222 &phdrsize)) != 0) {
223 uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
224 return (error);
225 }
226
227 if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
228 uprintf("%s: Nothing to load in %s", exec_file, args->pathname);
229 kmem_free(phdrbase, phdrsize);
230 return (ENOEXEC);
231 }
232 if (lddatap != NULL)
233 *lddatap = lddata;
234
235 if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
236 &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
237 len, &execsz, brksize)) {
238 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
239 if (uphdr != NULL && uphdr->p_flags == 0)
240 kmem_free(uphdr, sizeof (Phdr));
241 kmem_free(phdrbase, phdrsize);
242 return (error);
243 }
244
245 if (minaddrp != NULL)
246 *minaddrp = minaddr;
247
248 /*
249 * If the executable requires an interpreter, determine its name.
250 */
251 if (dynphdr != NULL) {
252 ssize_t resid;
253
254 if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
255 uprintf("%s: Invalid interpreter\n", exec_file);
256 kmem_free(phdrbase, phdrsize);
257 return (ENOEXEC);
258 }
259
260 interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
261
262 if ((error = vn_rdwr(UIO_READ, vp, interp,
263 (ssize_t)dynphdr->p_filesz,
264 (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
265 (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
266 interp[dynphdr->p_filesz - 1] != '\0') {
267 uprintf("%s: Cannot obtain interpreter pathname\n",
268 exec_file);
269 kmem_free(interp, MAXPATHLEN);
270 kmem_free(phdrbase, phdrsize);
271 return (error != 0 ? error : ENOEXEC);
272 }
273 }
274
275 /*
276 * If this is a statically linked executable, voffset should indicate
277 * the address of the executable itself (it normally holds the address
278 * of the interpreter).
279 */
280 if (ehdr->e_type == ET_EXEC && interp == NULL)
281 *voffset = minaddr;
282
283 /*
284 * If the caller has asked for the interpreter name, return it (it's
285 * up to the caller to free it); if the caller hasn't asked for it,
286 * free it ourselves.
287 */
288 if (interpp != NULL) {
289 *interpp = interp;
290 } else if (interp != NULL) {
291 kmem_free(interp, MAXPATHLEN);
292 }
293
294 if (uphdr != NULL) {
295 *uphdr_vaddr = uphdr->p_vaddr;
296
297 if (uphdr->p_flags == 0)
298 kmem_free(uphdr, sizeof (Phdr));
299 } else if (ehdr->e_type == ET_DYN) {
300 /*
301 * If we don't have a uphdr, we'll apply the logic found
302 * in mapelfexec() and use the p_vaddr of the first PT_LOAD
303 * section as the base address of the object.
304 */
305 const Phdr *phdr = (Phdr *)phdrbase;
306 const uint_t hsize = ehdr->e_phentsize;
307 uint_t i;
308
309 for (i = nphdrs; i > 0; i--) {
310 if (phdr->p_type == PT_LOAD) {
311 *uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
312 ehdr->e_phoff;
313 break;
314 }
315
316 phdr = (Phdr *)((caddr_t)phdr + hsize);
317 }
318
319 /*
320 * If we don't have a PT_LOAD segment, we should have returned
321 * ENOEXEC when elfsize() returned 0, above.
322 */
323 VERIFY(i > 0);
324 } else {
325 *uphdr_vaddr = (Addr)-1;
326 }
327
328 kmem_free(phdrbase, phdrsize);
329 return (error);
330 }
331
332 /*ARGSUSED*/
333 int
334 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
335 int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred,
336 int *brand_action)
337 {
338 caddr_t phdrbase = NULL;
339 caddr_t bssbase = 0;
340 caddr_t brkbase = 0;
341 size_t brksize = 0;
342 size_t dlnsize, nsize = 0;
343 aux_entry_t *aux;
344 int error;
345 ssize_t resid;
346 int fd = -1;
347 intptr_t voffset;
348 Phdr *intphdr = NULL;
349 Phdr *dynamicphdr = NULL;
350 Phdr *stphdr = NULL;
351 Phdr *uphdr = NULL;
352 Phdr *junk = NULL;
353 size_t len;
354 size_t postfixsize = 0;
355 size_t i;
356 Phdr *phdrp;
357 Phdr *dataphdrp = NULL;
358 Phdr *dtrphdr;
359 Phdr *capphdr = NULL;
360 Cap *cap = NULL;
361 size_t capsize;
362 int hasu = 0;
363 int hasauxv = 0;
364 int hasintp = 0;
365 int branded = 0;
366 int dynuphdr = 0;
367
368 struct proc *p = ttoproc(curthread);
369 struct user *up = PTOU(p);
370 struct bigwad {
371 Ehdr ehdr;
372 aux_entry_t elfargs[__KERN_NAUXV_IMPL];
373 char dl_name[MAXPATHLEN];
374 char pathbuf[MAXPATHLEN];
375 struct vattr vattr;
376 struct execenv exenv;
377 } *bigwad; /* kmem_alloc this behemoth so we don't blow stack */
378 Ehdr *ehdrp;
379 uint_t nshdrs, shstrndx, nphdrs;
380 size_t phdrsize;
381 char *dlnp;
382 char *pathbufp;
383 rlim64_t limit;
384 rlim64_t roundlimit;
385
386 ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
387
388 bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
389 ehdrp = &bigwad->ehdr;
390 dlnp = bigwad->dl_name;
391 pathbufp = bigwad->pathbuf;
392
393 /*
394 * Obtain ELF and program header information.
395 */
396 if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
397 &nphdrs)) != 0 ||
398 (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
399 &phdrsize)) != 0)
400 goto out;
401
402 /*
403 * Prevent executing an ELF file that has no entry point.
404 */
405 if (ehdrp->e_entry == 0) {
406 uprintf("%s: Bad entry point\n", exec_file);
407 goto bad;
408 }
409
410 /*
411 * Put data model that we're exec-ing to into the args passed to
412 * exec_args(), so it will know what it is copying to on new stack.
413 * Now that we know whether we are exec-ing a 32-bit or 64-bit
414 * executable, we can set execsz with the appropriate NCARGS.
415 */
416 #ifdef _LP64
417 if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
418 args->to_model = DATAMODEL_ILP32;
419 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
420 } else {
421 args->to_model = DATAMODEL_LP64;
422 if (!args->stk_prot_override) {
423 args->stk_prot &= ~PROT_EXEC;
424 }
425 #if defined(__x86)
426 args->dat_prot &= ~PROT_EXEC;
427 #endif
428 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
429 }
430 #else /* _LP64 */
431 args->to_model = DATAMODEL_ILP32;
432 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
433 #endif /* _LP64 */
434
435 /*
436 * We delay invoking the brand callback until we've figured out what
437 * kind of elf binary we're trying to run, 32-bit or 64-bit. We do this
438 * because now the brand library can just check args->to_model to see if
439 * the target is 32-bit or 64-bit without having do duplicate all the
440 * code above.
441 *
442 * We also give the brand a chance to indicate that based on the ELF
443 * OSABI of the target binary it should become unbranded and optionally
444 * indicate that it should be treated as existing in a specific prefix.
445 *
446 * Note that if a brand opts to go down this route it does not actually
447 * end up being debranded. In other words, future programs that exec
448 * will still be considered for branding unless this escape hatch is
449 * used. Consider the case of lx brand for example. If a user runs
450 * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable
451 * of DTrace that's in /native will take this escape hatch and be run
452 * and interpreted using the normal system call table; however, the
453 * execution of a non-illumos binary in the form of /bin/ls will still
454 * be branded and be subject to all of the normal actions of the brand.
455 *
456 * The level checks associated with brand handling below are used to
457 * prevent a loop since the brand elfexec function typically comes back
458 * through this function. We must check <= here since the nested
459 * handling in the #! interpreter code will increment the level before
460 * calling gexec to run the final elfexec interpreter.
461 */
462 if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
463 (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
464 if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
465 &args->brand_nroot) == B_TRUE) {
466 ASSERT(ehdrp->e_ident[EI_OSABI]);
467 *brand_action = EBA_NATIVE;
468 /* Add one for the trailing '/' in the path */
469 if (args->brand_nroot != NULL)
470 nsize = strlen(args->brand_nroot) + 1;
471 }
472 }
473
474 if ((level <= INTP_MAXDEPTH) &&
475 (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
476 error = BROP(p)->b_elfexec(vp, uap, args,
477 idatap, level + 1, execsz, setid, exec_file, cred,
478 brand_action);
479 goto out;
480 }
481
482 /*
483 * Determine aux size now so that stack can be built
484 * in one shot (except actual copyout of aux image),
485 * determine any non-default stack protections,
486 * and still have this code be machine independent.
487 */
488 const uint_t hsize = ehdrp->e_phentsize;
489 phdrp = (Phdr *)phdrbase;
490 for (i = nphdrs; i > 0; i--) {
491 switch (phdrp->p_type) {
492 case PT_INTERP:
493 hasauxv = hasintp = 1;
494 break;
495 case PT_PHDR:
496 hasu = 1;
497 break;
498 case PT_SUNWSTACK:
499 args->stk_prot = PROT_USER;
500 if (phdrp->p_flags & PF_R)
501 args->stk_prot |= PROT_READ;
502 if (phdrp->p_flags & PF_W)
503 args->stk_prot |= PROT_WRITE;
504 if (phdrp->p_flags & PF_X)
505 args->stk_prot |= PROT_EXEC;
506 break;
507 case PT_LOAD:
508 dataphdrp = phdrp;
509 break;
510 case PT_SUNWCAP:
511 capphdr = phdrp;
512 break;
513 case PT_DYNAMIC:
514 dynamicphdr = phdrp;
515 break;
516 }
517 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
518 }
519
520 if (ehdrp->e_type != ET_EXEC) {
521 dataphdrp = NULL;
522 hasauxv = 1;
523 }
524
525 /* Copy BSS permissions to args->dat_prot */
526 if (dataphdrp != NULL) {
527 args->dat_prot = PROT_USER;
528 if (dataphdrp->p_flags & PF_R)
529 args->dat_prot |= PROT_READ;
530 if (dataphdrp->p_flags & PF_W)
531 args->dat_prot |= PROT_WRITE;
532 if (dataphdrp->p_flags & PF_X)
533 args->dat_prot |= PROT_EXEC;
534 }
535
536 /*
537 * If a auxvector will be required - reserve the space for
538 * it now. This may be increased by exec_args if there are
539 * ISA-specific types (included in __KERN_NAUXV_IMPL).
540 */
541 if (hasauxv) {
542 /*
543 * If a AUX vector is being built - the base AUX
544 * entries are:
545 *
546 * AT_BASE
547 * AT_FLAGS
548 * AT_PAGESZ
549 * AT_RANDOM (added in stk_copyout)
550 * AT_SUN_AUXFLAGS
551 * AT_SUN_HWCAP
552 * AT_SUN_HWCAP2
553 * AT_SUN_PLATFORM (added in stk_copyout)
554 * AT_SUN_EXECNAME (added in stk_copyout)
555 * AT_NULL
556 *
557 * total == 10
558 */
559 if (hasintp && hasu) {
560 /*
561 * Has PT_INTERP & PT_PHDR - the auxvectors that
562 * will be built are:
563 *
564 * AT_PHDR
565 * AT_PHENT
566 * AT_PHNUM
567 * AT_ENTRY
568 * AT_LDDATA
569 *
570 * total = 5
571 */
572 args->auxsize = (10 + 5) * sizeof (aux_entry_t);
573 } else if (hasintp) {
574 /*
575 * Has PT_INTERP but no PT_PHDR
576 *
577 * AT_EXECFD
578 * AT_LDDATA
579 *
580 * total = 2
581 */
582 args->auxsize = (10 + 2) * sizeof (aux_entry_t);
583 } else {
584 args->auxsize = 10 * sizeof (aux_entry_t);
585 }
586 } else {
587 args->auxsize = 0;
588 }
589
590 /*
591 * If this binary is using an emulator, we need to add an
592 * AT_SUN_EMULATOR aux entry.
593 */
594 if (args->emulator != NULL)
595 args->auxsize += sizeof (aux_entry_t);
596
597 /*
598 * If this is a native binary that's been given a modified interpreter
599 * root, inform it that the native system exists at that root.
600 */
601 if (args->brand_nroot != NULL) {
602 args->auxsize += sizeof (aux_entry_t);
603 }
604
605
606 /*
607 * On supported kernels (x86_64) make room in the auxv for the
608 * AT_SUN_COMMPAGE entry. This will go unpopulated on i86xpv systems
609 * which do not provide such functionality.
610 *
611 * Additionally cover the floating point information AT_SUN_FPSIZE and
612 * AT_SUN_FPTYPE.
613 */
614 #if defined(__amd64)
615 args->auxsize += 3 * sizeof (aux_entry_t);
616 #endif /* defined(__amd64) */
617
618 /*
619 * If we have user credentials, we'll supply the following entries:
620 * AT_SUN_UID
621 * AT_SUN_RUID
622 * AT_SUN_GID
623 * AT_SUN_RGID
624 */
625 if (cred != NULL) {
626 args->auxsize += 4 * sizeof (aux_entry_t);
627 }
628
629 if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
630 branded = 1;
631 /*
632 * We will be adding 5 entries to the aux vectors. One for
633 * the the brandname and 4 for the brand specific aux vectors.
634 */
635 args->auxsize += 5 * sizeof (aux_entry_t);
636 }
637
638 /* If the binary has an explicit ASLR flag, it must be honoured */
639 if ((dynamicphdr != NULL) && (dynamicphdr->p_filesz > 0)) {
640 const size_t dynfilesz = dynamicphdr->p_filesz;
641 const size_t dynoffset = dynamicphdr->p_offset;
642 Dyn *dyn, *dp;
643
644 if (dynoffset > MAXOFFSET_T ||
645 dynfilesz > MAXOFFSET_T ||
646 dynoffset + dynfilesz > MAXOFFSET_T) {
647 uprintf("%s: cannot read full .dynamic section\n",
648 exec_file);
649 error = EINVAL;
650 goto out;
651 }
652
653 #define DYN_STRIDE 100
654 for (i = 0; i < dynfilesz; i += sizeof (*dyn) * DYN_STRIDE) {
655 const size_t remdyns = (dynfilesz - i) / sizeof (*dyn);
656 const size_t ndyns = MIN(DYN_STRIDE, remdyns);
657 const size_t dynsize = ndyns * sizeof (*dyn);
658
659 dyn = kmem_alloc(dynsize, KM_SLEEP);
660
661 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
662 (ssize_t)dynsize, (offset_t)(dynoffset + i),
663 UIO_SYSSPACE, 0, (rlim64_t)0,
664 CRED(), NULL)) != 0) {
665 uprintf("%s: cannot read .dynamic section\n",
666 exec_file);
667 goto out;
668 }
669
670 for (dp = dyn; dp < (dyn + ndyns); dp++) {
671 if (dp->d_tag == DT_SUNW_ASLR) {
672 if ((error = handle_secflag_dt(p,
673 DT_SUNW_ASLR,
674 dp->d_un.d_val)) != 0) {
675 uprintf("%s: error setting "
676 "security-flag from "
677 "DT_SUNW_ASLR: %d\n",
678 exec_file, error);
679 goto out;
680 }
681 }
682 }
683
684 kmem_free(dyn, dynsize);
685 }
686 }
687
688 /* Hardware/Software capabilities */
689 if (capphdr != NULL &&
690 (capsize = capphdr->p_filesz) > 0 &&
691 capsize <= 16 * sizeof (*cap)) {
692 const uint_t ncaps = capsize / sizeof (*cap);
693 Cap *cp;
694
695 cap = kmem_alloc(capsize, KM_SLEEP);
696 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
697 (ssize_t)capsize, (offset_t)capphdr->p_offset,
698 UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), NULL)) != 0) {
699 uprintf("%s: Cannot read capabilities section\n",
700 exec_file);
701 goto out;
702 }
703 for (cp = cap; cp < cap + ncaps; cp++) {
704 if (cp->c_tag == CA_SUNW_SF_1 &&
705 (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
706 if (args->to_model == DATAMODEL_LP64)
707 args->addr32 = 1;
708 break;
709 }
710 }
711 }
712
713 aux = bigwad->elfargs;
714 /*
715 * Move args to the user's stack.
716 * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM
717 * aux entries.
718 */
719 if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
720 if (error == -1) {
721 error = ENOEXEC;
722 goto bad;
723 }
724 goto out;
725 }
726 /* we're single threaded after this point */
727
728 /*
729 * If this is an ET_DYN executable (shared object),
730 * determine its memory size so that mapelfexec() can load it.
731 */
732 if (ehdrp->e_type == ET_DYN)
733 len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
734 else
735 len = 0;
736
737 dtrphdr = NULL;
738
739 error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
740 &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
741 len, execsz, &brksize);
742 /*
743 * Our uphdr has been dynamically allocated if (and only if) its
744 * program header flags are clear. To avoid leaks, this must be
745 * checked regardless of whether mapelfexec() emitted an error.
746 */
747 dynuphdr = (uphdr != NULL && uphdr->p_flags == 0);
748
749 if (error != 0) {
750 goto bad;
751 }
752
753 if (uphdr != NULL && intphdr == NULL)
754 goto bad;
755
756 if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
757 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
758 goto bad;
759 }
760
761 if (intphdr != NULL) {
762 size_t len;
763 uintptr_t lddata;
764 char *p;
765 struct vnode *nvp;
766
767 dlnsize = intphdr->p_filesz + nsize;
768
769 /*
770 * Make sure none of the component pieces of dlnsize result in
771 * an oversized or zeroed result.
772 */
773 if (intphdr->p_filesz > MAXPATHLEN || dlnsize > MAXPATHLEN ||
774 dlnsize == 0 || dlnsize < intphdr->p_filesz) {
775 goto bad;
776 }
777
778 if (nsize != 0) {
779 bcopy(args->brand_nroot, dlnp, nsize - 1);
780 dlnp[nsize - 1] = '/';
781 }
782
783 /*
784 * Read in "interpreter" pathname.
785 */
786 if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
787 (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset,
788 UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
789 uprintf("%s: Cannot obtain interpreter pathname\n",
790 exec_file);
791 goto bad;
792 }
793
794 if (resid != 0 || dlnp[dlnsize - 1] != '\0')
795 goto bad;
796
797 /*
798 * Search for '$ORIGIN' token in interpreter path.
799 * If found, expand it.
800 */
801 for (p = dlnp; p = strchr(p, '$'); ) {
802 uint_t len, curlen;
803 char *_ptr;
804
805 if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
806 continue;
807
808 /*
809 * We don't support $ORIGIN on setid programs to close
810 * a potential attack vector.
811 */
812 if ((setid & EXECSETID_SETID) != 0) {
813 error = ENOEXEC;
814 goto bad;
815 }
816
817 curlen = 0;
818 len = p - dlnp - 1;
819 if (len) {
820 bcopy(dlnp, pathbufp, len);
821 curlen += len;
822 }
823 if (_ptr = strrchr(args->pathname, '/')) {
824 len = _ptr - args->pathname;
825 if ((curlen + len) > MAXPATHLEN)
826 break;
827
828 bcopy(args->pathname, &pathbufp[curlen], len);
829 curlen += len;
830 } else {
831 /*
832 * executable is a basename found in the
833 * current directory. So - just substitue
834 * '.' for ORIGIN.
835 */
836 pathbufp[curlen] = '.';
837 curlen++;
838 }
839 p += ORIGIN_STR_SIZE;
840 len = strlen(p);
841
842 if ((curlen + len) > MAXPATHLEN)
843 break;
844 bcopy(p, &pathbufp[curlen], len);
845 curlen += len;
846 pathbufp[curlen++] = '\0';
847 bcopy(pathbufp, dlnp, curlen);
848 }
849
850 /*
851 * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
852 * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
853 * Just in case /usr is not mounted, change it now.
854 */
855 if (strcmp(dlnp, USR_LIB_RTLD) == 0)
856 dlnp += 4;
857 error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
858 if (error && dlnp != bigwad->dl_name) {
859 /* new kernel, old user-level */
860 error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
861 NULLVPP, &nvp);
862 }
863 if (error) {
864 uprintf("%s: Cannot find %s\n", exec_file, dlnp);
865 goto bad;
866 }
867
868 /*
869 * Setup the "aux" vector.
870 */
871 if (uphdr) {
872 if (ehdrp->e_type == ET_DYN) {
873 /* don't use the first page */
874 bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
875 bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
876 } else {
877 bigwad->exenv.ex_bssbase = bssbase;
878 bigwad->exenv.ex_brkbase = brkbase;
879 }
880 bigwad->exenv.ex_brksize = brksize;
881 bigwad->exenv.ex_magic = elfmagic;
882 bigwad->exenv.ex_vp = vp;
883 setexecenv(&bigwad->exenv);
884
885 ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
886 ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
887 ADDAUX(aux, AT_PHNUM, nphdrs)
888 ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
889 } else {
890 if ((error = execopen(&vp, &fd)) != 0) {
891 VN_RELE(nvp);
892 goto bad;
893 }
894
895 ADDAUX(aux, AT_EXECFD, fd)
896 }
897
898 if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
899 VN_RELE(nvp);
900 uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
901 goto bad;
902 }
903
904 /*
905 * Now obtain the ELF header along with the entire program
906 * header contained in "nvp".
907 */
908 kmem_free(phdrbase, phdrsize);
909 phdrbase = NULL;
910 if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
911 &shstrndx, &nphdrs)) != 0 ||
912 (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
913 &phdrsize)) != 0) {
914 VN_RELE(nvp);
915 uprintf("%s: Cannot read %s\n", exec_file, dlnp);
916 goto bad;
917 }
918
919 /*
920 * Determine memory size of the "interpreter's" loadable
921 * sections. This size is then used to obtain the virtual
922 * address of a hole, in the user's address space, large
923 * enough to map the "interpreter".
924 */
925 if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
926 VN_RELE(nvp);
927 uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
928 goto bad;
929 }
930
931 dtrphdr = NULL;
932
933 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
934 &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
935 execsz, NULL);
936
937 if (error || junk != NULL) {
938 VN_RELE(nvp);
939 uprintf("%s: Cannot map %s\n", exec_file, dlnp);
940 goto bad;
941 }
942
943 /*
944 * We use the DTrace program header to initialize the
945 * architecture-specific user per-LWP location. The dtrace
946 * fasttrap provider requires ready access to per-LWP scratch
947 * space. We assume that there is only one such program header
948 * in the interpreter.
949 */
950 if (dtrphdr != NULL &&
951 dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
952 VN_RELE(nvp);
953 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
954 goto bad;
955 }
956
957 VN_RELE(nvp);
958 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
959 }
960
961 if (hasauxv) {
962 int auxf = AF_SUN_HWCAPVERIFY;
963 #if defined(__amd64)
964 size_t fpsize;
965 int fptype;
966 #endif /* defined(__amd64) */
967
968 /*
969 * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were
970 * filled in via exec_args()
971 */
972 ADDAUX(aux, AT_BASE, voffset)
973 ADDAUX(aux, AT_FLAGS, at_flags)
974 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
975 /*
976 * Linker flags. (security)
977 * p_flag not yet set at this time.
978 * We rely on gexec() to provide us with the information.
979 * If the application is set-uid but this is not reflected
980 * in a mismatch between real/effective uids/gids, then
981 * don't treat this as a set-uid exec. So we care about
982 * the EXECSETID_UGIDS flag but not the ...SETID flag.
983 */
984 if ((setid &= ~EXECSETID_SETID) != 0)
985 auxf |= AF_SUN_SETUGID;
986
987 /*
988 * If we're running a native process from within a branded
989 * zone under pfexec then we clear the AF_SUN_SETUGID flag so
990 * that the native ld.so.1 is able to link with the native
991 * libraries instead of using the brand libraries that are
992 * installed in the zone. We only do this for processes
993 * which we trust because we see they are already running
994 * under pfexec (where uid != euid). This prevents a
995 * malicious user within the zone from crafting a wrapper to
996 * run native suid commands with unsecure libraries interposed.
997 */
998 if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
999 (setid &= ~EXECSETID_SETID) != 0))
1000 auxf &= ~AF_SUN_SETUGID;
1001
1002 /*
1003 * Record the user addr of the auxflags aux vector entry
1004 * since brands may optionally want to manipulate this field.
1005 */
1006 args->auxp_auxflags =
1007 (char *)((char *)args->stackend +
1008 ((char *)&aux->a_type -
1009 (char *)bigwad->elfargs));
1010 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
1011
1012 /*
1013 * Record information about the real and effective user and
1014 * group IDs.
1015 */
1016 if (cred != NULL) {
1017 ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
1018 ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
1019 ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
1020 ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
1021 }
1022
1023 /*
1024 * Hardware capability flag word (performance hints)
1025 * Used for choosing faster library routines.
1026 * (Potentially different between 32-bit and 64-bit ABIs)
1027 */
1028 #if defined(_LP64)
1029 if (args->to_model == DATAMODEL_NATIVE) {
1030 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
1031 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
1032 } else {
1033 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
1034 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
1035 }
1036 #else
1037 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
1038 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
1039 #endif
1040 if (branded) {
1041 /*
1042 * Reserve space for the brand-private aux vectors,
1043 * and record the user addr of that space.
1044 */
1045 args->auxp_brand =
1046 (char *)((char *)args->stackend +
1047 ((char *)&aux->a_type -
1048 (char *)bigwad->elfargs));
1049 ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
1050 ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
1051 ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
1052 ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
1053 }
1054
1055 /*
1056 * Add the comm page auxv entry, mapping it in if needed. Also
1057 * take care of the FPU entries.
1058 */
1059 #if defined(__amd64)
1060 if (args->commpage != (uintptr_t)NULL ||
1061 (args->commpage = (uintptr_t)comm_page_mapin()) !=
1062 (uintptr_t)NULL) {
1063 ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
1064 } else {
1065 /*
1066 * If the comm page cannot be mapped, pad out the auxv
1067 * to satisfy later size checks.
1068 */
1069 ADDAUX(aux, AT_NULL, 0)
1070 }
1071
1072 fptype = AT_386_FPINFO_NONE;
1073 fpu_auxv_info(&fptype, &fpsize);
1074 if (fptype != AT_386_FPINFO_NONE) {
1075 ADDAUX(aux, AT_SUN_FPTYPE, fptype)
1076 ADDAUX(aux, AT_SUN_FPSIZE, fpsize)
1077 } else {
1078 ADDAUX(aux, AT_NULL, 0)
1079 ADDAUX(aux, AT_NULL, 0)
1080 }
1081 #endif /* defined(__amd64) */
1082
1083 ADDAUX(aux, AT_NULL, 0)
1084 postfixsize = (uintptr_t)aux - (uintptr_t)bigwad->elfargs;
1085
1086 /*
1087 * We make assumptions above when we determine how many aux
1088 * vector entries we will be adding. However, if we have an
1089 * invalid elf file, it is possible that mapelfexec might
1090 * behave differently (but not return an error), in which case
1091 * the number of aux entries we actually add will be different.
1092 * We detect that now and error out.
1093 */
1094 if (postfixsize != args->auxsize) {
1095 DTRACE_PROBE2(elfexec_badaux, size_t, postfixsize,
1096 size_t, args->auxsize);
1097 goto bad;
1098 }
1099 ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
1100 }
1101
1102 /*
1103 * For the 64-bit kernel, the limit is big enough that rounding it up
1104 * to a page can overflow the 64-bit limit, so we check for btopr()
1105 * overflowing here by comparing it with the unrounded limit in pages.
1106 * If it hasn't overflowed, compare the exec size with the rounded up
1107 * limit in pages. Otherwise, just compare with the unrounded limit.
1108 */
1109 limit = btop(p->p_vmem_ctl);
1110 roundlimit = btopr(p->p_vmem_ctl);
1111 if ((roundlimit > limit && *execsz > roundlimit) ||
1112 (roundlimit < limit && *execsz > limit)) {
1113 mutex_enter(&p->p_lock);
1114 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1115 RCA_SAFE);
1116 mutex_exit(&p->p_lock);
1117 error = ENOMEM;
1118 goto bad;
1119 }
1120
1121 bzero(up->u_auxv, sizeof (up->u_auxv));
1122 up->u_commpagep = args->commpage;
1123 if (postfixsize) {
1124 size_t num_auxv;
1125
1126 /*
1127 * Copy the aux vector to the user stack.
1128 */
1129 error = execpoststack(args, bigwad->elfargs, postfixsize);
1130 if (error)
1131 goto bad;
1132
1133 /*
1134 * Copy auxv to the process's user structure for use by /proc.
1135 * If this is a branded process, the brand's exec routine will
1136 * copy it's private entries to the user structure later. It
1137 * relies on the fact that the blank entries are at the end.
1138 */
1139 num_auxv = postfixsize / sizeof (aux_entry_t);
1140 ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
1141 aux = bigwad->elfargs;
1142 for (i = 0; i < num_auxv; i++) {
1143 up->u_auxv[i].a_type = aux[i].a_type;
1144 up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
1145 }
1146 }
1147
1148 /*
1149 * Pass back the starting address so we can set the program counter.
1150 */
1151 args->entry = (uintptr_t)(ehdrp->e_entry + voffset);
1152
1153 if (!uphdr) {
1154 if (ehdrp->e_type == ET_DYN) {
1155 /*
1156 * If we are executing a shared library which doesn't
1157 * have a interpreter (probably ld.so.1) then
1158 * we don't set the brkbase now. Instead we
1159 * delay it's setting until the first call
1160 * via grow.c::brk(). This permits ld.so.1 to
1161 * initialize brkbase to the tail of the executable it
1162 * loads (which is where it needs to be).
1163 */
1164 bigwad->exenv.ex_brkbase = (caddr_t)0;
1165 bigwad->exenv.ex_bssbase = (caddr_t)0;
1166 bigwad->exenv.ex_brksize = 0;
1167 } else {
1168 bigwad->exenv.ex_brkbase = brkbase;
1169 bigwad->exenv.ex_bssbase = bssbase;
1170 bigwad->exenv.ex_brksize = brksize;
1171 }
1172 bigwad->exenv.ex_magic = elfmagic;
1173 bigwad->exenv.ex_vp = vp;
1174 setexecenv(&bigwad->exenv);
1175 }
1176
1177 ASSERT(error == 0);
1178 goto out;
1179
1180 bad:
1181 if (fd != -1) /* did we open the a.out yet */
1182 (void) execclose(fd);
1183
1184 psignal(p, SIGKILL);
1185
1186 if (error == 0)
1187 error = ENOEXEC;
1188 out:
1189 if (dynuphdr)
1190 kmem_free(uphdr, sizeof (Phdr));
1191 if (phdrbase != NULL)
1192 kmem_free(phdrbase, phdrsize);
1193 if (cap != NULL)
1194 kmem_free(cap, capsize);
1195 kmem_free(bigwad, sizeof (struct bigwad));
1196 return (error);
1197 }
1198
1199 /*
1200 * Compute the memory size requirement for the ELF file.
1201 */
1202 static size_t
1203 elfsize(const Ehdr *ehdrp, uint_t nphdrs, const caddr_t phdrbase,
1204 uintptr_t *lddata)
1205 {
1206 const Phdr *phdrp = (Phdr *)phdrbase;
1207 const uint_t hsize = ehdrp->e_phentsize;
1208 boolean_t dfirst = B_TRUE;
1209 uintptr_t loaddr = UINTPTR_MAX;
1210 uintptr_t hiaddr = 0;
1211 uint_t i;
1212
1213 for (i = nphdrs; i > 0; i--) {
1214 if (phdrp->p_type == PT_LOAD) {
1215 const uintptr_t lo = phdrp->p_vaddr;
1216 const uintptr_t hi = lo + phdrp->p_memsz;
1217
1218 loaddr = MIN(lo, loaddr);
1219 hiaddr = MAX(hi, hiaddr);
1220
1221 /*
1222 * save the address of the first data segment
1223 * of a object - used for the AT_SUNW_LDDATA
1224 * aux entry.
1225 */
1226 if ((lddata != NULL) && dfirst &&
1227 (phdrp->p_flags & PF_W)) {
1228 *lddata = lo;
1229 dfirst = B_FALSE;
1230 }
1231 }
1232 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
1233 }
1234
1235 if (hiaddr <= loaddr) {
1236 /* No non-zero PT_LOAD segment found */
1237 return (0);
1238 }
1239
1240 return (roundup(hiaddr - (loaddr & PAGEMASK), PAGESIZE));
1241 }
1242
1243 /*
1244 * Read in the ELF header and program header table.
1245 * SUSV3 requires:
1246 * ENOEXEC File format is not recognized
1247 * EINVAL Format recognized but execution not supported
1248 */
1249 static int
1250 getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs,
1251 uint_t *shstrndx, uint_t *nphdrs)
1252 {
1253 int error;
1254 ssize_t resid;
1255
1256 /*
1257 * We got here by the first two bytes in ident,
1258 * now read the entire ELF header.
1259 */
1260 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, sizeof (Ehdr),
1261 (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid)) != 0) {
1262 return (error);
1263 }
1264
1265 /*
1266 * Since a separate version is compiled for handling 32-bit and
1267 * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
1268 * doesn't need to be able to deal with 32-bit ELF files.
1269 */
1270 if (resid != 0 ||
1271 ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
1272 ehdr->e_ident[EI_MAG3] != ELFMAG3) {
1273 return (ENOEXEC);
1274 }
1275
1276 if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
1277 #if defined(_ILP32) || defined(_ELF32_COMPAT)
1278 ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
1279 #else
1280 ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
1281 #endif
1282 !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
1283 ehdr->e_flags)) {
1284 return (EINVAL);
1285 }
1286
1287 *nshdrs = ehdr->e_shnum;
1288 *shstrndx = ehdr->e_shstrndx;
1289 *nphdrs = ehdr->e_phnum;
1290
1291 /*
1292 * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
1293 * to read in the section header at index zero to access the true
1294 * values for those fields.
1295 */
1296 if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
1297 *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
1298 Shdr shdr;
1299
1300 if (ehdr->e_shoff == 0)
1301 return (EINVAL);
1302
1303 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
1304 sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
1305 (rlim64_t)0, credp, NULL)) != 0)
1306 return (error);
1307
1308 if (*nshdrs == 0)
1309 *nshdrs = shdr.sh_size;
1310 if (*shstrndx == SHN_XINDEX)
1311 *shstrndx = shdr.sh_link;
1312 if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
1313 *nphdrs = shdr.sh_info;
1314 }
1315
1316 return (0);
1317 }
1318
1319 /*
1320 * We use members through p_flags on 32-bit files and p_memsz on 64-bit files,
1321 * so e_phentsize must be at least large enough to include those members.
1322 */
1323 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1324 #define MINPHENTSZ (offsetof(Phdr, p_flags) + \
1325 sizeof (((Phdr *)NULL)->p_flags))
1326 #else
1327 #define MINPHENTSZ (offsetof(Phdr, p_memsz) + \
1328 sizeof (((Phdr *)NULL)->p_memsz))
1329 #endif
1330
1331 static int
1332 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nphdrs,
1333 caddr_t *phbasep, size_t *phsizep)
1334 {
1335 int err;
1336
1337 /*
1338 * Ensure that e_phentsize is large enough for required fields to be
1339 * accessible and will maintain 8-byte alignment.
1340 */
1341 if (ehdr->e_phentsize < MINPHENTSZ || (ehdr->e_phentsize & 3))
1342 return (EINVAL);
1343
1344 *phsizep = nphdrs * ehdr->e_phentsize;
1345
1346 if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1347 if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1348 return (ENOMEM);
1349 } else {
1350 *phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1351 }
1352
1353 if ((err = vn_rdwr(UIO_READ, vp, *phbasep, (ssize_t)*phsizep,
1354 (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1355 credp, NULL)) != 0) {
1356 kmem_free(*phbasep, *phsizep);
1357 *phbasep = NULL;
1358 return (err);
1359 }
1360
1361 return (0);
1362 }
1363
1364 #define MINSHDRSZ (offsetof(Shdr, sh_entsize) + \
1365 sizeof (((Shdr *)NULL)->sh_entsize))
1366
1367 static int
1368 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nshdrs,
1369 uint_t shstrndx, caddr_t *shbasep, size_t *shsizep, char **shstrbasep,
1370 size_t *shstrsizep)
1371 {
1372 int err;
1373 Shdr *shdr;
1374
1375 /*
1376 * Since we're going to be using e_shentsize to iterate down the
1377 * array of section headers, it must be 8-byte aligned or else
1378 * a we might cause a misaligned access. We use all members through
1379 * sh_entsize (on both 32- and 64-bit ELF files) so e_shentsize
1380 * must be at least large enough to include that member. The index
1381 * of the string table section must also be valid.
1382 */
1383 if (ehdr->e_shentsize < MINSHDRSZ || (ehdr->e_shentsize & 3) ||
1384 nshdrs == 0 || shstrndx >= nshdrs)
1385 return (EINVAL);
1386
1387 *shsizep = nshdrs * ehdr->e_shentsize;
1388
1389 if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1390 if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1391 return (ENOMEM);
1392 } else {
1393 *shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1394 }
1395
1396 if ((err = vn_rdwr(UIO_READ, vp, *shbasep, (ssize_t)*shsizep,
1397 (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1398 credp, NULL)) != 0) {
1399 kmem_free(*shbasep, *shsizep);
1400 return (err);
1401 }
1402
1403 /*
1404 * Grab the section string table. Walking through the shdrs is
1405 * pointless if their names cannot be interrogated.
1406 */
1407 shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1408 if ((*shstrsizep = shdr->sh_size) == 0) {
1409 kmem_free(*shbasep, *shsizep);
1410 return (EINVAL);
1411 }
1412
1413 if (*shstrsizep > elf_shstrtab_max) {
1414 if ((*shstrbasep = kmem_alloc(*shstrsizep,
1415 KM_NOSLEEP)) == NULL) {
1416 kmem_free(*shbasep, *shsizep);
1417 return (ENOMEM);
1418 }
1419 } else {
1420 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1421 }
1422
1423 if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, (ssize_t)*shstrsizep,
1424 (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1425 credp, NULL)) != 0) {
1426 kmem_free(*shbasep, *shsizep);
1427 kmem_free(*shstrbasep, *shstrsizep);
1428 return (err);
1429 }
1430
1431 /*
1432 * Make sure the strtab is null-terminated to make sure we
1433 * don't run off the end of the table.
1434 */
1435 (*shstrbasep)[*shstrsizep - 1] = '\0';
1436
1437 return (0);
1438 }
1439
1440
1441 int
1442 elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, uint_t *nphdrs,
1443 caddr_t *phbasep, size_t *phsizep)
1444 {
1445 int error;
1446 uint_t nshdrs, shstrndx;
1447
1448 if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
1449 nphdrs)) != 0 ||
1450 (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
1451 phsizep)) != 0) {
1452 return (error);
1453 }
1454 return (0);
1455 }
1456
1457
1458 static int
1459 mapelfexec(
1460 vnode_t *vp,
1461 Ehdr *ehdr,
1462 uint_t nphdrs,
1463 caddr_t phdrbase,
1464 Phdr **uphdr,
1465 Phdr **intphdr,
1466 Phdr **stphdr,
1467 Phdr **dtphdr,
1468 Phdr *dataphdrp,
1469 caddr_t *bssbase,
1470 caddr_t *brkbase,
1471 intptr_t *voffset,
1472 uintptr_t *minaddrp,
1473 size_t len,
1474 size_t *execsz,
1475 size_t *brksize)
1476 {
1477 Phdr *phdr;
1478 int error, page, prot, lastprot = 0;
1479 caddr_t addr = NULL;
1480 caddr_t minaddr = (caddr_t)UINTPTR_MAX;
1481 uint_t i;
1482 size_t zfodsz, memsz;
1483 boolean_t ptload = B_FALSE;
1484 off_t offset;
1485 const uint_t hsize = ehdr->e_phentsize;
1486 uintptr_t lastaddr = 0;
1487 extern int use_brk_lpg;
1488
1489 if (ehdr->e_type == ET_DYN) {
1490 caddr_t vaddr;
1491 secflagset_t flags = 0;
1492 /*
1493 * Obtain the virtual address of a hole in the
1494 * address space to map the "interpreter".
1495 */
1496 if (secflag_enabled(curproc, PROC_SEC_ASLR))
1497 flags |= _MAP_RANDOMIZE;
1498
1499 map_addr(&addr, len, (offset_t)0, 1, flags);
1500 if (addr == NULL)
1501 return (ENOMEM);
1502
1503 /*
1504 * Despite the fact that mmapobj(2) refuses to load them, we
1505 * need to support executing ET_DYN objects that have a
1506 * non-NULL p_vaddr. When found in the wild, these objects
1507 * are likely to be due to an old (and largely obviated) Linux
1508 * facility, prelink(8), that rewrites shared objects to
1509 * prefer specific (disjoint) virtual address ranges. (Yes,
1510 * this is putatively for performance -- and yes, it has
1511 * limited applicability, many edge conditions and grisly
1512 * failure modes; even for Linux, it's insane.) As ELF
1513 * mandates that the PT_LOAD segments be in p_vaddr order, we
1514 * find the lowest p_vaddr by finding the first PT_LOAD
1515 * segment.
1516 */
1517 phdr = (Phdr *)phdrbase;
1518 for (i = nphdrs; i > 0; i--) {
1519 if (phdr->p_type == PT_LOAD) {
1520 addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
1521 break;
1522 }
1523 phdr = (Phdr *)((caddr_t)phdr + hsize);
1524 }
1525
1526 /*
1527 * We have a non-zero p_vaddr in the first PT_LOAD segment --
1528 * presumably because we're directly executing a prelink(8)'d
1529 * ld-linux.so. While we could correctly execute such an
1530 * object without locating it at its desired p_vaddr (it is,
1531 * after all, still relocatable), our inner antiquarian
1532 * derives a perverse pleasure in accommodating the steampunk
1533 * prelink(8) contraption -- goggles on!
1534 */
1535 if ((vaddr = addr) != NULL) {
1536 if (as_gap(curproc->p_as, len, &addr, &len,
1537 AH_LO, NULL) == -1 || addr != vaddr) {
1538 addr = NULL;
1539 }
1540 }
1541
1542 if (addr == NULL) {
1543 /*
1544 * We either have a NULL p_vaddr (the common case, by
1545 * many orders of magnitude) or we have a non-NULL
1546 * p_vaddr and we were unable to obtain the specified
1547 * VA range (presumably because it's an illegal
1548 * address). Either way, obtain an address in which
1549 * to map the interpreter.
1550 */
1551 map_addr(&addr, len, (offset_t)0, 1, 0);
1552 if (addr == NULL)
1553 return (ENOMEM);
1554 }
1555
1556 /*
1557 * Our voffset is the difference between where we landed and
1558 * where we wanted to be.
1559 */
1560 *voffset = (uintptr_t)addr - (uintptr_t)vaddr;
1561 } else {
1562 *voffset = 0;
1563 }
1564
1565 phdr = (Phdr *)phdrbase;
1566 for (i = nphdrs; i > 0; i--) {
1567 switch (phdr->p_type) {
1568 case PT_LOAD:
1569 ptload = B_TRUE;
1570 prot = PROT_USER;
1571 if (phdr->p_flags & PF_R)
1572 prot |= PROT_READ;
1573 if (phdr->p_flags & PF_W)
1574 prot |= PROT_WRITE;
1575 if (phdr->p_flags & PF_X)
1576 prot |= PROT_EXEC;
1577
1578 addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1579
1580 if ((*intphdr != NULL) && uphdr != NULL &&
1581 (*uphdr == NULL)) {
1582 /*
1583 * The PT_PHDR program header is, strictly
1584 * speaking, optional. If we find that this
1585 * is missing, we will determine the location
1586 * of the program headers based on the address
1587 * of the lowest PT_LOAD segment (namely, this
1588 * one): we subtract the p_offset to get to
1589 * the ELF header and then add back the program
1590 * header offset to get to the program headers.
1591 * We then cons up a Phdr that corresponds to
1592 * the (missing) PT_PHDR, setting the flags
1593 * to 0 to denote that this is artificial and
1594 * should (must) be freed by the caller.
1595 */
1596 Phdr *cons;
1597
1598 cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
1599
1600 cons->p_flags = 0;
1601 cons->p_type = PT_PHDR;
1602 cons->p_vaddr = ((uintptr_t)addr -
1603 phdr->p_offset) + ehdr->e_phoff;
1604
1605 *uphdr = cons;
1606 }
1607
1608 /*
1609 * The ELF spec dictates that p_filesz may not be
1610 * larger than p_memsz in PT_LOAD segments.
1611 */
1612 if (phdr->p_filesz > phdr->p_memsz) {
1613 error = EINVAL;
1614 goto bad;
1615 }
1616
1617 /*
1618 * Keep track of the segment with the lowest starting
1619 * address.
1620 */
1621 if (addr < minaddr)
1622 minaddr = addr;
1623
1624 /*
1625 * Segments need not correspond to page boundaries:
1626 * they are permitted to share a page. If two PT_LOAD
1627 * segments share the same page, and the permissions
1628 * of the segments differ, the behavior is historically
1629 * that the permissions of the latter segment are used
1630 * for the page that the two segments share. This is
1631 * also historically a non-issue: binaries generated
1632 * by most anything will make sure that two PT_LOAD
1633 * segments with differing permissions don't actually
1634 * share any pages. However, there exist some crazy
1635 * things out there (including at least an obscure
1636 * Portuguese teaching language called G-Portugol) that
1637 * actually do the wrong thing and expect it to work:
1638 * they have a segment with execute permission share
1639 * a page with a subsequent segment that does not
1640 * have execute permissions and expect the resulting
1641 * shared page to in fact be executable. To accommodate
1642 * such broken link editors, we take advantage of a
1643 * latitude explicitly granted to the loader: it is
1644 * permitted to make _any_ PT_LOAD segment executable
1645 * (provided that it is readable or writable). If we
1646 * see that we're sharing a page and that the previous
1647 * page was executable, we will add execute permissions
1648 * to our segment.
1649 */
1650 if (btop(lastaddr) == btop((uintptr_t)addr) &&
1651 (phdr->p_flags & (PF_R | PF_W)) &&
1652 (lastprot & PROT_EXEC)) {
1653 prot |= PROT_EXEC;
1654 }
1655
1656 lastaddr = (uintptr_t)addr + phdr->p_filesz;
1657 lastprot = prot;
1658
1659 zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1660
1661 offset = phdr->p_offset;
1662 if (((uintptr_t)offset & PAGEOFFSET) ==
1663 ((uintptr_t)addr & PAGEOFFSET) &&
1664 (!(vp->v_flag & VNOMAP))) {
1665 page = 1;
1666 } else {
1667 page = 0;
1668 }
1669
1670 /*
1671 * Set the heap pagesize for OOB when the bss size
1672 * is known and use_brk_lpg is not 0.
1673 */
1674 if (brksize != NULL && use_brk_lpg &&
1675 zfodsz != 0 && phdr == dataphdrp &&
1676 (prot & PROT_WRITE)) {
1677 const size_t tlen = P2NPHASE((uintptr_t)addr +
1678 phdr->p_filesz, PAGESIZE);
1679
1680 if (zfodsz > tlen) {
1681 const caddr_t taddr = addr +
1682 phdr->p_filesz + tlen;
1683
1684 /*
1685 * Since a hole in the AS large enough
1686 * for this object as calculated by
1687 * elfsize() is available, we do not
1688 * need to fear overflow for 'taddr'.
1689 */
1690 curproc->p_brkpageszc =
1691 page_szc(map_pgsz(MAPPGSZ_HEAP,
1692 curproc, taddr, zfodsz - tlen, 0));
1693 }
1694 }
1695
1696 if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1697 (prot & PROT_WRITE)) {
1698 uint_t szc = curproc->p_brkpageszc;
1699 size_t pgsz = page_get_pagesize(szc);
1700 caddr_t ebss = addr + phdr->p_memsz;
1701 /*
1702 * If we need extra space to keep the BSS an
1703 * integral number of pages in size, some of
1704 * that space may fall beyond p_brkbase, so we
1705 * need to set p_brksize to account for it
1706 * being (logically) part of the brk.
1707 */
1708 size_t extra_zfodsz;
1709
1710 ASSERT(pgsz > PAGESIZE);
1711
1712 extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1713
1714 if (error = execmap(vp, addr, phdr->p_filesz,
1715 zfodsz + extra_zfodsz, phdr->p_offset,
1716 prot, page, szc))
1717 goto bad;
1718 if (brksize != NULL)
1719 *brksize = extra_zfodsz;
1720 } else {
1721 if (error = execmap(vp, addr, phdr->p_filesz,
1722 zfodsz, phdr->p_offset, prot, page, 0))
1723 goto bad;
1724 }
1725
1726 if (bssbase != NULL && addr >= *bssbase &&
1727 phdr == dataphdrp) {
1728 *bssbase = addr + phdr->p_filesz;
1729 }
1730 if (brkbase != NULL && addr >= *brkbase) {
1731 *brkbase = addr + phdr->p_memsz;
1732 }
1733
1734 memsz = btopr(phdr->p_memsz);
1735 if ((*execsz + memsz) < *execsz) {
1736 error = ENOMEM;
1737 goto bad;
1738 }
1739 *execsz += memsz;
1740 break;
1741
1742 case PT_INTERP:
1743 /*
1744 * The ELF specification is unequivocal about the
1745 * PT_INTERP program header with respect to any PT_LOAD
1746 * program header: "If it is present, it must precede
1747 * any loadable segment entry." Linux, however, makes
1748 * no attempt to enforce this -- which has allowed some
1749 * binary editing tools to get away with generating
1750 * invalid ELF binaries in the respect that PT_INTERP
1751 * occurs after the first PT_LOAD program header. This
1752 * is unfortunate (and of course, disappointing) but
1753 * it's no worse than that: there is no reason that we
1754 * can't process the PT_INTERP entry (if present) after
1755 * one or more PT_LOAD entries. We therefore
1756 * deliberately do not check ptload here and always
1757 * store dyphdr to be the PT_INTERP program header.
1758 */
1759 *intphdr = phdr;
1760 break;
1761
1762 case PT_SHLIB:
1763 *stphdr = phdr;
1764 break;
1765
1766 case PT_PHDR:
1767 if (ptload || phdr->p_flags == 0)
1768 goto bad;
1769
1770 if (uphdr != NULL)
1771 *uphdr = phdr;
1772
1773 break;
1774
1775 case PT_NULL:
1776 case PT_DYNAMIC:
1777 case PT_NOTE:
1778 break;
1779
1780 case PT_SUNWDTRACE:
1781 if (dtphdr != NULL)
1782 *dtphdr = phdr;
1783 break;
1784
1785 default:
1786 break;
1787 }
1788 phdr = (Phdr *)((caddr_t)phdr + hsize);
1789 }
1790
1791 if (minaddrp != NULL) {
1792 ASSERT(minaddr != (caddr_t)UINTPTR_MAX);
1793 *minaddrp = (uintptr_t)minaddr;
1794 }
1795
1796 if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
1797 size_t off;
1798 uintptr_t base = (uintptr_t)*brkbase;
1799 uintptr_t oend = base + *brksize;
1800
1801 ASSERT(ISP2(aslr_max_brk_skew));
1802
1803 (void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1804 base += P2PHASE(off, aslr_max_brk_skew);
1805 base = P2ROUNDUP(base, PAGESIZE);
1806 *brkbase = (caddr_t)base;
1807 /*
1808 * Above, we set *brksize to account for the possibility we
1809 * had to grow the 'brk' in padding out the BSS to a page
1810 * boundary.
1811 *
1812 * We now need to adjust that based on where we now are
1813 * actually putting the brk.
1814 */
1815 if (oend > base)
1816 *brksize = oend - base;
1817 else
1818 *brksize = 0;
1819 }
1820
1821 return (0);
1822 bad:
1823 if (error == 0)
1824 error = EINVAL;
1825 return (error);
1826 }
1827
1828 int
1829 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1830 rlim64_t rlimit, cred_t *credp)
1831 {
1832 Note note;
1833 int error;
1834
1835 bzero(¬e, sizeof (note));
1836 bcopy("CORE", note.name, 4);
1837 note.nhdr.n_type = type;
1838 /*
1839 * The System V ABI states that n_namesz must be the length of the
1840 * string that follows the Nhdr structure including the terminating
1841 * null. The ABI also specifies that sufficient padding should be
1842 * included so that the description that follows the name string
1843 * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1844 * respectively. However, since this change was not made correctly
1845 * at the time of the 64-bit port, both 32- and 64-bit binaries
1846 * descriptions are only guaranteed to begin on a 4-byte boundary.
1847 */
1848 note.nhdr.n_namesz = 5;
1849 note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1850
1851 if (error = core_write(vp, UIO_SYSSPACE, *offsetp, ¬e,
1852 sizeof (note), rlimit, credp))
1853 return (error);
1854
1855 *offsetp += sizeof (note);
1856
1857 if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1858 note.nhdr.n_descsz, rlimit, credp))
1859 return (error);
1860
1861 *offsetp += note.nhdr.n_descsz;
1862 return (0);
1863 }
1864
1865
1866 /*
1867 * Copy the section data from one vnode to the section of another vnode.
1868 */
1869 static void
1870 elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst)
1871 {
1872 size_t n = src->sh_size;
1873 u_offset_t off = 0;
1874 const u_offset_t soff = src->sh_offset;
1875 const u_offset_t doff = ctx->ecc_doffset;
1876 void *buf = ctx->ecc_buf;
1877 vnode_t *dst_vp = ctx->ecc_vp;
1878 cred_t *credp = ctx->ecc_credp;
1879
1880 /* Protect the copy loop below from overflow on the offsets */
1881 if (n > OFF_MAX || (n + soff) > OFF_MAX || (n + doff) > OFF_MAX ||
1882 (n + soff) < n || (n + doff) < n) {
1883 dst->sh_size = 0;
1884 dst->sh_offset = 0;
1885 return;
1886 }
1887
1888 while (n != 0) {
1889 const size_t len = MIN(ctx->ecc_bufsz, n);
1890 ssize_t resid;
1891
1892 if (vn_rdwr(UIO_READ, src_vp, buf, (ssize_t)len,
1893 (offset_t)(soff + off),
1894 UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
1895 resid >= len || resid < 0 ||
1896 core_write(dst_vp, UIO_SYSSPACE, (offset_t)(doff + off),
1897 buf, len - resid, ctx->ecc_rlimit, credp) != 0) {
1898 dst->sh_size = 0;
1899 dst->sh_offset = 0;
1900 return;
1901 }
1902
1903 ASSERT(n >= len - resid);
1904
1905 n -= len - resid;
1906 off += len - resid;
1907 }
1908
1909 ctx->ecc_doffset += src->sh_size;
1910 }
1911
1912 /*
1913 * Walk sections for a given ELF object, counting (or copying) those of
1914 * interest (CTF, symtab, strtab, DWARF debug).
1915 *
1916 * Returns UINT_MAX upon low-memory.
1917 */
1918 static uint_t
1919 elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr,
1920 Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab)
1921 {
1922 Ehdr ehdr;
1923 const core_content_t content = ctx->ecc_content;
1924 cred_t *credp = ctx->ecc_credp;
1925 Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL;
1926 uintptr_t off = 0;
1927 uint_t nshdrs, shstrndx, nphdrs, count = 0;
1928 u_offset_t *doffp = &ctx->ecc_doffset;
1929 boolean_t ctf_link = B_FALSE;
1930 caddr_t shbase;
1931 size_t shsize, shstrsize;
1932 char *shstrbase;
1933
1934 if ((content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB | CC_CONTENT_DEBUG))
1935 == 0) {
1936 return (0);
1937 }
1938
1939 if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx, &nphdrs) != 0 ||
1940 getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx, &shbase, &shsize,
1941 &shstrbase, &shstrsize) != 0) {
1942 return (0);
1943 }
1944
1945 /* Starting at index 1 skips SHT_NULL which is expected at index 0 */
1946 off = ehdr.e_shentsize;
1947 for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) {
1948 Shdr *shdr, *symchk = NULL, *strchk;
1949 const char *name;
1950
1951 shdr = (Shdr *)(shbase + off);
1952 if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL)
1953 continue;
1954
1955 name = shstrbase + shdr->sh_name;
1956
1957 if (ctf == NULL &&
1958 (content & CC_CONTENT_CTF) != 0 &&
1959 strcmp(name, shstrtab_data[STR_CTF]) == 0) {
1960 ctf = shdr;
1961 if (ctf->sh_link != 0 && ctf->sh_link < nshdrs) {
1962 /* check linked symtab below */
1963 symchk = (Shdr *)(shbase +
1964 shdr->sh_link * ehdr.e_shentsize);
1965 ctf_link = B_TRUE;
1966 } else {
1967 continue;
1968 }
1969 } else if (symtab == NULL &&
1970 (content & CC_CONTENT_SYMTAB) != 0 &&
1971 strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) {
1972 symchk = shdr;
1973 } else if ((content & CC_CONTENT_DEBUG) != 0 &&
1974 strncmp(name, ".debug_", strlen(".debug_")) == 0) {
1975 /*
1976 * The design of the above check is intentional. In
1977 * particular, we want to capture any sections that
1978 * begin with '.debug_' for a few reasons:
1979 *
1980 * 1) Various revisions to the DWARF spec end up
1981 * changing the set of section headers that
1982 * exist. This ensures that we don't need to change
1983 * the kernel to get a new version.
1984 *
1985 * 2) Other software uses .debug_ sections for things
1986 * which aren't DWARF. This allows them to be captured
1987 * as well.
1988 *
1989 * Because of this, we emit straight here, unlike the
1990 * other two sections where we wait until we're done
1991 * scanning.
1992 */
1993
1994 /* We're only counting, don't emit! */
1995 if (v == NULL) {
1996 count++;
1997 continue;
1998 }
1999
2000 elf_ctx_resize_scratch(ctx, shdr->sh_size);
2001 if (!shstrtab_ndx(shstrtab, name, &v[idx].sh_name)) {
2002 count = UINT_MAX;
2003 goto done;
2004 }
2005 v[idx].sh_addr = (Addr)(uintptr_t)saddr;
2006 v[idx].sh_type = shdr->sh_type;
2007 v[idx].sh_addralign = shdr->sh_addralign;
2008 *doffp = roundup(*doffp, v[idx].sh_addralign);
2009 v[idx].sh_offset = *doffp;
2010 v[idx].sh_size = shdr->sh_size;
2011 v[idx].sh_link = 0;
2012 v[idx].sh_entsize = shdr->sh_entsize;
2013 v[idx].sh_info = shdr->sh_info;
2014
2015 elf_copy_scn(ctx, shdr, mvp, &v[idx]);
2016 count++;
2017 idx++;
2018 continue;
2019 } else {
2020 continue;
2021 }
2022
2023 ASSERT(symchk != NULL);
2024 if ((symchk->sh_type != SHT_DYNSYM &&
2025 symchk->sh_type != SHT_SYMTAB) ||
2026 symchk->sh_link == 0 || symchk->sh_link >= nshdrs) {
2027 ctf_link = B_FALSE;
2028 continue;
2029 }
2030 strchk = (Shdr *)(shbase + symchk->sh_link * ehdr.e_shentsize);
2031 if (strchk->sh_type != SHT_STRTAB) {
2032 ctf_link = B_FALSE;
2033 continue;
2034 }
2035 symtab = symchk;
2036 strtab = strchk;
2037
2038 if (symtab != NULL && ctf != NULL &&
2039 (content & CC_CONTENT_DEBUG) == 0) {
2040 /* No other shdrs are of interest at this point */
2041 break;
2042 }
2043 }
2044
2045 if (ctf != NULL)
2046 count += 1;
2047 if (symtab != NULL)
2048 count += 2;
2049
2050 if (v == NULL || count == 0 || count > remain) {
2051 count = MIN(count, remain);
2052 goto done;
2053 }
2054
2055 /* output CTF section */
2056 if (ctf != NULL) {
2057 elf_ctx_resize_scratch(ctx, ctf->sh_size);
2058
2059 if (!shstrtab_ndx(shstrtab, shstrtab_data[STR_CTF],
2060 &v[idx].sh_name)) {
2061 count = UINT_MAX;
2062 goto done;
2063 }
2064
2065 v[idx].sh_addr = (Addr)(uintptr_t)saddr;
2066 v[idx].sh_type = SHT_PROGBITS;
2067 v[idx].sh_addralign = 4;
2068 *doffp = roundup(*doffp, v[idx].sh_addralign);
2069 v[idx].sh_offset = *doffp;
2070 v[idx].sh_size = ctf->sh_size;
2071
2072 if (ctf_link) {
2073 /*
2074 * The linked symtab (and strtab) will be output
2075 * immediately after this CTF section. Its shdr index
2076 * directly follows this one.
2077 */
2078 v[idx].sh_link = idx + 1;
2079 ASSERT(symtab != NULL);
2080 } else {
2081 v[idx].sh_link = 0;
2082 }
2083 elf_copy_scn(ctx, ctf, mvp, &v[idx]);
2084 idx++;
2085 }
2086
2087 /* output SYMTAB/STRTAB sections */
2088 if (symtab != NULL) {
2089 uint_t symtab_name, strtab_name;
2090
2091 elf_ctx_resize_scratch(ctx,
2092 MAX(symtab->sh_size, strtab->sh_size));
2093
2094 if (symtab->sh_type == SHT_DYNSYM) {
2095 if (!shstrtab_ndx(shstrtab, shstrtab_data[STR_DYNSYM],
2096 &symtab_name) ||
2097 !shstrtab_ndx(shstrtab, shstrtab_data[STR_DYNSTR],
2098 &strtab_name)) {
2099 count = UINT_MAX;
2100 goto done;
2101 }
2102 } else {
2103 if (!shstrtab_ndx(shstrtab, shstrtab_data[STR_SYMTAB],
2104 &symtab_name) ||
2105 !shstrtab_ndx(shstrtab, shstrtab_data[STR_STRTAB],
2106 &strtab_name)) {
2107 count = UINT_MAX;
2108 goto done;
2109 }
2110 }
2111
2112 v[idx].sh_name = symtab_name;
2113 v[idx].sh_type = symtab->sh_type;
2114 v[idx].sh_addr = symtab->sh_addr;
2115 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
2116 v[idx].sh_addr += (Addr)(uintptr_t)saddr;
2117 v[idx].sh_addralign = symtab->sh_addralign;
2118 *doffp = roundup(*doffp, v[idx].sh_addralign);
2119 v[idx].sh_offset = *doffp;
2120 v[idx].sh_size = symtab->sh_size;
2121 v[idx].sh_link = idx + 1;
2122 v[idx].sh_entsize = symtab->sh_entsize;
2123 v[idx].sh_info = symtab->sh_info;
2124
2125 elf_copy_scn(ctx, symtab, mvp, &v[idx]);
2126 idx++;
2127
2128 v[idx].sh_name = strtab_name;
2129 v[idx].sh_type = SHT_STRTAB;
2130 v[idx].sh_flags = SHF_STRINGS;
2131 v[idx].sh_addr = strtab->sh_addr;
2132 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
2133 v[idx].sh_addr += (Addr)(uintptr_t)saddr;
2134 v[idx].sh_addralign = strtab->sh_addralign;
2135 *doffp = roundup(*doffp, v[idx].sh_addralign);
2136 v[idx].sh_offset = *doffp;
2137 v[idx].sh_size = strtab->sh_size;
2138
2139 elf_copy_scn(ctx, strtab, mvp, &v[idx]);
2140 idx++;
2141 }
2142
2143 done:
2144 kmem_free(shstrbase, shstrsize);
2145 kmem_free(shbase, shsize);
2146 return (count);
2147 }
2148
2149 /*
2150 * Walk mappings in process address space, examining those which correspond to
2151 * loaded objects. It is called twice from elfcore: Once to simply count
2152 * relevant sections, and again later to copy those sections once an adequate
2153 * buffer has been allocated for the shdr details.
2154 */
2155 static int
2156 elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp)
2157 {
2158 vnode_t *lastvp = NULL;
2159 struct seg *seg;
2160 uint_t idx = 0, remain;
2161 shstrtab_t shstrtab;
2162 struct as *as = ctx->ecc_p->p_as;
2163 int error = 0;
2164
2165 ASSERT(AS_WRITE_HELD(as));
2166
2167 if (v != NULL) {
2168 ASSERT(nv != 0);
2169
2170 if (!shstrtab_init(&shstrtab))
2171 return (ENOMEM);
2172 remain = nv;
2173 } else {
2174 ASSERT(nv == 0);
2175
2176 /*
2177 * The shdrs are being counted, rather than outputting them
2178 * into a buffer. Leave room for two entries: the SHT_NULL at
2179 * index 0 and the shstrtab at the end.
2180 */
2181 remain = UINT_MAX - 2;
2182 }
2183
2184 /* Per the ELF spec, shdr index 0 is reserved. */
2185 idx = 1;
2186 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2187 vnode_t *mvp;
2188 void *tmp = NULL;
2189 caddr_t saddr = seg->s_base, naddr, eaddr;
2190 size_t segsize;
2191 uint_t count, prot;
2192
2193 /*
2194 * Since we're just looking for text segments of load
2195 * objects, we only care about the protection bits; we don't
2196 * care about the actual size of the segment so we use the
2197 * reserved size. If the segment's size is zero, there's
2198 * something fishy going on so we ignore this segment.
2199 */
2200 if (seg->s_ops != &segvn_ops ||
2201 SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2202 mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
2203 (segsize = pr_getsegsize(seg, 1)) == 0)
2204 continue;
2205
2206 eaddr = saddr + segsize;
2207 prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
2208 pr_getprot_done(&tmp);
2209
2210 /*
2211 * Skip this segment unless the protection bits look like
2212 * what we'd expect for a text segment.
2213 */
2214 if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
2215 continue;
2216
2217 count = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain,
2218 &shstrtab);
2219 if (count == UINT_MAX) {
2220 error = ENOMEM;
2221 goto done;
2222 }
2223
2224 ASSERT(count <= remain);
2225 ASSERT(v == NULL || (idx + count) < nv);
2226
2227 remain -= count;
2228 idx += count;
2229 lastvp = mvp;
2230 }
2231
2232 if (v == NULL) {
2233 if (idx == 1) {
2234 *nshdrsp = 0;
2235 } else {
2236 /* Include room for the shrstrtab at the end */
2237 *nshdrsp = idx + 1;
2238 }
2239 /* No need to free up shstrtab so we can just return. */
2240 return (0);
2241 }
2242
2243 if (idx != nv - 1) {
2244 cmn_err(CE_WARN, "elfcore: core dump failed for "
2245 "process %d; address space is changing",
2246 ctx->ecc_p->p_pid);
2247 error = EIO;
2248 goto done;
2249 }
2250
2251 if (!shstrtab_ndx(&shstrtab, shstrtab_data[STR_SHSTRTAB],
2252 &v[idx].sh_name)) {
2253 error = ENOMEM;
2254 goto done;
2255 }
2256 v[idx].sh_size = shstrtab_size(&shstrtab);
2257 v[idx].sh_addralign = 1;
2258 v[idx].sh_offset = ctx->ecc_doffset;
2259 v[idx].sh_flags = SHF_STRINGS;
2260 v[idx].sh_type = SHT_STRTAB;
2261
2262 elf_ctx_resize_scratch(ctx, v[idx].sh_size);
2263 VERIFY3U(ctx->ecc_bufsz, >=, v[idx].sh_size);
2264 shstrtab_dump(&shstrtab, ctx->ecc_buf);
2265
2266 error = core_write(ctx->ecc_vp, UIO_SYSSPACE, ctx->ecc_doffset,
2267 ctx->ecc_buf, v[idx].sh_size, ctx->ecc_rlimit, ctx->ecc_credp);
2268 if (error == 0) {
2269 ctx->ecc_doffset += v[idx].sh_size;
2270 }
2271
2272 done:
2273 if (v != NULL)
2274 shstrtab_fini(&shstrtab);
2275 return (error);
2276 }
2277
2278 int
2279 elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
2280 core_content_t content)
2281 {
2282 u_offset_t poffset, soffset, doffset;
2283 int error;
2284 uint_t i, nphdrs, nshdrs;
2285 struct seg *seg;
2286 struct as *as = p->p_as;
2287 void *bigwad, *zeropg = NULL;
2288 size_t bigsize, phdrsz, shdrsz;
2289 Ehdr *ehdr;
2290 Phdr *phdr;
2291 Shdr shdr0;
2292 caddr_t brkbase, stkbase;
2293 size_t brksize, stksize;
2294 boolean_t overflowed = B_FALSE, retried = B_FALSE;
2295 klwp_t *lwp = ttolwp(curthread);
2296 elf_core_ctx_t ctx = {
2297 .ecc_vp = vp,
2298 .ecc_p = p,
2299 .ecc_credp = credp,
2300 .ecc_rlimit = rlimit,
2301 .ecc_content = content,
2302 .ecc_doffset = 0,
2303 .ecc_buf = NULL,
2304 .ecc_bufsz = 0
2305 };
2306
2307 top:
2308 /*
2309 * Make sure we have everything we need (registers, etc.).
2310 * All other lwps have already stopped and are in an orderly state.
2311 */
2312 ASSERT(p == ttoproc(curthread));
2313 prstop(0, 0);
2314
2315 AS_LOCK_ENTER(as, RW_WRITER);
2316 nphdrs = prnsegs(as, 0) + 2; /* two CORE note sections */
2317
2318 /*
2319 * Count the number of section headers we're going to need.
2320 */
2321 nshdrs = 0;
2322 if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB | CC_CONTENT_DEBUG))
2323 VERIFY0(elf_process_scns(&ctx, NULL, 0, &nshdrs));
2324 AS_LOCK_EXIT(as);
2325
2326 /*
2327 * The core file contents may require zero section headers, but if
2328 * we overflow the 16 bits allotted to the program header count in
2329 * the ELF header, we'll need that program header at index zero.
2330 */
2331 if (nshdrs == 0 && nphdrs >= PN_XNUM) {
2332 nshdrs = 1;
2333 }
2334
2335 /*
2336 * Allocate a buffer which is sized adequately to hold the ehdr,
2337 * phdrs, DWARF debug, or shdrs needed to produce the core file. It
2338 * is used for the four tasks sequentially, not simultaneously, so it
2339 * does not need space for all four data at once, only the largest
2340 * one.
2341 */
2342 VERIFY(nphdrs >= 2);
2343 phdrsz = nphdrs * sizeof (Phdr);
2344 shdrsz = nshdrs * sizeof (Shdr);
2345 bigsize = MAX(sizeof (Ehdr), MAX(phdrsz, shdrsz));
2346 bigwad = kmem_alloc(bigsize, KM_SLEEP);
2347
2348 ehdr = (Ehdr *)bigwad;
2349 bzero(ehdr, sizeof (*ehdr));
2350
2351 ehdr->e_ident[EI_MAG0] = ELFMAG0;
2352 ehdr->e_ident[EI_MAG1] = ELFMAG1;
2353 ehdr->e_ident[EI_MAG2] = ELFMAG2;
2354 ehdr->e_ident[EI_MAG3] = ELFMAG3;
2355 ehdr->e_ident[EI_CLASS] = ELFCLASS;
2356 ehdr->e_type = ET_CORE;
2357
2358 #if !defined(_LP64) || defined(_ELF32_COMPAT)
2359
2360 #if defined(__sparc)
2361 ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2362 ehdr->e_machine = EM_SPARC;
2363 #elif defined(__i386_COMPAT)
2364 ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2365 ehdr->e_machine = EM_386;
2366 #else
2367 #error "no recognized machine type is defined"
2368 #endif
2369
2370 #else /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2371
2372 #if defined(__sparc)
2373 ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2374 ehdr->e_machine = EM_SPARCV9;
2375 #elif defined(__amd64)
2376 ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2377 ehdr->e_machine = EM_AMD64;
2378 #else
2379 #error "no recognized 64-bit machine type is defined"
2380 #endif
2381
2382 #endif /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2383
2384 poffset = sizeof (Ehdr);
2385 soffset = sizeof (Ehdr) + phdrsz;
2386 doffset = sizeof (Ehdr) + phdrsz + shdrsz;
2387 bzero(&shdr0, sizeof (shdr0));
2388
2389 /*
2390 * If the count of program headers or section headers or the index
2391 * of the section string table can't fit in the mere 16 bits
2392 * shortsightedly allotted to them in the ELF header, we use the
2393 * extended formats and put the real values in the section header
2394 * as index 0.
2395 */
2396 if (nphdrs >= PN_XNUM) {
2397 ehdr->e_phnum = PN_XNUM;
2398 shdr0.sh_info = nphdrs;
2399 } else {
2400 ehdr->e_phnum = (unsigned short)nphdrs;
2401 }
2402
2403 if (nshdrs > 0) {
2404 if (nshdrs >= SHN_LORESERVE) {
2405 ehdr->e_shnum = 0;
2406 shdr0.sh_size = nshdrs;
2407 } else {
2408 ehdr->e_shnum = (unsigned short)nshdrs;
2409 }
2410
2411 if (nshdrs - 1 >= SHN_LORESERVE) {
2412 ehdr->e_shstrndx = SHN_XINDEX;
2413 shdr0.sh_link = nshdrs - 1;
2414 } else {
2415 ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
2416 }
2417
2418 ehdr->e_shoff = soffset;
2419 ehdr->e_shentsize = sizeof (Shdr);
2420 }
2421
2422 ehdr->e_ident[EI_VERSION] = EV_CURRENT;
2423 ehdr->e_version = EV_CURRENT;
2424 ehdr->e_ehsize = sizeof (Ehdr);
2425 ehdr->e_phoff = poffset;
2426 ehdr->e_phentsize = sizeof (Phdr);
2427
2428 if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
2429 sizeof (Ehdr), rlimit, credp)) {
2430 goto done;
2431 }
2432
2433 phdr = (Phdr *)bigwad;
2434 bzero(phdr, phdrsz);
2435
2436 setup_old_note_header(&phdr[0], p);
2437 phdr[0].p_offset = doffset = roundup(doffset, sizeof (Word));
2438 doffset += phdr[0].p_filesz;
2439
2440 setup_note_header(&phdr[1], p);
2441 phdr[1].p_offset = doffset = roundup(doffset, sizeof (Word));
2442 doffset += phdr[1].p_filesz;
2443
2444 mutex_enter(&p->p_lock);
2445
2446 brkbase = p->p_brkbase;
2447 brksize = p->p_brksize;
2448
2449 stkbase = p->p_usrstack - p->p_stksize;
2450 stksize = p->p_stksize;
2451
2452 mutex_exit(&p->p_lock);
2453
2454 AS_LOCK_ENTER(as, RW_WRITER);
2455 i = 2;
2456 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2457 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2458 caddr_t saddr, naddr;
2459 void *tmp = NULL;
2460 extern struct seg_ops segspt_shmops;
2461
2462 if ((seg->s_flags & S_HOLE) != 0) {
2463 continue;
2464 }
2465
2466 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2467 uint_t prot;
2468 size_t size;
2469 int type;
2470 vnode_t *mvp;
2471
2472 prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2473 prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
2474 if ((size = (size_t)(naddr - saddr)) == 0) {
2475 ASSERT(tmp == NULL);
2476 continue;
2477 } else if (i == nphdrs) {
2478 pr_getprot_done(&tmp);
2479 overflowed = B_TRUE;
2480 break;
2481 }
2482 phdr[i].p_type = PT_LOAD;
2483 phdr[i].p_vaddr = (Addr)(uintptr_t)saddr;
2484 phdr[i].p_memsz = size;
2485 if (prot & PROT_READ)
2486 phdr[i].p_flags |= PF_R;
2487 if (prot & PROT_WRITE)
2488 phdr[i].p_flags |= PF_W;
2489 if (prot & PROT_EXEC)
2490 phdr[i].p_flags |= PF_X;
2491
2492 /*
2493 * Figure out which mappings to include in the core.
2494 */
2495 type = SEGOP_GETTYPE(seg, saddr);
2496
2497 if (saddr == stkbase && size == stksize) {
2498 if (!(content & CC_CONTENT_STACK))
2499 goto exclude;
2500
2501 } else if (saddr == brkbase && size == brksize) {
2502 if (!(content & CC_CONTENT_HEAP))
2503 goto exclude;
2504
2505 } else if (seg->s_ops == &segspt_shmops) {
2506 if (type & MAP_NORESERVE) {
2507 if (!(content & CC_CONTENT_DISM))
2508 goto exclude;
2509 } else {
2510 if (!(content & CC_CONTENT_ISM))
2511 goto exclude;
2512 }
2513
2514 } else if (seg->s_ops != &segvn_ops) {
2515 goto exclude;
2516
2517 } else if (type & MAP_SHARED) {
2518 if (shmgetid(p, saddr) != SHMID_NONE) {
2519 if (!(content & CC_CONTENT_SHM))
2520 goto exclude;
2521
2522 } else if (SEGOP_GETVP(seg, seg->s_base,
2523 &mvp) != 0 || mvp == NULL ||
2524 mvp->v_type != VREG) {
2525 if (!(content & CC_CONTENT_SHANON))
2526 goto exclude;
2527
2528 } else {
2529 if (!(content & CC_CONTENT_SHFILE))
2530 goto exclude;
2531 }
2532
2533 } else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2534 mvp == NULL || mvp->v_type != VREG) {
2535 if (!(content & CC_CONTENT_ANON))
2536 goto exclude;
2537
2538 } else if (prot == (PROT_READ | PROT_EXEC)) {
2539 if (!(content & CC_CONTENT_TEXT))
2540 goto exclude;
2541
2542 } else if (prot == PROT_READ) {
2543 if (!(content & CC_CONTENT_RODATA))
2544 goto exclude;
2545
2546 } else {
2547 if (!(content & CC_CONTENT_DATA))
2548 goto exclude;
2549 }
2550
2551 doffset = roundup(doffset, sizeof (Word));
2552 phdr[i].p_offset = doffset;
2553 phdr[i].p_filesz = size;
2554 doffset += size;
2555 exclude:
2556 i++;
2557 }
2558 VERIFY(tmp == NULL);
2559 if (overflowed)
2560 break;
2561 }
2562 AS_LOCK_EXIT(as);
2563
2564 if (overflowed || i != nphdrs) {
2565 if (!retried) {
2566 retried = B_TRUE;
2567 overflowed = B_FALSE;
2568 kmem_free(bigwad, bigsize);
2569 goto top;
2570 }
2571 cmn_err(CE_WARN, "elfcore: core dump failed for "
2572 "process %d; address space is changing", p->p_pid);
2573 error = EIO;
2574 goto done;
2575 }
2576
2577 if ((error = core_write(vp, UIO_SYSSPACE, poffset,
2578 phdr, phdrsz, rlimit, credp)) != 0) {
2579 goto done;
2580 }
2581
2582 if ((error = write_old_elfnotes(p, sig, vp, phdr[0].p_offset, rlimit,
2583 credp)) != 0) {
2584 goto done;
2585 }
2586 if ((error = write_elfnotes(p, sig, vp, phdr[1].p_offset, rlimit,
2587 credp, content)) != 0) {
2588 goto done;
2589 }
2590
2591 for (i = 2; i < nphdrs; i++) {
2592 prkillinfo_t killinfo;
2593 sigqueue_t *sq;
2594 int sig, j;
2595
2596 if (phdr[i].p_filesz == 0)
2597 continue;
2598
2599 /*
2600 * If we hit a region that was mapped PROT_NONE then we cannot
2601 * continue dumping this normally as the kernel would be unable
2602 * to read from the page and that would result in us failing to
2603 * dump the page. As such, any region mapped PROT_NONE, we dump
2604 * as a zero-filled page such that this is still represented in
2605 * the map.
2606 *
2607 * If dumping out this segment fails, rather than failing
2608 * the core dump entirely, we reset the size of the mapping
2609 * to zero to indicate that the data is absent from the core
2610 * file and or in the PF_SUNW_FAILURE flag to differentiate
2611 * this from mappings that were excluded due to the core file
2612 * content settings.
2613 */
2614 if ((phdr[i].p_flags & (PF_R | PF_W | PF_X)) == 0) {
2615 size_t towrite = phdr[i].p_filesz;
2616 size_t curoff = 0;
2617
2618 if (zeropg == NULL) {
2619 zeropg = kmem_zalloc(elf_zeropg_sz, KM_SLEEP);
2620 }
2621
2622 error = 0;
2623 while (towrite != 0) {
2624 size_t len = MIN(towrite, elf_zeropg_sz);
2625
2626 error = core_write(vp, UIO_SYSSPACE,
2627 phdr[i].p_offset + curoff, zeropg, len,
2628 rlimit, credp);
2629 if (error != 0)
2630 break;
2631
2632 towrite -= len;
2633 curoff += len;
2634 }
2635 } else {
2636 error = core_seg(p, vp, phdr[i].p_offset,
2637 (caddr_t)(uintptr_t)phdr[i].p_vaddr,
2638 phdr[i].p_filesz, rlimit, credp);
2639 }
2640 if (error == 0)
2641 continue;
2642
2643 if ((sig = lwp->lwp_cursig) == 0) {
2644 /*
2645 * We failed due to something other than a signal.
2646 * Since the space reserved for the segment is now
2647 * unused, we stash the errno in the first four
2648 * bytes. This undocumented interface will let us
2649 * understand the nature of the failure.
2650 */
2651 (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2652 &error, sizeof (error), rlimit, credp);
2653
2654 phdr[i].p_filesz = 0;
2655 phdr[i].p_flags |= PF_SUNW_FAILURE;
2656 if ((error = core_write(vp, UIO_SYSSPACE,
2657 poffset + sizeof (Phdr) * i, &phdr[i],
2658 sizeof (Phdr), rlimit, credp)) != 0)
2659 goto done;
2660
2661 continue;
2662 }
2663
2664 /*
2665 * We took a signal. We want to abort the dump entirely, but
2666 * we also want to indicate what failed and why. We therefore
2667 * use the space reserved for the first failing segment to
2668 * write our error (which, for purposes of compatability with
2669 * older core dump readers, we set to EINTR) followed by any
2670 * siginfo associated with the signal.
2671 */
2672 bzero(&killinfo, sizeof (killinfo));
2673 killinfo.prk_error = EINTR;
2674
2675 sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;
2676
2677 if (sq != NULL) {
2678 bcopy(&sq->sq_info, &killinfo.prk_info,
2679 sizeof (sq->sq_info));
2680 } else {
2681 killinfo.prk_info.si_signo = lwp->lwp_cursig;
2682 killinfo.prk_info.si_code = SI_NOINFO;
2683 }
2684
2685 #if (defined(_SYSCALL32_IMPL) || defined(_LP64))
2686 /*
2687 * If this is a 32-bit process, we need to translate from the
2688 * native siginfo to the 32-bit variant. (Core readers must
2689 * always have the same data model as their target or must
2690 * be aware of -- and compensate for -- data model differences.)
2691 */
2692 if (curproc->p_model == DATAMODEL_ILP32) {
2693 siginfo32_t si32;
2694
2695 siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
2696 bcopy(&si32, &killinfo.prk_info, sizeof (si32));
2697 }
2698 #endif
2699
2700 (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2701 &killinfo, sizeof (killinfo), rlimit, credp);
2702
2703 /*
2704 * For the segment on which we took the signal, indicate that
2705 * its data now refers to a siginfo.
2706 */
2707 phdr[i].p_filesz = 0;
2708 phdr[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
2709 PF_SUNW_SIGINFO;
2710
2711 /*
2712 * And for every other segment, indicate that its absence
2713 * is due to a signal.
2714 */
2715 for (j = i + 1; j < nphdrs; j++) {
2716 phdr[j].p_filesz = 0;
2717 phdr[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
2718 }
2719
2720 /*
2721 * Finally, write out our modified program headers.
2722 */
2723 if ((error = core_write(vp, UIO_SYSSPACE,
2724 poffset + sizeof (Phdr) * i, &phdr[i],
2725 sizeof (Phdr) * (nphdrs - i), rlimit, credp)) != 0) {
2726 goto done;
2727 }
2728
2729 break;
2730 }
2731
2732 if (nshdrs > 0) {
2733 Shdr *shdr = (Shdr *)bigwad;
2734
2735 bzero(shdr, shdrsz);
2736 if (nshdrs > 1) {
2737 ctx.ecc_doffset = doffset;
2738 AS_LOCK_ENTER(as, RW_WRITER);
2739 error = elf_process_scns(&ctx, shdr, nshdrs, NULL);
2740 AS_LOCK_EXIT(as);
2741 if (error != 0) {
2742 goto done;
2743 }
2744 }
2745 /* Copy any extended format data destined for the first shdr */
2746 bcopy(&shdr0, shdr, sizeof (shdr0));
2747
2748 error = core_write(vp, UIO_SYSSPACE, soffset, shdr, shdrsz,
2749 rlimit, credp);
2750 }
2751
2752 done:
2753 if (zeropg != NULL)
2754 kmem_free(zeropg, elf_zeropg_sz);
2755 if (ctx.ecc_bufsz != 0) {
2756 kmem_free(ctx.ecc_buf, ctx.ecc_bufsz);
2757 }
2758 kmem_free(bigwad, bigsize);
2759 return (error);
2760 }
2761
2762 #ifndef _ELF32_COMPAT
2763
2764 static struct execsw esw = {
2765 #ifdef _LP64
2766 elf64magicstr,
2767 #else /* _LP64 */
2768 elf32magicstr,
2769 #endif /* _LP64 */
2770 0,
2771 5,
2772 elfexec,
2773 elfcore
2774 };
2775
2776 static struct modlexec modlexec = {
2777 &mod_execops, "exec module for elf", &esw
2778 };
2779
2780 #ifdef _LP64
2781 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
2782 intpdata_t *idatap, int level, size_t *execsz,
2783 int setid, caddr_t exec_file, cred_t *cred,
2784 int *brand_action);
2785 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
2786 rlim64_t rlimit, int sig, core_content_t content);
2787
2788 static struct execsw esw32 = {
2789 elf32magicstr,
2790 0,
2791 5,
2792 elf32exec,
2793 elf32core
2794 };
2795
2796 static struct modlexec modlexec32 = {
2797 &mod_execops, "32-bit exec module for elf", &esw32
2798 };
2799 #endif /* _LP64 */
2800
2801 static struct modlinkage modlinkage = {
2802 MODREV_1,
2803 (void *)&modlexec,
2804 #ifdef _LP64
2805 (void *)&modlexec32,
2806 #endif /* _LP64 */
2807 NULL
2808 };
2809
2810 int
2811 _init(void)
2812 {
2813 return (mod_install(&modlinkage));
2814 }
2815
2816 int
2817 _fini(void)
2818 {
2819 return (mod_remove(&modlinkage));
2820 }
2821
2822 int
2823 _info(struct modinfo *modinfop)
2824 {
2825 return (mod_info(&modlinkage, modinfop));
2826 }
2827
2828 #endif /* !_ELF32_COMPAT */