1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 /*
29 * Copyright 2019 Joyent, Inc.
30 * Copyright 2021 Oxide Computer Company
31 */
32
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/thread.h>
36 #include <sys/sysmacros.h>
37 #include <sys/signal.h>
38 #include <sys/cred.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/vnode.h>
42 #include <sys/mman.h>
43 #include <sys/kmem.h>
44 #include <sys/proc.h>
45 #include <sys/pathname.h>
46 #include <sys/policy.h>
47 #include <sys/cmn_err.h>
48 #include <sys/systm.h>
49 #include <sys/elf.h>
50 #include <sys/vmsystm.h>
51 #include <sys/debug.h>
52 #include <sys/auxv.h>
53 #include <sys/exec.h>
54 #include <sys/prsystm.h>
55 #include <vm/as.h>
56 #include <vm/rm.h>
57 #include <vm/seg.h>
58 #include <vm/seg_vn.h>
59 #include <sys/modctl.h>
60 #include <sys/systeminfo.h>
61 #include <sys/vmparam.h>
62 #include <sys/machelf.h>
63 #include <sys/shm_impl.h>
64 #include <sys/archsystm.h>
65 #include <sys/fasttrap.h>
66 #include <sys/brand.h>
67 #include "elf_impl.h"
68 #include <sys/sdt.h>
69 #include <sys/siginfo.h>
70 #include <sys/random.h>
71
72 #if defined(__x86)
73 #include <sys/comm_page_util.h>
74 #include <sys/fp.h>
75 #endif /* defined(__x86) */
76
77
78 extern int at_flags;
79 extern volatile size_t aslr_max_brk_skew;
80
81 #define ORIGIN_STR "ORIGIN"
82 #define ORIGIN_STR_SIZE 6
83
84 static int getelfhead(vnode_t *, cred_t *, Ehdr *, uint_t *, uint_t *,
85 uint_t *);
86 static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, uint_t, caddr_t *,
87 size_t *);
88 static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, uint_t, uint_t,
89 caddr_t *, size_t *, caddr_t *, size_t *);
90 static size_t elfsize(const Ehdr *, uint_t, const caddr_t, uintptr_t *);
91 static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **,
92 Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *,
93 size_t, size_t *, size_t *);
94
95 #ifdef _ELF32_COMPAT
96 /* Link against the non-compat instances when compiling the 32-bit version. */
97 extern size_t elf_datasz_max;
98 extern size_t elf_zeropg_sz;
99 extern void elf_ctx_resize_scratch(elf_core_ctx_t *, size_t);
100 extern uint_t elf_nphdr_max;
101 extern uint_t elf_nshdr_max;
102 extern size_t elf_shstrtab_max;
103 #else
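/*
 * Default limits used when sizing kernel buffers for ELF program headers,
 * section headers, the section string table, and core-dump scratch space;
 * see getelfphdr(), getelfshdr(), and elf_ctx_resize_scratch() for how
 * each is applied.
 */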
104 size_t elf_datasz_max = 1 * 1024 * 1024;
105 size_t elf_zeropg_sz = 4 * 1024;
106 uint_t elf_nphdr_max = 1000;
107 uint_t elf_nshdr_max = 10000;
108 size_t elf_shstrtab_max = 100 * 1024;
109 #endif
110
111
112
113 typedef enum {
114 STR_CTF,
115 STR_SYMTAB,
116 STR_DYNSYM,
117 STR_STRTAB,
118 STR_DYNSTR,
119 STR_SHSTRTAB,
120 STR_NUM
121 } shstrtype_t;
122
123 static const char *shstrtab_data[] = {
124 ".SUNW_ctf",
125 ".symtab",
126 ".dynsym",
127 ".strtab",
128 ".dynstr",
129 ".shstrtab"
130 };
131
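/*
 * A shstrtab_t tracks offsets into a section-name string table as the names
 * above are first used.  Offset 0 is reserved for the table's leading '\0',
 * so sst_cur starts at 1; shstrtab_ndx() assigns (and memoizes) an offset on
 * first use, and shstrtab_dump() writes each used name at its recorded
 * offset.
 */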
132 typedef struct shstrtab {
133 uint_t sst_ndx[STR_NUM];
134 uint_t sst_cur;
135 } shstrtab_t;
136
137 static void
138 shstrtab_init(shstrtab_t *s)
139 {
140 bzero(&s->sst_ndx, sizeof (s->sst_ndx));
141 s->sst_cur = 1;
142 }
143
144 static uint_t
145 shstrtab_ndx(shstrtab_t *s, shstrtype_t type)
146 {
147 uint_t ret;
148
149 if ((ret = s->sst_ndx[type]) != 0)
150 return (ret);
151
152 ret = s->sst_ndx[type] = s->sst_cur;
153 s->sst_cur += strlen(shstrtab_data[type]) + 1;
154
155 return (ret);
156 }
157
158 static size_t
159 shstrtab_size(const shstrtab_t *s)
160 {
161 return (s->sst_cur);
162 }
163
164 static void
165 shstrtab_dump(const shstrtab_t *s, char *buf)
166 {
167 uint_t i, ndx;
168
169 *buf = '\0';
170 for (i = 0; i < STR_NUM; i++) {
171 if ((ndx = s->sst_ndx[i]) != 0)
172 (void) strcpy(buf + ndx, shstrtab_data[i]);
173 }
174 }
175
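/*
 * Validate a PT_SUNWDTRACE program header: it must be large enough for the
 * fasttrap per-LWP scratch space and must be mapped readable, writable, and
 * executable.  On success, the mapped address is recorded in args->thrptr.
 */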
176 static int
177 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
178 {
179 ASSERT(phdrp->p_type == PT_SUNWDTRACE);
180
181 /*
182 * See the comment in fasttrap.h for information on how to safely
183 * update this program header.
184 */
185 if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
186 (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
187 return (-1);
188
189 args->thrptr = phdrp->p_vaddr + base;
190
191 return (0);
192 }
193
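/*
 * Apply a security-flag request from a DT_SUNW_* dynamic entry (currently
 * only DT_SUNW_ASLR) to the process's effective secflags.  Clearing a flag
 * pinned in the lower set, or setting one outside the upper set, fails with
 * EPERM; a caller without psecflags privilege may only move the effective
 * flag to match its value in the inherit set.
 */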
194 static int
195 handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
196 {
197 uint_t flag;
198
199 switch (dt) {
200 case DT_SUNW_ASLR:
201 flag = PROC_SEC_ASLR;
202 break;
203 default:
204 return (EINVAL);
205 }
206
207 if (val == 0) {
208 if (secflag_isset(p->p_secflags.psf_lower, flag))
209 return (EPERM);
210 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
211 secflag_isset(p->p_secflags.psf_inherit, flag))
212 return (EPERM);
213
214 secflag_clear(&p->p_secflags.psf_effective, flag);
215 } else {
216 if (!secflag_isset(p->p_secflags.psf_upper, flag))
217 return (EPERM);
218
219 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
220 !secflag_isset(p->p_secflags.psf_inherit, flag))
221 return (EPERM);
222
223 secflag_set(&p->p_secflags.psf_effective, flag);
224 }
225
226 return (0);
227 }
228
229
230 #ifndef _ELF32_COMPAT
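/*
 * Ensure the core-dump scratch buffer can hold at least sz bytes, capped at
 * elf_datasz_max.  Any existing smaller buffer is freed first; its contents
 * are not preserved.
 */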
231 void
232 elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz)
233 {
234 size_t target = MIN(sz, elf_datasz_max);
235
236 if (target > ctx->ecc_bufsz) {
237 if (ctx->ecc_buf != NULL) {
238 kmem_free(ctx->ecc_buf, ctx->ecc_bufsz);
239 }
240 ctx->ecc_buf = kmem_alloc(target, KM_SLEEP);
241 ctx->ecc_bufsz = target;
242 }
243 }
244 #endif /* _ELF32_COMPAT */
245
246 /*
247 * Map in the executable pointed to by vp. Returns 0 on success. Note that
248 * this function currently has the maximum number of arguments allowed by
249 * modstubs on x86 (MAXNARG)! Do _not_ add to this function signature without
250 * adding to MAXNARG. (Better yet, do not add to this monster of a function
251 * signature!)
252 */
253 int
254 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
255 intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
256 caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
257 {
258 size_t len, phdrsize;
259 struct vattr vat;
260 caddr_t phdrbase = NULL;
261 uint_t nshdrs, shstrndx, nphdrs;
262 int error = 0;
263 Phdr *uphdr = NULL;
264 Phdr *junk = NULL;
265 Phdr *dynphdr = NULL;
266 Phdr *dtrphdr = NULL;
267 char *interp = NULL;
268 uintptr_t lddata, minaddr;
269 size_t execsz;
270
271 if (lddatap != NULL)
272 *lddatap = 0;
273
274 if (minaddrp != NULL)
275 *minaddrp = (uintptr_t)NULL;
276
277 if (error = execpermissions(vp, &vat, args)) {
278 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
279 return (error);
280 }
281
282 if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
283 &nphdrs)) != 0 ||
284 (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
285 &phdrsize)) != 0) {
286 uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
287 return (error);
288 }
289
290 if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
291 uprintf("%s: Nothing to load in %s\n", exec_file, args->pathname);
292 kmem_free(phdrbase, phdrsize);
293 return (ENOEXEC);
294 }
295 if (lddatap != NULL)
296 *lddatap = lddata;
297
298 if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
299 &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
300 len, &execsz, brksize)) {
301 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
302 if (uphdr != NULL && uphdr->p_flags == 0)
303 kmem_free(uphdr, sizeof (Phdr));
304 kmem_free(phdrbase, phdrsize);
305 return (error);
306 }
307
308 if (minaddrp != NULL)
309 *minaddrp = minaddr;
310
311 /*
312 * If the executable requires an interpreter, determine its name.
313 */
314 if (dynphdr != NULL) {
315 ssize_t resid;
316
317 if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
318 uprintf("%s: Invalid interpreter\n", exec_file);
319 kmem_free(phdrbase, phdrsize);
320 return (ENOEXEC);
321 }
322
323 interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
324
325 if ((error = vn_rdwr(UIO_READ, vp, interp,
326 (ssize_t)dynphdr->p_filesz,
327 (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
328 (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
329 interp[dynphdr->p_filesz - 1] != '\0') {
330 uprintf("%s: Cannot obtain interpreter pathname\n",
331 exec_file);
332 kmem_free(interp, MAXPATHLEN);
333 kmem_free(phdrbase, phdrsize);
334 return (error != 0 ? error : ENOEXEC);
335 }
336 }
337
338 /*
339 * If this is a statically linked executable, voffset should indicate
340 * the address of the executable itself (it normally holds the address
341 * of the interpreter).
342 */
343 if (ehdr->e_type == ET_EXEC && interp == NULL)
344 *voffset = minaddr;
345
346 /*
347 * If the caller has asked for the interpreter name, return it (it's
348 * up to the caller to free it); if the caller hasn't asked for it,
349 * free it ourselves.
350 */
351 if (interpp != NULL) {
352 *interpp = interp;
353 } else if (interp != NULL) {
354 kmem_free(interp, MAXPATHLEN);
355 }
356
357 if (uphdr != NULL) {
358 *uphdr_vaddr = uphdr->p_vaddr;
359
360 if (uphdr->p_flags == 0)
361 kmem_free(uphdr, sizeof (Phdr));
362 } else if (ehdr->e_type == ET_DYN) {
363 /*
364 * If we don't have a uphdr, we'll apply the logic found
365 * in mapelfexec() and use the p_vaddr of the first PT_LOAD
366 * section as the base address of the object.
367 */
368 const Phdr *phdr = (Phdr *)phdrbase;
369 const uint_t hsize = ehdr->e_phentsize;
370 uint_t i;
371
372 for (i = nphdrs; i > 0; i--) {
373 if (phdr->p_type == PT_LOAD) {
374 *uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
375 ehdr->e_phoff;
376 break;
377 }
378
379 phdr = (Phdr *)((caddr_t)phdr + hsize);
380 }
381
382 /*
383 * If we don't have a PT_LOAD segment, we should have returned
384 * ENOEXEC when elfsize() returned 0, above.
385 */
386 VERIFY(i > 0);
387 } else {
388 *uphdr_vaddr = (Addr)-1;
389 }
390
391 kmem_free(phdrbase, phdrsize);
392 return (error);
393 }
394
395 /*ARGSUSED*/
396 int
397 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
398 int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred,
399 int *brand_action)
400 {
401 caddr_t phdrbase = NULL;
402 caddr_t bssbase = 0;
403 caddr_t brkbase = 0;
404 size_t brksize = 0;
405 size_t dlnsize, nsize = 0;
406 aux_entry_t *aux;
407 int error;
408 ssize_t resid;
409 int fd = -1;
410 intptr_t voffset;
411 Phdr *intphdr = NULL;
412 Phdr *dynamicphdr = NULL;
413 Phdr *stphdr = NULL;
414 Phdr *uphdr = NULL;
415 Phdr *junk = NULL;
416 size_t len;
417 size_t postfixsize = 0;
418 size_t i;
419 Phdr *phdrp;
420 Phdr *dataphdrp = NULL;
421 Phdr *dtrphdr;
422 Phdr *capphdr = NULL;
423 Cap *cap = NULL;
424 size_t capsize;
425 int hasu = 0;
426 int hasauxv = 0;
427 int hasintp = 0;
428 int branded = 0;
429 int dynuphdr = 0;
430
431 struct proc *p = ttoproc(curthread);
432 struct user *up = PTOU(p);
433 struct bigwad {
434 Ehdr ehdr;
435 aux_entry_t elfargs[__KERN_NAUXV_IMPL];
436 char dl_name[MAXPATHLEN];
437 char pathbuf[MAXPATHLEN];
438 struct vattr vattr;
439 struct execenv exenv;
440 } *bigwad; /* kmem_alloc this behemoth so we don't blow stack */
441 Ehdr *ehdrp;
442 uint_t nshdrs, shstrndx, nphdrs;
443 size_t phdrsize;
444 char *dlnp;
445 char *pathbufp;
446 rlim64_t limit;
447 rlim64_t roundlimit;
448
449 ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
450
451 bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
452 ehdrp = &bigwad->ehdr;
453 dlnp = bigwad->dl_name;
454 pathbufp = bigwad->pathbuf;
455
456 /*
457 * Obtain ELF and program header information.
458 */
459 if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
460 &nphdrs)) != 0 ||
461 (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
462 &phdrsize)) != 0)
463 goto out;
464
465 /*
466 * Prevent executing an ELF file that has no entry point.
467 */
468 if (ehdrp->e_entry == 0) {
469 uprintf("%s: Bad entry point\n", exec_file);
470 goto bad;
471 }
472
473 /*
474 * Put the data model that we're exec-ing to into the args passed to
475 * exec_args(), so it will know what it is copying to on the new stack.
476 * Now that we know whether we are exec-ing a 32-bit or 64-bit
477 * executable, we can set execsz with the appropriate NCARGS.
478 */
479 #ifdef _LP64
480 if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
481 args->to_model = DATAMODEL_ILP32;
482 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
483 } else {
484 args->to_model = DATAMODEL_LP64;
485 if (!args->stk_prot_override) {
486 args->stk_prot &= ~PROT_EXEC;
487 }
488 #if defined(__x86)
489 args->dat_prot &= ~PROT_EXEC;
490 #endif
491 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
492 }
493 #else /* _LP64 */
494 args->to_model = DATAMODEL_ILP32;
495 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
496 #endif /* _LP64 */
497
498 /*
499 * We delay invoking the brand callback until we've figured out what
500 * kind of elf binary we're trying to run, 32-bit or 64-bit. We do this
501 * because now the brand library can just check args->to_model to see if
502 * the target is 32-bit or 64-bit without having to duplicate all the
503 * code above.
504 *
505 * We also give the brand a chance to indicate that based on the ELF
506 * OSABI of the target binary it should become unbranded and optionally
507 * indicate that it should be treated as existing in a specific prefix.
508 *
509 * Note that if a brand opts to go down this route it does not actually
510 * end up being debranded. In other words, future programs that exec
511 * will still be considered for branding unless this escape hatch is
512 * used. Consider the case of lx brand for example. If a user runs
513 * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable
514 * of DTrace that's in /native will take this escape hatch and be run
515 * and interpreted using the normal system call table; however, the
516 * execution of a non-illumos binary in the form of /bin/ls will still
517 * be branded and be subject to all of the normal actions of the brand.
518 *
519 * The level checks associated with brand handling below are used to
520 * prevent a loop since the brand elfexec function typically comes back
521 * through this function. We must check <= here since the nested
522 * handling in the #! interpreter code will increment the level before
523 * calling gexec to run the final elfexec interpreter.
524 */
525 if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
526 (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
527 if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
528 &args->brand_nroot) == B_TRUE) {
529 ASSERT(ehdrp->e_ident[EI_OSABI]);
530 *brand_action = EBA_NATIVE;
531 /* Add one for the trailing '/' in the path */
532 if (args->brand_nroot != NULL)
533 nsize = strlen(args->brand_nroot) + 1;
534 }
535 }
536
537 if ((level <= INTP_MAXDEPTH) &&
538 (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
539 error = BROP(p)->b_elfexec(vp, uap, args,
540 idatap, level + 1, execsz, setid, exec_file, cred,
541 brand_action);
542 goto out;
543 }
544
545 /*
546 * Determine aux size now so that stack can be built
547 * in one shot (except actual copyout of aux image),
548 * determine any non-default stack protections,
549 * and still have this code be machine independent.
550 */
551 const uint_t hsize = ehdrp->e_phentsize;
552 phdrp = (Phdr *)phdrbase;
553 for (i = nphdrs; i > 0; i--) {
554 switch (phdrp->p_type) {
555 case PT_INTERP:
556 hasauxv = hasintp = 1;
557 break;
558 case PT_PHDR:
559 hasu = 1;
560 break;
561 case PT_SUNWSTACK:
562 args->stk_prot = PROT_USER;
563 if (phdrp->p_flags & PF_R)
564 args->stk_prot |= PROT_READ;
565 if (phdrp->p_flags & PF_W)
566 args->stk_prot |= PROT_WRITE;
567 if (phdrp->p_flags & PF_X)
568 args->stk_prot |= PROT_EXEC;
569 break;
570 case PT_LOAD:
571 dataphdrp = phdrp;
572 break;
573 case PT_SUNWCAP:
574 capphdr = phdrp;
575 break;
576 case PT_DYNAMIC:
577 dynamicphdr = phdrp;
578 break;
579 }
580 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
581 }
582
583 if (ehdrp->e_type != ET_EXEC) {
584 dataphdrp = NULL;
585 hasauxv = 1;
586 }
587
588 /* Copy BSS permissions to args->dat_prot */
589 if (dataphdrp != NULL) {
590 args->dat_prot = PROT_USER;
591 if (dataphdrp->p_flags & PF_R)
592 args->dat_prot |= PROT_READ;
593 if (dataphdrp->p_flags & PF_W)
594 args->dat_prot |= PROT_WRITE;
595 if (dataphdrp->p_flags & PF_X)
596 args->dat_prot |= PROT_EXEC;
597 }
598
599 /*
600 * If an aux vector will be required - reserve the space for
601 * it now. This may be increased by exec_args if there are
602 * ISA-specific types (included in __KERN_NAUXV_IMPL).
603 */
604 if (hasauxv) {
605 /*
606 * If an AUX vector is being built - the base AUX
607 * entries are:
608 *
609 * AT_BASE
610 * AT_FLAGS
611 * AT_PAGESZ
612 * AT_RANDOM (added in stk_copyout)
613 * AT_SUN_AUXFLAGS
614 * AT_SUN_HWCAP
615 * AT_SUN_HWCAP2
616 * AT_SUN_PLATFORM (added in stk_copyout)
617 * AT_SUN_EXECNAME (added in stk_copyout)
618 * AT_NULL
619 *
620 * total == 10
621 */
622 if (hasintp && hasu) {
623 /*
624 * Has PT_INTERP & PT_PHDR - the auxvectors that
625 * will be built are:
626 *
627 * AT_PHDR
628 * AT_PHENT
629 * AT_PHNUM
630 * AT_ENTRY
631 * AT_LDDATA
632 *
633 * total = 5
634 */
635 args->auxsize = (10 + 5) * sizeof (aux_entry_t);
636 } else if (hasintp) {
637 /*
638 * Has PT_INTERP but no PT_PHDR
639 *
640 * AT_EXECFD
641 * AT_LDDATA
642 *
643 * total = 2
644 */
645 args->auxsize = (10 + 2) * sizeof (aux_entry_t);
646 } else {
647 args->auxsize = 10 * sizeof (aux_entry_t);
648 }
649 } else {
650 args->auxsize = 0;
651 }
652
653 /*
654 * If this binary is using an emulator, we need to add an
655 * AT_SUN_EMULATOR aux entry.
656 */
657 if (args->emulator != NULL)
658 args->auxsize += sizeof (aux_entry_t);
659
660 /*
661 * If this is a native binary that's been given a modified interpreter
662 * root, inform it that the native system exists at that root.
663 */
664 if (args->brand_nroot != NULL) {
665 args->auxsize += sizeof (aux_entry_t);
666 }
667
668
669 /*
670 * On supported kernels (x86_64) make room in the auxv for the
671 * AT_SUN_COMMPAGE entry. This will go unpopulated on i86xpv systems
672 * which do not provide such functionality.
673 *
674 * Additionally cover the floating point information AT_SUN_FPSIZE and
675 * AT_SUN_FPTYPE.
676 */
677 #if defined(__amd64)
678 args->auxsize += 3 * sizeof (aux_entry_t);
679 #endif /* defined(__amd64) */
680
681 /*
682 * If we have user credentials, we'll supply the following entries:
683 * AT_SUN_UID
684 * AT_SUN_RUID
685 * AT_SUN_GID
686 * AT_SUN_RGID
687 */
688 if (cred != NULL) {
689 args->auxsize += 4 * sizeof (aux_entry_t);
690 }
691
692 if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
693 branded = 1;
694 /*
695 * We will be adding 5 entries to the aux vectors. One for
696 * the brand name and 4 for the brand-specific aux vectors.
697 */
698 args->auxsize += 5 * sizeof (aux_entry_t);
699 }
700
701 /* If the binary has an explicit ASLR flag, it must be honoured */
702 if ((dynamicphdr != NULL) && (dynamicphdr->p_filesz > 0)) {
703 const size_t dynfilesz = dynamicphdr->p_filesz;
704 const size_t dynoffset = dynamicphdr->p_offset;
705 Dyn *dyn, *dp;
706
707 if (dynoffset > MAXOFFSET_T ||
708 dynfilesz > MAXOFFSET_T ||
709 dynoffset + dynfilesz > MAXOFFSET_T) {
710 uprintf("%s: cannot read full .dynamic section\n",
711 exec_file);
712 error = EINVAL;
713 goto out;
714 }
715
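/*
 * The size of the .dynamic section comes from the (untrusted) file, so
 * rather than reading it all at once, walk it in bounded chunks of
 * DYN_STRIDE entries apiece, scanning each chunk for DT_SUNW_ASLR.
 */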
716 #define DYN_STRIDE 100
717 for (i = 0; i < dynfilesz; i += sizeof (*dyn) * DYN_STRIDE) {
718 const size_t remdyns = (dynfilesz - i) / sizeof (*dyn);
719 const size_t ndyns = MIN(DYN_STRIDE, remdyns);
720 const size_t dynsize = ndyns * sizeof (*dyn);
721
722 dyn = kmem_alloc(dynsize, KM_SLEEP);
723
724 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
725 (ssize_t)dynsize, (offset_t)(dynoffset + i),
726 UIO_SYSSPACE, 0, (rlim64_t)0,
727 CRED(), NULL)) != 0) {
728 uprintf("%s: cannot read .dynamic section\n",
729 exec_file);
kmem_free(dyn, dynsize);
730 goto out;
731 }
732
733 for (dp = dyn; dp < (dyn + ndyns); dp++) {
734 if (dp->d_tag == DT_SUNW_ASLR) {
735 if ((error = handle_secflag_dt(p,
736 DT_SUNW_ASLR,
737 dp->d_un.d_val)) != 0) {
738 uprintf("%s: error setting "
739 "security-flag from "
740 "DT_SUNW_ASLR: %d\n",
741 exec_file, error);
kmem_free(dyn, dynsize);
742 goto out;
743 }
744 }
745 }
746
747 kmem_free(dyn, dynsize);
748 }
749 }
750
751 /* Hardware/Software capabilities */
752 if (capphdr != NULL &&
753 (capsize = capphdr->p_filesz) > 0 &&
754 capsize <= 16 * sizeof (*cap)) {
755 const uint_t ncaps = capsize / sizeof (*cap);
756 Cap *cp;
757
758 cap = kmem_alloc(capsize, KM_SLEEP);
759 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
760 (ssize_t)capsize, (offset_t)capphdr->p_offset,
761 UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), NULL)) != 0) {
762 uprintf("%s: Cannot read capabilities section\n",
763 exec_file);
764 goto out;
765 }
766 for (cp = cap; cp < cap + ncaps; cp++) {
767 if (cp->c_tag == CA_SUNW_SF_1 &&
768 (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
769 if (args->to_model == DATAMODEL_LP64)
770 args->addr32 = 1;
771 break;
772 }
773 }
774 }
775
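/*
 * ADDAUX() appends one aux_entry_t at the cursor and advances it, so the
 * cursor's distance from bigwad->elfargs always equals the size of the
 * vector built so far (used to compute postfixsize below).
 */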
776 aux = bigwad->elfargs;
777 /*
778 * Move args to the user's stack.
779 * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM
780 * aux entries.
781 */
782 if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
783 if (error == -1) {
784 error = ENOEXEC;
785 goto bad;
786 }
787 goto out;
788 }
789 /* we're single threaded after this point */
790
791 /*
792 * If this is an ET_DYN executable (shared object),
793 * determine its memory size so that mapelfexec() can load it.
794 */
795 if (ehdrp->e_type == ET_DYN)
796 len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
797 else
798 len = 0;
799
800 dtrphdr = NULL;
801
802 error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
803 &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
804 len, execsz, &brksize);
805 /*
806 * Our uphdr has been dynamically allocated if (and only if) its
807 * program header flags are clear. To avoid leaks, this must be
808 * checked regardless of whether mapelfexec() emitted an error.
809 */
810 dynuphdr = (uphdr != NULL && uphdr->p_flags == 0);
811
812 if (error != 0) {
813 goto bad;
814 }
815
816 if (uphdr != NULL && intphdr == NULL)
817 goto bad;
818
819 if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
820 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
821 goto bad;
822 }
823
824 if (intphdr != NULL) {
825 size_t len;
826 uintptr_t lddata;
827 char *p;
828 struct vnode *nvp;
829
830 dlnsize = intphdr->p_filesz + nsize;
831
832 /*
833 * Make sure none of the component pieces of dlnsize produce
834 * an oversized or zeroed result.
835 */
836 if (intphdr->p_filesz > MAXPATHLEN || dlnsize > MAXPATHLEN ||
837 dlnsize == 0 || dlnsize < intphdr->p_filesz) {
838 goto bad;
839 }
840
841 if (nsize != 0) {
842 bcopy(args->brand_nroot, dlnp, nsize - 1);
843 dlnp[nsize - 1] = '/';
844 }
845
846 /*
847 * Read in "interpreter" pathname.
848 */
849 if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
850 (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset,
851 UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
852 uprintf("%s: Cannot obtain interpreter pathname\n",
853 exec_file);
854 goto bad;
855 }
856
857 if (resid != 0 || dlnp[dlnsize - 1] != '\0')
858 goto bad;
859
860 /*
861 * Search for '$ORIGIN' token in interpreter path.
862 * If found, expand it.
863 */
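/*
 * For example, an interpreter path of "$ORIGIN/ld.so.1" in an
 * executable invoked as /opt/app/bin/prog expands to
 * "/opt/app/bin/ld.so.1".
 */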
864 for (p = dlnp; p = strchr(p, '$'); ) {
865 uint_t len, curlen;
866 char *_ptr;
867
868 if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
869 continue;
870
871 /*
872 * We don't support $ORIGIN on setid programs to close
873 * a potential attack vector.
874 */
875 if ((setid & EXECSETID_SETID) != 0) {
876 error = ENOEXEC;
877 goto bad;
878 }
879
880 curlen = 0;
881 len = p - dlnp - 1;
882 if (len) {
883 bcopy(dlnp, pathbufp, len);
884 curlen += len;
885 }
886 if (_ptr = strrchr(args->pathname, '/')) {
887 len = _ptr - args->pathname;
888 if ((curlen + len) > MAXPATHLEN)
889 break;
890
891 bcopy(args->pathname, &pathbufp[curlen], len);
892 curlen += len;
893 } else {
894 /*
895 * The executable is a basename found in the
896 * current directory, so just substitute
897 * '.' for $ORIGIN.
898 */
899 pathbufp[curlen] = '.';
900 curlen++;
901 }
902 p += ORIGIN_STR_SIZE;
903 len = strlen(p);
904
905 if ((curlen + len) > MAXPATHLEN)
906 break;
907 bcopy(p, &pathbufp[curlen], len);
908 curlen += len;
909 pathbufp[curlen++] = '\0';
910 bcopy(pathbufp, dlnp, curlen);
911 }
912
913 /*
914 * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
915 * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
916 * Just in case /usr is not mounted, change it now.
917 */
918 if (strcmp(dlnp, USR_LIB_RTLD) == 0)
919 dlnp += 4;
920 error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
921 if (error && dlnp != bigwad->dl_name) {
922 /* new kernel, old user-level */
923 error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
924 NULLVPP, &nvp);
925 }
926 if (error) {
927 uprintf("%s: Cannot find %s\n", exec_file, dlnp);
928 goto bad;
929 }
930
931 /*
932 * Setup the "aux" vector.
933 */
934 if (uphdr) {
935 if (ehdrp->e_type == ET_DYN) {
936 /* don't use the first page */
937 bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
938 bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
939 } else {
940 bigwad->exenv.ex_bssbase = bssbase;
941 bigwad->exenv.ex_brkbase = brkbase;
942 }
943 bigwad->exenv.ex_brksize = brksize;
944 bigwad->exenv.ex_magic = elfmagic;
945 bigwad->exenv.ex_vp = vp;
946 setexecenv(&bigwad->exenv);
947
948 ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
949 ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
950 ADDAUX(aux, AT_PHNUM, nphdrs)
951 ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
952 } else {
953 if ((error = execopen(&vp, &fd)) != 0) {
954 VN_RELE(nvp);
955 goto bad;
956 }
957
958 ADDAUX(aux, AT_EXECFD, fd)
959 }
960
961 if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
962 VN_RELE(nvp);
963 uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
964 goto bad;
965 }
966
967 /*
968 * Now obtain the ELF header along with the entire program
969 * header contained in "nvp".
970 */
971 kmem_free(phdrbase, phdrsize);
972 phdrbase = NULL;
973 if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
974 &shstrndx, &nphdrs)) != 0 ||
975 (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
976 &phdrsize)) != 0) {
977 VN_RELE(nvp);
978 uprintf("%s: Cannot read %s\n", exec_file, dlnp);
979 goto bad;
980 }
981
982 /*
983 * Determine memory size of the "interpreter's" loadable
984 * sections. This size is then used to obtain the virtual
985 * address of a hole, in the user's address space, large
986 * enough to map the "interpreter".
987 */
988 if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
989 VN_RELE(nvp);
990 uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
991 goto bad;
992 }
993
994 dtrphdr = NULL;
995
996 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
997 &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
998 execsz, NULL);
999
1000 if (error || junk != NULL) {
1001 VN_RELE(nvp);
1002 uprintf("%s: Cannot map %s\n", exec_file, dlnp);
1003 goto bad;
1004 }
1005
1006 /*
1007 * We use the DTrace program header to initialize the
1008 * architecture-specific user per-LWP location. The dtrace
1009 * fasttrap provider requires ready access to per-LWP scratch
1010 * space. We assume that there is only one such program header
1011 * in the interpreter.
1012 */
1013 if (dtrphdr != NULL &&
1014 dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
1015 VN_RELE(nvp);
1016 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
1017 goto bad;
1018 }
1019
1020 VN_RELE(nvp);
1021 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
1022 }
1023
1024 if (hasauxv) {
1025 int auxf = AF_SUN_HWCAPVERIFY;
1026 #if defined(__amd64)
1027 size_t fpsize;
1028 int fptype;
1029 #endif /* defined(__amd64) */
1030
1031 /*
1032 * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were
1033 * filled in via exec_args()
1034 */
1035 ADDAUX(aux, AT_BASE, voffset)
1036 ADDAUX(aux, AT_FLAGS, at_flags)
1037 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
1038 /*
1039 * Linker flags. (security)
1040 * p_flag not yet set at this time.
1041 * We rely on gexec() to provide us with the information.
1042 * If the application is set-uid but this is not reflected
1043 * in a mismatch between real/effective uids/gids, then
1044 * don't treat this as a set-uid exec. So we care about
1045 * the EXECSETID_UGIDS flag but not the ...SETID flag.
1046 */
1047 if ((setid &= ~EXECSETID_SETID) != 0)
1048 auxf |= AF_SUN_SETUGID;
1049
1050 /*
1051 * If we're running a native process from within a branded
1052 * zone under pfexec then we clear the AF_SUN_SETUGID flag so
1053 * that the native ld.so.1 is able to link with the native
1054 * libraries instead of using the brand libraries that are
1055 * installed in the zone. We only do this for processes
1056 * which we trust because we see they are already running
1057 * under pfexec (where uid != euid). This prevents a
1058 * malicious user within the zone from crafting a wrapper to
1059 * run native suid commands with insecure libraries interposed.
1060 */
1061 if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
1062 (setid &= ~EXECSETID_SETID) != 0))
1063 auxf &= ~AF_SUN_SETUGID;
1064
1065 /*
1066 * Record the user addr of the auxflags aux vector entry
1067 * since brands may optionally want to manipulate this field.
1068 */
1069 args->auxp_auxflags =
1070 (char *)((char *)args->stackend +
1071 ((char *)&aux->a_type -
1072 (char *)bigwad->elfargs));
1073 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
1074
1075 /*
1076 * Record information about the real and effective user and
1077 * group IDs.
1078 */
1079 if (cred != NULL) {
1080 ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
1081 ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
1082 ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
1083 ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
1084 }
1085
1086 /*
1087 * Hardware capability flag word (performance hints)
1088 * Used for choosing faster library routines.
1089 * (Potentially different between 32-bit and 64-bit ABIs)
1090 */
1091 #if defined(_LP64)
1092 if (args->to_model == DATAMODEL_NATIVE) {
1093 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
1094 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
1095 } else {
1096 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
1097 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
1098 }
1099 #else
1100 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
1101 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
1102 #endif
1103 if (branded) {
1104 /*
1105 * Reserve space for the brand-private aux vectors,
1106 * and record the user addr of that space.
1107 */
1108 args->auxp_brand =
1109 (char *)((char *)args->stackend +
1110 ((char *)&aux->a_type -
1111 (char *)bigwad->elfargs));
1112 ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
1113 ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
1114 ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
1115 ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
1116 }
1117
1118 /*
1119 * Add the comm page auxv entry, mapping it in if needed. Also
1120 * take care of the FPU entries.
1121 */
1122 #if defined(__amd64)
1123 if (args->commpage != (uintptr_t)NULL ||
1124 (args->commpage = (uintptr_t)comm_page_mapin()) !=
1125 (uintptr_t)NULL) {
1126 ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
1127 } else {
1128 /*
1129 * If the comm page cannot be mapped, pad out the auxv
1130 * to satisfy later size checks.
1131 */
1132 ADDAUX(aux, AT_NULL, 0)
1133 }
1134
1135 fptype = AT_386_FPINFO_NONE;
1136 fpu_auxv_info(&fptype, &fpsize);
1137 if (fptype != AT_386_FPINFO_NONE) {
1138 ADDAUX(aux, AT_SUN_FPTYPE, fptype)
1139 ADDAUX(aux, AT_SUN_FPSIZE, fpsize)
1140 } else {
1141 ADDAUX(aux, AT_NULL, 0)
1142 ADDAUX(aux, AT_NULL, 0)
1143 }
1144 #endif /* defined(__amd64) */
1145
1146 ADDAUX(aux, AT_NULL, 0)
1147 postfixsize = (uintptr_t)aux - (uintptr_t)bigwad->elfargs;
1148
1149 /*
1150 * We make assumptions above when we determine how many aux
1151 * vector entries we will be adding. However, if we have an
1152 * invalid elf file, it is possible that mapelfexec might
1153 * behave differently (but not return an error), in which case
1154 * the number of aux entries we actually add will be different.
1155 * We detect that now and error out.
1156 */
1157 if (postfixsize != args->auxsize) {
1158 DTRACE_PROBE2(elfexec_badaux, size_t, postfixsize,
1159 size_t, args->auxsize);
1160 goto bad;
1161 }
1162 ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
1163 }
1164
1165 /*
1166 * For the 64-bit kernel, the limit is big enough that rounding it up
1167 * to a page can overflow the 64-bit limit, so we check for btopr()
1168 * overflowing here by comparing it with the unrounded limit in pages.
1169 * If it hasn't overflowed, compare the exec size with the rounded up
1170 * limit in pages. Otherwise, just compare with the unrounded limit.
1171 */
1172 limit = btop(p->p_vmem_ctl);
1173 roundlimit = btopr(p->p_vmem_ctl);
1174 if ((roundlimit > limit && *execsz > roundlimit) ||
1175 (roundlimit < limit && *execsz > limit)) {
1176 mutex_enter(&p->p_lock);
1177 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1178 RCA_SAFE);
1179 mutex_exit(&p->p_lock);
1180 error = ENOMEM;
1181 goto bad;
1182 }
1183
1184 bzero(up->u_auxv, sizeof (up->u_auxv));
1185 up->u_commpagep = args->commpage;
1186 if (postfixsize) {
1187 size_t num_auxv;
1188
1189 /*
1190 * Copy the aux vector to the user stack.
1191 */
1192 error = execpoststack(args, bigwad->elfargs, postfixsize);
1193 if (error)
1194 goto bad;
1195
1196 /*
1197 * Copy auxv to the process's user structure for use by /proc.
1198 * If this is a branded process, the brand's exec routine will
1199 * copy its private entries to the user structure later. It
1200 * relies on the fact that the blank entries are at the end.
1201 */
1202 num_auxv = postfixsize / sizeof (aux_entry_t);
1203 ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
1204 aux = bigwad->elfargs;
1205 for (i = 0; i < num_auxv; i++) {
1206 up->u_auxv[i].a_type = aux[i].a_type;
1207 up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
1208 }
1209 }
1210
1211 /*
1212 * Pass back the starting address so we can set the program counter.
1213 */
1214 args->entry = (uintptr_t)(ehdrp->e_entry + voffset);
1215
1216 if (!uphdr) {
1217 if (ehdrp->e_type == ET_DYN) {
1218 /*
1219 * If we are executing a shared library which doesn't
1220 * have an interpreter (probably ld.so.1) then
1221 * we don't set the brkbase now. Instead we
1222 * delay its setting until the first call
1223 * via grow.c::brk(). This permits ld.so.1 to
1224 * initialize brkbase to the tail of the executable it
1225 * loads (which is where it needs to be).
1226 */
1227 bigwad->exenv.ex_brkbase = (caddr_t)0;
1228 bigwad->exenv.ex_bssbase = (caddr_t)0;
1229 bigwad->exenv.ex_brksize = 0;
1230 } else {
1231 bigwad->exenv.ex_brkbase = brkbase;
1232 bigwad->exenv.ex_bssbase = bssbase;
1233 bigwad->exenv.ex_brksize = brksize;
1234 }
1235 bigwad->exenv.ex_magic = elfmagic;
1236 bigwad->exenv.ex_vp = vp;
1237 setexecenv(&bigwad->exenv);
1238 }
1239
1240 ASSERT(error == 0);
1241 goto out;
1242
1243 bad:
1244 if (fd != -1) /* did we open the a.out yet */
1245 (void) execclose(fd);
1246
1247 psignal(p, SIGKILL);
1248
1249 if (error == 0)
1250 error = ENOEXEC;
1251 out:
1252 if (dynuphdr)
1253 kmem_free(uphdr, sizeof (Phdr));
1254 if (phdrbase != NULL)
1255 kmem_free(phdrbase, phdrsize);
1256 if (cap != NULL)
1257 kmem_free(cap, capsize);
1258 kmem_free(bigwad, sizeof (struct bigwad));
1259 return (error);
1260 }
1261
1262 /*
1263 * Compute the memory size requirement for the ELF file.
1264 */
1265 static size_t
1266 elfsize(const Ehdr *ehdrp, uint_t nphdrs, const caddr_t phdrbase,
1267 uintptr_t *lddata)
1268 {
1269 const Phdr *phdrp = (Phdr *)phdrbase;
1270 const uint_t hsize = ehdrp->e_phentsize;
1271 boolean_t dfirst = B_TRUE;
1272 uintptr_t loaddr = UINTPTR_MAX;
1273 uintptr_t hiaddr = 0;
1274 uint_t i;
1275
1276 for (i = nphdrs; i > 0; i--) {
1277 if (phdrp->p_type == PT_LOAD) {
1278 const uintptr_t lo = phdrp->p_vaddr;
1279 const uintptr_t hi = lo + phdrp->p_memsz;
1280
1281 loaddr = MIN(lo, loaddr);
1282 hiaddr = MAX(hi, hiaddr);
1283
1284 /*
1285 * save the address of the first data segment
1286 * of an object - used for the AT_SUN_LDDATA
1287 * aux entry.
1288 */
1289 if ((lddata != NULL) && dfirst &&
1290 (phdrp->p_flags & PF_W)) {
1291 *lddata = lo;
1292 dfirst = B_FALSE;
1293 }
1294 }
1295 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
1296 }
1297
1298 if (hiaddr <= loaddr) {
1299 /* No non-zero PT_LOAD segment found */
1300 return (0);
1301 }
1302
1303 return (roundup(hiaddr - (loaddr & PAGEMASK), PAGESIZE));
1304 }
1305
1306 /*
1307 * Read in the ELF header and program header table.
1308 * SUSV3 requires:
1309 * ENOEXEC File format is not recognized
1310 * EINVAL Format recognized but execution not supported
1311 */
1312 static int
1313 getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs,
1314 uint_t *shstrndx, uint_t *nphdrs)
1315 {
1316 int error;
1317 ssize_t resid;
1318
1319 /*
1320 * We got here by the first two bytes in ident,
1321 * now read the entire ELF header.
1322 */
1323 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, sizeof (Ehdr),
1324 (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid)) != 0) {
1325 return (error);
1326 }
1327
1328 /*
1329 * Since a separate version is compiled for handling 32-bit and
1330 * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
1331 * doesn't need to be able to deal with 32-bit ELF files.
1332 */
1333 if (resid != 0 ||
1334 ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
1335 ehdr->e_ident[EI_MAG3] != ELFMAG3) {
1336 return (ENOEXEC);
1337 }
1338
1339 if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
1340 #if defined(_ILP32) || defined(_ELF32_COMPAT)
1341 ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
1342 #else
1343 ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
1344 #endif
1345 !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
1346 ehdr->e_flags)) {
1347 return (EINVAL);
1348 }
1349
1350 *nshdrs = ehdr->e_shnum;
1351 *shstrndx = ehdr->e_shstrndx;
1352 *nphdrs = ehdr->e_phnum;
1353
1354 /*
1355 * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
1356 * to read in the section header at index zero to access the true
1357 * values for those fields.
1358 */
1359 if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
1360 *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
1361 Shdr shdr;
1362
1363 if (ehdr->e_shoff == 0)
1364 return (EINVAL);
1365
1366 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
1367 sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
1368 (rlim64_t)0, credp, NULL)) != 0)
1369 return (error);
1370
1371 if (*nshdrs == 0)
1372 *nshdrs = shdr.sh_size;
1373 if (*shstrndx == SHN_XINDEX)
1374 *shstrndx = shdr.sh_link;
1375 if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
1376 *nphdrs = shdr.sh_info;
1377 }
1378
1379 return (0);
1380 }
1381
1382 /*
1383 * We use members through p_flags on 32-bit files and p_memsz on 64-bit files,
1384 * so e_phentsize must be at least large enough to include those members.
1385 */
1386 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1387 #define MINPHENTSZ (offsetof(Phdr, p_flags) + \
1388 sizeof (((Phdr *)NULL)->p_flags))
1389 #else
1390 #define MINPHENTSZ (offsetof(Phdr, p_memsz) + \
1391 sizeof (((Phdr *)NULL)->p_memsz))
1392 #endif
1393
1394 static int
1395 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nphdrs,
1396 caddr_t *phbasep, size_t *phsizep)
1397 {
1398 int err;
1399
1400 /*
1401 * Ensure that e_phentsize is large enough for required fields to be
1402 * accessible and will maintain 4-byte alignment.
1403 */
1404 if (ehdr->e_phentsize < MINPHENTSZ || (ehdr->e_phentsize & 3))
1405 return (EINVAL);
1406
1407 *phsizep = nphdrs * ehdr->e_phentsize;
1408
1409 if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1410 if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1411 return (ENOMEM);
1412 } else {
1413 *phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1414 }
1415
1416 if ((err = vn_rdwr(UIO_READ, vp, *phbasep, (ssize_t)*phsizep,
1417 (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1418 credp, NULL)) != 0) {
1419 kmem_free(*phbasep, *phsizep);
1420 *phbasep = NULL;
1421 return (err);
1422 }
1423
1424 return (0);
1425 }
1426
1427 #define MINSHDRSZ (offsetof(Shdr, sh_entsize) + \
1428 sizeof (((Shdr *)NULL)->sh_entsize))
1429
1430 static int
1431 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nshdrs,
1432 uint_t shstrndx, caddr_t *shbasep, size_t *shsizep, char **shstrbasep,
1433 size_t *shstrsizep)
1434 {
1435 int err;
1436 Shdr *shdr;
1437
1438 /*
1439 * Since we're going to be using e_shentsize to iterate down the
1440 * array of section headers, it must be 4-byte aligned or else
1441 * we might cause a misaligned access. We use all members through
1442 * sh_entsize (on both 32- and 64-bit ELF files) so e_shentsize
1443 * must be at least large enough to include that member. The index
1444 * of the string table section must also be valid.
1445 */
1446 if (ehdr->e_shentsize < MINSHDRSZ || (ehdr->e_shentsize & 3) ||
1447 nshdrs == 0 || shstrndx >= nshdrs)
1448 return (EINVAL);
1449
1450 *shsizep = nshdrs * ehdr->e_shentsize;
1451
1452 if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1453 if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1454 return (ENOMEM);
1455 } else {
1456 *shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1457 }
1458
1459 if ((err = vn_rdwr(UIO_READ, vp, *shbasep, (ssize_t)*shsizep,
1460 (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1461 credp, NULL)) != 0) {
1462 kmem_free(*shbasep, *shsizep);
1463 return (err);
1464 }
1465
1466 /*
1467 * Grab the section string table. Walking through the shdrs is
1468 * pointless if their names cannot be interrogated.
1469 */
1470 shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1471 if ((*shstrsizep = shdr->sh_size) == 0) {
1472 kmem_free(*shbasep, *shsizep);
1473 return (EINVAL);
1474 }
1475
1476 if (*shstrsizep > elf_shstrtab_max) {
1477 if ((*shstrbasep = kmem_alloc(*shstrsizep,
1478 KM_NOSLEEP)) == NULL) {
1479 kmem_free(*shbasep, *shsizep);
1480 return (ENOMEM);
1481 }
1482 } else {
1483 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1484 }
1485
1486 if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, (ssize_t)*shstrsizep,
1487 (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1488 credp, NULL)) != 0) {
1489 kmem_free(*shbasep, *shsizep);
1490 kmem_free(*shstrbasep, *shstrsizep);
1491 return (err);
1492 }
1493
1494 /*
1495 * Make sure the strtab is null-terminated to make sure we
1496 * don't run off the end of the table.
1497 */
1498 (*shstrbasep)[*shstrsizep - 1] = '\0';
1499
1500 return (0);
1501 }
1502
1503
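/*
 * Read the ELF header and full program header table from vp; a convenience
 * wrapper around getelfhead() and getelfphdr() for external callers.  On
 * success the caller must free *phbasep (of size *phsizep).
 */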
1504 int
1505 elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, uint_t *nphdrs,
1506 caddr_t *phbasep, size_t *phsizep)
1507 {
1508 int error;
1509 uint_t nshdrs, shstrndx;
1510
1511 if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
1512 nphdrs)) != 0 ||
1513 (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
1514 phsizep)) != 0) {
1515 return (error);
1516 }
1517 return (0);
1518 }
1519
1520
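/*
 * Map the PT_LOAD segments described by the program header table into the
 * current address space.  For ET_DYN objects a load address is chosen (or,
 * for objects with a non-zero first p_vaddr, honored when that address range
 * is available) and the resulting bias is returned via *voffset; the Phdr
 * out-parameters are filled in as the corresponding headers are seen.
 */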
1521 static int
1522 mapelfexec(
1523 vnode_t *vp,
1524 Ehdr *ehdr,
1525 uint_t nphdrs,
1526 caddr_t phdrbase,
1527 Phdr **uphdr,
1528 Phdr **intphdr,
1529 Phdr **stphdr,
1530 Phdr **dtphdr,
1531 Phdr *dataphdrp,
1532 caddr_t *bssbase,
1533 caddr_t *brkbase,
1534 intptr_t *voffset,
1535 uintptr_t *minaddrp,
1536 size_t len,
1537 size_t *execsz,
1538 size_t *brksize)
1539 {
1540 Phdr *phdr;
1541 int error = 0, page, prot, lastprot = 0;
1542 caddr_t addr = NULL;
1543 caddr_t minaddr = (caddr_t)UINTPTR_MAX;
1544 uint_t i;
1545 size_t zfodsz, memsz;
1546 boolean_t ptload = B_FALSE;
1547 off_t offset;
1548 const uint_t hsize = ehdr->e_phentsize;
1549 uintptr_t lastaddr = 0;
1550 extern int use_brk_lpg;
1551
1552 if (ehdr->e_type == ET_DYN) {
1553 caddr_t vaddr;
1554 secflagset_t flags = 0;
1555 /*
1556 * Obtain the virtual address of a hole in the
1557 * address space to map the "interpreter".
1558 */
1559 if (secflag_enabled(curproc, PROC_SEC_ASLR))
1560 flags |= _MAP_RANDOMIZE;
1561
1562 map_addr(&addr, len, (offset_t)0, 1, flags);
1563 if (addr == NULL)
1564 return (ENOMEM);
1565
1566 /*
1567 * Despite the fact that mmapobj(2) refuses to load them, we
1568 * need to support executing ET_DYN objects that have a
1569 * non-NULL p_vaddr. When found in the wild, these objects
1570 * are likely to be due to an old (and largely obviated) Linux
1571 * facility, prelink(8), that rewrites shared objects to
1572 * prefer specific (disjoint) virtual address ranges. (Yes,
1573 * this is putatively for performance -- and yes, it has
1574 * limited applicability, many edge conditions and grisly
1575 * failure modes; even for Linux, it's insane.) As ELF
1576 * mandates that the PT_LOAD segments be in p_vaddr order, we
1577 * find the lowest p_vaddr by finding the first PT_LOAD
1578 * segment.
1579 */
1580 phdr = (Phdr *)phdrbase;
1581 for (i = nphdrs; i > 0; i--) {
1582 if (phdr->p_type == PT_LOAD) {
1583 addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
1584 break;
1585 }
1586 phdr = (Phdr *)((caddr_t)phdr + hsize);
1587 }
1588
1589 /*
1590 * We have a non-zero p_vaddr in the first PT_LOAD segment --
1591 * presumably because we're directly executing a prelink(8)'d
1592 * ld-linux.so. While we could correctly execute such an
1593 * object without locating it at its desired p_vaddr (it is,
1594 * after all, still relocatable), our inner antiquarian
1595 * derives a perverse pleasure in accommodating the steampunk
1596 * prelink(8) contraption -- goggles on!
1597 */
1598 if ((vaddr = addr) != NULL) {
1599 if (as_gap(curproc->p_as, len, &addr, &len,
1600 AH_LO, NULL) == -1 || addr != vaddr) {
1601 addr = NULL;
1602 }
1603 }
1604
1605 if (addr == NULL) {
1606 /*
1607 * We either have a NULL p_vaddr (the common case, by
1608 * many orders of magnitude) or we have a non-NULL
1609 * p_vaddr and we were unable to obtain the specified
1610 * VA range (presumably because it's an illegal
1611 * address). Either way, obtain an address in which
1612 * to map the interpreter.
1613 */
1614 map_addr(&addr, len, (offset_t)0, 1, 0);
1615 if (addr == NULL)
1616 return (ENOMEM);
1617 }
1618
1619 /*
1620 * Our voffset is the difference between where we landed and
1621 * where we wanted to be.
1622 */
1623 *voffset = (uintptr_t)addr - (uintptr_t)vaddr;
1624 } else {
1625 *voffset = 0;
1626 }
1627
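/*
 * Walk the program headers, mapping each PT_LOAD at p_vaddr + *voffset
 * and recording the PT_INTERP, PT_PHDR, PT_SHLIB, and PT_SUNWDTRACE
 * headers for the caller.
 */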
1628 phdr = (Phdr *)phdrbase;
1629 for (i = nphdrs; i > 0; i--) {
1630 switch (phdr->p_type) {
1631 case PT_LOAD:
1632 ptload = B_TRUE;
1633 prot = PROT_USER;
1634 if (phdr->p_flags & PF_R)
1635 prot |= PROT_READ;
1636 if (phdr->p_flags & PF_W)
1637 prot |= PROT_WRITE;
1638 if (phdr->p_flags & PF_X)
1639 prot |= PROT_EXEC;
1640
1641 addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1642
1643 if ((*intphdr != NULL) && uphdr != NULL &&
1644 (*uphdr == NULL)) {
1645 /*
1646 * The PT_PHDR program header is, strictly
1647 * speaking, optional. If we find that this
1648 * is missing, we will determine the location
1649 * of the program headers based on the address
1650 * of the lowest PT_LOAD segment (namely, this
1651 * one): we subtract the p_offset to get to
1652 * the ELF header and then add back the program
1653 * header offset to get to the program headers.
1654 * We then cons up a Phdr that corresponds to
1655 * the (missing) PT_PHDR, setting the flags
1656 * to 0 to denote that this is artificial and
1657 * should (must) be freed by the caller.
1658 */
1659 Phdr *cons;
1660
1661 cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
1662
1663 cons->p_flags = 0;
1664 cons->p_type = PT_PHDR;
1665 cons->p_vaddr = ((uintptr_t)addr -
1666 phdr->p_offset) + ehdr->e_phoff;
1667
1668 *uphdr = cons;
1669 }
1670
1671 /*
1672 * The ELF spec dictates that p_filesz may not be
1673 * larger than p_memsz in PT_LOAD segments.
1674 */
1675 if (phdr->p_filesz > phdr->p_memsz) {
1676 error = EINVAL;
1677 goto bad;
1678 }
1679
1680 /*
1681 * Keep track of the segment with the lowest starting
1682 * address.
1683 */
1684 if (addr < minaddr)
1685 minaddr = addr;
1686
1687 /*
1688 * Segments need not correspond to page boundaries:
1689 * they are permitted to share a page. If two PT_LOAD
1690 * segments share the same page, and the permissions
1691 * of the segments differ, the behavior is historically
1692 * that the permissions of the latter segment are used
1693 * for the page that the two segments share. This is
1694 * also historically a non-issue: binaries generated
1695 * by most anything will make sure that two PT_LOAD
1696 * segments with differing permissions don't actually
1697 * share any pages. However, there exist some crazy
1698 * things out there (including at least an obscure
1699 * Portuguese teaching language called G-Portugol) that
1700 * actually do the wrong thing and expect it to work:
1701 * they have a segment with execute permission share
1702 * a page with a subsequent segment that does not
1703 * have execute permissions and expect the resulting
1704 * shared page to in fact be executable. To accommodate
1705 * such broken link editors, we take advantage of a
1706 * latitude explicitly granted to the loader: it is
1707 * permitted to make _any_ PT_LOAD segment executable
1708 * (provided that it is readable or writable). If we
1709 * see that we're sharing a page and that the previous
1710 * page was executable, we will add execute permissions
1711 * to our segment.
1712 */
1713 if (btop(lastaddr) == btop((uintptr_t)addr) &&
1714 (phdr->p_flags & (PF_R | PF_W)) &&
1715 (lastprot & PROT_EXEC)) {
1716 prot |= PROT_EXEC;
1717 }
1718
1719 lastaddr = (uintptr_t)addr + phdr->p_filesz;
1720 lastprot = prot;
1721
1722 zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1723
1724 offset = phdr->p_offset;
1725 if (((uintptr_t)offset & PAGEOFFSET) ==
1726 ((uintptr_t)addr & PAGEOFFSET) &&
1727 (!(vp->v_flag & VNOMAP))) {
1728 page = 1;
1729 } else {
1730 page = 0;
1731 }
1732
1733 /*
1734 * Set the heap pagesize for OOB when the bss size
1735 * is known and use_brk_lpg is not 0.
1736 */
1737 if (brksize != NULL && use_brk_lpg &&
1738 zfodsz != 0 && phdr == dataphdrp &&
1739 (prot & PROT_WRITE)) {
1740 const size_t tlen = P2NPHASE((uintptr_t)addr +
1741 phdr->p_filesz, PAGESIZE);
1742
1743 if (zfodsz > tlen) {
1744 const caddr_t taddr = addr +
1745 phdr->p_filesz + tlen;
1746
1747 /*
1748 * Since a hole in the AS large enough
1749 * for this object as calculated by
1750 * elfsize() is available, we do not
1751 * need to fear overflow for 'taddr'.
1752 */
1753 curproc->p_brkpageszc =
1754 page_szc(map_pgsz(MAPPGSZ_HEAP,
1755 curproc, taddr, zfodsz - tlen, 0));
1756 }
1757 }
1758
1759 if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1760 (prot & PROT_WRITE)) {
1761 uint_t szc = curproc->p_brkpageszc;
1762 size_t pgsz = page_get_pagesize(szc);
1763 caddr_t ebss = addr + phdr->p_memsz;
1764 /*
1765 * If we need extra space to keep the BSS an
1766 * integral number of pages in size, some of
1767 * that space may fall beyond p_brkbase, so we
1768 * need to set p_brksize to account for it
1769 * being (logically) part of the brk.
1770 */
1771 size_t extra_zfodsz;
1772
1773 ASSERT(pgsz > PAGESIZE);
1774
1775 extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1776
1777 if (error = execmap(vp, addr, phdr->p_filesz,
1778 zfodsz + extra_zfodsz, phdr->p_offset,
1779 prot, page, szc))
1780 goto bad;
1781 if (brksize != NULL)
1782 *brksize = extra_zfodsz;
1783 } else {
1784 if (error = execmap(vp, addr, phdr->p_filesz,
1785 zfodsz, phdr->p_offset, prot, page, 0))
1786 goto bad;
1787 }
1788
1789 if (bssbase != NULL && addr >= *bssbase &&
1790 phdr == dataphdrp) {
1791 *bssbase = addr + phdr->p_filesz;
1792 }
1793 if (brkbase != NULL && addr >= *brkbase) {
1794 *brkbase = addr + phdr->p_memsz;
1795 }
1796
1797 memsz = btopr(phdr->p_memsz);
1798 if ((*execsz + memsz) < *execsz) {
1799 error = ENOMEM;
1800 goto bad;
1801 }
1802 *execsz += memsz;
1803 break;
1804
1805 case PT_INTERP:
1806 /*
1807 * The ELF specification is unequivocal about the
1808 * PT_INTERP program header with respect to any PT_LOAD
1809 * program header: "If it is present, it must precede
1810 * any loadable segment entry." Linux, however, makes
1811 * no attempt to enforce this -- which has allowed some
1812 * binary editing tools to get away with generating
1813 * invalid ELF binaries in the respect that PT_INTERP
1814 * occurs after the first PT_LOAD program header. This
1815 * is unfortunate (and of course, disappointing) but
1816 * it's no worse than that: there is no reason that we
1817 * can't process the PT_INTERP entry (if present) after
1818 * one or more PT_LOAD entries. We therefore
1819 * deliberately do not check ptload here and always
1820 * store intphdr to be the PT_INTERP program header.
1821 */
1822 *intphdr = phdr;
1823 break;
1824
1825 case PT_SHLIB:
1826 *stphdr = phdr;
1827 break;
1828
1829 case PT_PHDR:
1830 if (ptload || phdr->p_flags == 0)
1831 goto bad;
1832
1833 if (uphdr != NULL)
1834 *uphdr = phdr;
1835
1836 break;
1837
1838 case PT_NULL:
1839 case PT_DYNAMIC:
1840 case PT_NOTE:
1841 break;
1842
1843 case PT_SUNWDTRACE:
1844 if (dtphdr != NULL)
1845 *dtphdr = phdr;
1846 break;
1847
1848 default:
1849 break;
1850 }
1851 phdr = (Phdr *)((caddr_t)phdr + hsize);
1852 }
1853
1854 if (minaddrp != NULL) {
1855 ASSERT(minaddr != (caddr_t)UINTPTR_MAX);
1856 *minaddrp = (uintptr_t)minaddr;
1857 }
1858
1859 if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
1860 size_t off;
1861 uintptr_t base = (uintptr_t)*brkbase;
1862 uintptr_t oend = base + *brksize;
1863
1864 ASSERT(ISP2(aslr_max_brk_skew));
1865
1866 (void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1867 base += P2PHASE(off, aslr_max_brk_skew);
1868 base = P2ROUNDUP(base, PAGESIZE);
1869 *brkbase = (caddr_t)base;
1870 /*
1871 * Above, we set *brksize to account for the possibility we
1872 * had to grow the 'brk' in padding out the BSS to a page
1873 * boundary.
1874 *
1875 * We now need to adjust that based on where we now are
1876 * actually putting the brk.
1877 */
1878 if (oend > base)
1879 *brksize = oend - base;
1880 else
1881 *brksize = 0;
1882 }
1883
1884 return (0);
1885 bad:
1886 if (error == 0)
1887 error = EINVAL;
1888 return (error);
1889 }
1890
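/*
 * Append one "CORE" note (header, name, then the descriptor padded out to
 * a Word boundary) to the core file at *offsetp, advancing *offsetp past
 * the bytes written.
 */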
1891 int
1892 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1893 rlim64_t rlimit, cred_t *credp)
1894 {
1895 Note note;
1896 int error;
1897
1898 bzero(&note, sizeof (note));
1899 bcopy("CORE", note.name, 4);
1900 note.nhdr.n_type = type;
1901 /*
1902 * The System V ABI states that n_namesz must be the length of the
1903 * string that follows the Nhdr structure including the terminating
1904 * null. The ABI also specifies that sufficient padding should be
1905 * included so that the description that follows the name string
1906 * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1907 * respectively. However, since this change was not made correctly
1908 * at the time of the 64-bit port, both 32- and 64-bit binaries'
1909 * descriptions are only guaranteed to begin on a 4-byte boundary.
1910 */
1911 note.nhdr.n_namesz = 5;
1912 note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1913
1914 if (error = core_write(vp, UIO_SYSSPACE, *offsetp, &note,
1915 sizeof (note), rlimit, credp))
1916 return (error);
1917
1918 *offsetp += sizeof (note);
1919
1920 if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1921 note.nhdr.n_descsz, rlimit, credp))
1922 return (error);
1923
1924 *offsetp += note.nhdr.n_descsz;
1925 return (0);
1926 }
1927
1928
1929 /*
1930 * Copy the section data from one vnode to the section of another vnode.
1931 */
1932 static void
1933 elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst)
1934 {
1935 size_t n = src->sh_size;
1936 u_offset_t off = 0;
1937 const u_offset_t soff = src->sh_offset;
1938 const u_offset_t doff = ctx->ecc_doffset;
1939 void *buf = ctx->ecc_buf;
1940 vnode_t *dst_vp = ctx->ecc_vp;
1941 cred_t *credp = ctx->ecc_credp;
1942
1943 /* Protect the copy loop below from overflow on the offsets */
1944 if (n > OFF_MAX || (n + soff) > OFF_MAX || (n + doff) > OFF_MAX ||
1945 (n + soff) < n || (n + doff) < n) {
1946 dst->sh_size = 0;
1947 dst->sh_offset = 0;
1948 return;
1949 }
1950
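/*
 * Copy in chunks of at most ecc_bufsz bytes; a short read (nonzero
 * resid) simply shortens the write and advances by the bytes moved.
 */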
1951 while (n != 0) {
1952 const size_t len = MIN(ctx->ecc_bufsz, n);
1953 ssize_t resid;
1954
1955 if (vn_rdwr(UIO_READ, src_vp, buf, (ssize_t)len,
1956 (offset_t)(soff + off),
1957 UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
1958 resid >= len || resid < 0 ||
1959 core_write(dst_vp, UIO_SYSSPACE, (offset_t)(doff + off),
1960 buf, len - resid, ctx->ecc_rlimit, credp) != 0) {
1961 dst->sh_size = 0;
1962 dst->sh_offset = 0;
1963 return;
1964 }
1965
1966 ASSERT(n >= len - resid);
1967
1968 n -= len - resid;
1969 off += len - resid;
1970 }
1971
1972 ctx->ecc_doffset += src->sh_size;
1973 }
1974
1975 /*
1976 * Walk sections for a given ELF object, counting (or copying) those of
1977 * interest (CTF, symtab, strtab).
1978 */
1979 static uint_t
1980 elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr,
1981 Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab)
1982 {
1983 Ehdr ehdr;
1984 const core_content_t content = ctx->ecc_content;
1985 cred_t *credp = ctx->ecc_credp;
1986 Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL;
1987 uintptr_t off = 0;
1988 uint_t nshdrs, shstrndx, nphdrs, count = 0;
1989 u_offset_t *doffp = &ctx->ecc_doffset;
1990 boolean_t ctf_link = B_FALSE;
1991 caddr_t shbase;
1992 size_t shsize, shstrsize;
1993 char *shstrbase;
1994
1995 if ((content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) == 0) {
1996 return (0);
1997 }
1998
1999 if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx, &nphdrs) != 0 ||
2000 getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx, &shbase, &shsize,
2001 &shstrbase, &shstrsize) != 0) {
2002 return (0);
2003 }
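/*
 * Failure to read this object's ELF or section headers is not fatal
 * to the dump; the object simply contributes no sections.
 */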
2004
2005 /* Starting at index 1 skips SHT_NULL, which is expected at index 0 */
2006 off = ehdr.e_shentsize;
2007 for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) {
2008 Shdr *shdr, *symchk = NULL, *strchk;
2009 const char *name;
2010
2011 shdr = (Shdr *)(shbase + off);
2012 if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL)
2013 continue;
2014
2015 name = shstrbase + shdr->sh_name;
2016
2017 if (ctf == NULL &&
2018 (content & CC_CONTENT_CTF) != 0 &&
2019 strcmp(name, shstrtab_data[STR_CTF]) == 0) {
2020 ctf = shdr;
2021 if (ctf->sh_link != 0 && ctf->sh_link < nshdrs) {
2022 /* check linked symtab below */
2023 symchk = (Shdr *)(shbase +
2024 shdr->sh_link * ehdr.e_shentsize);
2025 ctf_link = B_TRUE;
2026 } else {
2027 continue;
2028 }
2029 } else if (symtab == NULL &&
2030 (content & CC_CONTENT_SYMTAB) != 0 &&
2031 strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) {
2032 symchk = shdr;
2033 } else {
2034 continue;
2035 }
2036
2037 ASSERT(symchk != NULL);
2038 if ((symchk->sh_type != SHT_DYNSYM &&
2039 symchk->sh_type != SHT_SYMTAB) ||
2040 symchk->sh_link == 0 || symchk->sh_link >= nshdrs) {
2041 ctf_link = B_FALSE;
2042 continue;
2043 }
2044 strchk = (Shdr *)(shbase + symchk->sh_link * ehdr.e_shentsize);
2045 if (strchk->sh_type != SHT_STRTAB) {
2046 ctf_link = B_FALSE;
2047 continue;
2048 }
2049 symtab = symchk;
2050 strtab = strchk;
2051
2052 if (symtab != NULL && ctf != NULL) {
2053 /* No other shdrs are of interest at this point */
2054 break;
2055 }
2056 }
2057
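/*
 * A CTF section contributes one shdr; a symtab contributes two, the
 * symtab itself plus its associated strtab.
 */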
2058 if (ctf != NULL)
2059 count += 1;
2060 if (symtab != NULL)
2061 count += 2;
2062 if (v == NULL || count == 0 || count > remain) {
2063 count = MIN(count, remain);
2064 goto done;
2065 }
2066
2067 /* output CTF section */
2068 if (ctf != NULL) {
2069 elf_ctx_resize_scratch(ctx, ctf->sh_size);
2070
2071 v[idx].sh_name = shstrtab_ndx(shstrtab, STR_CTF);
2072 v[idx].sh_addr = (Addr)(uintptr_t)saddr;
2073 v[idx].sh_type = SHT_PROGBITS;
2074 v[idx].sh_addralign = 4;
2075 *doffp = roundup(*doffp, v[idx].sh_addralign);
2076 v[idx].sh_offset = *doffp;
2077 v[idx].sh_size = ctf->sh_size;
2078
2079 if (ctf_link) {
2080 /*
2081 * The linked symtab (and strtab) will be output
2082 * immediately after this CTF section. Its shdr index
2083 * directly follows this one.
2084 */
2085 v[idx].sh_link = idx + 1;
2086 ASSERT(symtab != NULL);
2087 } else {
2088 v[idx].sh_link = 0;
2089 }
2090 elf_copy_scn(ctx, ctf, mvp, &v[idx]);
2091 idx++;
2092 }
2093
2094 /* output SYMTAB/STRTAB sections */
2095 if (symtab != NULL) {
2096 uint_t symtab_name, strtab_name;
2097
2098 elf_ctx_resize_scratch(ctx,
2099 MAX(symtab->sh_size, strtab->sh_size));
2100
2101 if (symtab->sh_type == SHT_DYNSYM) {
2102 symtab_name = shstrtab_ndx(shstrtab, STR_DYNSYM);
2103 strtab_name = shstrtab_ndx(shstrtab, STR_DYNSTR);
2104 } else {
2105 symtab_name = shstrtab_ndx(shstrtab, STR_SYMTAB);
2106 strtab_name = shstrtab_ndx(shstrtab, STR_STRTAB);
2107 }
2108
2109 v[idx].sh_name = symtab_name;
2110 v[idx].sh_type = symtab->sh_type;
2111 v[idx].sh_addr = symtab->sh_addr;
2112 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
2113 v[idx].sh_addr += (Addr)(uintptr_t)saddr;
2114 v[idx].sh_addralign = symtab->sh_addralign;
2115 *doffp = roundup(*doffp, v[idx].sh_addralign);
2116 v[idx].sh_offset = *doffp;
2117 v[idx].sh_size = symtab->sh_size;
2118 v[idx].sh_link = idx + 1;
2119 v[idx].sh_entsize = symtab->sh_entsize;
2120 v[idx].sh_info = symtab->sh_info;
2121
2122 elf_copy_scn(ctx, symtab, mvp, &v[idx]);
2123 idx++;
2124
2125 v[idx].sh_name = strtab_name;
2126 v[idx].sh_type = SHT_STRTAB;
2127 v[idx].sh_flags = SHF_STRINGS;
2128 v[idx].sh_addr = strtab->sh_addr;
2129 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
2130 v[idx].sh_addr += (Addr)(uintptr_t)saddr;
2131 v[idx].sh_addralign = strtab->sh_addralign;
2132 *doffp = roundup(*doffp, v[idx].sh_addralign);
2133 v[idx].sh_offset = *doffp;
2134 v[idx].sh_size = strtab->sh_size;
2135
2136 elf_copy_scn(ctx, strtab, mvp, &v[idx]);
2137 idx++;
2138 }
2139
2140 done:
2141 kmem_free(shstrbase, shstrsize);
2142 kmem_free(shbase, shsize);
2143 return (count);
2144 }
2145
2146 /*
2147 * Walk mappings in the process address space, examining those which
2148 * correspond to loaded objects. This is called twice from elfcore: once to
2149 * simply count relevant sections, and again later to copy those sections
2150 * once an adequate buffer has been allocated for the shdr details.
2151 */
2152 static int
2153 elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp)
2154 {
2155 vnode_t *lastvp = NULL;
2156 struct seg *seg;
2157 uint_t idx = 0, remain;
2158 shstrtab_t shstrtab;
2159 struct as *as = ctx->ecc_p->p_as;
2160 int error = 0;
2161
2162 ASSERT(AS_WRITE_HELD(as));
2163
2164 if (v != NULL) {
2165 ASSERT(nv != 0);
2166
2167 shstrtab_init(&shstrtab);
2168 remain = nv;
2169 } else {
2170 ASSERT(nv == 0);
2171
2172 /*
2173 * The shdrs are being counted rather than output into a
2174 * buffer. Leave room for two entries: the SHT_NULL at
2175 * index 0 and the shstrtab at the end.
2176 */
2177 remain = UINT_MAX - 2;
2178 }
2179
2180 /* Per the ELF spec, shdr index 0 is reserved. */
2181 idx = 1;
2182 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2183 vnode_t *mvp;
2184 void *tmp = NULL;
2185 caddr_t saddr = seg->s_base, naddr, eaddr;
2186 size_t segsize;
2187 uint_t count, prot;
2188
2189 /*
2190 * Since we're just looking for text segments of load
2191 * objects, we only care about the protection bits; we don't
2192 * care about the actual size of the segment so we use the
2193 * reserved size. If the segment's size is zero, there's
2194 * something fishy going on so we ignore this segment.
2195 */
2196 if (seg->s_ops != &segvn_ops ||
2197 SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2198 mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
2199 (segsize = pr_getsegsize(seg, 1)) == 0)
2200 continue;
2201
2202 eaddr = saddr + segsize;
2203 prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
2204 pr_getprot_done(&tmp);
2205
2206 /*
2207 * Skip this segment unless the protection bits look like
2208 * what we'd expect for a text segment.
2209 */
2210 if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
2211 continue;
2212
2213 count = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain,
2214 &shstrtab);
2215
2216 ASSERT(count <= remain);
2217 ASSERT(v == NULL || (idx + count) < nv);
2218
2219 remain -= count;
2220 idx += count;
2221 lastvp = mvp;
2222 }
2223
2224 if (v == NULL) {
2225 if (idx == 1) {
2226 *nshdrsp = 0;
2227 } else {
2228 /* Include room for the shstrtab at the end */
2229 *nshdrsp = idx + 1;
2230 }
2231 return (0);
2232 }
2233
2234 if (idx != nv - 1) {
2235 cmn_err(CE_WARN, "elfcore: core dump failed for "
2236 "process %d; address space is changing",
2237 ctx->ecc_p->p_pid);
2238 return (EIO);
2239 }
2240
2241 v[idx].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB);
2242 v[idx].sh_size = shstrtab_size(&shstrtab);
2243 v[idx].sh_addralign = 1;
2244 v[idx].sh_offset = ctx->ecc_doffset;
2245 v[idx].sh_flags = SHF_STRINGS;
2246 v[idx].sh_type = SHT_STRTAB;
2247
2248 elf_ctx_resize_scratch(ctx, v[idx].sh_size);
2249 VERIFY3U(ctx->ecc_bufsz, >=, v[idx].sh_size);
2250 shstrtab_dump(&shstrtab, ctx->ecc_buf);
2251
2252 error = core_write(ctx->ecc_vp, UIO_SYSSPACE, ctx->ecc_doffset,
2253 ctx->ecc_buf, v[idx].sh_size, ctx->ecc_rlimit, ctx->ecc_credp);
2254 if (error == 0) {
2255 ctx->ecc_doffset += v[idx].sh_size;
2256 }
2257
2258 return (error);
2259 }
2260
2261 int
2262 elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
2263 core_content_t content)
2264 {
2265 u_offset_t poffset, soffset, doffset;
2266 int error;
2267 uint_t i, nphdrs, nshdrs;
2268 struct seg *seg;
2269 struct as *as = p->p_as;
2270 void *bigwad, *zeropg = NULL;
2271 size_t bigsize, phdrsz, shdrsz;
2272 Ehdr *ehdr;
2273 Phdr *phdr;
2274 Shdr shdr0;
2275 caddr_t brkbase, stkbase;
2276 size_t brksize, stksize;
2277 boolean_t overflowed = B_FALSE, retried = B_FALSE;
2278 klwp_t *lwp = ttolwp(curthread);
2279 elf_core_ctx_t ctx = {
2280 .ecc_vp = vp,
2281 .ecc_p = p,
2282 .ecc_credp = credp,
2283 .ecc_rlimit = rlimit,
2284 .ecc_content = content,
2285 .ecc_doffset = 0,
2286 .ecc_buf = NULL,
2287 .ecc_bufsz = 0
2288 };
2289
2290 top:
2291 /*
2292 * Make sure we have everything we need (registers, etc.).
2293 * All other lwps have already stopped and are in an orderly state.
2294 */
2295 ASSERT(p == ttoproc(curthread));
2296 prstop(0, 0);
2297
2298 AS_LOCK_ENTER(as, RW_WRITER);
2299 nphdrs = prnsegs(as, 0) + 2; /* two CORE note sections */
2300
2301 /*
2302 * Count the number of section headers we're going to need.
2303 */
2304 nshdrs = 0;
2305 if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) {
2306 VERIFY0(elf_process_scns(&ctx, NULL, 0, &nshdrs));
2307 }
2308 AS_LOCK_EXIT(as);
2309
2310 /*
2311 * The core file contents may require zero section headers, but if
2312 * we overflow the 16 bits allotted to the program header count in
2313 * the ELF header, we'll need that section header at index zero.
2314 */
2315 if (nshdrs == 0 && nphdrs >= PN_XNUM) {
2316 nshdrs = 1;
2317 }
2318
2319 /*
2320 * Allocate a buffer which is sized adequately to hold the ehdr, phdrs
2321 * or shdrs needed to produce the core file. It is used for the three
2322 * tasks sequentially, not simultaneously, so it only needs enough
2323 * space for the largest of the three, not for all of them at once.
2324 */
2325 VERIFY(nphdrs >= 2);
2326 phdrsz = nphdrs * sizeof (Phdr);
2327 shdrsz = nshdrs * sizeof (Shdr);
2328 bigsize = MAX(sizeof (Ehdr), MAX(phdrsz, shdrsz));
2329 bigwad = kmem_alloc(bigsize, KM_SLEEP);
2330
2331 ehdr = (Ehdr *)bigwad;
2332 bzero(ehdr, sizeof (*ehdr));
2333
2334 ehdr->e_ident[EI_MAG0] = ELFMAG0;
2335 ehdr->e_ident[EI_MAG1] = ELFMAG1;
2336 ehdr->e_ident[EI_MAG2] = ELFMAG2;
2337 ehdr->e_ident[EI_MAG3] = ELFMAG3;
2338 ehdr->e_ident[EI_CLASS] = ELFCLASS;
2339 ehdr->e_type = ET_CORE;
2340
2341 #if !defined(_LP64) || defined(_ELF32_COMPAT)
2342
2343 #if defined(__sparc)
2344 ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2345 ehdr->e_machine = EM_SPARC;
2346 #elif defined(__i386_COMPAT)
2347 ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2348 ehdr->e_machine = EM_386;
2349 #else
2350 #error "no recognized machine type is defined"
2351 #endif
2352
2353 #else /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2354
2355 #if defined(__sparc)
2356 ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2357 ehdr->e_machine = EM_SPARCV9;
2358 #elif defined(__amd64)
2359 ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2360 ehdr->e_machine = EM_AMD64;
2361 #else
2362 #error "no recognized 64-bit machine type is defined"
2363 #endif
2364
2365 #endif /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2366
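/*
 * The core file is laid out as the Ehdr, then the phdrs, then any
 * shdrs, followed by the data: the two note segments first, then the
 * segment (and, if present, section) contents, starting at doffset.
 */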
2367 poffset = sizeof (Ehdr);
2368 soffset = sizeof (Ehdr) + phdrsz;
2369 doffset = sizeof (Ehdr) + phdrsz + shdrsz;
2370 bzero(&shdr0, sizeof (shdr0));
2371
2372 /*
2373 * If the count of program headers or section headers or the index
2374 * of the section string table can't fit in the mere 16 bits
2375 * shortsightedly allotted to them in the ELF header, we use the
2376 * extended formats and put the real values in the section header
2377 * as index 0.
2378 */
2379 if (nphdrs >= PN_XNUM) {
2380 ehdr->e_phnum = PN_XNUM;
2381 shdr0.sh_info = nphdrs;
2382 } else {
2383 ehdr->e_phnum = (unsigned short)nphdrs;
2384 }
2385
2386 if (nshdrs > 0) {
2387 if (nshdrs >= SHN_LORESERVE) {
2388 ehdr->e_shnum = 0;
2389 shdr0.sh_size = nshdrs;
2390 } else {
2391 ehdr->e_shnum = (unsigned short)nshdrs;
2392 }
2393
2394 if (nshdrs - 1 >= SHN_LORESERVE) {
2395 ehdr->e_shstrndx = SHN_XINDEX;
2396 shdr0.sh_link = nshdrs - 1;
2397 } else {
2398 ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
2399 }
2400
2401 ehdr->e_shoff = soffset;
2402 ehdr->e_shentsize = sizeof (Shdr);
2403 }
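/*
 * For example (hypothetical counts): with nphdrs == 70000 and
 * nshdrs == 4, e_phnum is set to PN_XNUM (0xffff) and the true
 * program header count, 70000, lands in shdr0.sh_info, while e_shnum
 * and e_shstrndx still fit in their 16-bit fields.
 */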
2404
2405 ehdr->e_ident[EI_VERSION] = EV_CURRENT;
2406 ehdr->e_version = EV_CURRENT;
2407 ehdr->e_ehsize = sizeof (Ehdr);
2408 ehdr->e_phoff = poffset;
2409 ehdr->e_phentsize = sizeof (Phdr);
2410
2411 if ((error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
2412 sizeof (Ehdr), rlimit, credp)) != 0) {
2413 goto done;
2414 }
2415
2416 phdr = (Phdr *)bigwad;
2417 bzero(phdr, phdrsz);
2418
2419 setup_old_note_header(&phdr[0], p);
2420 phdr[0].p_offset = doffset = roundup(doffset, sizeof (Word));
2421 doffset += phdr[0].p_filesz;
2422
2423 setup_note_header(&phdr[1], p);
2424 phdr[1].p_offset = doffset = roundup(doffset, sizeof (Word));
2425 doffset += phdr[1].p_filesz;
2426
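/*
 * Snapshot the brk and stack ranges under p_lock; they are compared
 * against each mapping below when applying the CC_CONTENT_HEAP and
 * CC_CONTENT_STACK settings.
 */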
2427 mutex_enter(&p->p_lock);
2428
2429 brkbase = p->p_brkbase;
2430 brksize = p->p_brksize;
2431
2432 stkbase = p->p_usrstack - p->p_stksize;
2433 stksize = p->p_stksize;
2434
2435 mutex_exit(&p->p_lock);
2436
2437 AS_LOCK_ENTER(as, RW_WRITER);
2438 i = 2;
2439 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2440 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2441 caddr_t saddr, naddr;
2442 void *tmp = NULL;
2443 extern struct seg_ops segspt_shmops;
2444
2445 if ((seg->s_flags & S_HOLE) != 0) {
2446 continue;
2447 }
2448
2449 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2450 uint_t prot;
2451 size_t size;
2452 int type;
2453 vnode_t *mvp;
2454
2455 prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2456 prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
2457 if ((size = (size_t)(naddr - saddr)) == 0) {
2458 ASSERT(tmp == NULL);
2459 continue;
2460 } else if (i == nphdrs) {
2461 pr_getprot_done(&tmp);
2462 overflowed = B_TRUE;
2463 break;
2464 }
2465 phdr[i].p_type = PT_LOAD;
2466 phdr[i].p_vaddr = (Addr)(uintptr_t)saddr;
2467 phdr[i].p_memsz = size;
2468 if (prot & PROT_READ)
2469 phdr[i].p_flags |= PF_R;
2470 if (prot & PROT_WRITE)
2471 phdr[i].p_flags |= PF_W;
2472 if (prot & PROT_EXEC)
2473 phdr[i].p_flags |= PF_X;
2474
2475 /*
2476 * Figure out which mappings to include in the core.
2477 */
2478 type = SEGOP_GETTYPE(seg, saddr);
2479
2480 if (saddr == stkbase && size == stksize) {
2481 if (!(content & CC_CONTENT_STACK))
2482 goto exclude;
2483
2484 } else if (saddr == brkbase && size == brksize) {
2485 if (!(content & CC_CONTENT_HEAP))
2486 goto exclude;
2487
2488 } else if (seg->s_ops == &segspt_shmops) {
2489 if (type & MAP_NORESERVE) {
2490 if (!(content & CC_CONTENT_DISM))
2491 goto exclude;
2492 } else {
2493 if (!(content & CC_CONTENT_ISM))
2494 goto exclude;
2495 }
2496
2497 } else if (seg->s_ops != &segvn_ops) {
2498 goto exclude;
2499
2500 } else if (type & MAP_SHARED) {
2501 if (shmgetid(p, saddr) != SHMID_NONE) {
2502 if (!(content & CC_CONTENT_SHM))
2503 goto exclude;
2504
2505 } else if (SEGOP_GETVP(seg, seg->s_base,
2506 &mvp) != 0 || mvp == NULL ||
2507 mvp->v_type != VREG) {
2508 if (!(content & CC_CONTENT_SHANON))
2509 goto exclude;
2510
2511 } else {
2512 if (!(content & CC_CONTENT_SHFILE))
2513 goto exclude;
2514 }
2515
2516 } else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2517 mvp == NULL || mvp->v_type != VREG) {
2518 if (!(content & CC_CONTENT_ANON))
2519 goto exclude;
2520
2521 } else if (prot == (PROT_READ | PROT_EXEC)) {
2522 if (!(content & CC_CONTENT_TEXT))
2523 goto exclude;
2524
2525 } else if (prot == PROT_READ) {
2526 if (!(content & CC_CONTENT_RODATA))
2527 goto exclude;
2528
2529 } else {
2530 if (!(content & CC_CONTENT_DATA))
2531 goto exclude;
2532 }
2533
2534 doffset = roundup(doffset, sizeof (Word));
2535 phdr[i].p_offset = doffset;
2536 phdr[i].p_filesz = size;
2537 doffset += size;
2538 exclude:
2539 i++;
2540 }
2541 VERIFY(tmp == NULL);
2542 if (overflowed)
2543 break;
2544 }
2545 AS_LOCK_EXIT(as);
2546
2547 if (overflowed || i != nphdrs) {
2548 if (!retried) {
2549 retried = B_TRUE;
2550 overflowed = B_FALSE;
2551 kmem_free(bigwad, bigsize);
2552 goto top;
2553 }
2554 cmn_err(CE_WARN, "elfcore: core dump failed for "
2555 "process %d; address space is changing", p->p_pid);
2556 error = EIO;
2557 goto done;
2558 }
2559
2560 if ((error = core_write(vp, UIO_SYSSPACE, poffset,
2561 phdr, phdrsz, rlimit, credp)) != 0) {
2562 goto done;
2563 }
2564
2565 if ((error = write_old_elfnotes(p, sig, vp, phdr[0].p_offset, rlimit,
2566 credp)) != 0) {
2567 goto done;
2568 }
2569 if ((error = write_elfnotes(p, sig, vp, phdr[1].p_offset, rlimit,
2570 credp, content)) != 0) {
2571 goto done;
2572 }
2573
2574 for (i = 2; i < nphdrs; i++) {
2575 prkillinfo_t killinfo;
2576 sigqueue_t *sq;
2577 int sig, j;
2578
2579 if (phdr[i].p_filesz == 0)
2580 continue;
2581
2582 /*
2583 * If we hit a region that was mapped PROT_NONE then we cannot
2584 * dump it normally, as the kernel would be unable to read the
2585 * pages and the dump would fail. Instead, we dump any region
2586 * mapped PROT_NONE as zero-filled pages so that it is still
2587 * represented in the map.
2588 *
2589 * If dumping out this segment fails, rather than failing
2590 * the core dump entirely, we reset the size of the mapping
2591 * to zero to indicate that the data is absent from the core
2592 * file and OR the PF_SUNW_FAILURE flag into p_flags to
2593 * differentiate this from mappings that were excluded due
2594 * to the core file content settings.
2596 */
2597 if ((phdr[i].p_flags & (PF_R | PF_W | PF_X)) == 0) {
2598 size_t towrite = phdr[i].p_filesz;
2599 size_t curoff = 0;
2600
2601 if (zeropg == NULL) {
2602 zeropg = kmem_zalloc(elf_zeropg_sz, KM_SLEEP);
2603 }
2604
2605 error = 0;
2606 while (towrite != 0) {
2607 size_t len = MIN(towrite, elf_zeropg_sz);
2608
2609 error = core_write(vp, UIO_SYSSPACE,
2610 phdr[i].p_offset + curoff, zeropg, len,
2611 rlimit, credp);
2612 if (error != 0)
2613 break;
2614
2615 towrite -= len;
2616 curoff += len;
2617 }
2618 } else {
2619 error = core_seg(p, vp, phdr[i].p_offset,
2620 (caddr_t)(uintptr_t)phdr[i].p_vaddr,
2621 phdr[i].p_filesz, rlimit, credp);
2622 }
2623 if (error == 0)
2624 continue;
2625
2626 if ((sig = lwp->lwp_cursig) == 0) {
2627 /*
2628 * We failed due to something other than a signal.
2629 * Since the space reserved for the segment is now
2630 * unused, we stash the errno in the first four
2631 * bytes. This undocumented interface will let us
2632 * understand the nature of the failure.
2633 */
2634 (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2635 &error, sizeof (error), rlimit, credp);
2636
2637 phdr[i].p_filesz = 0;
2638 phdr[i].p_flags |= PF_SUNW_FAILURE;
2639 if ((error = core_write(vp, UIO_SYSSPACE,
2640 poffset + sizeof (Phdr) * i, &phdr[i],
2641 sizeof (Phdr), rlimit, credp)) != 0)
2642 goto done;
2643
2644 continue;
2645 }
2646
2647 /*
2648 * We took a signal. We want to abort the dump entirely, but
2649 * we also want to indicate what failed and why. We therefore
2650 * use the space reserved for the first failing segment to
2651 * write our error (which, for purposes of compatibility with
2652 * older core dump readers, we set to EINTR) followed by any
2653 * siginfo associated with the signal.
2654 */
2655 bzero(&killinfo, sizeof (killinfo));
2656 killinfo.prk_error = EINTR;
2657
2658 sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;
2659
2660 if (sq != NULL) {
2661 bcopy(&sq->sq_info, &killinfo.prk_info,
2662 sizeof (sq->sq_info));
2663 } else {
2664 killinfo.prk_info.si_signo = lwp->lwp_cursig;
2665 killinfo.prk_info.si_code = SI_NOINFO;
2666 }
2667
2668 #if (defined(_SYSCALL32_IMPL) || defined(_LP64))
2669 /*
2670 * If this is a 32-bit process, we need to translate from the
2671 * native siginfo to the 32-bit variant. (Core readers must
2672 * always have the same data model as their target or must
2673 * be aware of -- and compensate for -- data model differences.)
2674 */
2675 if (curproc->p_model == DATAMODEL_ILP32) {
2676 siginfo32_t si32;
2677
2678 siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
2679 bcopy(&si32, &killinfo.prk_info, sizeof (si32));
2680 }
2681 #endif
2682
2683 (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2684 &killinfo, sizeof (killinfo), rlimit, credp);
2685
2686 /*
2687 * For the segment on which we took the signal, indicate that
2688 * its data now refers to a siginfo.
2689 */
2690 phdr[i].p_filesz = 0;
2691 phdr[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
2692 PF_SUNW_SIGINFO;
2693
2694 /*
2695 * And for every other segment, indicate that its absence
2696 * is due to a signal.
2697 */
2698 for (j = i + 1; j < nphdrs; j++) {
2699 phdr[j].p_filesz = 0;
2700 phdr[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
2701 }
2702
2703 /*
2704 * Finally, write out our modified program headers.
2705 */
2706 if ((error = core_write(vp, UIO_SYSSPACE,
2707 poffset + sizeof (Phdr) * i, &phdr[i],
2708 sizeof (Phdr) * (nphdrs - i), rlimit, credp)) != 0) {
2709 goto done;
2710 }
2711
2712 break;
2713 }
2714
2715 if (nshdrs > 0) {
2716 Shdr *shdr = (Shdr *)bigwad;
2717
2718 bzero(shdr, shdrsz);
2719 if (nshdrs > 1) {
2720 ctx.ecc_doffset = doffset;
2721 AS_LOCK_ENTER(as, RW_WRITER);
2722 error = elf_process_scns(&ctx, shdr, nshdrs, NULL);
2723 AS_LOCK_EXIT(as);
2724 if (error != 0) {
2725 goto done;
2726 }
2727 }
2728 /* Copy any extended format data destined for the first shdr */
2729 bcopy(&shdr0, shdr, sizeof (shdr0));
2730
2731 error = core_write(vp, UIO_SYSSPACE, soffset, shdr, shdrsz,
2732 rlimit, credp);
2733 }
2734
2735 done:
2736 if (zeropg != NULL)
2737 kmem_free(zeropg, elf_zeropg_sz);
2738 if (ctx.ecc_bufsz != 0) {
2739 kmem_free(ctx.ecc_buf, ctx.ecc_bufsz);
2740 }
2741 kmem_free(bigwad, bigsize);
2742 return (error);
2743 }
2744
2745 #ifndef _ELF32_COMPAT
2746
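/*
 * Exec switch entry for native ELF objects. The 0 and 5 are the magic
 * string offset and length: presumably the four ELF magic bytes plus
 * the EI_CLASS byte that distinguishes 32- and 64-bit objects.
 */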
2747 static struct execsw esw = {
2748 #ifdef _LP64
2749 elf64magicstr,
2750 #else /* _LP64 */
2751 elf32magicstr,
2752 #endif /* _LP64 */
2753 0,
2754 5,
2755 elfexec,
2756 elfcore
2757 };
2758
2759 static struct modlexec modlexec = {
2760 &mod_execops, "exec module for elf", &esw
2761 };
2762
2763 #ifdef _LP64
2764 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
2765 intpdata_t *idatap, int level, size_t *execsz,
2766 int setid, caddr_t exec_file, cred_t *cred,
2767 int *brand_action);
2768 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
2769 rlim64_t rlimit, int sig, core_content_t content);
2770
2771 static struct execsw esw32 = {
2772 elf32magicstr,
2773 0,
2774 5,
2775 elf32exec,
2776 elf32core
2777 };
2778
2779 static struct modlexec modlexec32 = {
2780 &mod_execops, "32-bit exec module for elf", &esw32
2781 };
2782 #endif /* _LP64 */
2783
2784 static struct modlinkage modlinkage = {
2785 MODREV_1,
2786 (void *)&modlexec,
2787 #ifdef _LP64
2788 (void *)&modlexec32,
2789 #endif /* _LP64 */
2790 NULL
2791 };
2792
2793 int
2794 _init(void)
2795 {
2796 return (mod_install(&modlinkage));
2797 }
2798
2799 int
2800 _fini(void)
2801 {
2802 return (mod_remove(&modlinkage));
2803 }
2804
2805 int
2806 _info(struct modinfo *modinfop)
2807 {
2808 return (mod_info(&modlinkage, modinfop));
2809 }
2810
2811 #endif /* !_ELF32_COMPAT */