1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 /*
29 * Copyright 2016 Joyent, Inc.
30 */
31
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/thread.h>
35 #include <sys/sysmacros.h>
36 #include <sys/signal.h>
37 #include <sys/cred.h>
38 #include <sys/user.h>
39 #include <sys/errno.h>
40 #include <sys/vnode.h>
41 #include <sys/mman.h>
42 #include <sys/kmem.h>
43 #include <sys/proc.h>
44 #include <sys/pathname.h>
45 #include <sys/cmn_err.h>
46 #include <sys/systm.h>
47 #include <sys/elf.h>
48 #include <sys/vmsystm.h>
49 #include <sys/debug.h>
50 #include <sys/auxv.h>
51 #include <sys/exec.h>
52 #include <sys/prsystm.h>
53 #include <vm/as.h>
54 #include <vm/rm.h>
55 #include <vm/seg.h>
56 #include <vm/seg_vn.h>
57 #include <sys/modctl.h>
58 #include <sys/systeminfo.h>
59 #include <sys/vmparam.h>
60 #include <sys/machelf.h>
61 #include <sys/shm_impl.h>
62 #include <sys/archsystm.h>
63 #include <sys/fasttrap.h>
64 #include <sys/brand.h>
65 #include "elf_impl.h"
66 #include <sys/sdt.h>
67 #include <sys/siginfo.h>
68
69 #if defined(__x86)
70 #include <sys/comm_page_util.h>
71 #endif /* defined(__x86) */
72
73
74 extern int at_flags;
75
76 #define ORIGIN_STR "ORIGIN"
77 #define ORIGIN_STR_SIZE 6
78
79 static int getelfhead(vnode_t *, cred_t *, Ehdr *, int *, int *, int *);
80 static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, int, caddr_t *,
81 ssize_t *);
82 static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, int, int, caddr_t *,
83 ssize_t *, caddr_t *, ssize_t *);
84 static size_t elfsize(Ehdr *, int, caddr_t, uintptr_t *);
85 static int mapelfexec(vnode_t *, Ehdr *, int, caddr_t,
86 Phdr **, Phdr **, Phdr **, Phdr **, Phdr *,
87 caddr_t *, caddr_t *, intptr_t *, intptr_t *, size_t, long *, size_t *);
88
89 typedef enum {
90 STR_CTF,
91 STR_SYMTAB,
92 STR_DYNSYM,
93 STR_STRTAB,
94 STR_DYNSTR,
95 STR_SHSTRTAB,
96 STR_NUM
97 } shstrtype_t;
98
99 static const char *shstrtab_data[] = {
100 ".SUNW_ctf",
101 ".symtab",
102 ".dynsym",
103 ".strtab",
104 ".dynstr",
105 ".shstrtab"
106 };
107
108 typedef struct shstrtab {
109 int sst_ndx[STR_NUM];
110 int sst_cur;
111 } shstrtab_t;
112
113 static void
114 shstrtab_init(shstrtab_t *s)
115 {
116 bzero(&s->sst_ndx, sizeof (s->sst_ndx));
117 s->sst_cur = 1;
118 }
119
120 static int
121 shstrtab_ndx(shstrtab_t *s, shstrtype_t type)
122 {
123 int ret;
124
125 if ((ret = s->sst_ndx[type]) != 0)
126 return (ret);
127
128 ret = s->sst_ndx[type] = s->sst_cur;
129 s->sst_cur += strlen(shstrtab_data[type]) + 1;
130
131 return (ret);
132 }
133
134 static size_t
135 shstrtab_size(const shstrtab_t *s)
136 {
137 return (s->sst_cur);
138 }
139
140 static void
141 shstrtab_dump(const shstrtab_t *s, char *buf)
142 {
143 int i, ndx;
144
145 *buf = '\0';
146 for (i = 0; i < STR_NUM; i++) {
147 if ((ndx = s->sst_ndx[i]) != 0)
148 (void) strcpy(buf + ndx, shstrtab_data[i]);
149 }
150 }
151
152 static int
153 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
154 {
155 ASSERT(phdrp->p_type == PT_SUNWDTRACE);
156
157 /*
158 * See the comment in fasttrap.h for information on how to safely
159 * update this program header.
160 */
161 if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
162 (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
163 return (-1);
164
165 args->thrptr = phdrp->p_vaddr + base;
166
167 return (0);
168 }
169
170 /*
171 * Map in the executable pointed to by vp. Returns 0 on success. Note that
172 * this function currently has the maximum number of arguments allowed by
173 * modstubs on x86 (MAXNARG)! Do _not_ add to this function signature without
174 * adding to MAXNARG. (Better yet, do not add to this monster of a function
175 * signature!)
176 */
177 int
178 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
179 intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
180 caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
181 {
182 size_t len;
183 struct vattr vat;
184 caddr_t phdrbase = NULL;
185 ssize_t phdrsize;
186 int nshdrs, shstrndx, nphdrs;
187 int error = 0;
188 Phdr *uphdr = NULL;
189 Phdr *junk = NULL;
190 Phdr *dynphdr = NULL;
191 Phdr *dtrphdr = NULL;
192 char *interp = NULL;
193 uintptr_t lddata;
194 long execsz;
195 intptr_t minaddr;
196
197 if (lddatap != NULL)
198 *lddatap = NULL;
199
200 if (minaddrp != NULL)
201 *minaddrp = NULL;
202
203 if (error = execpermissions(vp, &vat, args)) {
204 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
205 return (error);
206 }
207
208 if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
209 &nphdrs)) != 0 ||
210 (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
211 &phdrsize)) != 0) {
212 uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
213 return (error);
214 }
215
216 if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
217 uprintf("%s: Nothing to load in %s", exec_file, args->pathname);
218 kmem_free(phdrbase, phdrsize);
219 return (ENOEXEC);
220 }
221 if (lddatap != NULL)
222 *lddatap = lddata;
223
224 if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
225 &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
226 len, &execsz, brksize)) {
227 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
228 if (uphdr != NULL && uphdr->p_flags == 0)
229 kmem_free(uphdr, sizeof (Phdr));
230 kmem_free(phdrbase, phdrsize);
231 return (error);
232 }
233
234 if (minaddrp != NULL)
235 *minaddrp = minaddr;
236
237 /*
238 * If the executable requires an interpreter, determine its name.
239 */
240 if (dynphdr != NULL) {
241 ssize_t resid;
242
243 if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
244 uprintf("%s: Invalid interpreter\n", exec_file);
245 kmem_free(phdrbase, phdrsize);
246 return (ENOEXEC);
247 }
248
249 interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
250
251 if ((error = vn_rdwr(UIO_READ, vp, interp, dynphdr->p_filesz,
252 (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
253 (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
254 interp[dynphdr->p_filesz - 1] != '\0') {
255 uprintf("%s: Cannot obtain interpreter pathname\n",
256 exec_file);
257 kmem_free(interp, MAXPATHLEN);
258 kmem_free(phdrbase, phdrsize);
259 return (error != 0 ? error : ENOEXEC);
260 }
261 }
262
263 /*
264 * If this is a statically linked executable, voffset should indicate
265 * the address of the executable itself (it normally holds the address
266 * of the interpreter).
267 */
268 if (ehdr->e_type == ET_EXEC && interp == NULL)
269 *voffset = minaddr;
270
271 /*
272 * If the caller has asked for the interpreter name, return it (it's
273 * up to the caller to free it); if the caller hasn't asked for it,
274 * free it ourselves.
275 */
276 if (interpp != NULL) {
277 *interpp = interp;
278 } else if (interp != NULL) {
279 kmem_free(interp, MAXPATHLEN);
280 }
281
282 if (uphdr != NULL) {
283 *uphdr_vaddr = uphdr->p_vaddr;
284
285 if (uphdr->p_flags == 0)
286 kmem_free(uphdr, sizeof (Phdr));
287 } else if (ehdr->e_type == ET_DYN) {
288 /*
289 * If we don't have a uphdr, we'll apply the logic found
290 * in mapelfexec() and use the p_vaddr of the first PT_LOAD
291 * section as the base address of the object.
292 */
293 Phdr *phdr = (Phdr *)phdrbase;
294 int i, hsize = ehdr->e_phentsize;
295
296 for (i = nphdrs; i > 0; i--) {
297 if (phdr->p_type == PT_LOAD) {
298 *uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
299 ehdr->e_phoff;
300 break;
301 }
302
303 phdr = (Phdr *)((caddr_t)phdr + hsize);
304 }
305
306 /*
307 * If we don't have a PT_LOAD segment, we should have returned
308 * ENOEXEC when elfsize() returned 0, above.
309 */
310 VERIFY(i > 0);
311 } else {
312 *uphdr_vaddr = (Addr)-1;
313 }
314
315 kmem_free(phdrbase, phdrsize);
316 return (error);
317 }
318
/*
 * Exec an ELF binary: read and validate its headers, size and reserve
 * the aux vector, map the object (and its PT_INTERP interpreter, if
 * any) into the new address space, build the aux vector on the user
 * stack, and establish the brk/bss execution environment.  Returns 0
 * on success; on the 'bad' path the (half-torn-down) process is killed
 * with SIGKILL and a non-zero errno is returned.
 *
 * NOTE(review): the aux-vector byte counts accumulated in args->auxsize
 * below must exactly match the ADDAUX() entries emitted later; the
 * postfixsize check near the end enforces this, so any change to one
 * side requires a matching change to the other.
 */
/*ARGSUSED*/
int
elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
    int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
    int *brand_action)
{
	caddr_t phdrbase = NULL;
	caddr_t bssbase = 0;
	caddr_t brkbase = 0;
	size_t brksize = 0;
	ssize_t dlnsize, nsize = 0;
	aux_entry_t *aux;
	int error;
	ssize_t resid;
	int fd = -1;
	intptr_t voffset;
	Phdr *dyphdr = NULL;		/* PT_INTERP, if present */
	Phdr *stphdr = NULL;
	Phdr *uphdr = NULL;		/* PT_PHDR, if present */
	Phdr *junk = NULL;
	size_t len;
	ssize_t phdrsize;
	int postfixsize = 0;
	int i, hsize;
	Phdr *phdrp;
	Phdr *dataphdrp = NULL;
	Phdr *dtrphdr;
	Phdr *capphdr = NULL;
	Cap *cap = NULL;
	ssize_t capsize;
	int hasu = 0;
	int hasauxv = 0;
	int hasdy = 0;
	int branded = 0;
	int dynuphdr = 0;		/* uphdr was kmem-allocated */

	struct proc *p = ttoproc(curthread);
	struct user *up = PTOU(p);
	struct bigwad {
		Ehdr ehdr;
		aux_entry_t elfargs[__KERN_NAUXV_IMPL];
		char dl_name[MAXPATHLEN];
		char pathbuf[MAXPATHLEN];
		struct vattr vattr;
		struct execenv exenv;
	} *bigwad;	/* kmem_alloc this behemoth so we don't blow stack */
	Ehdr *ehdrp;
	int nshdrs, shstrndx, nphdrs;
	char *dlnp;
	char *pathbufp;
	rlim64_t limit;
	rlim64_t roundlimit;

	ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);

	bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
	ehdrp = &bigwad->ehdr;
	dlnp = bigwad->dl_name;
	pathbufp = bigwad->pathbuf;

	/*
	 * Obtain ELF and program header information.
	 */
	if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
	    &nphdrs)) != 0 ||
	    (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
	    &phdrsize)) != 0)
		goto out;

	/*
	 * Prevent executing an ELF file that has no entry point.
	 */
	if (ehdrp->e_entry == 0) {
		uprintf("%s: Bad entry point\n", exec_file);
		goto bad;
	}

	/*
	 * Put data model that we're exec-ing to into the args passed to
	 * exec_args(), so it will know what it is copying to on new stack.
	 * Now that we know whether we are exec-ing a 32-bit or 64-bit
	 * executable, we can set execsz with the appropriate NCARGS.
	 */
#ifdef	_LP64
	if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
		args->to_model = DATAMODEL_ILP32;
		*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
	} else {
		args->to_model = DATAMODEL_LP64;
		/* 64-bit stacks are non-executable unless overridden. */
		if (!args->stk_prot_override) {
			args->stk_prot &= ~PROT_EXEC;
		}
#if defined(__i386) || defined(__amd64)
		args->dat_prot &= ~PROT_EXEC;
#endif
		*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
	}
#else	/* _LP64 */
	args->to_model = DATAMODEL_ILP32;
	*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
#endif	/* _LP64 */

	/*
	 * We delay invoking the brand callback until we've figured out what
	 * kind of elf binary we're trying to run, 32-bit or 64-bit. We do this
	 * because now the brand library can just check args->to_model to see if
	 * the target is 32-bit or 64-bit without having do duplicate all the
	 * code above.
	 *
	 * We also give the brand a chance to indicate that based on the ELF
	 * OSABI of the target binary it should become unbranded and optionally
	 * indicate that it should be treated as existing in a specific prefix.
	 *
	 * Note that if a brand opts to go down this route it does not actually
	 * end up being debranded. In other words, future programs that exec
	 * will still be considered for branding unless this escape hatch is
	 * used. Consider the case of lx brand for example. If a user runs
	 * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable
	 * of DTrace that's in /native will take this escape hatch and be run
	 * and interpreted using the normal system call table; however, the
	 * execution of a non-illumos binary in the form of /bin/ls will still
	 * be branded and be subject to all of the normal actions of the brand.
	 *
	 * The level checks associated with brand handling below are used to
	 * prevent a loop since the brand elfexec function typically comes back
	 * through this function. We must check <= here since the nested
	 * handling in the #! interpreter code will increment the level before
	 * calling gexec to run the final elfexec interpreter.
	 */
	if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
	    (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
		if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
		    &args->brand_nroot) == B_TRUE) {
			ASSERT(ehdrp->e_ident[EI_OSABI]);
			*brand_action = EBA_NATIVE;
			/* Add one for the trailing '/' in the path */
			if (args->brand_nroot != NULL)
				nsize = strlen(args->brand_nroot) + 1;
		}
	}

	/* Branded processes are handed off to the brand's own elfexec. */
	if ((level <= INTP_MAXDEPTH) &&
	    (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
		error = BROP(p)->b_elfexec(vp, uap, args,
		    idatap, level + 1, execsz, setid, exec_file, cred,
		    brand_action);
		goto out;
	}

	/*
	 * Determine aux size now so that stack can be built
	 * in one shot (except actual copyout of aux image),
	 * determine any non-default stack protections,
	 * and still have this code be machine independent.
	 */
	hsize = ehdrp->e_phentsize;
	phdrp = (Phdr *)phdrbase;
	for (i = nphdrs; i > 0; i--) {
		switch (phdrp->p_type) {
		case PT_INTERP:
			hasauxv = hasdy = 1;
			break;
		case PT_PHDR:
			hasu = 1;
			break;
		case PT_SUNWSTACK:
			args->stk_prot = PROT_USER;
			if (phdrp->p_flags & PF_R)
				args->stk_prot |= PROT_READ;
			if (phdrp->p_flags & PF_W)
				args->stk_prot |= PROT_WRITE;
			if (phdrp->p_flags & PF_X)
				args->stk_prot |= PROT_EXEC;
			break;
		case PT_LOAD:
			/* After the loop this is the LAST PT_LOAD segment. */
			dataphdrp = phdrp;
			break;
		case PT_SUNWCAP:
			capphdr = phdrp;
			break;
		}
		phdrp = (Phdr *)((caddr_t)phdrp + hsize);
	}

	if (ehdrp->e_type != ET_EXEC) {
		dataphdrp = NULL;
		hasauxv = 1;
	}

	/* Copy BSS permissions to args->dat_prot */
	if (dataphdrp != NULL) {
		args->dat_prot = PROT_USER;
		if (dataphdrp->p_flags & PF_R)
			args->dat_prot |= PROT_READ;
		if (dataphdrp->p_flags & PF_W)
			args->dat_prot |= PROT_WRITE;
		if (dataphdrp->p_flags & PF_X)
			args->dat_prot |= PROT_EXEC;
	}

	/*
	 * If a auxvector will be required - reserve the space for
	 * it now.  This may be increased by exec_args if there are
	 * ISA-specific types (included in __KERN_NAUXV_IMPL).
	 */
	if (hasauxv) {
		/*
		 * If a AUX vector is being built - the base AUX
		 * entries are:
		 *
		 *	AT_BASE
		 *	AT_FLAGS
		 *	AT_PAGESZ
		 *	AT_RANDOM (added in stk_copyout)
		 *	AT_SUN_AUXFLAGS
		 *	AT_SUN_HWCAP
		 *	AT_SUN_HWCAP2
		 *	AT_SUN_PLATFORM (added in stk_copyout)
		 *	AT_SUN_EXECNAME (added in stk_copyout)
		 *	AT_NULL
		 *
		 * total == 10
		 */
		if (hasdy && hasu) {
			/*
			 * Has PT_INTERP & PT_PHDR - the auxvectors that
			 * will be built are:
			 *
			 *	AT_PHDR
			 *	AT_PHENT
			 *	AT_PHNUM
			 *	AT_ENTRY
			 *	AT_LDDATA
			 *
			 * total = 5
			 */
			args->auxsize = (10 + 5) * sizeof (aux_entry_t);
		} else if (hasdy) {
			/*
			 * Has PT_INTERP but no PT_PHDR
			 *
			 *	AT_EXECFD
			 *	AT_LDDATA
			 *
			 * total = 2
			 */
			args->auxsize = (10 + 2) * sizeof (aux_entry_t);
		} else {
			args->auxsize = 10 * sizeof (aux_entry_t);
		}
	} else {
		args->auxsize = 0;
	}

	/*
	 * If this binary is using an emulator, we need to add an
	 * AT_SUN_EMULATOR aux entry.
	 */
	if (args->emulator != NULL)
		args->auxsize += sizeof (aux_entry_t);

	/*
	 * If this is a native binary that's been given a modified interpreter
	 * root, inform it that the native system exists at that root.
	 */
	if (args->brand_nroot != NULL) {
		args->auxsize += sizeof (aux_entry_t);
	}


	/*
	 * On supported kernels (x86_64) make room in the auxv for the
	 * AT_SUN_COMMPAGE entry.  This will go unpopulated on i86xpv systems
	 * which do not provide such functionality.
	 */
#if defined(__amd64)
	args->auxsize += sizeof (aux_entry_t);
#endif /* defined(__amd64) */

	/*
	 * If we have user credentials, we'll supply the following entries:
	 *	AT_SUN_UID
	 *	AT_SUN_RUID
	 *	AT_SUN_GID
	 *	AT_SUN_RGID
	 */
	if (cred != NULL) {
		args->auxsize += 4 * sizeof (aux_entry_t);
	}

	if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
		branded = 1;
		/*
		 * We will be adding 5 entries to the aux vectors.  One for
		 * the the brandname and 4 for the brand specific aux vectors.
		 */
		args->auxsize += 5 * sizeof (aux_entry_t);
	}

	/* Hardware/Software capabilities */
	if (capphdr != NULL &&
	    (capsize = capphdr->p_filesz) > 0 &&
	    capsize <= 16 * sizeof (*cap)) {
		int ncaps = capsize / sizeof (*cap);
		Cap *cp;

		cap = kmem_alloc(capsize, KM_SLEEP);
		if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
		    capsize, (offset_t)capphdr->p_offset,
		    UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
			uprintf("%s: Cannot read capabilities section\n",
			    exec_file);
			goto out;
		}
		/* SF1_SUNW_ADDR32 restricts an LP64 process to 32-bit VA. */
		for (cp = cap; cp < cap + ncaps; cp++) {
			if (cp->c_tag == CA_SUNW_SF_1 &&
			    (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
				if (args->to_model == DATAMODEL_LP64)
					args->addr32 = 1;
				break;
			}
		}
	}

	aux = bigwad->elfargs;
	/*
	 * Move args to the user's stack.
	 * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM
	 * aux entries.
	 */
	if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
		if (error == -1) {
			error = ENOEXEC;
			goto bad;
		}
		goto out;
	}
	/* we're single threaded after this point */

	/*
	 * If this is an ET_DYN executable (shared object),
	 * determine its memory size so that mapelfexec() can load it.
	 */
	if (ehdrp->e_type == ET_DYN)
		len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
	else
		len = 0;

	dtrphdr = NULL;

	if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &dyphdr,
	    &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
	    len, execsz, &brksize)) != 0)
		goto bad;

	if (uphdr != NULL) {
		/*
		 * Our uphdr has been dynamically allocated if (and only if)
		 * its program header flags are clear.
		 */
		dynuphdr = (uphdr->p_flags == 0);
	}

	/* A PT_PHDR without a PT_INTERP makes no sense; reject it. */
	if (uphdr != NULL && dyphdr == NULL)
		goto bad;

	if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
		uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
		goto bad;
	}

	if (dyphdr != NULL) {
		size_t		len;
		uintptr_t	lddata;
		char		*p;
		struct vnode	*nvp;

		dlnsize = dyphdr->p_filesz + nsize;

		if (dlnsize > MAXPATHLEN || dlnsize <= 0)
			goto bad;

		/*
		 * Prepend the brand's native root (plus the '/' accounted
		 * for in nsize) so the interpreter resolves under it.
		 */
		if (nsize != 0) {
			bcopy(args->brand_nroot, dlnp, nsize - 1);
			dlnp[nsize - 1] = '/';
		}

		/*
		 * Read in "interpreter" pathname.
		 */
		if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
		    dyphdr->p_filesz, (offset_t)dyphdr->p_offset, UIO_SYSSPACE,
		    0, (rlim64_t)0, CRED(), &resid)) != 0) {
			uprintf("%s: Cannot obtain interpreter pathname\n",
			    exec_file);
			goto bad;
		}

		if (resid != 0 || dlnp[dlnsize - 1] != '\0')
			goto bad;

		/*
		 * Search for '$ORIGIN' token in interpreter path.
		 * If found, expand it.
		 */
		for (p = dlnp; p = strchr(p, '$'); ) {
			uint_t	len, curlen;
			char	*_ptr;

			if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
				continue;

			/*
			 * We don't support $ORIGIN on setid programs to close
			 * a potential attack vector.
			 */
			if ((setid & EXECSETID_SETID) != 0) {
				error = ENOEXEC;
				goto bad;
			}

			curlen = 0;
			len = p - dlnp - 1;
			if (len) {
				bcopy(dlnp, pathbufp, len);
				curlen += len;
			}
			if (_ptr = strrchr(args->pathname, '/')) {
				len = _ptr - args->pathname;
				if ((curlen + len) > MAXPATHLEN)
					break;

				bcopy(args->pathname, &pathbufp[curlen], len);
				curlen += len;
			} else {
				/*
				 * executable is a basename found in the
				 * current directory.  So - just substitue
				 * '.' for ORIGIN.
				 */
				pathbufp[curlen] = '.';
				curlen++;
			}
			p += ORIGIN_STR_SIZE;
			len = strlen(p);

			if ((curlen + len) > MAXPATHLEN)
				break;
			bcopy(p, &pathbufp[curlen], len);
			curlen += len;
			pathbufp[curlen++] = '\0';
			bcopy(pathbufp, dlnp, curlen);
		}

		/*
		 * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
		 * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
		 * Just in case /usr is not mounted, change it now.
		 */
		if (strcmp(dlnp, USR_LIB_RTLD) == 0)
			dlnp += 4;
		error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
		if (error && dlnp != bigwad->dl_name) {
			/* new kernel, old user-level */
			error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
			    NULLVPP, &nvp);
		}
		if (error) {
			uprintf("%s: Cannot find %s\n", exec_file, dlnp);
			goto bad;
		}

		/*
		 * Setup the "aux" vector.
		 */
		if (uphdr) {
			if (ehdrp->e_type == ET_DYN) {
				/* don't use the first page */
				bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
				bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
			} else {
				bigwad->exenv.ex_bssbase = bssbase;
				bigwad->exenv.ex_brkbase = brkbase;
			}
			bigwad->exenv.ex_brksize = brksize;
			bigwad->exenv.ex_magic = elfmagic;
			bigwad->exenv.ex_vp = vp;
			setexecenv(&bigwad->exenv);

			ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
			ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
			ADDAUX(aux, AT_PHNUM, nphdrs)
			ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
		} else {
			/*
			 * No PT_PHDR: hand ld.so.1 an open fd on the
			 * executable instead (AT_EXECFD).
			 */
			if ((error = execopen(&vp, &fd)) != 0) {
				VN_RELE(nvp);
				goto bad;
			}

			ADDAUX(aux, AT_EXECFD, fd)
		}

		if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
			VN_RELE(nvp);
			uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
			goto bad;
		}

		/*
		 * Now obtain the ELF header along with the entire program
		 * header contained in "nvp".
		 */
		kmem_free(phdrbase, phdrsize);
		phdrbase = NULL;
		if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
		    &shstrndx, &nphdrs)) != 0 ||
		    (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
		    &phdrsize)) != 0) {
			VN_RELE(nvp);
			uprintf("%s: Cannot read %s\n", exec_file, dlnp);
			goto bad;
		}

		/*
		 * Determine memory size of the "interpreter's" loadable
		 * sections.  This size is then used to obtain the virtual
		 * address of a hole, in the user's address space, large
		 * enough to map the "interpreter".
		 */
		if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
			VN_RELE(nvp);
			uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
			goto bad;
		}

		dtrphdr = NULL;

		error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
		    &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
		    execsz, NULL);

		/* junk != NULL means the interpreter itself had a PT_INTERP */
		if (error || junk != NULL) {
			VN_RELE(nvp);
			uprintf("%s: Cannot map %s\n", exec_file, dlnp);
			goto bad;
		}

		/*
		 * We use the DTrace program header to initialize the
		 * architecture-specific user per-LWP location. The dtrace
		 * fasttrap provider requires ready access to per-LWP scratch
		 * space. We assume that there is only one such program header
		 * in the interpreter.
		 */
		if (dtrphdr != NULL &&
		    dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
			VN_RELE(nvp);
			uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
			goto bad;
		}

		VN_RELE(nvp);
		ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
	}

	if (hasauxv) {
		int auxf = AF_SUN_HWCAPVERIFY;

		/*
		 * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were
		 * filled in via exec_args()
		 */
		ADDAUX(aux, AT_BASE, voffset)
		ADDAUX(aux, AT_FLAGS, at_flags)
		ADDAUX(aux, AT_PAGESZ, PAGESIZE)
		/*
		 * Linker flags. (security)
		 * p_flag not yet set at this time.
		 * We rely on gexec() to provide us with the information.
		 * If the application is set-uid but this is not reflected
		 * in a mismatch between real/effective uids/gids, then
		 * don't treat this as a set-uid exec.  So we care about
		 * the EXECSETID_UGIDS flag but not the ...SETID flag.
		 */
		if ((setid &= ~EXECSETID_SETID) != 0)
			auxf |= AF_SUN_SETUGID;

		/*
		 * If we're running a native process from within a branded
		 * zone under pfexec then we clear the AF_SUN_SETUGID flag so
		 * that the native ld.so.1 is able to link with the native
		 * libraries instead of using the brand libraries that are
		 * installed in the zone.  We only do this for processes
		 * which we trust because we see they are already running
		 * under pfexec (where uid != euid).  This prevents a
		 * malicious user within the zone from crafting a wrapper to
		 * run native suid commands with unsecure libraries interposed.
		 */
		if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
		    (setid &= ~EXECSETID_SETID) != 0))
			auxf &= ~AF_SUN_SETUGID;

		/*
		 * Record the user addr of the auxflags aux vector entry
		 * since brands may optionally want to manipulate this field.
		 */
		args->auxp_auxflags =
		    (char *)((char *)args->stackend +
		    ((char *)&aux->a_type -
		    (char *)bigwad->elfargs));
		ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);

		/*
		 * Record information about the real and effective user and
		 * group IDs.
		 */
		if (cred != NULL) {
			ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
			ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
			ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
			ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
		}

		/*
		 * Hardware capability flag word (performance hints)
		 * Used for choosing faster library routines.
		 * (Potentially different between 32-bit and 64-bit ABIs)
		 */
#if defined(_LP64)
		if (args->to_model == DATAMODEL_NATIVE) {
			ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
			ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
		} else {
			ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
			ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
		}
#else
		ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
		ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
#endif
		if (branded) {
			/*
			 * Reserve space for the brand-private aux vectors,
			 * and record the user addr of that space.
			 */
			args->auxp_brand =
			    (char *)((char *)args->stackend +
			    ((char *)&aux->a_type -
			    (char *)bigwad->elfargs));
			ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
			ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
			ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
			ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
		}

		/*
		 * Add the comm page auxv entry, mapping it in if needed.
		 */
#if defined(__amd64)
		if (args->commpage != NULL ||
		    (args->commpage = (uintptr_t)comm_page_mapin()) != NULL) {
			ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
		} else {
			/*
			 * If the comm page cannot be mapped, pad out the auxv
			 * to satisfy later size checks.
			 */
			ADDAUX(aux, AT_NULL, 0)
		}
#endif /* defined(__amd64) */

		ADDAUX(aux, AT_NULL, 0)
		postfixsize = (char *)aux - (char *)bigwad->elfargs;

		/*
		 * We make assumptions above when we determine how many aux
		 * vector entries we will be adding. However, if we have an
		 * invalid elf file, it is possible that mapelfexec might
		 * behave differently (but not return an error), in which case
		 * the number of aux entries we actually add will be different.
		 * We detect that now and error out.
		 */
		if (postfixsize != args->auxsize) {
			DTRACE_PROBE2(elfexec_badaux, int, postfixsize,
			    int, args->auxsize);
			goto bad;
		}
		ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
	}

	/*
	 * For the 64-bit kernel, the limit is big enough that rounding it up
	 * to a page can overflow the 64-bit limit, so we check for btopr()
	 * overflowing here by comparing it with the unrounded limit in pages.
	 * If it hasn't overflowed, compare the exec size with the rounded up
	 * limit in pages.  Otherwise, just compare with the unrounded limit.
	 */
	limit = btop(p->p_vmem_ctl);
	roundlimit = btopr(p->p_vmem_ctl);
	if ((roundlimit > limit && *execsz > roundlimit) ||
	    (roundlimit < limit && *execsz > limit)) {
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
		    RCA_SAFE);
		mutex_exit(&p->p_lock);
		error = ENOMEM;
		goto bad;
	}

	bzero(up->u_auxv, sizeof (up->u_auxv));
	up->u_commpagep = args->commpage;
	if (postfixsize) {
		int num_auxv;

		/*
		 * Copy the aux vector to the user stack.
		 */
		error = execpoststack(args, bigwad->elfargs, postfixsize);
		if (error)
			goto bad;

		/*
		 * Copy auxv to the process's user structure for use by /proc.
		 * If this is a branded process, the brand's exec routine will
		 * copy it's private entries to the user structure later. It
		 * relies on the fact that the blank entries are at the end.
		 */
		num_auxv = postfixsize / sizeof (aux_entry_t);
		ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
		aux = bigwad->elfargs;
		for (i = 0; i < num_auxv; i++) {
			up->u_auxv[i].a_type = aux[i].a_type;
			up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
		}
	}

	/*
	 * Pass back the starting address so we can set the program counter.
	 */
	args->entry = (uintptr_t)(ehdrp->e_entry + voffset);

	if (!uphdr) {
		if (ehdrp->e_type == ET_DYN) {
			/*
			 * If we are executing a shared library which doesn't
			 * have a interpreter (probably ld.so.1) then
			 * we don't set the brkbase now.  Instead we
			 * delay it's setting until the first call
			 * via grow.c::brk().  This permits ld.so.1 to
			 * initialize brkbase to the tail of the executable it
			 * loads (which is where it needs to be).
			 */
			bigwad->exenv.ex_brkbase = (caddr_t)0;
			bigwad->exenv.ex_bssbase = (caddr_t)0;
			bigwad->exenv.ex_brksize = 0;
		} else {
			bigwad->exenv.ex_brkbase = brkbase;
			bigwad->exenv.ex_bssbase = bssbase;
			bigwad->exenv.ex_brksize = brksize;
		}
		bigwad->exenv.ex_magic = elfmagic;
		bigwad->exenv.ex_vp = vp;
		setexecenv(&bigwad->exenv);
	}

	ASSERT(error == 0);
	goto out;

bad:
	/* The old address space is already destroyed; the process must die. */
	if (fd != -1)		/* did we open the a.out yet */
		(void) execclose(fd);

	psignal(p, SIGKILL);

	if (error == 0)
		error = ENOEXEC;
out:
	if (dynuphdr)
		kmem_free(uphdr, sizeof (Phdr));
	if (phdrbase != NULL)
		kmem_free(phdrbase, phdrsize);
	if (cap != NULL)
		kmem_free(cap, capsize);
	kmem_free(bigwad, sizeof (struct bigwad));
	return (error);
}
1105
1106 /*
1107 * Compute the memory size requirement for the ELF file.
1108 */
1109 static size_t
1110 elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata)
1111 {
1112 size_t len;
1113 Phdr *phdrp = (Phdr *)phdrbase;
1114 int hsize = ehdrp->e_phentsize;
1115 int first = 1;
1116 int dfirst = 1; /* first data segment */
1117 uintptr_t loaddr = 0;
1118 uintptr_t hiaddr = 0;
1119 uintptr_t lo, hi;
1120 int i;
1121
1122 for (i = nphdrs; i > 0; i--) {
1123 if (phdrp->p_type == PT_LOAD) {
1124 lo = phdrp->p_vaddr;
1125 hi = lo + phdrp->p_memsz;
1126 if (first) {
1127 loaddr = lo;
1128 hiaddr = hi;
1129 first = 0;
1130 } else {
1131 if (loaddr > lo)
1132 loaddr = lo;
1133 if (hiaddr < hi)
1134 hiaddr = hi;
1135 }
1136
1137 /*
1138 * save the address of the first data segment
1139 * of a object - used for the AT_SUNW_LDDATA
1140 * aux entry.
1141 */
1142 if ((lddata != NULL) && dfirst &&
1143 (phdrp->p_flags & PF_W)) {
1144 *lddata = lo;
1145 dfirst = 0;
1146 }
1147 }
1148 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
1149 }
1150
1151 len = hiaddr - (loaddr & PAGEMASK);
1152 len = roundup(len, PAGESIZE);
1153
1154 return (len);
1155 }
1156
1157 /*
1158 * Read in the ELF header and program header table.
1159 * SUSV3 requires:
1160 * ENOEXEC File format is not recognized
1161 * EINVAL Format recognized but execution not supported
1162 */
static int
getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
    int *nphdrs)
{
	int error;
	ssize_t resid;

	/*
	 * We got here by the first two bytes in ident,
	 * now read the entire ELF header.
	 */
	if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr,
	    sizeof (Ehdr), (offset_t)0, UIO_SYSSPACE, 0,
	    (rlim64_t)0, credp, &resid)) != 0)
		return (error);

	/*
	 * Since a separate version is compiled for handling 32-bit and
	 * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
	 * doesn't need to be able to deal with 32-bit ELF files.
	 * (A non-zero resid means the file was too short to hold a full
	 * ELF header, so it cannot be an ELF object at all.)
	 */
	if (resid != 0 ||
	    ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
	    ehdr->e_ident[EI_MAG3] != ELFMAG3)
		return (ENOEXEC);

	/*
	 * Only executables and shared objects of the class this kernel
	 * module was built for are runnable; everything else is EINVAL
	 * ("recognized but execution not supported").
	 */
	if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
#if defined(_ILP32) || defined(_ELF32_COMPAT)
	    ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
#else
	    ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
#endif
	    !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
	    ehdr->e_flags))
		return (EINVAL);

	*nshdrs = ehdr->e_shnum;
	*shstrndx = ehdr->e_shstrndx;
	*nphdrs = ehdr->e_phnum;

	/*
	 * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
	 * to read in the section header at index zero to access the true
	 * values for those fields.
	 */
	if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
	    *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
		Shdr shdr;

		/* No section header table: the sentinels can't be resolved. */
		if (ehdr->e_shoff == 0)
			return (EINVAL);

		if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
		    sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
		    (rlim64_t)0, credp, &resid)) != 0)
			return (error);

		/* Section header zero carries the extended counts. */
		if (*nshdrs == 0)
			*nshdrs = shdr.sh_size;
		if (*shstrndx == SHN_XINDEX)
			*shstrndx = shdr.sh_link;
		if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
			*nphdrs = shdr.sh_info;
	}

	return (0);
}
1230
#ifdef _ELF32_COMPAT
extern size_t elf_nphdr_max;
#else
/*
 * Program header count above which getelfphdr() switches from a sleeping
 * to a non-sleeping (fallible) allocation, so an absurd e_phnum can't
 * block the process indefinitely in kmem_alloc().  Shared with the
 * 32-bit compatibility compilation of this file.
 */
size_t elf_nphdr_max = 1000;
#endif
1236
1237 static int
1238 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, int nphdrs,
1239 caddr_t *phbasep, ssize_t *phsizep)
1240 {
1241 ssize_t resid, minsize;
1242 int err;
1243
1244 /*
1245 * Since we're going to be using e_phentsize to iterate down the
1246 * array of program headers, it must be 8-byte aligned or else
1247 * a we might cause a misaligned access. We use all members through
1248 * p_flags on 32-bit ELF files and p_memsz on 64-bit ELF files so
1249 * e_phentsize must be at least large enough to include those
1250 * members.
1251 */
1252 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1253 minsize = offsetof(Phdr, p_flags) + sizeof (((Phdr *)NULL)->p_flags);
1254 #else
1255 minsize = offsetof(Phdr, p_memsz) + sizeof (((Phdr *)NULL)->p_memsz);
1256 #endif
1257 if (ehdr->e_phentsize < minsize || (ehdr->e_phentsize & 3))
1258 return (EINVAL);
1259
1260 *phsizep = nphdrs * ehdr->e_phentsize;
1261
1262 if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1263 if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1264 return (ENOMEM);
1265 } else {
1266 *phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1267 }
1268
1269 if ((err = vn_rdwr(UIO_READ, vp, *phbasep, *phsizep,
1270 (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1271 credp, &resid)) != 0) {
1272 kmem_free(*phbasep, *phsizep);
1273 *phbasep = NULL;
1274 return (err);
1275 }
1276
1277 return (0);
1278 }
1279
#ifdef _ELF32_COMPAT
extern size_t elf_nshdr_max;
extern size_t elf_shstrtab_max;
#else
/*
 * Sizing thresholds for getelfshdr(): above elf_nshdr_max section
 * headers, or an elf_shstrtab_max-byte section string table, the
 * allocations become non-sleeping (fallible) so a huge, possibly
 * hostile, object can't block indefinitely in kmem_alloc().
 */
size_t elf_nshdr_max = 10000;
size_t elf_shstrtab_max = 100 * 1024;
#endif
1287
1288
1289 static int
1290 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr,
1291 int nshdrs, int shstrndx, caddr_t *shbasep, ssize_t *shsizep,
1292 char **shstrbasep, ssize_t *shstrsizep)
1293 {
1294 ssize_t resid, minsize;
1295 int err;
1296 Shdr *shdr;
1297
1298 /*
1299 * Since we're going to be using e_shentsize to iterate down the
1300 * array of section headers, it must be 8-byte aligned or else
1301 * a we might cause a misaligned access. We use all members through
1302 * sh_entsize (on both 32- and 64-bit ELF files) so e_shentsize
1303 * must be at least large enough to include that member. The index
1304 * of the string table section must also be valid.
1305 */
1306 minsize = offsetof(Shdr, sh_entsize) + sizeof (shdr->sh_entsize);
1307 if (ehdr->e_shentsize < minsize || (ehdr->e_shentsize & 3) ||
1308 shstrndx >= nshdrs)
1309 return (EINVAL);
1310
1311 *shsizep = nshdrs * ehdr->e_shentsize;
1312
1313 if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1314 if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1315 return (ENOMEM);
1316 } else {
1317 *shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1318 }
1319
1320 if ((err = vn_rdwr(UIO_READ, vp, *shbasep, *shsizep,
1321 (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1322 credp, &resid)) != 0) {
1323 kmem_free(*shbasep, *shsizep);
1324 return (err);
1325 }
1326
1327 /*
1328 * Pull the section string table out of the vnode; fail if the size
1329 * is zero.
1330 */
1331 shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1332 if ((*shstrsizep = shdr->sh_size) == 0) {
1333 kmem_free(*shbasep, *shsizep);
1334 return (EINVAL);
1335 }
1336
1337 if (*shstrsizep > elf_shstrtab_max) {
1338 if ((*shstrbasep = kmem_alloc(*shstrsizep,
1339 KM_NOSLEEP)) == NULL) {
1340 kmem_free(*shbasep, *shsizep);
1341 return (ENOMEM);
1342 }
1343 } else {
1344 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1345 }
1346
1347 if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, *shstrsizep,
1348 (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1349 credp, &resid)) != 0) {
1350 kmem_free(*shbasep, *shsizep);
1351 kmem_free(*shstrbasep, *shstrsizep);
1352 return (err);
1353 }
1354
1355 /*
1356 * Make sure the strtab is null-terminated to make sure we
1357 * don't run off the end of the table.
1358 */
1359 (*shstrbasep)[*shstrsizep - 1] = '\0';
1360
1361 return (0);
1362 }
1363
1364
1365 #ifdef _ELF32_COMPAT
1366 int
1367 elf32readhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs,
1368 caddr_t *phbasep, ssize_t *phsizep)
1369 #else
1370 int
1371 elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs,
1372 caddr_t *phbasep, ssize_t *phsizep)
1373 #endif
1374 {
1375 int error, nshdrs, shstrndx;
1376
1377 if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
1378 nphdrs)) != 0 ||
1379 (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
1380 phsizep)) != 0) {
1381 return (error);
1382 }
1383 return (0);
1384 }
1385
1386
1387 static int
1388 mapelfexec(
1389 vnode_t *vp,
1390 Ehdr *ehdr,
1391 int nphdrs,
1392 caddr_t phdrbase,
1393 Phdr **uphdr,
1394 Phdr **dyphdr,
1395 Phdr **stphdr,
1396 Phdr **dtphdr,
1397 Phdr *dataphdrp,
1398 caddr_t *bssbase,
1399 caddr_t *brkbase,
1400 intptr_t *voffset,
1401 intptr_t *minaddr,
1402 size_t len,
1403 long *execsz,
1404 size_t *brksize)
1405 {
1406 Phdr *phdr;
1407 int i, prot, error, lastprot = 0;
1408 caddr_t addr = NULL;
1409 size_t zfodsz;
1410 int ptload = 0;
1411 int page;
1412 off_t offset;
1413 int hsize = ehdr->e_phentsize;
1414 caddr_t mintmp = (caddr_t)-1;
1415 uintptr_t lastaddr = NULL;
1416 extern int use_brk_lpg;
1417
1418 if (ehdr->e_type == ET_DYN) {
1419 caddr_t vaddr;
1420
1421 /*
1422 * Despite the fact that mmapobj(2) refuses to load them, we
1423 * need to support executing ET_DYN objects that have a
1424 * non-NULL p_vaddr. When found in the wild, these objects
1425 * are likely to be due to an old (and largely obviated) Linux
1426 * facility, prelink(8), that rewrites shared objects to
1427 * prefer specific (disjoint) virtual address ranges. (Yes,
1428 * this is putatively for performance -- and yes, it has
1429 * limited applicability, many edge conditions and grisly
1430 * failure modes; even for Linux, it's insane.) As ELF
1431 * mandates that the PT_LOAD segments be in p_vaddr order, we
1432 * find the lowest p_vaddr by finding the first PT_LOAD
1433 * segment.
1434 */
1435 phdr = (Phdr *)phdrbase;
1436 for (i = nphdrs; i > 0; i--) {
1437 if (phdr->p_type == PT_LOAD) {
1438 addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
1439 break;
1440 }
1441 phdr = (Phdr *)((caddr_t)phdr + hsize);
1442 }
1443
1444 /*
1445 * We have a non-zero p_vaddr in the first PT_LOAD segment --
1446 * presumably because we're directly executing a prelink(8)'d
1447 * ld-linux.so. While we could correctly execute such an
1448 * object without locating it at its desired p_vaddr (it is,
1449 * after all, still relocatable), our inner antiquarian
1450 * derives a perverse pleasure in accommodating the steampunk
1451 * prelink(8) contraption -- goggles on!
1452 */
1453 if ((vaddr = addr) != NULL) {
1454 if (as_gap(curproc->p_as, len,
1455 &addr, &len, AH_LO, NULL) == -1 || addr != vaddr) {
1456 addr = NULL;
1457 }
1458 }
1459
1460 if (addr == NULL) {
1461 /*
1462 * We either have a NULL p_vaddr (the common case, by
1463 * many orders of magnitude) or we have a non-NULL
1464 * p_vaddr and we were unable to obtain the specified
1465 * VA range (presumably because it's an illegal
1466 * address). Either way, obtain an address in which
1467 * to map the interpreter.
1468 */
1469 map_addr(&addr, len, (offset_t)0, 1, 0);
1470 if (addr == NULL)
1471 return (ENOMEM);
1472 }
1473
1474 /*
1475 * Our voffset is the difference between where we landed and
1476 * where we wanted to be.
1477 */
1478 *voffset = (uintptr_t)addr - (uintptr_t)vaddr;
1479 } else {
1480 *voffset = 0;
1481 }
1482
1483 phdr = (Phdr *)phdrbase;
1484 for (i = nphdrs; i > 0; i--) {
1485 switch (phdr->p_type) {
1486 case PT_LOAD:
1487 ptload = 1;
1488 prot = PROT_USER;
1489 if (phdr->p_flags & PF_R)
1490 prot |= PROT_READ;
1491 if (phdr->p_flags & PF_W)
1492 prot |= PROT_WRITE;
1493 if (phdr->p_flags & PF_X)
1494 prot |= PROT_EXEC;
1495
1496 addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1497
1498 if ((*dyphdr != NULL) && uphdr != NULL &&
1499 (*uphdr == NULL)) {
1500 /*
1501 * The PT_PHDR program header is, strictly
1502 * speaking, optional. If we find that this
1503 * is missing, we will determine the location
1504 * of the program headers based on the address
1505 * of the lowest PT_LOAD segment (namely, this
1506 * one): we subtract the p_offset to get to
1507 * the ELF header and then add back the program
1508 * header offset to get to the program headers.
1509 * We then cons up a Phdr that corresponds to
1510 * the (missing) PT_PHDR, setting the flags
1511 * to 0 to denote that this is artificial and
1512 * should (must) be freed by the caller.
1513 */
1514 Phdr *cons;
1515
1516 cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
1517
1518 cons->p_flags = 0;
1519 cons->p_type = PT_PHDR;
1520 cons->p_vaddr = ((uintptr_t)addr -
1521 phdr->p_offset) + ehdr->e_phoff;
1522
1523 *uphdr = cons;
1524 }
1525
1526 /*
1527 * Keep track of the segment with the lowest starting
1528 * address.
1529 */
1530 if (addr < mintmp)
1531 mintmp = addr;
1532
1533 /*
1534 * Segments need not correspond to page boundaries:
1535 * they are permitted to share a page. If two PT_LOAD
1536 * segments share the same page, and the permissions
1537 * of the segments differ, the behavior is historically
1538 * that the permissions of the latter segment are used
1539 * for the page that the two segments share. This is
1540 * also historically a non-issue: binaries generated
1541 * by most anything will make sure that two PT_LOAD
1542 * segments with differing permissions don't actually
1543 * share any pages. However, there exist some crazy
1544 * things out there (including at least an obscure
1545 * Portuguese teaching language called G-Portugol) that
1546 * actually do the wrong thing and expect it to work:
1547 * they have a segment with execute permission share
1548 * a page with a subsequent segment that does not
1549 * have execute permissions and expect the resulting
1550 * shared page to in fact be executable. To accommodate
1551 * such broken link editors, we take advantage of a
1552 * latitude explicitly granted to the loader: it is
1553 * permitted to make _any_ PT_LOAD segment executable
1554 * (provided that it is readable or writable). If we
1555 * see that we're sharing a page and that the previous
1556 * page was executable, we will add execute permissions
1557 * to our segment.
1558 */
1559 if (btop(lastaddr) == btop((uintptr_t)addr) &&
1560 (phdr->p_flags & (PF_R | PF_W)) &&
1561 (lastprot & PROT_EXEC)) {
1562 prot |= PROT_EXEC;
1563 }
1564
1565 lastaddr = (uintptr_t)addr + phdr->p_filesz;
1566 lastprot = prot;
1567
1568 zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1569
1570 offset = phdr->p_offset;
1571 if (((uintptr_t)offset & PAGEOFFSET) ==
1572 ((uintptr_t)addr & PAGEOFFSET) &&
1573 (!(vp->v_flag & VNOMAP))) {
1574 page = 1;
1575 } else {
1576 page = 0;
1577 }
1578
1579 /*
1580 * Set the heap pagesize for OOB when the bss size
1581 * is known and use_brk_lpg is not 0.
1582 */
1583 if (brksize != NULL && use_brk_lpg &&
1584 zfodsz != 0 && phdr == dataphdrp &&
1585 (prot & PROT_WRITE)) {
1586 size_t tlen = P2NPHASE((uintptr_t)addr +
1587 phdr->p_filesz, PAGESIZE);
1588
1589 if (zfodsz > tlen) {
1590 curproc->p_brkpageszc =
1591 page_szc(map_pgsz(MAPPGSZ_HEAP,
1592 curproc, addr + phdr->p_filesz +
1593 tlen, zfodsz - tlen, 0));
1594 }
1595 }
1596
1597 if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1598 (prot & PROT_WRITE)) {
1599 uint_t szc = curproc->p_brkpageszc;
1600 size_t pgsz = page_get_pagesize(szc);
1601 caddr_t ebss = addr + phdr->p_memsz;
1602 size_t extra_zfodsz;
1603
1604 ASSERT(pgsz > PAGESIZE);
1605
1606 extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1607
1608 if (error = execmap(vp, addr, phdr->p_filesz,
1609 zfodsz + extra_zfodsz, phdr->p_offset,
1610 prot, page, szc))
1611 goto bad;
1612 if (brksize != NULL)
1613 *brksize = extra_zfodsz;
1614 } else {
1615 if (error = execmap(vp, addr, phdr->p_filesz,
1616 zfodsz, phdr->p_offset, prot, page, 0))
1617 goto bad;
1618 }
1619
1620 if (bssbase != NULL && addr >= *bssbase &&
1621 phdr == dataphdrp) {
1622 *bssbase = addr + phdr->p_filesz;
1623 }
1624 if (brkbase != NULL && addr >= *brkbase) {
1625 *brkbase = addr + phdr->p_memsz;
1626 }
1627
1628 *execsz += btopr(phdr->p_memsz);
1629 break;
1630
1631 case PT_INTERP:
1632 /*
1633 * The ELF specification is unequivocal about the
1634 * PT_INTERP program header with respect to any PT_LOAD
1635 * program header: "If it is present, it must precede
1636 * any loadable segment entry." Linux, however, makes
1637 * no attempt to enforce this -- which has allowed some
1638 * binary editing tools to get away with generating
1639 * invalid ELF binaries in the respect that PT_INTERP
1640 * occurs after the first PT_LOAD program header. This
1641 * is unfortunate (and of course, disappointing) but
1642 * it's no worse than that: there is no reason that we
1643 * can't process the PT_INTERP entry (if present) after
1644 * one or more PT_LOAD entries. We therefore
1645 * deliberately do not check ptload here and always
1646 * store dyphdr to be the PT_INTERP program header.
1647 */
1648 *dyphdr = phdr;
1649 break;
1650
1651 case PT_SHLIB:
1652 *stphdr = phdr;
1653 break;
1654
1655 case PT_PHDR:
1656 if (ptload || phdr->p_flags == 0)
1657 goto bad;
1658
1659 if (uphdr != NULL)
1660 *uphdr = phdr;
1661
1662 break;
1663
1664 case PT_NULL:
1665 case PT_DYNAMIC:
1666 case PT_NOTE:
1667 break;
1668
1669 case PT_SUNWDTRACE:
1670 if (dtphdr != NULL)
1671 *dtphdr = phdr;
1672 break;
1673
1674 default:
1675 break;
1676 }
1677 phdr = (Phdr *)((caddr_t)phdr + hsize);
1678 }
1679
1680 if (minaddr != NULL) {
1681 ASSERT(mintmp != (caddr_t)-1);
1682 *minaddr = (intptr_t)mintmp;
1683 }
1684
1685 return (0);
1686 bad:
1687 if (error == 0)
1688 error = EINVAL;
1689 return (error);
1690 }
1691
1692 int
1693 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1694 rlim64_t rlimit, cred_t *credp)
1695 {
1696 Note note;
1697 int error;
1698
1699 bzero(¬e, sizeof (note));
1700 bcopy("CORE", note.name, 4);
1701 note.nhdr.n_type = type;
1702 /*
1703 * The System V ABI states that n_namesz must be the length of the
1704 * string that follows the Nhdr structure including the terminating
1705 * null. The ABI also specifies that sufficient padding should be
1706 * included so that the description that follows the name string
1707 * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1708 * respectively. However, since this change was not made correctly
1709 * at the time of the 64-bit port, both 32- and 64-bit binaries
1710 * descriptions are only guaranteed to begin on a 4-byte boundary.
1711 */
1712 note.nhdr.n_namesz = 5;
1713 note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1714
1715 if (error = core_write(vp, UIO_SYSSPACE, *offsetp, ¬e,
1716 sizeof (note), rlimit, credp))
1717 return (error);
1718
1719 *offsetp += sizeof (note);
1720
1721 if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1722 note.nhdr.n_descsz, rlimit, credp))
1723 return (error);
1724
1725 *offsetp += note.nhdr.n_descsz;
1726 return (0);
1727 }
1728
1729 /*
1730 * Copy the section data from one vnode to the section of another vnode.
1731 */
1732 static void
1733 copy_scn(Shdr *src, vnode_t *src_vp, Shdr *dst, vnode_t *dst_vp, Off *doffset,
1734 void *buf, size_t size, cred_t *credp, rlim64_t rlimit)
1735 {
1736 ssize_t resid;
1737 size_t len, n = src->sh_size;
1738 offset_t off = 0;
1739
1740 while (n != 0) {
1741 len = MIN(size, n);
1742 if (vn_rdwr(UIO_READ, src_vp, buf, len, src->sh_offset + off,
1743 UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
1744 resid >= len ||
1745 core_write(dst_vp, UIO_SYSSPACE, *doffset + off,
1746 buf, len - resid, rlimit, credp) != 0) {
1747 dst->sh_size = 0;
1748 dst->sh_offset = 0;
1749 return;
1750 }
1751
1752 ASSERT(n >= len - resid);
1753
1754 n -= len - resid;
1755 off += len - resid;
1756 }
1757
1758 *doffset += src->sh_size;
1759 }
1760
#ifdef _ELF32_COMPAT
extern size_t elf_datasz_max;
#else
/*
 * Cap on the staging buffer that process_scns() grows while copying
 * CTF/symtab/strtab section data into a core file; sections larger than
 * this are copied through whatever buffer has already been allocated.
 */
size_t elf_datasz_max = 1 * 1024 * 1024;
#endif
1766
1767 /*
1768 * This function processes mappings that correspond to load objects to
1769 * examine their respective sections for elfcore(). It's called once with
1770 * v set to NULL to count the number of sections that we're going to need
1771 * and then again with v set to some allocated buffer that we fill in with
1772 * all the section data.
1773 */
static int
process_scns(core_content_t content, proc_t *p, cred_t *credp, vnode_t *vp,
    Shdr *v, int nv, rlim64_t rlimit, Off *doffsetp, int *nshdrsp)
{
	vnode_t *lastvp = NULL;
	struct seg *seg;
	int i, j;
	void *data = NULL;	/* staging buffer for copy_scn(), grown lazily */
	size_t datasz = 0;
	shstrtab_t shstrtab;
	struct as *as = p->p_as;
	int error = 0;

	if (v != NULL)
		shstrtab_init(&shstrtab);

	/* Section index 0 is reserved (null section), so start at 1. */
	i = 1;
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		uint_t prot;
		vnode_t *mvp;
		void *tmp = NULL;
		caddr_t saddr = seg->s_base;
		caddr_t naddr;
		caddr_t eaddr;
		size_t segsize;

		Ehdr ehdr;
		int nshdrs, shstrndx, nphdrs;
		caddr_t shbase;
		ssize_t shsize;
		char *shstrbase;
		ssize_t shstrsize;

		Shdr *shdr;
		const char *name;
		size_t sz;
		uintptr_t off;

		int ctf_ndx = 0;
		int symtab_ndx = 0;

		/*
		 * Since we're just looking for text segments of load
		 * objects, we only care about the protection bits; we don't
		 * care about the actual size of the segment so we use the
		 * reserved size. If the segment's size is zero, there's
		 * something fishy going on so we ignore this segment.
		 * Comparing against lastvp skips consecutive mappings of
		 * the same object.
		 */
		if (seg->s_ops != &segvn_ops ||
		    SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
		    mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
		    (segsize = pr_getsegsize(seg, 1)) == 0)
			continue;

		eaddr = saddr + segsize;
		prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
		pr_getprot_done(&tmp);

		/*
		 * Skip this segment unless the protection bits look like
		 * what we'd expect for a text segment.
		 */
		if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
			continue;

		/* Not an ELF object we can parse: skip it silently. */
		if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx,
		    &nphdrs) != 0 ||
		    getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx,
		    &shbase, &shsize, &shstrbase, &shstrsize) != 0)
			continue;

		/* Walk sections starting at 1; section 0 is the null entry. */
		off = ehdr.e_shentsize;
		for (j = 1; j < nshdrs; j++, off += ehdr.e_shentsize) {
			Shdr *symtab = NULL, *strtab;

			shdr = (Shdr *)(shbase + off);

			/* Name offset out of range: malformed, skip. */
			if (shdr->sh_name >= shstrsize)
				continue;

			name = shstrbase + shdr->sh_name;

			if (strcmp(name, shstrtab_data[STR_CTF]) == 0) {
				if ((content & CC_CONTENT_CTF) == 0 ||
				    ctf_ndx != 0)
					continue;

				/* The CTF section may link to a symtab. */
				if (shdr->sh_link > 0 &&
				    shdr->sh_link < nshdrs) {
					symtab = (Shdr *)(shbase +
					    shdr->sh_link * ehdr.e_shentsize);
				}

				if (v != NULL && i < nv - 1) {
					/* Grow the staging buffer if needed. */
					if (shdr->sh_size > datasz &&
					    shdr->sh_size <= elf_datasz_max) {
						if (data != NULL)
							kmem_free(data, datasz);

						datasz = shdr->sh_size;
						data = kmem_alloc(datasz,
						    KM_SLEEP);
					}

					v[i].sh_name = shstrtab_ndx(&shstrtab,
					    STR_CTF);
					v[i].sh_addr = (Addr)(uintptr_t)saddr;
					v[i].sh_type = SHT_PROGBITS;
					v[i].sh_addralign = 4;
					*doffsetp = roundup(*doffsetp,
					    v[i].sh_addralign);
					v[i].sh_offset = *doffsetp;
					v[i].sh_size = shdr->sh_size;
					if (symtab == NULL) {
						v[i].sh_link = 0;
					} else if (symtab->sh_type ==
					    SHT_SYMTAB &&
					    symtab_ndx != 0) {
						/* Reuse the dumped symtab. */
						v[i].sh_link =
						    symtab_ndx;
					} else {
						/* Symtab will follow us. */
						v[i].sh_link = i + 1;
					}

					copy_scn(shdr, mvp, &v[i], vp,
					    doffsetp, data, datasz, credp,
					    rlimit);
				}

				ctf_ndx = i++;

				/*
				 * We've already dumped the symtab.
				 */
				if (symtab != NULL &&
				    symtab->sh_type == SHT_SYMTAB &&
				    symtab_ndx != 0)
					continue;

			} else if (strcmp(name,
			    shstrtab_data[STR_SYMTAB]) == 0) {
				/*
				 * NOTE(review): `symtab' was initialized to
				 * NULL at the top of this loop iteration, so
				 * the `symtab != 0' test below can never be
				 * true here -- presumably `symtab_ndx != 0'
				 * was intended (to avoid dumping a second
				 * symtab).  Confirm intent before changing.
				 */
				if ((content & CC_CONTENT_SYMTAB) == 0 ||
				    symtab != 0)
					continue;

				symtab = shdr;
			}

			/* Dump the symtab (or dynsym) plus its strtab pair. */
			if (symtab != NULL) {
				if ((symtab->sh_type != SHT_DYNSYM &&
				    symtab->sh_type != SHT_SYMTAB) ||
				    symtab->sh_link == 0 ||
				    symtab->sh_link >= nshdrs)
					continue;

				strtab = (Shdr *)(shbase +
				    symtab->sh_link * ehdr.e_shentsize);

				if (strtab->sh_type != SHT_STRTAB)
					continue;

				/* Need room for two entries: symtab+strtab. */
				if (v != NULL && i < nv - 2) {
					sz = MAX(symtab->sh_size,
					    strtab->sh_size);
					if (sz > datasz &&
					    sz <= elf_datasz_max) {
						if (data != NULL)
							kmem_free(data, datasz);

						datasz = sz;
						data = kmem_alloc(datasz,
						    KM_SLEEP);
					}

					if (symtab->sh_type == SHT_DYNSYM) {
						v[i].sh_name = shstrtab_ndx(
						    &shstrtab, STR_DYNSYM);
						v[i + 1].sh_name = shstrtab_ndx(
						    &shstrtab, STR_DYNSTR);
					} else {
						v[i].sh_name = shstrtab_ndx(
						    &shstrtab, STR_SYMTAB);
						v[i + 1].sh_name = shstrtab_ndx(
						    &shstrtab, STR_STRTAB);
					}

					v[i].sh_type = symtab->sh_type;
					v[i].sh_addr = symtab->sh_addr;
					/* Relocate addresses for ET_DYN. */
					if (ehdr.e_type == ET_DYN ||
					    v[i].sh_addr == 0)
						v[i].sh_addr +=
						    (Addr)(uintptr_t)saddr;
					v[i].sh_addralign =
					    symtab->sh_addralign;
					*doffsetp = roundup(*doffsetp,
					    v[i].sh_addralign);
					v[i].sh_offset = *doffsetp;
					v[i].sh_size = symtab->sh_size;
					v[i].sh_link = i + 1;
					v[i].sh_entsize = symtab->sh_entsize;
					v[i].sh_info = symtab->sh_info;

					copy_scn(symtab, mvp, &v[i], vp,
					    doffsetp, data, datasz, credp,
					    rlimit);

					v[i + 1].sh_type = SHT_STRTAB;
					v[i + 1].sh_flags = SHF_STRINGS;
					v[i + 1].sh_addr = symtab->sh_addr;
					if (ehdr.e_type == ET_DYN ||
					    v[i + 1].sh_addr == 0)
						v[i + 1].sh_addr +=
						    (Addr)(uintptr_t)saddr;
					v[i + 1].sh_addralign =
					    strtab->sh_addralign;
					*doffsetp = roundup(*doffsetp,
					    v[i + 1].sh_addralign);
					v[i + 1].sh_offset = *doffsetp;
					v[i + 1].sh_size = strtab->sh_size;

					copy_scn(strtab, mvp, &v[i + 1], vp,
					    doffsetp, data, datasz, credp,
					    rlimit);
				}

				if (symtab->sh_type == SHT_SYMTAB)
					symtab_ndx = i;
				i += 2;
			}
		}

		kmem_free(shstrbase, shstrsize);
		kmem_free(shbase, shsize);

		lastvp = mvp;
	}

	/* Counting pass: report how many section headers will be needed. */
	if (v == NULL) {
		if (i == 1)
			*nshdrsp = 0;
		else
			*nshdrsp = i + 1;	/* +1 for the shstrtab */
		goto done;
	}

	/*
	 * If the count differs from the earlier (counting) pass, the
	 * address space changed under us; give up on this dump.
	 */
	if (i != nv - 1) {
		cmn_err(CE_WARN, "elfcore: core dump failed for "
		    "process %d; address space is changing", p->p_pid);
		error = EIO;
		goto done;
	}

	/* The final section is the section header string table itself. */
	v[i].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB);
	v[i].sh_size = shstrtab_size(&shstrtab);
	v[i].sh_addralign = 1;
	*doffsetp = roundup(*doffsetp, v[i].sh_addralign);
	v[i].sh_offset = *doffsetp;
	v[i].sh_flags = SHF_STRINGS;
	v[i].sh_type = SHT_STRTAB;

	if (v[i].sh_size > datasz) {
		if (data != NULL)
			kmem_free(data, datasz);

		datasz = v[i].sh_size;
		data = kmem_alloc(datasz,
		    KM_SLEEP);
	}

	shstrtab_dump(&shstrtab, data);

	if ((error = core_write(vp, UIO_SYSSPACE, *doffsetp,
	    data, v[i].sh_size, rlimit, credp)) != 0)
		goto done;

	*doffsetp += v[i].sh_size;

done:
	if (data != NULL)
		kmem_free(data, datasz);

	return (error);
}
2057
2058 int
2059 elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
2060 core_content_t content)
2061 {
2062 offset_t poffset, soffset;
2063 Off doffset;
2064 int error, i, nphdrs, nshdrs;
2065 int overflow = 0;
2066 struct seg *seg;
2067 struct as *as = p->p_as;
2068 union {
2069 Ehdr ehdr;
2070 Phdr phdr[1];
2071 Shdr shdr[1];
2072 } *bigwad;
2073 size_t bigsize;
2074 size_t phdrsz, shdrsz;
2075 Ehdr *ehdr;
2076 Phdr *v;
2077 caddr_t brkbase;
2078 size_t brksize;
2079 caddr_t stkbase;
2080 size_t stksize;
2081 int ntries = 0;
2082 klwp_t *lwp = ttolwp(curthread);
2083
2084 top:
2085 /*
2086 * Make sure we have everything we need (registers, etc.).
2087 * All other lwps have already stopped and are in an orderly state.
2088 */
2089 ASSERT(p == ttoproc(curthread));
2090 prstop(0, 0);
2091
2092 AS_LOCK_ENTER(as, RW_WRITER);
2093 nphdrs = prnsegs(as, 0) + 2; /* two CORE note sections */
2094
2095 /*
2096 * Count the number of section headers we're going to need.
2097 */
2098 nshdrs = 0;
2099 if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) {
2100 (void) process_scns(content, p, credp, NULL, NULL, NULL, 0,
2101 NULL, &nshdrs);
2102 }
2103 AS_LOCK_EXIT(as);
2104
2105 ASSERT(nshdrs == 0 || nshdrs > 1);
2106
2107 /*
2108 * The core file contents may required zero section headers, but if
2109 * we overflow the 16 bits allotted to the program header count in
2110 * the ELF header, we'll need that program header at index zero.
2111 */
2112 if (nshdrs == 0 && nphdrs >= PN_XNUM)
2113 nshdrs = 1;
2114
2115 phdrsz = nphdrs * sizeof (Phdr);
2116 shdrsz = nshdrs * sizeof (Shdr);
2117
2118 bigsize = MAX(sizeof (*bigwad), MAX(phdrsz, shdrsz));
2119 bigwad = kmem_alloc(bigsize, KM_SLEEP);
2120
2121 ehdr = &bigwad->ehdr;
2122 bzero(ehdr, sizeof (*ehdr));
2123
2124 ehdr->e_ident[EI_MAG0] = ELFMAG0;
2125 ehdr->e_ident[EI_MAG1] = ELFMAG1;
2126 ehdr->e_ident[EI_MAG2] = ELFMAG2;
2127 ehdr->e_ident[EI_MAG3] = ELFMAG3;
2128 ehdr->e_ident[EI_CLASS] = ELFCLASS;
2129 ehdr->e_type = ET_CORE;
2130
2131 #if !defined(_LP64) || defined(_ELF32_COMPAT)
2132
2133 #if defined(__sparc)
2134 ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2135 ehdr->e_machine = EM_SPARC;
2136 #elif defined(__i386) || defined(__i386_COMPAT)
2137 ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2138 ehdr->e_machine = EM_386;
2139 #else
2140 #error "no recognized machine type is defined"
2141 #endif
2142
2143 #else /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2144
2145 #if defined(__sparc)
2146 ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2147 ehdr->e_machine = EM_SPARCV9;
2148 #elif defined(__amd64)
2149 ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2150 ehdr->e_machine = EM_AMD64;
2151 #else
2152 #error "no recognized 64-bit machine type is defined"
2153 #endif
2154
2155 #endif /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2156
2157 /*
2158 * If the count of program headers or section headers or the index
2159 * of the section string table can't fit in the mere 16 bits
2160 * shortsightedly allotted to them in the ELF header, we use the
2161 * extended formats and put the real values in the section header
2162 * as index 0.
2163 */
2164 ehdr->e_version = EV_CURRENT;
2165 ehdr->e_ehsize = sizeof (Ehdr);
2166
2167 if (nphdrs >= PN_XNUM)
2168 ehdr->e_phnum = PN_XNUM;
2169 else
2170 ehdr->e_phnum = (unsigned short)nphdrs;
2171
2172 ehdr->e_phoff = sizeof (Ehdr);
2173 ehdr->e_phentsize = sizeof (Phdr);
2174
2175 if (nshdrs > 0) {
2176 if (nshdrs >= SHN_LORESERVE)
2177 ehdr->e_shnum = 0;
2178 else
2179 ehdr->e_shnum = (unsigned short)nshdrs;
2180
2181 if (nshdrs - 1 >= SHN_LORESERVE)
2182 ehdr->e_shstrndx = SHN_XINDEX;
2183 else
2184 ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
2185
2186 ehdr->e_shoff = ehdr->e_phoff + ehdr->e_phentsize * nphdrs;
2187 ehdr->e_shentsize = sizeof (Shdr);
2188 }
2189
2190 if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
2191 sizeof (Ehdr), rlimit, credp))
2192 goto done;
2193
2194 poffset = sizeof (Ehdr);
2195 soffset = sizeof (Ehdr) + phdrsz;
2196 doffset = sizeof (Ehdr) + phdrsz + shdrsz;
2197
2198 v = &bigwad->phdr[0];
2199 bzero(v, phdrsz);
2200
2201 setup_old_note_header(&v[0], p);
2202 v[0].p_offset = doffset = roundup(doffset, sizeof (Word));
2203 doffset += v[0].p_filesz;
2204
2205 setup_note_header(&v[1], p);
2206 v[1].p_offset = doffset = roundup(doffset, sizeof (Word));
2207 doffset += v[1].p_filesz;
2208
2209 mutex_enter(&p->p_lock);
2210
2211 brkbase = p->p_brkbase;
2212 brksize = p->p_brksize;
2213
2214 stkbase = p->p_usrstack - p->p_stksize;
2215 stksize = p->p_stksize;
2216
2217 mutex_exit(&p->p_lock);
2218
2219 AS_LOCK_ENTER(as, RW_WRITER);
2220 i = 2;
2221 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2222 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2223 caddr_t saddr, naddr;
2224 void *tmp = NULL;
2225 extern struct seg_ops segspt_shmops;
2226
2227 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2228 uint_t prot;
2229 size_t size;
2230 int type;
2231 vnode_t *mvp;
2232
2233 prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2234 prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
2235 if ((size = (size_t)(naddr - saddr)) == 0)
2236 continue;
2237 if (i == nphdrs) {
2238 overflow++;
2239 continue;
2240 }
2241 v[i].p_type = PT_LOAD;
2242 v[i].p_vaddr = (Addr)(uintptr_t)saddr;
2243 v[i].p_memsz = size;
2244 if (prot & PROT_READ)
2245 v[i].p_flags |= PF_R;
2246 if (prot & PROT_WRITE)
2247 v[i].p_flags |= PF_W;
2248 if (prot & PROT_EXEC)
2249 v[i].p_flags |= PF_X;
2250
2251 /*
2252 * Figure out which mappings to include in the core.
2253 */
2254 type = SEGOP_GETTYPE(seg, saddr);
2255
2256 if (saddr == stkbase && size == stksize) {
2257 if (!(content & CC_CONTENT_STACK))
2258 goto exclude;
2259
2260 } else if (saddr == brkbase && size == brksize) {
2261 if (!(content & CC_CONTENT_HEAP))
2262 goto exclude;
2263
2264 } else if (seg->s_ops == &segspt_shmops) {
2265 if (type & MAP_NORESERVE) {
2266 if (!(content & CC_CONTENT_DISM))
2267 goto exclude;
2268 } else {
2269 if (!(content & CC_CONTENT_ISM))
2270 goto exclude;
2271 }
2272
2273 } else if (seg->s_ops != &segvn_ops) {
2274 goto exclude;
2275
2276 } else if (type & MAP_SHARED) {
2277 if (shmgetid(p, saddr) != SHMID_NONE) {
2278 if (!(content & CC_CONTENT_SHM))
2279 goto exclude;
2280
2281 } else if (SEGOP_GETVP(seg, seg->s_base,
2282 &mvp) != 0 || mvp == NULL ||
2283 mvp->v_type != VREG) {
2284 if (!(content & CC_CONTENT_SHANON))
2285 goto exclude;
2286
2287 } else {
2288 if (!(content & CC_CONTENT_SHFILE))
2289 goto exclude;
2290 }
2291
2292 } else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2293 mvp == NULL || mvp->v_type != VREG) {
2294 if (!(content & CC_CONTENT_ANON))
2295 goto exclude;
2296
2297 } else if (prot == (PROT_READ | PROT_EXEC)) {
2298 if (!(content & CC_CONTENT_TEXT))
2299 goto exclude;
2300
2301 } else if (prot == PROT_READ) {
2302 if (!(content & CC_CONTENT_RODATA))
2303 goto exclude;
2304
2305 } else {
2306 if (!(content & CC_CONTENT_DATA))
2307 goto exclude;
2308 }
2309
2310 doffset = roundup(doffset, sizeof (Word));
2311 v[i].p_offset = doffset;
2312 v[i].p_filesz = size;
2313 doffset += size;
2314 exclude:
2315 i++;
2316 }
2317 ASSERT(tmp == NULL);
2318 }
2319 AS_LOCK_EXIT(as);
2320
2321 if (overflow || i != nphdrs) {
2322 if (ntries++ == 0) {
2323 kmem_free(bigwad, bigsize);
2324 overflow = 0;
2325 goto top;
2326 }
2327 cmn_err(CE_WARN, "elfcore: core dump failed for "
2328 "process %d; address space is changing", p->p_pid);
2329 error = EIO;
2330 goto done;
2331 }
2332
2333 if ((error = core_write(vp, UIO_SYSSPACE, poffset,
2334 v, phdrsz, rlimit, credp)) != 0)
2335 goto done;
2336
2337 if ((error = write_old_elfnotes(p, sig, vp, v[0].p_offset, rlimit,
2338 credp)) != 0)
2339 goto done;
2340
2341 if ((error = write_elfnotes(p, sig, vp, v[1].p_offset, rlimit,
2342 credp, content)) != 0)
2343 goto done;
2344
2345 for (i = 2; i < nphdrs; i++) {
2346 prkillinfo_t killinfo;
2347 sigqueue_t *sq;
2348 int sig, j;
2349
2350 if (v[i].p_filesz == 0)
2351 continue;
2352
		/*
		 * If dumping out this segment fails, rather than failing
		 * the core dump entirely, we reset the size of the mapping
		 * to zero to indicate that the data is absent from the core
		 * file and OR in the PF_SUNW_FAILURE flag to differentiate
		 * this from mappings that were excluded due to the core file
		 * content settings.
		 */
2361 if ((error = core_seg(p, vp, v[i].p_offset,
2362 (caddr_t)(uintptr_t)v[i].p_vaddr, v[i].p_filesz,
2363 rlimit, credp)) == 0) {
2364 continue;
2365 }
2366
2367 if ((sig = lwp->lwp_cursig) == 0) {
2368 /*
2369 * We failed due to something other than a signal.
2370 * Since the space reserved for the segment is now
2371 * unused, we stash the errno in the first four
2372 * bytes. This undocumented interface will let us
2373 * understand the nature of the failure.
2374 */
2375 (void) core_write(vp, UIO_SYSSPACE, v[i].p_offset,
2376 &error, sizeof (error), rlimit, credp);
2377
2378 v[i].p_filesz = 0;
2379 v[i].p_flags |= PF_SUNW_FAILURE;
2380 if ((error = core_write(vp, UIO_SYSSPACE,
2381 poffset + sizeof (v[i]) * i, &v[i], sizeof (v[i]),
2382 rlimit, credp)) != 0)
2383 goto done;
2384
2385 continue;
2386 }
2387
		/*
		 * We took a signal.  We want to abort the dump entirely, but
		 * we also want to indicate what failed and why.  We therefore
		 * use the space reserved for the first failing segment to
		 * write our error (which, for purposes of compatibility with
		 * older core dump readers, we set to EINTR) followed by any
		 * siginfo associated with the signal.
		 */
2396 bzero(&killinfo, sizeof (killinfo));
2397 killinfo.prk_error = EINTR;
2398
2399 sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;
2400
2401 if (sq != NULL) {
2402 bcopy(&sq->sq_info, &killinfo.prk_info,
2403 sizeof (sq->sq_info));
2404 } else {
2405 killinfo.prk_info.si_signo = lwp->lwp_cursig;
2406 killinfo.prk_info.si_code = SI_NOINFO;
2407 }
2408
2409 #if (defined(_SYSCALL32_IMPL) || defined(_LP64))
2410 /*
2411 * If this is a 32-bit process, we need to translate from the
2412 * native siginfo to the 32-bit variant. (Core readers must
2413 * always have the same data model as their target or must
2414 * be aware of -- and compensate for -- data model differences.)
2415 */
2416 if (curproc->p_model == DATAMODEL_ILP32) {
2417 siginfo32_t si32;
2418
2419 siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
2420 bcopy(&si32, &killinfo.prk_info, sizeof (si32));
2421 }
2422 #endif
2423
2424 (void) core_write(vp, UIO_SYSSPACE, v[i].p_offset,
2425 &killinfo, sizeof (killinfo), rlimit, credp);
2426
2427 /*
2428 * For the segment on which we took the signal, indicate that
2429 * its data now refers to a siginfo.
2430 */
2431 v[i].p_filesz = 0;
2432 v[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
2433 PF_SUNW_SIGINFO;
2434
2435 /*
2436 * And for every other segment, indicate that its absence
2437 * is due to a signal.
2438 */
2439 for (j = i + 1; j < nphdrs; j++) {
2440 v[j].p_filesz = 0;
2441 v[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
2442 }
2443
2444 /*
2445 * Finally, write out our modified program headers.
2446 */
2447 if ((error = core_write(vp, UIO_SYSSPACE,
2448 poffset + sizeof (v[i]) * i, &v[i],
2449 sizeof (v[i]) * (nphdrs - i), rlimit, credp)) != 0)
2450 goto done;
2451
2452 break;
2453 }
2454
2455 if (nshdrs > 0) {
2456 bzero(&bigwad->shdr[0], shdrsz);
2457
2458 if (nshdrs >= SHN_LORESERVE)
2459 bigwad->shdr[0].sh_size = nshdrs;
2460
2461 if (nshdrs - 1 >= SHN_LORESERVE)
2462 bigwad->shdr[0].sh_link = nshdrs - 1;
2463
2464 if (nphdrs >= PN_XNUM)
2465 bigwad->shdr[0].sh_info = nphdrs;
2466
2467 if (nshdrs > 1) {
2468 AS_LOCK_ENTER(as, RW_WRITER);
2469 if ((error = process_scns(content, p, credp, vp,
2470 &bigwad->shdr[0], nshdrs, rlimit, &doffset,
2471 NULL)) != 0) {
2472 AS_LOCK_EXIT(as);
2473 goto done;
2474 }
2475 AS_LOCK_EXIT(as);
2476 }
2477
2478 if ((error = core_write(vp, UIO_SYSSPACE, soffset,
2479 &bigwad->shdr[0], shdrsz, rlimit, credp)) != 0)
2480 goto done;
2481 }
2482
2483 done:
2484 kmem_free(bigwad, bigsize);
2485 return (error);
2486 }
2487
2488 #ifndef _ELF32_COMPAT
2489
/*
 * Exec switch entry for native-class ELF objects: 64-bit magic under
 * _LP64 kernels, 32-bit magic otherwise.  NOTE(review): the positional
 * initializers appear to be magic string, magic offset (0), magic
 * length (5), exec handler, and core dump handler -- confirm against
 * struct execsw in <sys/exec.h>.
 */
static struct execsw esw = {
#ifdef _LP64
	elf64magicstr,
#else /* _LP64 */
	elf32magicstr,
#endif /* _LP64 */
	0,
	5,
	elfexec,
	elfcore
};
2501
/* Module linkage record tying the native-class exec switch into modctl. */
static struct modlexec modlexec = {
	&mod_execops, "exec module for elf", &esw
};
2505
#ifdef _LP64
/*
 * 32-bit ELF compatibility in the 64-bit kernel: prototypes for the
 * ELFCLASS32 exec and core handlers (presumably built from this same
 * source under _ELF32_COMPAT -- see the guard at the bottom of this
 * file; confirm in the build rules).
 */
extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
	intpdata_t *idatap, int level, long *execsz,
	int setid, caddr_t exec_file, cred_t *cred,
	int *brand_action);
extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
	rlim64_t rlimit, int sig, core_content_t content);

/*
 * Exec switch entry for 32-bit ELF objects.  NOTE(review): same
 * positional layout assumed as esw above -- magic string, offset,
 * length, exec handler, core handler.
 */
static struct execsw esw32 = {
	elf32magicstr,
	0,
	5,
	elf32exec,
	elf32core
};

/* Module linkage record for the 32-bit exec switch. */
static struct modlexec modlexec32 = {
	&mod_execops, "32-bit exec module for elf", &esw32
};
#endif /* _LP64 */
2526
/*
 * Module linkage: the native exec module always, plus the 32-bit
 * compat exec module on _LP64 kernels.  The list is NULL-terminated.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlexec,
#ifdef _LP64
	(void *)&modlexec32,
#endif /* _LP64 */
	NULL
};
2535
2536 int
2537 _init(void)
2538 {
2539 return (mod_install(&modlinkage));
2540 }
2541
2542 int
2543 _fini(void)
2544 {
2545 return (mod_remove(&modlinkage));
2546 }
2547
2548 int
2549 _info(struct modinfo *modinfop)
2550 {
2551 return (mod_info(&modlinkage, modinfop));
2552 }
2553
2554 #endif /* !_ELF32_COMPAT */