1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*         All Rights Reserved  */
  28 /*
  29  * Copyright 2019 Joyent, Inc.
  30  * Copyright 2021 Oxide Computer Company
  31  */
  32 
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/thread.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/signal.h>
  38 #include <sys/cred.h>
  39 #include <sys/user.h>
  40 #include <sys/errno.h>
  41 #include <sys/vnode.h>
  42 #include <sys/mman.h>
  43 #include <sys/kmem.h>
  44 #include <sys/proc.h>
  45 #include <sys/pathname.h>
  46 #include <sys/policy.h>
  47 #include <sys/cmn_err.h>
  48 #include <sys/systm.h>
  49 #include <sys/elf.h>
  50 #include <sys/vmsystm.h>
  51 #include <sys/debug.h>
  52 #include <sys/auxv.h>
  53 #include <sys/exec.h>
  54 #include <sys/prsystm.h>
  55 #include <vm/as.h>
  56 #include <vm/rm.h>
  57 #include <vm/seg.h>
  58 #include <vm/seg_vn.h>
  59 #include <sys/modctl.h>
  60 #include <sys/systeminfo.h>
  61 #include <sys/vmparam.h>
  62 #include <sys/machelf.h>
  63 #include <sys/shm_impl.h>
  64 #include <sys/archsystm.h>
  65 #include <sys/fasttrap.h>
  66 #include <sys/brand.h>
  67 #include "elf_impl.h"
  68 #include <sys/sdt.h>
  69 #include <sys/siginfo.h>
  70 #include <sys/random.h>
  71 
  72 #include <core_shstrtab.h>
  73 
  74 #if defined(__x86)
  75 #include <sys/comm_page_util.h>
  76 #include <sys/fp.h>
  77 #endif /* defined(__x86) */
  78 
  79 
  80 extern int at_flags;
  81 extern volatile size_t aslr_max_brk_skew;
  82 
  83 #define ORIGIN_STR      "ORIGIN"
  84 #define ORIGIN_STR_SIZE 6
  85 
  86 static int getelfhead(vnode_t *, cred_t *, Ehdr *, uint_t *, uint_t *,
  87     uint_t *);
  88 static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, uint_t, caddr_t *,
  89     size_t *);
  90 static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, uint_t, uint_t,
  91     caddr_t *, size_t *, caddr_t *, size_t *);
  92 static size_t elfsize(const Ehdr *, uint_t, const caddr_t, uintptr_t *);
  93 static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **,
  94     Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *,
  95     size_t, size_t *, size_t *);
  96 
  97 #ifdef _ELF32_COMPAT
  98 /* Link against the non-compat instances when compiling the 32-bit version. */
  99 extern size_t elf_datasz_max;
 100 extern size_t elf_zeropg_sz;
 101 extern void elf_ctx_resize_scratch(elf_core_ctx_t *, size_t);
 102 extern uint_t elf_nphdr_max;
 103 extern uint_t elf_nshdr_max;
 104 extern size_t elf_shstrtab_max;
 105 #else
 106 size_t elf_datasz_max = 1 * 1024 * 1024;
 107 size_t elf_zeropg_sz = 4 * 1024;
 108 uint_t elf_nphdr_max = 1000;
 109 uint_t elf_nshdr_max = 10000;
 110 size_t elf_shstrtab_max = 100 * 1024;
 111 #endif
 112 
 113 static int
 114 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
 115 {
 116         ASSERT(phdrp->p_type == PT_SUNWDTRACE);
 117 
 118         /*
 119          * See the comment in fasttrap.h for information on how to safely
 120          * update this program header.
 121          */
 122         if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
 123             (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
 124                 return (-1);
 125 
 126         args->thrptr = phdrp->p_vaddr + base;
 127 
 128         return (0);
 129 }
 130 
 131 static int
 132 handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
 133 {
 134         uint_t flag;
 135 
 136         switch (dt) {
 137         case DT_SUNW_ASLR:
 138                 flag = PROC_SEC_ASLR;
 139                 break;
 140         default:
 141                 return (EINVAL);
 142         }
 143 
 144         if (val == 0) {
 145                 if (secflag_isset(p->p_secflags.psf_lower, flag))
 146                         return (EPERM);
 147                 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
 148                     secflag_isset(p->p_secflags.psf_inherit, flag))
 149                         return (EPERM);
 150 
 151                 secflag_clear(&p->p_secflags.psf_effective, flag);
 152         } else {
 153                 if (!secflag_isset(p->p_secflags.psf_upper, flag))
 154                         return (EPERM);
 155 
 156                 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
 157                     !secflag_isset(p->p_secflags.psf_inherit, flag))
 158                         return (EPERM);
 159 
 160                 secflag_set(&p->p_secflags.psf_effective, flag);
 161         }
 162 
 163         return (0);
 164 }
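/*
 * A sketch of how the .dynamic scan in elfexec() below uses this: for a
 * dynamic entry dp whose d_tag is DT_SUNW_ASLR it calls
 *
 *	error = handle_secflag_dt(p, DT_SUNW_ASLR, dp->d_un.d_val);
 *
 * A value of zero asks for PROC_SEC_ASLR to be cleared from the effective
 * set, which fails with EPERM if the flag is pinned in psf_lower or, when
 * secpolicy_psecflags() denies the caller, present in psf_inherit.  A
 * non-zero value sets the flag instead, which requires it to be present in
 * psf_upper and, again absent the policy, in psf_inherit as well.
 */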
 165 
 166 
 167 #ifndef _ELF32_COMPAT
 168 void
 169 elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz)
 170 {
 171         size_t target = MIN(sz, elf_datasz_max);
 172 
 173         if (target > ctx->ecc_bufsz) {
 174                 if (ctx->ecc_buf != NULL) {
 175                         kmem_free(ctx->ecc_buf, ctx->ecc_bufsz);
 176                 }
 177                 ctx->ecc_buf = kmem_alloc(target, KM_SLEEP);
 178                 ctx->ecc_bufsz = target;
 179         }
 180 }
 181 #endif /* _ELF32_COMPAT */
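/*
 * For example, with the default elf_datasz_max of 1 MB, a request to resize
 * the scratch buffer to 4 MB is clamped to a 1 MB allocation, and a later
 * request for 64 KB leaves that buffer in place since it is already large
 * enough.
 */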
 182 
 183 /*
 184  * Map in the executable pointed to by vp. Returns 0 on success.  Note that
 185  * this function currently has the maximum number of arguments allowed by
 186  * modstubs on x86 (MAXNARG)!  Do _not_ add to this function signature without
 187  * adding to MAXNARG.  (Better yet, do not add to this monster of a function
 188  * signature!)
 189  */
 190 int
 191 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 192     intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
 193     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
 194 {
 195         size_t          len, phdrsize;
 196         struct vattr    vat;
 197         caddr_t         phdrbase = NULL;
 198         uint_t          nshdrs, shstrndx, nphdrs;
 199         int             error = 0;
 200         Phdr            *uphdr = NULL;
 201         Phdr            *junk = NULL;
 202         Phdr            *dynphdr = NULL;
 203         Phdr            *dtrphdr = NULL;
 204         char            *interp = NULL;
 205         uintptr_t       lddata, minaddr;
 206         size_t          execsz;
 207 
 208         if (lddatap != NULL)
 209                 *lddatap = 0;
 210 
 211         if (minaddrp != NULL)
 212                 *minaddrp = (uintptr_t)NULL;
 213 
 214         if (error = execpermissions(vp, &vat, args)) {
 215                 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
 216                 return (error);
 217         }
 218 
 219         if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
 220             &nphdrs)) != 0 ||
 221             (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
 222             &phdrsize)) != 0) {
 223                 uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
 224                 return (error);
 225         }
 226 
 227         if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
  228                 uprintf("%s: Nothing to load in %s\n", exec_file, args->pathname);
 229                 kmem_free(phdrbase, phdrsize);
 230                 return (ENOEXEC);
 231         }
 232         if (lddatap != NULL)
 233                 *lddatap = lddata;
 234 
 235         if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
 236             &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
 237             len, &execsz, brksize)) {
 238                 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
 239                 if (uphdr != NULL && uphdr->p_flags == 0)
 240                         kmem_free(uphdr, sizeof (Phdr));
 241                 kmem_free(phdrbase, phdrsize);
 242                 return (error);
 243         }
 244 
 245         if (minaddrp != NULL)
 246                 *minaddrp = minaddr;
 247 
 248         /*
 249          * If the executable requires an interpreter, determine its name.
 250          */
 251         if (dynphdr != NULL) {
 252                 ssize_t resid;
 253 
 254                 if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
 255                         uprintf("%s: Invalid interpreter\n", exec_file);
 256                         kmem_free(phdrbase, phdrsize);
 257                         return (ENOEXEC);
 258                 }
 259 
 260                 interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 261 
 262                 if ((error = vn_rdwr(UIO_READ, vp, interp,
 263                     (ssize_t)dynphdr->p_filesz,
 264                     (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
 265                     (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
 266                     interp[dynphdr->p_filesz - 1] != '\0') {
 267                         uprintf("%s: Cannot obtain interpreter pathname\n",
 268                             exec_file);
 269                         kmem_free(interp, MAXPATHLEN);
 270                         kmem_free(phdrbase, phdrsize);
 271                         return (error != 0 ? error : ENOEXEC);
 272                 }
 273         }
 274 
 275         /*
 276          * If this is a statically linked executable, voffset should indicate
 277          * the address of the executable itself (it normally holds the address
 278          * of the interpreter).
 279          */
 280         if (ehdr->e_type == ET_EXEC && interp == NULL)
 281                 *voffset = minaddr;
 282 
 283         /*
 284          * If the caller has asked for the interpreter name, return it (it's
 285          * up to the caller to free it); if the caller hasn't asked for it,
 286          * free it ourselves.
 287          */
 288         if (interpp != NULL) {
 289                 *interpp = interp;
 290         } else if (interp != NULL) {
 291                 kmem_free(interp, MAXPATHLEN);
 292         }
 293 
 294         if (uphdr != NULL) {
 295                 *uphdr_vaddr = uphdr->p_vaddr;
 296 
 297                 if (uphdr->p_flags == 0)
 298                         kmem_free(uphdr, sizeof (Phdr));
 299         } else if (ehdr->e_type == ET_DYN) {
 300                 /*
 301                  * If we don't have a uphdr, we'll apply the logic found
 302                  * in mapelfexec() and use the p_vaddr of the first PT_LOAD
 303                  * section as the base address of the object.
 304                  */
 305                 const Phdr *phdr = (Phdr *)phdrbase;
 306                 const uint_t hsize = ehdr->e_phentsize;
 307                 uint_t i;
 308 
 309                 for (i = nphdrs; i > 0; i--) {
 310                         if (phdr->p_type == PT_LOAD) {
 311                                 *uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
 312                                     ehdr->e_phoff;
 313                                 break;
 314                         }
 315 
 316                         phdr = (Phdr *)((caddr_t)phdr + hsize);
 317                 }
 318 
 319                 /*
 320                  * If we don't have a PT_LOAD segment, we should have returned
 321                  * ENOEXEC when elfsize() returned 0, above.
 322                  */
 323                 VERIFY(i > 0);
 324         } else {
 325                 *uphdr_vaddr = (Addr)-1;
 326         }
 327 
 328         kmem_free(phdrbase, phdrsize);
 329         return (error);
 330 }
 331 
 332 /*ARGSUSED*/
 333 int
 334 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 335     int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred,
 336     int *brand_action)
 337 {
 338         caddr_t         phdrbase = NULL;
 339         caddr_t         bssbase = 0;
 340         caddr_t         brkbase = 0;
 341         size_t          brksize = 0;
 342         size_t          dlnsize, nsize = 0;
 343         aux_entry_t     *aux;
 344         int             error;
 345         ssize_t         resid;
 346         int             fd = -1;
 347         intptr_t        voffset;
 348         Phdr            *intphdr = NULL;
 349         Phdr            *dynamicphdr = NULL;
 350         Phdr            *stphdr = NULL;
 351         Phdr            *uphdr = NULL;
 352         Phdr            *junk = NULL;
 353         size_t          len;
 354         size_t          postfixsize = 0;
 355         size_t          i;
 356         Phdr            *phdrp;
 357         Phdr            *dataphdrp = NULL;
 358         Phdr            *dtrphdr;
 359         Phdr            *capphdr = NULL;
 360         Cap             *cap = NULL;
 361         size_t          capsize;
 362         int             hasu = 0;
 363         int             hasauxv = 0;
 364         int             hasintp = 0;
 365         int             branded = 0;
 366         int             dynuphdr = 0;
 367 
 368         struct proc *p = ttoproc(curthread);
 369         struct user *up = PTOU(p);
 370         struct bigwad {
 371                 Ehdr    ehdr;
 372                 aux_entry_t     elfargs[__KERN_NAUXV_IMPL];
 373                 char            dl_name[MAXPATHLEN];
 374                 char            pathbuf[MAXPATHLEN];
 375                 struct vattr    vattr;
 376                 struct execenv  exenv;
 377         } *bigwad;      /* kmem_alloc this behemoth so we don't blow stack */
 378         Ehdr            *ehdrp;
 379         uint_t          nshdrs, shstrndx, nphdrs;
 380         size_t          phdrsize;
 381         char            *dlnp;
 382         char            *pathbufp;
 383         rlim64_t        limit;
 384         rlim64_t        roundlimit;
 385 
 386         ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
 387 
 388         bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
 389         ehdrp = &bigwad->ehdr;
 390         dlnp = bigwad->dl_name;
 391         pathbufp = bigwad->pathbuf;
 392 
 393         /*
 394          * Obtain ELF and program header information.
 395          */
 396         if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
 397             &nphdrs)) != 0 ||
 398             (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
 399             &phdrsize)) != 0)
 400                 goto out;
 401 
 402         /*
 403          * Prevent executing an ELF file that has no entry point.
 404          */
 405         if (ehdrp->e_entry == 0) {
 406                 uprintf("%s: Bad entry point\n", exec_file);
 407                 goto bad;
 408         }
 409 
 410         /*
 411          * Put data model that we're exec-ing to into the args passed to
 412          * exec_args(), so it will know what it is copying to on new stack.
 413          * Now that we know whether we are exec-ing a 32-bit or 64-bit
 414          * executable, we can set execsz with the appropriate NCARGS.
 415          */
 416 #ifdef  _LP64
 417         if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
 418                 args->to_model = DATAMODEL_ILP32;
 419                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
 420         } else {
 421                 args->to_model = DATAMODEL_LP64;
 422                 if (!args->stk_prot_override) {
 423                         args->stk_prot &= ~PROT_EXEC;
 424                 }
 425 #if defined(__x86)
 426                 args->dat_prot &= ~PROT_EXEC;
 427 #endif
 428                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
 429         }
 430 #else   /* _LP64 */
 431         args->to_model = DATAMODEL_ILP32;
 432         *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
 433 #endif  /* _LP64 */
 434 
 435         /*
 436          * We delay invoking the brand callback until we've figured out what
 437          * kind of elf binary we're trying to run, 32-bit or 64-bit.  We do this
 438          * because now the brand library can just check args->to_model to see if
  439          * the target is 32-bit or 64-bit without having to duplicate all the
 440          * code above.
 441          *
 442          * We also give the brand a chance to indicate that based on the ELF
 443          * OSABI of the target binary it should become unbranded and optionally
 444          * indicate that it should be treated as existing in a specific prefix.
 445          *
 446          * Note that if a brand opts to go down this route it does not actually
 447          * end up being debranded. In other words, future programs that exec
 448          * will still be considered for branding unless this escape hatch is
 449          * used. Consider the case of lx brand for example. If a user runs
 450          * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable
 451          * of DTrace that's in /native will take this escape hatch and be run
 452          * and interpreted using the normal system call table; however, the
 453          * execution of a non-illumos binary in the form of /bin/ls will still
 454          * be branded and be subject to all of the normal actions of the brand.
 455          *
 456          * The level checks associated with brand handling below are used to
 457          * prevent a loop since the brand elfexec function typically comes back
 458          * through this function. We must check <= here since the nested
 459          * handling in the #! interpreter code will increment the level before
 460          * calling gexec to run the final elfexec interpreter.
 461          */
 462         if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
 463             (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
 464                 if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
 465                     &args->brand_nroot) == B_TRUE) {
 466                         ASSERT(ehdrp->e_ident[EI_OSABI]);
 467                         *brand_action = EBA_NATIVE;
 468                         /* Add one for the trailing '/' in the path */
 469                         if (args->brand_nroot != NULL)
 470                                 nsize = strlen(args->brand_nroot) + 1;
 471                 }
 472         }
 473 
 474         if ((level <= INTP_MAXDEPTH) &&
 475             (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 476                 error = BROP(p)->b_elfexec(vp, uap, args,
 477                     idatap, level + 1, execsz, setid, exec_file, cred,
 478                     brand_action);
 479                 goto out;
 480         }
 481 
 482         /*
 483          * Determine aux size now so that stack can be built
 484          * in one shot (except actual copyout of aux image),
 485          * determine any non-default stack protections,
 486          * and still have this code be machine independent.
 487          */
 488         const uint_t hsize = ehdrp->e_phentsize;
 489         phdrp = (Phdr *)phdrbase;
 490         for (i = nphdrs; i > 0; i--) {
 491                 switch (phdrp->p_type) {
 492                 case PT_INTERP:
 493                         hasauxv = hasintp = 1;
 494                         break;
 495                 case PT_PHDR:
 496                         hasu = 1;
 497                         break;
 498                 case PT_SUNWSTACK:
 499                         args->stk_prot = PROT_USER;
 500                         if (phdrp->p_flags & PF_R)
 501                                 args->stk_prot |= PROT_READ;
 502                         if (phdrp->p_flags & PF_W)
 503                                 args->stk_prot |= PROT_WRITE;
 504                         if (phdrp->p_flags & PF_X)
 505                                 args->stk_prot |= PROT_EXEC;
 506                         break;
 507                 case PT_LOAD:
 508                         dataphdrp = phdrp;
 509                         break;
 510                 case PT_SUNWCAP:
 511                         capphdr = phdrp;
 512                         break;
 513                 case PT_DYNAMIC:
 514                         dynamicphdr = phdrp;
 515                         break;
 516                 }
 517                 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
 518         }
 519 
 520         if (ehdrp->e_type != ET_EXEC) {
 521                 dataphdrp = NULL;
 522                 hasauxv = 1;
 523         }
 524 
 525         /* Copy BSS permissions to args->dat_prot */
 526         if (dataphdrp != NULL) {
 527                 args->dat_prot = PROT_USER;
 528                 if (dataphdrp->p_flags & PF_R)
 529                         args->dat_prot |= PROT_READ;
 530                 if (dataphdrp->p_flags & PF_W)
 531                         args->dat_prot |= PROT_WRITE;
 532                 if (dataphdrp->p_flags & PF_X)
 533                         args->dat_prot |= PROT_EXEC;
 534         }
 535 
 536         /*
  537          * If an aux vector will be required, reserve the space for
 538          * it now.  This may be increased by exec_args if there are
 539          * ISA-specific types (included in __KERN_NAUXV_IMPL).
 540          */
 541         if (hasauxv) {
 542                 /*
  543                  * If an AUX vector is being built, the base AUX
 544                  * entries are:
 545                  *
 546                  *      AT_BASE
 547                  *      AT_FLAGS
 548                  *      AT_PAGESZ
 549                  *      AT_RANDOM       (added in stk_copyout)
 550                  *      AT_SUN_AUXFLAGS
 551                  *      AT_SUN_HWCAP
 552                  *      AT_SUN_HWCAP2
 553                  *      AT_SUN_PLATFORM (added in stk_copyout)
 554                  *      AT_SUN_EXECNAME (added in stk_copyout)
 555                  *      AT_NULL
 556                  *
 557                  * total == 10
 558                  */
 559                 if (hasintp && hasu) {
 560                         /*
  561                          * Has PT_INTERP & PT_PHDR - the aux vectors that
 562                          * will be built are:
 563                          *
 564                          *      AT_PHDR
 565                          *      AT_PHENT
 566                          *      AT_PHNUM
 567                          *      AT_ENTRY
 568                          *      AT_LDDATA
 569                          *
 570                          * total = 5
 571                          */
 572                         args->auxsize = (10 + 5) * sizeof (aux_entry_t);
 573                 } else if (hasintp) {
 574                         /*
 575                          * Has PT_INTERP but no PT_PHDR
 576                          *
 577                          *      AT_EXECFD
 578                          *      AT_LDDATA
 579                          *
 580                          * total = 2
 581                          */
 582                         args->auxsize = (10 + 2) * sizeof (aux_entry_t);
 583                 } else {
 584                         args->auxsize = 10 * sizeof (aux_entry_t);
 585                 }
 586         } else {
 587                 args->auxsize = 0;
 588         }
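        /*
         * Worked example of the accounting above: a typical dynamically
         * linked executable has both PT_INTERP and PT_PHDR, so it reserves
         * (10 + 5) aux_entry_t slots here.  Further entries (emulator, brand
         * root, comm page/FPU, credentials, brand-private vectors) are added
         * below, and the total is cross-checked later against the number of
         * ADDAUX() calls actually made (the postfixsize comparison).
         */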
 589 
 590         /*
 591          * If this binary is using an emulator, we need to add an
 592          * AT_SUN_EMULATOR aux entry.
 593          */
 594         if (args->emulator != NULL)
 595                 args->auxsize += sizeof (aux_entry_t);
 596 
 597         /*
 598          * If this is a native binary that's been given a modified interpreter
 599          * root, inform it that the native system exists at that root.
 600          */
 601         if (args->brand_nroot != NULL) {
 602                 args->auxsize += sizeof (aux_entry_t);
 603         }
 604 
 605 
 606         /*
 607          * On supported kernels (x86_64) make room in the auxv for the
 608          * AT_SUN_COMMPAGE entry.  This will go unpopulated on i86xpv systems
 609          * which do not provide such functionality.
 610          *
 611          * Additionally cover the floating point information AT_SUN_FPSIZE and
 612          * AT_SUN_FPTYPE.
 613          */
 614 #if defined(__amd64)
 615         args->auxsize += 3 * sizeof (aux_entry_t);
 616 #endif /* defined(__amd64) */
 617 
 618         /*
 619          * If we have user credentials, we'll supply the following entries:
 620          *      AT_SUN_UID
 621          *      AT_SUN_RUID
 622          *      AT_SUN_GID
 623          *      AT_SUN_RGID
 624          */
 625         if (cred != NULL) {
 626                 args->auxsize += 4 * sizeof (aux_entry_t);
 627         }
 628 
 629         if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 630                 branded = 1;
 631                 /*
 632                  * We will be adding 5 entries to the aux vectors.  One for
  633                  * the brand name and 4 for the brand-specific aux vectors.
 634                  */
 635                 args->auxsize += 5 * sizeof (aux_entry_t);
 636         }
 637 
 638         /* If the binary has an explicit ASLR flag, it must be honoured */
 639         if ((dynamicphdr != NULL) && (dynamicphdr->p_filesz > 0)) {
 640                 const size_t dynfilesz = dynamicphdr->p_filesz;
 641                 const size_t dynoffset = dynamicphdr->p_offset;
 642                 Dyn *dyn, *dp;
 643 
 644                 if (dynoffset > MAXOFFSET_T ||
 645                     dynfilesz > MAXOFFSET_T ||
 646                     dynoffset + dynfilesz > MAXOFFSET_T) {
 647                         uprintf("%s: cannot read full .dynamic section\n",
 648                             exec_file);
 649                         error = EINVAL;
 650                         goto out;
 651                 }
 652 
 653 #define DYN_STRIDE      100
 654                 for (i = 0; i < dynfilesz; i += sizeof (*dyn) * DYN_STRIDE) {
 655                         const size_t remdyns = (dynfilesz - i) / sizeof (*dyn);
 656                         const size_t ndyns = MIN(DYN_STRIDE, remdyns);
 657                         const size_t dynsize = ndyns * sizeof (*dyn);
 658 
 659                         dyn = kmem_alloc(dynsize, KM_SLEEP);
 660 
 661                         if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
 662                             (ssize_t)dynsize, (offset_t)(dynoffset + i),
 663                             UIO_SYSSPACE, 0, (rlim64_t)0,
 664                             CRED(), NULL)) != 0) {
 665                                 uprintf("%s: cannot read .dynamic section\n",
 666                                     exec_file);
 667                                 goto out;
 668                         }
 669 
 670                         for (dp = dyn; dp < (dyn + ndyns); dp++) {
 671                                 if (dp->d_tag == DT_SUNW_ASLR) {
 672                                         if ((error = handle_secflag_dt(p,
 673                                             DT_SUNW_ASLR,
 674                                             dp->d_un.d_val)) != 0) {
 675                                                 uprintf("%s: error setting "
 676                                                     "security-flag from "
 677                                                     "DT_SUNW_ASLR: %d\n",
 678                                                     exec_file, error);
 679                                                 goto out;
 680                                         }
 681                                 }
 682                         }
 683 
 684                         kmem_free(dyn, dynsize);
 685                 }
 686         }
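        /*
         * For illustration: the loop above reads the .dynamic section in
         * chunks of DYN_STRIDE (100) Dyn entries, so a section holding, say,
         * 250 entries is consumed as reads of 100, 100 and 50 entries.  This
         * bounds the transient allocation at 100 * sizeof (Dyn) no matter
         * how large the section is.
         */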
 687 
 688         /* Hardware/Software capabilities */
 689         if (capphdr != NULL &&
 690             (capsize = capphdr->p_filesz) > 0 &&
 691             capsize <= 16 * sizeof (*cap)) {
 692                 const uint_t ncaps = capsize / sizeof (*cap);
 693                 Cap *cp;
 694 
 695                 cap = kmem_alloc(capsize, KM_SLEEP);
 696                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
 697                     (ssize_t)capsize, (offset_t)capphdr->p_offset,
 698                     UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), NULL)) != 0) {
 699                         uprintf("%s: Cannot read capabilities section\n",
 700                             exec_file);
 701                         goto out;
 702                 }
 703                 for (cp = cap; cp < cap + ncaps; cp++) {
 704                         if (cp->c_tag == CA_SUNW_SF_1 &&
 705                             (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
 706                                 if (args->to_model == DATAMODEL_LP64)
 707                                         args->addr32 = 1;
 708                                 break;
 709                         }
 710                 }
 711         }
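        /*
         * For example, a 64-bit object whose PT_SUNWCAP data carries a
         * CA_SUNW_SF_1 entry with SF1_SUNW_ADDR32 set has args->addr32 set
         * here, so that the rest of exec can confine the otherwise LP64
         * process to a 32-bit sized address space.
         */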
 712 
 713         aux = bigwad->elfargs;
 714         /*
 715          * Move args to the user's stack.
 716          * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM
 717          * aux entries.
 718          */
 719         if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
 720                 if (error == -1) {
 721                         error = ENOEXEC;
 722                         goto bad;
 723                 }
 724                 goto out;
 725         }
 726         /* we're single threaded after this point */
 727 
 728         /*
 729          * If this is an ET_DYN executable (shared object),
 730          * determine its memory size so that mapelfexec() can load it.
 731          */
 732         if (ehdrp->e_type == ET_DYN)
 733                 len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
 734         else
 735                 len = 0;
 736 
 737         dtrphdr = NULL;
 738 
 739         error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
 740             &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
 741             len, execsz, &brksize);
 742         /*
 743          * Our uphdr has been dynamically allocated if (and only if) its
 744          * program header flags are clear.  To avoid leaks, this must be
 745          * checked regardless of whether mapelfexec() emitted an error.
 746          */
 747         dynuphdr = (uphdr != NULL && uphdr->p_flags == 0);
 748 
 749         if (error != 0) {
 750                 goto bad;
 751         }
 752 
 753         if (uphdr != NULL && intphdr == NULL)
 754                 goto bad;
 755 
 756         if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 757                 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
 758                 goto bad;
 759         }
 760 
 761         if (intphdr != NULL) {
 762                 size_t          len;
 763                 uintptr_t       lddata;
 764                 char            *p;
 765                 struct vnode    *nvp;
 766 
 767                 dlnsize = intphdr->p_filesz + nsize;
 768 
 769                 /*
 770                  * Make sure none of the component pieces of dlnsize result in
 771                  * an oversized or zeroed result.
 772                  */
 773                 if (intphdr->p_filesz > MAXPATHLEN || dlnsize > MAXPATHLEN ||
 774                     dlnsize == 0 || dlnsize < intphdr->p_filesz) {
 775                         goto bad;
 776                 }
 777 
 778                 if (nsize != 0) {
 779                         bcopy(args->brand_nroot, dlnp, nsize - 1);
 780                         dlnp[nsize - 1] = '/';
 781                 }
 782 
 783                 /*
 784                  * Read in "interpreter" pathname.
 785                  */
 786                 if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
 787                     (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset,
 788                     UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
 789                         uprintf("%s: Cannot obtain interpreter pathname\n",
 790                             exec_file);
 791                         goto bad;
 792                 }
 793 
 794                 if (resid != 0 || dlnp[dlnsize - 1] != '\0')
 795                         goto bad;
 796 
 797                 /*
 798                  * Search for '$ORIGIN' token in interpreter path.
 799                  * If found, expand it.
 800                  */
 801                 for (p = dlnp; p = strchr(p, '$'); ) {
 802                         uint_t  len, curlen;
 803                         char    *_ptr;
 804 
 805                         if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
 806                                 continue;
 807 
 808                         /*
 809                          * We don't support $ORIGIN on setid programs to close
 810                          * a potential attack vector.
 811                          */
 812                         if ((setid & EXECSETID_SETID) != 0) {
 813                                 error = ENOEXEC;
 814                                 goto bad;
 815                         }
 816 
 817                         curlen = 0;
 818                         len = p - dlnp - 1;
 819                         if (len) {
 820                                 bcopy(dlnp, pathbufp, len);
 821                                 curlen += len;
 822                         }
 823                         if (_ptr = strrchr(args->pathname, '/')) {
 824                                 len = _ptr - args->pathname;
 825                                 if ((curlen + len) > MAXPATHLEN)
 826                                         break;
 827 
 828                                 bcopy(args->pathname, &pathbufp[curlen], len);
 829                                 curlen += len;
 830                         } else {
 831                                 /*
  832                                  * The executable is a basename found in
  833                                  * the current directory, so just
  834                                  * substitute '.' for $ORIGIN.
 835                                  */
 836                                 pathbufp[curlen] = '.';
 837                                 curlen++;
 838                         }
 839                         p += ORIGIN_STR_SIZE;
 840                         len = strlen(p);
 841 
 842                         if ((curlen + len) > MAXPATHLEN)
 843                                 break;
 844                         bcopy(p, &pathbufp[curlen], len);
 845                         curlen += len;
 846                         pathbufp[curlen++] = '\0';
 847                         bcopy(pathbufp, dlnp, curlen);
 848                 }
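                /*
                 * Worked example of the expansion above, using a
                 * hypothetical path: if the executable was run as
                 * /opt/app/bin/prog and PT_INTERP names
                 * "$ORIGIN/../lib/ld.so.1", the token is replaced with the
                 * executable's directory, leaving
                 * "/opt/app/bin/../lib/ld.so.1" in dlnp.  A bare basename
                 * invocation substitutes "." instead.
                 */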
 849 
 850                 /*
 851                  * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
 852                  * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
 853                  * Just in case /usr is not mounted, change it now.
 854                  */
 855                 if (strcmp(dlnp, USR_LIB_RTLD) == 0)
 856                         dlnp += 4;
 857                 error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
 858                 if (error && dlnp != bigwad->dl_name) {
 859                         /* new kernel, old user-level */
 860                         error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
 861                             NULLVPP, &nvp);
 862                 }
 863                 if (error) {
 864                         uprintf("%s: Cannot find %s\n", exec_file, dlnp);
 865                         goto bad;
 866                 }
 867 
 868                 /*
 869                  * Setup the "aux" vector.
 870                  */
 871                 if (uphdr) {
 872                         if (ehdrp->e_type == ET_DYN) {
 873                                 /* don't use the first page */
 874                                 bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
 875                                 bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
 876                         } else {
 877                                 bigwad->exenv.ex_bssbase = bssbase;
 878                                 bigwad->exenv.ex_brkbase = brkbase;
 879                         }
 880                         bigwad->exenv.ex_brksize = brksize;
 881                         bigwad->exenv.ex_magic = elfmagic;
 882                         bigwad->exenv.ex_vp = vp;
 883                         setexecenv(&bigwad->exenv);
 884 
 885                         ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
 886                         ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
 887                         ADDAUX(aux, AT_PHNUM, nphdrs)
 888                         ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
 889                 } else {
 890                         if ((error = execopen(&vp, &fd)) != 0) {
 891                                 VN_RELE(nvp);
 892                                 goto bad;
 893                         }
 894 
 895                         ADDAUX(aux, AT_EXECFD, fd)
 896                 }
 897 
 898                 if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
 899                         VN_RELE(nvp);
 900                         uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
 901                         goto bad;
 902                 }
 903 
 904                 /*
 905                  * Now obtain the ELF header along with the entire program
 906                  * header contained in "nvp".
 907                  */
 908                 kmem_free(phdrbase, phdrsize);
 909                 phdrbase = NULL;
 910                 if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
 911                     &shstrndx, &nphdrs)) != 0 ||
 912                     (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
 913                     &phdrsize)) != 0) {
 914                         VN_RELE(nvp);
 915                         uprintf("%s: Cannot read %s\n", exec_file, dlnp);
 916                         goto bad;
 917                 }
 918 
 919                 /*
 920                  * Determine memory size of the "interpreter's" loadable
 921                  * sections.  This size is then used to obtain the virtual
 922                  * address of a hole, in the user's address space, large
 923                  * enough to map the "interpreter".
 924                  */
 925                 if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
 926                         VN_RELE(nvp);
 927                         uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
 928                         goto bad;
 929                 }
 930 
 931                 dtrphdr = NULL;
 932 
 933                 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
 934                     &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
 935                     execsz, NULL);
 936 
 937                 if (error || junk != NULL) {
 938                         VN_RELE(nvp);
 939                         uprintf("%s: Cannot map %s\n", exec_file, dlnp);
 940                         goto bad;
 941                 }
 942 
 943                 /*
 944                  * We use the DTrace program header to initialize the
 945                  * architecture-specific user per-LWP location. The dtrace
 946                  * fasttrap provider requires ready access to per-LWP scratch
 947                  * space. We assume that there is only one such program header
 948                  * in the interpreter.
 949                  */
 950                 if (dtrphdr != NULL &&
 951                     dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 952                         VN_RELE(nvp);
 953                         uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
 954                         goto bad;
 955                 }
 956 
 957                 VN_RELE(nvp);
 958                 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
 959         }
 960 
 961         if (hasauxv) {
 962                 int auxf = AF_SUN_HWCAPVERIFY;
 963 #if defined(__amd64)
 964                 size_t fpsize;
 965                 int fptype;
 966 #endif /* defined(__amd64) */
 967 
 968                 /*
 969                  * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were
 970                  * filled in via exec_args()
 971                  */
 972                 ADDAUX(aux, AT_BASE, voffset)
 973                 ADDAUX(aux, AT_FLAGS, at_flags)
 974                 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
 975                 /*
 976                  * Linker flags. (security)
 977                  * p_flag not yet set at this time.
 978                  * We rely on gexec() to provide us with the information.
 979                  * If the application is set-uid but this is not reflected
 980                  * in a mismatch between real/effective uids/gids, then
 981                  * don't treat this as a set-uid exec.  So we care about
 982                  * the EXECSETID_UGIDS flag but not the ...SETID flag.
 983                  */
 984                 if ((setid &= ~EXECSETID_SETID) != 0)
 985                         auxf |= AF_SUN_SETUGID;
 986 
 987                 /*
 988                  * If we're running a native process from within a branded
 989                  * zone under pfexec then we clear the AF_SUN_SETUGID flag so
 990                  * that the native ld.so.1 is able to link with the native
 991                  * libraries instead of using the brand libraries that are
 992                  * installed in the zone.  We only do this for processes
 993                  * which we trust because we see they are already running
 994                  * under pfexec (where uid != euid).  This prevents a
 995                  * malicious user within the zone from crafting a wrapper to
  996                  * run native suid commands with insecure libraries interposed.
 997                  */
 998                 if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
 999                     (setid &= ~EXECSETID_SETID) != 0))
1000                         auxf &= ~AF_SUN_SETUGID;
1001 
1002                 /*
1003                  * Record the user addr of the auxflags aux vector entry
1004                  * since brands may optionally want to manipulate this field.
1005                  */
1006                 args->auxp_auxflags =
1007                     (char *)((char *)args->stackend +
1008                     ((char *)&aux->a_type -
1009                     (char *)bigwad->elfargs));
1010                 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
1011 
1012                 /*
1013                  * Record information about the real and effective user and
1014                  * group IDs.
1015                  */
1016                 if (cred != NULL) {
1017                         ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
1018                         ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
1019                         ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
1020                         ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
1021                 }
1022 
1023                 /*
1024                  * Hardware capability flag word (performance hints)
1025                  * Used for choosing faster library routines.
1026                  * (Potentially different between 32-bit and 64-bit ABIs)
1027                  */
1028 #if defined(_LP64)
1029                 if (args->to_model == DATAMODEL_NATIVE) {
1030                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
1031                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
1032                 } else {
1033                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
1034                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
1035                 }
1036 #else
1037                 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
1038                 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
1039 #endif
1040                 if (branded) {
1041                         /*
1042                          * Reserve space for the brand-private aux vectors,
1043                          * and record the user addr of that space.
1044                          */
1045                         args->auxp_brand =
1046                             (char *)((char *)args->stackend +
1047                             ((char *)&aux->a_type -
1048                             (char *)bigwad->elfargs));
1049                         ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
1050                         ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
1051                         ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
1052                         ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
1053                 }
1054 
1055                 /*
1056                  * Add the comm page auxv entry, mapping it in if needed. Also
1057                  * take care of the FPU entries.
1058                  */
1059 #if defined(__amd64)
1060                 if (args->commpage != (uintptr_t)NULL ||
1061                     (args->commpage = (uintptr_t)comm_page_mapin()) !=
1062                     (uintptr_t)NULL) {
1063                         ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
1064                 } else {
1065                         /*
1066                          * If the comm page cannot be mapped, pad out the auxv
1067                          * to satisfy later size checks.
1068                          */
1069                         ADDAUX(aux, AT_NULL, 0)
1070                 }
1071 
1072                 fptype = AT_386_FPINFO_NONE;
1073                 fpu_auxv_info(&fptype, &fpsize);
1074                 if (fptype != AT_386_FPINFO_NONE) {
1075                         ADDAUX(aux, AT_SUN_FPTYPE, fptype)
1076                         ADDAUX(aux, AT_SUN_FPSIZE, fpsize)
1077                 } else {
1078                         ADDAUX(aux, AT_NULL, 0)
1079                         ADDAUX(aux, AT_NULL, 0)
1080                 }
1081 #endif /* defined(__amd64) */
1082 
1083                 ADDAUX(aux, AT_NULL, 0)
1084                 postfixsize = (uintptr_t)aux - (uintptr_t)bigwad->elfargs;
1085 
1086                 /*
1087                  * We make assumptions above when we determine how many aux
1088                  * vector entries we will be adding. However, if we have an
1089                  * invalid elf file, it is possible that mapelfexec might
1090                  * behave differently (but not return an error), in which case
1091                  * the number of aux entries we actually add will be different.
1092                  * We detect that now and error out.
1093                  */
1094                 if (postfixsize != args->auxsize) {
1095                         DTRACE_PROBE2(elfexec_badaux, size_t, postfixsize,
1096                             size_t, args->auxsize);
1097                         goto bad;
1098                 }
1099                 ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
1100         }
1101 
1102         /*
1103          * For the 64-bit kernel, the limit is big enough that rounding it up
1104          * to a page can overflow the 64-bit limit, so we check for btopr()
1105          * overflowing here by comparing it with the unrounded limit in pages.
1106          * If it hasn't overflowed, compare the exec size with the rounded up
1107          * limit in pages.  Otherwise, just compare with the unrounded limit.
1108          */
1109         limit = btop(p->p_vmem_ctl);
1110         roundlimit = btopr(p->p_vmem_ctl);
1111         if ((roundlimit > limit && *execsz > roundlimit) ||
1112             (roundlimit < limit && *execsz > limit)) {
1113                 mutex_enter(&p->p_lock);
1114                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1115                     RCA_SAFE);
1116                 mutex_exit(&p->p_lock);
1117                 error = ENOMEM;
1118                 goto bad;
1119         }
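        /*
         * To illustrate the guard above (assuming 4K pages): if p_vmem_ctl
         * were UINT64_MAX, btop() gives roughly 2^52 pages while btopr()'s
         * round-up addition wraps to a near-zero page count, so
         * roundlimit < limit and the unrounded limit is used for the
         * comparison instead.
         */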
1120 
1121         bzero(up->u_auxv, sizeof (up->u_auxv));
1122         up->u_commpagep = args->commpage;
1123         if (postfixsize) {
1124                 size_t num_auxv;
1125 
1126                 /*
1127                  * Copy the aux vector to the user stack.
1128                  */
1129                 error = execpoststack(args, bigwad->elfargs, postfixsize);
1130                 if (error)
1131                         goto bad;
1132 
1133                 /*
1134                  * Copy auxv to the process's user structure for use by /proc.
1135                  * If this is a branded process, the brand's exec routine will
 1136                  * copy its private entries to the user structure later. It
1137                  * relies on the fact that the blank entries are at the end.
1138                  */
1139                 num_auxv = postfixsize / sizeof (aux_entry_t);
1140                 ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
1141                 aux = bigwad->elfargs;
1142                 for (i = 0; i < num_auxv; i++) {
1143                         up->u_auxv[i].a_type = aux[i].a_type;
1144                         up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
1145                 }
1146         }
1147 
1148         /*
1149          * Pass back the starting address so we can set the program counter.
1150          */
1151         args->entry = (uintptr_t)(ehdrp->e_entry + voffset);
1152 
1153         if (!uphdr) {
1154                 if (ehdrp->e_type == ET_DYN) {
1155                         /*
1156                          * If we are executing a shared library which doesn't
1157                          * have a interpreter (probably ld.so.1) then
 1158                          * have an interpreter (probably ld.so.1) then
 1159                          * we don't set the brkbase now.  Instead we
 1160                          * delay its setting until the first call
1161                          * initialize brkbase to the tail of the executable it
1162                          * loads (which is where it needs to be).
1163                          */
1164                         bigwad->exenv.ex_brkbase = (caddr_t)0;
1165                         bigwad->exenv.ex_bssbase = (caddr_t)0;
1166                         bigwad->exenv.ex_brksize = 0;
1167                 } else {
1168                         bigwad->exenv.ex_brkbase = brkbase;
1169                         bigwad->exenv.ex_bssbase = bssbase;
1170                         bigwad->exenv.ex_brksize = brksize;
1171                 }
1172                 bigwad->exenv.ex_magic = elfmagic;
1173                 bigwad->exenv.ex_vp = vp;
1174                 setexecenv(&bigwad->exenv);
1175         }
1176 
1177         ASSERT(error == 0);
1178         goto out;
1179 
1180 bad:
1181         if (fd != -1)           /* did we open the a.out yet */
1182                 (void) execclose(fd);
1183 
1184         psignal(p, SIGKILL);
1185 
1186         if (error == 0)
1187                 error = ENOEXEC;
1188 out:
1189         if (dynuphdr)
1190                 kmem_free(uphdr, sizeof (Phdr));
1191         if (phdrbase != NULL)
1192                 kmem_free(phdrbase, phdrsize);
1193         if (cap != NULL)
1194                 kmem_free(cap, capsize);
1195         kmem_free(bigwad, sizeof (struct bigwad));
1196         return (error);
1197 }
1198 
1199 /*
1200  * Compute the memory size requirement for the ELF file.
1201  */
1202 static size_t
1203 elfsize(const Ehdr *ehdrp, uint_t nphdrs, const caddr_t phdrbase,
1204     uintptr_t *lddata)
1205 {
1206         const Phdr *phdrp = (Phdr *)phdrbase;
1207         const uint_t hsize = ehdrp->e_phentsize;
1208         boolean_t dfirst = B_TRUE;
1209         uintptr_t loaddr = UINTPTR_MAX;
1210         uintptr_t hiaddr = 0;
1211         uint_t i;
1212 
1213         for (i = nphdrs; i > 0; i--) {
1214                 if (phdrp->p_type == PT_LOAD) {
1215                         const uintptr_t lo = phdrp->p_vaddr;
1216                         const uintptr_t hi = lo + phdrp->p_memsz;
1217 
1218                         loaddr = MIN(lo, loaddr);
1219                         hiaddr = MAX(hi, hiaddr);
1220 
1221                         /*
1222                          * save the address of the first data segment
 1223                          * of an object - used for the AT_SUN_LDDATA
1224                          * aux entry.
1225                          */
1226                         if ((lddata != NULL) && dfirst &&
1227                             (phdrp->p_flags & PF_W)) {
1228                                 *lddata = lo;
1229                                 dfirst = B_FALSE;
1230                         }
1231                 }
1232                 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
1233         }
1234 
1235         if (hiaddr <= loaddr) {
1236                 /* No non-zero PT_LOAD segment found */
1237                 return (0);
1238         }
1239 
1240         return (roundup(hiaddr - (loaddr & PAGEMASK), PAGESIZE));
1241 }
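/*
 * Worked example (assuming 4K pages): an object with PT_LOAD segments
 * covering [0x10000, 0x10500) and [0x20000, 0x21000) yields loaddr 0x10000
 * and hiaddr 0x21000, so the size returned is
 * roundup(0x21000 - (0x10000 & PAGEMASK), PAGESIZE) == 0x11000.  If lddata
 * is non-NULL it is set to the p_vaddr of the first writable PT_LOAD
 * segment.
 */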
1242 
1243 /*
1244  * Read in the ELF header and program header table.
1245  * SUSV3 requires:
1246  *      ENOEXEC File format is not recognized
1247  *      EINVAL  Format recognized but execution not supported
1248  */
1249 static int
1250 getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs,
1251     uint_t *shstrndx, uint_t *nphdrs)
1252 {
1253         int error;
1254         ssize_t resid;
1255 
1256         /*
1257          * We got here by the first two bytes in ident,
1258          * now read the entire ELF header.
1259          */
1260         if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, sizeof (Ehdr),
1261             (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid)) != 0) {
1262                 return (error);
1263         }
1264 
1265         /*
1266          * Since a separate version is compiled for handling 32-bit and
1267          * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
1268          * doesn't need to be able to deal with 32-bit ELF files.
1269          */
1270         if (resid != 0 ||
1271             ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
1272             ehdr->e_ident[EI_MAG3] != ELFMAG3) {
1273                 return (ENOEXEC);
1274         }
1275 
1276         if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
1277 #if defined(_ILP32) || defined(_ELF32_COMPAT)
1278             ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
1279 #else
1280             ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
1281 #endif
1282             !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
1283             ehdr->e_flags)) {
1284                 return (EINVAL);
1285         }
1286 
1287         *nshdrs = ehdr->e_shnum;
1288         *shstrndx = ehdr->e_shstrndx;
1289         *nphdrs = ehdr->e_phnum;
1290 
1291         /*
1292          * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
1293          * to read in the section header at index zero to access the true
1294          * values for those fields.
1295          */
1296         if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
1297             *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
1298                 Shdr shdr;
1299 
1300                 if (ehdr->e_shoff == 0)
1301                         return (EINVAL);
1302 
1303                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
1304                     sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
1305                     (rlim64_t)0, credp, NULL)) != 0)
1306                         return (error);
1307 
1308                 if (*nshdrs == 0)
1309                         *nshdrs = shdr.sh_size;
1310                 if (*shstrndx == SHN_XINDEX)
1311                         *shstrndx = shdr.sh_link;
1312                 if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
1313                         *nphdrs = shdr.sh_info;
1314         }
1315 
1316         return (0);
1317 }
1318 
1319 /*
1320  * We use members through p_flags on 32-bit files and p_memsz on 64-bit files,
1321  * so e_phentsize must be at least large enough to include those members.
1322  */
1323 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1324 #define MINPHENTSZ      (offsetof(Phdr, p_flags) + \
1325                         sizeof (((Phdr *)NULL)->p_flags))
1326 #else
1327 #define MINPHENTSZ      (offsetof(Phdr, p_memsz) + \
1328                         sizeof (((Phdr *)NULL)->p_memsz))
1329 #endif
1330 
1331 static int
1332 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nphdrs,
1333     caddr_t *phbasep, size_t *phsizep)
1334 {
1335         int err;
1336 
1337         /*
1338          * Ensure that e_phentsize is large enough for the required fields
1339          * to be accessible and is a multiple of 4 bytes.
1340          */
1341         if (ehdr->e_phentsize < MINPHENTSZ || (ehdr->e_phentsize & 3))
1342                 return (EINVAL);
1343 
1344         *phsizep = nphdrs * ehdr->e_phentsize;
1345 
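             /*
              * Program header tables larger than the expected maximum
              * (elf_nphdr_max entries) are allocated without sleeping, so a
              * huge e_phnum in a corrupt or hostile binary cannot block this
              * thread waiting for memory; typical sizes may sleep.
              */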
1346         if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1347                 if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1348                         return (ENOMEM);
1349         } else {
1350                 *phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1351         }
1352 
1353         if ((err = vn_rdwr(UIO_READ, vp, *phbasep, (ssize_t)*phsizep,
1354             (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1355             credp, NULL)) != 0) {
1356                 kmem_free(*phbasep, *phsizep);
1357                 *phbasep = NULL;
1358                 return (err);
1359         }
1360 
1361         return (0);
1362 }
1363 
1364 #define MINSHDRSZ       (offsetof(Shdr, sh_entsize) + \
1365                         sizeof (((Shdr *)NULL)->sh_entsize))
1366 
1367 static int
1368 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nshdrs,
1369     uint_t shstrndx, caddr_t *shbasep, size_t *shsizep, char **shstrbasep,
1370     size_t *shstrsizep)
1371 {
1372         int err;
1373         Shdr *shdr;
1374 
1375         /*
1376          * Since we're going to be using e_shentsize to iterate down the
1377          * array of section headers, it must be a multiple of 4 bytes or
1378          * else we might cause a misaligned access.  We use all members
1379          * through sh_entsize (on both 32- and 64-bit ELF files) so
1380          * e_shentsize must be at least large enough to include that
1381          * member.  The index of the string table section must also be valid.
1382          */
1383         if (ehdr->e_shentsize < MINSHDRSZ || (ehdr->e_shentsize & 3) ||
1384             nshdrs == 0 || shstrndx >= nshdrs)
1385                 return (EINVAL);
1386 
1387         *shsizep = nshdrs * ehdr->e_shentsize;
1388 
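             /*
              * As with the program headers above, only section header tables
              * within the expected bound (elf_nshdr_max entries) may sleep
              * for their allocation.
              */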
1389         if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1390                 if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1391                         return (ENOMEM);
1392         } else {
1393                 *shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1394         }
1395 
1396         if ((err = vn_rdwr(UIO_READ, vp, *shbasep, (ssize_t)*shsizep,
1397             (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1398             credp, NULL)) != 0) {
1399                 kmem_free(*shbasep, *shsizep);
1400                 return (err);
1401         }
1402 
1403         /*
1404          * Grab the section string table.  Walking through the shdrs is
1405          * pointless if their names cannot be interrogated.
1406          */
1407         shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1408         if ((*shstrsizep = shdr->sh_size) == 0) {
1409                 kmem_free(*shbasep, *shsizep);
1410                 return (EINVAL);
1411         }
1412 
1413         if (*shstrsizep > elf_shstrtab_max) {
1414                 if ((*shstrbasep = kmem_alloc(*shstrsizep,
1415                     KM_NOSLEEP)) == NULL) {
1416                         kmem_free(*shbasep, *shsizep);
1417                         return (ENOMEM);
1418                 }
1419         } else {
1420                 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1421         }
1422 
1423         if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, (ssize_t)*shstrsizep,
1424             (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1425             credp, NULL)) != 0) {
1426                 kmem_free(*shbasep, *shsizep);
1427                 kmem_free(*shstrbasep, *shstrsizep);
1428                 return (err);
1429         }
1430 
1431         /*
1432                  * Null-terminate the strtab so that we cannot run off
1433                  * the end of the table.
1434          */
1435         (*shstrbasep)[*shstrsizep - 1] = '\0';
1436 
1437         return (0);
1438 }
1439 
1440 
1441 int
1442 elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, uint_t *nphdrs,
1443     caddr_t *phbasep, size_t *phsizep)
1444 {
1445         int error;
1446         uint_t nshdrs, shstrndx;
1447 
1448         if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
1449             nphdrs)) != 0 ||
1450             (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
1451             phsizep)) != 0) {
1452                 return (error);
1453         }
1454         return (0);
1455 }
1456 
1457 
1458 static int
1459 mapelfexec(
1460         vnode_t *vp,
1461         Ehdr *ehdr,
1462         uint_t nphdrs,
1463         caddr_t phdrbase,
1464         Phdr **uphdr,
1465         Phdr **intphdr,
1466         Phdr **stphdr,
1467         Phdr **dtphdr,
1468         Phdr *dataphdrp,
1469         caddr_t *bssbase,
1470         caddr_t *brkbase,
1471         intptr_t *voffset,
1472         uintptr_t *minaddrp,
1473         size_t len,
1474         size_t *execsz,
1475         size_t *brksize)
1476 {
1477         Phdr *phdr;
1478         int error, page, prot, lastprot = 0;
1479         caddr_t addr = NULL;
1480         caddr_t minaddr = (caddr_t)UINTPTR_MAX;
1481         uint_t i;
1482         size_t zfodsz, memsz;
1483         boolean_t ptload = B_FALSE;
1484         off_t offset;
1485         const uint_t hsize = ehdr->e_phentsize;
1486         uintptr_t lastaddr = 0;
1487         extern int use_brk_lpg;
1488 
1489         if (ehdr->e_type == ET_DYN) {
1490                 caddr_t vaddr;
1491                 secflagset_t flags = 0;
1492                 /*
1493                  * Obtain the virtual address of a hole in the
1494                  * address space to map the "interpreter".
1495                  */
1496                 if (secflag_enabled(curproc, PROC_SEC_ASLR))
1497                         flags |= _MAP_RANDOMIZE;
1498 
1499                 map_addr(&addr, len, (offset_t)0, 1, flags);
1500                 if (addr == NULL)
1501                         return (ENOMEM);
1502 
1503                 /*
1504                  * Despite the fact that mmapobj(2) refuses to load them, we
1505                  * need to support executing ET_DYN objects that have a
1506                  * non-NULL p_vaddr.  When found in the wild, these objects
1507                  * are likely to be due to an old (and largely obviated) Linux
1508                  * facility, prelink(8), that rewrites shared objects to
1509                  * prefer specific (disjoint) virtual address ranges.  (Yes,
1510                  * this is putatively for performance -- and yes, it has
1511                  * limited applicability, many edge conditions and grisly
1512                  * failure modes; even for Linux, it's insane.)  As ELF
1513                  * mandates that the PT_LOAD segments be in p_vaddr order, we
1514                  * find the lowest p_vaddr by finding the first PT_LOAD
1515                  * segment.
1516                  */
1517                 phdr = (Phdr *)phdrbase;
1518                 for (i = nphdrs; i > 0; i--) {
1519                         if (phdr->p_type == PT_LOAD) {
1520                                 addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
1521                                 break;
1522                         }
1523                         phdr = (Phdr *)((caddr_t)phdr + hsize);
1524                 }
1525 
1526                 /*
1527                  * We have a non-zero p_vaddr in the first PT_LOAD segment --
1528                  * presumably because we're directly executing a prelink(8)'d
1529                  * ld-linux.so.  While we could correctly execute such an
1530                  * object without locating it at its desired p_vaddr (it is,
1531                  * after all, still relocatable), our inner antiquarian
1532                  * derives a perverse pleasure in accommodating the steampunk
1533                  * prelink(8) contraption -- goggles on!
1534                  */
1535                 if ((vaddr = addr) != NULL) {
1536                         if (as_gap(curproc->p_as, len, &addr, &len,
1537                             AH_LO, NULL) == -1 || addr != vaddr) {
1538                                 addr = NULL;
1539                         }
1540                 }
1541 
1542                 if (addr == NULL) {
1543                         /*
1544                          * We either have a NULL p_vaddr (the common case, by
1545                          * many orders of magnitude) or we have a non-NULL
1546                          * p_vaddr and we were unable to obtain the specified
1547                          * VA range (presumably because it's an illegal
1548                          * address).  Either way, obtain an address in which
1549                          * to map the interpreter.
1550                          */
1551                         map_addr(&addr, len, (offset_t)0, 1, 0);
1552                         if (addr == NULL)
1553                                 return (ENOMEM);
1554                 }
1555 
1556                 /*
1557                  * Our voffset is the difference between where we landed and
1558                  * where we wanted to be.
1559                  */
1560                 *voffset = (uintptr_t)addr - (uintptr_t)vaddr;
1561         } else {
1562                 *voffset = 0;
1563         }
1564 
1565         phdr = (Phdr *)phdrbase;
1566         for (i = nphdrs; i > 0; i--) {
1567                 switch (phdr->p_type) {
1568                 case PT_LOAD:
1569                         ptload = B_TRUE;
1570                         prot = PROT_USER;
1571                         if (phdr->p_flags & PF_R)
1572                                 prot |= PROT_READ;
1573                         if (phdr->p_flags & PF_W)
1574                                 prot |= PROT_WRITE;
1575                         if (phdr->p_flags & PF_X)
1576                                 prot |= PROT_EXEC;
1577 
1578                         addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1579 
1580                         if ((*intphdr != NULL) && uphdr != NULL &&
1581                             (*uphdr == NULL)) {
1582                                 /*
1583                                  * The PT_PHDR program header is, strictly
1584                                  * speaking, optional.  If we find that this
1585                                  * is missing, we will determine the location
1586                                  * of the program headers based on the address
1587                                  * of the lowest PT_LOAD segment (namely, this
1588                                  * one):  we subtract the p_offset to get to
1589                                  * the ELF header and then add back the program
1590                                  * header offset to get to the program headers.
1591                                  * We then cons up a Phdr that corresponds to
1592                                  * the (missing) PT_PHDR, setting the flags
1593                                  * to 0 to denote that this is artificial and
1594                                  * should (must) be freed by the caller.
1595                                  */
1596                                 Phdr *cons;
1597 
1598                                 cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
1599 
1600                                 cons->p_flags = 0;
1601                                 cons->p_type = PT_PHDR;
1602                                 cons->p_vaddr = ((uintptr_t)addr -
1603                                     phdr->p_offset) + ehdr->e_phoff;
1604 
1605                                 *uphdr = cons;
1606                         }
1607 
1608                         /*
1609                          * The ELF spec dictates that p_filesz may not be
1610                          * larger than p_memsz in PT_LOAD segments.
1611                          */
1612                         if (phdr->p_filesz > phdr->p_memsz) {
1613                                 error = EINVAL;
1614                                 goto bad;
1615                         }
1616 
1617                         /*
1618                          * Keep track of the segment with the lowest starting
1619                          * address.
1620                          */
1621                         if (addr < minaddr)
1622                                 minaddr = addr;
1623 
1624                         /*
1625                          * Segments need not correspond to page boundaries:
1626                          * they are permitted to share a page.  If two PT_LOAD
1627                          * segments share the same page, and the permissions
1628                          * of the segments differ, the behavior is historically
1629                          * that the permissions of the latter segment are used
1630                          * for the page that the two segments share.  This is
1631                          * also historically a non-issue:  binaries generated
1632                          * by most anything will make sure that two PT_LOAD
1633                          * segments with differing permissions don't actually
1634                          * share any pages.  However, there exist some crazy
1635                          * things out there (including at least an obscure
1636                          * Portuguese teaching language called G-Portugol) that
1637                          * actually do the wrong thing and expect it to work:
1638                          * they have a segment with execute permission share
1639                          * a page with a subsequent segment that does not
1640                          * have execute permissions and expect the resulting
1641                          * shared page to in fact be executable.  To accommodate
1642                          * such broken link editors, we take advantage of a
1643                          * latitude explicitly granted to the loader:  it is
1644                          * permitted to make _any_ PT_LOAD segment executable
1645                          * (provided that it is readable or writable).  If we
1646                          * see that we're sharing a page and that the previous
1647                          * page was executable, we will add execute permissions
1648                          * to our segment.
1649                          */
1650                         if (btop(lastaddr) == btop((uintptr_t)addr) &&
1651                             (phdr->p_flags & (PF_R | PF_W)) &&
1652                             (lastprot & PROT_EXEC)) {
1653                                 prot |= PROT_EXEC;
1654                         }
1655 
1656                         lastaddr = (uintptr_t)addr + phdr->p_filesz;
1657                         lastprot = prot;
1658 
1659                         zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1660 
1661                         offset = phdr->p_offset;
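                             /*
                              * The file's pages can be mapped directly only
                              * when the file offset and the target address
                              * are congruent modulo the page size and the
                              * vnode supports mapping; the page flag tells
                              * execmap() which strategy to use.
                              */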
1662                         if (((uintptr_t)offset & PAGEOFFSET) ==
1663                             ((uintptr_t)addr & PAGEOFFSET) &&
1664                             (!(vp->v_flag & VNOMAP))) {
1665                                 page = 1;
1666                         } else {
1667                                 page = 0;
1668                         }
1669 
1670                         /*
1671                          * Set the heap pagesize for out-of-the-box large
1672                          * pages when the bss size is known and use_brk_lpg is not 0.
1673                          */
1674                         if (brksize != NULL && use_brk_lpg &&
1675                             zfodsz != 0 && phdr == dataphdrp &&
1676                             (prot & PROT_WRITE)) {
1677                                 const size_t tlen = P2NPHASE((uintptr_t)addr +
1678                                     phdr->p_filesz, PAGESIZE);
1679 
1680                                 if (zfodsz > tlen) {
1681                                         const caddr_t taddr = addr +
1682                                             phdr->p_filesz + tlen;
1683 
1684                                         /*
1685                                          * Since a hole in the AS large enough
1686                                          * for this object as calculated by
1687                                          * elfsize() is available, we do not
1688                                          * need to fear overflow for 'taddr'.
1689                                          */
1690                                         curproc->p_brkpageszc =
1691                                             page_szc(map_pgsz(MAPPGSZ_HEAP,
1692                                             curproc, taddr, zfodsz - tlen, 0));
1693                                 }
1694                         }
1695 
1696                         if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1697                             (prot & PROT_WRITE)) {
1698                                 uint_t  szc = curproc->p_brkpageszc;
1699                                 size_t pgsz = page_get_pagesize(szc);
1700                                 caddr_t ebss = addr + phdr->p_memsz;
1701                                 /*
1702                                  * If we need extra space to keep the BSS an
1703                                  * integral number of pages in size, some of
1704                                  * that space may fall beyond p_brkbase, so we
1705                                  * need to set p_brksize to account for it
1706                                  * being (logically) part of the brk.
1707                                  */
1708                                 size_t extra_zfodsz;
1709 
1710                                 ASSERT(pgsz > PAGESIZE);
1711 
1712                                 extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1713 
1714                                 if (error = execmap(vp, addr, phdr->p_filesz,
1715                                     zfodsz + extra_zfodsz, phdr->p_offset,
1716                                     prot, page, szc))
1717                                         goto bad;
1718                                 if (brksize != NULL)
1719                                         *brksize = extra_zfodsz;
1720                         } else {
1721                                 if (error = execmap(vp, addr, phdr->p_filesz,
1722                                     zfodsz, phdr->p_offset, prot, page, 0))
1723                                         goto bad;
1724                         }
1725 
1726                         if (bssbase != NULL && addr >= *bssbase &&
1727                             phdr == dataphdrp) {
1728                                 *bssbase = addr + phdr->p_filesz;
1729                         }
1730                         if (brkbase != NULL && addr >= *brkbase) {
1731                                 *brkbase = addr + phdr->p_memsz;
1732                         }
1733 
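                             /*
                              * btopr() converts the segment's memory size to
                              * a count of pages; guard the running total in
                              * *execsz against overflow before adding it.
                              */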
1734                         memsz = btopr(phdr->p_memsz);
1735                         if ((*execsz + memsz) < *execsz) {
1736                                 error = ENOMEM;
1737                                 goto bad;
1738                         }
1739                         *execsz += memsz;
1740                         break;
1741 
1742                 case PT_INTERP:
1743                         /*
1744                          * The ELF specification is unequivocal about the
1745                          * PT_INTERP program header with respect to any PT_LOAD
1746                          * program header:  "If it is present, it must precede
1747                          * any loadable segment entry." Linux, however, makes
1748                          * no attempt to enforce this -- which has allowed some
1749                          * binary editing tools to get away with generating
1750                          * invalid ELF binaries in the respect that PT_INTERP
1751                          * occurs after the first PT_LOAD program header.  This
1752                          * is unfortunate (and of course, disappointing) but
1753                          * it's no worse than that: there is no reason that we
1754                          * can't process the PT_INTERP entry (if present) after
1755                          * one or more PT_LOAD entries.  We therefore
1756                          * deliberately do not check ptload here and always
1757                          * store intphdr to be the PT_INTERP program header.
1758                          */
1759                         *intphdr = phdr;
1760                         break;
1761 
1762                 case PT_SHLIB:
1763                         *stphdr = phdr;
1764                         break;
1765 
1766                 case PT_PHDR:
1767                         if (ptload || phdr->p_flags == 0)
1768                                 goto bad;
1769 
1770                         if (uphdr != NULL)
1771                                 *uphdr = phdr;
1772 
1773                         break;
1774 
1775                 case PT_NULL:
1776                 case PT_DYNAMIC:
1777                 case PT_NOTE:
1778                         break;
1779 
1780                 case PT_SUNWDTRACE:
1781                         if (dtphdr != NULL)
1782                                 *dtphdr = phdr;
1783                         break;
1784 
1785                 default:
1786                         break;
1787                 }
1788                 phdr = (Phdr *)((caddr_t)phdr + hsize);
1789         }
1790 
1791         if (minaddrp != NULL) {
1792                 ASSERT(minaddr != (caddr_t)UINTPTR_MAX);
1793                 *minaddrp = (uintptr_t)minaddr;
1794         }
1795 
1796         if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
1797                 size_t off;
1798                 uintptr_t base = (uintptr_t)*brkbase;
1799                 uintptr_t oend = base + *brksize;
1800 
1801                 ASSERT(ISP2(aslr_max_brk_skew));
1802 
1803                 (void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1804                 base += P2PHASE(off, aslr_max_brk_skew);
1805                 base = P2ROUNDUP(base, PAGESIZE);
1806                 *brkbase = (caddr_t)base;
1807                 /*
1808                  * Above, we set *brksize to account for the possibility we
1809                  * had to grow the 'brk' in padding out the BSS to a page
1810                  * boundary.
1811                  *
1812                  * We now need to adjust that based on where we now are
1813                  * actually putting the brk.
1814                  */
1815                 if (oend > base)
1816                         *brksize = oend - base;
1817                 else
1818                         *brksize = 0;
1819         }
1820 
1821         return (0);
1822 bad:
1823         if (error == 0)
1824                 error = EINVAL;
1825         return (error);
1826 }
1827 
1828 int
1829 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1830     rlim64_t rlimit, cred_t *credp)
1831 {
1832         Note note;
1833         int error;
1834 
1835         bzero(&note, sizeof (note));
1836         bcopy("CORE", note.name, 4);
1837         note.nhdr.n_type = type;
1838         /*
1839          * The System V ABI states that n_namesz must be the length of the
1840          * string that follows the Nhdr structure including the terminating
1841          * null. The ABI also specifies that sufficient padding should be
1842          * included so that the description that follows the name string
1843          * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1844          * respectively.  However, since this padding was not implemented
1845          * correctly at the time of the 64-bit port, descriptions in both
1846          * 32- and 64-bit binaries are only aligned to a 4-byte boundary.
1847          */
1848         note.nhdr.n_namesz = 5;
1849         note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1850 
1851         if (error = core_write(vp, UIO_SYSSPACE, *offsetp, &note,
1852             sizeof (note), rlimit, credp))
1853                 return (error);
1854 
1855         *offsetp += sizeof (note);
1856 
1857         if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1858             note.nhdr.n_descsz, rlimit, credp))
1859                 return (error);
1860 
1861         *offsetp += note.nhdr.n_descsz;
1862         return (0);
1863 }
1864 
1865 
1866 /*
1867  * Copy the section data from one vnode to the section of another vnode.
1868  */
1869 static void
1870 elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst)
1871 {
1872         size_t n = src->sh_size;
1873         u_offset_t off = 0;
1874         const u_offset_t soff = src->sh_offset;
1875         const u_offset_t doff = ctx->ecc_doffset;
1876         void *buf = ctx->ecc_buf;
1877         vnode_t *dst_vp = ctx->ecc_vp;
1878         cred_t *credp = ctx->ecc_credp;
1879 
1880         /* Protect the copy loop below from overflow on the offsets */
1881         if (n > OFF_MAX || (n + soff) > OFF_MAX || (n + doff) > OFF_MAX ||
1882             (n + soff) < n || (n + doff) < n) {
1883                 dst->sh_size = 0;
1884                 dst->sh_offset = 0;
1885                 return;
1886         }
1887 
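         /*
          * Copy the section in chunks no larger than the scratch buffer.
          * A read error, a read that makes no progress, or any write
          * failure clears the destination section's size and offset so
          * that the section is recorded as absent.
          */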
1888         while (n != 0) {
1889                 const size_t len = MIN(ctx->ecc_bufsz, n);
1890                 ssize_t resid;
1891 
1892                 if (vn_rdwr(UIO_READ, src_vp, buf, (ssize_t)len,
1893                     (offset_t)(soff + off),
1894                     UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
1895                     resid >= len || resid < 0 ||
1896                     core_write(dst_vp, UIO_SYSSPACE, (offset_t)(doff + off),
1897                     buf, len - resid, ctx->ecc_rlimit, credp) != 0) {
1898                         dst->sh_size = 0;
1899                         dst->sh_offset = 0;
1900                         return;
1901                 }
1902 
1903                 ASSERT(n >= len - resid);
1904 
1905                 n -= len - resid;
1906                 off += len - resid;
1907         }
1908 
1909         ctx->ecc_doffset += src->sh_size;
1910 }
1911 
1912 /*
1913  * Walk sections for a given ELF object, counting (or copying) those of
1914  * interest (CTF, symtab, strtab, DWARF debug).
1915  *
1916  * Returns UINT_MAX on memory allocation failure.
1917  */
1918 static uint_t
1919 elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr,
1920     Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab)
1921 {
1922         Ehdr ehdr;
1923         const core_content_t content = ctx->ecc_content;
1924         cred_t *credp = ctx->ecc_credp;
1925         Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL;
1926         uintptr_t off = 0;
1927         uint_t nshdrs, shstrndx, nphdrs, count = 0;
1928         u_offset_t *doffp = &ctx->ecc_doffset;
1929         boolean_t ctf_link = B_FALSE;
1930         caddr_t shbase;
1931         size_t shsize, shstrsize;
1932         char *shstrbase;
1933 
1934         if ((content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB | CC_CONTENT_DEBUG))
1935             == 0) {
1936                 return (0);
1937         }
1938 
1939         if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx, &nphdrs) != 0 ||
1940             getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx, &shbase, &shsize,
1941             &shstrbase, &shstrsize) != 0) {
1942                 return (0);
1943         }
1944 
1945         /* Starting at index 1 skips SHT_NULL which is expected at index 0 */
1946         off = ehdr.e_shentsize;
1947         for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) {
1948                 Shdr *shdr, *symchk = NULL, *strchk;
1949                 const char *name;
1950 
1951                 shdr = (Shdr *)(shbase + off);
1952                 if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL)
1953                         continue;
1954 
1955                 name = shstrbase + shdr->sh_name;
1956 
1957                 if (ctf == NULL &&
1958                     (content & CC_CONTENT_CTF) != 0 &&
1959                     strcmp(name, shstrtab_data[STR_CTF]) == 0) {
1960                         ctf = shdr;
1961                         if (ctf->sh_link != 0 && ctf->sh_link < nshdrs) {
1962                                 /* check linked symtab below */
1963                                 symchk = (Shdr *)(shbase +
1964                                     shdr->sh_link * ehdr.e_shentsize);
1965                                 ctf_link = B_TRUE;
1966                         } else {
1967                                 continue;
1968                         }
1969                 } else if (symtab == NULL &&
1970                     (content & CC_CONTENT_SYMTAB) != 0 &&
1971                     strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) {
1972                         symchk = shdr;
1973                 } else if ((content & CC_CONTENT_DEBUG) != 0 &&
1974                     strncmp(name, ".debug_", strlen(".debug_")) == 0) {
1975                         /*
1976                          * The design of the above check is intentional.  In
1977                          * particular, we want to capture any sections that
1978                          * begin with '.debug_' for a few reasons:
1979                          *
1980                          * 1) Various revisions to the DWARF spec end up
1981                          * changing the set of section headers that
1982                          * exist. This ensures that we don't need to change
1983                          * the kernel to get a new version.
1984                          *
1985                          * 2) Other software uses .debug_ sections for things
1986                          * which aren't DWARF. This allows them to be captured
1987                          * as well.
1988                          *
1989                          * Because of this, we emit straight here, unlike the
1990                          * other two sections where we wait until we're done
1991                          * scanning.
1992                          */
1993 
1994                         /* We're only counting, don't emit! */
1995                         if (v == NULL) {
1996                                 count++;
1997                                 continue;
1998                         }
1999 
2000                         elf_ctx_resize_scratch(ctx, shdr->sh_size);
2001                         if (!shstrtab_ndx(shstrtab, name, &v[idx].sh_name)) {
2002                                 count = UINT_MAX;
2003                                 goto done;
2004                         }
2005                         v[idx].sh_addr = (Addr)(uintptr_t)saddr;
2006                         v[idx].sh_type = shdr->sh_type;
2007                         v[idx].sh_addralign = shdr->sh_addralign;
2008                         *doffp = roundup(*doffp, v[idx].sh_addralign);
2009                         v[idx].sh_offset = *doffp;
2010                         v[idx].sh_size = shdr->sh_size;
2011                         v[idx].sh_link = 0;
2012                         v[idx].sh_entsize = shdr->sh_entsize;
2013                         v[idx].sh_info = shdr->sh_info;
2014 
2015                         elf_copy_scn(ctx, shdr, mvp, &v[idx]);
2016                         count++;
2017                         idx++;
2018                         continue;
2019                 } else {
2020                         continue;
2021                 }
2022 
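                     /*
                      * At this point symchk is a candidate symbol table,
                      * found either directly or via the CTF section's
                      * sh_link.  It is accepted only if it is a SYMTAB or
                      * DYNSYM with a valid associated string table.
                      */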
2023                 ASSERT(symchk != NULL);
2024                 if ((symchk->sh_type != SHT_DYNSYM &&
2025                     symchk->sh_type != SHT_SYMTAB) ||
2026                     symchk->sh_link == 0 || symchk->sh_link >= nshdrs) {
2027                         ctf_link = B_FALSE;
2028                         continue;
2029                 }
2030                 strchk = (Shdr *)(shbase + symchk->sh_link * ehdr.e_shentsize);
2031                 if (strchk->sh_type != SHT_STRTAB) {
2032                         ctf_link = B_FALSE;
2033                         continue;
2034                 }
2035                 symtab = symchk;
2036                 strtab = strchk;
2037 
2038                 if (symtab != NULL && ctf != NULL &&
2039                     (content & CC_CONTENT_DEBUG) == 0) {
2040                         /* No other shdrs are of interest at this point */
2041                         break;
2042                 }
2043         }
2044 
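             /*
              * A CTF section requires one shdr; a symbol table requires two
              * (the symtab itself plus its associated strtab).
              */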
2045         if (ctf != NULL)
2046                 count += 1;
2047         if (symtab != NULL)
2048                 count += 2;
2049 
2050         if (v == NULL || count == 0 || count > remain) {
2051                 count = MIN(count, remain);
2052                 goto done;
2053         }
2054 
2055         /* output CTF section */
2056         if (ctf != NULL) {
2057                 elf_ctx_resize_scratch(ctx, ctf->sh_size);
2058 
2059                 if (!shstrtab_ndx(shstrtab, shstrtab_data[STR_CTF],
2060                     &v[idx].sh_name)) {
2061                         count = UINT_MAX;
2062                         goto done;
2063                 }
2064 
2065                 v[idx].sh_addr = (Addr)(uintptr_t)saddr;
2066                 v[idx].sh_type = SHT_PROGBITS;
2067                 v[idx].sh_addralign = 4;
2068                 *doffp = roundup(*doffp, v[idx].sh_addralign);
2069                 v[idx].sh_offset = *doffp;
2070                 v[idx].sh_size = ctf->sh_size;
2071 
2072                 if (ctf_link) {
2073                         /*
2074                          * The linked symtab (and strtab) will be output
2075                          * immediately after this CTF section.  Its shdr index
2076                          * directly follows this one.
2077                          */
2078                         v[idx].sh_link = idx + 1;
2079                         ASSERT(symtab != NULL);
2080                 } else {
2081                         v[idx].sh_link = 0;
2082                 }
2083                 elf_copy_scn(ctx, ctf, mvp, &v[idx]);
2084                 idx++;
2085         }
2086 
2087         /* output SYMTAB/STRTAB sections */
2088         if (symtab != NULL) {
2089                 uint_t symtab_name, strtab_name;
2090 
2091                 elf_ctx_resize_scratch(ctx,
2092                     MAX(symtab->sh_size, strtab->sh_size));
2093 
2094                 if (symtab->sh_type == SHT_DYNSYM) {
2095                         if (!shstrtab_ndx(shstrtab, shstrtab_data[STR_DYNSYM],
2096                             &symtab_name) ||
2097                             !shstrtab_ndx(shstrtab, shstrtab_data[STR_DYNSTR],
2098                             &strtab_name)) {
2099                                 count = UINT_MAX;
2100                                 goto done;
2101                         }
2102                 } else {
2103                         if (!shstrtab_ndx(shstrtab, shstrtab_data[STR_SYMTAB],
2104                             &symtab_name) ||
2105                             !shstrtab_ndx(shstrtab, shstrtab_data[STR_STRTAB],
2106                             &strtab_name)) {
2107                                 count = UINT_MAX;
2108                                 goto done;
2109                         }
2110                 }
2111 
2112                 v[idx].sh_name = symtab_name;
2113                 v[idx].sh_type = symtab->sh_type;
2114                 v[idx].sh_addr = symtab->sh_addr;
2115                 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
2116                         v[idx].sh_addr += (Addr)(uintptr_t)saddr;
2117                 v[idx].sh_addralign = symtab->sh_addralign;
2118                 *doffp = roundup(*doffp, v[idx].sh_addralign);
2119                 v[idx].sh_offset = *doffp;
2120                 v[idx].sh_size = symtab->sh_size;
2121                 v[idx].sh_link = idx + 1;
2122                 v[idx].sh_entsize = symtab->sh_entsize;
2123                 v[idx].sh_info = symtab->sh_info;
2124 
2125                 elf_copy_scn(ctx, symtab, mvp, &v[idx]);
2126                 idx++;
2127 
2128                 v[idx].sh_name = strtab_name;
2129                 v[idx].sh_type = SHT_STRTAB;
2130                 v[idx].sh_flags = SHF_STRINGS;
2131                 v[idx].sh_addr = strtab->sh_addr;
2132                 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
2133                         v[idx].sh_addr += (Addr)(uintptr_t)saddr;
2134                 v[idx].sh_addralign = strtab->sh_addralign;
2135                 *doffp = roundup(*doffp, v[idx].sh_addralign);
2136                 v[idx].sh_offset = *doffp;
2137                 v[idx].sh_size = strtab->sh_size;
2138 
2139                 elf_copy_scn(ctx, strtab, mvp, &v[idx]);
2140                 idx++;
2141         }
2142 
2143 done:
2144         kmem_free(shstrbase, shstrsize);
2145         kmem_free(shbase, shsize);
2146         return (count);
2147 }
2148 
2149 /*
2150  * Walk mappings in the process address space, examining those which
2151  * correspond to loaded objects.  This is called twice from elfcore:
2152  * once to simply count the relevant sections, and again later to copy
2153  * them once an adequate buffer has been allocated for the shdr details.
2154  */
2155 static int
2156 elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp)
2157 {
2158         vnode_t *lastvp = NULL;
2159         struct seg *seg;
2160         uint_t idx = 0, remain;
2161         shstrtab_t shstrtab;
2162         struct as *as = ctx->ecc_p->p_as;
2163         int error = 0;
2164 
2165         ASSERT(AS_WRITE_HELD(as));
2166 
2167         if (v != NULL) {
2168                 ASSERT(nv != 0);
2169 
2170                 if (!shstrtab_init(&shstrtab))
2171                         return (ENOMEM);
2172                 remain = nv;
2173         } else {
2174                 ASSERT(nv == 0);
2175 
2176                 /*
2177                  * The shdrs are being counted rather than output into a
2178                  * buffer.  Leave room for two entries: the SHT_NULL at
2179                  * index 0 and the shstrtab at the end.
2180                  */
2181                 remain = UINT_MAX - 2;
2182         }
2183 
2184         /* Per the ELF spec, shdr index 0 is reserved. */
2185         idx = 1;
2186         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2187                 vnode_t *mvp;
2188                 void *tmp = NULL;
2189                 caddr_t saddr = seg->s_base, naddr, eaddr;
2190                 size_t segsize;
2191                 uint_t count, prot;
2192 
2193                 /*
2194                  * Since we're just looking for text segments of load
2195                  * objects, we only care about the protection bits; we don't
2196                  * care about the actual size of the segment so we use the
2197                  * reserved size. If the segment's size is zero, there's
2198                  * something fishy going on so we ignore this segment.
2199                  */
2200                 if (seg->s_ops != &segvn_ops ||
2201                     SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2202                     mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
2203                     (segsize = pr_getsegsize(seg, 1)) == 0)
2204                         continue;
2205 
2206                 eaddr = saddr + segsize;
2207                 prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
2208                 pr_getprot_done(&tmp);
2209 
2210                 /*
2211                  * Skip this segment unless the protection bits look like
2212                  * what we'd expect for a text segment.
2213                  */
2214                 if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
2215                         continue;
2216 
2217                 count = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain,
2218                     &shstrtab);
2219                 if (count == UINT_MAX) {
2220                         error = ENOMEM;
2221                         goto done;
2222                 }
2223 
2224                 ASSERT(count <= remain);
2225                 ASSERT(v == NULL || (idx + count) < nv);
2226 
2227                 remain -= count;
2228                 idx += count;
2229                 lastvp = mvp;
2230         }
2231 
2232         if (v == NULL) {
2233                 if (idx == 1) {
2234                         *nshdrsp = 0;
2235                 } else {
2236                         /* Include room for the shstrtab at the end */
2237                         *nshdrsp = idx + 1;
2238                 }
2239                 /* No shstrtab was initialized, so we can just return. */
2240                 return (0);
2241         }
2242 
2243         if (idx != nv - 1) {
2244                 cmn_err(CE_WARN, "elfcore: core dump failed for "
2245                     "process %d; address space is changing",
2246                     ctx->ecc_p->p_pid);
2247                 error = EIO;
2248                 goto done;
2249         }
2250 
2251         if (!shstrtab_ndx(&shstrtab, shstrtab_data[STR_SHSTRTAB],
2252             &v[idx].sh_name)) {
2253                 error = ENOMEM;
2254                 goto done;
2255         }
2256         v[idx].sh_size = shstrtab_size(&shstrtab);
2257         v[idx].sh_addralign = 1;
2258         v[idx].sh_offset = ctx->ecc_doffset;
2259         v[idx].sh_flags = SHF_STRINGS;
2260         v[idx].sh_type = SHT_STRTAB;
2261 
2262         elf_ctx_resize_scratch(ctx, v[idx].sh_size);
2263         VERIFY3U(ctx->ecc_bufsz, >=, v[idx].sh_size);
2264         shstrtab_dump(&shstrtab, ctx->ecc_buf);
2265 
2266         error = core_write(ctx->ecc_vp, UIO_SYSSPACE, ctx->ecc_doffset,
2267             ctx->ecc_buf, v[idx].sh_size, ctx->ecc_rlimit, ctx->ecc_credp);
2268         if (error == 0) {
2269                 ctx->ecc_doffset += v[idx].sh_size;
2270         }
2271 
2272 done:
2273         if (v != NULL)
2274                 shstrtab_fini(&shstrtab);
2275         return (error);
2276 }
2277 
2278 int
2279 elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
2280     core_content_t content)
2281 {
2282         u_offset_t poffset, soffset, doffset;
2283         int error;
2284         uint_t i, nphdrs, nshdrs;
2285         struct seg *seg;
2286         struct as *as = p->p_as;
2287         void *bigwad, *zeropg = NULL;
2288         size_t bigsize, phdrsz, shdrsz;
2289         Ehdr *ehdr;
2290         Phdr *phdr;
2291         Shdr shdr0;
2292         caddr_t brkbase, stkbase;
2293         size_t brksize, stksize;
2294         boolean_t overflowed = B_FALSE, retried = B_FALSE;
2295         klwp_t *lwp = ttolwp(curthread);
2296         elf_core_ctx_t ctx = {
2297                 .ecc_vp = vp,
2298                 .ecc_p = p,
2299                 .ecc_credp = credp,
2300                 .ecc_rlimit = rlimit,
2301                 .ecc_content = content,
2302                 .ecc_doffset = 0,
2303                 .ecc_buf = NULL,
2304                 .ecc_bufsz = 0
2305         };
2306 
2307 top:
2308         /*
2309          * Make sure we have everything we need (registers, etc.).
2310          * All other lwps have already stopped and are in an orderly state.
2311          */
2312         ASSERT(p == ttoproc(curthread));
2313         prstop(0, 0);
2314 
2315         AS_LOCK_ENTER(as, RW_WRITER);
2316         nphdrs = prnsegs(as, 0) + 2;            /* two CORE note sections */
2317 
2318         /*
2319          * Count the number of section headers we're going to need.
2320          */
2321         nshdrs = 0;
2322         if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB | CC_CONTENT_DEBUG))
2323                 VERIFY0(elf_process_scns(&ctx, NULL, 0, &nshdrs));
2324         AS_LOCK_EXIT(as);
2325 
2326         /*
2327          * The core file contents may require zero section headers, but if
2328          * we overflow the 16 bits allotted to the program header count in
2329          * the ELF header, we'll need the section header at index zero.
2330          */
2331         if (nshdrs == 0 && nphdrs >= PN_XNUM) {
2332                 nshdrs = 1;
2333         }
2334 
2335         /*
2336          * Allocate a buffer which is sized adequately to hold the ehdr,
2337          * phdrs, or shdrs needed to produce the core file.  It is used for
2338          * these tasks sequentially, not simultaneously, so it does not need
2339          * space for all of the data at once, only for the largest of the
2340          * three.
2341          */
2342         VERIFY(nphdrs >= 2);
2343         phdrsz = nphdrs * sizeof (Phdr);
2344         shdrsz = nshdrs * sizeof (Shdr);
2345         bigsize = MAX(sizeof (Ehdr), MAX(phdrsz, shdrsz));
2346         bigwad = kmem_alloc(bigsize, KM_SLEEP);
2347 
2348         ehdr = (Ehdr *)bigwad;
2349         bzero(ehdr, sizeof (*ehdr));
2350 
2351         ehdr->e_ident[EI_MAG0] = ELFMAG0;
2352         ehdr->e_ident[EI_MAG1] = ELFMAG1;
2353         ehdr->e_ident[EI_MAG2] = ELFMAG2;
2354         ehdr->e_ident[EI_MAG3] = ELFMAG3;
2355         ehdr->e_ident[EI_CLASS] = ELFCLASS;
2356         ehdr->e_type = ET_CORE;
2357 
2358 #if !defined(_LP64) || defined(_ELF32_COMPAT)
2359 
2360 #if defined(__sparc)
2361         ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2362         ehdr->e_machine = EM_SPARC;
2363 #elif defined(__i386_COMPAT)
2364         ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2365         ehdr->e_machine = EM_386;
2366 #else
2367 #error "no recognized machine type is defined"
2368 #endif
2369 
2370 #else   /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2371 
2372 #if defined(__sparc)
2373         ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2374         ehdr->e_machine = EM_SPARCV9;
2375 #elif defined(__amd64)
2376         ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2377         ehdr->e_machine = EM_AMD64;
2378 #else
2379 #error "no recognized 64-bit machine type is defined"
2380 #endif
2381 
2382 #endif  /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2383 
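         /*
          * Lay out the core file: the ELF header comes first, followed
          * by the program headers, then any section headers, and finally
          * the note and segment data.
          */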
2384         poffset = sizeof (Ehdr);
2385         soffset = sizeof (Ehdr) + phdrsz;
2386         doffset = sizeof (Ehdr) + phdrsz + shdrsz;
2387         bzero(&shdr0, sizeof (shdr0));
2388 
2389         /*
2390          * If the count of program headers or section headers or the index
2391          * of the section string table can't fit in the mere 16 bits
2392          * shortsightedly allotted to them in the ELF header, we use the
2393          * extended formats and put the real values in the section header
2394          * as index 0.
2395          */
2396         if (nphdrs >= PN_XNUM) {
2397                 ehdr->e_phnum = PN_XNUM;
2398                 shdr0.sh_info = nphdrs;
2399         } else {
2400                 ehdr->e_phnum = (unsigned short)nphdrs;
2401         }
2402 
2403         if (nshdrs > 0) {
2404                 if (nshdrs >= SHN_LORESERVE) {
2405                         ehdr->e_shnum = 0;
2406                         shdr0.sh_size = nshdrs;
2407                 } else {
2408                         ehdr->e_shnum = (unsigned short)nshdrs;
2409                 }
2410 
2411                 if (nshdrs - 1 >= SHN_LORESERVE) {
2412                         ehdr->e_shstrndx = SHN_XINDEX;
2413                         shdr0.sh_link = nshdrs - 1;
2414                 } else {
2415                         ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
2416                 }
2417 
2418                 ehdr->e_shoff = soffset;
2419                 ehdr->e_shentsize = sizeof (Shdr);
2420         }
2421 
2422         ehdr->e_ident[EI_VERSION] = EV_CURRENT;
2423         ehdr->e_version = EV_CURRENT;
2424         ehdr->e_ehsize = sizeof (Ehdr);
2425         ehdr->e_phoff = poffset;
2426         ehdr->e_phentsize = sizeof (Phdr);
2427 
2428         if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
2429             sizeof (Ehdr), rlimit, credp)) {
2430                 goto done;
2431         }
2432 
2433         phdr = (Phdr *)bigwad;
2434         bzero(phdr, phdrsz);
2435 
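         /*
          * The first two program headers describe the old- and new-style
          * note segments counted above; the process mappings follow,
          * starting at index 2.
          */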
2436         setup_old_note_header(&phdr[0], p);
2437         phdr[0].p_offset = doffset = roundup(doffset, sizeof (Word));
2438         doffset += phdr[0].p_filesz;
2439 
2440         setup_note_header(&phdr[1], p);
2441         phdr[1].p_offset = doffset = roundup(doffset, sizeof (Word));
2442         doffset += phdr[1].p_filesz;
2443 
2444         mutex_enter(&p->p_lock);
2445 
2446         brkbase = p->p_brkbase;
2447         brksize = p->p_brksize;
2448 
2449         stkbase = p->p_usrstack - p->p_stksize;
2450         stksize = p->p_stksize;
2451 
2452         mutex_exit(&p->p_lock);
2453 
2454         AS_LOCK_ENTER(as, RW_WRITER);
2455         i = 2;
2456         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2457                 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2458                 caddr_t saddr, naddr;
2459                 void *tmp = NULL;
2460                 extern struct seg_ops segspt_shmops;
2461 
2462                 if ((seg->s_flags & S_HOLE) != 0) {
2463                         continue;
2464                 }
2465 
2466                 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2467                         uint_t prot;
2468                         size_t size;
2469                         int type;
2470                         vnode_t *mvp;
2471 
2472                         prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2473                         prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
2474                         if ((size = (size_t)(naddr - saddr)) == 0) {
2475                                 ASSERT(tmp == NULL);
2476                                 continue;
2477                         } else if (i == nphdrs) {
2478                                 pr_getprot_done(&tmp);
2479                                 overflowed = B_TRUE;
2480                                 break;
2481                         }
2482                         phdr[i].p_type = PT_LOAD;
2483                         phdr[i].p_vaddr = (Addr)(uintptr_t)saddr;
2484                         phdr[i].p_memsz = size;
2485                         if (prot & PROT_READ)
2486                                 phdr[i].p_flags |= PF_R;
2487                         if (prot & PROT_WRITE)
2488                                 phdr[i].p_flags |= PF_W;
2489                         if (prot & PROT_EXEC)
2490                                 phdr[i].p_flags |= PF_X;
2491 
2492                         /*
2493                          * Figure out which mappings to include in the core.
2494                          */
2495                         type = SEGOP_GETTYPE(seg, saddr);
2496 
2497                         if (saddr == stkbase && size == stksize) {
2498                                 if (!(content & CC_CONTENT_STACK))
2499                                         goto exclude;
2500 
2501                         } else if (saddr == brkbase && size == brksize) {
2502                                 if (!(content & CC_CONTENT_HEAP))
2503                                         goto exclude;
2504 
2505                         } else if (seg->s_ops == &segspt_shmops) {
2506                                 if (type & MAP_NORESERVE) {
2507                                         if (!(content & CC_CONTENT_DISM))
2508                                                 goto exclude;
2509                                 } else {
2510                                         if (!(content & CC_CONTENT_ISM))
2511                                                 goto exclude;
2512                                 }
2513 
2514                         } else if (seg->s_ops != &segvn_ops) {
2515                                 goto exclude;
2516 
2517                         } else if (type & MAP_SHARED) {
2518                                 if (shmgetid(p, saddr) != SHMID_NONE) {
2519                                         if (!(content & CC_CONTENT_SHM))
2520                                                 goto exclude;
2521 
2522                                 } else if (SEGOP_GETVP(seg, seg->s_base,
2523                                     &mvp) != 0 || mvp == NULL ||
2524                                     mvp->v_type != VREG) {
2525                                         if (!(content & CC_CONTENT_SHANON))
2526                                                 goto exclude;
2527 
2528                                 } else {
2529                                         if (!(content & CC_CONTENT_SHFILE))
2530                                                 goto exclude;
2531                                 }
2532 
2533                         } else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2534                             mvp == NULL || mvp->v_type != VREG) {
2535                                 if (!(content & CC_CONTENT_ANON))
2536                                         goto exclude;
2537 
2538                         } else if (prot == (PROT_READ | PROT_EXEC)) {
2539                                 if (!(content & CC_CONTENT_TEXT))
2540                                         goto exclude;
2541 
2542                         } else if (prot == PROT_READ) {
2543                                 if (!(content & CC_CONTENT_RODATA))
2544                                         goto exclude;
2545 
2546                         } else {
2547                                 if (!(content & CC_CONTENT_DATA))
2548                                         goto exclude;
2549                         }
2550 
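                             /*
                              * This mapping's contents are included in the
                              * dump, so reserve file space for them.  Excluded
                              * mappings keep p_filesz at zero so that only
                              * their existence is recorded.
                              */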
2551                         doffset = roundup(doffset, sizeof (Word));
2552                         phdr[i].p_offset = doffset;
2553                         phdr[i].p_filesz = size;
2554                         doffset += size;
2555 exclude:
2556                         i++;
2557                 }
2558                 VERIFY(tmp == NULL);
2559                 if (overflowed)
2560                         break;
2561         }
2562         AS_LOCK_EXIT(as);
2563 
2564         if (overflowed || i != nphdrs) {
2565                 if (!retried) {
2566                         retried = B_TRUE;
2567                         overflowed = B_FALSE;
2568                         kmem_free(bigwad, bigsize);
2569                         goto top;
2570                 }
2571                 cmn_err(CE_WARN, "elfcore: core dump failed for "
2572                     "process %d; address space is changing", p->p_pid);
2573                 error = EIO;
2574                 goto done;
2575         }
2576 
2577         if ((error = core_write(vp, UIO_SYSSPACE, poffset,
2578             phdr, phdrsz, rlimit, credp)) != 0) {
2579                 goto done;
2580         }
2581 
2582         if ((error = write_old_elfnotes(p, sig, vp, phdr[0].p_offset, rlimit,
2583             credp)) != 0) {
2584                 goto done;
2585         }
2586         if ((error = write_elfnotes(p, sig, vp, phdr[1].p_offset, rlimit,
2587             credp, content)) != 0) {
2588                 goto done;
2589         }
2590 
2591         for (i = 2; i < nphdrs; i++) {
2592                 prkillinfo_t killinfo;
2593                 sigqueue_t *sq;
2594                 int sig, j;
2595 
2596                 if (phdr[i].p_filesz == 0)
2597                         continue;
2598 
2599                 /*
2600                  * If we hit a region that was mapped PROT_NONE then we cannot
2601                  * dump it normally, as the kernel would be unable to read the
2602                  * underlying pages and the write of this segment would fail.
2603                  * Instead we dump any region mapped PROT_NONE as zero-filled
2604                  * pages so that the mapping is still represented in the core
2605                  * file.
2606                  *
2607                  * If dumping out this segment fails, rather than failing
2608                  * the core dump entirely, we reset the size of the mapping
2609                  * to zero to indicate that the data is absent from the core
2610                  * file and OR in the PF_SUNW_FAILURE flag to differentiate
2611                  * this from mappings that were excluded due to the core file
2612                  * content settings.
2613                  */
2614                 if ((phdr[i].p_flags & (PF_R | PF_W | PF_X)) == 0) {
2615                         size_t towrite = phdr[i].p_filesz;
2616                         size_t curoff = 0;
2617 
2618                         if (zeropg == NULL) {
2619                                 zeropg = kmem_zalloc(elf_zeropg_sz, KM_SLEEP);
2620                         }
2621 
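                             /*
                              * Represent the PROT_NONE region with zeros,
                              * writing at most elf_zeropg_sz bytes per
                              * core_write() call.
                              */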
2622                         error = 0;
2623                         while (towrite != 0) {
2624                                 size_t len = MIN(towrite, elf_zeropg_sz);
2625 
2626                                 error = core_write(vp, UIO_SYSSPACE,
2627                                     phdr[i].p_offset + curoff, zeropg, len,
2628                                     rlimit, credp);
2629                                 if (error != 0)
2630                                         break;
2631 
2632                                 towrite -= len;
2633                                 curoff += len;
2634                         }
2635                 } else {
2636                         error = core_seg(p, vp, phdr[i].p_offset,
2637                             (caddr_t)(uintptr_t)phdr[i].p_vaddr,
2638                             phdr[i].p_filesz, rlimit, credp);
2639                 }
2640                 if (error == 0)
2641                         continue;
2642 
2643                 if ((sig = lwp->lwp_cursig) == 0) {
2644                         /*
2645                          * We failed due to something other than a signal.
2646                          * Since the space reserved for the segment is now
2647                          * unused, we stash the errno in the first four
2648                          * bytes. This undocumented interface will let us
2649                          * understand the nature of the failure.
2650                          */
2651                         (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2652                             &error, sizeof (error), rlimit, credp);
2653 
2654                         phdr[i].p_filesz = 0;
2655                         phdr[i].p_flags |= PF_SUNW_FAILURE;
2656                         if ((error = core_write(vp, UIO_SYSSPACE,
2657                             poffset + sizeof (Phdr) * i, &phdr[i],
2658                             sizeof (Phdr), rlimit, credp)) != 0)
2659                                 goto done;
2660 
2661                         continue;
2662                 }
2663 
2664                 /*
2665                  * We took a signal.  We want to abort the dump entirely, but
2666                  * we also want to indicate what failed and why.  We therefore
2667                  * use the space reserved for the first failing segment to
2668                  * write our error (which, for purposes of compatibility with
2669                  * older core dump readers, we set to EINTR) followed by any
2670                  * siginfo associated with the signal.
2671                  */
2672                 bzero(&killinfo, sizeof (killinfo));
2673                 killinfo.prk_error = EINTR;
2674 
2675                 sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;
2676 
2677                 if (sq != NULL) {
2678                         bcopy(&sq->sq_info, &killinfo.prk_info,
2679                             sizeof (sq->sq_info));
2680                 } else {
2681                         killinfo.prk_info.si_signo = lwp->lwp_cursig;
2682                         killinfo.prk_info.si_code = SI_NOINFO;
2683                 }
2684 
2685 #if (defined(_SYSCALL32_IMPL) || defined(_LP64))
2686                 /*
2687                  * If this is a 32-bit process, we need to translate from the
2688                  * native siginfo to the 32-bit variant.  (Core readers must
2689                  * always have the same data model as their target or must
2690                  * be aware of -- and compensate for -- data model differences.)
2691                  */
2692                 if (curproc->p_model == DATAMODEL_ILP32) {
2693                         siginfo32_t si32;
2694 
2695                         siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
2696                         bcopy(&si32, &killinfo.prk_info, sizeof (si32));
2697                 }
2698 #endif
2699 
2700                 (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2701                     &killinfo, sizeof (killinfo), rlimit, credp);
2702 
2703                 /*
2704                  * For the segment on which we took the signal, indicate that
2705                  * its data now refers to a siginfo.
2706                  */
2707                 phdr[i].p_filesz = 0;
2708                 phdr[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
2709                     PF_SUNW_SIGINFO;
2710 
2711                 /*
2712                  * And for every other segment, indicate that its absence
2713                  * is due to a signal.
2714                  */
2715                 for (j = i + 1; j < nphdrs; j++) {
2716                         phdr[j].p_filesz = 0;
2717                         phdr[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
2718                 }
2719 
2720                 /*
2721                  * Finally, write out our modified program headers.
2722                  */
2723                 if ((error = core_write(vp, UIO_SYSSPACE,
2724                     poffset + sizeof (Phdr) * i, &phdr[i],
2725                     sizeof (Phdr) * (nphdrs - i), rlimit, credp)) != 0) {
2726                         goto done;
2727                 }
2728 
2729                 break;
2730         }
2731 
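             /*
              * Generate and write the section header table, if one was
              * sized earlier; the first entry carries any extended-format
              * data copied from shdr0 below.
              */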
2732         if (nshdrs > 0) {
2733                 Shdr *shdr = (Shdr *)bigwad;
2734 
2735                 bzero(shdr, shdrsz);
2736                 if (nshdrs > 1) {
2737                         ctx.ecc_doffset = doffset;
2738                         AS_LOCK_ENTER(as, RW_WRITER);
2739                         error = elf_process_scns(&ctx, shdr, nshdrs, NULL);
2740                         AS_LOCK_EXIT(as);
2741                         if (error != 0) {
2742                                 goto done;
2743                         }
2744                 }
2745                 /* Copy any extended format data destined for the first shdr */
2746                 bcopy(&shdr0, shdr, sizeof (shdr0));
2747 
2748                 error = core_write(vp, UIO_SYSSPACE, soffset, shdr, shdrsz,
2749                     rlimit, credp);
2750         }
2751 
2752 done:
2753         if (zeropg != NULL)
2754                 kmem_free(zeropg, elf_zeropg_sz);
2755         if (ctx.ecc_bufsz != 0) {
2756                 kmem_free(ctx.ecc_buf, ctx.ecc_bufsz);
2757         }
2758         kmem_free(bigwad, bigsize);
2759         return (error);
2760 }
2761 
2762 #ifndef _ELF32_COMPAT
2763 
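     /*
      * Exec switch entries for ELF objects: elfexec() handles exec(2) of
      * ELF images and elfcore() writes ELF core files.  On 64-bit kernels
      * the 32-bit variants are registered alongside the native entries.
      */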
2764 static struct execsw esw = {
2765 #ifdef  _LP64
2766         elf64magicstr,
2767 #else   /* _LP64 */
2768         elf32magicstr,
2769 #endif  /* _LP64 */
2770         0,
2771         5,
2772         elfexec,
2773         elfcore
2774 };
2775 
2776 static struct modlexec modlexec = {
2777         &mod_execops, "exec module for elf", &esw
2778 };
2779 
2780 #ifdef  _LP64
2781 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
2782                         intpdata_t *idatap, int level, size_t *execsz,
2783                         int setid, caddr_t exec_file, cred_t *cred,
2784                         int *brand_action);
2785 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
2786                         rlim64_t rlimit, int sig, core_content_t content);
2787 
2788 static struct execsw esw32 = {
2789         elf32magicstr,
2790         0,
2791         5,
2792         elf32exec,
2793         elf32core
2794 };
2795 
2796 static struct modlexec modlexec32 = {
2797         &mod_execops, "32-bit exec module for elf", &esw32
2798 };
2799 #endif  /* _LP64 */
2800 
2801 static struct modlinkage modlinkage = {
2802         MODREV_1,
2803         (void *)&modlexec,
2804 #ifdef  _LP64
2805         (void *)&modlexec32,
2806 #endif  /* _LP64 */
2807         NULL
2808 };
2809 
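     /*
      * Loadable module entry points: install, remove, and report on the
      * exec module(s) declared above.
      */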
2810 int
2811 _init(void)
2812 {
2813         return (mod_install(&modlinkage));
2814 }
2815 
2816 int
2817 _fini(void)
2818 {
2819         return (mod_remove(&modlinkage));
2820 }
2821 
2822 int
2823 _info(struct modinfo *modinfop)
2824 {
2825         return (mod_info(&modlinkage, modinfop));
2826 }
2827 
2828 #endif  /* !_ELF32_COMPAT */