1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*         All Rights Reserved  */
  28 /*
  29  * Copyright 2019 Joyent, Inc.
  30  * Copyright 2021 Oxide Computer Company
  31  */
  32 
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/thread.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/signal.h>
  38 #include <sys/cred.h>
  39 #include <sys/user.h>
  40 #include <sys/errno.h>
  41 #include <sys/vnode.h>
  42 #include <sys/mman.h>
  43 #include <sys/kmem.h>
  44 #include <sys/proc.h>
  45 #include <sys/pathname.h>
  46 #include <sys/policy.h>
  47 #include <sys/cmn_err.h>
  48 #include <sys/systm.h>
  49 #include <sys/elf.h>
  50 #include <sys/vmsystm.h>
  51 #include <sys/debug.h>
  52 #include <sys/auxv.h>
  53 #include <sys/exec.h>
  54 #include <sys/prsystm.h>
  55 #include <vm/as.h>
  56 #include <vm/rm.h>
  57 #include <vm/seg.h>
  58 #include <vm/seg_vn.h>
  59 #include <sys/modctl.h>
  60 #include <sys/systeminfo.h>
  61 #include <sys/vmparam.h>
  62 #include <sys/machelf.h>
  63 #include <sys/shm_impl.h>
  64 #include <sys/archsystm.h>
  65 #include <sys/fasttrap.h>
  66 #include <sys/brand.h>
  67 #include "elf_impl.h"
  68 #include <sys/sdt.h>
  69 #include <sys/siginfo.h>
  70 #include <sys/random.h>
  71 
  72 #if defined(__x86)
  73 #include <sys/comm_page_util.h>
  74 #include <sys/fp.h>
  75 #endif /* defined(__x86) */
  76 
  77 
extern int at_flags;
extern volatile size_t aslr_max_brk_skew;

/* "$ORIGIN" token (and its length) used in runtime-linker path handling */
#define ORIGIN_STR      "ORIGIN"
#define ORIGIN_STR_SIZE 6

static int getelfhead(vnode_t *, cred_t *, Ehdr *, uint_t *, uint_t *,
    uint_t *);
static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, uint_t, caddr_t *,
    size_t *);
static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, uint_t, uint_t,
    caddr_t *, size_t *, caddr_t *, size_t *);
static size_t elfsize(const Ehdr *, uint_t, const caddr_t, uintptr_t *);
static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **,
    Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *,
    size_t, size_t *, size_t *);

#ifdef _ELF32_COMPAT
/* Link against the non-compat instances when compiling the 32-bit version. */
extern size_t elf_datasz_max;
extern size_t elf_zeropg_sz;
extern void elf_ctx_resize_scratch(elf_core_ctx_t *, size_t);
extern uint_t elf_nphdr_max;
extern uint_t elf_nshdr_max;
extern size_t elf_shstrtab_max;
#else
/*
 * Tunables bounding the resources this module will consume.  NOTE(review):
 * based on their names and uses in this file these appear to cap scratch
 * buffer size, zero-page size, program/section header counts, and .shstrtab
 * size respectively — confirm against the consumers elsewhere in the file.
 */
size_t elf_datasz_max = 1 * 1024 * 1024;
size_t elf_zeropg_sz = 4 * 1024;
uint_t elf_nphdr_max = 1000;
uint_t elf_nshdr_max = 10000;
size_t elf_shstrtab_max = 100 * 1024;
#endif
 110 
 111 
 112 
/*
 * Well-known section names tracked for a section header string table.
 * These values index shstrtab_data[] below; STR_NUM is the sentinel count.
 */
typedef enum {
        STR_CTF,
        STR_SYMTAB,
        STR_DYNSYM,
        STR_STRTAB,
        STR_DYNSTR,
        STR_SHSTRTAB,
        STR_NUM
} shstrtype_t;

/* Section name strings, in the same order as the shstrtype_t enumerators. */
static const char *shstrtab_data[] = {
        ".SUNW_ctf",
        ".symtab",
        ".dynsym",
        ".strtab",
        ".dynstr",
        ".shstrtab"
};

/*
 * A small string-table builder: sst_ndx[t] holds the byte offset assigned
 * to name t (0 means "not yet added"), and sst_cur is the running total
 * size of the table, including the reserved NUL byte at offset 0.
 */
typedef struct shstrtab {
        uint_t  sst_ndx[STR_NUM];
        uint_t  sst_cur;
} shstrtab_t;
 136 
 137 static void
 138 shstrtab_init(shstrtab_t *s)
 139 {
 140         bzero(&s->sst_ndx, sizeof (s->sst_ndx));
 141         s->sst_cur = 1;
 142 }
 143 
 144 static uint_t
 145 shstrtab_ndx(shstrtab_t *s, shstrtype_t type)
 146 {
 147         uint_t ret;
 148 
 149         if ((ret = s->sst_ndx[type]) != 0)
 150                 return (ret);
 151 
 152         ret = s->sst_ndx[type] = s->sst_cur;
 153         s->sst_cur += strlen(shstrtab_data[type]) + 1;
 154 
 155         return (ret);
 156 }
 157 
 158 static size_t
 159 shstrtab_size(const shstrtab_t *s)
 160 {
 161         return (s->sst_cur);
 162 }
 163 
 164 static void
 165 shstrtab_dump(const shstrtab_t *s, char *buf)
 166 {
 167         uint_t i, ndx;
 168 
 169         *buf = '\0';
 170         for (i = 0; i < STR_NUM; i++) {
 171                 if ((ndx = s->sst_ndx[i]) != 0)
 172                         (void) strcpy(buf + ndx, shstrtab_data[i]);
 173         }
 174 }
 175 
 176 static int
 177 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
 178 {
 179         ASSERT(phdrp->p_type == PT_SUNWDTRACE);
 180 
 181         /*
 182          * See the comment in fasttrap.h for information on how to safely
 183          * update this program header.
 184          */
 185         if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
 186             (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
 187                 return (-1);
 188 
 189         args->thrptr = phdrp->p_vaddr + base;
 190 
 191         return (0);
 192 }
 193 
 194 static int
 195 handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
 196 {
 197         uint_t flag;
 198 
 199         switch (dt) {
 200         case DT_SUNW_ASLR:
 201                 flag = PROC_SEC_ASLR;
 202                 break;
 203         default:
 204                 return (EINVAL);
 205         }
 206 
 207         if (val == 0) {
 208                 if (secflag_isset(p->p_secflags.psf_lower, flag))
 209                         return (EPERM);
 210                 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
 211                     secflag_isset(p->p_secflags.psf_inherit, flag))
 212                         return (EPERM);
 213 
 214                 secflag_clear(&p->p_secflags.psf_effective, flag);
 215         } else {
 216                 if (!secflag_isset(p->p_secflags.psf_upper, flag))
 217                         return (EPERM);
 218 
 219                 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
 220                     !secflag_isset(p->p_secflags.psf_inherit, flag))
 221                         return (EPERM);
 222 
 223                 secflag_set(&p->p_secflags.psf_effective, flag);
 224         }
 225 
 226         return (0);
 227 }
 228 
 229 
 230 #ifndef _ELF32_COMPAT
 231 void
 232 elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz)
 233 {
 234         size_t target = MIN(sz, elf_datasz_max);
 235 
 236         if (target > ctx->ecc_bufsz) {
 237                 if (ctx->ecc_buf != NULL) {
 238                         kmem_free(ctx->ecc_buf, ctx->ecc_bufsz);
 239                 }
 240                 ctx->ecc_buf = kmem_alloc(target, KM_SLEEP);
 241                 ctx->ecc_bufsz = target;
 242         }
 243 }
 244 #endif /* _ELF32_COMPAT */
 245 
 246 /*
 247  * Map in the executable pointed to by vp. Returns 0 on success.  Note that
 248  * this function currently has the maximum number of arguments allowed by
 249  * modstubs on x86 (MAXNARG)!  Do _not_ add to this function signature without
 250  * adding to MAXNARG.  (Better yet, do not add to this monster of a function
 251  * signature!)
 252  */
 253 int
 254 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 255     intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
 256     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
 257 {
 258         size_t          len, phdrsize;
 259         struct vattr    vat;
 260         caddr_t         phdrbase = NULL;
 261         uint_t          nshdrs, shstrndx, nphdrs;
 262         int             error = 0;
 263         Phdr            *uphdr = NULL;
 264         Phdr            *junk = NULL;
 265         Phdr            *dynphdr = NULL;
 266         Phdr            *dtrphdr = NULL;
 267         char            *interp = NULL;
 268         uintptr_t       lddata, minaddr;
 269         size_t          execsz;
 270 
 271         if (lddatap != NULL)
 272                 *lddatap = 0;
 273 
 274         if (minaddrp != NULL)
 275                 *minaddrp = (uintptr_t)NULL;
 276 
 277         if (error = execpermissions(vp, &vat, args)) {
 278                 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
 279                 return (error);
 280         }
 281 
 282         if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
 283             &nphdrs)) != 0 ||
 284             (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
 285             &phdrsize)) != 0) {
 286                 uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
 287                 return (error);
 288         }
 289 
 290         if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
 291                 uprintf("%s: Nothing to load in %s", exec_file, args->pathname);
 292                 kmem_free(phdrbase, phdrsize);
 293                 return (ENOEXEC);
 294         }
 295         if (lddatap != NULL)
 296                 *lddatap = lddata;
 297 
 298         if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
 299             &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
 300             len, &execsz, brksize)) {
 301                 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
 302                 if (uphdr != NULL && uphdr->p_flags == 0)
 303                         kmem_free(uphdr, sizeof (Phdr));
 304                 kmem_free(phdrbase, phdrsize);
 305                 return (error);
 306         }
 307 
 308         if (minaddrp != NULL)
 309                 *minaddrp = minaddr;
 310 
 311         /*
 312          * If the executable requires an interpreter, determine its name.
 313          */
 314         if (dynphdr != NULL) {
 315                 ssize_t resid;
 316 
 317                 if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
 318                         uprintf("%s: Invalid interpreter\n", exec_file);
 319                         kmem_free(phdrbase, phdrsize);
 320                         return (ENOEXEC);
 321                 }
 322 
 323                 interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 324 
 325                 if ((error = vn_rdwr(UIO_READ, vp, interp,
 326                     (ssize_t)dynphdr->p_filesz,
 327                     (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
 328                     (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
 329                     interp[dynphdr->p_filesz - 1] != '\0') {
 330                         uprintf("%s: Cannot obtain interpreter pathname\n",
 331                             exec_file);
 332                         kmem_free(interp, MAXPATHLEN);
 333                         kmem_free(phdrbase, phdrsize);
 334                         return (error != 0 ? error : ENOEXEC);
 335                 }
 336         }
 337 
 338         /*
 339          * If this is a statically linked executable, voffset should indicate
 340          * the address of the executable itself (it normally holds the address
 341          * of the interpreter).
 342          */
 343         if (ehdr->e_type == ET_EXEC && interp == NULL)
 344                 *voffset = minaddr;
 345 
 346         /*
 347          * If the caller has asked for the interpreter name, return it (it's
 348          * up to the caller to free it); if the caller hasn't asked for it,
 349          * free it ourselves.
 350          */
 351         if (interpp != NULL) {
 352                 *interpp = interp;
 353         } else if (interp != NULL) {
 354                 kmem_free(interp, MAXPATHLEN);
 355         }
 356 
 357         if (uphdr != NULL) {
 358                 *uphdr_vaddr = uphdr->p_vaddr;
 359 
 360                 if (uphdr->p_flags == 0)
 361                         kmem_free(uphdr, sizeof (Phdr));
 362         } else if (ehdr->e_type == ET_DYN) {
 363                 /*
 364                  * If we don't have a uphdr, we'll apply the logic found
 365                  * in mapelfexec() and use the p_vaddr of the first PT_LOAD
 366                  * section as the base address of the object.
 367                  */
 368                 const Phdr *phdr = (Phdr *)phdrbase;
 369                 const uint_t hsize = ehdr->e_phentsize;
 370                 uint_t i;
 371 
 372                 for (i = nphdrs; i > 0; i--) {
 373                         if (phdr->p_type == PT_LOAD) {
 374                                 *uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
 375                                     ehdr->e_phoff;
 376                                 break;
 377                         }
 378 
 379                         phdr = (Phdr *)((caddr_t)phdr + hsize);
 380                 }
 381 
 382                 /*
 383                  * If we don't have a PT_LOAD segment, we should have returned
 384                  * ENOEXEC when elfsize() returned 0, above.
 385                  */
 386                 VERIFY(i > 0);
 387         } else {
 388                 *uphdr_vaddr = (Addr)-1;
 389         }
 390 
 391         kmem_free(phdrbase, phdrsize);
 392         return (error);
 393 }
 394 
 395 /*ARGSUSED*/
 396 int
 397 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 398     int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred,
 399     int *brand_action)
 400 {
 401         caddr_t         phdrbase = NULL;
 402         caddr_t         bssbase = 0;
 403         caddr_t         brkbase = 0;
 404         size_t          brksize = 0;
 405         size_t          dlnsize, nsize = 0;
 406         aux_entry_t     *aux;
 407         int             error;
 408         ssize_t         resid;
 409         int             fd = -1;
 410         intptr_t        voffset;
 411         Phdr            *intphdr = NULL;
 412         Phdr            *dynamicphdr = NULL;
 413         Phdr            *stphdr = NULL;
 414         Phdr            *uphdr = NULL;
 415         Phdr            *junk = NULL;
 416         size_t          len;
 417         size_t          postfixsize = 0;
 418         size_t          i;
 419         Phdr            *phdrp;
 420         Phdr            *dataphdrp = NULL;
 421         Phdr            *dtrphdr;
 422         Phdr            *capphdr = NULL;
 423         Cap             *cap = NULL;
 424         size_t          capsize;
 425         int             hasu = 0;
 426         int             hasauxv = 0;
 427         int             hasintp = 0;
 428         int             branded = 0;
 429         int             dynuphdr = 0;
 430 
 431         struct proc *p = ttoproc(curthread);
 432         struct user *up = PTOU(p);
 433         struct bigwad {
 434                 Ehdr    ehdr;
 435                 aux_entry_t     elfargs[__KERN_NAUXV_IMPL];
 436                 char            dl_name[MAXPATHLEN];
 437                 char            pathbuf[MAXPATHLEN];
 438                 struct vattr    vattr;
 439                 struct execenv  exenv;
 440         } *bigwad;      /* kmem_alloc this behemoth so we don't blow stack */
 441         Ehdr            *ehdrp;
 442         uint_t          nshdrs, shstrndx, nphdrs;
 443         size_t          phdrsize;
 444         char            *dlnp;
 445         char            *pathbufp;
 446         rlim64_t        limit;
 447         rlim64_t        roundlimit;
 448 
 449         ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
 450 
 451         bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
 452         ehdrp = &bigwad->ehdr;
 453         dlnp = bigwad->dl_name;
 454         pathbufp = bigwad->pathbuf;
 455 
 456         /*
 457          * Obtain ELF and program header information.
 458          */
 459         if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
 460             &nphdrs)) != 0 ||
 461             (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
 462             &phdrsize)) != 0)
 463                 goto out;
 464 
 465         /*
 466          * Prevent executing an ELF file that has no entry point.
 467          */
 468         if (ehdrp->e_entry == 0) {
 469                 uprintf("%s: Bad entry point\n", exec_file);
 470                 goto bad;
 471         }
 472 
 473         /*
 474          * Put data model that we're exec-ing to into the args passed to
 475          * exec_args(), so it will know what it is copying to on new stack.
 476          * Now that we know whether we are exec-ing a 32-bit or 64-bit
 477          * executable, we can set execsz with the appropriate NCARGS.
 478          */
 479 #ifdef  _LP64
 480         if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
 481                 args->to_model = DATAMODEL_ILP32;
 482                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
 483         } else {
 484                 args->to_model = DATAMODEL_LP64;
 485                 if (!args->stk_prot_override) {
 486                         args->stk_prot &= ~PROT_EXEC;
 487                 }
 488 #if defined(__x86)
 489                 args->dat_prot &= ~PROT_EXEC;
 490 #endif
 491                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
 492         }
 493 #else   /* _LP64 */
 494         args->to_model = DATAMODEL_ILP32;
 495         *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
 496 #endif  /* _LP64 */
 497 
 498         /*
 499          * We delay invoking the brand callback until we've figured out what
 500          * kind of elf binary we're trying to run, 32-bit or 64-bit.  We do this
 501          * because now the brand library can just check args->to_model to see if
 502          * the target is 32-bit or 64-bit without having do duplicate all the
 503          * code above.
 504          *
 505          * We also give the brand a chance to indicate that based on the ELF
 506          * OSABI of the target binary it should become unbranded and optionally
 507          * indicate that it should be treated as existing in a specific prefix.
 508          *
 509          * Note that if a brand opts to go down this route it does not actually
 510          * end up being debranded. In other words, future programs that exec
 511          * will still be considered for branding unless this escape hatch is
 512          * used. Consider the case of lx brand for example. If a user runs
 513          * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable
 514          * of DTrace that's in /native will take this escape hatch and be run
 515          * and interpreted using the normal system call table; however, the
 516          * execution of a non-illumos binary in the form of /bin/ls will still
 517          * be branded and be subject to all of the normal actions of the brand.
 518          *
 519          * The level checks associated with brand handling below are used to
 520          * prevent a loop since the brand elfexec function typically comes back
 521          * through this function. We must check <= here since the nested
 522          * handling in the #! interpreter code will increment the level before
 523          * calling gexec to run the final elfexec interpreter.
 524          */
 525         if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
 526             (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
 527                 if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
 528                     &args->brand_nroot) == B_TRUE) {
 529                         ASSERT(ehdrp->e_ident[EI_OSABI]);
 530                         *brand_action = EBA_NATIVE;
 531                         /* Add one for the trailing '/' in the path */
 532                         if (args->brand_nroot != NULL)
 533                                 nsize = strlen(args->brand_nroot) + 1;
 534                 }
 535         }
 536 
 537         if ((level <= INTP_MAXDEPTH) &&
 538             (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 539                 error = BROP(p)->b_elfexec(vp, uap, args,
 540                     idatap, level + 1, execsz, setid, exec_file, cred,
 541                     brand_action);
 542                 goto out;
 543         }
 544 
 545         /*
 546          * Determine aux size now so that stack can be built
 547          * in one shot (except actual copyout of aux image),
 548          * determine any non-default stack protections,
 549          * and still have this code be machine independent.
 550          */
 551         const uint_t hsize = ehdrp->e_phentsize;
 552         phdrp = (Phdr *)phdrbase;
 553         for (i = nphdrs; i > 0; i--) {
 554                 switch (phdrp->p_type) {
 555                 case PT_INTERP:
 556                         hasauxv = hasintp = 1;
 557                         break;
 558                 case PT_PHDR:
 559                         hasu = 1;
 560                         break;
 561                 case PT_SUNWSTACK:
 562                         args->stk_prot = PROT_USER;
 563                         if (phdrp->p_flags & PF_R)
 564                                 args->stk_prot |= PROT_READ;
 565                         if (phdrp->p_flags & PF_W)
 566                                 args->stk_prot |= PROT_WRITE;
 567                         if (phdrp->p_flags & PF_X)
 568                                 args->stk_prot |= PROT_EXEC;
 569                         break;
 570                 case PT_LOAD:
 571                         dataphdrp = phdrp;
 572                         break;
 573                 case PT_SUNWCAP:
 574                         capphdr = phdrp;
 575                         break;
 576                 case PT_DYNAMIC:
 577                         dynamicphdr = phdrp;
 578                         break;
 579                 }
 580                 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
 581         }
 582 
 583         if (ehdrp->e_type != ET_EXEC) {
 584                 dataphdrp = NULL;
 585                 hasauxv = 1;
 586         }
 587 
 588         /* Copy BSS permissions to args->dat_prot */
 589         if (dataphdrp != NULL) {
 590                 args->dat_prot = PROT_USER;
 591                 if (dataphdrp->p_flags & PF_R)
 592                         args->dat_prot |= PROT_READ;
 593                 if (dataphdrp->p_flags & PF_W)
 594                         args->dat_prot |= PROT_WRITE;
 595                 if (dataphdrp->p_flags & PF_X)
 596                         args->dat_prot |= PROT_EXEC;
 597         }
 598 
 599         /*
 600          * If a auxvector will be required - reserve the space for
 601          * it now.  This may be increased by exec_args if there are
 602          * ISA-specific types (included in __KERN_NAUXV_IMPL).
 603          */
 604         if (hasauxv) {
 605                 /*
 606                  * If a AUX vector is being built - the base AUX
 607                  * entries are:
 608                  *
 609                  *      AT_BASE
 610                  *      AT_FLAGS
 611                  *      AT_PAGESZ
 612                  *      AT_RANDOM       (added in stk_copyout)
 613                  *      AT_SUN_AUXFLAGS
 614                  *      AT_SUN_HWCAP
 615                  *      AT_SUN_HWCAP2
 616                  *      AT_SUN_PLATFORM (added in stk_copyout)
 617                  *      AT_SUN_EXECNAME (added in stk_copyout)
 618                  *      AT_NULL
 619                  *
 620                  * total == 10
 621                  */
 622                 if (hasintp && hasu) {
 623                         /*
 624                          * Has PT_INTERP & PT_PHDR - the auxvectors that
 625                          * will be built are:
 626                          *
 627                          *      AT_PHDR
 628                          *      AT_PHENT
 629                          *      AT_PHNUM
 630                          *      AT_ENTRY
 631                          *      AT_LDDATA
 632                          *
 633                          * total = 5
 634                          */
 635                         args->auxsize = (10 + 5) * sizeof (aux_entry_t);
 636                 } else if (hasintp) {
 637                         /*
 638                          * Has PT_INTERP but no PT_PHDR
 639                          *
 640                          *      AT_EXECFD
 641                          *      AT_LDDATA
 642                          *
 643                          * total = 2
 644                          */
 645                         args->auxsize = (10 + 2) * sizeof (aux_entry_t);
 646                 } else {
 647                         args->auxsize = 10 * sizeof (aux_entry_t);
 648                 }
 649         } else {
 650                 args->auxsize = 0;
 651         }
 652 
 653         /*
 654          * If this binary is using an emulator, we need to add an
 655          * AT_SUN_EMULATOR aux entry.
 656          */
 657         if (args->emulator != NULL)
 658                 args->auxsize += sizeof (aux_entry_t);
 659 
 660         /*
 661          * If this is a native binary that's been given a modified interpreter
 662          * root, inform it that the native system exists at that root.
 663          */
 664         if (args->brand_nroot != NULL) {
 665                 args->auxsize += sizeof (aux_entry_t);
 666         }
 667 
 668 
 669         /*
 670          * On supported kernels (x86_64) make room in the auxv for the
 671          * AT_SUN_COMMPAGE entry.  This will go unpopulated on i86xpv systems
 672          * which do not provide such functionality.
 673          *
 674          * Additionally cover the floating point information AT_SUN_FPSIZE and
 675          * AT_SUN_FPTYPE.
 676          */
 677 #if defined(__amd64)
 678         args->auxsize += 3 * sizeof (aux_entry_t);
 679 #endif /* defined(__amd64) */
 680 
 681         /*
 682          * If we have user credentials, we'll supply the following entries:
 683          *      AT_SUN_UID
 684          *      AT_SUN_RUID
 685          *      AT_SUN_GID
 686          *      AT_SUN_RGID
 687          */
 688         if (cred != NULL) {
 689                 args->auxsize += 4 * sizeof (aux_entry_t);
 690         }
 691 
 692         if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 693                 branded = 1;
 694                 /*
 695                  * We will be adding 5 entries to the aux vectors.  One for
 696                  * the the brandname and 4 for the brand specific aux vectors.
 697                  */
 698                 args->auxsize += 5 * sizeof (aux_entry_t);
 699         }
 700 
 701         /* If the binary has an explicit ASLR flag, it must be honoured */
 702         if ((dynamicphdr != NULL) && (dynamicphdr->p_filesz > 0)) {
 703                 const size_t dynfilesz = dynamicphdr->p_filesz;
 704                 const size_t dynoffset = dynamicphdr->p_offset;
 705                 Dyn *dyn, *dp;
 706 
 707                 if (dynoffset > MAXOFFSET_T ||
 708                     dynfilesz > MAXOFFSET_T ||
 709                     dynoffset + dynfilesz > MAXOFFSET_T) {
 710                         uprintf("%s: cannot read full .dynamic section\n",
 711                             exec_file);
 712                         error = EINVAL;
 713                         goto out;
 714                 }
 715 
 716 #define DYN_STRIDE      100
 717                 for (i = 0; i < dynfilesz; i += sizeof (*dyn) * DYN_STRIDE) {
 718                         const size_t remdyns = (dynfilesz - i) / sizeof (*dyn);
 719                         const size_t ndyns = MIN(DYN_STRIDE, remdyns);
 720                         const size_t dynsize = ndyns * sizeof (*dyn);
 721 
 722                         dyn = kmem_alloc(dynsize, KM_SLEEP);
 723 
 724                         if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
 725                             (ssize_t)dynsize, (offset_t)(dynoffset + i),
 726                             UIO_SYSSPACE, 0, (rlim64_t)0,
 727                             CRED(), NULL)) != 0) {
 728                                 uprintf("%s: cannot read .dynamic section\n",
 729                                     exec_file);
 730                                 goto out;
 731                         }
 732 
 733                         for (dp = dyn; dp < (dyn + ndyns); dp++) {
 734                                 if (dp->d_tag == DT_SUNW_ASLR) {
 735                                         if ((error = handle_secflag_dt(p,
 736                                             DT_SUNW_ASLR,
 737                                             dp->d_un.d_val)) != 0) {
 738                                                 uprintf("%s: error setting "
 739                                                     "security-flag from "
 740                                                     "DT_SUNW_ASLR: %d\n",
 741                                                     exec_file, error);
 742                                                 goto out;
 743                                         }
 744                                 }
 745                         }
 746 
 747                         kmem_free(dyn, dynsize);
 748                 }
 749         }
 750 
 751         /* Hardware/Software capabilities */
 752         if (capphdr != NULL &&
 753             (capsize = capphdr->p_filesz) > 0 &&
 754             capsize <= 16 * sizeof (*cap)) {
 755                 const uint_t ncaps = capsize / sizeof (*cap);
 756                 Cap *cp;
 757 
 758                 cap = kmem_alloc(capsize, KM_SLEEP);
 759                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
 760                     (ssize_t)capsize, (offset_t)capphdr->p_offset,
 761                     UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), NULL)) != 0) {
 762                         uprintf("%s: Cannot read capabilities section\n",
 763                             exec_file);
 764                         goto out;
 765                 }
 766                 for (cp = cap; cp < cap + ncaps; cp++) {
 767                         if (cp->c_tag == CA_SUNW_SF_1 &&
 768                             (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
 769                                 if (args->to_model == DATAMODEL_LP64)
 770                                         args->addr32 = 1;
 771                                 break;
 772                         }
 773                 }
 774         }
 775 
 776         aux = bigwad->elfargs;
 777         /*
 778          * Move args to the user's stack.
 779          * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM
 780          * aux entries.
 781          */
 782         if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
 783                 if (error == -1) {
 784                         error = ENOEXEC;
 785                         goto bad;
 786                 }
 787                 goto out;
 788         }
 789         /* we're single threaded after this point */
 790 
 791         /*
 792          * If this is an ET_DYN executable (shared object),
 793          * determine its memory size so that mapelfexec() can load it.
 794          */
 795         if (ehdrp->e_type == ET_DYN)
 796                 len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
 797         else
 798                 len = 0;
 799 
 800         dtrphdr = NULL;
 801 
 802         error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
 803             &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
 804             len, execsz, &brksize);
 805         /*
 806          * Our uphdr has been dynamically allocated if (and only if) its
 807          * program header flags are clear.  To avoid leaks, this must be
 808          * checked regardless of whether mapelfexec() emitted an error.
 809          */
 810         dynuphdr = (uphdr != NULL && uphdr->p_flags == 0);
 811 
 812         if (error != 0) {
 813                 goto bad;
 814         }
 815 
 816         if (uphdr != NULL && intphdr == NULL)
 817                 goto bad;
 818 
 819         if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 820                 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
 821                 goto bad;
 822         }
 823 
 824         if (intphdr != NULL) {
 825                 size_t          len;
 826                 uintptr_t       lddata;
 827                 char            *p;
 828                 struct vnode    *nvp;
 829 
 830                 dlnsize = intphdr->p_filesz + nsize;
 831 
 832                 /*
 833                  * Make sure none of the component pieces of dlnsize result in
 834                  * an oversized or zeroed result.
 835                  */
 836                 if (intphdr->p_filesz > MAXPATHLEN || dlnsize > MAXPATHLEN ||
 837                     dlnsize == 0 || dlnsize < intphdr->p_filesz) {
 838                         goto bad;
 839                 }
 840 
 841                 if (nsize != 0) {
 842                         bcopy(args->brand_nroot, dlnp, nsize - 1);
 843                         dlnp[nsize - 1] = '/';
 844                 }
 845 
 846                 /*
 847                  * Read in "interpreter" pathname.
 848                  */
 849                 if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
 850                     (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset,
 851                     UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
 852                         uprintf("%s: Cannot obtain interpreter pathname\n",
 853                             exec_file);
 854                         goto bad;
 855                 }
 856 
 857                 if (resid != 0 || dlnp[dlnsize - 1] != '\0')
 858                         goto bad;
 859 
 860                 /*
 861                  * Search for '$ORIGIN' token in interpreter path.
 862                  * If found, expand it.
 863                  */
 864                 for (p = dlnp; p = strchr(p, '$'); ) {
 865                         uint_t  len, curlen;
 866                         char    *_ptr;
 867 
 868                         if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
 869                                 continue;
 870 
 871                         /*
 872                          * We don't support $ORIGIN on setid programs to close
 873                          * a potential attack vector.
 874                          */
 875                         if ((setid & EXECSETID_SETID) != 0) {
 876                                 error = ENOEXEC;
 877                                 goto bad;
 878                         }
 879 
 880                         curlen = 0;
 881                         len = p - dlnp - 1;
 882                         if (len) {
 883                                 bcopy(dlnp, pathbufp, len);
 884                                 curlen += len;
 885                         }
 886                         if (_ptr = strrchr(args->pathname, '/')) {
 887                                 len = _ptr - args->pathname;
 888                                 if ((curlen + len) > MAXPATHLEN)
 889                                         break;
 890 
 891                                 bcopy(args->pathname, &pathbufp[curlen], len);
 892                                 curlen += len;
 893                         } else {
 894                                 /*
 895                                  * executable is a basename found in the
 896                                  * current directory.  So - just substitue
 897                                  * '.' for ORIGIN.
 898                                  */
 899                                 pathbufp[curlen] = '.';
 900                                 curlen++;
 901                         }
 902                         p += ORIGIN_STR_SIZE;
 903                         len = strlen(p);
 904 
 905                         if ((curlen + len) > MAXPATHLEN)
 906                                 break;
 907                         bcopy(p, &pathbufp[curlen], len);
 908                         curlen += len;
 909                         pathbufp[curlen++] = '\0';
 910                         bcopy(pathbufp, dlnp, curlen);
 911                 }
 912 
 913                 /*
 914                  * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
 915                  * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
 916                  * Just in case /usr is not mounted, change it now.
 917                  */
 918                 if (strcmp(dlnp, USR_LIB_RTLD) == 0)
 919                         dlnp += 4;
 920                 error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
 921                 if (error && dlnp != bigwad->dl_name) {
 922                         /* new kernel, old user-level */
 923                         error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
 924                             NULLVPP, &nvp);
 925                 }
 926                 if (error) {
 927                         uprintf("%s: Cannot find %s\n", exec_file, dlnp);
 928                         goto bad;
 929                 }
 930 
 931                 /*
 932                  * Setup the "aux" vector.
 933                  */
 934                 if (uphdr) {
 935                         if (ehdrp->e_type == ET_DYN) {
 936                                 /* don't use the first page */
 937                                 bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
 938                                 bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
 939                         } else {
 940                                 bigwad->exenv.ex_bssbase = bssbase;
 941                                 bigwad->exenv.ex_brkbase = brkbase;
 942                         }
 943                         bigwad->exenv.ex_brksize = brksize;
 944                         bigwad->exenv.ex_magic = elfmagic;
 945                         bigwad->exenv.ex_vp = vp;
 946                         setexecenv(&bigwad->exenv);
 947 
 948                         ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
 949                         ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
 950                         ADDAUX(aux, AT_PHNUM, nphdrs)
 951                         ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
 952                 } else {
 953                         if ((error = execopen(&vp, &fd)) != 0) {
 954                                 VN_RELE(nvp);
 955                                 goto bad;
 956                         }
 957 
 958                         ADDAUX(aux, AT_EXECFD, fd)
 959                 }
 960 
 961                 if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
 962                         VN_RELE(nvp);
 963                         uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
 964                         goto bad;
 965                 }
 966 
 967                 /*
 968                  * Now obtain the ELF header along with the entire program
 969                  * header contained in "nvp".
 970                  */
 971                 kmem_free(phdrbase, phdrsize);
 972                 phdrbase = NULL;
 973                 if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
 974                     &shstrndx, &nphdrs)) != 0 ||
 975                     (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
 976                     &phdrsize)) != 0) {
 977                         VN_RELE(nvp);
 978                         uprintf("%s: Cannot read %s\n", exec_file, dlnp);
 979                         goto bad;
 980                 }
 981 
 982                 /*
 983                  * Determine memory size of the "interpreter's" loadable
 984                  * sections.  This size is then used to obtain the virtual
 985                  * address of a hole, in the user's address space, large
 986                  * enough to map the "interpreter".
 987                  */
 988                 if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
 989                         VN_RELE(nvp);
 990                         uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
 991                         goto bad;
 992                 }
 993 
 994                 dtrphdr = NULL;
 995 
 996                 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
 997                     &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
 998                     execsz, NULL);
 999 
1000                 if (error || junk != NULL) {
1001                         VN_RELE(nvp);
1002                         uprintf("%s: Cannot map %s\n", exec_file, dlnp);
1003                         goto bad;
1004                 }
1005 
1006                 /*
1007                  * We use the DTrace program header to initialize the
1008                  * architecture-specific user per-LWP location. The dtrace
1009                  * fasttrap provider requires ready access to per-LWP scratch
1010                  * space. We assume that there is only one such program header
1011                  * in the interpreter.
1012                  */
1013                 if (dtrphdr != NULL &&
1014                     dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
1015                         VN_RELE(nvp);
1016                         uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
1017                         goto bad;
1018                 }
1019 
1020                 VN_RELE(nvp);
1021                 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
1022         }
1023 
1024         if (hasauxv) {
1025                 int auxf = AF_SUN_HWCAPVERIFY;
1026 #if defined(__amd64)
1027                 size_t fpsize;
1028                 int fptype;
1029 #endif /* defined(__amd64) */
1030 
1031                 /*
1032                  * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were
1033                  * filled in via exec_args()
1034                  */
1035                 ADDAUX(aux, AT_BASE, voffset)
1036                 ADDAUX(aux, AT_FLAGS, at_flags)
1037                 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
1038                 /*
1039                  * Linker flags. (security)
1040                  * p_flag not yet set at this time.
1041                  * We rely on gexec() to provide us with the information.
1042                  * If the application is set-uid but this is not reflected
1043                  * in a mismatch between real/effective uids/gids, then
1044                  * don't treat this as a set-uid exec.  So we care about
1045                  * the EXECSETID_UGIDS flag but not the ...SETID flag.
1046                  */
1047                 if ((setid &= ~EXECSETID_SETID) != 0)
1048                         auxf |= AF_SUN_SETUGID;
1049 
1050                 /*
1051                  * If we're running a native process from within a branded
1052                  * zone under pfexec then we clear the AF_SUN_SETUGID flag so
1053                  * that the native ld.so.1 is able to link with the native
1054                  * libraries instead of using the brand libraries that are
1055                  * installed in the zone.  We only do this for processes
1056                  * which we trust because we see they are already running
1057                  * under pfexec (where uid != euid).  This prevents a
1058                  * malicious user within the zone from crafting a wrapper to
1059                  * run native suid commands with unsecure libraries interposed.
1060                  */
1061                 if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
1062                     (setid &= ~EXECSETID_SETID) != 0))
1063                         auxf &= ~AF_SUN_SETUGID;
1064 
1065                 /*
1066                  * Record the user addr of the auxflags aux vector entry
1067                  * since brands may optionally want to manipulate this field.
1068                  */
1069                 args->auxp_auxflags =
1070                     (char *)((char *)args->stackend +
1071                     ((char *)&aux->a_type -
1072                     (char *)bigwad->elfargs));
1073                 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
1074 
1075                 /*
1076                  * Record information about the real and effective user and
1077                  * group IDs.
1078                  */
1079                 if (cred != NULL) {
1080                         ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
1081                         ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
1082                         ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
1083                         ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
1084                 }
1085 
1086                 /*
1087                  * Hardware capability flag word (performance hints)
1088                  * Used for choosing faster library routines.
1089                  * (Potentially different between 32-bit and 64-bit ABIs)
1090                  */
1091 #if defined(_LP64)
1092                 if (args->to_model == DATAMODEL_NATIVE) {
1093                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
1094                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
1095                 } else {
1096                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
1097                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
1098                 }
1099 #else
1100                 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
1101                 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
1102 #endif
1103                 if (branded) {
1104                         /*
1105                          * Reserve space for the brand-private aux vectors,
1106                          * and record the user addr of that space.
1107                          */
1108                         args->auxp_brand =
1109                             (char *)((char *)args->stackend +
1110                             ((char *)&aux->a_type -
1111                             (char *)bigwad->elfargs));
1112                         ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
1113                         ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
1114                         ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
1115                         ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
1116                 }
1117 
1118                 /*
1119                  * Add the comm page auxv entry, mapping it in if needed. Also
1120                  * take care of the FPU entries.
1121                  */
1122 #if defined(__amd64)
1123                 if (args->commpage != (uintptr_t)NULL ||
1124                     (args->commpage = (uintptr_t)comm_page_mapin()) !=
1125                     (uintptr_t)NULL) {
1126                         ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
1127                 } else {
1128                         /*
1129                          * If the comm page cannot be mapped, pad out the auxv
1130                          * to satisfy later size checks.
1131                          */
1132                         ADDAUX(aux, AT_NULL, 0)
1133                 }
1134 
1135                 fptype = AT_386_FPINFO_NONE;
1136                 fpu_auxv_info(&fptype, &fpsize);
1137                 if (fptype != AT_386_FPINFO_NONE) {
1138                         ADDAUX(aux, AT_SUN_FPTYPE, fptype)
1139                         ADDAUX(aux, AT_SUN_FPSIZE, fpsize)
1140                 } else {
1141                         ADDAUX(aux, AT_NULL, 0)
1142                         ADDAUX(aux, AT_NULL, 0)
1143                 }
1144 #endif /* defined(__amd64) */
1145 
1146                 ADDAUX(aux, AT_NULL, 0)
1147                 postfixsize = (uintptr_t)aux - (uintptr_t)bigwad->elfargs;
1148 
1149                 /*
1150                  * We make assumptions above when we determine how many aux
1151                  * vector entries we will be adding. However, if we have an
1152                  * invalid elf file, it is possible that mapelfexec might
1153                  * behave differently (but not return an error), in which case
1154                  * the number of aux entries we actually add will be different.
1155                  * We detect that now and error out.
1156                  */
1157                 if (postfixsize != args->auxsize) {
1158                         DTRACE_PROBE2(elfexec_badaux, size_t, postfixsize,
1159                             size_t, args->auxsize);
1160                         goto bad;
1161                 }
1162                 ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
1163         }
1164 
1165         /*
1166          * For the 64-bit kernel, the limit is big enough that rounding it up
1167          * to a page can overflow the 64-bit limit, so we check for btopr()
1168          * overflowing here by comparing it with the unrounded limit in pages.
1169          * If it hasn't overflowed, compare the exec size with the rounded up
1170          * limit in pages.  Otherwise, just compare with the unrounded limit.
1171          */
1172         limit = btop(p->p_vmem_ctl);
1173         roundlimit = btopr(p->p_vmem_ctl);
1174         if ((roundlimit > limit && *execsz > roundlimit) ||
1175             (roundlimit < limit && *execsz > limit)) {
1176                 mutex_enter(&p->p_lock);
1177                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1178                     RCA_SAFE);
1179                 mutex_exit(&p->p_lock);
1180                 error = ENOMEM;
1181                 goto bad;
1182         }
1183 
1184         bzero(up->u_auxv, sizeof (up->u_auxv));
1185         up->u_commpagep = args->commpage;
1186         if (postfixsize) {
1187                 size_t num_auxv;
1188 
1189                 /*
1190                  * Copy the aux vector to the user stack.
1191                  */
1192                 error = execpoststack(args, bigwad->elfargs, postfixsize);
1193                 if (error)
1194                         goto bad;
1195 
1196                 /*
1197                  * Copy auxv to the process's user structure for use by /proc.
1198                  * If this is a branded process, the brand's exec routine will
1199                  * copy it's private entries to the user structure later. It
1200                  * relies on the fact that the blank entries are at the end.
1201                  */
1202                 num_auxv = postfixsize / sizeof (aux_entry_t);
1203                 ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
1204                 aux = bigwad->elfargs;
1205                 for (i = 0; i < num_auxv; i++) {
1206                         up->u_auxv[i].a_type = aux[i].a_type;
1207                         up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
1208                 }
1209         }
1210 
1211         /*
1212          * Pass back the starting address so we can set the program counter.
1213          */
1214         args->entry = (uintptr_t)(ehdrp->e_entry + voffset);
1215 
1216         if (!uphdr) {
1217                 if (ehdrp->e_type == ET_DYN) {
1218                         /*
1219                          * If we are executing a shared library which doesn't
1220                          * have a interpreter (probably ld.so.1) then
1221                          * we don't set the brkbase now.  Instead we
1222                          * delay it's setting until the first call
1223                          * via grow.c::brk().  This permits ld.so.1 to
1224                          * initialize brkbase to the tail of the executable it
1225                          * loads (which is where it needs to be).
1226                          */
1227                         bigwad->exenv.ex_brkbase = (caddr_t)0;
1228                         bigwad->exenv.ex_bssbase = (caddr_t)0;
1229                         bigwad->exenv.ex_brksize = 0;
1230                 } else {
1231                         bigwad->exenv.ex_brkbase = brkbase;
1232                         bigwad->exenv.ex_bssbase = bssbase;
1233                         bigwad->exenv.ex_brksize = brksize;
1234                 }
1235                 bigwad->exenv.ex_magic = elfmagic;
1236                 bigwad->exenv.ex_vp = vp;
1237                 setexecenv(&bigwad->exenv);
1238         }
1239 
1240         ASSERT(error == 0);
1241         goto out;
1242 
1243 bad:
1244         if (fd != -1)           /* did we open the a.out yet */
1245                 (void) execclose(fd);
1246 
1247         psignal(p, SIGKILL);
1248 
1249         if (error == 0)
1250                 error = ENOEXEC;
1251 out:
1252         if (dynuphdr)
1253                 kmem_free(uphdr, sizeof (Phdr));
1254         if (phdrbase != NULL)
1255                 kmem_free(phdrbase, phdrsize);
1256         if (cap != NULL)
1257                 kmem_free(cap, capsize);
1258         kmem_free(bigwad, sizeof (struct bigwad));
1259         return (error);
1260 }
1261 
1262 /*
1263  * Compute the memory size requirement for the ELF file.
1264  */
1265 static size_t
1266 elfsize(const Ehdr *ehdrp, uint_t nphdrs, const caddr_t phdrbase,
1267     uintptr_t *lddata)
1268 {
1269         const Phdr *phdrp = (Phdr *)phdrbase;
1270         const uint_t hsize = ehdrp->e_phentsize;
1271         boolean_t dfirst = B_TRUE;
1272         uintptr_t loaddr = UINTPTR_MAX;
1273         uintptr_t hiaddr = 0;
1274         uint_t i;
1275 
1276         for (i = nphdrs; i > 0; i--) {
1277                 if (phdrp->p_type == PT_LOAD) {
1278                         const uintptr_t lo = phdrp->p_vaddr;
1279                         const uintptr_t hi = lo + phdrp->p_memsz;
1280 
1281                         loaddr = MIN(lo, loaddr);
1282                         hiaddr = MAX(hi, hiaddr);
1283 
1284                         /*
1285                          * save the address of the first data segment
1286                          * of a object - used for the AT_SUNW_LDDATA
1287                          * aux entry.
1288                          */
1289                         if ((lddata != NULL) && dfirst &&
1290                             (phdrp->p_flags & PF_W)) {
1291                                 *lddata = lo;
1292                                 dfirst = B_FALSE;
1293                         }
1294                 }
1295                 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
1296         }
1297 
1298         if (hiaddr <= loaddr) {
1299                 /* No non-zero PT_LOAD segment found */
1300                 return (0);
1301         }
1302 
1303         return (roundup(hiaddr - (loaddr & PAGEMASK), PAGESIZE));
1304 }
1305 
1306 /*
1307  * Read in the ELF header and program header table.
1308  * SUSV3 requires:
1309  *      ENOEXEC File format is not recognized
1310  *      EINVAL  Format recognized but execution not supported
1311  */
1312 static int
1313 getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs,
1314     uint_t *shstrndx, uint_t *nphdrs)
1315 {
1316         int error;
1317         ssize_t resid;
1318 
1319         /*
1320          * We got here by the first two bytes in ident,
1321          * now read the entire ELF header.
1322          */
1323         if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, sizeof (Ehdr),
1324             (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid)) != 0) {
1325                 return (error);
1326         }
1327 
1328         /*
1329          * Since a separate version is compiled for handling 32-bit and
1330          * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
1331          * doesn't need to be able to deal with 32-bit ELF files.
1332          */
1333         if (resid != 0 ||
1334             ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
1335             ehdr->e_ident[EI_MAG3] != ELFMAG3) {
1336                 return (ENOEXEC);
1337         }
1338 
1339         if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
1340 #if defined(_ILP32) || defined(_ELF32_COMPAT)
1341             ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
1342 #else
1343             ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
1344 #endif
1345             !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
1346             ehdr->e_flags)) {
1347                 return (EINVAL);
1348         }
1349 
1350         *nshdrs = ehdr->e_shnum;
1351         *shstrndx = ehdr->e_shstrndx;
1352         *nphdrs = ehdr->e_phnum;
1353 
1354         /*
1355          * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
1356          * to read in the section header at index zero to access the true
1357          * values for those fields.
1358          */
1359         if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
1360             *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
1361                 Shdr shdr;
1362 
1363                 if (ehdr->e_shoff == 0)
1364                         return (EINVAL);
1365 
1366                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
1367                     sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
1368                     (rlim64_t)0, credp, NULL)) != 0)
1369                         return (error);
1370 
1371                 if (*nshdrs == 0)
1372                         *nshdrs = shdr.sh_size;
1373                 if (*shstrndx == SHN_XINDEX)
1374                         *shstrndx = shdr.sh_link;
1375                 if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
1376                         *nphdrs = shdr.sh_info;
1377         }
1378 
1379         return (0);
1380 }
1381 
1382 /*
1383  * We use members through p_flags on 32-bit files and p_memsz on 64-bit files,
1384  * so e_phentsize must be at least large enough to include those members.
1385  */
1386 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1387 #define MINPHENTSZ      (offsetof(Phdr, p_flags) + \
1388                         sizeof (((Phdr *)NULL)->p_flags))
1389 #else
1390 #define MINPHENTSZ      (offsetof(Phdr, p_memsz) + \
1391                         sizeof (((Phdr *)NULL)->p_memsz))
1392 #endif
1393 
1394 static int
1395 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nphdrs,
1396     caddr_t *phbasep, size_t *phsizep)
1397 {
1398         int err;
1399 
1400         /*
1401          * Ensure that e_phentsize is large enough for required fields to be
1402          * accessible and will maintain 8-byte alignment.
1403          */
1404         if (ehdr->e_phentsize < MINPHENTSZ || (ehdr->e_phentsize & 3))
1405                 return (EINVAL);
1406 
1407         *phsizep = nphdrs * ehdr->e_phentsize;
1408 
1409         if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1410                 if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1411                         return (ENOMEM);
1412         } else {
1413                 *phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1414         }
1415 
1416         if ((err = vn_rdwr(UIO_READ, vp, *phbasep, (ssize_t)*phsizep,
1417             (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1418             credp, NULL)) != 0) {
1419                 kmem_free(*phbasep, *phsizep);
1420                 *phbasep = NULL;
1421                 return (err);
1422         }
1423 
1424         return (0);
1425 }
1426 
/*
 * Minimum acceptable e_shentsize: section header members through sh_entsize
 * are read (in both the 32- and 64-bit layouts), so every entry must be at
 * least large enough to include that member.
 */
#define MINSHDRSZ       (offsetof(Shdr, sh_entsize) + \
                        sizeof (((Shdr *)NULL)->sh_entsize))
1429 
1430 static int
1431 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nshdrs,
1432     uint_t shstrndx, caddr_t *shbasep, size_t *shsizep, char **shstrbasep,
1433     size_t *shstrsizep)
1434 {
1435         int err;
1436         Shdr *shdr;
1437 
1438         /*
1439          * Since we're going to be using e_shentsize to iterate down the
1440          * array of section headers, it must be 8-byte aligned or else
1441          * a we might cause a misaligned access. We use all members through
1442          * sh_entsize (on both 32- and 64-bit ELF files) so e_shentsize
1443          * must be at least large enough to include that member. The index
1444          * of the string table section must also be valid.
1445          */
1446         if (ehdr->e_shentsize < MINSHDRSZ || (ehdr->e_shentsize & 3) ||
1447             nshdrs == 0 || shstrndx >= nshdrs)
1448                 return (EINVAL);
1449 
1450         *shsizep = nshdrs * ehdr->e_shentsize;
1451 
1452         if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1453                 if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1454                         return (ENOMEM);
1455         } else {
1456                 *shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1457         }
1458 
1459         if ((err = vn_rdwr(UIO_READ, vp, *shbasep, (ssize_t)*shsizep,
1460             (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1461             credp, NULL)) != 0) {
1462                 kmem_free(*shbasep, *shsizep);
1463                 return (err);
1464         }
1465 
1466         /*
1467          * Grab the section string table.  Walking through the shdrs is
1468          * pointless if their names cannot be interrogated.
1469          */
1470         shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1471         if ((*shstrsizep = shdr->sh_size) == 0) {
1472                 kmem_free(*shbasep, *shsizep);
1473                 return (EINVAL);
1474         }
1475 
1476         if (*shstrsizep > elf_shstrtab_max) {
1477                 if ((*shstrbasep = kmem_alloc(*shstrsizep,
1478                     KM_NOSLEEP)) == NULL) {
1479                         kmem_free(*shbasep, *shsizep);
1480                         return (ENOMEM);
1481                 }
1482         } else {
1483                 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1484         }
1485 
1486         if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, (ssize_t)*shstrsizep,
1487             (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1488             credp, NULL)) != 0) {
1489                 kmem_free(*shbasep, *shsizep);
1490                 kmem_free(*shstrbasep, *shstrsizep);
1491                 return (err);
1492         }
1493 
1494         /*
1495          * Make sure the strtab is null-terminated to make sure we
1496          * don't run off the end of the table.
1497          */
1498         (*shstrbasep)[*shstrsizep - 1] = '\0';
1499 
1500         return (0);
1501 }
1502 
1503 
1504 int
1505 elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, uint_t *nphdrs,
1506     caddr_t *phbasep, size_t *phsizep)
1507 {
1508         int error;
1509         uint_t nshdrs, shstrndx;
1510 
1511         if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
1512             nphdrs)) != 0 ||
1513             (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
1514             phsizep)) != 0) {
1515                 return (error);
1516         }
1517         return (0);
1518 }
1519 
1520 
1521 static int
1522 mapelfexec(
1523         vnode_t *vp,
1524         Ehdr *ehdr,
1525         uint_t nphdrs,
1526         caddr_t phdrbase,
1527         Phdr **uphdr,
1528         Phdr **intphdr,
1529         Phdr **stphdr,
1530         Phdr **dtphdr,
1531         Phdr *dataphdrp,
1532         caddr_t *bssbase,
1533         caddr_t *brkbase,
1534         intptr_t *voffset,
1535         uintptr_t *minaddrp,
1536         size_t len,
1537         size_t *execsz,
1538         size_t *brksize)
1539 {
1540         Phdr *phdr;
1541         int error, page, prot, lastprot = 0;
1542         caddr_t addr = NULL;
1543         caddr_t minaddr = (caddr_t)UINTPTR_MAX;
1544         uint_t i;
1545         size_t zfodsz, memsz;
1546         boolean_t ptload = B_FALSE;
1547         off_t offset;
1548         const uint_t hsize = ehdr->e_phentsize;
1549         uintptr_t lastaddr = 0;
1550         extern int use_brk_lpg;
1551 
1552         if (ehdr->e_type == ET_DYN) {
1553                 caddr_t vaddr;
1554                 secflagset_t flags = 0;
1555                 /*
1556                  * Obtain the virtual address of a hole in the
1557                  * address space to map the "interpreter".
1558                  */
1559                 if (secflag_enabled(curproc, PROC_SEC_ASLR))
1560                         flags |= _MAP_RANDOMIZE;
1561 
1562                 map_addr(&addr, len, (offset_t)0, 1, flags);
1563                 if (addr == NULL)
1564                         return (ENOMEM);
1565 
1566                 /*
1567                  * Despite the fact that mmapobj(2) refuses to load them, we
1568                  * need to support executing ET_DYN objects that have a
1569                  * non-NULL p_vaddr.  When found in the wild, these objects
1570                  * are likely to be due to an old (and largely obviated) Linux
1571                  * facility, prelink(8), that rewrites shared objects to
1572                  * prefer specific (disjoint) virtual address ranges.  (Yes,
1573                  * this is putatively for performance -- and yes, it has
1574                  * limited applicability, many edge conditions and grisly
1575                  * failure modes; even for Linux, it's insane.)  As ELF
1576                  * mandates that the PT_LOAD segments be in p_vaddr order, we
1577                  * find the lowest p_vaddr by finding the first PT_LOAD
1578                  * segment.
1579                  */
1580                 phdr = (Phdr *)phdrbase;
1581                 for (i = nphdrs; i > 0; i--) {
1582                         if (phdr->p_type == PT_LOAD) {
1583                                 addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
1584                                 break;
1585                         }
1586                         phdr = (Phdr *)((caddr_t)phdr + hsize);
1587                 }
1588 
1589                 /*
1590                  * We have a non-zero p_vaddr in the first PT_LOAD segment --
1591                  * presumably because we're directly executing a prelink(8)'d
1592                  * ld-linux.so.  While we could correctly execute such an
1593                  * object without locating it at its desired p_vaddr (it is,
1594                  * after all, still relocatable), our inner antiquarian
1595                  * derives a perverse pleasure in accommodating the steampunk
1596                  * prelink(8) contraption -- goggles on!
1597                  */
1598                 if ((vaddr = addr) != NULL) {
1599                         if (as_gap(curproc->p_as, len, &addr, &len,
1600                             AH_LO, NULL) == -1 || addr != vaddr) {
1601                                 addr = NULL;
1602                         }
1603                 }
1604 
1605                 if (addr == NULL) {
1606                         /*
1607                          * We either have a NULL p_vaddr (the common case, by
1608                          * many orders of magnitude) or we have a non-NULL
1609                          * p_vaddr and we were unable to obtain the specified
1610                          * VA range (presumably because it's an illegal
1611                          * address).  Either way, obtain an address in which
1612                          * to map the interpreter.
1613                          */
1614                         map_addr(&addr, len, (offset_t)0, 1, 0);
1615                         if (addr == NULL)
1616                                 return (ENOMEM);
1617                 }
1618 
1619                 /*
1620                  * Our voffset is the difference between where we landed and
1621                  * where we wanted to be.
1622                  */
1623                 *voffset = (uintptr_t)addr - (uintptr_t)vaddr;
1624         } else {
1625                 *voffset = 0;
1626         }
1627 
1628         phdr = (Phdr *)phdrbase;
1629         for (i = nphdrs; i > 0; i--) {
1630                 switch (phdr->p_type) {
1631                 case PT_LOAD:
1632                         ptload = B_TRUE;
1633                         prot = PROT_USER;
1634                         if (phdr->p_flags & PF_R)
1635                                 prot |= PROT_READ;
1636                         if (phdr->p_flags & PF_W)
1637                                 prot |= PROT_WRITE;
1638                         if (phdr->p_flags & PF_X)
1639                                 prot |= PROT_EXEC;
1640 
1641                         addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1642 
1643                         if ((*intphdr != NULL) && uphdr != NULL &&
1644                             (*uphdr == NULL)) {
1645                                 /*
1646                                  * The PT_PHDR program header is, strictly
1647                                  * speaking, optional.  If we find that this
1648                                  * is missing, we will determine the location
1649                                  * of the program headers based on the address
1650                                  * of the lowest PT_LOAD segment (namely, this
1651                                  * one):  we subtract the p_offset to get to
1652                                  * the ELF header and then add back the program
1653                                  * header offset to get to the program headers.
1654                                  * We then cons up a Phdr that corresponds to
1655                                  * the (missing) PT_PHDR, setting the flags
1656                                  * to 0 to denote that this is artificial and
1657                                  * should (must) be freed by the caller.
1658                                  */
1659                                 Phdr *cons;
1660 
1661                                 cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
1662 
1663                                 cons->p_flags = 0;
1664                                 cons->p_type = PT_PHDR;
1665                                 cons->p_vaddr = ((uintptr_t)addr -
1666                                     phdr->p_offset) + ehdr->e_phoff;
1667 
1668                                 *uphdr = cons;
1669                         }
1670 
1671                         /*
1672                          * The ELF spec dictates that p_filesz may not be
1673                          * larger than p_memsz in PT_LOAD segments.
1674                          */
1675                         if (phdr->p_filesz > phdr->p_memsz) {
1676                                 error = EINVAL;
1677                                 goto bad;
1678                         }
1679 
1680                         /*
1681                          * Keep track of the segment with the lowest starting
1682                          * address.
1683                          */
1684                         if (addr < minaddr)
1685                                 minaddr = addr;
1686 
1687                         /*
1688                          * Segments need not correspond to page boundaries:
1689                          * they are permitted to share a page.  If two PT_LOAD
1690                          * segments share the same page, and the permissions
1691                          * of the segments differ, the behavior is historically
1692                          * that the permissions of the latter segment are used
1693                          * for the page that the two segments share.  This is
1694                          * also historically a non-issue:  binaries generated
1695                          * by most anything will make sure that two PT_LOAD
1696                          * segments with differing permissions don't actually
1697                          * share any pages.  However, there exist some crazy
1698                          * things out there (including at least an obscure
1699                          * Portuguese teaching language called G-Portugol) that
1700                          * actually do the wrong thing and expect it to work:
1701                          * they have a segment with execute permission share
1702                          * a page with a subsequent segment that does not
1703                          * have execute permissions and expect the resulting
1704                          * shared page to in fact be executable.  To accommodate
1705                          * such broken link editors, we take advantage of a
1706                          * latitude explicitly granted to the loader:  it is
1707                          * permitted to make _any_ PT_LOAD segment executable
1708                          * (provided that it is readable or writable).  If we
1709                          * see that we're sharing a page and that the previous
1710                          * page was executable, we will add execute permissions
1711                          * to our segment.
1712                          */
1713                         if (btop(lastaddr) == btop((uintptr_t)addr) &&
1714                             (phdr->p_flags & (PF_R | PF_W)) &&
1715                             (lastprot & PROT_EXEC)) {
1716                                 prot |= PROT_EXEC;
1717                         }
1718 
1719                         lastaddr = (uintptr_t)addr + phdr->p_filesz;
1720                         lastprot = prot;
1721 
1722                         zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1723 
1724                         offset = phdr->p_offset;
1725                         if (((uintptr_t)offset & PAGEOFFSET) ==
1726                             ((uintptr_t)addr & PAGEOFFSET) &&
1727                             (!(vp->v_flag & VNOMAP))) {
1728                                 page = 1;
1729                         } else {
1730                                 page = 0;
1731                         }
1732 
1733                         /*
1734                          * Set the heap pagesize for OOB when the bss size
1735                          * is known and use_brk_lpg is not 0.
1736                          */
1737                         if (brksize != NULL && use_brk_lpg &&
1738                             zfodsz != 0 && phdr == dataphdrp &&
1739                             (prot & PROT_WRITE)) {
1740                                 const size_t tlen = P2NPHASE((uintptr_t)addr +
1741                                     phdr->p_filesz, PAGESIZE);
1742 
1743                                 if (zfodsz > tlen) {
1744                                         const caddr_t taddr = addr +
1745                                             phdr->p_filesz + tlen;
1746 
1747                                         /*
1748                                          * Since a hole in the AS large enough
1749                                          * for this object as calculated by
1750                                          * elfsize() is available, we do not
1751                                          * need to fear overflow for 'taddr'.
1752                                          */
1753                                         curproc->p_brkpageszc =
1754                                             page_szc(map_pgsz(MAPPGSZ_HEAP,
1755                                             curproc, taddr, zfodsz - tlen, 0));
1756                                 }
1757                         }
1758 
1759                         if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1760                             (prot & PROT_WRITE)) {
1761                                 uint_t  szc = curproc->p_brkpageszc;
1762                                 size_t pgsz = page_get_pagesize(szc);
1763                                 caddr_t ebss = addr + phdr->p_memsz;
1764                                 /*
1765                                  * If we need extra space to keep the BSS an
1766                                  * integral number of pages in size, some of
1767                                  * that space may fall beyond p_brkbase, so we
1768                                  * need to set p_brksize to account for it
1769                                  * being (logically) part of the brk.
1770                                  */
1771                                 size_t extra_zfodsz;
1772 
1773                                 ASSERT(pgsz > PAGESIZE);
1774 
1775                                 extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1776 
1777                                 if (error = execmap(vp, addr, phdr->p_filesz,
1778                                     zfodsz + extra_zfodsz, phdr->p_offset,
1779                                     prot, page, szc))
1780                                         goto bad;
1781                                 if (brksize != NULL)
1782                                         *brksize = extra_zfodsz;
1783                         } else {
1784                                 if (error = execmap(vp, addr, phdr->p_filesz,
1785                                     zfodsz, phdr->p_offset, prot, page, 0))
1786                                         goto bad;
1787                         }
1788 
1789                         if (bssbase != NULL && addr >= *bssbase &&
1790                             phdr == dataphdrp) {
1791                                 *bssbase = addr + phdr->p_filesz;
1792                         }
1793                         if (brkbase != NULL && addr >= *brkbase) {
1794                                 *brkbase = addr + phdr->p_memsz;
1795                         }
1796 
1797                         memsz = btopr(phdr->p_memsz);
1798                         if ((*execsz + memsz) < *execsz) {
1799                                 error = ENOMEM;
1800                                 goto bad;
1801                         }
1802                         *execsz += memsz;
1803                         break;
1804 
1805                 case PT_INTERP:
1806                         /*
1807                          * The ELF specification is unequivocal about the
1808                          * PT_INTERP program header with respect to any PT_LOAD
1809                          * program header:  "If it is present, it must precede
1810                          * any loadable segment entry." Linux, however, makes
1811                          * no attempt to enforce this -- which has allowed some
1812                          * binary editing tools to get away with generating
1813                          * invalid ELF binaries in the respect that PT_INTERP
1814                          * occurs after the first PT_LOAD program header.  This
1815                          * is unfortunate (and of course, disappointing) but
1816                          * it's no worse than that: there is no reason that we
1817                          * can't process the PT_INTERP entry (if present) after
1818                          * one or more PT_LOAD entries.  We therefore
1819                          * deliberately do not check ptload here and always
1820                          * store dyphdr to be the PT_INTERP program header.
1821                          */
1822                         *intphdr = phdr;
1823                         break;
1824 
1825                 case PT_SHLIB:
1826                         *stphdr = phdr;
1827                         break;
1828 
1829                 case PT_PHDR:
1830                         if (ptload || phdr->p_flags == 0)
1831                                 goto bad;
1832 
1833                         if (uphdr != NULL)
1834                                 *uphdr = phdr;
1835 
1836                         break;
1837 
1838                 case PT_NULL:
1839                 case PT_DYNAMIC:
1840                 case PT_NOTE:
1841                         break;
1842 
1843                 case PT_SUNWDTRACE:
1844                         if (dtphdr != NULL)
1845                                 *dtphdr = phdr;
1846                         break;
1847 
1848                 default:
1849                         break;
1850                 }
1851                 phdr = (Phdr *)((caddr_t)phdr + hsize);
1852         }
1853 
1854         if (minaddrp != NULL) {
1855                 ASSERT(minaddr != (caddr_t)UINTPTR_MAX);
1856                 *minaddrp = (uintptr_t)minaddr;
1857         }
1858 
1859         if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
1860                 size_t off;
1861                 uintptr_t base = (uintptr_t)*brkbase;
1862                 uintptr_t oend = base + *brksize;
1863 
1864                 ASSERT(ISP2(aslr_max_brk_skew));
1865 
1866                 (void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1867                 base += P2PHASE(off, aslr_max_brk_skew);
1868                 base = P2ROUNDUP(base, PAGESIZE);
1869                 *brkbase = (caddr_t)base;
1870                 /*
1871                  * Above, we set *brksize to account for the possibility we
1872                  * had to grow the 'brk' in padding out the BSS to a page
1873                  * boundary.
1874                  *
1875                  * We now need to adjust that based on where we now are
1876                  * actually putting the brk.
1877                  */
1878                 if (oend > base)
1879                         *brksize = oend - base;
1880                 else
1881                         *brksize = 0;
1882         }
1883 
1884         return (0);
1885 bad:
1886         if (error == 0)
1887                 error = EINVAL;
1888         return (error);
1889 }
1890 
1891 int
1892 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1893     rlim64_t rlimit, cred_t *credp)
1894 {
1895         Note note;
1896         int error;
1897 
1898         bzero(&note, sizeof (note));
1899         bcopy("CORE", note.name, 4);
1900         note.nhdr.n_type = type;
1901         /*
1902          * The System V ABI states that n_namesz must be the length of the
1903          * string that follows the Nhdr structure including the terminating
1904          * null. The ABI also specifies that sufficient padding should be
1905          * included so that the description that follows the name string
1906          * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1907          * respectively. However, since this change was not made correctly
1908          * at the time of the 64-bit port, both 32- and 64-bit binaries
1909          * descriptions are only guaranteed to begin on a 4-byte boundary.
1910          */
1911         note.nhdr.n_namesz = 5;
1912         note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1913 
1914         if (error = core_write(vp, UIO_SYSSPACE, *offsetp, &note,
1915             sizeof (note), rlimit, credp))
1916                 return (error);
1917 
1918         *offsetp += sizeof (note);
1919 
1920         if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1921             note.nhdr.n_descsz, rlimit, credp))
1922                 return (error);
1923 
1924         *offsetp += note.nhdr.n_descsz;
1925         return (0);
1926 }
1927 
1928 
1929 /*
1930  * Copy the section data from one vnode to the section of another vnode.
1931  */
1932 static void
1933 elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst)
1934 {
1935         size_t n = src->sh_size;
1936         u_offset_t off = 0;
1937         const u_offset_t soff = src->sh_offset;
1938         const u_offset_t doff = ctx->ecc_doffset;
1939         void *buf = ctx->ecc_buf;
1940         vnode_t *dst_vp = ctx->ecc_vp;
1941         cred_t *credp = ctx->ecc_credp;
1942 
1943         /* Protect the copy loop below from overflow on the offsets */
1944         if (n > OFF_MAX || (n + soff) > OFF_MAX || (n + doff) > OFF_MAX ||
1945             (n + soff) < n || (n + doff) < n) {
1946                 dst->sh_size = 0;
1947                 dst->sh_offset = 0;
1948                 return;
1949         }
1950 
1951         while (n != 0) {
1952                 const size_t len = MIN(ctx->ecc_bufsz, n);
1953                 ssize_t resid;
1954 
1955                 if (vn_rdwr(UIO_READ, src_vp, buf, (ssize_t)len,
1956                     (offset_t)(soff + off),
1957                     UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
1958                     resid >= len || resid < 0 ||
1959                     core_write(dst_vp, UIO_SYSSPACE, (offset_t)(doff + off),
1960                     buf, len - resid, ctx->ecc_rlimit, credp) != 0) {
1961                         dst->sh_size = 0;
1962                         dst->sh_offset = 0;
1963                         return;
1964                 }
1965 
1966                 ASSERT(n >= len - resid);
1967 
1968                 n -= len - resid;
1969                 off += len - resid;
1970         }
1971 
1972         ctx->ecc_doffset += src->sh_size;
1973 }
1974 
1975 /*
1976  * Walk sections for a given ELF object, counting (or copying) those of
1977  * interest (CTF, symtab, strtab).
1978  */
1979 static uint_t
1980 elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr,
1981     Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab)
1982 {
1983         Ehdr ehdr;
1984         const core_content_t content = ctx->ecc_content;
1985         cred_t *credp = ctx->ecc_credp;
1986         Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL;
1987         uintptr_t off = 0;
1988         uint_t nshdrs, shstrndx, nphdrs, count = 0;
1989         u_offset_t *doffp = &ctx->ecc_doffset;
1990         boolean_t ctf_link = B_FALSE;
1991         caddr_t shbase;
1992         size_t shsize, shstrsize;
1993         char *shstrbase;
1994 
1995         if ((content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) == 0) {
1996                 return (0);
1997         }
1998 
1999         if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx, &nphdrs) != 0 ||
2000             getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx, &shbase, &shsize,
2001             &shstrbase, &shstrsize) != 0) {
2002                 return (0);
2003         }
2004 
2005         /* Starting at index 1 skips SHT_NULL which is expected at index 0 */
2006         off = ehdr.e_shentsize;
2007         for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) {
2008                 Shdr *shdr, *symchk = NULL, *strchk;
2009                 const char *name;
2010 
2011                 shdr = (Shdr *)(shbase + off);
2012                 if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL)
2013                         continue;
2014 
2015                 name = shstrbase + shdr->sh_name;
2016 
2017                 if (ctf == NULL &&
2018                     (content & CC_CONTENT_CTF) != 0 &&
2019                     strcmp(name, shstrtab_data[STR_CTF]) == 0) {
2020                         ctf = shdr;
2021                         if (ctf->sh_link != 0 && ctf->sh_link < nshdrs) {
2022                                 /* check linked symtab below */
2023                                 symchk = (Shdr *)(shbase +
2024                                     shdr->sh_link * ehdr.e_shentsize);
2025                                 ctf_link = B_TRUE;
2026                         } else {
2027                                 continue;
2028                         }
2029                 } else if (symtab == NULL &&
2030                     (content & CC_CONTENT_SYMTAB) != 0 &&
2031                     strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) {
2032                         symchk = shdr;
2033                 } else {
2034                         continue;
2035                 }
2036 
2037                 ASSERT(symchk != NULL);
2038                 if ((symchk->sh_type != SHT_DYNSYM &&
2039                     symchk->sh_type != SHT_SYMTAB) ||
2040                     symchk->sh_link == 0 || symchk->sh_link >= nshdrs) {
2041                         ctf_link = B_FALSE;
2042                         continue;
2043                 }
2044                 strchk = (Shdr *)(shbase + symchk->sh_link * ehdr.e_shentsize);
2045                 if (strchk->sh_type != SHT_STRTAB) {
2046                         ctf_link = B_FALSE;
2047                         continue;
2048                 }
2049                 symtab = symchk;
2050                 strtab = strchk;
2051 
2052                 if (symtab != NULL && ctf != NULL) {
2053                         /* No other shdrs are of interest at this point */
2054                         break;
2055                 }
2056         }
2057 
2058         if (ctf != NULL)
2059                 count += 1;
2060         if (symtab != NULL)
2061                 count += 2;
2062         if (v == NULL || count == 0 || count > remain) {
2063                 count = MIN(count, remain);
2064                 goto done;
2065         }
2066 
2067         /* output CTF section */
2068         if (ctf != NULL) {
2069                 elf_ctx_resize_scratch(ctx, ctf->sh_size);
2070 
2071                 v[idx].sh_name = shstrtab_ndx(shstrtab, STR_CTF);
2072                 v[idx].sh_addr = (Addr)(uintptr_t)saddr;
2073                 v[idx].sh_type = SHT_PROGBITS;
2074                 v[idx].sh_addralign = 4;
2075                 *doffp = roundup(*doffp, v[idx].sh_addralign);
2076                 v[idx].sh_offset = *doffp;
2077                 v[idx].sh_size = ctf->sh_size;
2078 
2079                 if (ctf_link) {
2080                         /*
2081                          * The linked symtab (and strtab) will be output
2082                          * immediately after this CTF section.  Its shdr index
2083                          * directly follows this one.
2084                          */
2085                         v[idx].sh_link = idx + 1;
2086                         ASSERT(symtab != NULL);
2087                 } else {
2088                         v[idx].sh_link = 0;
2089                 }
2090                 elf_copy_scn(ctx, ctf, mvp, &v[idx]);
2091                 idx++;
2092         }
2093 
2094         /* output SYMTAB/STRTAB sections */
2095         if (symtab != NULL) {
2096                 uint_t symtab_name, strtab_name;
2097 
2098                 elf_ctx_resize_scratch(ctx,
2099                     MAX(symtab->sh_size, strtab->sh_size));
2100 
2101                 if (symtab->sh_type == SHT_DYNSYM) {
2102                         symtab_name = shstrtab_ndx(shstrtab, STR_DYNSYM);
2103                         strtab_name = shstrtab_ndx(shstrtab, STR_DYNSTR);
2104                 } else {
2105                         symtab_name = shstrtab_ndx(shstrtab, STR_SYMTAB);
2106                         strtab_name = shstrtab_ndx(shstrtab, STR_STRTAB);
2107                 }
2108 
2109                 v[idx].sh_name = symtab_name;
2110                 v[idx].sh_type = symtab->sh_type;
2111                 v[idx].sh_addr = symtab->sh_addr;
2112                 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
2113                         v[idx].sh_addr += (Addr)(uintptr_t)saddr;
2114                 v[idx].sh_addralign = symtab->sh_addralign;
2115                 *doffp = roundup(*doffp, v[idx].sh_addralign);
2116                 v[idx].sh_offset = *doffp;
2117                 v[idx].sh_size = symtab->sh_size;
2118                 v[idx].sh_link = idx + 1;
2119                 v[idx].sh_entsize = symtab->sh_entsize;
2120                 v[idx].sh_info = symtab->sh_info;
2121 
2122                 elf_copy_scn(ctx, symtab, mvp, &v[idx]);
2123                 idx++;
2124 
2125                 v[idx].sh_name = strtab_name;
2126                 v[idx].sh_type = SHT_STRTAB;
2127                 v[idx].sh_flags = SHF_STRINGS;
2128                 v[idx].sh_addr = strtab->sh_addr;
2129                 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
2130                         v[idx].sh_addr += (Addr)(uintptr_t)saddr;
2131                 v[idx].sh_addralign = strtab->sh_addralign;
2132                 *doffp = roundup(*doffp, v[idx].sh_addralign);
2133                 v[idx].sh_offset = *doffp;
2134                 v[idx].sh_size = strtab->sh_size;
2135 
2136                 elf_copy_scn(ctx, strtab, mvp, &v[idx]);
2137                 idx++;
2138         }
2139 
2140 done:
2141         kmem_free(shstrbase, shstrsize);
2142         kmem_free(shbase, shsize);
2143         return (count);
2144 }
2145 
2146 /*
2147  * Walk mappings in process address space, examining those which correspond to
2148  * loaded objects.  It is called twice from elfcore: Once to simply count
2149  * relevant sections, and again later to copy those sections once an adequate
2150  * buffer has been allocated for the shdr details.
2151  */
2152 static int
2153 elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp)
2154 {
2155         vnode_t *lastvp = NULL;
2156         struct seg *seg;
2157         uint_t idx = 0, remain;
2158         shstrtab_t shstrtab;
2159         struct as *as = ctx->ecc_p->p_as;
2160         int error = 0;
2161 
2162         ASSERT(AS_WRITE_HELD(as));
2163 
2164         if (v != NULL) {
2165                 ASSERT(nv != 0);
2166 
2167                 shstrtab_init(&shstrtab);
2168                 remain = nv;
2169         } else {
2170                 ASSERT(nv == 0);
2171 
2172                 /*
2173                  * The shdrs are being counted, rather than outputting them
2174                  * into a buffer.  Leave room for two entries: the SHT_NULL at
2175                  * index 0 and the shstrtab at the end.
2176                  */
2177                 remain = UINT_MAX - 2;
2178         }
2179 
2180         /* Per the ELF spec, shdr index 0 is reserved. */
2181         idx = 1;
2182         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2183                 vnode_t *mvp;
2184                 void *tmp = NULL;
2185                 caddr_t saddr = seg->s_base, naddr, eaddr;
2186                 size_t segsize;
2187                 uint_t count, prot;
2188 
2189                 /*
2190                  * Since we're just looking for text segments of load
2191                  * objects, we only care about the protection bits; we don't
2192                  * care about the actual size of the segment so we use the
2193                  * reserved size. If the segment's size is zero, there's
2194                  * something fishy going on so we ignore this segment.
2195                  */
2196                 if (seg->s_ops != &segvn_ops ||
2197                     SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2198                     mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
2199                     (segsize = pr_getsegsize(seg, 1)) == 0)
2200                         continue;
2201 
2202                 eaddr = saddr + segsize;
2203                 prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
2204                 pr_getprot_done(&tmp);
2205 
2206                 /*
2207                  * Skip this segment unless the protection bits look like
2208                  * what we'd expect for a text segment.
2209                  */
2210                 if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
2211                         continue;
2212 
2213                 count = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain,
2214                     &shstrtab);
2215 
2216                 ASSERT(count <= remain);
2217                 ASSERT(v == NULL || (idx + count) < nv);
2218 
2219                 remain -= count;
2220                 idx += count;
2221                 lastvp = mvp;
2222         }
2223 
2224         if (v == NULL) {
2225                 if (idx == 1) {
2226                         *nshdrsp = 0;
2227                 } else {
                        /* Include room for the shstrtab at the end */
2229                         *nshdrsp = idx + 1;
2230                 }
2231                 return (0);
2232         }
2233 
2234         if (idx != nv - 1) {
2235                 cmn_err(CE_WARN, "elfcore: core dump failed for "
2236                     "process %d; address space is changing",
2237                     ctx->ecc_p->p_pid);
2238                 return (EIO);
2239         }
2240 
2241         v[idx].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB);
2242         v[idx].sh_size = shstrtab_size(&shstrtab);
2243         v[idx].sh_addralign = 1;
2244         v[idx].sh_offset = ctx->ecc_doffset;
2245         v[idx].sh_flags = SHF_STRINGS;
2246         v[idx].sh_type = SHT_STRTAB;
2247 
2248         elf_ctx_resize_scratch(ctx, v[idx].sh_size);
2249         VERIFY3U(ctx->ecc_bufsz, >=, v[idx].sh_size);
2250         shstrtab_dump(&shstrtab, ctx->ecc_buf);
2251 
2252         error = core_write(ctx->ecc_vp, UIO_SYSSPACE, ctx->ecc_doffset,
2253             ctx->ecc_buf, v[idx].sh_size, ctx->ecc_rlimit, ctx->ecc_credp);
2254         if (error == 0) {
2255                 ctx->ecc_doffset += v[idx].sh_size;
2256         }
2257 
2258         return (error);
2259 }
2260 
/*
 * Dump an ELF core image of process p to the core file vnode vp.
 *
 * File layout: Ehdr, program header table, optional section header
 * table (present only when CTF/symtab content is requested or the
 * phdr count overflows the 16-bit e_phnum field), the two note
 * segments, then the memory image of each PT_LOAD mapping selected
 * by the core content mask.
 *
 * Returns 0 on success or an errno.  If the address space changes
 * between the sizing pass and the phdr-building pass, the whole
 * procedure is retried once from 'top:' before failing with EIO.
 */
int
elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
    core_content_t content)
{
	u_offset_t poffset, soffset, doffset;
	int error;
	uint_t i, nphdrs, nshdrs;
	struct seg *seg;
	struct as *as = p->p_as;
	void *bigwad, *zeropg = NULL;
	size_t bigsize, phdrsz, shdrsz;
	Ehdr *ehdr;
	Phdr *phdr;
	Shdr shdr0;
	caddr_t brkbase, stkbase;
	size_t brksize, stksize;
	boolean_t overflowed = B_FALSE, retried = B_FALSE;
	klwp_t *lwp = ttolwp(curthread);
	elf_core_ctx_t ctx = {
		.ecc_vp = vp,
		.ecc_p = p,
		.ecc_credp = credp,
		.ecc_rlimit = rlimit,
		.ecc_content = content,
		.ecc_doffset = 0,
		.ecc_buf = NULL,
		.ecc_bufsz = 0
	};

top:
	/*
	 * Make sure we have everything we need (registers, etc.).
	 * All other lwps have already stopped and are in an orderly state.
	 */
	ASSERT(p == ttoproc(curthread));
	prstop(0, 0);

	AS_LOCK_ENTER(as, RW_WRITER);
	nphdrs = prnsegs(as, 0) + 2;		/* two CORE note sections */

	/*
	 * Count the number of section headers we're going to need.
	 */
	nshdrs = 0;
	if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) {
		/* Counting pass: NULL output buffer, count returned in nshdrs */
		VERIFY0(elf_process_scns(&ctx, NULL, 0, &nshdrs));
	}
	AS_LOCK_EXIT(as);

	/*
	 * The core file contents may require zero section headers, but if
	 * we overflow the 16 bits allotted to the program header count in
	 * the ELF header, we'll need that program header at index zero.
	 */
	if (nshdrs == 0 && nphdrs >= PN_XNUM) {
		nshdrs = 1;
	}

	/*
	 * Allocate a buffer which is sized adequately to hold the ehdr, phdrs
	 * or shdrs needed to produce the core file.  It is used for the three
	 * tasks sequentially, not simultaneously, so it does not need space
	 * for all three data at once, only the largest one.
	 */
	VERIFY(nphdrs >= 2);
	phdrsz = nphdrs * sizeof (Phdr);
	shdrsz = nshdrs * sizeof (Shdr);
	bigsize = MAX(sizeof (Ehdr), MAX(phdrsz, shdrsz));
	bigwad = kmem_alloc(bigsize, KM_SLEEP);

	/* Build the ELF file header in the scratch buffer. */
	ehdr = (Ehdr *)bigwad;
	bzero(ehdr, sizeof (*ehdr));

	ehdr->e_ident[EI_MAG0] = ELFMAG0;
	ehdr->e_ident[EI_MAG1] = ELFMAG1;
	ehdr->e_ident[EI_MAG2] = ELFMAG2;
	ehdr->e_ident[EI_MAG3] = ELFMAG3;
	ehdr->e_ident[EI_CLASS] = ELFCLASS;
	ehdr->e_type = ET_CORE;

#if !defined(_LP64) || defined(_ELF32_COMPAT)

#if defined(__sparc)
	ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
	ehdr->e_machine = EM_SPARC;
#elif defined(__i386_COMPAT)
	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
	ehdr->e_machine = EM_386;
#else
#error "no recognized machine type is defined"
#endif

#else	/* !defined(_LP64) || defined(_ELF32_COMPAT) */

#if defined(__sparc)
	ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
	ehdr->e_machine = EM_SPARCV9;
#elif defined(__amd64)
	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
	ehdr->e_machine = EM_AMD64;
#else
#error "no recognized 64-bit machine type is defined"
#endif

#endif	/* !defined(_LP64) || defined(_ELF32_COMPAT) */

	/* File offsets of the phdr table, shdr table and segment data. */
	poffset = sizeof (Ehdr);
	soffset = sizeof (Ehdr) + phdrsz;
	doffset = sizeof (Ehdr) + phdrsz + shdrsz;
	bzero(&shdr0, sizeof (shdr0));

	/*
	 * If the count of program headers or section headers or the index
	 * of the section string table can't fit in the mere 16 bits
	 * shortsightedly allotted to them in the ELF header, we use the
	 * extended formats and put the real values in the section header
	 * as index 0.
	 */
	if (nphdrs >= PN_XNUM) {
		ehdr->e_phnum = PN_XNUM;
		shdr0.sh_info = nphdrs;
	} else {
		ehdr->e_phnum = (unsigned short)nphdrs;
	}

	if (nshdrs > 0) {
		if (nshdrs >= SHN_LORESERVE) {
			ehdr->e_shnum = 0;
			shdr0.sh_size = nshdrs;
		} else {
			ehdr->e_shnum = (unsigned short)nshdrs;
		}

		/* shstrtab is always the last section header. */
		if (nshdrs - 1 >= SHN_LORESERVE) {
			ehdr->e_shstrndx = SHN_XINDEX;
			shdr0.sh_link = nshdrs - 1;
		} else {
			ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
		}

		ehdr->e_shoff = soffset;
		ehdr->e_shentsize = sizeof (Shdr);
	}

	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
	ehdr->e_version = EV_CURRENT;
	ehdr->e_ehsize = sizeof (Ehdr);
	ehdr->e_phoff = poffset;
	ehdr->e_phentsize = sizeof (Phdr);

	/* Write the ELF header at the start of the file. */
	if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
	    sizeof (Ehdr), rlimit, credp)) {
		goto done;
	}

	/* Reuse the scratch buffer to build the program header table. */
	phdr = (Phdr *)bigwad;
	bzero(phdr, phdrsz);

	/* phdrs 0 and 1 are the old- and new-format note segments. */
	setup_old_note_header(&phdr[0], p);
	phdr[0].p_offset = doffset = roundup(doffset, sizeof (Word));
	doffset += phdr[0].p_filesz;

	setup_note_header(&phdr[1], p);
	phdr[1].p_offset = doffset = roundup(doffset, sizeof (Word));
	doffset += phdr[1].p_filesz;

	mutex_enter(&p->p_lock);

	/* Snapshot heap and stack bounds for content classification below. */
	brkbase = p->p_brkbase;
	brksize = p->p_brksize;

	stkbase = p->p_usrstack - p->p_stksize;
	stksize = p->p_stksize;

	mutex_exit(&p->p_lock);

	/*
	 * Build a PT_LOAD phdr for every distinct protection range of
	 * every non-hole segment, assigning file offsets only to the
	 * ranges whose data the content mask says to include.
	 */
	AS_LOCK_ENTER(as, RW_WRITER);
	i = 2;
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
		caddr_t saddr, naddr;
		void *tmp = NULL;
		extern struct seg_ops segspt_shmops;

		if ((seg->s_flags & S_HOLE) != 0) {
			continue;
		}

		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
			uint_t prot;
			size_t size;
			int type;
			vnode_t *mvp;

			prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
			prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
			if ((size = (size_t)(naddr - saddr)) == 0) {
				ASSERT(tmp == NULL);
				continue;
			} else if (i == nphdrs) {
				/* More ranges than we sized for: as changed */
				pr_getprot_done(&tmp);
				overflowed = B_TRUE;
				break;
			}
			phdr[i].p_type = PT_LOAD;
			phdr[i].p_vaddr = (Addr)(uintptr_t)saddr;
			phdr[i].p_memsz = size;
			if (prot & PROT_READ)
				phdr[i].p_flags |= PF_R;
			if (prot & PROT_WRITE)
				phdr[i].p_flags |= PF_W;
			if (prot & PROT_EXEC)
				phdr[i].p_flags |= PF_X;

			/*
			 * Figure out which mappings to include in the core.
			 */
			type = SEGOP_GETTYPE(seg, saddr);

			if (saddr == stkbase && size == stksize) {
				if (!(content & CC_CONTENT_STACK))
					goto exclude;

			} else if (saddr == brkbase && size == brksize) {
				if (!(content & CC_CONTENT_HEAP))
					goto exclude;

			} else if (seg->s_ops == &segspt_shmops) {
				if (type & MAP_NORESERVE) {
					if (!(content & CC_CONTENT_DISM))
						goto exclude;
				} else {
					if (!(content & CC_CONTENT_ISM))
						goto exclude;
				}

			} else if (seg->s_ops != &segvn_ops) {
				goto exclude;

			} else if (type & MAP_SHARED) {
				if (shmgetid(p, saddr) != SHMID_NONE) {
					if (!(content & CC_CONTENT_SHM))
						goto exclude;

				} else if (SEGOP_GETVP(seg, seg->s_base,
				    &mvp) != 0 || mvp == NULL ||
				    mvp->v_type != VREG) {
					if (!(content & CC_CONTENT_SHANON))
						goto exclude;

				} else {
					if (!(content & CC_CONTENT_SHFILE))
						goto exclude;
				}

			} else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
			    mvp == NULL || mvp->v_type != VREG) {
				if (!(content & CC_CONTENT_ANON))
					goto exclude;

			} else if (prot == (PROT_READ | PROT_EXEC)) {
				if (!(content & CC_CONTENT_TEXT))
					goto exclude;

			} else if (prot == PROT_READ) {
				if (!(content & CC_CONTENT_RODATA))
					goto exclude;

			} else {
				if (!(content & CC_CONTENT_DATA))
					goto exclude;
			}

			/* Included: reserve file space for the data. */
			doffset = roundup(doffset, sizeof (Word));
			phdr[i].p_offset = doffset;
			phdr[i].p_filesz = size;
			doffset += size;
exclude:
			/* Excluded ranges keep p_filesz == 0. */
			i++;
		}
		VERIFY(tmp == NULL);
		if (overflowed)
			break;
	}
	AS_LOCK_EXIT(as);

	/*
	 * If the address space changed size between the counting pass and
	 * this one, retry the whole procedure exactly once before giving up.
	 */
	if (overflowed || i != nphdrs) {
		if (!retried) {
			retried = B_TRUE;
			overflowed = B_FALSE;
			kmem_free(bigwad, bigsize);
			goto top;
		}
		cmn_err(CE_WARN, "elfcore: core dump failed for "
		    "process %d; address space is changing", p->p_pid);
		error = EIO;
		goto done;
	}

	/* Write the program header table, then the two note segments. */
	if ((error = core_write(vp, UIO_SYSSPACE, poffset,
	    phdr, phdrsz, rlimit, credp)) != 0) {
		goto done;
	}

	if ((error = write_old_elfnotes(p, sig, vp, phdr[0].p_offset, rlimit,
	    credp)) != 0) {
		goto done;
	}
	if ((error = write_elfnotes(p, sig, vp, phdr[1].p_offset, rlimit,
	    credp, content)) != 0) {
		goto done;
	}

	/* Write out the memory image of each included PT_LOAD segment. */
	for (i = 2; i < nphdrs; i++) {
		prkillinfo_t killinfo;
		sigqueue_t *sq;
		int sig, j;	/* note: this 'sig' shadows the parameter */

		if (phdr[i].p_filesz == 0)
			continue;

		/*
		 * If we hit a region that was mapped PROT_NONE then we cannot
		 * continue dumping this normally as the kernel would be unable
		 * to read from the page and that would result in us failing to
		 * dump the page. As such, any region mapped PROT_NONE, we dump
		 * as a zero-filled page such that this is still represented in
		 * the map.
		 *
		 * If dumping out this segment fails, rather than failing
		 * the core dump entirely, we reset the size of the mapping
		 * to zero to indicate that the data is absent from the core
		 * file and or in the PF_SUNW_FAILURE flag to differentiate
		 * this from mappings that were excluded due to the core file
		 * content settings.
		 */
		if ((phdr[i].p_flags & (PF_R | PF_W | PF_X)) == 0) {
			size_t towrite = phdr[i].p_filesz;
			size_t curoff = 0;

			/* Lazily allocate one shared zero-filled buffer. */
			if (zeropg == NULL) {
				zeropg = kmem_zalloc(elf_zeropg_sz, KM_SLEEP);
			}

			error = 0;
			while (towrite != 0) {
				size_t len = MIN(towrite, elf_zeropg_sz);

				error = core_write(vp, UIO_SYSSPACE,
				    phdr[i].p_offset + curoff, zeropg, len,
				    rlimit, credp);
				if (error != 0)
					break;

				towrite -= len;
				curoff += len;
			}
		} else {
			error = core_seg(p, vp, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr,
			    phdr[i].p_filesz, rlimit, credp);
		}
		if (error == 0)
			continue;

		if ((sig = lwp->lwp_cursig) == 0) {
			/*
			 * We failed due to something other than a signal.
			 * Since the space reserved for the segment is now
			 * unused, we stash the errno in the first four
			 * bytes. This undocumented interface will let us
			 * understand the nature of the failure.
			 */
			(void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
			    &error, sizeof (error), rlimit, credp);

			phdr[i].p_filesz = 0;
			phdr[i].p_flags |= PF_SUNW_FAILURE;
			if ((error = core_write(vp, UIO_SYSSPACE,
			    poffset + sizeof (Phdr) * i, &phdr[i],
			    sizeof (Phdr), rlimit, credp)) != 0)
				goto done;

			continue;
		}

		/*
		 * We took a signal.  We want to abort the dump entirely, but
		 * we also want to indicate what failed and why.  We therefore
		 * use the space reserved for the first failing segment to
		 * write our error (which, for purposes of compatibility with
		 * older core dump readers, we set to EINTR) followed by any
		 * siginfo associated with the signal.
		 */
		bzero(&killinfo, sizeof (killinfo));
		killinfo.prk_error = EINTR;

		sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;

		if (sq != NULL) {
			bcopy(&sq->sq_info, &killinfo.prk_info,
			    sizeof (sq->sq_info));
		} else {
			killinfo.prk_info.si_signo = lwp->lwp_cursig;
			killinfo.prk_info.si_code = SI_NOINFO;
		}

#if (defined(_SYSCALL32_IMPL) || defined(_LP64))
		/*
		 * If this is a 32-bit process, we need to translate from the
		 * native siginfo to the 32-bit variant.  (Core readers must
		 * always have the same data model as their target or must
		 * be aware of -- and compensate for -- data model differences.)
		 */
		if (curproc->p_model == DATAMODEL_ILP32) {
			siginfo32_t si32;

			siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
			bcopy(&si32, &killinfo.prk_info, sizeof (si32));
		}
#endif

		(void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
		    &killinfo, sizeof (killinfo), rlimit, credp);

		/*
		 * For the segment on which we took the signal, indicate that
		 * its data now refers to a siginfo.
		 */
		phdr[i].p_filesz = 0;
		phdr[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
		    PF_SUNW_SIGINFO;

		/*
		 * And for every other segment, indicate that its absence
		 * is due to a signal.
		 */
		for (j = i + 1; j < nphdrs; j++) {
			phdr[j].p_filesz = 0;
			phdr[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
		}

		/*
		 * Finally, write out our modified program headers.
		 */
		if ((error = core_write(vp, UIO_SYSSPACE,
		    poffset + sizeof (Phdr) * i, &phdr[i],
		    sizeof (Phdr) * (nphdrs - i), rlimit, credp)) != 0) {
			goto done;
		}

		break;
	}

	/*
	 * Emit the section header table (CTF/symtab/shstrtab or just the
	 * extended-numbering shdr at index 0), reusing the scratch buffer.
	 */
	if (nshdrs > 0) {
		Shdr *shdr = (Shdr *)bigwad;

		bzero(shdr, shdrsz);
		if (nshdrs > 1) {
			ctx.ecc_doffset = doffset;
			AS_LOCK_ENTER(as, RW_WRITER);
			error = elf_process_scns(&ctx, shdr, nshdrs, NULL);
			AS_LOCK_EXIT(as);
			if (error != 0) {
				goto done;
			}
		}
		/* Copy any extended format data destined for the first shdr */
		bcopy(&shdr0, shdr, sizeof (shdr0));

		error = core_write(vp, UIO_SYSSPACE, soffset, shdr, shdrsz,
		    rlimit, credp);
	}

done:
	/* Common exit: release buffers regardless of success or failure. */
	if (zeropg != NULL)
		kmem_free(zeropg, elf_zeropg_sz);
	if (ctx.ecc_bufsz != 0) {
		kmem_free(ctx.ecc_buf, ctx.ecc_bufsz);
	}
	kmem_free(bigwad, bigsize);
	return (error);
}
2744 
2745 #ifndef _ELF32_COMPAT
2746 
/*
 * Exec switch entry for native ELF objects.  Initializers are
 * positional: magic string, magic offset (0), magic length (5),
 * exec handler, core dump handler.  NOTE(review): field meanings
 * inferred from initializer order -- confirm against struct execsw
 * in <sys/exec.h>.
 */
static struct execsw esw = {
#ifdef	_LP64
	elf64magicstr,
#else	/* _LP64 */
	elf32magicstr,
#endif	/* _LP64 */
	0,
	5,
	elfexec,
	elfcore
};
2758 
/* Module linkage element for the native ELF exec switch entry. */
static struct modlexec modlexec = {
	&mod_execops, "exec module for elf", &esw
};
2762 
2763 #ifdef  _LP64
/*
 * On a 64-bit kernel we also register a 32-bit exec module so that
 * ILP32 ELF objects can be executed and core-dumped.  The handlers
 * are the _ELF32_COMPAT builds of elfexec/elfcore, defined in the
 * 32-bit compilation of this file.
 */
extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
			intpdata_t *idatap, int level, size_t *execsz,
			int setid, caddr_t exec_file, cred_t *cred,
			int *brand_action);
extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
			rlim64_t rlimit, int sig, core_content_t content);

/* 32-bit exec switch entry; same positional layout as esw above. */
static struct execsw esw32 = {
	elf32magicstr,
	0,
	5,
	elf32exec,
	elf32core
};

static struct modlexec modlexec32 = {
	&mod_execops, "32-bit exec module for elf", &esw32
};
2782 #endif  /* _LP64 */
2783 
/*
 * Module linkage: the native entry always, plus the 32-bit entry on
 * LP64 kernels; NULL-terminated list.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlexec,
#ifdef	_LP64
	(void *)&modlexec32,
#endif	/* _LP64 */
	NULL
};
2792 
2793 int
2794 _init(void)
2795 {
2796         return (mod_install(&modlinkage));
2797 }
2798 
2799 int
2800 _fini(void)
2801 {
2802         return (mod_remove(&modlinkage));
2803 }
2804 
2805 int
2806 _info(struct modinfo *modinfop)
2807 {
2808         return (mod_info(&modlinkage, modinfop));
2809 }
2810 
2811 #endif  /* !_ELF32_COMPAT */