1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*         All Rights Reserved  */
  28 /*
  29  * Copyright 2019 Joyent, Inc.
  30  */
  31 
  32 #include <sys/types.h>
  33 #include <sys/param.h>
  34 #include <sys/thread.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/signal.h>
  37 #include <sys/cred.h>
  38 #include <sys/user.h>
  39 #include <sys/errno.h>
  40 #include <sys/vnode.h>
  41 #include <sys/mman.h>
  42 #include <sys/kmem.h>
  43 #include <sys/proc.h>
  44 #include <sys/pathname.h>
  45 #include <sys/policy.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/systm.h>
  48 #include <sys/elf.h>
  49 #include <sys/vmsystm.h>
  50 #include <sys/debug.h>
  51 #include <sys/auxv.h>
  52 #include <sys/exec.h>
  53 #include <sys/prsystm.h>
  54 #include <vm/as.h>
  55 #include <vm/rm.h>
  56 #include <vm/seg.h>
  57 #include <vm/seg_vn.h>
  58 #include <sys/modctl.h>
  59 #include <sys/systeminfo.h>
  60 #include <sys/vmparam.h>
  61 #include <sys/machelf.h>
  62 #include <sys/shm_impl.h>
  63 #include <sys/archsystm.h>
  64 #include <sys/fasttrap.h>
  65 #include <sys/brand.h>
  66 #include "elf_impl.h"
  67 #include <sys/sdt.h>
  68 #include <sys/siginfo.h>
  69 #include <sys/random.h>
  70 
  71 #if defined(__x86)
  72 #include <sys/comm_page_util.h>
  73 #include <sys/fp.h>
  74 #endif /* defined(__x86) */
  75 
  76 
  77 extern int at_flags;
  78 extern volatile size_t aslr_max_brk_skew;
  79 
  80 #define ORIGIN_STR      "ORIGIN"
  81 #define ORIGIN_STR_SIZE 6
  82 
  83 static int getelfhead(vnode_t *, cred_t *, Ehdr *, uint_t *, uint_t *,
  84     uint_t *);
  85 static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, uint_t, caddr_t *,
  86     size_t *);
  87 static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, uint_t, uint_t,
  88     caddr_t *, size_t *, caddr_t *, size_t *);
  89 static size_t elfsize(const Ehdr *, uint_t, const caddr_t, uintptr_t *);
  90 static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **,
  91     Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *,
  92     size_t, size_t *, size_t *);
  93 
/*
 * Limits applied while processing ELF objects.  The storage for these is
 * defined exactly once, by the build without _ELF32_COMPAT; the
 * _ELF32_COMPAT build only declares them (and elf_ctx_resize_scratch()) so
 * that both builds share a single instance.
 */
#ifdef _ELF32_COMPAT
/* Link against the non-compat instances when compiling the 32-bit version. */
extern size_t elf_datasz_max;
extern void elf_ctx_resize_scratch(elf_core_ctx_t *, size_t);
extern uint_t elf_nphdr_max;
extern uint_t elf_nshdr_max;
extern size_t elf_shstrtab_max;
#else
/* Upper bound on the scratch buffer sized by elf_ctx_resize_scratch(). */
size_t elf_datasz_max = 1 * 1024 * 1024;
/* NOTE(review): presumably the maximum accepted program header count. */
uint_t elf_nphdr_max = 1000;
/* NOTE(review): presumably the maximum accepted section header count. */
uint_t elf_nshdr_max = 10000;
/* NOTE(review): presumably the maximum accepted .shstrtab size in bytes. */
size_t elf_shstrtab_max = 100 * 1024;
#endif
 107 
 108 
 109 
 110 typedef enum {
 111         STR_CTF,
 112         STR_SYMTAB,
 113         STR_DYNSYM,
 114         STR_STRTAB,
 115         STR_DYNSTR,
 116         STR_SHSTRTAB,
 117         STR_NUM
 118 } shstrtype_t;
 119 
 120 static const char *shstrtab_data[] = {
 121         ".SUNW_ctf",
 122         ".symtab",
 123         ".dynsym",
 124         ".strtab",
 125         ".dynstr",
 126         ".shstrtab"
 127 };
 128 
 129 typedef struct shstrtab {
 130         uint_t  sst_ndx[STR_NUM];
 131         uint_t  sst_cur;
 132 } shstrtab_t;
 133 
 134 static void
 135 shstrtab_init(shstrtab_t *s)
 136 {
 137         bzero(&s->sst_ndx, sizeof (s->sst_ndx));
 138         s->sst_cur = 1;
 139 }
 140 
 141 static uint_t
 142 shstrtab_ndx(shstrtab_t *s, shstrtype_t type)
 143 {
 144         uint_t ret;
 145 
 146         if ((ret = s->sst_ndx[type]) != 0)
 147                 return (ret);
 148 
 149         ret = s->sst_ndx[type] = s->sst_cur;
 150         s->sst_cur += strlen(shstrtab_data[type]) + 1;
 151 
 152         return (ret);
 153 }
 154 
 155 static size_t
 156 shstrtab_size(const shstrtab_t *s)
 157 {
 158         return (s->sst_cur);
 159 }
 160 
 161 static void
 162 shstrtab_dump(const shstrtab_t *s, char *buf)
 163 {
 164         uint_t i, ndx;
 165 
 166         *buf = '\0';
 167         for (i = 0; i < STR_NUM; i++) {
 168                 if ((ndx = s->sst_ndx[i]) != 0)
 169                         (void) strcpy(buf + ndx, shstrtab_data[i]);
 170         }
 171 }
 172 
 173 static int
 174 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
 175 {
 176         ASSERT(phdrp->p_type == PT_SUNWDTRACE);
 177 
 178         /*
 179          * See the comment in fasttrap.h for information on how to safely
 180          * update this program header.
 181          */
 182         if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
 183             (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
 184                 return (-1);
 185 
 186         args->thrptr = phdrp->p_vaddr + base;
 187 
 188         return (0);
 189 }
 190 
 191 static int
 192 handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
 193 {
 194         uint_t flag;
 195 
 196         switch (dt) {
 197         case DT_SUNW_ASLR:
 198                 flag = PROC_SEC_ASLR;
 199                 break;
 200         default:
 201                 return (EINVAL);
 202         }
 203 
 204         if (val == 0) {
 205                 if (secflag_isset(p->p_secflags.psf_lower, flag))
 206                         return (EPERM);
 207                 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
 208                     secflag_isset(p->p_secflags.psf_inherit, flag))
 209                         return (EPERM);
 210 
 211                 secflag_clear(&p->p_secflags.psf_effective, flag);
 212         } else {
 213                 if (!secflag_isset(p->p_secflags.psf_upper, flag))
 214                         return (EPERM);
 215 
 216                 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
 217                     !secflag_isset(p->p_secflags.psf_inherit, flag))
 218                         return (EPERM);
 219 
 220                 secflag_set(&p->p_secflags.psf_effective, flag);
 221         }
 222 
 223         return (0);
 224 }
 225 
 226 
 227 #ifndef _ELF32_COMPAT
 228 void
 229 elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz)
 230 {
 231         size_t target = MIN(sz, elf_datasz_max);
 232 
 233         if (target > ctx->ecc_bufsz) {
 234                 if (ctx->ecc_buf != NULL) {
 235                         kmem_free(ctx->ecc_buf, ctx->ecc_bufsz);
 236                 }
 237                 ctx->ecc_buf = kmem_alloc(target, KM_SLEEP);
 238                 ctx->ecc_bufsz = target;
 239         }
 240 }
 241 #endif /* _ELF32_COMPAT */
 242 
 243 /*
 244  * Map in the executable pointed to by vp. Returns 0 on success.  Note that
 245  * this function currently has the maximum number of arguments allowed by
 246  * modstubs on x86 (MAXNARG)!  Do _not_ add to this function signature without
 247  * adding to MAXNARG.  (Better yet, do not add to this monster of a function
 248  * signature!)
 249  */
 250 int
 251 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 252     intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
 253     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
 254 {
 255         size_t          len, phdrsize;
 256         struct vattr    vat;
 257         caddr_t         phdrbase = NULL;
 258         uint_t          nshdrs, shstrndx, nphdrs;
 259         int             error = 0;
 260         Phdr            *uphdr = NULL;
 261         Phdr            *junk = NULL;
 262         Phdr            *dynphdr = NULL;
 263         Phdr            *dtrphdr = NULL;
 264         char            *interp = NULL;
 265         uintptr_t       lddata, minaddr;
 266         size_t          execsz;
 267 
 268         if (lddatap != NULL)
 269                 *lddatap = 0;
 270 
 271         if (minaddrp != NULL)
 272                 *minaddrp = (uintptr_t)NULL;
 273 
 274         if (error = execpermissions(vp, &vat, args)) {
 275                 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
 276                 return (error);
 277         }
 278 
 279         if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
 280             &nphdrs)) != 0 ||
 281             (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
 282             &phdrsize)) != 0) {
 283                 uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
 284                 return (error);
 285         }
 286 
 287         if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
 288                 uprintf("%s: Nothing to load in %s", exec_file, args->pathname);
 289                 kmem_free(phdrbase, phdrsize);
 290                 return (ENOEXEC);
 291         }
 292         if (lddatap != NULL)
 293                 *lddatap = lddata;
 294 
 295         if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
 296             &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
 297             len, &execsz, brksize)) {
 298                 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
 299                 if (uphdr != NULL && uphdr->p_flags == 0)
 300                         kmem_free(uphdr, sizeof (Phdr));
 301                 kmem_free(phdrbase, phdrsize);
 302                 return (error);
 303         }
 304 
 305         if (minaddrp != NULL)
 306                 *minaddrp = minaddr;
 307 
 308         /*
 309          * If the executable requires an interpreter, determine its name.
 310          */
 311         if (dynphdr != NULL) {
 312                 ssize_t resid;
 313 
 314                 if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
 315                         uprintf("%s: Invalid interpreter\n", exec_file);
 316                         kmem_free(phdrbase, phdrsize);
 317                         return (ENOEXEC);
 318                 }
 319 
 320                 interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 321 
 322                 if ((error = vn_rdwr(UIO_READ, vp, interp,
 323                     (ssize_t)dynphdr->p_filesz,
 324                     (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
 325                     (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
 326                     interp[dynphdr->p_filesz - 1] != '\0') {
 327                         uprintf("%s: Cannot obtain interpreter pathname\n",
 328                             exec_file);
 329                         kmem_free(interp, MAXPATHLEN);
 330                         kmem_free(phdrbase, phdrsize);
 331                         return (error != 0 ? error : ENOEXEC);
 332                 }
 333         }
 334 
 335         /*
 336          * If this is a statically linked executable, voffset should indicate
 337          * the address of the executable itself (it normally holds the address
 338          * of the interpreter).
 339          */
 340         if (ehdr->e_type == ET_EXEC && interp == NULL)
 341                 *voffset = minaddr;
 342 
 343         /*
 344          * If the caller has asked for the interpreter name, return it (it's
 345          * up to the caller to free it); if the caller hasn't asked for it,
 346          * free it ourselves.
 347          */
 348         if (interpp != NULL) {
 349                 *interpp = interp;
 350         } else if (interp != NULL) {
 351                 kmem_free(interp, MAXPATHLEN);
 352         }
 353 
 354         if (uphdr != NULL) {
 355                 *uphdr_vaddr = uphdr->p_vaddr;
 356 
 357                 if (uphdr->p_flags == 0)
 358                         kmem_free(uphdr, sizeof (Phdr));
 359         } else if (ehdr->e_type == ET_DYN) {
 360                 /*
 361                  * If we don't have a uphdr, we'll apply the logic found
 362                  * in mapelfexec() and use the p_vaddr of the first PT_LOAD
 363                  * section as the base address of the object.
 364                  */
 365                 const Phdr *phdr = (Phdr *)phdrbase;
 366                 const uint_t hsize = ehdr->e_phentsize;
 367                 uint_t i;
 368 
 369                 for (i = nphdrs; i > 0; i--) {
 370                         if (phdr->p_type == PT_LOAD) {
 371                                 *uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
 372                                     ehdr->e_phoff;
 373                                 break;
 374                         }
 375 
 376                         phdr = (Phdr *)((caddr_t)phdr + hsize);
 377                 }
 378 
 379                 /*
 380                  * If we don't have a PT_LOAD segment, we should have returned
 381                  * ENOEXEC when elfsize() returned 0, above.
 382                  */
 383                 VERIFY(i > 0);
 384         } else {
 385                 *uphdr_vaddr = (Addr)-1;
 386         }
 387 
 388         kmem_free(phdrbase, phdrsize);
 389         return (error);
 390 }
 391 
 392 /*ARGSUSED*/
 393 int
 394 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 395     int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred,
 396     int *brand_action)
 397 {
 398         caddr_t         phdrbase = NULL;
 399         caddr_t         bssbase = 0;
 400         caddr_t         brkbase = 0;
 401         size_t          brksize = 0;
 402         size_t          dlnsize, nsize = 0;
 403         aux_entry_t     *aux;
 404         int             error;
 405         ssize_t         resid;
 406         int             fd = -1;
 407         intptr_t        voffset;
 408         Phdr            *intphdr = NULL;
 409         Phdr            *dynamicphdr = NULL;
 410         Phdr            *stphdr = NULL;
 411         Phdr            *uphdr = NULL;
 412         Phdr            *junk = NULL;
 413         size_t          len;
 414         size_t          postfixsize = 0;
 415         size_t          i;
 416         Phdr            *phdrp;
 417         Phdr            *dataphdrp = NULL;
 418         Phdr            *dtrphdr;
 419         Phdr            *capphdr = NULL;
 420         Cap             *cap = NULL;
 421         size_t          capsize;
 422         int             hasu = 0;
 423         int             hasauxv = 0;
 424         int             hasintp = 0;
 425         int             branded = 0;
 426         int             dynuphdr = 0;
 427 
 428         struct proc *p = ttoproc(curthread);
 429         struct user *up = PTOU(p);
 430         struct bigwad {
 431                 Ehdr    ehdr;
 432                 aux_entry_t     elfargs[__KERN_NAUXV_IMPL];
 433                 char            dl_name[MAXPATHLEN];
 434                 char            pathbuf[MAXPATHLEN];
 435                 struct vattr    vattr;
 436                 struct execenv  exenv;
 437         } *bigwad;      /* kmem_alloc this behemoth so we don't blow stack */
 438         Ehdr            *ehdrp;
 439         uint_t          nshdrs, shstrndx, nphdrs;
 440         size_t          phdrsize;
 441         char            *dlnp;
 442         char            *pathbufp;
 443         rlim64_t        limit;
 444         rlim64_t        roundlimit;
 445 
 446         ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
 447 
 448         bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
 449         ehdrp = &bigwad->ehdr;
 450         dlnp = bigwad->dl_name;
 451         pathbufp = bigwad->pathbuf;
 452 
 453         /*
 454          * Obtain ELF and program header information.
 455          */
 456         if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
 457             &nphdrs)) != 0 ||
 458             (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
 459             &phdrsize)) != 0)
 460                 goto out;
 461 
 462         /*
 463          * Prevent executing an ELF file that has no entry point.
 464          */
 465         if (ehdrp->e_entry == 0) {
 466                 uprintf("%s: Bad entry point\n", exec_file);
 467                 goto bad;
 468         }
 469 
 470         /*
 471          * Put data model that we're exec-ing to into the args passed to
 472          * exec_args(), so it will know what it is copying to on new stack.
 473          * Now that we know whether we are exec-ing a 32-bit or 64-bit
 474          * executable, we can set execsz with the appropriate NCARGS.
 475          */
 476 #ifdef  _LP64
 477         if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
 478                 args->to_model = DATAMODEL_ILP32;
 479                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
 480         } else {
 481                 args->to_model = DATAMODEL_LP64;
 482                 if (!args->stk_prot_override) {
 483                         args->stk_prot &= ~PROT_EXEC;
 484                 }
 485 #if defined(__i386) || defined(__amd64)
 486                 args->dat_prot &= ~PROT_EXEC;
 487 #endif
 488                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
 489         }
 490 #else   /* _LP64 */
 491         args->to_model = DATAMODEL_ILP32;
 492         *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
 493 #endif  /* _LP64 */
 494 
 495         /*
 496          * We delay invoking the brand callback until we've figured out what
 497          * kind of elf binary we're trying to run, 32-bit or 64-bit.  We do this
 498          * because now the brand library can just check args->to_model to see if
         * the target is 32-bit or 64-bit without having to duplicate all the
 500          * code above.
 501          *
 502          * We also give the brand a chance to indicate that based on the ELF
 503          * OSABI of the target binary it should become unbranded and optionally
 504          * indicate that it should be treated as existing in a specific prefix.
 505          *
 506          * Note that if a brand opts to go down this route it does not actually
 507          * end up being debranded. In other words, future programs that exec
 508          * will still be considered for branding unless this escape hatch is
 509          * used. Consider the case of lx brand for example. If a user runs
 510          * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable
 511          * of DTrace that's in /native will take this escape hatch and be run
 512          * and interpreted using the normal system call table; however, the
 513          * execution of a non-illumos binary in the form of /bin/ls will still
 514          * be branded and be subject to all of the normal actions of the brand.
 515          *
 516          * The level checks associated with brand handling below are used to
 517          * prevent a loop since the brand elfexec function typically comes back
 518          * through this function. We must check <= here since the nested
 519          * handling in the #! interpreter code will increment the level before
 520          * calling gexec to run the final elfexec interpreter.
 521          */
 522         if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
 523             (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
 524                 if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
 525                     &args->brand_nroot) == B_TRUE) {
 526                         ASSERT(ehdrp->e_ident[EI_OSABI]);
 527                         *brand_action = EBA_NATIVE;
 528                         /* Add one for the trailing '/' in the path */
 529                         if (args->brand_nroot != NULL)
 530                                 nsize = strlen(args->brand_nroot) + 1;
 531                 }
 532         }
 533 
 534         if ((level <= INTP_MAXDEPTH) &&
 535             (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 536                 error = BROP(p)->b_elfexec(vp, uap, args,
 537                     idatap, level + 1, execsz, setid, exec_file, cred,
 538                     brand_action);
 539                 goto out;
 540         }
 541 
 542         /*
 543          * Determine aux size now so that stack can be built
 544          * in one shot (except actual copyout of aux image),
 545          * determine any non-default stack protections,
 546          * and still have this code be machine independent.
 547          */
 548         const uint_t hsize = ehdrp->e_phentsize;
 549         phdrp = (Phdr *)phdrbase;
 550         for (i = nphdrs; i > 0; i--) {
 551                 switch (phdrp->p_type) {
 552                 case PT_INTERP:
 553                         hasauxv = hasintp = 1;
 554                         break;
 555                 case PT_PHDR:
 556                         hasu = 1;
 557                         break;
 558                 case PT_SUNWSTACK:
 559                         args->stk_prot = PROT_USER;
 560                         if (phdrp->p_flags & PF_R)
 561                                 args->stk_prot |= PROT_READ;
 562                         if (phdrp->p_flags & PF_W)
 563                                 args->stk_prot |= PROT_WRITE;
 564                         if (phdrp->p_flags & PF_X)
 565                                 args->stk_prot |= PROT_EXEC;
 566                         break;
 567                 case PT_LOAD:
 568                         dataphdrp = phdrp;
 569                         break;
 570                 case PT_SUNWCAP:
 571                         capphdr = phdrp;
 572                         break;
 573                 case PT_DYNAMIC:
 574                         dynamicphdr = phdrp;
 575                         break;
 576                 }
 577                 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
 578         }
 579 
 580         if (ehdrp->e_type != ET_EXEC) {
 581                 dataphdrp = NULL;
 582                 hasauxv = 1;
 583         }
 584 
 585         /* Copy BSS permissions to args->dat_prot */
 586         if (dataphdrp != NULL) {
 587                 args->dat_prot = PROT_USER;
 588                 if (dataphdrp->p_flags & PF_R)
 589                         args->dat_prot |= PROT_READ;
 590                 if (dataphdrp->p_flags & PF_W)
 591                         args->dat_prot |= PROT_WRITE;
 592                 if (dataphdrp->p_flags & PF_X)
 593                         args->dat_prot |= PROT_EXEC;
 594         }
 595 
 596         /*
         * If an auxvector will be required - reserve the space for
 598          * it now.  This may be increased by exec_args if there are
 599          * ISA-specific types (included in __KERN_NAUXV_IMPL).
 600          */
 601         if (hasauxv) {
 602                 /*
                 * If an AUX vector is being built - the base AUX
 604                  * entries are:
 605                  *
 606                  *      AT_BASE
 607                  *      AT_FLAGS
 608                  *      AT_PAGESZ
 609                  *      AT_RANDOM       (added in stk_copyout)
 610                  *      AT_SUN_AUXFLAGS
 611                  *      AT_SUN_HWCAP
 612                  *      AT_SUN_HWCAP2
 613                  *      AT_SUN_PLATFORM (added in stk_copyout)
 614                  *      AT_SUN_EXECNAME (added in stk_copyout)
 615                  *      AT_NULL
 616                  *
 617                  * total == 10
 618                  */
 619                 if (hasintp && hasu) {
 620                         /*
 621                          * Has PT_INTERP & PT_PHDR - the auxvectors that
 622                          * will be built are:
 623                          *
 624                          *      AT_PHDR
 625                          *      AT_PHENT
 626                          *      AT_PHNUM
 627                          *      AT_ENTRY
 628                          *      AT_LDDATA
 629                          *
 630                          * total = 5
 631                          */
 632                         args->auxsize = (10 + 5) * sizeof (aux_entry_t);
 633                 } else if (hasintp) {
 634                         /*
 635                          * Has PT_INTERP but no PT_PHDR
 636                          *
 637                          *      AT_EXECFD
 638                          *      AT_LDDATA
 639                          *
 640                          * total = 2
 641                          */
 642                         args->auxsize = (10 + 2) * sizeof (aux_entry_t);
 643                 } else {
 644                         args->auxsize = 10 * sizeof (aux_entry_t);
 645                 }
 646         } else {
 647                 args->auxsize = 0;
 648         }
 649 
 650         /*
 651          * If this binary is using an emulator, we need to add an
 652          * AT_SUN_EMULATOR aux entry.
 653          */
 654         if (args->emulator != NULL)
 655                 args->auxsize += sizeof (aux_entry_t);
 656 
 657         /*
 658          * If this is a native binary that's been given a modified interpreter
 659          * root, inform it that the native system exists at that root.
 660          */
 661         if (args->brand_nroot != NULL) {
 662                 args->auxsize += sizeof (aux_entry_t);
 663         }
 664 
 665 
 666         /*
 667          * On supported kernels (x86_64) make room in the auxv for the
 668          * AT_SUN_COMMPAGE entry.  This will go unpopulated on i86xpv systems
 669          * which do not provide such functionality.
 670          *
 671          * Additionally cover the floating point information AT_SUN_FPSIZE and
 672          * AT_SUN_FPTYPE.
 673          */
 674 #if defined(__amd64)
 675         args->auxsize += 3 * sizeof (aux_entry_t);
 676 #endif /* defined(__amd64) */
 677 
 678         /*
 679          * If we have user credentials, we'll supply the following entries:
 680          *      AT_SUN_UID
 681          *      AT_SUN_RUID
 682          *      AT_SUN_GID
 683          *      AT_SUN_RGID
 684          */
 685         if (cred != NULL) {
 686                 args->auxsize += 4 * sizeof (aux_entry_t);
 687         }
 688 
 689         if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 690                 branded = 1;
 691                 /*
 692                  * We will be adding 5 entries to the aux vectors.  One for
                 * the brandname and 4 for the brand specific aux vectors.
 694                  */
 695                 args->auxsize += 5 * sizeof (aux_entry_t);
 696         }
 697 
 698         /* If the binary has an explicit ASLR flag, it must be honoured */
 699         if ((dynamicphdr != NULL) && (dynamicphdr->p_filesz > 0)) {
 700                 const size_t dynfilesz = dynamicphdr->p_filesz;
 701                 const size_t dynoffset = dynamicphdr->p_offset;
 702                 Dyn *dyn, *dp;
 703 
 704                 if (dynoffset > MAXOFFSET_T ||
 705                     dynfilesz > MAXOFFSET_T ||
 706                     dynoffset + dynfilesz > MAXOFFSET_T) {
 707                         uprintf("%s: cannot read full .dynamic section\n",
 708                             exec_file);
 709                         error = EINVAL;
 710                         goto out;
 711                 }
 712 
 713 #define DYN_STRIDE      100
 714                 for (i = 0; i < dynfilesz; i += sizeof (*dyn) * DYN_STRIDE) {
 715                         const size_t remdyns = (dynfilesz - i) / sizeof (*dyn);
 716                         const size_t ndyns = MIN(DYN_STRIDE, remdyns);
 717                         const size_t dynsize = ndyns * sizeof (*dyn);
 718 
 719                         dyn = kmem_alloc(dynsize, KM_SLEEP);
 720 
 721                         if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
 722                             (ssize_t)dynsize, (offset_t)(dynoffset + i),
 723                             UIO_SYSSPACE, 0, (rlim64_t)0,
 724                             CRED(), NULL)) != 0) {
 725                                 uprintf("%s: cannot read .dynamic section\n",
 726                                     exec_file);
 727                                 goto out;
 728                         }
 729 
 730                         for (dp = dyn; dp < (dyn + ndyns); dp++) {
 731                                 if (dp->d_tag == DT_SUNW_ASLR) {
 732                                         if ((error = handle_secflag_dt(p,
 733                                             DT_SUNW_ASLR,
 734                                             dp->d_un.d_val)) != 0) {
 735                                                 uprintf("%s: error setting "
 736                                                     "security-flag from "
 737                                                     "DT_SUNW_ASLR: %d\n",
 738                                                     exec_file, error);
 739                                                 goto out;
 740                                         }
 741                                 }
 742                         }
 743 
 744                         kmem_free(dyn, dynsize);
 745                 }
 746         }
 747 
 748         /* Hardware/Software capabilities */
 749         if (capphdr != NULL &&
 750             (capsize = capphdr->p_filesz) > 0 &&
 751             capsize <= 16 * sizeof (*cap)) {
 752                 const uint_t ncaps = capsize / sizeof (*cap);
 753                 Cap *cp;
 754 
 755                 cap = kmem_alloc(capsize, KM_SLEEP);
 756                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
 757                     (ssize_t)capsize, (offset_t)capphdr->p_offset,
 758                     UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), NULL)) != 0) {
 759                         uprintf("%s: Cannot read capabilities section\n",
 760                             exec_file);
 761                         goto out;
 762                 }
 763                 for (cp = cap; cp < cap + ncaps; cp++) {
 764                         if (cp->c_tag == CA_SUNW_SF_1 &&
 765                             (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
 766                                 if (args->to_model == DATAMODEL_LP64)
 767                                         args->addr32 = 1;
 768                                 break;
 769                         }
 770                 }
 771         }
 772 
 773         aux = bigwad->elfargs;
 774         /*
 775          * Move args to the user's stack.
 776          * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM
 777          * aux entries.
 778          */
 779         if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
 780                 if (error == -1) {
 781                         error = ENOEXEC;
 782                         goto bad;
 783                 }
 784                 goto out;
 785         }
 786         /* we're single threaded after this point */
 787 
 788         /*
 789          * If this is an ET_DYN executable (shared object),
 790          * determine its memory size so that mapelfexec() can load it.
 791          */
 792         if (ehdrp->e_type == ET_DYN)
 793                 len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
 794         else
 795                 len = 0;
 796 
 797         dtrphdr = NULL;
 798 
 799         error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
 800             &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
 801             len, execsz, &brksize);
 802         /*
 803          * Our uphdr has been dynamically allocated if (and only if) its
 804          * program header flags are clear.  To avoid leaks, this must be
 805          * checked regardless of whether mapelfexec() emitted an error.
 806          */
 807         dynuphdr = (uphdr != NULL && uphdr->p_flags == 0);
 808 
 809         if (error != 0) {
 810                 goto bad;
 811         }
 812 
 813         if (uphdr != NULL && intphdr == NULL)
 814                 goto bad;
 815 
 816         if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 817                 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
 818                 goto bad;
 819         }
 820 
 821         if (intphdr != NULL) {
 822                 size_t          len;
 823                 uintptr_t       lddata;
 824                 char            *p;
 825                 struct vnode    *nvp;
 826 
 827                 dlnsize = intphdr->p_filesz + nsize;
 828 
 829                 /*
 830                  * Make sure none of the component pieces of dlnsize result in
 831                  * an oversized or zeroed result.
 832                  */
 833                 if (intphdr->p_filesz > MAXPATHLEN || dlnsize > MAXPATHLEN ||
 834                     dlnsize == 0 || dlnsize < intphdr->p_filesz) {
 835                         goto bad;
 836                 }
 837 
 838                 if (nsize != 0) {
 839                         bcopy(args->brand_nroot, dlnp, nsize - 1);
 840                         dlnp[nsize - 1] = '/';
 841                 }
 842 
 843                 /*
 844                  * Read in "interpreter" pathname.
 845                  */
 846                 if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
 847                     (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset,
 848                     UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
 849                         uprintf("%s: Cannot obtain interpreter pathname\n",
 850                             exec_file);
 851                         goto bad;
 852                 }
 853 
 854                 if (resid != 0 || dlnp[dlnsize - 1] != '\0')
 855                         goto bad;
 856 
 857                 /*
 858                  * Search for '$ORIGIN' token in interpreter path.
 859                  * If found, expand it.
 860                  */
 861                 for (p = dlnp; p = strchr(p, '$'); ) {
 862                         uint_t  len, curlen;
 863                         char    *_ptr;
 864 
 865                         if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
 866                                 continue;
 867 
 868                         /*
 869                          * We don't support $ORIGIN on setid programs to close
 870                          * a potential attack vector.
 871                          */
 872                         if ((setid & EXECSETID_SETID) != 0) {
 873                                 error = ENOEXEC;
 874                                 goto bad;
 875                         }
 876 
 877                         curlen = 0;
 878                         len = p - dlnp - 1;
 879                         if (len) {
 880                                 bcopy(dlnp, pathbufp, len);
 881                                 curlen += len;
 882                         }
 883                         if (_ptr = strrchr(args->pathname, '/')) {
 884                                 len = _ptr - args->pathname;
 885                                 if ((curlen + len) > MAXPATHLEN)
 886                                         break;
 887 
 888                                 bcopy(args->pathname, &pathbufp[curlen], len);
 889                                 curlen += len;
 890                         } else {
 891                                 /*
 892                                  * executable is a basename found in the
 893                                  * current directory.  So - just substitute
 894                                  * '.' for ORIGIN.
 895                                  */
 896                                 pathbufp[curlen] = '.';
 897                                 curlen++;
 898                         }
 899                         p += ORIGIN_STR_SIZE;
 900                         len = strlen(p);
 901 
 902                         if ((curlen + len) > MAXPATHLEN)
 903                                 break;
 904                         bcopy(p, &pathbufp[curlen], len);
 905                         curlen += len;
 906                         pathbufp[curlen++] = '\0';
 907                         bcopy(pathbufp, dlnp, curlen);
 908                 }
 909 
 910                 /*
 911                  * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
 912                  * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
 913                  * Just in case /usr is not mounted, change it now.
 914                  */
 915                 if (strcmp(dlnp, USR_LIB_RTLD) == 0)
 916                         dlnp += 4;
 917                 error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
 918                 if (error && dlnp != bigwad->dl_name) {
 919                         /* new kernel, old user-level */
 920                         error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
 921                             NULLVPP, &nvp);
 922                 }
 923                 if (error) {
 924                         uprintf("%s: Cannot find %s\n", exec_file, dlnp);
 925                         goto bad;
 926                 }
 927 
 928                 /*
 929                  * Setup the "aux" vector.
 930                  */
 931                 if (uphdr) {
 932                         if (ehdrp->e_type == ET_DYN) {
 933                                 /* don't use the first page */
 934                                 bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
 935                                 bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
 936                         } else {
 937                                 bigwad->exenv.ex_bssbase = bssbase;
 938                                 bigwad->exenv.ex_brkbase = brkbase;
 939                         }
 940                         bigwad->exenv.ex_brksize = brksize;
 941                         bigwad->exenv.ex_magic = elfmagic;
 942                         bigwad->exenv.ex_vp = vp;
 943                         setexecenv(&bigwad->exenv);
 944 
 945                         ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
 946                         ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
 947                         ADDAUX(aux, AT_PHNUM, nphdrs)
 948                         ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
 949                 } else {
 950                         if ((error = execopen(&vp, &fd)) != 0) {
 951                                 VN_RELE(nvp);
 952                                 goto bad;
 953                         }
 954 
 955                         ADDAUX(aux, AT_EXECFD, fd)
 956                 }
 957 
 958                 if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
 959                         VN_RELE(nvp);
 960                         uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
 961                         goto bad;
 962                 }
 963 
 964                 /*
 965                  * Now obtain the ELF header along with the entire program
 966                  * header contained in "nvp".
 967                  */
 968                 kmem_free(phdrbase, phdrsize);
 969                 phdrbase = NULL;
 970                 if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
 971                     &shstrndx, &nphdrs)) != 0 ||
 972                     (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
 973                     &phdrsize)) != 0) {
 974                         VN_RELE(nvp);
 975                         uprintf("%s: Cannot read %s\n", exec_file, dlnp);
 976                         goto bad;
 977                 }
 978 
 979                 /*
 980                  * Determine memory size of the "interpreter's" loadable
 981                  * sections.  This size is then used to obtain the virtual
 982                  * address of a hole, in the user's address space, large
 983                  * enough to map the "interpreter".
 984                  */
 985                 if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
 986                         VN_RELE(nvp);
 987                         uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
 988                         goto bad;
 989                 }
 990 
 991                 dtrphdr = NULL;
 992 
 993                 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
 994                     &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
 995                     execsz, NULL);
 996 
 997                 if (error || junk != NULL) {
 998                         VN_RELE(nvp);
 999                         uprintf("%s: Cannot map %s\n", exec_file, dlnp);
1000                         goto bad;
1001                 }
1002 
1003                 /*
1004                  * We use the DTrace program header to initialize the
1005                  * architecture-specific user per-LWP location. The dtrace
1006                  * fasttrap provider requires ready access to per-LWP scratch
1007                  * space. We assume that there is only one such program header
1008                  * in the interpreter.
1009                  */
1010                 if (dtrphdr != NULL &&
1011                     dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
1012                         VN_RELE(nvp);
1013                         uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
1014                         goto bad;
1015                 }
1016 
1017                 VN_RELE(nvp);
1018                 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
1019         }
1020 
1021         if (hasauxv) {
1022                 int auxf = AF_SUN_HWCAPVERIFY;
1023 #if defined(__amd64)
1024                 size_t fpsize;
1025                 int fptype;
1026 #endif /* defined(__amd64) */
1027 
1028                 /*
1029                  * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were
1030                  * filled in via exec_args()
1031                  */
1032                 ADDAUX(aux, AT_BASE, voffset)
1033                 ADDAUX(aux, AT_FLAGS, at_flags)
1034                 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
1035                 /*
1036                  * Linker flags. (security)
1037                  * p_flag not yet set at this time.
1038                  * We rely on gexec() to provide us with the information.
1039                  * If the application is set-uid but this is not reflected
1040                  * in a mismatch between real/effective uids/gids, then
1041                  * don't treat this as a set-uid exec.  So we care about
1042                  * the EXECSETID_UGIDS flag but not the ...SETID flag.
1043                  */
1044                 if ((setid &= ~EXECSETID_SETID) != 0)
1045                         auxf |= AF_SUN_SETUGID;
1046 
1047                 /*
1048                  * If we're running a native process from within a branded
1049                  * zone under pfexec then we clear the AF_SUN_SETUGID flag so
1050                  * that the native ld.so.1 is able to link with the native
1051                  * libraries instead of using the brand libraries that are
1052                  * installed in the zone.  We only do this for processes
1053                  * which we trust because we see they are already running
1054                  * under pfexec (where uid != euid).  This prevents a
1055                  * malicious user within the zone from crafting a wrapper to
1056                  * run native suid commands with unsecure libraries interposed.
1057                  */
1058                 if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
1059                     (setid &= ~EXECSETID_SETID) != 0))
1060                         auxf &= ~AF_SUN_SETUGID;
1061 
1062                 /*
1063                  * Record the user addr of the auxflags aux vector entry
1064                  * since brands may optionally want to manipulate this field.
1065                  */
1066                 args->auxp_auxflags =
1067                     (char *)((char *)args->stackend +
1068                     ((char *)&aux->a_type -
1069                     (char *)bigwad->elfargs));
1070                 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
1071 
1072                 /*
1073                  * Record information about the real and effective user and
1074                  * group IDs.
1075                  */
1076                 if (cred != NULL) {
1077                         ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
1078                         ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
1079                         ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
1080                         ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
1081                 }
1082 
1083                 /*
1084                  * Hardware capability flag word (performance hints)
1085                  * Used for choosing faster library routines.
1086                  * (Potentially different between 32-bit and 64-bit ABIs)
1087                  */
1088 #if defined(_LP64)
1089                 if (args->to_model == DATAMODEL_NATIVE) {
1090                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
1091                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
1092                 } else {
1093                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
1094                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
1095                 }
1096 #else
1097                 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
1098                 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
1099 #endif
1100                 if (branded) {
1101                         /*
1102                          * Reserve space for the brand-private aux vectors,
1103                          * and record the user addr of that space.
1104                          */
1105                         args->auxp_brand =
1106                             (char *)((char *)args->stackend +
1107                             ((char *)&aux->a_type -
1108                             (char *)bigwad->elfargs));
1109                         ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
1110                         ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
1111                         ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
1112                         ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
1113                 }
1114 
1115                 /*
1116                  * Add the comm page auxv entry, mapping it in if needed. Also
1117                  * take care of the FPU entries.
1118                  */
1119 #if defined(__amd64)
1120                 if (args->commpage != (uintptr_t)NULL ||
1121                     (args->commpage = (uintptr_t)comm_page_mapin()) !=
1122                     (uintptr_t)NULL) {
1123                         ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
1124                 } else {
1125                         /*
1126                          * If the comm page cannot be mapped, pad out the auxv
1127                          * to satisfy later size checks.
1128                          */
1129                         ADDAUX(aux, AT_NULL, 0)
1130                 }
1131 
1132                 fptype = AT_386_FPINFO_NONE;
1133                 fpu_auxv_info(&fptype, &fpsize);
1134                 if (fptype != AT_386_FPINFO_NONE) {
1135                         ADDAUX(aux, AT_SUN_FPTYPE, fptype)
1136                         ADDAUX(aux, AT_SUN_FPSIZE, fpsize)
1137                 } else {
1138                         ADDAUX(aux, AT_NULL, 0)
1139                         ADDAUX(aux, AT_NULL, 0)
1140                 }
1141 #endif /* defined(__amd64) */
1142 
1143                 ADDAUX(aux, AT_NULL, 0)
1144                 postfixsize = (uintptr_t)aux - (uintptr_t)bigwad->elfargs;
1145 
1146                 /*
1147                  * We make assumptions above when we determine how many aux
1148                  * vector entries we will be adding. However, if we have an
1149                  * invalid elf file, it is possible that mapelfexec might
1150                  * behave differently (but not return an error), in which case
1151                  * the number of aux entries we actually add will be different.
1152                  * We detect that now and error out.
1153                  */
1154                 if (postfixsize != args->auxsize) {
1155                         DTRACE_PROBE2(elfexec_badaux, size_t, postfixsize,
1156                             size_t, args->auxsize);
1157                         goto bad;
1158                 }
1159                 ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
1160         }
1161 
1162         /*
1163          * For the 64-bit kernel, the limit is big enough that rounding it up
1164          * to a page can overflow the 64-bit limit, so we check for btopr()
1165          * overflowing here by comparing it with the unrounded limit in pages.
1166          * If it hasn't overflowed, compare the exec size with the rounded up
1167          * limit in pages.  Otherwise, just compare with the unrounded limit.
1168          */
1169         limit = btop(p->p_vmem_ctl);
1170         roundlimit = btopr(p->p_vmem_ctl);
1171         if ((roundlimit > limit && *execsz > roundlimit) ||
1172             (roundlimit < limit && *execsz > limit)) {
1173                 mutex_enter(&p->p_lock);
1174                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1175                     RCA_SAFE);
1176                 mutex_exit(&p->p_lock);
1177                 error = ENOMEM;
1178                 goto bad;
1179         }
1180 
1181         bzero(up->u_auxv, sizeof (up->u_auxv));
1182         up->u_commpagep = args->commpage;
1183         if (postfixsize) {
1184                 size_t num_auxv;
1185 
1186                 /*
1187                  * Copy the aux vector to the user stack.
1188                  */
1189                 error = execpoststack(args, bigwad->elfargs, postfixsize);
1190                 if (error)
1191                         goto bad;
1192 
1193                 /*
1194                  * Copy auxv to the process's user structure for use by /proc.
1195                  * If this is a branded process, the brand's exec routine will
1196                  * copy its private entries to the user structure later. It
1197                  * relies on the fact that the blank entries are at the end.
1198                  */
1199                 num_auxv = postfixsize / sizeof (aux_entry_t);
1200                 ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
1201                 aux = bigwad->elfargs;
1202                 for (i = 0; i < num_auxv; i++) {
1203                         up->u_auxv[i].a_type = aux[i].a_type;
1204                         up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
1205                 }
1206         }
1207 
1208         /*
1209          * Pass back the starting address so we can set the program counter.
1210          */
1211         args->entry = (uintptr_t)(ehdrp->e_entry + voffset);
1212 
1213         if (!uphdr) {
1214                 if (ehdrp->e_type == ET_DYN) {
1215                         /*
1216                          * If we are executing a shared library which doesn't
1217                          * have an interpreter (probably ld.so.1) then
1218                          * we don't set the brkbase now.  Instead we
1219                          * delay its setting until the first call
1220                          * via grow.c::brk().  This permits ld.so.1 to
1221                          * initialize brkbase to the tail of the executable it
1222                          * loads (which is where it needs to be).
1223                          */
1224                         bigwad->exenv.ex_brkbase = (caddr_t)0;
1225                         bigwad->exenv.ex_bssbase = (caddr_t)0;
1226                         bigwad->exenv.ex_brksize = 0;
1227                 } else {
1228                         bigwad->exenv.ex_brkbase = brkbase;
1229                         bigwad->exenv.ex_bssbase = bssbase;
1230                         bigwad->exenv.ex_brksize = brksize;
1231                 }
1232                 bigwad->exenv.ex_magic = elfmagic;
1233                 bigwad->exenv.ex_vp = vp;
1234                 setexecenv(&bigwad->exenv);
1235         }
1236 
1237         ASSERT(error == 0);
1238         goto out;
1239 
1240 bad:
1241         if (fd != -1)           /* did we open the a.out yet */
1242                 (void) execclose(fd);
1243 
1244         psignal(p, SIGKILL);
1245 
1246         if (error == 0)
1247                 error = ENOEXEC;
1248 out:
1249         if (dynuphdr)
1250                 kmem_free(uphdr, sizeof (Phdr));
1251         if (phdrbase != NULL)
1252                 kmem_free(phdrbase, phdrsize);
1253         if (cap != NULL)
1254                 kmem_free(cap, capsize);
1255         kmem_free(bigwad, sizeof (struct bigwad));
1256         return (error);
1257 }
1258 
1259 /*
1260  * Compute the memory size requirement for the ELF file.
1261  */
1262 static size_t
1263 elfsize(const Ehdr *ehdrp, uint_t nphdrs, const caddr_t phdrbase,
1264     uintptr_t *lddata)
1265 {
1266         const Phdr *phdrp = (Phdr *)phdrbase;
1267         const uint_t hsize = ehdrp->e_phentsize;
1268         boolean_t dfirst = B_TRUE;
1269         uintptr_t loaddr = UINTPTR_MAX;
1270         uintptr_t hiaddr = 0;
1271         uint_t i;
1272 
1273         for (i = nphdrs; i > 0; i--) {
1274                 if (phdrp->p_type == PT_LOAD) {
1275                         const uintptr_t lo = phdrp->p_vaddr;
1276                         const uintptr_t hi = lo + phdrp->p_memsz;
1277 
1278                         loaddr = MIN(lo, loaddr);
1279                         hiaddr = MAX(hi, hiaddr);
1280 
1281                         /*
1282                          * save the address of the first data segment
1283                          * of a object - used for the AT_SUNW_LDDATA
1284                          * aux entry.
1285                          */
1286                         if ((lddata != NULL) && dfirst &&
1287                             (phdrp->p_flags & PF_W)) {
1288                                 *lddata = lo;
1289                                 dfirst = B_FALSE;
1290                         }
1291                 }
1292                 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
1293         }
1294 
1295         if (hiaddr <= loaddr) {
1296                 /* No non-zero PT_LOAD segment found */
1297                 return (0);
1298         }
1299 
1300         return (roundup(hiaddr - (loaddr & PAGEMASK), PAGESIZE));
1301 }
1302 
1303 /*
1304  * Read in the ELF header and program header table.
1305  * SUSV3 requires:
1306  *      ENOEXEC File format is not recognized
1307  *      EINVAL  Format recognized but execution not supported
1308  */
1309 static int
1310 getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs,
1311     uint_t *shstrndx, uint_t *nphdrs)
1312 {
1313         int error;
1314         ssize_t resid;
1315 
1316         /*
1317          * We got here by the first two bytes in ident,
1318          * now read the entire ELF header.
1319          */
1320         if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, sizeof (Ehdr),
1321             (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid)) != 0) {
1322                 return (error);
1323         }
1324 
1325         /*
1326          * Since a separate version is compiled for handling 32-bit and
1327          * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
1328          * doesn't need to be able to deal with 32-bit ELF files.
1329          */
1330         if (resid != 0 ||
1331             ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
1332             ehdr->e_ident[EI_MAG3] != ELFMAG3) {
1333                 return (ENOEXEC);
1334         }
1335 
1336         if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
1337 #if defined(_ILP32) || defined(_ELF32_COMPAT)
1338             ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
1339 #else
1340             ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
1341 #endif
1342             !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
1343             ehdr->e_flags)) {
1344                 return (EINVAL);
1345         }
1346 
1347         *nshdrs = ehdr->e_shnum;
1348         *shstrndx = ehdr->e_shstrndx;
1349         *nphdrs = ehdr->e_phnum;
1350 
1351         /*
1352          * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
1353          * to read in the section header at index zero to access the true
1354          * values for those fields.
1355          */
1356         if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
1357             *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
1358                 Shdr shdr;
1359 
1360                 if (ehdr->e_shoff == 0)
1361                         return (EINVAL);
1362 
1363                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
1364                     sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
1365                     (rlim64_t)0, credp, NULL)) != 0)
1366                         return (error);
1367 
1368                 if (*nshdrs == 0)
1369                         *nshdrs = shdr.sh_size;
1370                 if (*shstrndx == SHN_XINDEX)
1371                         *shstrndx = shdr.sh_link;
1372                 if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
1373                         *nphdrs = shdr.sh_info;
1374         }
1375 
1376         return (0);
1377 }
1378 
1379 /*
1380  * We use members through p_flags on 32-bit files and p_memsz on 64-bit files,
1381  * so e_phentsize must be at least large enough to include those members.
1382  */
1383 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1384 #define MINPHENTSZ      (offsetof(Phdr, p_flags) + \
1385                         sizeof (((Phdr *)NULL)->p_flags))
1386 #else
1387 #define MINPHENTSZ      (offsetof(Phdr, p_memsz) + \
1388                         sizeof (((Phdr *)NULL)->p_memsz))
1389 #endif
1390 
1391 static int
1392 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nphdrs,
1393     caddr_t *phbasep, size_t *phsizep)
1394 {
1395         int err;
1396 
1397         /*
1398          * Ensure that e_phentsize is large enough for required fields to be
1399          * accessible and will maintain 8-byte alignment.
1400          */
1401         if (ehdr->e_phentsize < MINPHENTSZ || (ehdr->e_phentsize & 3))
1402                 return (EINVAL);
1403 
1404         *phsizep = nphdrs * ehdr->e_phentsize;
1405 
1406         if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1407                 if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1408                         return (ENOMEM);
1409         } else {
1410                 *phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1411         }
1412 
1413         if ((err = vn_rdwr(UIO_READ, vp, *phbasep, (ssize_t)*phsizep,
1414             (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1415             credp, NULL)) != 0) {
1416                 kmem_free(*phbasep, *phsizep);
1417                 *phbasep = NULL;
1418                 return (err);
1419         }
1420 
1421         return (0);
1422 }
1423 
1424 #define MINSHDRSZ       (offsetof(Shdr, sh_entsize) + \
1425                         sizeof (((Shdr *)NULL)->sh_entsize))
1426 
1427 static int
1428 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nshdrs,
1429     uint_t shstrndx, caddr_t *shbasep, size_t *shsizep, char **shstrbasep,
1430     size_t *shstrsizep)
1431 {
1432         int err;
1433         Shdr *shdr;
1434 
1435         /*
1436          * Since we're going to be using e_shentsize to iterate down the
1437          * array of section headers, it must be 8-byte aligned or else
1438          * a we might cause a misaligned access. We use all members through
1439          * sh_entsize (on both 32- and 64-bit ELF files) so e_shentsize
1440          * must be at least large enough to include that member. The index
1441          * of the string table section must also be valid.
1442          */
1443         if (ehdr->e_shentsize < MINSHDRSZ || (ehdr->e_shentsize & 3) ||
1444             nshdrs == 0 || shstrndx >= nshdrs)
1445                 return (EINVAL);
1446 
1447         *shsizep = nshdrs * ehdr->e_shentsize;
1448 
1449         if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1450                 if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1451                         return (ENOMEM);
1452         } else {
1453                 *shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1454         }
1455 
1456         if ((err = vn_rdwr(UIO_READ, vp, *shbasep, (ssize_t)*shsizep,
1457             (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1458             credp, NULL)) != 0) {
1459                 kmem_free(*shbasep, *shsizep);
1460                 return (err);
1461         }
1462 
1463         /*
1464          * Grab the section string table.  Walking through the shdrs is
1465          * pointless if their names cannot be interrogated.
1466          */
1467         shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1468         if ((*shstrsizep = shdr->sh_size) == 0) {
1469                 kmem_free(*shbasep, *shsizep);
1470                 return (EINVAL);
1471         }
1472 
1473         if (*shstrsizep > elf_shstrtab_max) {
1474                 if ((*shstrbasep = kmem_alloc(*shstrsizep,
1475                     KM_NOSLEEP)) == NULL) {
1476                         kmem_free(*shbasep, *shsizep);
1477                         return (ENOMEM);
1478                 }
1479         } else {
1480                 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1481         }
1482 
1483         if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, (ssize_t)*shstrsizep,
1484             (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1485             credp, NULL)) != 0) {
1486                 kmem_free(*shbasep, *shsizep);
1487                 kmem_free(*shstrbasep, *shstrsizep);
1488                 return (err);
1489         }
1490 
1491         /*
1492          * Make sure the strtab is null-terminated to make sure we
1493          * don't run off the end of the table.
1494          */
1495         (*shstrbasep)[*shstrsizep - 1] = '\0';
1496 
1497         return (0);
1498 }
1499 
1500 
1501 int
1502 elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, uint_t *nphdrs,
1503     caddr_t *phbasep, size_t *phsizep)
1504 {
1505         int error;
1506         uint_t nshdrs, shstrndx;
1507 
1508         if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
1509             nphdrs)) != 0 ||
1510             (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
1511             phsizep)) != 0) {
1512                 return (error);
1513         }
1514         return (0);
1515 }
1516 
1517 
1518 static int
1519 mapelfexec(
1520         vnode_t *vp,
1521         Ehdr *ehdr,
1522         uint_t nphdrs,
1523         caddr_t phdrbase,
1524         Phdr **uphdr,
1525         Phdr **intphdr,
1526         Phdr **stphdr,
1527         Phdr **dtphdr,
1528         Phdr *dataphdrp,
1529         caddr_t *bssbase,
1530         caddr_t *brkbase,
1531         intptr_t *voffset,
1532         uintptr_t *minaddrp,
1533         size_t len,
1534         size_t *execsz,
1535         size_t *brksize)
1536 {
1537         Phdr *phdr;
1538         int error, page, prot, lastprot = 0;
1539         caddr_t addr = NULL;
1540         caddr_t minaddr = (caddr_t)UINTPTR_MAX;
1541         uint_t i;
1542         size_t zfodsz, memsz;
1543         boolean_t ptload = B_FALSE;
1544         off_t offset;
1545         const uint_t hsize = ehdr->e_phentsize;
1546         uintptr_t lastaddr = 0;
1547         extern int use_brk_lpg;
1548 
1549         if (ehdr->e_type == ET_DYN) {
1550                 caddr_t vaddr;
1551                 secflagset_t flags = 0;
1552                 /*
1553                  * Obtain the virtual address of a hole in the
1554                  * address space to map the "interpreter".
1555                  */
1556                 if (secflag_enabled(curproc, PROC_SEC_ASLR))
1557                         flags |= _MAP_RANDOMIZE;
1558 
1559                 map_addr(&addr, len, (offset_t)0, 1, flags);
1560                 if (addr == NULL)
1561                         return (ENOMEM);
1562 
1563                 /*
1564                  * Despite the fact that mmapobj(2) refuses to load them, we
1565                  * need to support executing ET_DYN objects that have a
1566                  * non-NULL p_vaddr.  When found in the wild, these objects
1567                  * are likely to be due to an old (and largely obviated) Linux
1568                  * facility, prelink(8), that rewrites shared objects to
1569                  * prefer specific (disjoint) virtual address ranges.  (Yes,
1570                  * this is putatively for performance -- and yes, it has
1571                  * limited applicability, many edge conditions and grisly
1572                  * failure modes; even for Linux, it's insane.)  As ELF
1573                  * mandates that the PT_LOAD segments be in p_vaddr order, we
1574                  * find the lowest p_vaddr by finding the first PT_LOAD
1575                  * segment.
1576                  */
1577                 phdr = (Phdr *)phdrbase;
1578                 for (i = nphdrs; i > 0; i--) {
1579                         if (phdr->p_type == PT_LOAD) {
1580                                 addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
1581                                 break;
1582                         }
1583                         phdr = (Phdr *)((caddr_t)phdr + hsize);
1584                 }
1585 
1586                 /*
1587                  * We have a non-zero p_vaddr in the first PT_LOAD segment --
1588                  * presumably because we're directly executing a prelink(8)'d
1589                  * ld-linux.so.  While we could correctly execute such an
1590                  * object without locating it at its desired p_vaddr (it is,
1591                  * after all, still relocatable), our inner antiquarian
1592                  * derives a perverse pleasure in accommodating the steampunk
1593                  * prelink(8) contraption -- goggles on!
1594                  */
1595                 if ((vaddr = addr) != NULL) {
1596                         if (as_gap(curproc->p_as, len, &addr, &len,
1597                             AH_LO, NULL) == -1 || addr != vaddr) {
1598                                 addr = NULL;
1599                         }
1600                 }
1601 
1602                 if (addr == NULL) {
1603                         /*
1604                          * We either have a NULL p_vaddr (the common case, by
1605                          * many orders of magnitude) or we have a non-NULL
1606                          * p_vaddr and we were unable to obtain the specified
1607                          * VA range (presumably because it's an illegal
1608                          * address).  Either way, obtain an address in which
1609                          * to map the interpreter.
1610                          */
1611                         map_addr(&addr, len, (offset_t)0, 1, 0);
1612                         if (addr == NULL)
1613                                 return (ENOMEM);
1614                 }
1615 
1616                 /*
1617                  * Our voffset is the difference between where we landed and
1618                  * where we wanted to be.
1619                  */
1620                 *voffset = (uintptr_t)addr - (uintptr_t)vaddr;
1621         } else {
1622                 *voffset = 0;
1623         }
1624 
1625         phdr = (Phdr *)phdrbase;
1626         for (i = nphdrs; i > 0; i--) {
1627                 switch (phdr->p_type) {
1628                 case PT_LOAD:
1629                         ptload = B_TRUE;
1630                         prot = PROT_USER;
1631                         if (phdr->p_flags & PF_R)
1632                                 prot |= PROT_READ;
1633                         if (phdr->p_flags & PF_W)
1634                                 prot |= PROT_WRITE;
1635                         if (phdr->p_flags & PF_X)
1636                                 prot |= PROT_EXEC;
1637 
1638                         addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1639 
1640                         if ((*intphdr != NULL) && uphdr != NULL &&
1641                             (*uphdr == NULL)) {
1642                                 /*
1643                                  * The PT_PHDR program header is, strictly
1644                                  * speaking, optional.  If we find that this
1645                                  * is missing, we will determine the location
1646                                  * of the program headers based on the address
1647                                  * of the lowest PT_LOAD segment (namely, this
1648                                  * one):  we subtract the p_offset to get to
1649                                  * the ELF header and then add back the program
1650                                  * header offset to get to the program headers.
1651                                  * We then cons up a Phdr that corresponds to
1652                                  * the (missing) PT_PHDR, setting the flags
1653                                  * to 0 to denote that this is artificial and
1654                                  * should (must) be freed by the caller.
1655                                  */
1656                                 Phdr *cons;
1657 
1658                                 cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
1659 
1660                                 cons->p_flags = 0;
1661                                 cons->p_type = PT_PHDR;
1662                                 cons->p_vaddr = ((uintptr_t)addr -
1663                                     phdr->p_offset) + ehdr->e_phoff;
1664 
1665                                 *uphdr = cons;
1666                         }
1667 
1668                         /*
1669                          * The ELF spec dictates that p_filesz may not be
1670                          * larger than p_memsz in PT_LOAD segments.
1671                          */
1672                         if (phdr->p_filesz > phdr->p_memsz) {
1673                                 error = EINVAL;
1674                                 goto bad;
1675                         }
1676 
1677                         /*
1678                          * Keep track of the segment with the lowest starting
1679                          * address.
1680                          */
1681                         if (addr < minaddr)
1682                                 minaddr = addr;
1683 
1684                         /*
1685                          * Segments need not correspond to page boundaries:
1686                          * they are permitted to share a page.  If two PT_LOAD
1687                          * segments share the same page, and the permissions
1688                          * of the segments differ, the behavior is historically
1689                          * that the permissions of the latter segment are used
1690                          * for the page that the two segments share.  This is
1691                          * also historically a non-issue:  binaries generated
1692                          * by most anything will make sure that two PT_LOAD
1693                          * segments with differing permissions don't actually
1694                          * share any pages.  However, there exist some crazy
1695                          * things out there (including at least an obscure
1696                          * Portuguese teaching language called G-Portugol) that
1697                          * actually do the wrong thing and expect it to work:
1698                          * they have a segment with execute permission share
1699                          * a page with a subsequent segment that does not
1700                          * have execute permissions and expect the resulting
1701                          * shared page to in fact be executable.  To accommodate
1702                          * such broken link editors, we take advantage of a
1703                          * latitude explicitly granted to the loader:  it is
1704                          * permitted to make _any_ PT_LOAD segment executable
1705                          * (provided that it is readable or writable).  If we
1706                          * see that we're sharing a page and that the previous
1707                          * page was executable, we will add execute permissions
1708                          * to our segment.
1709                          */
1710                         if (btop(lastaddr) == btop((uintptr_t)addr) &&
1711                             (phdr->p_flags & (PF_R | PF_W)) &&
1712                             (lastprot & PROT_EXEC)) {
1713                                 prot |= PROT_EXEC;
1714                         }
1715 
1716                         lastaddr = (uintptr_t)addr + phdr->p_filesz;
1717                         lastprot = prot;
1718 
1719                         zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1720 
1721                         offset = phdr->p_offset;
1722                         if (((uintptr_t)offset & PAGEOFFSET) ==
1723                             ((uintptr_t)addr & PAGEOFFSET) &&
1724                             (!(vp->v_flag & VNOMAP))) {
1725                                 page = 1;
1726                         } else {
1727                                 page = 0;
1728                         }
1729 
1730                         /*
1731                          * Set the heap pagesize for OOB when the bss size
1732                          * is known and use_brk_lpg is not 0.
1733                          */
1734                         if (brksize != NULL && use_brk_lpg &&
1735                             zfodsz != 0 && phdr == dataphdrp &&
1736                             (prot & PROT_WRITE)) {
1737                                 const size_t tlen = P2NPHASE((uintptr_t)addr +
1738                                     phdr->p_filesz, PAGESIZE);
1739 
1740                                 if (zfodsz > tlen) {
1741                                         const caddr_t taddr = addr +
1742                                             phdr->p_filesz + tlen;
1743 
1744                                         /*
1745                                          * Since a hole in the AS large enough
1746                                          * for this object as calculated by
1747                                          * elfsize() is available, we do not
1748                                          * need to fear overflow for 'taddr'.
1749                                          */
1750                                         curproc->p_brkpageszc =
1751                                             page_szc(map_pgsz(MAPPGSZ_HEAP,
1752                                             curproc, taddr, zfodsz - tlen, 0));
1753                                 }
1754                         }
1755 
1756                         if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1757                             (prot & PROT_WRITE)) {
1758                                 uint_t  szc = curproc->p_brkpageszc;
1759                                 size_t pgsz = page_get_pagesize(szc);
1760                                 caddr_t ebss = addr + phdr->p_memsz;
1761                                 /*
1762                                  * If we need extra space to keep the BSS an
1763                                  * integral number of pages in size, some of
1764                                  * that space may fall beyond p_brkbase, so we
1765                                  * need to set p_brksize to account for it
1766                                  * being (logically) part of the brk.
1767                                  */
1768                                 size_t extra_zfodsz;
1769 
1770                                 ASSERT(pgsz > PAGESIZE);
1771 
1772                                 extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1773 
1774                                 if (error = execmap(vp, addr, phdr->p_filesz,
1775                                     zfodsz + extra_zfodsz, phdr->p_offset,
1776                                     prot, page, szc))
1777                                         goto bad;
1778                                 if (brksize != NULL)
1779                                         *brksize = extra_zfodsz;
1780                         } else {
1781                                 if (error = execmap(vp, addr, phdr->p_filesz,
1782                                     zfodsz, phdr->p_offset, prot, page, 0))
1783                                         goto bad;
1784                         }
1785 
1786                         if (bssbase != NULL && addr >= *bssbase &&
1787                             phdr == dataphdrp) {
1788                                 *bssbase = addr + phdr->p_filesz;
1789                         }
1790                         if (brkbase != NULL && addr >= *brkbase) {
1791                                 *brkbase = addr + phdr->p_memsz;
1792                         }
1793 
1794                         memsz = btopr(phdr->p_memsz);
1795                         if ((*execsz + memsz) < *execsz) {
1796                                 error = ENOMEM;
1797                                 goto bad;
1798                         }
1799                         *execsz += memsz;
1800                         break;
1801 
1802                 case PT_INTERP:
1803                         /*
1804                          * The ELF specification is unequivocal about the
1805                          * PT_INTERP program header with respect to any PT_LOAD
1806                          * program header:  "If it is present, it must precede
1807                          * any loadable segment entry." Linux, however, makes
1808                          * no attempt to enforce this -- which has allowed some
1809                          * binary editing tools to get away with generating
1810                          * invalid ELF binaries in the respect that PT_INTERP
1811                          * occurs after the first PT_LOAD program header.  This
1812                          * is unfortunate (and of course, disappointing) but
1813                          * it's no worse than that: there is no reason that we
1814                          * can't process the PT_INTERP entry (if present) after
1815                          * one or more PT_LOAD entries.  We therefore
1816                          * deliberately do not check ptload here and always
1817                          * store dyphdr to be the PT_INTERP program header.
1818                          */
1819                         *intphdr = phdr;
1820                         break;
1821 
1822                 case PT_SHLIB:
1823                         *stphdr = phdr;
1824                         break;
1825 
1826                 case PT_PHDR:
1827                         if (ptload || phdr->p_flags == 0)
1828                                 goto bad;
1829 
1830                         if (uphdr != NULL)
1831                                 *uphdr = phdr;
1832 
1833                         break;
1834 
1835                 case PT_NULL:
1836                 case PT_DYNAMIC:
1837                 case PT_NOTE:
1838                         break;
1839 
1840                 case PT_SUNWDTRACE:
1841                         if (dtphdr != NULL)
1842                                 *dtphdr = phdr;
1843                         break;
1844 
1845                 default:
1846                         break;
1847                 }
1848                 phdr = (Phdr *)((caddr_t)phdr + hsize);
1849         }
1850 
1851         if (minaddrp != NULL) {
1852                 ASSERT(minaddr != (caddr_t)UINTPTR_MAX);
1853                 *minaddrp = (uintptr_t)minaddr;
1854         }
1855 
1856         if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
1857                 size_t off;
1858                 uintptr_t base = (uintptr_t)*brkbase;
1859                 uintptr_t oend = base + *brksize;
1860 
1861                 ASSERT(ISP2(aslr_max_brk_skew));
1862 
1863                 (void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1864                 base += P2PHASE(off, aslr_max_brk_skew);
1865                 base = P2ROUNDUP(base, PAGESIZE);
1866                 *brkbase = (caddr_t)base;
1867                 /*
1868                  * Above, we set *brksize to account for the possibility we
1869                  * had to grow the 'brk' in padding out the BSS to a page
1870                  * boundary.
1871                  *
1872                  * We now need to adjust that based on where we now are
1873                  * actually putting the brk.
1874                  */
1875                 if (oend > base)
1876                         *brksize = oend - base;
1877                 else
1878                         *brksize = 0;
1879         }
1880 
1881         return (0);
1882 bad:
1883         if (error == 0)
1884                 error = EINVAL;
1885         return (error);
1886 }
1887 
1888 int
1889 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1890     rlim64_t rlimit, cred_t *credp)
1891 {
1892         Note note;
1893         int error;
1894 
1895         bzero(&note, sizeof (note));
1896         bcopy("CORE", note.name, 4);
1897         note.nhdr.n_type = type;
1898         /*
1899          * The System V ABI states that n_namesz must be the length of the
1900          * string that follows the Nhdr structure including the terminating
1901          * null. The ABI also specifies that sufficient padding should be
1902          * included so that the description that follows the name string
1903          * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1904          * respectively. However, since this change was not made correctly
1905          * at the time of the 64-bit port, both 32- and 64-bit binaries
1906          * descriptions are only guaranteed to begin on a 4-byte boundary.
1907          */
1908         note.nhdr.n_namesz = 5;
1909         note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1910 
1911         if (error = core_write(vp, UIO_SYSSPACE, *offsetp, &note,
1912             sizeof (note), rlimit, credp))
1913                 return (error);
1914 
1915         *offsetp += sizeof (note);
1916 
1917         if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1918             note.nhdr.n_descsz, rlimit, credp))
1919                 return (error);
1920 
1921         *offsetp += note.nhdr.n_descsz;
1922         return (0);
1923 }
1924 
1925 
1926 /*
1927  * Copy the section data from one vnode to the section of another vnode.
1928  */
1929 static void
1930 elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst)
1931 {
1932         size_t n = src->sh_size;
1933         u_offset_t off = 0;
1934         const u_offset_t soff = src->sh_offset;
1935         const u_offset_t doff = ctx->ecc_doffset;
1936         void *buf = ctx->ecc_buf;
1937         vnode_t *dst_vp = ctx->ecc_vp;
1938         cred_t *credp = ctx->ecc_credp;
1939 
1940         /* Protect the copy loop below from overflow on the offsets */
1941         if (n > OFF_MAX || (n + soff) > OFF_MAX || (n + doff) > OFF_MAX ||
1942             (n + soff) < n || (n + doff) < n) {
1943                 dst->sh_size = 0;
1944                 dst->sh_offset = 0;
1945                 return;
1946         }
1947 
1948         while (n != 0) {
1949                 const size_t len = MIN(ctx->ecc_bufsz, n);
1950                 ssize_t resid;
1951 
1952                 if (vn_rdwr(UIO_READ, src_vp, buf, (ssize_t)len,
1953                     (offset_t)(soff + off),
1954                     UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
1955                     resid >= len || resid < 0 ||
1956                     core_write(dst_vp, UIO_SYSSPACE, (offset_t)(doff + off),
1957                     buf, len - resid, ctx->ecc_rlimit, credp) != 0) {
1958                         dst->sh_size = 0;
1959                         dst->sh_offset = 0;
1960                         return;
1961                 }
1962 
1963                 ASSERT(n >= len - resid);
1964 
1965                 n -= len - resid;
1966                 off += len - resid;
1967         }
1968 
1969         ctx->ecc_doffset += src->sh_size;
1970 }
1971 
1972 /*
1973  * Walk sections for a given ELF object, counting (or copying) those of
1974  * interest (CTF, symtab, strtab).
1975  */
1976 static uint_t
1977 elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr,
1978     Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab)
1979 {
1980         Ehdr ehdr;
1981         const core_content_t content = ctx->ecc_content;
1982         cred_t *credp = ctx->ecc_credp;
1983         Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL;
1984         uintptr_t off = 0;
1985         uint_t nshdrs, shstrndx, nphdrs, count = 0;
1986         u_offset_t *doffp = &ctx->ecc_doffset;
1987         boolean_t ctf_link = B_FALSE;
1988         caddr_t shbase;
1989         size_t shsize, shstrsize;
1990         char *shstrbase;
1991 
1992         if ((content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) == 0) {
1993                 return (0);
1994         }
1995 
1996         if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx, &nphdrs) != 0 ||
1997             getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx, &shbase, &shsize,
1998             &shstrbase, &shstrsize) != 0) {
1999                 return (0);
2000         }
2001 
2002         /* Starting at index 1 skips SHT_NULL which is expected at index 0 */
2003         off = ehdr.e_shentsize;
2004         for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) {
2005                 Shdr *shdr, *symchk = NULL, *strchk;
2006                 const char *name;
2007 
2008                 shdr = (Shdr *)(shbase + off);
2009                 if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL)
2010                         continue;
2011 
2012                 name = shstrbase + shdr->sh_name;
2013 
2014                 if (ctf == NULL &&
2015                     (content & CC_CONTENT_CTF) != 0 &&
2016                     strcmp(name, shstrtab_data[STR_CTF]) == 0) {
2017                         ctf = shdr;
2018                         if (ctf->sh_link != 0 && ctf->sh_link < nshdrs) {
2019                                 /* check linked symtab below */
2020                                 symchk = (Shdr *)(shbase +
2021                                     shdr->sh_link * ehdr.e_shentsize);
2022                                 ctf_link = B_TRUE;
2023                         } else {
2024                                 continue;
2025                         }
2026                 } else if (symtab == NULL &&
2027                     (content & CC_CONTENT_SYMTAB) != 0 &&
2028                     strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) {
2029                         symchk = shdr;
2030                 } else {
2031                         continue;
2032                 }
2033 
2034                 ASSERT(symchk != NULL);
2035                 if ((symchk->sh_type != SHT_DYNSYM &&
2036                     symchk->sh_type != SHT_SYMTAB) ||
2037                     symchk->sh_link == 0 || symchk->sh_link >= nshdrs) {
2038                         ctf_link = B_FALSE;
2039                         continue;
2040                 }
2041                 strchk = (Shdr *)(shbase + symchk->sh_link * ehdr.e_shentsize);
2042                 if (strchk->sh_type != SHT_STRTAB) {
2043                         ctf_link = B_FALSE;
2044                         continue;
2045                 }
2046                 symtab = symchk;
2047                 strtab = strchk;
2048 
2049                 if (symtab != NULL && ctf != NULL) {
2050                         /* No other shdrs are of interest at this point */
2051                         break;
2052                 }
2053         }
2054 
2055         if (ctf != NULL)
2056                 count += 1;
2057         if (symtab != NULL)
2058                 count += 2;
2059         if (v == NULL || count == 0 || count > remain) {
2060                 count = MIN(count, remain);
2061                 goto done;
2062         }
2063 
2064         /* output CTF section */
2065         if (ctf != NULL) {
2066                 elf_ctx_resize_scratch(ctx, ctf->sh_size);
2067 
2068                 v[idx].sh_name = shstrtab_ndx(shstrtab, STR_CTF);
2069                 v[idx].sh_addr = (Addr)(uintptr_t)saddr;
2070                 v[idx].sh_type = SHT_PROGBITS;
2071                 v[idx].sh_addralign = 4;
2072                 *doffp = roundup(*doffp, v[idx].sh_addralign);
2073                 v[idx].sh_offset = *doffp;
2074                 v[idx].sh_size = ctf->sh_size;
2075 
2076                 if (ctf_link) {
2077                         /*
2078                          * The linked symtab (and strtab) will be output
2079                          * immediately after this CTF section.  Its shdr index
2080                          * directly follows this one.
2081                          */
2082                         v[idx].sh_link = idx + 1;
2083                         ASSERT(symtab != NULL);
2084                 } else {
2085                         v[idx].sh_link = 0;
2086                 }
2087                 elf_copy_scn(ctx, ctf, mvp, &v[idx]);
2088                 idx++;
2089         }
2090 
2091         /* output SYMTAB/STRTAB sections */
2092         if (symtab != NULL) {
2093                 uint_t symtab_name, strtab_name;
2094 
2095                 elf_ctx_resize_scratch(ctx,
2096                     MAX(symtab->sh_size, strtab->sh_size));
2097 
2098                 if (symtab->sh_type == SHT_DYNSYM) {
2099                         symtab_name = shstrtab_ndx(shstrtab, STR_DYNSYM);
2100                         strtab_name = shstrtab_ndx(shstrtab, STR_DYNSTR);
2101                 } else {
2102                         symtab_name = shstrtab_ndx(shstrtab, STR_SYMTAB);
2103                         strtab_name = shstrtab_ndx(shstrtab, STR_STRTAB);
2104                 }
2105 
2106                 v[idx].sh_name = symtab_name;
2107                 v[idx].sh_type = symtab->sh_type;
2108                 v[idx].sh_addr = symtab->sh_addr;
2109                 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
2110                         v[idx].sh_addr += (Addr)(uintptr_t)saddr;
2111                 v[idx].sh_addralign = symtab->sh_addralign;
2112                 *doffp = roundup(*doffp, v[idx].sh_addralign);
2113                 v[idx].sh_offset = *doffp;
2114                 v[idx].sh_size = symtab->sh_size;
2115                 v[idx].sh_link = idx + 1;
2116                 v[idx].sh_entsize = symtab->sh_entsize;
2117                 v[idx].sh_info = symtab->sh_info;
2118 
2119                 elf_copy_scn(ctx, symtab, mvp, &v[idx]);
2120                 idx++;
2121 
2122                 v[idx].sh_name = strtab_name;
2123                 v[idx].sh_type = SHT_STRTAB;
2124                 v[idx].sh_flags = SHF_STRINGS;
2125                 v[idx].sh_addr = strtab->sh_addr;
2126                 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
2127                         v[idx].sh_addr += (Addr)(uintptr_t)saddr;
2128                 v[idx].sh_addralign = strtab->sh_addralign;
2129                 *doffp = roundup(*doffp, v[idx].sh_addralign);
2130                 v[idx].sh_offset = *doffp;
2131                 v[idx].sh_size = strtab->sh_size;
2132 
2133                 elf_copy_scn(ctx, strtab, mvp, &v[idx]);
2134                 idx++;
2135         }
2136 
2137 done:
2138         kmem_free(shstrbase, shstrsize);
2139         kmem_free(shbase, shsize);
2140         return (count);
2141 }
2142 
2143 /*
2144  * Walk mappings in process address space, examining those which correspond to
2145  * loaded objects.  It is called twice from elfcore: Once to simply count
2146  * relevant sections, and again later to copy those sections once an adequate
2147  * buffer has been allocated for the shdr details.
2148  */
2149 static int
2150 elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp)
2151 {
2152         vnode_t *lastvp = NULL;
2153         struct seg *seg;
2154         uint_t idx = 0, remain;
2155         shstrtab_t shstrtab;
2156         struct as *as = ctx->ecc_p->p_as;
2157         int error = 0;
2158 
2159         ASSERT(AS_WRITE_HELD(as));
2160 
2161         if (v != NULL) {
2162                 ASSERT(nv != 0);
2163 
2164                 shstrtab_init(&shstrtab);
2165                 remain = nv;
2166         } else {
2167                 ASSERT(nv == 0);
2168 
2169                 /*
2170                  * The shdrs are being counted, rather than outputting them
2171                  * into a buffer.  Leave room for two entries: the SHT_NULL at
2172                  * index 0 and the shstrtab at the end.
2173                  */
2174                 remain = UINT_MAX - 2;
2175         }
2176 
2177         /* Per the ELF spec, shdr index 0 is reserved. */
2178         idx = 1;
2179         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2180                 vnode_t *mvp;
2181                 void *tmp = NULL;
2182                 caddr_t saddr = seg->s_base, naddr, eaddr;
2183                 size_t segsize;
2184                 uint_t count, prot;
2185 
2186                 /*
2187                  * Since we're just looking for text segments of load
2188                  * objects, we only care about the protection bits; we don't
2189                  * care about the actual size of the segment so we use the
2190                  * reserved size. If the segment's size is zero, there's
2191                  * something fishy going on so we ignore this segment.
2192                  */
2193                 if (seg->s_ops != &segvn_ops ||
2194                     SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2195                     mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
2196                     (segsize = pr_getsegsize(seg, 1)) == 0)
2197                         continue;
2198 
2199                 eaddr = saddr + segsize;
2200                 prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
2201                 pr_getprot_done(&tmp);
2202 
2203                 /*
2204                  * Skip this segment unless the protection bits look like
2205                  * what we'd expect for a text segment.
2206                  */
2207                 if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
2208                         continue;
2209 
2210                 count = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain,
2211                     &shstrtab);
2212 
2213                 ASSERT(count <= remain);
2214                 ASSERT(v == NULL || (idx + count) < nv);
2215 
2216                 remain -= count;
2217                 idx += count;
2218                 lastvp = mvp;
2219         }
2220 
2221         if (v == NULL) {
2222                 if (idx == 1) {
2223                         *nshdrsp = 0;
2224                 } else {
2225                         /* Include room for the shrstrtab at the end */
2226                         *nshdrsp = idx + 1;
2227                 }
2228                 return (0);
2229         }
2230 
2231         if (idx != nv - 1) {
2232                 cmn_err(CE_WARN, "elfcore: core dump failed for "
2233                     "process %d; address space is changing",
2234                     ctx->ecc_p->p_pid);
2235                 return (EIO);
2236         }
2237 
2238         v[idx].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB);
2239         v[idx].sh_size = shstrtab_size(&shstrtab);
2240         v[idx].sh_addralign = 1;
2241         v[idx].sh_offset = ctx->ecc_doffset;
2242         v[idx].sh_flags = SHF_STRINGS;
2243         v[idx].sh_type = SHT_STRTAB;
2244 
2245         elf_ctx_resize_scratch(ctx, v[idx].sh_size);
2246         VERIFY3U(ctx->ecc_bufsz, >=, v[idx].sh_size);
2247         shstrtab_dump(&shstrtab, ctx->ecc_buf);
2248 
2249         error = core_write(ctx->ecc_vp, UIO_SYSSPACE, ctx->ecc_doffset,
2250             ctx->ecc_buf, v[idx].sh_size, ctx->ecc_rlimit, ctx->ecc_credp);
2251         if (error == 0) {
2252                 ctx->ecc_doffset += v[idx].sh_size;
2253         }
2254 
2255         return (error);
2256 }
2257 
2258 int
2259 elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
2260     core_content_t content)
2261 {
2262         u_offset_t poffset, soffset, doffset;
2263         int error;
2264         uint_t i, nphdrs, nshdrs;
2265         struct seg *seg;
2266         struct as *as = p->p_as;
2267         void *bigwad;
2268         size_t bigsize, phdrsz, shdrsz;
2269         Ehdr *ehdr;
2270         Phdr *phdr;
2271         Shdr shdr0;
2272         caddr_t brkbase, stkbase;
2273         size_t brksize, stksize;
2274         boolean_t overflowed = B_FALSE, retried = B_FALSE;
2275         klwp_t *lwp = ttolwp(curthread);
2276         elf_core_ctx_t ctx = {
2277                 .ecc_vp = vp,
2278                 .ecc_p = p,
2279                 .ecc_credp = credp,
2280                 .ecc_rlimit = rlimit,
2281                 .ecc_content = content,
2282                 .ecc_doffset = 0,
2283                 .ecc_buf = NULL,
2284                 .ecc_bufsz = 0
2285         };
2286 
2287 top:
2288         /*
2289          * Make sure we have everything we need (registers, etc.).
2290          * All other lwps have already stopped and are in an orderly state.
2291          */
2292         ASSERT(p == ttoproc(curthread));
2293         prstop(0, 0);
2294 
2295         AS_LOCK_ENTER(as, RW_WRITER);
2296         nphdrs = prnsegs(as, 0) + 2;            /* two CORE note sections */
2297 
2298         /*
2299          * Count the number of section headers we're going to need.
2300          */
2301         nshdrs = 0;
2302         if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) {
2303                 VERIFY0(elf_process_scns(&ctx, NULL, 0, &nshdrs));
2304         }
2305         AS_LOCK_EXIT(as);
2306 
2307         /*
2308          * The core file contents may require zero section headers, but if
2309          * we overflow the 16 bits allotted to the program header count in
2310          * the ELF header, we'll need that program header at index zero.
2311          */
2312         if (nshdrs == 0 && nphdrs >= PN_XNUM) {
2313                 nshdrs = 1;
2314         }
2315 
2316         /*
2317          * Allocate a buffer which is sized adequately to hold the ehdr, phdrs
2318          * or shdrs needed to produce the core file.  It is used for the three
2319          * tasks sequentially, not simultaneously, so it does not need space
2320          * for all three data at once, only the largest one.
2321          */
2322         VERIFY(nphdrs >= 2);
2323         phdrsz = nphdrs * sizeof (Phdr);
2324         shdrsz = nshdrs * sizeof (Shdr);
2325         bigsize = MAX(sizeof (Ehdr), MAX(phdrsz, shdrsz));
2326         bigwad = kmem_alloc(bigsize, KM_SLEEP);
2327 
2328         ehdr = (Ehdr *)bigwad;
2329         bzero(ehdr, sizeof (*ehdr));
2330 
2331         ehdr->e_ident[EI_MAG0] = ELFMAG0;
2332         ehdr->e_ident[EI_MAG1] = ELFMAG1;
2333         ehdr->e_ident[EI_MAG2] = ELFMAG2;
2334         ehdr->e_ident[EI_MAG3] = ELFMAG3;
2335         ehdr->e_ident[EI_CLASS] = ELFCLASS;
2336         ehdr->e_type = ET_CORE;
2337 
2338 #if !defined(_LP64) || defined(_ELF32_COMPAT)
2339 
2340 #if defined(__sparc)
2341         ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2342         ehdr->e_machine = EM_SPARC;
2343 #elif defined(__i386) || defined(__i386_COMPAT)
2344         ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2345         ehdr->e_machine = EM_386;
2346 #else
2347 #error "no recognized machine type is defined"
2348 #endif
2349 
2350 #else   /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2351 
2352 #if defined(__sparc)
2353         ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2354         ehdr->e_machine = EM_SPARCV9;
2355 #elif defined(__amd64)
2356         ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2357         ehdr->e_machine = EM_AMD64;
2358 #else
2359 #error "no recognized 64-bit machine type is defined"
2360 #endif
2361 
2362 #endif  /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2363 
2364         poffset = sizeof (Ehdr);
2365         soffset = sizeof (Ehdr) + phdrsz;
2366         doffset = sizeof (Ehdr) + phdrsz + shdrsz;
2367         bzero(&shdr0, sizeof (shdr0));
2368 
2369         /*
2370          * If the count of program headers or section headers or the index
2371          * of the section string table can't fit in the mere 16 bits
2372          * shortsightedly allotted to them in the ELF header, we use the
2373          * extended formats and put the real values in the section header
2374          * as index 0.
2375          */
2376         if (nphdrs >= PN_XNUM) {
2377                 ehdr->e_phnum = PN_XNUM;
2378                 shdr0.sh_info = nphdrs;
2379         } else {
2380                 ehdr->e_phnum = (unsigned short)nphdrs;
2381         }
2382 
2383         if (nshdrs > 0) {
2384                 if (nshdrs >= SHN_LORESERVE) {
2385                         ehdr->e_shnum = 0;
2386                         shdr0.sh_size = nshdrs;
2387                 } else {
2388                         ehdr->e_shnum = (unsigned short)nshdrs;
2389                 }
2390 
2391                 if (nshdrs - 1 >= SHN_LORESERVE) {
2392                         ehdr->e_shstrndx = SHN_XINDEX;
2393                         shdr0.sh_link = nshdrs - 1;
2394                 } else {
2395                         ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
2396                 }
2397 
2398                 ehdr->e_shoff = soffset;
2399                 ehdr->e_shentsize = sizeof (Shdr);
2400         }
2401 
2402         ehdr->e_version = EV_CURRENT;
2403         ehdr->e_ehsize = sizeof (Ehdr);
2404         ehdr->e_phoff = poffset;
2405         ehdr->e_phentsize = sizeof (Phdr);
2406 
2407         if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
2408             sizeof (Ehdr), rlimit, credp)) {
2409                 goto done;
2410         }
2411 
2412         phdr = (Phdr *)bigwad;
2413         bzero(phdr, phdrsz);
2414 
2415         setup_old_note_header(&phdr[0], p);
2416         phdr[0].p_offset = doffset = roundup(doffset, sizeof (Word));
2417         doffset += phdr[0].p_filesz;
2418 
2419         setup_note_header(&phdr[1], p);
2420         phdr[1].p_offset = doffset = roundup(doffset, sizeof (Word));
2421         doffset += phdr[1].p_filesz;
2422 
2423         mutex_enter(&p->p_lock);
2424 
2425         brkbase = p->p_brkbase;
2426         brksize = p->p_brksize;
2427 
2428         stkbase = p->p_usrstack - p->p_stksize;
2429         stksize = p->p_stksize;
2430 
2431         mutex_exit(&p->p_lock);
2432 
2433         AS_LOCK_ENTER(as, RW_WRITER);
2434         i = 2;
2435         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2436                 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2437                 caddr_t saddr, naddr;
2438                 void *tmp = NULL;
2439                 extern struct seg_ops segspt_shmops;
2440 
2441                 if ((seg->s_flags & S_HOLE) != 0) {
2442                         continue;
2443                 }
2444 
2445                 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2446                         uint_t prot;
2447                         size_t size;
2448                         int type;
2449                         vnode_t *mvp;
2450 
2451                         prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2452                         prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
2453                         if ((size = (size_t)(naddr - saddr)) == 0) {
2454                                 ASSERT(tmp == NULL);
2455                                 continue;
2456                         } else if (i == nphdrs) {
2457                                 pr_getprot_done(&tmp);
2458                                 overflowed = B_TRUE;
2459                                 break;
2460                         }
2461                         phdr[i].p_type = PT_LOAD;
2462                         phdr[i].p_vaddr = (Addr)(uintptr_t)saddr;
2463                         phdr[i].p_memsz = size;
2464                         if (prot & PROT_READ)
2465                                 phdr[i].p_flags |= PF_R;
2466                         if (prot & PROT_WRITE)
2467                                 phdr[i].p_flags |= PF_W;
2468                         if (prot & PROT_EXEC)
2469                                 phdr[i].p_flags |= PF_X;
2470 
2471                         /*
2472                          * Figure out which mappings to include in the core.
2473                          */
2474                         type = SEGOP_GETTYPE(seg, saddr);
2475 
2476                         if (saddr == stkbase && size == stksize) {
2477                                 if (!(content & CC_CONTENT_STACK))
2478                                         goto exclude;
2479 
2480                         } else if (saddr == brkbase && size == brksize) {
2481                                 if (!(content & CC_CONTENT_HEAP))
2482                                         goto exclude;
2483 
2484                         } else if (seg->s_ops == &segspt_shmops) {
2485                                 if (type & MAP_NORESERVE) {
2486                                         if (!(content & CC_CONTENT_DISM))
2487                                                 goto exclude;
2488                                 } else {
2489                                         if (!(content & CC_CONTENT_ISM))
2490                                                 goto exclude;
2491                                 }
2492 
2493                         } else if (seg->s_ops != &segvn_ops) {
2494                                 goto exclude;
2495 
2496                         } else if (type & MAP_SHARED) {
2497                                 if (shmgetid(p, saddr) != SHMID_NONE) {
2498                                         if (!(content & CC_CONTENT_SHM))
2499                                                 goto exclude;
2500 
2501                                 } else if (SEGOP_GETVP(seg, seg->s_base,
2502                                     &mvp) != 0 || mvp == NULL ||
2503                                     mvp->v_type != VREG) {
2504                                         if (!(content & CC_CONTENT_SHANON))
2505                                                 goto exclude;
2506 
2507                                 } else {
2508                                         if (!(content & CC_CONTENT_SHFILE))
2509                                                 goto exclude;
2510                                 }
2511 
2512                         } else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2513                             mvp == NULL || mvp->v_type != VREG) {
2514                                 if (!(content & CC_CONTENT_ANON))
2515                                         goto exclude;
2516 
2517                         } else if (prot == (PROT_READ | PROT_EXEC)) {
2518                                 if (!(content & CC_CONTENT_TEXT))
2519                                         goto exclude;
2520 
2521                         } else if (prot == PROT_READ) {
2522                                 if (!(content & CC_CONTENT_RODATA))
2523                                         goto exclude;
2524 
2525                         } else {
2526                                 if (!(content & CC_CONTENT_DATA))
2527                                         goto exclude;
2528                         }
2529 
2530                         doffset = roundup(doffset, sizeof (Word));
2531                         phdr[i].p_offset = doffset;
2532                         phdr[i].p_filesz = size;
2533                         doffset += size;
2534 exclude:
2535                         i++;
2536                 }
2537                 VERIFY(tmp == NULL);
2538                 if (overflowed)
2539                         break;
2540         }
2541         AS_LOCK_EXIT(as);
2542 
2543         if (overflowed || i != nphdrs) {
2544                 if (!retried) {
2545                         retried = B_TRUE;
2546                         overflowed = B_FALSE;
2547                         kmem_free(bigwad, bigsize);
2548                         goto top;
2549                 }
2550                 cmn_err(CE_WARN, "elfcore: core dump failed for "
2551                     "process %d; address space is changing", p->p_pid);
2552                 error = EIO;
2553                 goto done;
2554         }
2555 
2556         if ((error = core_write(vp, UIO_SYSSPACE, poffset,
2557             phdr, phdrsz, rlimit, credp)) != 0) {
2558                 goto done;
2559         }
2560 
2561         if ((error = write_old_elfnotes(p, sig, vp, phdr[0].p_offset, rlimit,
2562             credp)) != 0) {
2563                 goto done;
2564         }
2565         if ((error = write_elfnotes(p, sig, vp, phdr[1].p_offset, rlimit,
2566             credp, content)) != 0) {
2567                 goto done;
2568         }
2569 
2570         for (i = 2; i < nphdrs; i++) {
2571                 prkillinfo_t killinfo;
2572                 sigqueue_t *sq;
2573                 int sig, j;
2574 
2575                 if (phdr[i].p_filesz == 0)
2576                         continue;
2577 
2578                 /*
2579                  * If dumping out this segment fails, rather than failing
2580                  * the core dump entirely, we reset the size of the mapping
2581                  * to zero to indicate that the data is absent from the core
2582                  * file and or in the PF_SUNW_FAILURE flag to differentiate
2583                  * this from mappings that were excluded due to the core file
2584                  * content settings.
2585                  */
2586                 if ((error = core_seg(p, vp, phdr[i].p_offset,
2587                     (caddr_t)(uintptr_t)phdr[i].p_vaddr, phdr[i].p_filesz,
2588                     rlimit, credp)) == 0) {
2589                         continue;
2590                 }
2591 
2592                 if ((sig = lwp->lwp_cursig) == 0) {
2593                         /*
2594                          * We failed due to something other than a signal.
2595                          * Since the space reserved for the segment is now
2596                          * unused, we stash the errno in the first four
2597                          * bytes. This undocumented interface will let us
2598                          * understand the nature of the failure.
2599                          */
2600                         (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2601                             &error, sizeof (error), rlimit, credp);
2602 
2603                         phdr[i].p_filesz = 0;
2604                         phdr[i].p_flags |= PF_SUNW_FAILURE;
2605                         if ((error = core_write(vp, UIO_SYSSPACE,
2606                             poffset + sizeof (Phdr) * i, &phdr[i],
2607                             sizeof (Phdr), rlimit, credp)) != 0)
2608                                 goto done;
2609 
2610                         continue;
2611                 }
2612 
2613                 /*
2614                  * We took a signal.  We want to abort the dump entirely, but
2615                  * we also want to indicate what failed and why.  We therefore
2616                  * use the space reserved for the first failing segment to
2617                  * write our error (which, for purposes of compatability with
2618                  * older core dump readers, we set to EINTR) followed by any
2619                  * siginfo associated with the signal.
2620                  */
2621                 bzero(&killinfo, sizeof (killinfo));
2622                 killinfo.prk_error = EINTR;
2623 
2624                 sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;
2625 
2626                 if (sq != NULL) {
2627                         bcopy(&sq->sq_info, &killinfo.prk_info,
2628                             sizeof (sq->sq_info));
2629                 } else {
2630                         killinfo.prk_info.si_signo = lwp->lwp_cursig;
2631                         killinfo.prk_info.si_code = SI_NOINFO;
2632                 }
2633 
2634 #if (defined(_SYSCALL32_IMPL) || defined(_LP64))
2635                 /*
2636                  * If this is a 32-bit process, we need to translate from the
2637                  * native siginfo to the 32-bit variant.  (Core readers must
2638                  * always have the same data model as their target or must
2639                  * be aware of -- and compensate for -- data model differences.)
2640                  */
2641                 if (curproc->p_model == DATAMODEL_ILP32) {
2642                         siginfo32_t si32;
2643 
2644                         siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
2645                         bcopy(&si32, &killinfo.prk_info, sizeof (si32));
2646                 }
2647 #endif
2648 
2649                 (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2650                     &killinfo, sizeof (killinfo), rlimit, credp);
2651 
2652                 /*
2653                  * For the segment on which we took the signal, indicate that
2654                  * its data now refers to a siginfo.
2655                  */
2656                 phdr[i].p_filesz = 0;
2657                 phdr[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
2658                     PF_SUNW_SIGINFO;
2659 
2660                 /*
2661                  * And for every other segment, indicate that its absence
2662                  * is due to a signal.
2663                  */
2664                 for (j = i + 1; j < nphdrs; j++) {
2665                         phdr[j].p_filesz = 0;
2666                         phdr[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
2667                 }
2668 
2669                 /*
2670                  * Finally, write out our modified program headers.
2671                  */
2672                 if ((error = core_write(vp, UIO_SYSSPACE,
2673                     poffset + sizeof (Phdr) * i, &phdr[i],
2674                     sizeof (Phdr) * (nphdrs - i), rlimit, credp)) != 0) {
2675                         goto done;
2676                 }
2677 
2678                 break;
2679         }
2680 
2681         if (nshdrs > 0) {
2682                 Shdr *shdr = (Shdr *)bigwad;
2683 
2684                 bzero(shdr, shdrsz);
2685                 if (nshdrs > 1) {
2686                         ctx.ecc_doffset = doffset;
2687                         AS_LOCK_ENTER(as, RW_WRITER);
2688                         error = elf_process_scns(&ctx, shdr, nshdrs, NULL);
2689                         AS_LOCK_EXIT(as);
2690                         if (error != 0) {
2691                                 goto done;
2692                         }
2693                 }
2694                 /* Copy any extended format data destined for the first shdr */
2695                 bcopy(&shdr0, shdr, sizeof (shdr0));
2696 
2697                 error = core_write(vp, UIO_SYSSPACE, soffset, shdr, shdrsz,
2698                     rlimit, credp);
2699         }
2700 
2701 done:
2702         if (ctx.ecc_bufsz != 0) {
2703                 kmem_free(ctx.ecc_buf, ctx.ecc_bufsz);
2704         }
2705         kmem_free(bigwad, bigsize);
2706         return (error);
2707 }
2708 
2709 #ifndef _ELF32_COMPAT
2710 
2711 static struct execsw esw = {
2712 #ifdef  _LP64
2713         elf64magicstr,
2714 #else   /* _LP64 */
2715         elf32magicstr,
2716 #endif  /* _LP64 */
2717         0,
2718         5,
2719         elfexec,
2720         elfcore
2721 };
2722 
2723 static struct modlexec modlexec = {
2724         &mod_execops, "exec module for elf", &esw
2725 };
2726 
2727 #ifdef  _LP64
2728 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
2729                         intpdata_t *idatap, int level, size_t *execsz,
2730                         int setid, caddr_t exec_file, cred_t *cred,
2731                         int *brand_action);
2732 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
2733                         rlim64_t rlimit, int sig, core_content_t content);
2734 
2735 static struct execsw esw32 = {
2736         elf32magicstr,
2737         0,
2738         5,
2739         elf32exec,
2740         elf32core
2741 };
2742 
2743 static struct modlexec modlexec32 = {
2744         &mod_execops, "32-bit exec module for elf", &esw32
2745 };
2746 #endif  /* _LP64 */
2747 
2748 static struct modlinkage modlinkage = {
2749         MODREV_1,
2750         (void *)&modlexec,
2751 #ifdef  _LP64
2752         (void *)&modlexec32,
2753 #endif  /* _LP64 */
2754         NULL
2755 };
2756 
2757 int
2758 _init(void)
2759 {
2760         return (mod_install(&modlinkage));
2761 }
2762 
2763 int
2764 _fini(void)
2765 {
2766         return (mod_remove(&modlinkage));
2767 }
2768 
2769 int
2770 _info(struct modinfo *modinfop)
2771 {
2772         return (mod_info(&modlinkage, modinfop));
2773 }
2774 
2775 #endif  /* !_ELF32_COMPAT */