Print this page
OS-5015 PT_INTERP headers should be permitted after PT_LOAD headers
OS-5451 comm page should not break i86xpv
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
OS-5293 lx brand: prelink(8)'d binaries core dump before main()
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5072 lxbrand support PT_GNU_STACK
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5202 Support AT_SECURE & AT_*ID in LX
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4824 Unlike Linux, nested interpreters don't work
(LX changes only, the rest were upstreamed...)
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>
OS-3735 modstubs MAXNARG is too low.
OS-3733 Verify b_native_exec exists before calling it
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4119 lxbrand panic when running native perl inside lx zone
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4128 programs that lack PT_PHDR are not properly loaded
OS-4141 freeing phdrs induces bad kmem_free() in elfexec()
backout OS-4141: needs more work
backout OS-4128: needs more work
OS-4141 freeing phdrs induces bad kmem_free() in elfexec()
OS-4128 programs that lack PT_PHDR are not properly loaded
OS-3696 lx brand: G-Portugol programs core dump
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-3517 lx brand: branded zones don't interpret .interp section
OS-3405 lx brand: socket() fails for PF_INET6
OS-3382 lxbrand 64bit gettimeofday depends on vsyscall or vdso
OS-3280 need a way to specify the root of a native system in the lx brand
OS-3279 lx brand should allow delegated datasets
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-2949 add support for AT_RANDOM aux vector entry
OS-2877 lx_librtld_db falls to load due to NULL DT_DEBUG


   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 /*
  29  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
  30  */
  31 
  32 #include <sys/types.h>
  33 #include <sys/param.h>
  34 #include <sys/thread.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/signal.h>
  37 #include <sys/cred.h>
  38 #include <sys/user.h>
  39 #include <sys/errno.h>
  40 #include <sys/vnode.h>
  41 #include <sys/mman.h>
  42 #include <sys/kmem.h>
  43 #include <sys/proc.h>
  44 #include <sys/pathname.h>
  45 #include <sys/cmn_err.h>
  46 #include <sys/systm.h>
  47 #include <sys/elf.h>
  48 #include <sys/vmsystm.h>
  49 #include <sys/debug.h>
  50 #include <sys/auxv.h>
  51 #include <sys/exec.h>
  52 #include <sys/prsystm.h>
  53 #include <vm/as.h>
  54 #include <vm/rm.h>
  55 #include <vm/seg.h>
  56 #include <vm/seg_vn.h>
  57 #include <sys/modctl.h>
  58 #include <sys/systeminfo.h>
  59 #include <sys/vmparam.h>
  60 #include <sys/machelf.h>
  61 #include <sys/shm_impl.h>
  62 #include <sys/archsystm.h>
  63 #include <sys/fasttrap.h>
  64 #include <sys/brand.h>
  65 #include "elf_impl.h"
  66 #include <sys/sdt.h>
  67 #include <sys/siginfo.h>
  68 





  69 extern int at_flags;
  70 
  71 #define ORIGIN_STR      "ORIGIN"
  72 #define ORIGIN_STR_SIZE 6
  73 
  74 static int getelfhead(vnode_t *, cred_t *, Ehdr *, int *, int *, int *);
  75 static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, int, caddr_t *,
  76     ssize_t *);
  77 static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, int, int, caddr_t *,
  78     ssize_t *, caddr_t *, ssize_t *);
  79 static size_t elfsize(Ehdr *, int, caddr_t, uintptr_t *);
  80 static int mapelfexec(vnode_t *, Ehdr *, int, caddr_t,
  81     Phdr **, Phdr **, Phdr **, Phdr **, Phdr *,
  82     caddr_t *, caddr_t *, intptr_t *, intptr_t *, size_t, long *, size_t *);
  83 
  84 typedef enum {
  85         STR_CTF,
  86         STR_SYMTAB,
  87         STR_DYNSYM,
  88         STR_STRTAB,


 146 
 147 static int
 148 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
 149 {
 150         ASSERT(phdrp->p_type == PT_SUNWDTRACE);
 151 
 152         /*
 153          * See the comment in fasttrap.h for information on how to safely
 154          * update this program header.
 155          */
 156         if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
 157             (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
 158                 return (-1);
 159 
 160         args->thrptr = phdrp->p_vaddr + base;
 161 
 162         return (0);
 163 }
 164 
 165 /*
 166  * Map in the executable pointed to by vp. Returns 0 on success.




 167  */
 168 int
 169 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 170     intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase,
 171     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap)
 172 {
 173         size_t          len;
 174         struct vattr    vat;
 175         caddr_t         phdrbase = NULL;
 176         ssize_t         phdrsize;
 177         int             nshdrs, shstrndx, nphdrs;
 178         int             error = 0;
 179         Phdr            *uphdr = NULL;
 180         Phdr            *junk = NULL;
 181         Phdr            *dynphdr = NULL;
 182         Phdr            *dtrphdr = NULL;

 183         uintptr_t       lddata;
 184         long            execsz;
 185         intptr_t        minaddr;
 186 
 187         if (lddatap != NULL)
 188                 *lddatap = NULL;
 189 



 190         if (error = execpermissions(vp, &vat, args)) {
 191                 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
 192                 return (error);
 193         }
 194 
 195         if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
 196             &nphdrs)) != 0 ||
 197             (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
 198             &phdrsize)) != 0) {
 199                 uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
 200                 return (error);
 201         }
 202 
 203         if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
 204                 uprintf("%s: Nothing to load in %s", exec_file, args->pathname);
 205                 kmem_free(phdrbase, phdrsize);
 206                 return (ENOEXEC);
 207         }
 208         if (lddatap != NULL)
 209                 *lddatap = lddata;
 210 
 211         if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
 212             &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
 213             len, &execsz, brksize)) {
 214                 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);


 215                 kmem_free(phdrbase, phdrsize);
 216                 return (error);
 217         }
 218 



 219         /*
 220          * Inform our caller if the executable needs an interpreter.
 221          */
 222         *interp = (dynphdr == NULL) ? 0 : 1;

 223 




















 224         /*
 225          * If this is a statically linked executable, voffset should indicate
 226          * the address of the executable itself (it normally holds the address
 227          * of the interpreter).
 228          */
 229         if (ehdr->e_type == ET_EXEC && *interp == 0)
 230                 *voffset = minaddr;
 231 











 232         if (uphdr != NULL) {
 233                 *uphdr_vaddr = uphdr->p_vaddr;



























 234         } else {
 235                 *uphdr_vaddr = (Addr)-1;
 236         }
 237 
 238         kmem_free(phdrbase, phdrsize);
 239         return (error);
 240 }
 241 
 242 /*ARGSUSED*/
 243 int
 244 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 245     int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
 246     int brand_action)
 247 {
 248         caddr_t         phdrbase = NULL;
 249         caddr_t         bssbase = 0;
 250         caddr_t         brkbase = 0;
 251         size_t          brksize = 0;
 252         ssize_t         dlnsize;
 253         aux_entry_t     *aux;
 254         int             error;
 255         ssize_t         resid;
 256         int             fd = -1;
 257         intptr_t        voffset;
 258         Phdr            *dyphdr = NULL;
 259         Phdr            *stphdr = NULL;
 260         Phdr            *uphdr = NULL;
 261         Phdr            *junk = NULL;
 262         size_t          len;
 263         ssize_t         phdrsize;
 264         int             postfixsize = 0;
 265         int             i, hsize;
 266         Phdr            *phdrp;
 267         Phdr            *dataphdrp = NULL;
 268         Phdr            *dtrphdr;
 269         Phdr            *capphdr = NULL;
 270         Cap             *cap = NULL;
 271         ssize_t         capsize;
 272         int             hasu = 0;
 273         int             hasauxv = 0;
 274         int             hasdy = 0;
 275         int             branded = 0;

 276 
 277         struct proc *p = ttoproc(curthread);
 278         struct user *up = PTOU(p);
 279         struct bigwad {
 280                 Ehdr    ehdr;
 281                 aux_entry_t     elfargs[__KERN_NAUXV_IMPL];
 282                 char            dl_name[MAXPATHLEN];
 283                 char            pathbuf[MAXPATHLEN];
 284                 struct vattr    vattr;
 285                 struct execenv  exenv;
 286         } *bigwad;      /* kmem_alloc this behemoth so we don't blow stack */
 287         Ehdr            *ehdrp;
 288         int             nshdrs, shstrndx, nphdrs;
 289         char            *dlnp;
 290         char            *pathbufp;
 291         rlim64_t        limit;
 292         rlim64_t        roundlimit;
 293 
 294         ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
 295 


 310         /*
 311          * Prevent executing an ELF file that has no entry point.
 312          */
 313         if (ehdrp->e_entry == 0) {
 314                 uprintf("%s: Bad entry point\n", exec_file);
 315                 goto bad;
 316         }
 317 
 318         /*
 319          * Put data model that we're exec-ing to into the args passed to
 320          * exec_args(), so it will know what it is copying to on new stack.
 321          * Now that we know whether we are exec-ing a 32-bit or 64-bit
 322          * executable, we can set execsz with the appropriate NCARGS.
 323          */
 324 #ifdef  _LP64
 325         if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
 326                 args->to_model = DATAMODEL_ILP32;
 327                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
 328         } else {
 329                 args->to_model = DATAMODEL_LP64;

 330                 args->stk_prot &= ~PROT_EXEC;

 331 #if defined(__i386) || defined(__amd64)
 332                 args->dat_prot &= ~PROT_EXEC;
 333 #endif
 334                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
 335         }
 336 #else   /* _LP64 */
 337         args->to_model = DATAMODEL_ILP32;
 338         *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
 339 #endif  /* _LP64 */
 340 
 341         /*
 342          * We delay invoking the brand callback until we've figured out
 343          * what kind of elf binary we're trying to run, 32-bit or 64-bit.
 344          * We do this because now the brand library can just check
 345          * args->to_model to see if the target is 32-bit or 64-bit without
 346          * having to duplicate all the code above.
 347          *














 348          * The level checks associated with brand handling below are used to
 349          * prevent a loop since the brand elfexec function typically comes back
 350          * through this function. We must check <= here since the nested
 351          * handling in the #! interpreter code will increment the level before
 352          * calling gexec to run the final elfexec interpreter.
 353          */












 354         if ((level <= INTP_MAXDEPTH) &&
 355             (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 356                 error = BROP(p)->b_elfexec(vp, uap, args,
 357                     idatap, level + 1, execsz, setid, exec_file, cred,
 358                     brand_action);
 359                 goto out;
 360         }
 361 
 362         /*
 363          * Determine aux size now so that stack can be built
 364          * in one shot (except actual copyout of aux image),
 365          * determine any non-default stack protections,
 366          * and still have this code be machine independent.
 367          */
 368         hsize = ehdrp->e_phentsize;
 369         phdrp = (Phdr *)phdrbase;
 370         for (i = nphdrs; i > 0; i--) {
 371                 switch (phdrp->p_type) {
 372                 case PT_INTERP:
 373                         hasauxv = hasdy = 1;
 374                         break;
 375                 case PT_PHDR:


 406                         args->dat_prot |= PROT_READ;
 407                 if (dataphdrp->p_flags & PF_W)
 408                         args->dat_prot |= PROT_WRITE;
 409                 if (dataphdrp->p_flags & PF_X)
 410                         args->dat_prot |= PROT_EXEC;
 411         }
 412 
 413         /*
 414          * If an auxvector will be required - reserve the space for
 415          * it now.  This may be increased by exec_args if there are
 416          * ISA-specific types (included in __KERN_NAUXV_IMPL).
 417          */
 418         if (hasauxv) {
 419                 /*
 420                  * If an AUX vector is being built - the base AUX
 421                  * entries are:
 422                  *
 423                  *      AT_BASE
 424                  *      AT_FLAGS
 425                  *      AT_PAGESZ

 426                  *      AT_SUN_AUXFLAGS
 427                  *      AT_SUN_HWCAP
 428                  *      AT_SUN_HWCAP2
 429                  *      AT_SUN_PLATFORM (added in stk_copyout)
 430                  *      AT_SUN_EXECNAME (added in stk_copyout)
 431                  *      AT_NULL
 432                  *
 433                  * total == 9
 434                  */
 435                 if (hasdy && hasu) {
 436                         /*
 437                          * Has PT_INTERP & PT_PHDR - the auxvectors that
 438                          * will be built are:
 439                          *
 440                          *      AT_PHDR
 441                          *      AT_PHENT
 442                          *      AT_PHNUM
 443                          *      AT_ENTRY
 444                          *      AT_LDDATA
 445                          *
 446                          * total = 5
 447                          */
 448                         args->auxsize = (9 + 5) * sizeof (aux_entry_t);
 449                 } else if (hasdy) {
 450                         /*
 451                          * Has PT_INTERP but no PT_PHDR
 452                          *
 453                          *      AT_EXECFD
 454                          *      AT_LDDATA
 455                          *
 456                          * total = 2
 457                          */
 458                         args->auxsize = (9 + 2) * sizeof (aux_entry_t);
 459                 } else {
 460                         args->auxsize = 9 * sizeof (aux_entry_t);
 461                 }
 462         } else {
 463                 args->auxsize = 0;
 464         }
 465 
 466         /*
 467          * If this binary is using an emulator, we need to add an
 468          * AT_SUN_EMULATOR aux entry.
 469          */
 470         if (args->emulator != NULL)
 471                 args->auxsize += sizeof (aux_entry_t);
 472 
 473         if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 474                 branded = 1;
 475                 /*
 476                  * We will be adding 4 entries to the aux vectors.  One for
 477                  * the brandname and 3 for the brand specific aux vectors.
 478                  */






















 479                 args->auxsize += 4 * sizeof (aux_entry_t);
 480         }
 481 









 482         /* Hardware/Software capabilities */
 483         if (capphdr != NULL &&
 484             (capsize = capphdr->p_filesz) > 0 &&
 485             capsize <= 16 * sizeof (*cap)) {
 486                 int ncaps = capsize / sizeof (*cap);
 487                 Cap *cp;
 488 
 489                 cap = kmem_alloc(capsize, KM_SLEEP);
 490                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
 491                     capsize, (offset_t)capphdr->p_offset,
 492                     UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
 493                         uprintf("%s: Cannot read capabilities section\n",
 494                             exec_file);
 495                         goto out;
 496                 }
 497                 for (cp = cap; cp < cap + ncaps; cp++) {
 498                         if (cp->c_tag == CA_SUNW_SF_1 &&
 499                             (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
 500                                 if (args->to_model == DATAMODEL_LP64)
 501                                         args->addr32 = 1;


 517                 goto out;
 518         }
 519         /* we're single threaded after this point */
 520 
 521         /*
 522          * If this is an ET_DYN executable (shared object),
 523          * determine its memory size so that mapelfexec() can load it.
 524          */
 525         if (ehdrp->e_type == ET_DYN)
 526                 len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
 527         else
 528                 len = 0;
 529 
 530         dtrphdr = NULL;
 531 
 532         if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &dyphdr,
 533             &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
 534             len, execsz, &brksize)) != 0)
 535                 goto bad;
 536 








 537         if (uphdr != NULL && dyphdr == NULL)
 538                 goto bad;
 539 
 540         if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 541                 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
 542                 goto bad;
 543         }
 544 
 545         if (dyphdr != NULL) {
 546                 size_t          len;
 547                 uintptr_t       lddata;
 548                 char            *p;
 549                 struct vnode    *nvp;
 550 
 551                 dlnsize = dyphdr->p_filesz;
 552 
 553                 if (dlnsize > MAXPATHLEN || dlnsize <= 0)
 554                         goto bad;
 555 





 556                 /*
 557                  * Read in "interpreter" pathname.
 558                  */
 559                 if ((error = vn_rdwr(UIO_READ, vp, dlnp, dyphdr->p_filesz,
 560                     (offset_t)dyphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
 561                     CRED(), &resid)) != 0) {
 562                         uprintf("%s: Cannot obtain interpreter pathname\n",
 563                             exec_file);
 564                         goto bad;
 565                 }
 566 
 567                 if (resid != 0 || dlnp[dlnsize - 1] != '\0')
 568                         goto bad;
 569 
 570                 /*
 571                  * Search for '$ORIGIN' token in interpreter path.
 572                  * If found, expand it.
 573                  */
 574                 for (p = dlnp; p = strchr(p, '$'); ) {
 575                         uint_t  len, curlen;
 576                         char    *_ptr;
 577 
 578                         if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
 579                                 continue;
 580 
 581                         /*


 686                     &phdrsize)) != 0) {
 687                         VN_RELE(nvp);
 688                         uprintf("%s: Cannot read %s\n", exec_file, dlnp);
 689                         goto bad;
 690                 }
 691 
 692                 /*
 693                  * Determine memory size of the "interpreter's" loadable
 694                  * sections.  This size is then used to obtain the virtual
 695                  * address of a hole, in the user's address space, large
 696                  * enough to map the "interpreter".
 697                  */
 698                 if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
 699                         VN_RELE(nvp);
 700                         uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
 701                         goto bad;
 702                 }
 703 
 704                 dtrphdr = NULL;
 705 
 706                 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk,
 707                     &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
 708                     execsz, NULL);

 709                 if (error || junk != NULL) {
 710                         VN_RELE(nvp);
 711                         uprintf("%s: Cannot map %s\n", exec_file, dlnp);
 712                         goto bad;
 713                 }
 714 
 715                 /*
 716                  * We use the DTrace program header to initialize the
 717                  * architecture-specific user per-LWP location. The dtrace
 718                  * fasttrap provider requires ready access to per-LWP scratch
 719                  * space. We assume that there is only one such program header
 720                  * in the interpreter.
 721                  */
 722                 if (dtrphdr != NULL &&
 723                     dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 724                         VN_RELE(nvp);
 725                         uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
 726                         goto bad;
 727                 }
 728 
 729                 VN_RELE(nvp);
 730                 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
 731         }
 732 
 733         if (hasauxv) {
 734                 int auxf = AF_SUN_HWCAPVERIFY;

 735                 /*
 736                  * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via
 737                  * exec_args()
 738                  */
 739                 ADDAUX(aux, AT_BASE, voffset)
 740                 ADDAUX(aux, AT_FLAGS, at_flags)
 741                 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
 742                 /*
 743                  * Linker flags. (security)
 744                  * p_flag not yet set at this time.
 745                  * We rely on gexec() to provide us with the information.
 746                  * If the application is set-uid but this is not reflected
 747                  * in a mismatch between real/effective uids/gids, then
 748                  * don't treat this as a set-uid exec.  So we care about
 749                  * the EXECSETID_UGIDS flag but not the ...SETID flag.
 750                  */
 751                 if ((setid &= ~EXECSETID_SETID) != 0)
 752                         auxf |= AF_SUN_SETUGID;
 753 
 754                 /*
 755                  * If we're running a native process from within a branded
 756                  * zone under pfexec then we clear the AF_SUN_SETUGID flag so
 757                  * that the native ld.so.1 is able to link with the native
 758                  * libraries instead of using the brand libraries that are
 759                  * installed in the zone.  We only do this for processes
 760                  * which we trust because we see they are already running
 761                  * under pfexec (where uid != euid).  This prevents a
 762                  * malicious user within the zone from crafting a wrapper to
 763                  * run native suid commands with unsecure libraries interposed.
 764                  */
 765                 if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
 766                     (setid &= ~EXECSETID_SETID) != 0))
 767                         auxf &= ~AF_SUN_SETUGID;
 768 
 769                 /*
 770                  * Record the user addr of the auxflags aux vector entry
 771                  * since brands may optionally want to manipulate this field.
 772                  */
 773                 args->auxp_auxflags =
 774                     (char *)((char *)args->stackend +
 775                     ((char *)&aux->a_type -
 776                     (char *)bigwad->elfargs));
 777                 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);

 778                 /*











 779                  * Hardware capability flag word (performance hints)
 780                  * Used for choosing faster library routines.
 781                  * (Potentially different between 32-bit and 64-bit ABIs)
 782                  */
 783 #if defined(_LP64)
 784                 if (args->to_model == DATAMODEL_NATIVE) {
 785                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
 786                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
 787                 } else {
 788                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
 789                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
 790                 }
 791 #else
 792                 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
 793                 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
 794 #endif
 795                 if (branded) {
 796                         /*
 797                          * Reserve space for the brand-private aux vectors,
 798                          * and record the user addr of that space.
 799                          */
 800                         args->auxp_brand =
 801                             (char *)((char *)args->stackend +
 802                             ((char *)&aux->a_type -
 803                             (char *)bigwad->elfargs));
 804                         ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
 805                         ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
 806                         ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)

 807                 }
 808 












 809                 ADDAUX(aux, AT_NULL, 0)




 810                 postfixsize = (char *)aux - (char *)bigwad->elfargs;
 811 
 812                 /*
 813                  * We make assumptions above when we determine how many aux
 814                  * vector entries we will be adding. However, if we have an
 815                  * invalid elf file, it is possible that mapelfexec might
 816                  * behave differently (but not return an error), in which case
 817                  * the number of aux entries we actually add will be different.
 818                  * We detect that now and error out.
 819                  */
 820                 if (postfixsize != args->auxsize) {
 821                         DTRACE_PROBE2(elfexec_badaux, int, postfixsize,
 822                             int, args->auxsize);
 823                         goto bad;
 824                 }
 825                 ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
 826         }
 827 
 828         /*
 829          * For the 64-bit kernel, the limit is big enough that rounding it up
 830          * to a page can overflow the 64-bit limit, so we check for btopr()
 831          * overflowing here by comparing it with the unrounded limit in pages.
 832          * If it hasn't overflowed, compare the exec size with the rounded up
 833          * limit in pages.  Otherwise, just compare with the unrounded limit.
 834          */
 835         limit = btop(p->p_vmem_ctl);
 836         roundlimit = btopr(p->p_vmem_ctl);
 837         if ((roundlimit > limit && *execsz > roundlimit) ||
 838             (roundlimit < limit && *execsz > limit)) {
 839                 mutex_enter(&p->p_lock);
 840                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
 841                     RCA_SAFE);
 842                 mutex_exit(&p->p_lock);
 843                 error = ENOMEM;
 844                 goto bad;
 845         }
 846 
 847         bzero(up->u_auxv, sizeof (up->u_auxv));

 848         if (postfixsize) {
 849                 int num_auxv;
 850 
 851                 /*
 852                  * Copy the aux vector to the user stack.
 853                  */
 854                 error = execpoststack(args, bigwad->elfargs, postfixsize);
 855                 if (error)
 856                         goto bad;
 857 
 858                 /*
 859                  * Copy auxv to the process's user structure for use by /proc.
 860                  * If this is a branded process, the brand's exec routine will
 861                  * copy its private entries to the user structure later. It
 862                  * relies on the fact that the blank entries are at the end.
 863                  */
 864                 num_auxv = postfixsize / sizeof (aux_entry_t);
 865                 ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
 866                 aux = bigwad->elfargs;
 867                 for (i = 0; i < num_auxv; i++) {


 894                         bigwad->exenv.ex_bssbase = bssbase;
 895                         bigwad->exenv.ex_brksize = brksize;
 896                 }
 897                 bigwad->exenv.ex_magic = elfmagic;
 898                 bigwad->exenv.ex_vp = vp;
 899                 setexecenv(&bigwad->exenv);
 900         }
 901 
 902         ASSERT(error == 0);
 903         goto out;
 904 
 905 bad:
 906         if (fd != -1)           /* did we open the a.out yet */
 907                 (void) execclose(fd);
 908 
 909         psignal(p, SIGKILL);
 910 
 911         if (error == 0)
 912                 error = ENOEXEC;
 913 out:


 914         if (phdrbase != NULL)
 915                 kmem_free(phdrbase, phdrsize);
 916         if (cap != NULL)
 917                 kmem_free(cap, capsize);
 918         kmem_free(bigwad, sizeof (struct bigwad));
 919         return (error);
 920 }
 921 
 922 /*
 923  * Compute the memory size requirement for the ELF file.
 924  */
 925 static size_t
 926 elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata)
 927 {
 928         size_t  len;
 929         Phdr    *phdrp = (Phdr *)phdrbase;
 930         int     hsize = ehdrp->e_phentsize;
 931         int     first = 1;
 932         int     dfirst = 1;     /* first data segment */
 933         uintptr_t loaddr = 0;


1160                 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1161         }
1162 
1163         if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, *shstrsizep,
1164             (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1165             credp, &resid)) != 0) {
1166                 kmem_free(*shbasep, *shsizep);
1167                 kmem_free(*shstrbasep, *shstrsizep);
1168                 return (err);
1169         }
1170 
1171         /*
1172          * Make sure the strtab is null-terminated to make sure we
1173          * don't run off the end of the table.
1174          */
1175         (*shstrbasep)[*shstrsizep - 1] = '\0';
1176 
1177         return (0);
1178 }
1179 























1180 static int
1181 mapelfexec(
1182         vnode_t *vp,
1183         Ehdr *ehdr,
1184         int nphdrs,
1185         caddr_t phdrbase,
1186         Phdr **uphdr,
1187         Phdr **dyphdr,
1188         Phdr **stphdr,
1189         Phdr **dtphdr,
1190         Phdr *dataphdrp,
1191         caddr_t *bssbase,
1192         caddr_t *brkbase,
1193         intptr_t *voffset,
1194         intptr_t *minaddr,
1195         size_t len,
1196         long *execsz,
1197         size_t *brksize)
1198 {
1199         Phdr *phdr;
1200         int i, prot, error;
1201         caddr_t addr = NULL;
1202         size_t zfodsz;
1203         int ptload = 0;
1204         int page;
1205         off_t offset;
1206         int hsize = ehdr->e_phentsize;
1207         caddr_t mintmp = (caddr_t)-1;

1208         extern int use_brk_lpg;
1209 
1210         if (ehdr->e_type == ET_DYN) {
1211                 /*
1212                  * Obtain the virtual address of a hole in the
1213                  * address space to map the "interpreter".
1214                  */
1215                 map_addr(&addr, len, (offset_t)0, 1, 0);
1216                 if (addr == NULL)
1217                         return (ENOMEM);
1218                 *voffset = (intptr_t)addr;
1219 
1220                 /*
1221                  * Calculate the minimum vaddr so it can be subtracted out.
1222                  * According to the ELF specification, since PT_LOAD sections
1223                  * must be sorted by increasing p_vaddr values, this is
1224                  * guaranteed to be the first PT_LOAD section.








1225                  */
1226                 phdr = (Phdr *)phdrbase;
1227                 for (i = nphdrs; i > 0; i--) {
1228                         if (phdr->p_type == PT_LOAD) {
1229                                 *voffset -= (uintptr_t)phdr->p_vaddr;
1230                                 break;
1231                         }
1232                         phdr = (Phdr *)((caddr_t)phdr + hsize);
1233                 }
1234 



































1235         } else {
1236                 *voffset = 0;
1237         }

1238         phdr = (Phdr *)phdrbase;
1239         for (i = nphdrs; i > 0; i--) {
1240                 switch (phdr->p_type) {
1241                 case PT_LOAD:
1242                         if ((*dyphdr != NULL) && (*uphdr == NULL))
1243                                 return (0);
1244 
1245                         ptload = 1;
1246                         prot = PROT_USER;
1247                         if (phdr->p_flags & PF_R)
1248                                 prot |= PROT_READ;
1249                         if (phdr->p_flags & PF_W)
1250                                 prot |= PROT_WRITE;
1251                         if (phdr->p_flags & PF_X)
1252                                 prot |= PROT_EXEC;
1253 
1254                         addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1255 


1256                         /*


























1257                          * Keep track of the segment with the lowest starting
1258                          * address.
1259                          */
1260                         if (addr < mintmp)
1261                                 mintmp = addr;
1262 



































1263                         zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1264 
1265                         offset = phdr->p_offset;
1266                         if (((uintptr_t)offset & PAGEOFFSET) ==
1267                             ((uintptr_t)addr & PAGEOFFSET) &&
1268                             (!(vp->v_flag & VNOMAP))) {
1269                                 page = 1;
1270                         } else {
1271                                 page = 0;
1272                         }
1273 
1274                         /*
1275                          * Set the heap pagesize for OOB when the bss size
1276                          * is known and use_brk_lpg is not 0.
1277                          */
1278                         if (brksize != NULL && use_brk_lpg &&
1279                             zfodsz != 0 && phdr == dataphdrp &&
1280                             (prot & PROT_WRITE)) {
1281                                 size_t tlen = P2NPHASE((uintptr_t)addr +
1282                                     phdr->p_filesz, PAGESIZE);


1307                                 if (brksize != NULL)
1308                                         *brksize = extra_zfodsz;
1309                         } else {
1310                                 if (error = execmap(vp, addr, phdr->p_filesz,
1311                                     zfodsz, phdr->p_offset, prot, page, 0))
1312                                         goto bad;
1313                         }
1314 
1315                         if (bssbase != NULL && addr >= *bssbase &&
1316                             phdr == dataphdrp) {
1317                                 *bssbase = addr + phdr->p_filesz;
1318                         }
1319                         if (brkbase != NULL && addr >= *brkbase) {
1320                                 *brkbase = addr + phdr->p_memsz;
1321                         }
1322 
1323                         *execsz += btopr(phdr->p_memsz);
1324                         break;
1325 
1326                 case PT_INTERP:
1327                         if (ptload)
1328                                 goto bad;














1329                         *dyphdr = phdr;
1330                         break;
1331 
1332                 case PT_SHLIB:
1333                         *stphdr = phdr;
1334                         break;
1335 
1336                 case PT_PHDR:
1337                         if (ptload)
1338                                 goto bad;


1339                         *uphdr = phdr;

1340                         break;
1341 
1342                 case PT_NULL:
1343                 case PT_DYNAMIC:
1344                 case PT_NOTE:
1345                         break;
1346 
1347                 case PT_SUNWDTRACE:
1348                         if (dtphdr != NULL)
1349                                 *dtphdr = phdr;
1350                         break;
1351 
1352                 default:
1353                         break;
1354                 }
1355                 phdr = (Phdr *)((caddr_t)phdr + hsize);
1356         }
1357 
1358         if (minaddr != NULL) {
1359                 ASSERT(mintmp != (caddr_t)-1);


2168 static struct execsw esw = {
2169 #ifdef  _LP64
2170         elf64magicstr,
2171 #else   /* _LP64 */
2172         elf32magicstr,
2173 #endif  /* _LP64 */
2174         0,
2175         5,
2176         elfexec,
2177         elfcore
2178 };
2179 
2180 static struct modlexec modlexec = {
2181         &mod_execops, "exec module for elf", &esw
2182 };
2183 
2184 #ifdef  _LP64
2185 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
2186                         intpdata_t *idatap, int level, long *execsz,
2187                         int setid, caddr_t exec_file, cred_t *cred,
2188                         int brand_action);
2189 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
2190                         rlim64_t rlimit, int sig, core_content_t content);
2191 
2192 static struct execsw esw32 = {
2193         elf32magicstr,
2194         0,
2195         5,
2196         elf32exec,
2197         elf32core
2198 };
2199 
2200 static struct modlexec modlexec32 = {
2201         &mod_execops, "32-bit exec module for elf", &esw32
2202 };
2203 #endif  /* _LP64 */
2204 
2205 static struct modlinkage modlinkage = {
2206         MODREV_1,
2207         (void *)&modlexec,
2208 #ifdef  _LP64




   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 /*
  29  * Copyright 2016 Joyent, Inc.
  30  */
  31 
  32 #include <sys/types.h>
  33 #include <sys/param.h>
  34 #include <sys/thread.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/signal.h>
  37 #include <sys/cred.h>
  38 #include <sys/user.h>
  39 #include <sys/errno.h>
  40 #include <sys/vnode.h>
  41 #include <sys/mman.h>
  42 #include <sys/kmem.h>
  43 #include <sys/proc.h>
  44 #include <sys/pathname.h>
  45 #include <sys/cmn_err.h>
  46 #include <sys/systm.h>
  47 #include <sys/elf.h>
  48 #include <sys/vmsystm.h>
  49 #include <sys/debug.h>
  50 #include <sys/auxv.h>
  51 #include <sys/exec.h>
  52 #include <sys/prsystm.h>
  53 #include <vm/as.h>
  54 #include <vm/rm.h>
  55 #include <vm/seg.h>
  56 #include <vm/seg_vn.h>
  57 #include <sys/modctl.h>
  58 #include <sys/systeminfo.h>
  59 #include <sys/vmparam.h>
  60 #include <sys/machelf.h>
  61 #include <sys/shm_impl.h>
  62 #include <sys/archsystm.h>
  63 #include <sys/fasttrap.h>
  64 #include <sys/brand.h>
  65 #include "elf_impl.h"
  66 #include <sys/sdt.h>
  67 #include <sys/siginfo.h>
  68 
  69 #if defined(__x86)
  70 #include <sys/comm_page_util.h>
  71 #endif /* defined(__x86) */
  72 
  73 
  74 extern int at_flags;
  75 
  76 #define ORIGIN_STR      "ORIGIN"
  77 #define ORIGIN_STR_SIZE 6
  78 
  79 static int getelfhead(vnode_t *, cred_t *, Ehdr *, int *, int *, int *);
  80 static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, int, caddr_t *,
  81     ssize_t *);
  82 static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, int, int, caddr_t *,
  83     ssize_t *, caddr_t *, ssize_t *);
  84 static size_t elfsize(Ehdr *, int, caddr_t, uintptr_t *);
  85 static int mapelfexec(vnode_t *, Ehdr *, int, caddr_t,
  86     Phdr **, Phdr **, Phdr **, Phdr **, Phdr *,
  87     caddr_t *, caddr_t *, intptr_t *, intptr_t *, size_t, long *, size_t *);
  88 
  89 typedef enum {
  90         STR_CTF,
  91         STR_SYMTAB,
  92         STR_DYNSYM,
  93         STR_STRTAB,


 151 
 152 static int
 153 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
 154 {
 155         ASSERT(phdrp->p_type == PT_SUNWDTRACE);
 156 
 157         /*
 158          * See the comment in fasttrap.h for information on how to safely
 159          * update this program header.
 160          */
 161         if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
 162             (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
 163                 return (-1);
 164 
 165         args->thrptr = phdrp->p_vaddr + base;
 166 
 167         return (0);
 168 }
 169 
 170 /*
 171  * Map in the executable pointed to by vp. Returns 0 on success.  Note that
 172  * this function currently has the maximum number of arguments allowed by
 173  * modstubs on x86 (MAXNARG)!  Do _not_ add to this function signature without
 174  * adding to MAXNARG.  (Better yet, do not add to this monster of a function
 175  * signature!)
 176  */
 177 int
 178 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 179     intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
 180     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
 181 {
 182         size_t          len;
 183         struct vattr    vat;
 184         caddr_t         phdrbase = NULL;
 185         ssize_t         phdrsize;
 186         int             nshdrs, shstrndx, nphdrs;
 187         int             error = 0;
 188         Phdr            *uphdr = NULL;
 189         Phdr            *junk = NULL;
 190         Phdr            *dynphdr = NULL;
 191         Phdr            *dtrphdr = NULL;
 192         char            *interp = NULL;
 193         uintptr_t       lddata;
 194         long            execsz;
 195         intptr_t        minaddr;
 196 
 197         if (lddatap != NULL)
 198                 *lddatap = NULL;
 199 
 200         if (minaddrp != NULL)
 201                 *minaddrp = NULL;
 202 
 203         if (error = execpermissions(vp, &vat, args)) {
 204                 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
 205                 return (error);
 206         }
 207 
 208         if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
 209             &nphdrs)) != 0 ||
 210             (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
 211             &phdrsize)) != 0) {
 212                 uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
 213                 return (error);
 214         }
 215 
 216         if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
 217                 uprintf("%s: Nothing to load in %s", exec_file, args->pathname);
 218                 kmem_free(phdrbase, phdrsize);
 219                 return (ENOEXEC);
 220         }
 221         if (lddatap != NULL)
 222                 *lddatap = lddata;
 223 
 224         if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
 225             &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
 226             len, &execsz, brksize)) {
 227                 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
 228                 if (uphdr != NULL && uphdr->p_flags == 0)
 229                         kmem_free(uphdr, sizeof (Phdr));
 230                 kmem_free(phdrbase, phdrsize);
 231                 return (error);
 232         }
 233 
 234         if (minaddrp != NULL)
 235                 *minaddrp = minaddr;
 236 
 237         /*
 238          * If the executable requires an interpreter, determine its name.
 239          */
 240         if (dynphdr != NULL) {
 241                 ssize_t resid;
 242 
 243                 if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
 244                         uprintf("%s: Invalid interpreter\n", exec_file);
 245                         kmem_free(phdrbase, phdrsize);
 246                         return (ENOEXEC);
 247                 }
 248 
 249                 interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 250 
 251                 if ((error = vn_rdwr(UIO_READ, vp, interp, dynphdr->p_filesz,
 252                     (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
 253                     (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
 254                     interp[dynphdr->p_filesz - 1] != '\0') {
 255                         uprintf("%s: Cannot obtain interpreter pathname\n",
 256                             exec_file);
 257                         kmem_free(interp, MAXPATHLEN);
 258                         kmem_free(phdrbase, phdrsize);
 259                         return (error != 0 ? error : ENOEXEC);
 260                 }
 261         }
 262 
 263         /*
 264          * If this is a statically linked executable, voffset should indicate
 265          * the address of the executable itself (it normally holds the address
 266          * of the interpreter).
 267          */
 268         if (ehdr->e_type == ET_EXEC && interp == NULL)
 269                 *voffset = minaddr;
 270 
 271         /*
 272          * If the caller has asked for the interpreter name, return it (it's
 273          * up to the caller to free it); if the caller hasn't asked for it,
 274          * free it ourselves.
 275          */
 276         if (interpp != NULL) {
 277                 *interpp = interp;
 278         } else if (interp != NULL) {
 279                 kmem_free(interp, MAXPATHLEN);
 280         }
 281 
 282         if (uphdr != NULL) {
 283                 *uphdr_vaddr = uphdr->p_vaddr;
 284 
 285                 if (uphdr->p_flags == 0)
 286                         kmem_free(uphdr, sizeof (Phdr));
 287         } else if (ehdr->e_type == ET_DYN) {
 288                 /*
 289                  * If we don't have a uphdr, we'll apply the logic found
 290                  * in mapelfexec() and use the p_vaddr of the first PT_LOAD
 291                  * section as the base address of the object.
 292                  */
 293                 Phdr *phdr = (Phdr *)phdrbase;
 294                 int i, hsize = ehdr->e_phentsize;
 295 
 296                 for (i = nphdrs; i > 0; i--) {
 297                         if (phdr->p_type == PT_LOAD) {
 298                                 *uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
 299                                     ehdr->e_phoff;
 300                                 break;
 301                         }
 302 
 303                         phdr = (Phdr *)((caddr_t)phdr + hsize);
 304                 }
 305 
 306                 /*
 307                  * If we don't have a PT_LOAD segment, we should have returned
 308                  * ENOEXEC when elfsize() returned 0, above.
 309                  */
 310                 VERIFY(i > 0);
 311         } else {
 312                 *uphdr_vaddr = (Addr)-1;
 313         }
 314 
 315         kmem_free(phdrbase, phdrsize);
 316         return (error);
 317 }
 318 
 319 /*ARGSUSED*/
 320 int
 321 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 322     int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
 323     int *brand_action)
 324 {
 325         caddr_t         phdrbase = NULL;
 326         caddr_t         bssbase = 0;
 327         caddr_t         brkbase = 0;
 328         size_t          brksize = 0;
 329         ssize_t         dlnsize, nsize = 0;
 330         aux_entry_t     *aux;
 331         int             error;
 332         ssize_t         resid;
 333         int             fd = -1;
 334         intptr_t        voffset;
 335         Phdr            *dyphdr = NULL;
 336         Phdr            *stphdr = NULL;
 337         Phdr            *uphdr = NULL;
 338         Phdr            *junk = NULL;
 339         size_t          len;
 340         ssize_t         phdrsize;
 341         int             postfixsize = 0;
 342         int             i, hsize;
 343         Phdr            *phdrp;
 344         Phdr            *dataphdrp = NULL;
 345         Phdr            *dtrphdr;
 346         Phdr            *capphdr = NULL;
 347         Cap             *cap = NULL;
 348         ssize_t         capsize;
 349         int             hasu = 0;
 350         int             hasauxv = 0;
 351         int             hasdy = 0;
 352         int             branded = 0;
 353         int             dynuphdr = 0;
 354 
 355         struct proc *p = ttoproc(curthread);
 356         struct user *up = PTOU(p);
 357         struct bigwad {
 358                 Ehdr    ehdr;
 359                 aux_entry_t     elfargs[__KERN_NAUXV_IMPL];
 360                 char            dl_name[MAXPATHLEN];
 361                 char            pathbuf[MAXPATHLEN];
 362                 struct vattr    vattr;
 363                 struct execenv  exenv;
 364         } *bigwad;      /* kmem_alloc this behemoth so we don't blow stack */
 365         Ehdr            *ehdrp;
 366         int             nshdrs, shstrndx, nphdrs;
 367         char            *dlnp;
 368         char            *pathbufp;
 369         rlim64_t        limit;
 370         rlim64_t        roundlimit;
 371 
 372         ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
 373 


 388         /*
 389          * Prevent executing an ELF file that has no entry point.
 390          */
 391         if (ehdrp->e_entry == 0) {
 392                 uprintf("%s: Bad entry point\n", exec_file);
 393                 goto bad;
 394         }
 395 
 396         /*
 397          * Put data model that we're exec-ing to into the args passed to
 398          * exec_args(), so it will know what it is copying to on new stack.
 399          * Now that we know whether we are exec-ing a 32-bit or 64-bit
 400          * executable, we can set execsz with the appropriate NCARGS.
 401          */
 402 #ifdef  _LP64
 403         if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
 404                 args->to_model = DATAMODEL_ILP32;
 405                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
 406         } else {
 407                 args->to_model = DATAMODEL_LP64;
 408                 if (!args->stk_prot_override) {
 409                         args->stk_prot &= ~PROT_EXEC;
 410                 }
 411 #if defined(__i386) || defined(__amd64)
 412                 args->dat_prot &= ~PROT_EXEC;
 413 #endif
 414                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
 415         }
 416 #else   /* _LP64 */
 417         args->to_model = DATAMODEL_ILP32;
 418         *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
 419 #endif  /* _LP64 */
 420 
 421         /*
 422          * We delay invoking the brand callback until we've figured out what
 423          * kind of elf binary we're trying to run, 32-bit or 64-bit.  We do this
 424          * because now the brand library can just check args->to_model to see if
 425          * the target is 32-bit or 64-bit without having to duplicate all the
 426          * code above.
 427          *
 428          * We also give the brand a chance to indicate that based on the ELF
 429          * OSABI of the target binary it should become unbranded and optionally
 430          * indicate that it should be treated as existing in a specific prefix.
 431          *
 432          * Note that if a brand opts to go down this route it does not actually
 433          * end up being debranded. In other words, future programs that exec
 434          * will still be considered for branding unless this escape hatch is
 435          * used. Consider the case of lx brand for example. If a user runs
 436          * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable
 437          * of DTrace that's in /native will take this escape hatch and be run
 438          * and interpreted using the normal system call table; however, the
 439          * execution of a non-illumos binary in the form of /bin/ls will still
 440          * be branded and be subject to all of the normal actions of the brand.
 441          *
 442          * The level checks associated with brand handling below are used to
 443          * prevent a loop since the brand elfexec function typically comes back
 444          * through this function. We must check <= here since the nested
 445          * handling in the #! interpreter code will increment the level before
 446          * calling gexec to run the final elfexec interpreter.
 447          */
 448         if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
 449             (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
 450                 if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
 451                     &args->brand_nroot) == B_TRUE) {
 452                         ASSERT(ehdrp->e_ident[EI_OSABI]);
 453                         *brand_action = EBA_NATIVE;
 454                         /* Add one for the trailing '/' in the path */
 455                         if (args->brand_nroot != NULL)
 456                                 nsize = strlen(args->brand_nroot) + 1;
 457                 }
 458         }
 459 
 460         if ((level <= INTP_MAXDEPTH) &&
 461             (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 462                 error = BROP(p)->b_elfexec(vp, uap, args,
 463                     idatap, level + 1, execsz, setid, exec_file, cred,
 464                     brand_action);
 465                 goto out;
 466         }
 467 
 468         /*
 469          * Determine aux size now so that stack can be built
 470          * in one shot (except actual copyout of aux image),
 471          * determine any non-default stack protections,
 472          * and still have this code be machine independent.
 473          */
 474         hsize = ehdrp->e_phentsize;
 475         phdrp = (Phdr *)phdrbase;
 476         for (i = nphdrs; i > 0; i--) {
 477                 switch (phdrp->p_type) {
 478                 case PT_INTERP:
 479                         hasauxv = hasdy = 1;
 480                         break;
 481                 case PT_PHDR:


 512                         args->dat_prot |= PROT_READ;
 513                 if (dataphdrp->p_flags & PF_W)
 514                         args->dat_prot |= PROT_WRITE;
 515                 if (dataphdrp->p_flags & PF_X)
 516                         args->dat_prot |= PROT_EXEC;
 517         }
 518 
 519         /*
 520          * If an auxvector will be required - reserve the space for
 521          * it now.  This may be increased by exec_args if there are
 522          * ISA-specific types (included in __KERN_NAUXV_IMPL).
 523          */
 524         if (hasauxv) {
 525                 /*
 526                  * If a AUX vector is being built - the base AUX
 527                  * entries are:
 528                  *
 529                  *      AT_BASE
 530                  *      AT_FLAGS
 531                  *      AT_PAGESZ
 532                  *      AT_RANDOM
 533                  *      AT_SUN_AUXFLAGS
 534                  *      AT_SUN_HWCAP
 535                  *      AT_SUN_HWCAP2
 536                  *      AT_SUN_PLATFORM (added in stk_copyout)
 537                  *      AT_SUN_EXECNAME (added in stk_copyout)
 538                  *      AT_NULL
 539                  *
 540                  * total == 10
 541                  */
 542                 if (hasdy && hasu) {
 543                         /*
 544                          * Has PT_INTERP & PT_PHDR - the auxvectors that
 545                          * will be built are:
 546                          *
 547                          *      AT_PHDR
 548                          *      AT_PHENT
 549                          *      AT_PHNUM
 550                          *      AT_ENTRY
 551                          *      AT_LDDATA
 552                          *
 553                          * total = 5
 554                          */
 555                         args->auxsize = (10 + 5) * sizeof (aux_entry_t);
 556                 } else if (hasdy) {
 557                         /*
 558                          * Has PT_INTERP but no PT_PHDR
 559                          *
 560                          *      AT_EXECFD
 561                          *      AT_LDDATA
 562                          *
 563                          * total = 2
 564                          */
 565                         args->auxsize = (10 + 2) * sizeof (aux_entry_t);
 566                 } else {
 567                         args->auxsize = 10 * sizeof (aux_entry_t);
 568                 }
 569         } else {
 570                 args->auxsize = 0;
 571         }
 572 
 573         /*
 574          * If this binary is using an emulator, we need to add an
 575          * AT_SUN_EMULATOR aux entry.
 576          */
 577         if (args->emulator != NULL)
 578                 args->auxsize += sizeof (aux_entry_t);
 579 


 580         /*
 581          * If this is a native binary that's been given a modified interpreter
 582          * root, inform it that the native system exists at that root.
 583          */
 584         if (args->brand_nroot != NULL) {
 585                 args->auxsize += sizeof (aux_entry_t);
 586         }
 587 
 588 
 589         /*
 590          * On supported kernels (x86_64) make room in the auxv for the
 591          * AT_SUN_COMMPAGE entry.  This will go unpopulated on i86xpv systems
 592          * which do not provide such functionality.
 593          */
 594 #if defined(__amd64)
 595         args->auxsize += sizeof (aux_entry_t);
 596 #endif /* defined(__amd64) */
 597 
 598         /*
 599          * If we have user credentials, we'll supply the following entries:
 600          *      AT_SUN_UID
 601          *      AT_SUN_RUID
 602          *      AT_SUN_GID
 603          *      AT_SUN_RGID
 604          */
 605         if (cred != NULL) {
 606                 args->auxsize += 4 * sizeof (aux_entry_t);
 607         }
 608 
 609         if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 610                 branded = 1;
 611                 /*
 612                  * We will be adding 5 entries to the aux vectors.  One for
 613                  * the brandname and 4 for the brand specific aux vectors.
 614                  */
 615                 args->auxsize += 5 * sizeof (aux_entry_t);
 616         }
 617 
 618         /* Hardware/Software capabilities */
 619         if (capphdr != NULL &&
 620             (capsize = capphdr->p_filesz) > 0 &&
 621             capsize <= 16 * sizeof (*cap)) {
 622                 int ncaps = capsize / sizeof (*cap);
 623                 Cap *cp;
 624 
 625                 cap = kmem_alloc(capsize, KM_SLEEP);
 626                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
 627                     capsize, (offset_t)capphdr->p_offset,
 628                     UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
 629                         uprintf("%s: Cannot read capabilities section\n",
 630                             exec_file);
 631                         goto out;
 632                 }
 633                 for (cp = cap; cp < cap + ncaps; cp++) {
 634                         if (cp->c_tag == CA_SUNW_SF_1 &&
 635                             (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
 636                                 if (args->to_model == DATAMODEL_LP64)
 637                                         args->addr32 = 1;


 653                 goto out;
 654         }
 655         /* we're single threaded after this point */
 656 
 657         /*
 658          * If this is an ET_DYN executable (shared object),
 659          * determine its memory size so that mapelfexec() can load it.
 660          */
 661         if (ehdrp->e_type == ET_DYN)
 662                 len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
 663         else
 664                 len = 0;
 665 
 666         dtrphdr = NULL;
 667 
 668         if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &dyphdr,
 669             &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
 670             len, execsz, &brksize)) != 0)
 671                 goto bad;
 672 
 673         if (uphdr != NULL) {
 674                 /*
 675                  * Our uphdr has been dynamically allocated if (and only if)
 676                  * its program header flags are clear.
 677                  */
 678                 dynuphdr = (uphdr->p_flags == 0);
 679         }
 680 
 681         if (uphdr != NULL && dyphdr == NULL)
 682                 goto bad;
 683 
 684         if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 685                 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
 686                 goto bad;
 687         }
 688 
 689         if (dyphdr != NULL) {
 690                 size_t          len;
 691                 uintptr_t       lddata;
 692                 char            *p;
 693                 struct vnode    *nvp;
 694 
 695                 dlnsize = dyphdr->p_filesz + nsize;
 696 
 697                 if (dlnsize > MAXPATHLEN || dlnsize <= 0)
 698                         goto bad;
 699 
 700                 if (nsize != 0) {
 701                         bcopy(args->brand_nroot, dlnp, nsize - 1);
 702                         dlnp[nsize - 1] = '/';
 703                 }
 704 
 705                 /*
 706                  * Read in "interpreter" pathname.
 707                  */
 708                 if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
 709                     dyphdr->p_filesz, (offset_t)dyphdr->p_offset, UIO_SYSSPACE,
 710                     0, (rlim64_t)0, CRED(), &resid)) != 0) {
 711                         uprintf("%s: Cannot obtain interpreter pathname\n",
 712                             exec_file);
 713                         goto bad;
 714                 }
 715 
 716                 if (resid != 0 || dlnp[dlnsize - 1] != '\0')
 717                         goto bad;
 718 
 719                 /*
 720                  * Search for '$ORIGIN' token in interpreter path.
 721                  * If found, expand it.
 722                  */
 723                 for (p = dlnp; p = strchr(p, '$'); ) {
 724                         uint_t  len, curlen;
 725                         char    *_ptr;
 726 
 727                         if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
 728                                 continue;
 729 
 730                         /*


 835                     &phdrsize)) != 0) {
 836                         VN_RELE(nvp);
 837                         uprintf("%s: Cannot read %s\n", exec_file, dlnp);
 838                         goto bad;
 839                 }
 840 
 841                 /*
 842                  * Determine memory size of the "interpreter's" loadable
 843                  * sections.  This size is then used to obtain the virtual
 844                  * address of a hole, in the user's address space, large
 845                  * enough to map the "interpreter".
 846                  */
 847                 if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
 848                         VN_RELE(nvp);
 849                         uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
 850                         goto bad;
 851                 }
 852 
 853                 dtrphdr = NULL;
 854 
 855                 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
 856                     &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
 857                     execsz, NULL);
 858 
 859                 if (error || junk != NULL) {
 860                         VN_RELE(nvp);
 861                         uprintf("%s: Cannot map %s\n", exec_file, dlnp);
 862                         goto bad;
 863                 }
 864 
 865                 /*
 866                  * We use the DTrace program header to initialize the
 867                  * architecture-specific user per-LWP location. The dtrace
 868                  * fasttrap provider requires ready access to per-LWP scratch
 869                  * space. We assume that there is only one such program header
 870                  * in the interpreter.
 871                  */
 872                 if (dtrphdr != NULL &&
 873                     dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 874                         VN_RELE(nvp);
 875                         uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
 876                         goto bad;
 877                 }
 878 
 879                 VN_RELE(nvp);
 880                 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
 881         }
 882 
 883         if (hasauxv) {
 884                 int auxf = AF_SUN_HWCAPVERIFY;
 885 
 886                 /*
 887                  * Note: AT_SUN_PLATFORM and AT_RANDOM were filled in via
 888                  * exec_args()
 889                  */
 890                 ADDAUX(aux, AT_BASE, voffset)
 891                 ADDAUX(aux, AT_FLAGS, at_flags)
 892                 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
 893                 /*
 894                  * Linker flags. (security)
 895                  * p_flag not yet set at this time.
 896                  * We rely on gexec() to provide us with the information.
 897                  * If the application is set-uid but this is not reflected
 898                  * in a mismatch between real/effective uids/gids, then
 899                  * don't treat this as a set-uid exec.  So we care about
 900                  * the EXECSETID_UGIDS flag but not the ...SETID flag.
 901                  */
 902                 if ((setid &= ~EXECSETID_SETID) != 0)
 903                         auxf |= AF_SUN_SETUGID;
 904 
 905                 /*
 906                  * If we're running a native process from within a branded
 907                  * zone under pfexec then we clear the AF_SUN_SETUGID flag so
 908                  * that the native ld.so.1 is able to link with the native
 909                  * libraries instead of using the brand libraries that are
 910                  * installed in the zone.  We only do this for processes
 911                  * which we trust because we see they are already running
 912                  * under pfexec (where uid != euid).  This prevents a
 913                  * malicious user within the zone from crafting a wrapper to
 914                  * run native suid commands with unsecure libraries interposed.
 915                  */
 916                 if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
 917                     (setid &= ~EXECSETID_SETID) != 0))
 918                         auxf &= ~AF_SUN_SETUGID;
 919 
 920                 /*
 921                  * Record the user addr of the auxflags aux vector entry
 922                  * since brands may optionally want to manipulate this field.
 923                  */
 924                 args->auxp_auxflags =
 925                     (char *)((char *)args->stackend +
 926                     ((char *)&aux->a_type -
 927                     (char *)bigwad->elfargs));
 928                 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
 929 
 930                 /*
 931                  * Record information about the real and effective user and
 932                  * group IDs.
 933                  */
 934                 if (cred != NULL) {
 935                         ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
 936                         ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
 937                         ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
 938                         ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
 939                 }
 940 
 941                 /*
 942                  * Hardware capability flag word (performance hints)
 943                  * Used for choosing faster library routines.
 944                  * (Potentially different between 32-bit and 64-bit ABIs)
 945                  */
 946 #if defined(_LP64)
 947                 if (args->to_model == DATAMODEL_NATIVE) {
 948                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
 949                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
 950                 } else {
 951                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
 952                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
 953                 }
 954 #else
 955                 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
 956                 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
 957 #endif
 958                 if (branded) {
 959                         /*
 960                          * Reserve space for the brand-private aux vectors,
 961                          * and record the user addr of that space.
 962                          */
 963                         args->auxp_brand =
 964                             (char *)((char *)args->stackend +
 965                             ((char *)&aux->a_type -
 966                             (char *)bigwad->elfargs));
 967                         ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
 968                         ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
 969                         ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
 970                         ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
 971                 }
 972 
 973                 /*
 974                  * Add the comm page auxv entry, mapping it in if needed.
 975                  */
 976 #if defined(__amd64)
 977                 if (args->commpage != NULL ||
 978                     (args->commpage = (uintptr_t)comm_page_mapin()) != NULL) {
 979                         ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
 980                 } else {
 981                         /*
 982                          * If the comm page cannot be mapped, pad out the auxv
 983                          * to satisfy later size checks.
 984                          */
 985                         ADDAUX(aux, AT_NULL, 0)
 986                 }
 987 #endif /* defined(__amd64) */
 988 
 989                 ADDAUX(aux, AT_NULL, 0)
 990                 postfixsize = (char *)aux - (char *)bigwad->elfargs;
 991 
 992                 /*
 993                  * We make assumptions above when we determine how many aux
 994                  * vector entries we will be adding. However, if we have an
 995                  * invalid elf file, it is possible that mapelfexec might
 996                  * behave differently (but not return an error), in which case
 997                  * the number of aux entries we actually add will be different.
 998                  * We detect that now and error out.
 999                  */
1000                 if (postfixsize != args->auxsize) {
1001                         DTRACE_PROBE2(elfexec_badaux, int, postfixsize,
1002                             int, args->auxsize);
1003                         goto bad;
1004                 }
1005                 ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
1006         }
1007 
1008         /*
1009          * For the 64-bit kernel, the limit is big enough that rounding it up
1010          * to a page can overflow the 64-bit limit, so we check for btopr()
1011          * overflowing here by comparing it with the unrounded limit in pages.
1012          * If it hasn't overflowed, compare the exec size with the rounded up
1013          * limit in pages.  Otherwise, just compare with the unrounded limit.
1014          */
1015         limit = btop(p->p_vmem_ctl);
1016         roundlimit = btopr(p->p_vmem_ctl);
1017         if ((roundlimit > limit && *execsz > roundlimit) ||
1018             (roundlimit < limit && *execsz > limit)) {
1019                 mutex_enter(&p->p_lock);
1020                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1021                     RCA_SAFE);
1022                 mutex_exit(&p->p_lock);
1023                 error = ENOMEM;
1024                 goto bad;
1025         }
1026 
1027         bzero(up->u_auxv, sizeof (up->u_auxv));
1028         up->u_commpagep = args->commpage;
1029         if (postfixsize) {
1030                 int num_auxv;
1031 
1032                 /*
1033                  * Copy the aux vector to the user stack.
1034                  */
1035                 error = execpoststack(args, bigwad->elfargs, postfixsize);
1036                 if (error)
1037                         goto bad;
1038 
1039                 /*
1040                  * Copy auxv to the process's user structure for use by /proc.
1041                  * If this is a branded process, the brand's exec routine will
1042                  * copy its private entries to the user structure later. It
1043                  * relies on the fact that the blank entries are at the end.
1044                  */
1045                 num_auxv = postfixsize / sizeof (aux_entry_t);
1046                 ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
1047                 aux = bigwad->elfargs;
1048                 for (i = 0; i < num_auxv; i++) {


1075                         bigwad->exenv.ex_bssbase = bssbase;
1076                         bigwad->exenv.ex_brksize = brksize;
1077                 }
1078                 bigwad->exenv.ex_magic = elfmagic;
1079                 bigwad->exenv.ex_vp = vp;
1080                 setexecenv(&bigwad->exenv);
1081         }
1082 
1083         ASSERT(error == 0);
1084         goto out;
1085 
1086 bad:
1087         if (fd != -1)           /* did we open the a.out yet */
1088                 (void) execclose(fd);
1089 
1090         psignal(p, SIGKILL);
1091 
1092         if (error == 0)
1093                 error = ENOEXEC;
1094 out:
1095         if (dynuphdr)
1096                 kmem_free(uphdr, sizeof (Phdr));
1097         if (phdrbase != NULL)
1098                 kmem_free(phdrbase, phdrsize);
1099         if (cap != NULL)
1100                 kmem_free(cap, capsize);
1101         kmem_free(bigwad, sizeof (struct bigwad));
1102         return (error);
1103 }
1104 
1105 /*
1106  * Compute the memory size requirement for the ELF file.
1107  */
1108 static size_t
1109 elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata)
1110 {
1111         size_t  len;
1112         Phdr    *phdrp = (Phdr *)phdrbase;
1113         int     hsize = ehdrp->e_phentsize;
1114         int     first = 1;
1115         int     dfirst = 1;     /* first data segment */
1116         uintptr_t loaddr = 0;


1343                 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1344         }
1345 
1346         if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, *shstrsizep,
1347             (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1348             credp, &resid)) != 0) {
1349                 kmem_free(*shbasep, *shsizep);
1350                 kmem_free(*shstrbasep, *shstrsizep);
1351                 return (err);
1352         }
1353 
1354         /*
1355          * Make sure the strtab is null-terminated to make sure we
1356          * don't run off the end of the table.
1357          */
1358         (*shstrbasep)[*shstrsizep - 1] = '\0';
1359 
1360         return (0);
1361 }
1362 
1363 
1364 #ifdef _ELF32_COMPAT
1365 int
1366 elf32readhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs,
1367     caddr_t *phbasep, ssize_t *phsizep)
1368 #else
1369 int
1370 elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs,
1371     caddr_t *phbasep, ssize_t *phsizep)
1372 #endif
1373 {
1374         int error, nshdrs, shstrndx;
1375 
1376         if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
1377             nphdrs)) != 0 ||
1378             (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
1379             phsizep)) != 0) {
1380                 return (error);
1381         }
1382         return (0);
1383 }
1384 
1385 
1386 static int
1387 mapelfexec(
1388         vnode_t *vp,
1389         Ehdr *ehdr,
1390         int nphdrs,
1391         caddr_t phdrbase,
1392         Phdr **uphdr,
1393         Phdr **dyphdr,
1394         Phdr **stphdr,
1395         Phdr **dtphdr,
1396         Phdr *dataphdrp,
1397         caddr_t *bssbase,
1398         caddr_t *brkbase,
1399         intptr_t *voffset,
1400         intptr_t *minaddr,
1401         size_t len,
1402         long *execsz,
1403         size_t *brksize)
1404 {
1405         Phdr *phdr;
1406         int i, prot, error, lastprot = 0;
1407         caddr_t addr = NULL;
1408         size_t zfodsz;
1409         int ptload = 0;
1410         int page;
1411         off_t offset;
1412         int hsize = ehdr->e_phentsize;
1413         caddr_t mintmp = (caddr_t)-1;
1414         uintptr_t lastaddr = NULL;
1415         extern int use_brk_lpg;
1416 
1417         if (ehdr->e_type == ET_DYN) {
1418                 caddr_t vaddr;







1419 
1420                 /*
1421                  * Despite the fact that mmapobj(2) refuses to load them, we
1422                  * need to support executing ET_DYN objects that have a
1423                  * non-NULL p_vaddr.  When found in the wild, these objects
1424                  * are likely to be due to an old (and largely obviated) Linux
1425                  * facility, prelink(8), that rewrites shared objects to
1426                  * prefer specific (disjoint) virtual address ranges.  (Yes,
1427                  * this is putatively for performance -- and yes, it has
1428                  * limited applicability, many edge conditions and grisly
1429                  * failure modes; even for Linux, it's insane.)  As ELF
1430                  * mandates that the PT_LOAD segments be in p_vaddr order, we
1431                  * find the lowest p_vaddr by finding the first PT_LOAD
1432                  * segment.
1433                  */
1434                 phdr = (Phdr *)phdrbase;
1435                 for (i = nphdrs; i > 0; i--) {
1436                         if (phdr->p_type == PT_LOAD) {
1437                                 addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
1438                                 break;
1439                         }
1440                         phdr = (Phdr *)((caddr_t)phdr + hsize);
1441                 }
1442 
1443                 /*
1444                  * We have a non-zero p_vaddr in the first PT_LOAD segment --
1445                  * presumably because we're directly executing a prelink(8)'d
1446                  * ld-linux.so.  While we could correctly execute such an
1447                  * object without locating it at its desired p_vaddr (it is,
1448                  * after all, still relocatable), our inner antiquarian
1449                  * derives a perverse pleasure in accommodating the steampunk
1450                  * prelink(8) contraption -- goggles on!
1451                  */
1452                 if ((vaddr = addr) != NULL) {
1453                         if (as_gap(curproc->p_as, len,
1454                             &addr, &len, AH_LO, NULL) == -1 || addr != vaddr) {
1455                                 addr = NULL;
1456                         }
1457                 }
1458 
1459                 if (addr == NULL) {
1460                         /*
1461                          * We either have a NULL p_vaddr (the common case, by
1462                          * many orders of magnitude) or we have a non-NULL
1463                          * p_vaddr and we were unable to obtain the specified
1464                          * VA range (presumably because it's an illegal
1465                          * address).  Either way, obtain an address in which
1466                          * to map the interpreter.
1467                          */
1468                         map_addr(&addr, len, (offset_t)0, 1, 0);
1469                         if (addr == NULL)
1470                                 return (ENOMEM);
1471                 }
1472 
1473                 /*
1474                  * Our voffset is the difference between where we landed and
1475                  * where we wanted to be.
1476                  */
1477                 *voffset = (uintptr_t)addr - (uintptr_t)vaddr;
1478         } else {
1479                 *voffset = 0;
1480         }
1481 
1482         phdr = (Phdr *)phdrbase;
1483         for (i = nphdrs; i > 0; i--) {
1484                 switch (phdr->p_type) {
1485                 case PT_LOAD:



1486                         ptload = 1;
1487                         prot = PROT_USER;
1488                         if (phdr->p_flags & PF_R)
1489                                 prot |= PROT_READ;
1490                         if (phdr->p_flags & PF_W)
1491                                 prot |= PROT_WRITE;
1492                         if (phdr->p_flags & PF_X)
1493                                 prot |= PROT_EXEC;
1494 
1495                         addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1496 
1497                         if ((*dyphdr != NULL) && uphdr != NULL &&
1498                             (*uphdr == NULL)) {
1499                                 /*
1500                                  * The PT_PHDR program header is, strictly
1501                                  * speaking, optional.  If we find that this
1502                                  * is missing, we will determine the location
1503                                  * of the program headers based on the address
1504                                  * of the lowest PT_LOAD segment (namely, this
1505                                  * one):  we subtract the p_offset to get to
1506                                  * the ELF header and then add back the program
1507                                  * header offset to get to the program headers.
1508                                  * We then cons up a Phdr that corresponds to
1509                                  * the (missing) PT_PHDR, setting the flags
1510                                  * to 0 to denote that this is artificial and
1511                                  * should (must) be freed by the caller.
1512                                  */
1513                                 Phdr *cons;
1514 
1515                                 cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
1516 
1517                                 cons->p_flags = 0;
1518                                 cons->p_type = PT_PHDR;
1519                                 cons->p_vaddr = ((uintptr_t)addr -
1520                                     phdr->p_offset) + ehdr->e_phoff;
1521 
1522                                 *uphdr = cons;
1523                         }
1524 
1525                         /*
1526                          * Keep track of the segment with the lowest starting
1527                          * address.
1528                          */
1529                         if (addr < mintmp)
1530                                 mintmp = addr;
1531 
1532                         /*
1533                          * Segments need not correspond to page boundaries:
1534                          * they are permitted to share a page.  If two PT_LOAD
1535                          * segments share the same page, and the permissions
1536                          * of the segments differ, the behavior is historically
1537                          * that the permissions of the latter segment are used
1538                          * for the page that the two segments share.  This is
1539                          * also historically a non-issue:  binaries generated
1540                          * by most anything will make sure that two PT_LOAD
1541                          * segments with differing permissions don't actually
1542                          * share any pages.  However, there exist some crazy
1543                          * things out there (including at least an obscure
1544                          * Portuguese teaching language called G-Portugol) that
1545                          * actually do the wrong thing and expect it to work:
1546                          * they have a segment with execute permission share
1547                          * a page with a subsequent segment that does not
1548                          * have execute permissions and expect the resulting
1549                          * shared page to in fact be executable.  To accommodate
1550                          * such broken link editors, we take advantage of a
1551                          * latitude explicitly granted to the loader:  it is
1552                          * permitted to make _any_ PT_LOAD segment executable
1553                          * (provided that it is readable or writable).  If we
1554                          * see that we're sharing a page and that the previous
1555                          * page was executable, we will add execute permissions
1556                          * to our segment.
1557                          */
1558                         if (btop(lastaddr) == btop((uintptr_t)addr) &&
1559                             (phdr->p_flags & (PF_R | PF_W)) &&
1560                             (lastprot & PROT_EXEC)) {
1561                                 prot |= PROT_EXEC;
1562                         }
1563 
1564                         lastaddr = (uintptr_t)addr + phdr->p_filesz;
1565                         lastprot = prot;
1566 
1567                         zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1568 
1569                         offset = phdr->p_offset;
1570                         if (((uintptr_t)offset & PAGEOFFSET) ==
1571                             ((uintptr_t)addr & PAGEOFFSET) &&
1572                             (!(vp->v_flag & VNOMAP))) {
1573                                 page = 1;
1574                         } else {
1575                                 page = 0;
1576                         }
1577 
1578                         /*
1579                          * Set the heap pagesize for OOB when the bss size
1580                          * is known and use_brk_lpg is not 0.
1581                          */
1582                         if (brksize != NULL && use_brk_lpg &&
1583                             zfodsz != 0 && phdr == dataphdrp &&
1584                             (prot & PROT_WRITE)) {
1585                                 size_t tlen = P2NPHASE((uintptr_t)addr +
1586                                     phdr->p_filesz, PAGESIZE);


1611                                 if (brksize != NULL)
1612                                         *brksize = extra_zfodsz;
1613                         } else {
1614                                 if (error = execmap(vp, addr, phdr->p_filesz,
1615                                     zfodsz, phdr->p_offset, prot, page, 0))
1616                                         goto bad;
1617                         }
1618 
1619                         if (bssbase != NULL && addr >= *bssbase &&
1620                             phdr == dataphdrp) {
1621                                 *bssbase = addr + phdr->p_filesz;
1622                         }
1623                         if (brkbase != NULL && addr >= *brkbase) {
1624                                 *brkbase = addr + phdr->p_memsz;
1625                         }
1626 
1627                         *execsz += btopr(phdr->p_memsz);
1628                         break;
1629 
1630                 case PT_INTERP:
1631                         /*
1632                          * The ELF specification is unequivocal about the
1633                          * PT_INTERP program header with respect to any PT_LOAD
1634                          * program header:  "If it is present, it must precede
1635                          * any loadable segment entry." Linux, however, makes
1636                          * no attempt to enforce this -- which has allowed some
1637                          * binary editing tools to get away with generating
1638                          * invalid ELF binaries in the respect that PT_INTERP
1639                          * occurs after the first PT_LOAD program header.  This
1640                          * is unfortunate (and of course, disappointing) but
1641                          * it's no worse than that: there is no reason that we
1642                          * can't process the PT_INTERP entry (if present) after
1643                          * one or more PT_LOAD entries.  We therefore
1644                          * deliberately do not check ptload here and always
1645                          * store dyphdr to be the PT_INTERP program header.
1646                          */
1647                         *dyphdr = phdr;
1648                         break;
1649 
1650                 case PT_SHLIB:
1651                         *stphdr = phdr;
1652                         break;
1653 
1654                 case PT_PHDR:
1655                         if (ptload || phdr->p_flags == 0)
1656                                 goto bad;
1657 
1658                         if (uphdr != NULL)
1659                                 *uphdr = phdr;
1660 
1661                         break;
1662 
1663                 case PT_NULL:
1664                 case PT_DYNAMIC:
1665                 case PT_NOTE:
1666                         break;
1667 
1668                 case PT_SUNWDTRACE:
1669                         if (dtphdr != NULL)
1670                                 *dtphdr = phdr;
1671                         break;
1672 
1673                 default:
1674                         break;
1675                 }
1676                 phdr = (Phdr *)((caddr_t)phdr + hsize);
1677         }
1678 
1679         if (minaddr != NULL) {
1680                 ASSERT(mintmp != (caddr_t)-1);


2489 static struct execsw esw = {
2490 #ifdef  _LP64
2491         elf64magicstr,
2492 #else   /* _LP64 */
2493         elf32magicstr,
2494 #endif  /* _LP64 */
2495         0,
2496         5,
2497         elfexec,
2498         elfcore
2499 };
2500 
2501 static struct modlexec modlexec = {
2502         &mod_execops, "exec module for elf", &esw
2503 };
2504 
2505 #ifdef  _LP64
2506 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
2507                         intpdata_t *idatap, int level, long *execsz,
2508                         int setid, caddr_t exec_file, cred_t *cred,
2509                         int *brand_action);
2510 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
2511                         rlim64_t rlimit, int sig, core_content_t content);
2512 
2513 static struct execsw esw32 = {
2514         elf32magicstr,
2515         0,
2516         5,
2517         elf32exec,
2518         elf32core
2519 };
2520 
2521 static struct modlexec modlexec32 = {
2522         &mod_execops, "32-bit exec module for elf", &esw32
2523 };
2524 #endif  /* _LP64 */
2525 
2526 static struct modlinkage modlinkage = {
2527         MODREV_1,
2528         (void *)&modlexec,
2529 #ifdef  _LP64