1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright 2013 Joyent, Inc.  All rights reserved.
  27  */
  28 
  29 
  30 #include <sys/types.h>
  31 #include <sys/machparam.h>
  32 #include <sys/x86_archext.h>
  33 #include <sys/systm.h>
  34 #include <sys/mach_mmu.h>
  35 #include <sys/multiboot.h>
  36 #include <sys/multiboot2.h>
  37 #include <sys/multiboot2_impl.h>
  38 #include <sys/sysmacros.h>
  39 #include <sys/sha1.h>
  40 #include <util/string.h>
  41 #include <util/strtolctype.h>
  42 #include <sys/efi.h>
  43 
  44 #if defined(__xpv)
  45 
  46 #include <sys/hypervisor.h>
  47 uintptr_t xen_virt_start;
  48 pfn_t *mfn_to_pfn_mapping;
  49 
  50 #else /* !__xpv */
  51 
  52 extern multiboot_header_t mb_header;
  53 extern uint32_t mb2_load_addr;
  54 extern int have_cpuid(void);
  55 
  56 #endif /* !__xpv */
  57 
  58 #include <sys/inttypes.h>
  59 #include <sys/bootinfo.h>
  60 #include <sys/mach_mmu.h>
  61 #include <sys/boot_console.h>
  62 
  63 #include "dboot_asm.h"
  64 #include "dboot_printf.h"
  65 #include "dboot_xboot.h"
  66 #include "dboot_elfload.h"
  67 
  68 #define SHA1_ASCII_LENGTH       (SHA1_DIGEST_LENGTH * 2)
  69 
  70 /*
  71  * This file contains code that runs to transition us from either a multiboot
  72  * compliant loader (32 bit non-paging) or a XPV domain loader to
  73  * regular kernel execution. Its task is to setup the kernel memory image
  74  * and page tables.
  75  *
  76  * The code executes as:
  77  *      - 32 bits under GRUB (for 32 or 64 bit Solaris)
  78  *      - a 32 bit program for the 32-bit PV hypervisor
  79  *      - a 64 bit program for the 64-bit PV hypervisor (at least for now)
  80  *
  81  * Under the PV hypervisor, we must create mappings for any memory beyond the
  82  * initial start of day allocation (such as the kernel itself).
  83  *
  84  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
  86  */
  87 
  88 /*
  89  * Standard bits used in PTE (page level) and PTP (internal levels)
  90  */
  91 x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
  92 x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
  93 
  94 /*
 * These are the target addresses (physical) where the kernel text and data
  96  * nucleus pages will be unpacked. On the hypervisor this is actually a
  97  * virtual address.
  98  */
  99 paddr_t ktext_phys;
 100 uint32_t ksize = 2 * FOUR_MEG;  /* kernel nucleus is 8Meg */
 101 
 102 static uint64_t target_kernel_text;     /* value to use for KERNEL_TEXT */
 103 
 104 /*
 105  * The stack is setup in assembler before entering startup_kernel()
 106  */
 107 char stack_space[STACK_SIZE];
 108 
 109 /*
 110  * Used to track physical memory allocation
 111  */
 112 static paddr_t next_avail_addr = 0;
 113 
 114 #if defined(__xpv)
 115 /*
 116  * Additional information needed for hypervisor memory allocation.
 117  * Only memory up to scratch_end is mapped by page tables.
 118  * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 119  * to derive a pfn from a pointer, you subtract mfn_base.
 120  */
 121 
 122 static paddr_t scratch_end = 0; /* we can't write all of mem here */
 123 static paddr_t mfn_base;                /* addr corresponding to mfn_list[0] */
 124 start_info_t *xen_info;
 125 
 126 #else   /* __xpv */
 127 
 128 /*
 129  * If on the metal, then we have a multiboot loader.
 130  */
 131 uint32_t mb_magic;                      /* magic from boot loader */
 132 uint32_t mb_addr;                       /* multiboot info package from loader */
 133 int multiboot_version;
 134 multiboot_info_t *mb_info;
 135 multiboot2_info_header_t *mb2_info;
 136 multiboot_tag_mmap_t *mb2_mmap_tagp;
 137 int num_entries;                        /* mmap entry count */
 138 boolean_t num_entries_set;              /* is mmap entry count set */
 139 uintptr_t load_addr;
 140 
 141 /* can not be automatic variables because of alignment */
 142 static efi_guid_t smbios3 = SMBIOS3_TABLE_GUID;
 143 static efi_guid_t smbios = SMBIOS_TABLE_GUID;
 144 static efi_guid_t acpi2 = EFI_ACPI_TABLE_GUID;
 145 static efi_guid_t acpi1 = ACPI_10_TABLE_GUID;
 146 #endif  /* __xpv */
 147 
 148 /*
 149  * This contains information passed to the kernel
 150  */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
 152 struct xboot_info *bi;
 153 
 154 /*
 155  * Page table and memory stuff.
 156  */
 157 static paddr_t max_mem;                 /* maximum memory address */
 158 
 159 /*
 160  * Information about processor MMU
 161  */
 162 int amd64_support = 0;
 163 int largepage_support = 0;
 164 int pae_support = 0;
 165 int pge_support = 0;
 166 int NX_support = 0;
 167 
 168 /*
 169  * Low 32 bits of kernel entry address passed back to assembler.
 170  * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 171  */
 172 uint32_t entry_addr_low;
 173 
 174 /*
 175  * Memlists for the kernel. We shouldn't need a lot of these.
 176  */
 177 #define MAX_MEMLIST (50)
 178 struct boot_memlist memlists[MAX_MEMLIST];
 179 uint_t memlists_used = 0;
 180 struct boot_memlist pcimemlists[MAX_MEMLIST];
 181 uint_t pcimemlists_used = 0;
 182 struct boot_memlist rsvdmemlists[MAX_MEMLIST];
 183 uint_t rsvdmemlists_used = 0;
 184 
 185 /*
 186  * This should match what's in the bootloader.  It's arbitrary, but GRUB
 187  * in particular has limitations on how much space it can use before it
 188  * stops working properly.  This should be enough.
 189  */
 190 struct boot_modules modules[MAX_BOOT_MODULES];
 191 uint_t modules_used = 0;
 192 
 193 #ifdef __xpv
 194 /*
 195  * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 196  * definition in Xen source.
 197  */
 198 typedef struct {
 199         uint32_t        base_addr_low;
 200         uint32_t        base_addr_high;
 201         uint32_t        length_low;
 202         uint32_t        length_high;
 203         uint32_t        type;
 204 } mmap_t;
 205 
 206 /*
 207  * There is 512KB of scratch area after the boot stack page.
 208  * We'll use that for everything except the kernel nucleus pages which are too
 209  * big to fit there and are allocated last anyway.
 210  */
 211 #define MAXMAPS 100
 212 static mmap_t map_buffer[MAXMAPS];
 213 #else
 214 typedef mb_memory_map_t mmap_t;
 215 #endif
 216 
 217 /*
 218  * Debugging macros
 219  */
 220 uint_t prom_debug = 0;
 221 uint_t map_debug = 0;
 222 
 223 static char noname[2] = "-";
 224 
 225 /*
 226  * Either hypervisor-specific or grub-specific code builds the initial
 227  * memlists. This code does the sort/merge/link for final use.
 228  */
 229 static void
 230 sort_physinstall(void)
 231 {
 232         int i;
 233 #if !defined(__xpv)
 234         int j;
 235         struct boot_memlist tmp;
 236 
 237         /*
 238          * Now sort the memlists, in case they weren't in order.
 239          * Yeah, this is a bubble sort; small, simple and easy to get right.
 240          */
 241         DBG_MSG("Sorting phys-installed list\n");
 242         for (j = memlists_used - 1; j > 0; --j) {
 243                 for (i = 0; i < j; ++i) {
 244                         if (memlists[i].addr < memlists[i + 1].addr)
 245                                 continue;
 246                         tmp = memlists[i];
 247                         memlists[i] = memlists[i + 1];
 248                         memlists[i + 1] = tmp;
 249                 }
 250         }
 251 
 252         /*
 253          * Merge any memlists that don't have holes between them.
 254          */
 255         for (i = 0; i <= memlists_used - 1; ++i) {
 256                 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
 257                         continue;
 258 
 259                 if (prom_debug)
 260                         dboot_printf(
 261                             "merging mem segs %" PRIx64 "...%" PRIx64
 262                             " w/ %" PRIx64 "...%" PRIx64 "\n",
 263                             memlists[i].addr,
 264                             memlists[i].addr + memlists[i].size,
 265                             memlists[i + 1].addr,
 266                             memlists[i + 1].addr + memlists[i + 1].size);
 267 
 268                 memlists[i].size += memlists[i + 1].size;
 269                 for (j = i + 1; j < memlists_used - 1; ++j)
 270                         memlists[j] = memlists[j + 1];
 271                 --memlists_used;
 272                 DBG(memlists_used);
 273                 --i;    /* after merging we need to reexamine, so do this */
 274         }
 275 #endif  /* __xpv */
 276 
 277         if (prom_debug) {
 278                 dboot_printf("\nFinal memlists:\n");
 279                 for (i = 0; i < memlists_used; ++i) {
 280                         dboot_printf("\t%d: addr=%" PRIx64 " size=%"
 281                             PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
 282                 }
 283         }
 284 
 285         /*
 286          * link together the memlists with native size pointers
 287          */
 288         memlists[0].next = 0;
 289         memlists[0].prev = 0;
 290         for (i = 1; i < memlists_used; ++i) {
 291                 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
 292                 memlists[i].next = 0;
 293                 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
 294         }
 295         bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
 296         DBG(bi->bi_phys_install);
 297 }
 298 
 299 /*
 300  * build bios reserved memlists
 301  */
 302 static void
 303 build_rsvdmemlists(void)
 304 {
 305         int i;
 306 
 307         rsvdmemlists[0].next = 0;
 308         rsvdmemlists[0].prev = 0;
 309         for (i = 1; i < rsvdmemlists_used; ++i) {
 310                 rsvdmemlists[i].prev =
 311                     (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
 312                 rsvdmemlists[i].next = 0;
 313                 rsvdmemlists[i - 1].next =
 314                     (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
 315         }
 316         bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
 317         DBG(bi->bi_rsvdmem);
 318 }
 319 
 320 #if defined(__xpv)
 321 
 322 /*
 323  * halt on the hypervisor after a delay to drain console output
 324  */
 325 void
 326 dboot_halt(void)
 327 {
 328         uint_t i = 10000;
 329 
 330         while (--i)
 331                 (void) HYPERVISOR_yield();
 332         (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 333 }
 334 
 335 /*
 336  * From a machine address, find the corresponding pseudo-physical address.
 337  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
 338  * Machine addresses are the real underlying hardware addresses.
 339  * These are needed for page table entries. Note that this routine is
 340  * poorly protected. A bad value of "ma" will cause a page fault.
 341  */
 342 paddr_t
 343 ma_to_pa(maddr_t ma)
 344 {
 345         ulong_t pgoff = ma & MMU_PAGEOFFSET;
 346         ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
 347         paddr_t pa;
 348 
 349         if (pfn >= xen_info->nr_pages)
 350                 return (-(paddr_t)1);
 351         pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
 352 #ifdef DEBUG
 353         if (ma != pa_to_ma(pa))
 354                 dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
 355                     "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
 356 #endif
 357         return (pa);
 358 }
 359 
 360 /*
 361  * From a pseudo-physical address, find the corresponding machine address.
 362  */
 363 maddr_t
 364 pa_to_ma(paddr_t pa)
 365 {
 366         pfn_t pfn;
 367         ulong_t mfn;
 368 
 369         pfn = mmu_btop(pa - mfn_base);
 370         if (pa < mfn_base || pfn >= xen_info->nr_pages)
 371                 dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
 372         mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
 373 #ifdef DEBUG
 374         if (mfn_to_pfn_mapping[mfn] != pfn)
 375                 dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
 376                     pfn, mfn, mfn_to_pfn_mapping[mfn]);
 377 #endif
 378         return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
 379 }
 380 
 381 #endif  /* __xpv */
 382 
 383 x86pte_t
 384 get_pteval(paddr_t table, uint_t index)
 385 {
 386         if (pae_support)
 387                 return (((x86pte_t *)(uintptr_t)table)[index]);
 388         return (((x86pte32_t *)(uintptr_t)table)[index]);
 389 }
 390 
/*
 * Store "pteval" into the page table entry at "index" of the page table
 * at physical address "table".  On the hypervisor the tables are live,
 * so the write must go through a mmu_update hypercall; on bare metal the
 * tables aren't active yet and can be written directly.
 */
/*ARGSUSED*/
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
#ifdef __xpv
	mmu_update_t t;
	maddr_t mtable = pa_to_ma(table);
	int retcnt;

	/* ptr encodes both the machine address of the PTE and the op type */
	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
	t.val = pteval;
	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
		dboot_panic("HYPERVISOR_mmu_update() failed");
#else /* __xpv */
	uintptr_t tab_addr = (uintptr_t)table;

	if (pae_support)
		((x86pte_t *)tab_addr)[index] = pteval;
	else
		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
	/*
	 * When the top level is level 2 (32-bit PAE layout), reload cr3
	 * after updating it — presumably because PDPT entries are only
	 * picked up on a cr3 load; NOTE(review): confirm against the
	 * PAE paging rules.
	 */
	if (level == top_level && level == 2)
		reload_cr3();
#endif /* __xpv */
}
 415 
 416 paddr_t
 417 make_ptable(x86pte_t *pteval, uint_t level)
 418 {
 419         paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
 420 
 421         if (level == top_level && level == 2)
 422                 *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
 423         else
 424                 *pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
 425 
 426 #ifdef __xpv
 427         /* Remove write permission to the new page table. */
 428         if (HYPERVISOR_update_va_mapping(new_table,
 429             *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
 430                 dboot_panic("HYP_update_va_mapping error");
 431 #endif
 432 
 433         if (map_debug)
 434                 dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
 435                     PRIx64 "\n", level, (ulong_t)new_table, *pteval);
 436         return (new_table);
 437 }
 438 
 439 x86pte_t *
 440 map_pte(paddr_t table, uint_t index)
 441 {
 442         return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
 443 }
 444 
 445 /*
 446  * dump out the contents of page tables...
 447  */
/*
 * dump out the contents of page tables...
 *
 * This walks the whole page table tree iteratively: the save_* arrays
 * act as an explicit stack (one slot per paging level) and the gotos
 * emulate recursion by jumping back into the for loop with a new
 * table/index pair.
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t	l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	char *tabs = tablist + 3 - top_level;	/* one tab of indent per level */
	uint_t pa, pa1;
#if !defined(__xpv)
#define maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			/* descend: push current position, restart at child */
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;	/* the loop's ++index makes this 0 */
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		/* skip the run, keeping the last entry so it still prints */
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	/* finished a table: pop back to the parent level, if any */
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
 531 
 532 /*
 533  * Add a mapping for the machine page at the given virtual address.
 534  */
/*
 * Add a mapping for the machine page at the given virtual address.
 * "level" selects the page size: 0 maps a regular page; any higher
 * level sets PT_PAGESIZE for a large-page mapping at that level.
 */
static void
map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
{
	x86pte_t *ptep;
	x86pte_t pteval;

	pteval = ma | pte_bits;
	if (level > 0)
		pteval |= PT_PAGESIZE;	/* large page mapping */
	/* kernel-text-and-above mappings get the global bit if available */
	if (va >= target_kernel_text && pge_support)
		pteval |= PT_GLOBAL;

	if (map_debug && ma != va)
		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
		    " pte=0x%" PRIx64 " l=%d\n",
		    (uint64_t)ma, (uint64_t)va, pteval, level);

#if defined(__xpv)
	/*
	 * see if we can avoid find_pte() on the hypervisor
	 */
	if (HYPERVISOR_update_va_mapping(va, pteval,
	    UVMF_INVLPG | UVMF_LOCAL) == 0)
		return;
#endif

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables
	 */
	ptep = find_pte(va, NULL, level, 0);

	/*
	 * When paravirtualized, we must use hypervisor calls to modify the
	 * PTE, since paging is active. On real hardware we just write to
	 * the pagetables which aren't in use yet.
	 */
#if defined(__xpv)
	ptep = ptep;	/* shut lint up */
	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
		    (uint64_t)va, level, (uint64_t)ma, pteval);
#else
	if (va < 1024 * 1024)
		pteval |= PT_NOCACHE;		/* for video RAM */
	if (pae_support)
		*ptep = pteval;
	else
		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
#endif
}
 587 
 588 /*
 589  * Add a mapping for the physical page at the given virtual address.
 590  */
 591 static void
 592 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
 593 {
 594         map_ma_at_va(pa_to_ma(pa), va, level);
 595 }
 596 
 597 /*
 598  * This is called to remove start..end from the
 599  * possible range of PCI addresses.
 600  */
 601 const uint64_t pci_lo_limit = 0x00100000ul;
 602 const uint64_t pci_hi_limit = 0xfff00000ul;
 603 static void
 604 exclude_from_pci(uint64_t start, uint64_t end)
 605 {
 606         int i;
 607         int j;
 608         struct boot_memlist *ml;
 609 
 610         for (i = 0; i < pcimemlists_used; ++i) {
 611                 ml = &pcimemlists[i];
 612 
 613                 /* delete the entire range? */
 614                 if (start <= ml->addr && ml->addr + ml->size <= end) {
 615                         --pcimemlists_used;
 616                         for (j = i; j < pcimemlists_used; ++j)
 617                                 pcimemlists[j] = pcimemlists[j + 1];
 618                         --i;    /* to revisit the new one at this index */
 619                 }
 620 
 621                 /* split a range? */
 622                 else if (ml->addr < start && end < ml->addr + ml->size) {
 623 
 624                         ++pcimemlists_used;
 625                         if (pcimemlists_used > MAX_MEMLIST)
 626                                 dboot_panic("too many pcimemlists");
 627 
 628                         for (j = pcimemlists_used - 1; j > i; --j)
 629                                 pcimemlists[j] = pcimemlists[j - 1];
 630                         ml->size = start - ml->addr;
 631 
 632                         ++ml;
 633                         ml->size = (ml->addr + ml->size) - end;
 634                         ml->addr = end;
 635                         ++i;    /* skip on to next one */
 636                 }
 637 
 638                 /* cut memory off the start? */
 639                 else if (ml->addr < end && end < ml->addr + ml->size) {
 640                         ml->size -= end - ml->addr;
 641                         ml->addr = end;
 642                 }
 643 
 644                 /* cut memory off the end? */
 645                 else if (ml->addr <= start && start < ml->addr + ml->size) {
 646                         ml->size = start - ml->addr;
 647                 }
 648         }
 649 }
 650 
 651 /*
 652  * During memory allocation, find the highest address not used yet.
 653  */
 654 static void
 655 check_higher(paddr_t a)
 656 {
 657         if (a < next_avail_addr)
 658                 return;
 659         next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
 660         DBG(next_avail_addr);
 661 }
 662 
/*
 * Return the number of entries in the boot loader's memory map.  For
 * multiboot the count is computed once and cached in num_entries; on
 * the hypervisor the map buffer has a fixed capacity.
 */
static int
dboot_loader_mmap_entries(void)
{
#if !defined(__xpv)
	if (num_entries_set == B_TRUE)
		return (num_entries);	/* already counted */

	switch (multiboot_version) {
	case 1:
		DBG(mb_info->flags);
		/* flag bit 6: mmap_addr/mmap_length fields are valid */
		if (mb_info->flags & 0x40) {
			mb_memory_map_t *mmap;

			DBG(mb_info->mmap_addr);
			DBG(mb_info->mmap_length);
			/* don't let allocations overwrite the loader's map */
			check_higher(mb_info->mmap_addr + mb_info->mmap_length);

			/*
			 * Entries are variable length: each record has a
			 * leading "size" field that doesn't count itself.
			 */
			for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
			    (uint32_t)mmap < mb_info->mmap_addr +
			    mb_info->mmap_length;
			    mmap = (mb_memory_map_t *)((uint32_t)mmap +
			    mmap->size + sizeof (mmap->size)))
				++num_entries;

			num_entries_set = B_TRUE;
		}
		break;
	case 2:
		num_entries_set = B_TRUE;
		num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
		    mb2_mmap_tagp);
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (num_entries);
#else
	return (MAXMAPS);
#endif
}
 705 
 706 static uint32_t
 707 dboot_loader_mmap_get_type(int index)
 708 {
 709 #if !defined(__xpv)
 710         mb_memory_map_t *mp, *mpend;
 711         int i;
 712 
 713         switch (multiboot_version) {
 714         case 1:
 715                 mp = (mb_memory_map_t *)mb_info->mmap_addr;
 716                 mpend = (mb_memory_map_t *)
 717                     (mb_info->mmap_addr + mb_info->mmap_length);
 718 
 719                 for (i = 0; mp < mpend && i != index; i++)
 720                         mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
 721                             sizeof (mp->size));
 722                 if (mp >= mpend) {
 723                         dboot_panic("dboot_loader_mmap_get_type(): index "
 724                             "out of bounds: %d\n", index);
 725                 }
 726                 return (mp->type);
 727 
 728         case 2:
 729                 return (dboot_multiboot2_mmap_get_type(mb2_info,
 730                     mb2_mmap_tagp, index));
 731 
 732         default:
 733                 dboot_panic("Unknown multiboot version: %d\n",
 734                     multiboot_version);
 735                 break;
 736         }
 737         return (0);
 738 #else
 739         return (map_buffer[index].type);
 740 #endif
 741 }
 742 
 743 static uint64_t
 744 dboot_loader_mmap_get_base(int index)
 745 {
 746 #if !defined(__xpv)
 747         mb_memory_map_t *mp, *mpend;
 748         int i;
 749 
 750         switch (multiboot_version) {
 751         case 1:
 752                 mp = (mb_memory_map_t *)mb_info->mmap_addr;
 753                 mpend = (mb_memory_map_t *)
 754                     (mb_info->mmap_addr + mb_info->mmap_length);
 755 
 756                 for (i = 0; mp < mpend && i != index; i++)
 757                         mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
 758                             sizeof (mp->size));
 759                 if (mp >= mpend) {
 760                         dboot_panic("dboot_loader_mmap_get_base(): index "
 761                             "out of bounds: %d\n", index);
 762                 }
 763                 return (((uint64_t)mp->base_addr_high << 32) +
 764                     (uint64_t)mp->base_addr_low);
 765 
 766         case 2:
 767                 return (dboot_multiboot2_mmap_get_base(mb2_info,
 768                     mb2_mmap_tagp, index));
 769 
 770         default:
 771                 dboot_panic("Unknown multiboot version: %d\n",
 772                     multiboot_version);
 773                 break;
 774         }
 775         return (0);
 776 #else
 777         return (((uint64_t)map_buffer[index].base_addr_high << 32) +
 778             (uint64_t)map_buffer[index].base_addr_low);
 779 #endif
 780 }
 781 
 782 static uint64_t
 783 dboot_loader_mmap_get_length(int index)
 784 {
 785 #if !defined(__xpv)
 786         mb_memory_map_t *mp, *mpend;
 787         int i;
 788 
 789         switch (multiboot_version) {
 790         case 1:
 791                 mp = (mb_memory_map_t *)mb_info->mmap_addr;
 792                 mpend = (mb_memory_map_t *)
 793                     (mb_info->mmap_addr + mb_info->mmap_length);
 794 
 795                 for (i = 0; mp < mpend && i != index; i++)
 796                         mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
 797                             sizeof (mp->size));
 798                 if (mp >= mpend) {
 799                         dboot_panic("dboot_loader_mmap_get_length(): index "
 800                             "out of bounds: %d\n", index);
 801                 }
 802                 return (((uint64_t)mp->length_high << 32) +
 803                     (uint64_t)mp->length_low);
 804 
 805         case 2:
 806                 return (dboot_multiboot2_mmap_get_length(mb2_info,
 807                     mb2_mmap_tagp, index));
 808 
 809         default:
 810                 dboot_panic("Unknown multiboot version: %d\n",
 811                     multiboot_version);
 812                 break;
 813         }
 814         return (0);
 815 #else
 816         return (((uint64_t)map_buffer[index].length_high << 32) +
 817             (uint64_t)map_buffer[index].length_low);
 818 #endif
 819 }
 820 
 821 static void
 822 build_pcimemlists(void)
 823 {
 824         uint64_t page_offset = MMU_PAGEOFFSET;  /* needs to be 64 bits */
 825         uint64_t start;
 826         uint64_t end;
 827         int i, num;
 828 
 829         /*
 830          * initialize
 831          */
 832         pcimemlists[0].addr = pci_lo_limit;
 833         pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
 834         pcimemlists_used = 1;
 835 
 836         num = dboot_loader_mmap_entries();
 837         /*
 838          * Fill in PCI memlists.
 839          */
 840         for (i = 0; i < num; ++i) {
 841                 start = dboot_loader_mmap_get_base(i);
 842                 end = start + dboot_loader_mmap_get_length(i);
 843 
 844                 if (prom_debug)
 845                         dboot_printf("\ttype: %d %" PRIx64 "..%"
 846                             PRIx64 "\n", dboot_loader_mmap_get_type(i),
 847                             start, end);
 848 
 849                 /*
 850                  * page align start and end
 851                  */
 852                 start = (start + page_offset) & ~page_offset;
 853                 end &= ~page_offset;
 854                 if (end <= start)
 855                         continue;
 856 
 857                 exclude_from_pci(start, end);
 858         }
 859 
 860         /*
 861          * Finish off the pcimemlist
 862          */
 863         if (prom_debug) {
 864                 for (i = 0; i < pcimemlists_used; ++i) {
 865                         dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
 866                             PRIx64 "\n", pcimemlists[i].addr,
 867                             pcimemlists[i].addr + pcimemlists[i].size);
 868                 }
 869         }
 870         pcimemlists[0].next = 0;
 871         pcimemlists[0].prev = 0;
 872         for (i = 1; i < pcimemlists_used; ++i) {
 873                 pcimemlists[i].prev =
 874                     (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
 875                 pcimemlists[i].next = 0;
 876                 pcimemlists[i - 1].next =
 877                     (native_ptr_t)(uintptr_t)(pcimemlists + i);
 878         }
 879         bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
 880         DBG(bi->bi_pcimem);
 881 }
 882 
 883 #if defined(__xpv)
 884 /*
 885  * Initialize memory allocator stuff from hypervisor-supplied start info.
 886  */
static void
init_mem_alloc(void)
{
	int	local;	/* variables needed to find start region */
	paddr_t scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2Mb alignment.  That should be enough
	 * for the page tables we'll need to build.  The nucleus memory is
	 * allocated last and will be outside the addressible range.  We'll
	 * switch to new page tables before we unpack the kernel
	 */
	/* &local approximates the current top of stack */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr =
		    (native_ptr_t)(uintptr_t)xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = (native_ptr_t)(uintptr_t)NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	/* total pseudo-physical memory is the hypervisor's page count */
	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	/*
	 * Only the initial domain (dom0) may ask the hypervisor for the
	 * machine (host) memory map needed to build the PCI memlists.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists();
	}
}
 970 
 971 #else   /* !__xpv */
 972 
 973 static void
 974 dboot_multiboot1_xboot_consinfo(void)
 975 {
 976         bi->bi_framebuffer = NULL;
 977 }
 978 
 979 static void
 980 dboot_multiboot2_xboot_consinfo(void)
 981 {
 982         multiboot_tag_framebuffer_t *fb;
 983         fb = dboot_multiboot2_find_tag(mb2_info,
 984             MULTIBOOT_TAG_TYPE_FRAMEBUFFER);
 985         bi->bi_framebuffer = (native_ptr_t)(uintptr_t)fb;
 986 }
 987 
 988 static int
 989 dboot_multiboot_modcount(void)
 990 {
 991         switch (multiboot_version) {
 992         case 1:
 993                 return (mb_info->mods_count);
 994 
 995         case 2:
 996                 return (dboot_multiboot2_modcount(mb2_info));
 997 
 998         default:
 999                 dboot_panic("Unknown multiboot version: %d\n",
1000                     multiboot_version);
1001                 break;
1002         }
1003         return (0);
1004 }
1005 
1006 static uint32_t
1007 dboot_multiboot_modstart(int index)
1008 {
1009         switch (multiboot_version) {
1010         case 1:
1011                 return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);
1012 
1013         case 2:
1014                 return (dboot_multiboot2_modstart(mb2_info, index));
1015 
1016         default:
1017                 dboot_panic("Unknown multiboot version: %d\n",
1018                     multiboot_version);
1019                 break;
1020         }
1021         return (0);
1022 }
1023 
1024 static uint32_t
1025 dboot_multiboot_modend(int index)
1026 {
1027         switch (multiboot_version) {
1028         case 1:
1029                 return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);
1030 
1031         case 2:
1032                 return (dboot_multiboot2_modend(mb2_info, index));
1033 
1034         default:
1035                 dboot_panic("Unknown multiboot version: %d\n",
1036                     multiboot_version);
1037                 break;
1038         }
1039         return (0);
1040 }
1041 
1042 static char *
1043 dboot_multiboot_modcmdline(int index)
1044 {
1045         switch (multiboot_version) {
1046         case 1:
1047                 return ((char *)((mb_module_t *)
1048                     mb_info->mods_addr)[index].mod_name);
1049 
1050         case 2:
1051                 return (dboot_multiboot2_modcmdline(mb2_info, index));
1052 
1053         default:
1054                 dboot_panic("Unknown multiboot version: %d\n",
1055                     multiboot_version);
1056                 break;
1057         }
1058         return (0);
1059 }
1060 
1061 /*
1062  * Find the environment module for console setup.
1063  * Since we need the console to print early boot messages, the console is set up
1064  * before anything else and therefore we need to pick up the environment module
1065  * early too.
1066  *
1067  * Note, we just will search for and if found, will pass the env
1068  * module to console setup, the proper module list processing will happen later.
1069  */
1070 static void
1071 dboot_find_env(void)
1072 {
1073         int i, modcount;
1074         uint32_t mod_start, mod_end;
1075         char *cmdline;
1076 
1077         modcount = dboot_multiboot_modcount();
1078 
1079         for (i = 0; i < modcount; ++i) {
1080                 cmdline = dboot_multiboot_modcmdline(i);
1081                 if (cmdline == NULL)
1082                         continue;
1083 
1084                 if (strstr(cmdline, "type=environment") == NULL)
1085                         continue;
1086 
1087                 mod_start = dboot_multiboot_modstart(i);
1088                 mod_end = dboot_multiboot_modend(i);
1089                 modules[0].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
1090                 modules[0].bm_size = mod_end - mod_start;
1091                 modules[0].bm_name = (native_ptr_t)(uintptr_t)NULL;
1092                 modules[0].bm_hash = (native_ptr_t)(uintptr_t)NULL;
1093                 modules[0].bm_type = BMT_ENV;
1094                 bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1095                 bi->bi_module_cnt = 1;
1096                 return;
1097         }
1098 }
1099 
1100 static boolean_t
1101 dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
1102 {
1103         boolean_t rv = B_FALSE;
1104 
1105         switch (multiboot_version) {
1106         case 1:
1107                 if (mb_info->flags & 0x01) {
1108                         *lower = mb_info->mem_lower;
1109                         *upper = mb_info->mem_upper;
1110                         rv = B_TRUE;
1111                 }
1112                 break;
1113 
1114         case 2:
1115                 return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));
1116 
1117         default:
1118                 dboot_panic("Unknown multiboot version: %d\n",
1119                     multiboot_version);
1120                 break;
1121         }
1122         return (rv);
1123 }
1124 
/*
 * Convert one ASCII hex digit to its 4-bit value.  Panics on any
 * character outside [0-9a-fA-F].  The previous version only checked
 * lower bounds, so characters such as ':', 'G' or 'z' were silently
 * converted to bogus values instead of being rejected.
 */
static uint8_t
dboot_a2h(char v)
{
	if (v >= '0' && v <= '9')
		return (v - '0');
	if (v >= 'a' && v <= 'f')
		return (v - 'a' + 0xa);
	if (v >= 'A' && v <= 'F')
		return (v - 'A' + 0xa);

	dboot_panic("bad ASCII hex character %c\n", v);
	return (0);
}
1139 
1140 static void
1141 digest_a2h(const char *ascii, uint8_t *digest)
1142 {
1143         unsigned int i;
1144 
1145         for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1146                 digest[i] = dboot_a2h(ascii[i * 2]) << 4;
1147                 digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
1148         }
1149 }
1150 
1151 /*
1152  * Generate a SHA-1 hash of the first len bytes of image, and compare it with
1153  * the ASCII-format hash found in the 40-byte buffer at ascii.  If they
1154  * match, return 0, otherwise -1.  This works only for images smaller than
1155  * 4 GB, which should not be a problem.
1156  */
1157 static int
1158 check_image_hash(uint_t midx)
1159 {
1160         const char *ascii;
1161         const void *image;
1162         size_t len;
1163         SHA1_CTX ctx;
1164         uint8_t digest[SHA1_DIGEST_LENGTH];
1165         uint8_t baseline[SHA1_DIGEST_LENGTH];
1166         unsigned int i;
1167 
1168         ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
1169         image = (const void *)(uintptr_t)modules[midx].bm_addr;
1170         len = (size_t)modules[midx].bm_size;
1171 
1172         digest_a2h(ascii, baseline);
1173 
1174         SHA1Init(&ctx);
1175         SHA1Update(&ctx, image, len);
1176         SHA1Final(digest, &ctx);
1177 
1178         for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1179                 if (digest[i] != baseline[i])
1180                         return (-1);
1181         }
1182 
1183         return (0);
1184 }
1185 
1186 static const char *
1187 type_to_str(boot_module_type_t type)
1188 {
1189         switch (type) {
1190         case BMT_ROOTFS:
1191                 return ("rootfs");
1192         case BMT_FILE:
1193                 return ("file");
1194         case BMT_HASH:
1195                 return ("hash");
1196         case BMT_ENV:
1197                 return ("environment");
1198         default:
1199                 return ("unknown");
1200         }
1201 }
1202 
1203 static void
1204 check_images(void)
1205 {
1206         uint_t i;
1207         char displayhash[SHA1_ASCII_LENGTH + 1];
1208 
1209         for (i = 0; i < modules_used; i++) {
1210                 if (prom_debug) {
1211                         dboot_printf("module #%d: name %s type %s "
1212                             "addr %lx size %lx\n",
1213                             i, (char *)(uintptr_t)modules[i].bm_name,
1214                             type_to_str(modules[i].bm_type),
1215                             (ulong_t)modules[i].bm_addr,
1216                             (ulong_t)modules[i].bm_size);
1217                 }
1218 
1219                 if (modules[i].bm_type == BMT_HASH ||
1220                     modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) {
1221                         DBG_MSG("module has no hash; skipping check\n");
1222                         continue;
1223                 }
1224                 (void) memcpy(displayhash,
1225                     (void *)(uintptr_t)modules[i].bm_hash,
1226                     SHA1_ASCII_LENGTH);
1227                 displayhash[SHA1_ASCII_LENGTH] = '\0';
1228                 if (prom_debug) {
1229                         dboot_printf("checking expected hash [%s]: ",
1230                             displayhash);
1231                 }
1232 
1233                 if (check_image_hash(i) != 0)
1234                         dboot_panic("hash mismatch!\n");
1235                 else
1236                         DBG_MSG("OK\n");
1237         }
1238 }
1239 
1240 /*
1241  * Determine the module's starting address, size, name, and type, and fill the
1242  * boot_modules structure.  This structure is used by the bop code, except for
1243  * hashes which are checked prior to transferring control to the kernel.
1244  */
static void
process_module(int midx)
{
	uint32_t mod_start = dboot_multiboot_modstart(midx);
	uint32_t mod_end = dboot_multiboot_modend(midx);
	char *cmdline = dboot_multiboot_modcmdline(midx);
	char *p, *q;

	/* the module's end bumps the high-water mark for dboot allocations */
	check_higher(mod_end);
	if (prom_debug) {
		dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
		    midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
	}

	if (mod_start > mod_end) {
		dboot_panic("module #%d: module start address 0x%lx greater "
		    "than end address 0x%lx", midx,
		    (ulong_t)mod_start, (ulong_t)mod_end);
	}

	/*
	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
	 * the address of the last valid byte in a module plus 1 as mod_end.
	 * This is of course a bug; the multiboot specification simply states
	 * that mod_start and mod_end "contain the start and end addresses of
	 * the boot module itself" which is pretty obviously not what GRUB is
	 * doing.  However, fixing it requires that not only this code be
	 * changed but also that other code consuming this value and values
	 * derived from it be fixed, and that the kernel and GRUB must either
	 * both have the bug or neither.  While there are a lot of combinations
	 * that will work, there are also some that won't, so for simplicity
	 * we'll just cope with the bug.  That means we won't actually hash the
	 * byte at mod_end, and we will expect that mod_end for the hash file
	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
	 * hash plus a newline for each module).  We set bm_size to the true
	 * correct number of bytes in each module, achieving exactly this.
	 */

	/* defaults: unnamed hash-less plain file */
	modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
	modules[midx].bm_size = mod_end - mod_start;
	modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
	modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL;
	modules[midx].bm_type = BMT_FILE;

	if (cmdline == NULL) {
		modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
		return;
	}

	/*
	 * The first whitespace-separated token of the command line is the
	 * default module name; the remaining tokens are key=value options.
	 */
	p = cmdline;
	modules[midx].bm_name =
	    (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");

	while (p != NULL) {
		q = strsep(&p, " \t\f\n\r");
		/* name=... overrides the default name if non-empty */
		if (strncmp(q, "name=", 5) == 0) {
			if (q[5] != '\0' && !isspace(q[5])) {
				modules[midx].bm_name =
				    (native_ptr_t)(uintptr_t)(q + 5);
			}
			continue;
		}

		/* type=rootfs|hash|environment|file selects the module type */
		if (strncmp(q, "type=", 5) == 0) {
			if (q[5] == '\0' || isspace(q[5]))
				continue;
			q += 5;
			if (strcmp(q, "rootfs") == 0) {
				modules[midx].bm_type = BMT_ROOTFS;
			} else if (strcmp(q, "hash") == 0) {
				modules[midx].bm_type = BMT_HASH;
			} else if (strcmp(q, "environment") == 0) {
				modules[midx].bm_type = BMT_ENV;
			} else if (strcmp(q, "file") != 0) {
				dboot_printf("\tmodule #%d: unknown module "
				    "type '%s'; defaulting to 'file'",
				    midx, q);
			}
			continue;
		}

		/* hash=... records the in-line ASCII hash if non-empty */
		if (strncmp(q, "hash=", 5) == 0) {
			if (q[5] != '\0' && !isspace(q[5])) {
				modules[midx].bm_hash =
				    (native_ptr_t)(uintptr_t)(q + 5);
			}
			continue;
		}

		dboot_printf("ignoring unknown option '%s'\n", q);
	}
}
1337 
1338 /*
1339  * Backward compatibility: if there are exactly one or two modules, both
1340  * of type 'file' and neither with an embedded hash value, we have been
1341  * given the legacy style modules.  In this case we need to treat the first
1342  * module as a rootfs and the second as a hash referencing that module.
1343  * Otherwise, even if the configuration is invalid, we assume that the
1344  * operator knows what he's doing or at least isn't being bitten by this
1345  * interface change.
1346  */
1347 static void
1348 fixup_modules(void)
1349 {
1350         if (modules_used == 0 || modules_used > 2)
1351                 return;
1352 
1353         if (modules[0].bm_type != BMT_FILE ||
1354             modules_used > 1 && modules[1].bm_type != BMT_FILE) {
1355                 return;
1356         }
1357 
1358         if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL ||
1359             modules_used > 1 &&
1360             modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1361                 return;
1362         }
1363 
1364         modules[0].bm_type = BMT_ROOTFS;
1365         if (modules_used > 1) {
1366                 modules[1].bm_type = BMT_HASH;
1367                 modules[1].bm_name = modules[0].bm_name;
1368         }
1369 }
1370 
1371 /*
1372  * For modules that do not have assigned hashes but have a separate hash module,
1373  * find the assigned hash module and set the primary module's bm_hash to point
1374  * to the hash data from that module.  We will then ignore modules of type
1375  * BMT_HASH from this point forward.
1376  */
1377 static void
1378 assign_module_hashes(void)
1379 {
1380         uint_t i, j;
1381 
1382         for (i = 0; i < modules_used; i++) {
1383                 if (modules[i].bm_type == BMT_HASH ||
1384                     modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1385                         continue;
1386                 }
1387 
1388                 for (j = 0; j < modules_used; j++) {
1389                         if (modules[j].bm_type != BMT_HASH ||
1390                             strcmp((char *)(uintptr_t)modules[j].bm_name,
1391                             (char *)(uintptr_t)modules[i].bm_name) != 0) {
1392                                 continue;
1393                         }
1394 
1395                         if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1396                                 dboot_printf("Short hash module of length "
1397                                     "0x%lx bytes; ignoring\n",
1398                                     (ulong_t)modules[j].bm_size);
1399                         } else {
1400                                 modules[i].bm_hash = modules[j].bm_addr;
1401                         }
1402                         break;
1403                 }
1404         }
1405 }
1406 
1407 /*
1408  * Walk through the module information finding the last used address.
1409  * The first available address will become the top level page table.
1410  */
1411 static void
1412 dboot_process_modules(void)
1413 {
1414         int i, modcount;
1415         extern char _end[];
1416 
1417         DBG_MSG("\nFinding Modules\n");
1418         modcount = dboot_multiboot_modcount();
1419         if (modcount > MAX_BOOT_MODULES) {
1420                 dboot_panic("Too many modules (%d) -- the maximum is %d.",
1421                     modcount, MAX_BOOT_MODULES);
1422         }
1423         /*
1424          * search the modules to find the last used address
1425          * we'll build the module list while we're walking through here
1426          */
1427         check_higher((paddr_t)(uintptr_t)&_end);
1428         for (i = 0; i < modcount; ++i) {
1429                 process_module(i);
1430                 modules_used++;
1431         }
1432         bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1433         DBG(bi->bi_modules);
1434         bi->bi_module_cnt = modcount;
1435         DBG(bi->bi_module_cnt);
1436 
1437         fixup_modules();
1438         assign_module_hashes();
1439         check_images();
1440 }
1441 
1442 /*
1443  * We then build the phys_install memlist from the multiboot information.
1444  */
1445 static void
1446 dboot_process_mmap(void)
1447 {
1448         uint64_t start;
1449         uint64_t end;
1450         uint64_t page_offset = MMU_PAGEOFFSET;  /* needs to be 64 bits */
1451         uint32_t lower, upper;
1452         int i, mmap_entries;
1453 
1454         /*
1455          * Walk through the memory map from multiboot and build our memlist
1456          * structures. Note these will have native format pointers.
1457          */
1458         DBG_MSG("\nFinding Memory Map\n");
1459         num_entries = 0;
1460         num_entries_set = B_FALSE;
1461         max_mem = 0;
1462         if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
1463                 for (i = 0; i < mmap_entries; i++) {
1464                         uint32_t type = dboot_loader_mmap_get_type(i);
1465                         start = dboot_loader_mmap_get_base(i);
1466                         end = start + dboot_loader_mmap_get_length(i);
1467 
1468                         if (prom_debug)
1469                                 dboot_printf("\ttype: %d %" PRIx64 "..%"
1470                                     PRIx64 "\n", type, start, end);
1471 
1472                         /*
1473                          * page align start and end
1474                          */
1475                         start = (start + page_offset) & ~page_offset;
1476                         end &= ~page_offset;
1477                         if (end <= start)
1478                                 continue;
1479 
1480                         /*
1481                          * only type 1 is usable RAM
1482                          */
1483                         switch (type) {
1484                         case 1:
1485                                 if (end > max_mem)
1486                                         max_mem = end;
1487                                 memlists[memlists_used].addr = start;
1488                                 memlists[memlists_used].size = end - start;
1489                                 ++memlists_used;
1490                                 if (memlists_used > MAX_MEMLIST)
1491                                         dboot_panic("too many memlists");
1492                                 break;
1493                         case 2:
1494                                 rsvdmemlists[rsvdmemlists_used].addr = start;
1495                                 rsvdmemlists[rsvdmemlists_used].size =
1496                                     end - start;
1497                                 ++rsvdmemlists_used;
1498                                 if (rsvdmemlists_used > MAX_MEMLIST)
1499                                         dboot_panic("too many rsvdmemlists");
1500                                 break;
1501                         default:
1502                                 continue;
1503                         }
1504                 }
1505                 build_pcimemlists();
1506         } else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
1507                 DBG(lower);
1508                 memlists[memlists_used].addr = 0;
1509                 memlists[memlists_used].size = lower * 1024;
1510                 ++memlists_used;
1511                 DBG(upper);
1512                 memlists[memlists_used].addr = 1024 * 1024;
1513                 memlists[memlists_used].size = upper * 1024;
1514                 ++memlists_used;
1515 
1516                 /*
1517                  * Old platform - assume I/O space at the end of memory.
1518                  */
1519                 pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
1520                 pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1521                 pcimemlists[0].next = 0;
1522                 pcimemlists[0].prev = 0;
1523                 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1524                 DBG(bi->bi_pcimem);
1525         } else {
1526                 dboot_panic("No memory info from boot loader!!!");
1527         }
1528 
1529         /*
1530          * finish processing the physinstall list
1531          */
1532         sort_physinstall();
1533 
1534         /*
1535          * build bios reserved mem lists
1536          */
1537         build_rsvdmemlists();
1538 }
1539 
1540 /*
1541  * The highest address is used as the starting point for dboot's simple
1542  * memory allocator.
1543  *
1544  * Finding the highest address in case of Multiboot 1 protocol is
1545  * quite painful in the sense that some information provided by
1546  * the multiboot info structure points to BIOS data, and some to RAM.
1547  *
1548  * The module list was processed and checked already by dboot_process_modules(),
1549  * so we will check the command line string and the memory map.
1550  *
1551  * This list of to be checked items is based on our current knowledge of
1552  * allocations made by grub1 and will need to be reviewed if there
1553  * are updates about the information provided by Multiboot 1.
1554  *
1555  * In the case of the Multiboot 2, our life is much simpler, as the MB2
1556  * information tag list is one contiguous chunk of memory.
1557  */
1558 static paddr_t
1559 dboot_multiboot1_highest_addr(void)
1560 {
1561         paddr_t addr = (paddr_t)(uintptr_t)NULL;
1562         char *cmdl = (char *)mb_info->cmdline;
1563 
1564         if (mb_info->flags & MB_INFO_CMDLINE)
1565                 addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));
1566 
1567         if (mb_info->flags & MB_INFO_MEM_MAP)
1568                 addr = MAX(addr,
1569                     ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
1570         return (addr);
1571 }
1572 
1573 static void
1574 dboot_multiboot_highest_addr(void)
1575 {
1576         paddr_t addr;
1577 
1578         switch (multiboot_version) {
1579         case 1:
1580                 addr = dboot_multiboot1_highest_addr();
1581                 if (addr != (paddr_t)(uintptr_t)NULL)
1582                         check_higher(addr);
1583                 break;
1584         case 2:
1585                 addr = dboot_multiboot2_highest_addr(mb2_info);
1586                 if (addr != (paddr_t)(uintptr_t)NULL)
1587                         check_higher(addr);
1588                 break;
1589         default:
1590                 dboot_panic("Unknown multiboot version: %d\n",
1591                     multiboot_version);
1592                 break;
1593         }
1594 }
1595 
1596 /*
1597  * Walk the boot loader provided information and find the highest free address.
1598  */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");
	/* build the module list and bump the high-water mark past them */
	dboot_process_modules();
	/* build physinstall/reserved/PCI memlists from the memory map */
	dboot_process_mmap();
	/* account for other loader-owned memory (cmdline, mmap, MB2 info) */
	dboot_multiboot_highest_addr();
}
1607 
1608 static int
1609 dboot_same_guids(efi_guid_t *g1, efi_guid_t *g2)
1610 {
1611         int i;
1612 
1613         if (g1->time_low != g2->time_low)
1614                 return (0);
1615         if (g1->time_mid != g2->time_mid)
1616                 return (0);
1617         if (g1->time_hi_and_version != g2->time_hi_and_version)
1618                 return (0);
1619         if (g1->clock_seq_hi_and_reserved != g2->clock_seq_hi_and_reserved)
1620                 return (0);
1621         if (g1->clock_seq_low != g2->clock_seq_low)
1622                 return (0);
1623 
1624         for (i = 0; i < 6; i++) {
1625                 if (g1->node_addr[i] != g2->node_addr[i])
1626                         return (0);
1627         }
1628         return (1);
1629 }
1630 
1631 static void
1632 process_efi32(EFI_SYSTEM_TABLE32 *efi)
1633 {
1634         uint32_t entries;
1635         EFI_CONFIGURATION_TABLE32 *config;
1636         int i;
1637 
1638         entries = efi->NumberOfTableEntries;
1639         config = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1640             efi->ConfigurationTable;
1641 
1642         for (i = 0; i < entries; i++) {
1643                 if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
1644                         bi->bi_smbios = (native_ptr_t)(uintptr_t)
1645                             config[i].VendorTable;
1646                 }
1647                 if (bi->bi_smbios == NULL &&
1648                     dboot_same_guids(&config[i].VendorGuid, &smbios)) {
1649                         bi->bi_smbios = (native_ptr_t)(uintptr_t)
1650                             config[i].VendorTable;
1651                 }
1652                 if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
1653                         bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1654                             config[i].VendorTable;
1655                 }
1656                 if (bi->bi_acpi_rsdp == NULL &&
1657                     dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
1658                         bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1659                             config[i].VendorTable;
1660                 }
1661         }
1662 }
1663 
1664 static void
1665 process_efi64(EFI_SYSTEM_TABLE64 *efi)
1666 {
1667         uint64_t entries;
1668         EFI_CONFIGURATION_TABLE64 *config;
1669         int i;
1670 
1671         entries = efi->NumberOfTableEntries;
1672         config = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1673             efi->ConfigurationTable;
1674 
1675         for (i = 0; i < entries; i++) {
1676                 if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
1677                         bi->bi_smbios = (native_ptr_t)(uintptr_t)
1678                             config[i].VendorTable;
1679                 }
1680                 if (bi->bi_smbios == NULL &&
1681                     dboot_same_guids(&config[i].VendorGuid, &smbios)) {
1682                         bi->bi_smbios = (native_ptr_t)(uintptr_t)
1683                             config[i].VendorTable;
1684                 }
1685                 /* Prefer acpi v2+ over v1. */
1686                 if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
1687                         bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1688                             config[i].VendorTable;
1689                 }
1690                 if (bi->bi_acpi_rsdp == NULL &&
1691                     dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
1692                         bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1693                             config[i].VendorTable;
1694                 }
1695         }
1696 }
1697 
/*
 * Locate firmware tables (EFI system table, SMBIOS, ACPI RSDP) from the
 * multiboot 2 information tags and record them in the boot info.
 */
static void
dboot_multiboot_get_fwtables(void)
{
	multiboot_tag_new_acpi_t *nacpitagp;
	multiboot_tag_old_acpi_t *oacpitagp;
	multiboot_tag_efi64_t *efi64tagp = NULL;
	multiboot_tag_efi32_t *efi32tagp = NULL;

	/* no fw tables from multiboot 1 */
	if (multiboot_version != 2)
		return;

	/* prefer a 64-bit EFI system table tag; fall back to 32-bit */
	efi64tagp = (multiboot_tag_efi64_t *)
	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_EFI64);
	if (efi64tagp != NULL) {
		bi->bi_uefi_arch = XBI_UEFI_ARCH_64;
		bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
		    efi64tagp->mb_pointer;
		process_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
		    efi64tagp->mb_pointer);
	} else {
		efi32tagp = (multiboot_tag_efi32_t *)
		    dboot_multiboot2_find_tag(mb2_info,
		    MULTIBOOT_TAG_TYPE_EFI32);
		if (efi32tagp != NULL) {
			bi->bi_uefi_arch = XBI_UEFI_ARCH_32;
			bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
			    efi32tagp->mb_pointer;
			process_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
			    efi32tagp->mb_pointer);
		}
	}

	/*
	 * The ACPI RSDP can be found by scanning the BIOS memory areas or
	 * from the EFI system table. The boot loader may pass in the address
	 * it found the ACPI tables at.
	 */
	nacpitagp = (multiboot_tag_new_acpi_t *)
	    dboot_multiboot2_find_tag(mb2_info,
	    MULTIBOOT_TAG_TYPE_ACPI_NEW);
	oacpitagp = (multiboot_tag_old_acpi_t *)
	    dboot_multiboot2_find_tag(mb2_info,
	    MULTIBOOT_TAG_TYPE_ACPI_OLD);

	/*
	 * An RSDP handed to us directly in a multiboot2 ACPI tag overrides
	 * any value process_efi32()/process_efi64() recorded above from the
	 * EFI configuration table; new (v2+) beats old (v1).
	 */
	if (nacpitagp != NULL) {
		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
		    &nacpitagp->mb_rsdp[0];
	} else if (oacpitagp != NULL) {
		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
		    &oacpitagp->mb_rsdp[0];
	}
}
1751 
/*
 * Print an EFI revision word (major in the high 16 bits, minor in the
 * low 16) followed by a newline.  The minor value encodes tenths in its
 * low decimal digit; that digit is printed as a third component only
 * when it is non-zero.
 */
static void
dboot_print_efi_version(uint32_t ver)
{
	int minor;

	dboot_printf("%d.", EFI_REV_MAJOR(ver));

	minor = EFI_REV_MINOR(ver);
	if ((minor % 10) == 0) {
		dboot_printf("%d\n", minor / 10);
	} else {
		dboot_printf("%d.%d\n", minor / 10, minor % 10);
	}
}
1767 
/*
 * Debug aid: dump the interesting contents of a 32-bit EFI system table
 * to the boot console -- header signature, system and firmware revisions,
 * firmware vendor string, and the vendor GUID of every configuration
 * table entry.  Only called when prom_debug is set.
 */
static void
print_efi32(EFI_SYSTEM_TABLE32 *efi)
{
        uint16_t *data;
        EFI_CONFIGURATION_TABLE32 *conf;
        int i;

        dboot_printf("EFI32 signature: %llx\n",
            (unsigned long long)efi->Hdr.Signature);
        dboot_printf("EFI system version: ");
        dboot_print_efi_version(efi->Hdr.Revision);
        dboot_printf("EFI system vendor: ");
        /*
         * FirmwareVendor points at a NUL-terminated array of 16-bit
         * characters (presumably UCS-2 per UEFI -- confirm); only the
         * low byte of each character is printed.
         */
        data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
        for (i = 0; data[i] != 0; i++)
                dboot_printf("%c", (char)data[i]);
        dboot_printf("\nEFI firmware revision: ");
        dboot_print_efi_version(efi->FirmwareRevision);
        dboot_printf("EFI system table number of entries: %d\n",
            efi->NumberOfTableEntries);
        conf = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
            efi->ConfigurationTable;
        /* One line per configuration table entry: index + GUID fields. */
        for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
                dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
                    conf[i].VendorGuid.time_low,
                    conf[i].VendorGuid.time_mid,
                    conf[i].VendorGuid.time_hi_and_version,
                    conf[i].VendorGuid.clock_seq_hi_and_reserved,
                    conf[i].VendorGuid.clock_seq_low);
                dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
                    conf[i].VendorGuid.node_addr[0],
                    conf[i].VendorGuid.node_addr[1],
                    conf[i].VendorGuid.node_addr[2],
                    conf[i].VendorGuid.node_addr[3],
                    conf[i].VendorGuid.node_addr[4],
                    conf[i].VendorGuid.node_addr[5]);
        }
}
1805 
/*
 * Debug aid: dump the interesting contents of a 64-bit EFI system table
 * to the boot console.  Mirror image of print_efi32(); only called when
 * prom_debug is set.
 */
static void
print_efi64(EFI_SYSTEM_TABLE64 *efi)
{
        uint16_t *data;
        EFI_CONFIGURATION_TABLE64 *conf;
        int i;

        dboot_printf("EFI64 signature: %llx\n",
            (unsigned long long)efi->Hdr.Signature);
        dboot_printf("EFI system version: ");
        dboot_print_efi_version(efi->Hdr.Revision);
        dboot_printf("EFI system vendor: ");
        /*
         * FirmwareVendor points at a NUL-terminated array of 16-bit
         * characters (presumably UCS-2 per UEFI -- confirm); only the
         * low byte of each character is printed.
         */
        data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
        for (i = 0; data[i] != 0; i++)
                dboot_printf("%c", (char)data[i]);
        dboot_printf("\nEFI firmware revision: ");
        dboot_print_efi_version(efi->FirmwareRevision);
        dboot_printf("EFI system table number of entries: %lld\n",
            efi->NumberOfTableEntries);
        conf = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
            efi->ConfigurationTable;
        /* One line per configuration table entry: index + GUID fields. */
        for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
                dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
                    conf[i].VendorGuid.time_low,
                    conf[i].VendorGuid.time_mid,
                    conf[i].VendorGuid.time_hi_and_version,
                    conf[i].VendorGuid.clock_seq_hi_and_reserved,
                    conf[i].VendorGuid.clock_seq_low);
                dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
                    conf[i].VendorGuid.node_addr[0],
                    conf[i].VendorGuid.node_addr[1],
                    conf[i].VendorGuid.node_addr[2],
                    conf[i].VendorGuid.node_addr[3],
                    conf[i].VendorGuid.node_addr[4],
                    conf[i].VendorGuid.node_addr[5]);
        }
}
1843 #endif /* !__xpv */
1844 
1845 /*
1846  * Simple memory allocator, allocates aligned physical memory.
1847  * Note that startup_kernel() only allocates memory, never frees.
1848  * Memory usage just grows in an upward direction.
1849  */
1850 static void *
1851 do_mem_alloc(uint32_t size, uint32_t align)
1852 {
1853         uint_t i;
1854         uint64_t best;
1855         uint64_t start;
1856         uint64_t end;
1857 
1858         /*
1859          * make sure size is a multiple of pagesize
1860          */
1861         size = RNDUP(size, MMU_PAGESIZE);
1862         next_avail_addr = RNDUP(next_avail_addr, align);
1863 
1864         /*
1865          * XXPV fixme joe
1866          *
1867          * a really large bootarchive that causes you to run out of memory
1868          * may cause this to blow up
1869          */
1870         /* LINTED E_UNEXPECTED_UINT_PROMOTION */
1871         best = (uint64_t)-size;
1872         for (i = 0; i < memlists_used; ++i) {
1873                 start = memlists[i].addr;
1874 #if defined(__xpv)
1875                 start += mfn_base;
1876 #endif
1877                 end = start + memlists[i].size;
1878 
1879                 /*
1880                  * did we find the desired address?
1881                  */
1882                 if (start <= next_avail_addr && next_avail_addr + size <= end) {
1883                         best = next_avail_addr;
1884                         goto done;
1885                 }
1886 
1887                 /*
1888                  * if not is this address the best so far?
1889                  */
1890                 if (start > next_avail_addr && start < best &&
1891                     RNDUP(start, align) + size <= end)
1892                         best = RNDUP(start, align);
1893         }
1894 
1895         /*
1896          * We didn't find exactly the address we wanted, due to going off the
1897          * end of a memory region. Return the best found memory address.
1898          */
1899 done:
1900         next_avail_addr = best + size;
1901 #if defined(__xpv)
1902         if (next_avail_addr > scratch_end)
1903                 dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1904                     "0x%lx", (ulong_t)next_avail_addr,
1905                     (ulong_t)scratch_end);
1906 #endif
1907         (void) memset((void *)(uintptr_t)best, 0, size);
1908         return ((void *)(uintptr_t)best);
1909 }
1910 
1911 void *
1912 mem_alloc(uint32_t size)
1913 {
1914         return (do_mem_alloc(size, MMU_PAGESIZE));
1915 }
1916 
1917 
1918 /*
1919  * Build page tables to map all of memory used so far as well as the kernel.
1920  */
1921 static void
1922 build_page_tables(void)
1923 {
1924         uint32_t psize;
1925         uint32_t level;
1926         uint32_t off;
1927         uint64_t start;
1928 #if !defined(__xpv)
1929         uint32_t i;
1930         uint64_t end;
1931 #endif  /* __xpv */
1932 
1933         /*
1934          * If we're on metal, we need to create the top level pagetable.
1935          */
1936 #if defined(__xpv)
1937         top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1938 #else /* __xpv */
1939         top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1940 #endif /* __xpv */
1941         DBG((uintptr_t)top_page_table);
1942 
1943         /*
1944          * Determine if we'll use large mappings for kernel, then map it.
1945          */
1946         if (largepage_support) {
1947                 psize = lpagesize;
1948                 level = 1;
1949         } else {
1950                 psize = MMU_PAGESIZE;
1951                 level = 0;
1952         }
1953 
1954         DBG_MSG("Mapping kernel\n");
1955         DBG(ktext_phys);
1956         DBG(target_kernel_text);
1957         DBG(ksize);
1958         DBG(psize);
1959         for (off = 0; off < ksize; off += psize)
1960                 map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1961 
1962         /*
1963          * The kernel will need a 1 page window to work with page tables
1964          */
1965         bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1966         DBG(bi->bi_pt_window);
1967         bi->bi_pte_to_pt_window =
1968             (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1969         DBG(bi->bi_pte_to_pt_window);
1970 
1971 #if defined(__xpv)
1972         if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1973                 /* If this is a domU we're done. */
1974                 DBG_MSG("\nPage tables constructed\n");
1975                 return;
1976         }
1977 #endif /* __xpv */
1978 
1979         /*
1980          * We need 1:1 mappings for the lower 1M of memory to access
1981          * BIOS tables used by a couple of drivers during boot.
1982          *
1983          * The following code works because our simple memory allocator
1984          * only grows usage in an upwards direction.
1985          *
1986          * Note that by this point in boot some mappings for low memory
1987          * may already exist because we've already accessed device in low
1988          * memory.  (Specifically the video frame buffer and keyboard
1989          * status ports.)  If we're booting on raw hardware then GRUB
1990          * created these mappings for us.  If we're booting under a
1991          * hypervisor then we went ahead and remapped these devices into
1992          * memory allocated within dboot itself.
1993          */
1994         if (map_debug)
1995                 dboot_printf("1:1 map pa=0..1Meg\n");
1996         for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1997 #if defined(__xpv)
1998                 map_ma_at_va(start, start, 0);
1999 #else /* __xpv */
2000                 map_pa_at_va(start, start, 0);
2001 #endif /* __xpv */
2002         }
2003 
2004 #if !defined(__xpv)
2005 
2006         for (i = 0; i < memlists_used; ++i) {
2007                 start = memlists[i].addr;
2008                 end = start + memlists[i].size;
2009 
2010                 if (map_debug)
2011                         dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
2012                             start, end);
2013                 while (start < end && start < next_avail_addr) {
2014                         map_pa_at_va(start, start, 0);
2015                         start += MMU_PAGESIZE;
2016                 }
2017                 if (start >= next_avail_addr)
2018                         break;
2019         }
2020 
2021         /*
2022          * Map framebuffer memory as PT_NOCACHE as this is memory from a
2023          * device and therefore must not be cached.
2024          */
2025         if (bi->bi_framebuffer != NULL) {
2026                 multiboot_tag_framebuffer_t *fb;
2027                 fb = (multiboot_tag_framebuffer_t *)(uintptr_t)
2028                     bi->bi_framebuffer;
2029 
2030                 start = fb->framebuffer_common.framebuffer_addr;
2031                 end = start + fb->framebuffer_common.framebuffer_height *
2032                     fb->framebuffer_common.framebuffer_pitch;
2033 
2034                 pte_bits |= PT_NOCACHE;
2035                 while (start < end) {
2036                         map_pa_at_va(start, start, 0);
2037                         start += MMU_PAGESIZE;
2038                 }
2039                 pte_bits &= ~PT_NOCACHE;
2040         }
2041 #endif /* !__xpv */
2042 
2043         DBG_MSG("\nPage tables constructed\n");
2044 }
2045 
/*
 * Panic message printed when the obsolete direct "multiboot" GRUB entry
 * is detected on the kernel command line (see startup_kernel()).
 */
#define NO_MULTIBOOT    \
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
2052 
/*
 * Set up the global xboot_info pointer (bi) within the boot_info scratch
 * area and collect console information from the boot loader, so that the
 * console can be initialized before the full module scan.
 */
static void
dboot_init_xboot_consinfo(void)
{
        uintptr_t addr;
        /*
         * boot info must be 16 byte aligned for 64 bit kernel ABI
         */
        addr = (uintptr_t)boot_info;
        addr = (addr + 0xf) & ~0xf;
        bi = (struct xboot_info *)addr;

#if !defined(__xpv)
        /* Console info layout differs between multiboot 1 and 2. */
        switch (multiboot_version) {
        case 1:
                dboot_multiboot1_xboot_consinfo();
                break;
        case 2:
                dboot_multiboot2_xboot_consinfo();
                break;
        default:
                dboot_panic("Unknown multiboot version: %d\n",
                    multiboot_version);
                break;
        }
        /*
         * Lookup environment module for the console. Complete module list
         * will be built after console setup.
         */
        dboot_find_env();
#endif
}
2084 
2085 /*
2086  * Set up basic data from the boot loader.
2087  * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support
2088  * 32-bit dboot code setup used to set up and start 64-bit kernel.
2089  * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and
2090  * start 64-bit illumos kernel.
2091  */
2092 static void
2093 dboot_loader_init(void)
2094 {
2095 #if !defined(__xpv)
2096         mb_info = NULL;
2097         mb2_info = NULL;
2098 
2099         switch (mb_magic) {
2100         case MB_BOOTLOADER_MAGIC:
2101                 multiboot_version = 1;
2102                 mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
2103 #if defined(_BOOT_TARGET_amd64)
2104                 load_addr = mb_header.load_addr;
2105 #endif
2106                 break;
2107 
2108         case MULTIBOOT2_BOOTLOADER_MAGIC:
2109                 multiboot_version = 2;
2110                 mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
2111                 mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
2112 #if defined(_BOOT_TARGET_amd64)
2113                 load_addr = mb2_load_addr;
2114 #endif
2115                 break;
2116 
2117         default:
2118                 dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
2119                 break;
2120         }
2121 #endif  /* !defined(__xpv) */
2122 }
2123 
/*
 * Extract the kernel command line from [multi]boot information.
 * Always returns a non-NULL string (possibly empty) so callers may use
 * string operations on it without checking.
 */
static char *
dboot_loader_cmdline(void)
{
        char *line = NULL;

#if defined(__xpv)
        line = (char *)xen_info->cmd_line;
#else /* __xpv */

        switch (multiboot_version) {
        case 1:
                /* The cmdline field is only valid if the flag says so. */
                if (mb_info->flags & MB_INFO_CMDLINE)
                        line = (char *)mb_info->cmdline;
                break;

        case 2:
                line = dboot_multiboot2_cmdline(mb2_info);
                break;

        default:
                dboot_panic("Unknown multiboot version: %d\n",
                    multiboot_version);
                break;
        }

#endif /* __xpv */

        /*
         * Make sure we have valid pointer so the string operations
         * will not crash us.
         * NOTE(review): this returns a string literal through char *;
         * callers must treat the result as read-only.
         */
        if (line == NULL)
                line = "";

        return (line);
}
2161 
2162 static char *
2163 dboot_loader_name(void)
2164 {
2165 #if defined(__xpv)
2166         return (NULL);
2167 #else /* __xpv */
2168         multiboot_tag_string_t *tag;
2169 
2170         switch (multiboot_version) {
2171         case 1:
2172                 return ((char *)mb_info->boot_loader_name);
2173 
2174         case 2:
2175                 tag = dboot_multiboot2_find_tag(mb2_info,
2176                     MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
2177                 return (tag->mb_string);
2178         default:
2179                 dboot_panic("Unknown multiboot version: %d\n",
2180                     multiboot_version);
2181                 break;
2182         }
2183 
2184         return (NULL);
2185 #endif /* __xpv */
2186 }
2187 
2188 /*
2189  * startup_kernel has a pretty simple job. It builds pagetables which reflect
2190  * 1:1 mappings for all memory in use. It then also adds mappings for
2191  * the kernel nucleus at virtual address of target_kernel_text using large page
2192  * mappings. The page table pages are also accessible at 1:1 mapped
2193  * virtual addresses.
2194  */
2195 /*ARGSUSED*/
2196 void
2197 startup_kernel(void)
2198 {
2199         char *cmdline;
2200         char *bootloader;
2201 #if defined(__xpv)
2202         physdev_set_iopl_t set_iopl;
2203 #endif /* __xpv */
2204 
2205         dboot_loader_init();
2206         /*
2207          * At this point we are executing in a 32 bit real mode.
2208          */
2209 
2210         bootloader = dboot_loader_name();
2211         cmdline = dboot_loader_cmdline();
2212 
2213 #if defined(__xpv)
2214         /*
2215          * For dom0, before we initialize the console subsystem we'll
2216          * need to enable io operations, so set I/O priveldge level to 1.
2217          */
2218         if (DOMAIN_IS_INITDOMAIN(xen_info)) {
2219                 set_iopl.iopl = 1;
2220                 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
2221         }
2222 #endif /* __xpv */
2223 
2224         dboot_init_xboot_consinfo();
2225         bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
2226         bcons_init(bi);
2227 
2228         prom_debug = (find_boot_prop("prom_debug") != NULL);
2229         map_debug = (find_boot_prop("map_debug") != NULL);
2230 
2231 #if !defined(__xpv)
2232         dboot_multiboot_get_fwtables();
2233 #endif
2234         DBG_MSG("\n\nillumos prekernel set: ");
2235         DBG_MSG(cmdline);
2236         DBG_MSG("\n");
2237 
2238         if (bootloader != NULL && prom_debug) {
2239                 dboot_printf("Kernel loaded by: %s\n", bootloader);
2240 #if !defined(__xpv)
2241                 dboot_printf("Using multiboot %d boot protocol.\n",
2242                     multiboot_version);
2243 #endif
2244         }
2245 
2246         if (strstr(cmdline, "multiboot") != NULL) {
2247                 dboot_panic(NO_MULTIBOOT);
2248         }
2249 
2250         DBG((uintptr_t)bi);
2251 #if !defined(__xpv)
2252         DBG((uintptr_t)mb_info);
2253         DBG((uintptr_t)mb2_info);
2254         if (mb2_info != NULL)
2255                 DBG(mb2_info->mbi_total_size);
2256         DBG(bi->bi_acpi_rsdp);
2257         DBG(bi->bi_smbios);
2258         DBG(bi->bi_uefi_arch);
2259         DBG(bi->bi_uefi_systab);
2260 
2261         if (bi->bi_uefi_systab && prom_debug) {
2262                 if (bi->bi_uefi_arch == XBI_UEFI_ARCH_64) {
2263                         print_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
2264                             bi->bi_uefi_systab);
2265                 } else {
2266                         print_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
2267                             bi->bi_uefi_systab);
2268                 }
2269         }
2270 #endif
2271 
2272         /*
2273          * Need correct target_kernel_text value
2274          */
2275 #if defined(_BOOT_TARGET_amd64)
2276         target_kernel_text = KERNEL_TEXT_amd64;
2277 #elif defined(__xpv)
2278         target_kernel_text = KERNEL_TEXT_i386_xpv;
2279 #else
2280         target_kernel_text = KERNEL_TEXT_i386;
2281 #endif
2282         DBG(target_kernel_text);
2283 
2284 #if defined(__xpv)
2285 
2286         /*
2287          * XXPV Derive this stuff from CPUID / what the hypervisor has enabled
2288          */
2289 
2290 #if defined(_BOOT_TARGET_amd64)
2291         /*
2292          * 64-bit hypervisor.
2293          */
2294         amd64_support = 1;
2295         pae_support = 1;
2296 
2297 #else   /* _BOOT_TARGET_amd64 */
2298 
2299         /*
2300          * See if we are running on a PAE Hypervisor
2301          */
2302         {
2303                 xen_capabilities_info_t caps;
2304 
2305                 if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
2306                         dboot_panic("HYPERVISOR_xen_version(caps) failed");
2307                 caps[sizeof (caps) - 1] = 0;
2308                 if (prom_debug)
2309                         dboot_printf("xen capabilities %s\n", caps);
2310                 if (strstr(caps, "x86_32p") != NULL)
2311                         pae_support = 1;
2312         }
2313 
2314 #endif  /* _BOOT_TARGET_amd64 */
2315         {
2316                 xen_platform_parameters_t p;
2317 
2318                 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
2319                         dboot_panic("HYPERVISOR_xen_version(parms) failed");
2320                 DBG(p.virt_start);
2321                 mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
2322         }
2323 
2324         /*
2325          * The hypervisor loads stuff starting at 1Gig
2326          */
2327         mfn_base = ONE_GIG;
2328         DBG(mfn_base);
2329 
2330         /*
2331          * enable writable page table mode for the hypervisor
2332          */
2333         if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2334             VMASST_TYPE_writable_pagetables) < 0)
2335                 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
2336 
2337         /*
2338          * check for NX support
2339          */
2340         if (pae_support) {
2341                 uint32_t eax = 0x80000000;
2342                 uint32_t edx = get_cpuid_edx(&eax);
2343 
2344                 if (eax >= 0x80000001) {
2345                         eax = 0x80000001;
2346                         edx = get_cpuid_edx(&eax);
2347                         if (edx & CPUID_AMD_EDX_NX)
2348                                 NX_support = 1;
2349                 }
2350         }
2351 
2352 #if !defined(_BOOT_TARGET_amd64)
2353 
2354         /*
2355          * The 32-bit hypervisor uses segmentation to protect itself from
2356          * guests. This means when a guest attempts to install a flat 4GB
2357          * code or data descriptor the 32-bit hypervisor will protect itself
2358          * by silently shrinking the segment such that if the guest attempts
2359          * any access where the hypervisor lives a #gp fault is generated.
2360          * The problem is that some applications expect a full 4GB flat
2361          * segment for their current thread pointer and will use negative
2362          * offset segment wrap around to access data. TLS support in linux
2363          * brand is one example of this.
2364          *
2365          * The 32-bit hypervisor can catch the #gp fault in these cases
2366          * and emulate the access without passing the #gp fault to the guest
2367          * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
2368          * Seems like this should have been the default.
2369          * Either way, we want the hypervisor -- and not Solaris -- to deal
2370          * to deal with emulating these accesses.
2371          */
2372         if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2373             VMASST_TYPE_4gb_segments) < 0)
2374                 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
2375 #endif  /* !_BOOT_TARGET_amd64 */
2376 
2377 #else   /* __xpv */
2378 
2379         /*
2380          * use cpuid to enable MMU features
2381          */
2382         if (have_cpuid()) {
2383                 uint32_t eax, edx;
2384 
2385                 eax = 1;
2386                 edx = get_cpuid_edx(&eax);
2387                 if (edx & CPUID_INTC_EDX_PSE)
2388                         largepage_support = 1;
2389                 if (edx & CPUID_INTC_EDX_PGE)
2390                         pge_support = 1;
2391                 if (edx & CPUID_INTC_EDX_PAE)
2392                         pae_support = 1;
2393 
2394                 eax = 0x80000000;
2395                 edx = get_cpuid_edx(&eax);
2396                 if (eax >= 0x80000001) {
2397                         eax = 0x80000001;
2398                         edx = get_cpuid_edx(&eax);
2399                         if (edx & CPUID_AMD_EDX_LM)
2400                                 amd64_support = 1;
2401                         if (edx & CPUID_AMD_EDX_NX)
2402                                 NX_support = 1;
2403                 }
2404         } else {
2405                 dboot_printf("cpuid not supported\n");
2406         }
2407 #endif /* __xpv */
2408 
2409 
2410 #if defined(_BOOT_TARGET_amd64)
2411         if (amd64_support == 0)
2412                 dboot_panic("long mode not supported, rebooting");
2413         else if (pae_support == 0)
2414                 dboot_panic("long mode, but no PAE; rebooting");
2415 #else
2416         /*
2417          * Allow the command line to over-ride use of PAE for 32 bit.
2418          */
2419         if (strstr(cmdline, "disablePAE=true") != NULL) {
2420                 pae_support = 0;
2421                 NX_support = 0;
2422                 amd64_support = 0;
2423         }
2424 #endif
2425 
2426         /*
2427          * initialize the simple memory allocator
2428          */
2429         init_mem_alloc();
2430 
2431 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
2432         /*
2433          * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
2434          */
2435         if (max_mem < FOUR_GIG && NX_support == 0)
2436                 pae_support = 0;
2437 #endif
2438 
2439         /*
2440          * configure mmu information
2441          */
2442         if (pae_support) {
2443                 shift_amt = shift_amt_pae;
2444                 ptes_per_table = 512;
2445                 pte_size = 8;
2446                 lpagesize = TWO_MEG;
2447 #if defined(_BOOT_TARGET_amd64)
2448                 top_level = 3;
2449 #else
2450                 top_level = 2;
2451 #endif
2452         } else {
2453                 pae_support = 0;
2454                 NX_support = 0;
2455                 shift_amt = shift_amt_nopae;
2456                 ptes_per_table = 1024;
2457                 pte_size = 4;
2458                 lpagesize = FOUR_MEG;
2459                 top_level = 1;
2460         }
2461 
2462         DBG(pge_support);
2463         DBG(NX_support);
2464         DBG(largepage_support);
2465         DBG(amd64_support);
2466         DBG(top_level);
2467         DBG(pte_size);
2468         DBG(ptes_per_table);
2469         DBG(lpagesize);
2470 
2471 #if defined(__xpv)
2472         ktext_phys = ONE_GIG;           /* from UNIX Mapfile */
2473 #else
2474         ktext_phys = FOUR_MEG;          /* from UNIX Mapfile */
2475 #endif
2476 
2477 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
2478         /*
2479          * For grub, copy kernel bits from the ELF64 file to final place.
2480          */
2481         DBG_MSG("\nAllocating nucleus pages.\n");
2482         ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
2483 
2484         if (ktext_phys == 0)
2485                 dboot_panic("failed to allocate aligned kernel memory");
2486         DBG(load_addr);
2487         if (dboot_elfload64(load_addr) != 0)
2488                 dboot_panic("failed to parse kernel ELF image, rebooting");
2489 #endif
2490 
2491         DBG(ktext_phys);
2492 
2493         /*
2494          * Allocate page tables.
2495          */
2496         build_page_tables();
2497 
2498         /*
2499          * return to assembly code to switch to running kernel
2500          */
2501         entry_addr_low = (uint32_t)target_kernel_text;
2502         DBG(entry_addr_low);
2503         bi->bi_use_largepage = largepage_support;
2504         bi->bi_use_pae = pae_support;
2505         bi->bi_use_pge = pge_support;
2506         bi->bi_use_nx = NX_support;
2507 
2508 #if defined(__xpv)
2509 
2510         bi->bi_next_paddr = next_avail_addr - mfn_base;
2511         DBG(bi->bi_next_paddr);
2512         bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2513         DBG(bi->bi_next_vaddr);
2514 
2515         /*
2516          * unmap unused pages in start area to make them available for DMA
2517          */
2518         while (next_avail_addr < scratch_end) {
2519                 (void) HYPERVISOR_update_va_mapping(next_avail_addr,
2520                     0, UVMF_INVLPG | UVMF_LOCAL);
2521                 next_avail_addr += MMU_PAGESIZE;
2522         }
2523 
2524         bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
2525         DBG((uintptr_t)HYPERVISOR_shared_info);
2526         bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
2527         bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
2528 
2529 #else /* __xpv */
2530 
2531         bi->bi_next_paddr = next_avail_addr;
2532         DBG(bi->bi_next_paddr);
2533         bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2534         DBG(bi->bi_next_vaddr);
2535         bi->bi_mb_version = multiboot_version;
2536 
2537         switch (multiboot_version) {
2538         case 1:
2539                 bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info;
2540                 break;
2541         case 2:
2542                 bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info;
2543                 break;
2544         default:
2545                 dboot_panic("Unknown multiboot version: %d\n",
2546                     multiboot_version);
2547                 break;
2548         }
2549         bi->bi_top_page_table = (uintptr_t)top_page_table;
2550 
2551 #endif /* __xpv */
2552 
2553         bi->bi_kseg_size = FOUR_MEG;
2554         DBG(bi->bi_kseg_size);
2555 
2556 #ifndef __xpv
2557         if (map_debug)
2558                 dump_tables();
2559 #endif
2560 
2561         DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
2562 }