/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2013 Joyent, Inc. All rights reserved.
 */


#include <sys/types.h>
#include <sys/machparam.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/mach_mmu.h>
#include <sys/multiboot.h>
#include <sys/multiboot2.h>
#include <sys/multiboot2_impl.h>
#include <sys/sysmacros.h>
#include <sys/sha1.h>
#include <util/string.h>
#include <util/strtolctype.h>
#include <sys/efi.h>

#if defined(__xpv)

#include <sys/hypervisor.h>
uintptr_t xen_virt_start;
pfn_t *mfn_to_pfn_mapping;

#else /* !__xpv */

extern multiboot_header_t mb_header;
extern uint32_t mb2_load_addr;
extern int have_cpuid(void);

#endif /* !__xpv */

#include <sys/inttypes.h>
#include <sys/bootinfo.h>
#include <sys/mach_mmu.h>
#include <sys/boot_console.h>

#include "dboot_asm.h"
#include "dboot_printf.h"
#include "dboot_xboot.h"
#include "dboot_elfload.h"

#define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)

/*
 * This file contains code that runs to transition us from either a multiboot
 * compliant loader (32-bit non-paging) or an XPV domain loader to regular
 * kernel execution. Its task is to set up the kernel memory image and page
 * tables.
 *
 * The code executes as:
 *  - 32 bits under GRUB (for 32 or 64 bit Solaris)
 *  - a 32 bit program for the 32-bit PV hypervisor
 *  - a 64 bit program for the 64-bit PV hypervisor (at least for now)
 *
 * Under the PV hypervisor, we must create mappings for any memory beyond the
 * initial start of day allocation (such as the kernel itself).
 *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
 */

/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * This is the target address (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is set up in assembler before entering startup_kernel().
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation
 */
static paddr_t next_avail_addr = 0;

#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;	/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 */
uint32_t mb_magic;		/* magic from boot loader */
uint32_t mb_addr;		/* multiboot info package from loader */
int multiboot_version;
multiboot_info_t *mb_info;
multiboot2_info_header_t *mb2_info;
multiboot_tag_mmap_t *mb2_mmap_tagp;
int num_entries;		/* mmap entry count */
boolean_t num_entries_set;	/* is mmap entry count set */
uintptr_t load_addr;

/* cannot be automatic variables because of alignment */
static efi_guid_t smbios3 = SMBIOS3_TABLE_GUID;
static efi_guid_t smbios = SMBIOS_TABLE_GUID;
static efi_guid_t acpi2 = EFI_ACPI_TABLE_GUID;
static efi_guid_t acpi1 = ACPI_10_TABLE_GUID;
#endif	/* __xpv */

/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * This should match what's in the bootloader. It's arbitrary, but GRUB
 * in particular has limitations on how much space it can use before it
 * stops working properly. This should be enough.
 */
struct boot_modules modules[MAX_BOOT_MODULES];
uint_t modules_used = 0;

#ifdef __xpv
/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 */
typedef struct {
	uint32_t base_addr_low;
	uint32_t base_addr_high;
	uint32_t length_low;
	uint32_t length_high;
	uint32_t type;
} mmap_t;

/*
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 */
#define	MAXMAPS	100
static mmap_t map_buffer[MAXMAPS];
#else
typedef mb_memory_map_t mmap_t;
#endif

/*
 * Debugging macros
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;

static char noname[2] = "-";

/*
 * Either hypervisor-specific or grub-specific code builds the initial
 * memlists. This code does the sort/merge/link for final use.
 */
static void
sort_physinstall(void)
{
	int i;
#if !defined(__xpv)
	int j;
	struct boot_memlist tmp;

	/*
	 * Now sort the memlists, in case they weren't in order.
	 * Yeah, this is a bubble sort; small, simple and easy to get right.
	 */
	DBG_MSG("Sorting phys-installed list\n");
	for (j = memlists_used - 1; j > 0; --j) {
		for (i = 0; i < j; ++i) {
			if (memlists[i].addr < memlists[i + 1].addr)
				continue;
			tmp = memlists[i];
			memlists[i] = memlists[i + 1];
			memlists[i + 1] = tmp;
		}
	}

	/*
	 * Merge any memlists that don't have holes between them.
	 */
	for (i = 0; i < memlists_used - 1; ++i) {
		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
			continue;

		if (prom_debug)
			dboot_printf(
			    "merging mem segs %" PRIx64 "...%" PRIx64
			    " w/ %" PRIx64 "...%" PRIx64 "\n",
			    memlists[i].addr,
			    memlists[i].addr + memlists[i].size,
			    memlists[i + 1].addr,
			    memlists[i + 1].addr + memlists[i + 1].size);

		memlists[i].size += memlists[i + 1].size;
		for (j = i + 1; j < memlists_used - 1; ++j)
			memlists[j] = memlists[j + 1];
		--memlists_used;
		DBG(memlists_used);
		--i;	/* after merging we need to re-examine, so do this */
	}
#endif	/* __xpv */

	if (prom_debug) {
		dboot_printf("\nFinal memlists:\n");
		for (i = 0; i < memlists_used; ++i) {
			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
		}
	}

	/*
	 * link together the memlists with native size pointers
	 */
	memlists[0].next = 0;
	memlists[0].prev = 0;
	for (i = 1; i < memlists_used; ++i) {
		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
		memlists[i].next = 0;
		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
	}
	bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
	DBG(bi->bi_phys_install);
}

/*
 * build bios reserved memlists
 */
static void
build_rsvdmemlists(void)
{
	int i;

	rsvdmemlists[0].next = 0;
	rsvdmemlists[0].prev = 0;
	for (i = 1; i < rsvdmemlists_used; ++i) {
		rsvdmemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
		rsvdmemlists[i].next = 0;
		rsvdmemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
	}
	bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
	DBG(bi->bi_rsvdmem);
}

#if defined(__xpv)

/*
 * halt on the hypervisor after a delay to drain console output
 */
void
dboot_halt(void)
{
	uint_t i = 10000;

	while (--i)
		(void) HYPERVISOR_yield();
	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
}

/*
 * From a machine address, find the corresponding pseudo-physical address.
 * Pseudo-physical addresses are contiguous and run from mfn_base in each VM.
 * Machine addresses are the real underlying hardware addresses.
 * These are needed for page table entries. Note that this routine is
 * poorly protected. A bad value of "ma" will cause a page fault.
 */
paddr_t
ma_to_pa(maddr_t ma)
{
	ulong_t pgoff = ma & MMU_PAGEOFFSET;
	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
	paddr_t pa;

	if (pfn >= xen_info->nr_pages)
		return (-(paddr_t)1);
	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
#ifdef DEBUG
	if (ma != pa_to_ma(pa))
		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
#endif
	return (pa);
}

/*
 * From a pseudo-physical address, find the corresponding machine address.
 */
maddr_t
pa_to_ma(paddr_t pa)
{
	pfn_t pfn;
	ulong_t mfn;

	pfn = mmu_btop(pa - mfn_base);
	if (pa < mfn_base || pfn >= xen_info->nr_pages)
		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
#ifdef DEBUG
	if (mfn_to_pfn_mapping[mfn] != pfn)
		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
#endif
	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
}

#endif	/* __xpv */

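/*
 * Accessors for page table entries. With PAE (and hence for an amd64
 * kernel) entries are 64 bits wide; without PAE they are 32 bits.
 * pae_support selects the entry size at run time.
 */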
x86pte_t
get_pteval(paddr_t table, uint_t index)
{
	if (pae_support)
		return (((x86pte_t *)(uintptr_t)table)[index]);
	return (((x86pte32_t *)(uintptr_t)table)[index]);
}

/*ARGSUSED*/
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
#ifdef __xpv
	mmu_update_t t;
	maddr_t mtable = pa_to_ma(table);
	int retcnt;

	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
	t.val = pteval;
	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
		dboot_panic("HYPERVISOR_mmu_update() failed");
#else	/* __xpv */
	uintptr_t tab_addr = (uintptr_t)table;

	if (pae_support)
		((x86pte_t *)tab_addr)[index] = pteval;
	else
		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
	if (level == top_level && level == 2)
		reload_cr3();
#endif	/* __xpv */
}

paddr_t
make_ptable(x86pte_t *pteval, uint_t level)
{
	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);

	if (level == top_level && level == 2)
		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
	else
		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;

#ifdef __xpv
	/* Remove write permission to the new page table. */
	if (HYPERVISOR_update_va_mapping(new_table,
	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("HYP_update_va_mapping error");
#endif

	if (map_debug)
		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
	return (new_table);
}

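/*
 * Return a pointer to the page table entry at the given index within the
 * table; simple pointer arithmetic over pte_size-wide entries.
 */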
x86pte_t *
map_pte(paddr_t table, uint_t index)
{
	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
}

/*
 * dump out the contents of page tables...
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	char *tabs = tablist + 3 - top_level;
	uint_t pa, pa1;
#if !defined(__xpv)
#define	maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}

/*
 * Add a mapping for the machine page at the given virtual address.
 */
static void
map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
{
	x86pte_t *ptep;
	x86pte_t pteval;

	pteval = ma | pte_bits;
	if (level > 0)
		pteval |= PT_PAGESIZE;
	if (va >= target_kernel_text && pge_support)
		pteval |= PT_GLOBAL;

	if (map_debug && ma != va)
		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
		    " pte=0x%" PRIx64 " l=%d\n",
		    (uint64_t)ma, (uint64_t)va, pteval, level);

#if defined(__xpv)
	/*
	 * see if we can avoid find_pte() on the hypervisor
	 */
	if (HYPERVISOR_update_va_mapping(va, pteval,
	    UVMF_INVLPG | UVMF_LOCAL) == 0)
		return;
#endif

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables
	 */
	ptep = find_pte(va, NULL, level, 0);

	/*
	 * When paravirtualized, we must use hypervisor calls to modify the
	 * PTE, since paging is active. On real hardware we just write to
	 * the pagetables which aren't in use yet.
	 */
#if defined(__xpv)
	ptep = ptep;	/* shut lint up */
	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("mmu_update failed-map_ma_at_va va=0x%" PRIx64
		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
		    (uint64_t)va, level, (uint64_t)ma, pteval);
#else
	if (va < 1024 * 1024)
		pteval |= PT_NOCACHE;		/* for video RAM */
	if (pae_support)
		*ptep = pteval;
	else
		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
#endif
}

/*
 * Add a mapping for the physical page at the given virtual address.
 */
static void
map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
{
	map_ma_at_va(pa_to_ma(pa), va, level);
}

/*
 * This is called to remove start..end from the
 * possible range of PCI addresses.
 */
const uint64_t pci_lo_limit = 0x00100000ul;
const uint64_t pci_hi_limit = 0xfff00000ul;
static void
exclude_from_pci(uint64_t start, uint64_t end)
{
	int i;
	int j;
	struct boot_memlist *ml;

	for (i = 0; i < pcimemlists_used; ++i) {
		ml = &pcimemlists[i];

		/* delete the entire range? */
		if (start <= ml->addr && ml->addr + ml->size <= end) {
			--pcimemlists_used;
			for (j = i; j < pcimemlists_used; ++j)
				pcimemlists[j] = pcimemlists[j + 1];
			--i;	/* to revisit the new one at this index */
		}

		/* split a range? */
		else if (ml->addr < start && end < ml->addr + ml->size) {

			++pcimemlists_used;
			if (pcimemlists_used > MAX_MEMLIST)
				dboot_panic("too many pcimemlists");

			for (j = pcimemlists_used - 1; j > i; --j)
				pcimemlists[j] = pcimemlists[j - 1];
			ml->size = start - ml->addr;

			++ml;
			ml->size = (ml->addr + ml->size) - end;
			ml->addr = end;
			++i;	/* skip on to next one */
		}

		/* cut memory off the start? */
		else if (ml->addr < end && end < ml->addr + ml->size) {
			ml->size -= end - ml->addr;
			ml->addr = end;
		}

		/* cut memory off the end? */
		else if (ml->addr <= start && start < ml->addr + ml->size) {
			ml->size = start - ml->addr;
		}
	}
}

/*
 * During memory allocation, find the highest address not used yet.
 */
static void
check_higher(paddr_t a)
{
	if (a < next_avail_addr)
		return;
	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
	DBG(next_avail_addr);
}

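/*
 * Return the number of entries in the loader-provided memory map. For
 * multiboot 1 the count is computed by walking the variable-length entry
 * list; for multiboot 2 the tag code provides it. The result is cached
 * in num_entries.
 */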
static int
dboot_loader_mmap_entries(void)
{
#if !defined(__xpv)
	if (num_entries_set == B_TRUE)
		return (num_entries);

	switch (multiboot_version) {
	case 1:
		DBG(mb_info->flags);
		if (mb_info->flags & 0x40) {
			mb_memory_map_t *mmap;

			DBG(mb_info->mmap_addr);
			DBG(mb_info->mmap_length);
			check_higher(mb_info->mmap_addr + mb_info->mmap_length);

			for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
			    (uint32_t)mmap < mb_info->mmap_addr +
			    mb_info->mmap_length;
			    mmap = (mb_memory_map_t *)((uint32_t)mmap +
			    mmap->size + sizeof (mmap->size)))
				++num_entries;

			num_entries_set = B_TRUE;
		}
		break;
	case 2:
		num_entries_set = B_TRUE;
		num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
		    mb2_mmap_tagp);
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (num_entries);
#else
	return (MAXMAPS);
#endif
}

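/*
 * Return the type field of the memory map entry at the given index.
 */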
static uint32_t
dboot_loader_mmap_get_type(int index)
{
#if !defined(__xpv)
	mb_memory_map_t *mp, *mpend;
	int i;

	switch (multiboot_version) {
	case 1:
		mp = (mb_memory_map_t *)mb_info->mmap_addr;
		mpend = (mb_memory_map_t *)
		    (mb_info->mmap_addr + mb_info->mmap_length);

		for (i = 0; mp < mpend && i != index; i++)
			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
			    sizeof (mp->size));
		if (mp >= mpend) {
			dboot_panic("dboot_loader_mmap_get_type(): index "
			    "out of bounds: %d\n", index);
		}
		return (mp->type);

	case 2:
		return (dboot_multiboot2_mmap_get_type(mb2_info,
		    mb2_mmap_tagp, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
#else
	return (map_buffer[index].type);
#endif
}

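/*
 * Return the 64-bit base address of the memory map entry at the given
 * index, assembled from its high and low 32-bit halves.
 */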
static uint64_t
dboot_loader_mmap_get_base(int index)
{
#if !defined(__xpv)
	mb_memory_map_t *mp, *mpend;
	int i;

	switch (multiboot_version) {
	case 1:
		mp = (mb_memory_map_t *)mb_info->mmap_addr;
		mpend = (mb_memory_map_t *)
		    (mb_info->mmap_addr + mb_info->mmap_length);

		for (i = 0; mp < mpend && i != index; i++)
			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
			    sizeof (mp->size));
		if (mp >= mpend) {
			dboot_panic("dboot_loader_mmap_get_base(): index "
			    "out of bounds: %d\n", index);
		}
		return (((uint64_t)mp->base_addr_high << 32) +
		    (uint64_t)mp->base_addr_low);

	case 2:
		return (dboot_multiboot2_mmap_get_base(mb2_info,
		    mb2_mmap_tagp, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
#else
	return (((uint64_t)map_buffer[index].base_addr_high << 32) +
	    (uint64_t)map_buffer[index].base_addr_low);
#endif
}

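/*
 * Return the 64-bit length of the memory map entry at the given index.
 */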
static uint64_t
dboot_loader_mmap_get_length(int index)
{
#if !defined(__xpv)
	mb_memory_map_t *mp, *mpend;
	int i;

	switch (multiboot_version) {
	case 1:
		mp = (mb_memory_map_t *)mb_info->mmap_addr;
		mpend = (mb_memory_map_t *)
		    (mb_info->mmap_addr + mb_info->mmap_length);

		for (i = 0; mp < mpend && i != index; i++)
			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
			    sizeof (mp->size));
		if (mp >= mpend) {
			dboot_panic("dboot_loader_mmap_get_length(): index "
			    "out of bounds: %d\n", index);
		}
		return (((uint64_t)mp->length_high << 32) +
		    (uint64_t)mp->length_low);

	case 2:
		return (dboot_multiboot2_mmap_get_length(mb2_info,
		    mb2_mmap_tagp, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
#else
	return (((uint64_t)map_buffer[index].length_high << 32) +
	    (uint64_t)map_buffer[index].length_low);
#endif
}

static void
build_pcimemlists(void)
{
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	uint64_t start;
	uint64_t end;
	int i, num;

	/*
	 * initialize
	 */
	pcimemlists[0].addr = pci_lo_limit;
	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
	pcimemlists_used = 1;

	num = dboot_loader_mmap_entries();
	/*
	 * Fill in PCI memlists.
	 */
	for (i = 0; i < num; ++i) {
		start = dboot_loader_mmap_get_base(i);
		end = start + dboot_loader_mmap_get_length(i);

		if (prom_debug)
			dboot_printf("\ttype: %d %" PRIx64 "..%"
			    PRIx64 "\n", dboot_loader_mmap_get_type(i),
			    start, end);

		/*
		 * page align start and end
		 */
		start = (start + page_offset) & ~page_offset;
		end &= ~page_offset;
		if (end <= start)
			continue;

		exclude_from_pci(start, end);
	}

	/*
	 * Finish off the pcimemlist
	 */
	if (prom_debug) {
		for (i = 0; i < pcimemlists_used; ++i) {
			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
			    PRIx64 "\n", pcimemlists[i].addr,
			    pcimemlists[i].addr + pcimemlists[i].size);
		}
	}
	pcimemlists[0].next = 0;
	pcimemlists[0].prev = 0;
	for (i = 1; i < pcimemlists_used; ++i) {
		pcimemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
		pcimemlists[i].next = 0;
		pcimemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
	}
	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
	DBG(bi->bi_pcimem);
}

#if defined(__xpv)
/*
 * Initialize memory allocator stuff from hypervisor-supplied start info.
 */
static void
init_mem_alloc(void)
{
	int local;	/* variables needed to find start region */
	paddr_t scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2MB alignment. That should be enough
	 * for the page tables we'll need to build. The nucleus memory is
	 * allocated last and will be outside the addressable range. We'll
	 * switch to new page tables before we unpack the kernel.
	 */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr =
		    (native_ptr_t)(uintptr_t)xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = (native_ptr_t)(uintptr_t)NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists();
	}
}

#else	/* !__xpv */

static void
dboot_multiboot1_xboot_consinfo(void)
{
	bi->bi_framebuffer = NULL;
}

static void
dboot_multiboot2_xboot_consinfo(void)
{
	multiboot_tag_framebuffer_t *fb;
	fb = dboot_multiboot2_find_tag(mb2_info,
	    MULTIBOOT_TAG_TYPE_FRAMEBUFFER);
	bi->bi_framebuffer = (native_ptr_t)(uintptr_t)fb;
}

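/*
 * Return the number of boot modules reported by the loader, hiding the
 * multiboot 1 vs. 2 differences.
 */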
static int
dboot_multiboot_modcount(void)
{
	switch (multiboot_version) {
	case 1:
		return (mb_info->mods_count);

	case 2:
		return (dboot_multiboot2_modcount(mb2_info));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
}

static uint32_t
dboot_multiboot_modstart(int index)
{
	switch (multiboot_version) {
	case 1:
		return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);

	case 2:
		return (dboot_multiboot2_modstart(mb2_info, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
}

static uint32_t
dboot_multiboot_modend(int index)
{
	switch (multiboot_version) {
	case 1:
		return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);

	case 2:
		return (dboot_multiboot2_modend(mb2_info, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
}

static char *
dboot_multiboot_modcmdline(int index)
{
	switch (multiboot_version) {
	case 1:
		return ((char *)((mb_module_t *)
		    mb_info->mods_addr)[index].mod_name);

	case 2:
		return (dboot_multiboot2_modcmdline(mb2_info, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
}

/*
 * Find the environment module for console setup.
 * Since we need the console to print early boot messages, the console is set
 * up before anything else and therefore we need to pick up the environment
 * module early too.
 *
 * Note that we only search for the environment module here; if one is found,
 * it is passed to console setup. The proper module list processing will
 * happen later.
 */
static void
dboot_find_env(void)
{
	int i, modcount;
	uint32_t mod_start, mod_end;
	char *cmdline;

	modcount = dboot_multiboot_modcount();

	for (i = 0; i < modcount; ++i) {
		cmdline = dboot_multiboot_modcmdline(i);
		if (cmdline == NULL)
			continue;

		if (strstr(cmdline, "type=environment") == NULL)
			continue;

		mod_start = dboot_multiboot_modstart(i);
		mod_end = dboot_multiboot_modend(i);
		modules[0].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
		modules[0].bm_size = mod_end - mod_start;
		modules[0].bm_name = (native_ptr_t)(uintptr_t)NULL;
		modules[0].bm_hash = (native_ptr_t)(uintptr_t)NULL;
		modules[0].bm_type = BMT_ENV;
		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
		bi->bi_module_cnt = 1;
		return;
	}
}

static boolean_t
dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
{
	boolean_t rv = B_FALSE;

	switch (multiboot_version) {
	case 1:
		if (mb_info->flags & 0x01) {
			*lower = mb_info->mem_lower;
			*upper = mb_info->mem_upper;
			rv = B_TRUE;
		}
		break;

	case 2:
		return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (rv);
}

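/*
 * Convert an ASCII hex digit to its numeric value. The input is expected
 * to be a valid hex digit; characters below '0' cause a panic.
 */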
static uint8_t
dboot_a2h(char v)
{
	if (v >= 'a')
		return (v - 'a' + 0xa);
	else if (v >= 'A')
		return (v - 'A' + 0xa);
	else if (v >= '0')
		return (v - '0');
	else
		dboot_panic("bad ASCII hex character %c\n", v);

	return (0);
}

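/*
 * Convert a 40-character ASCII hex string into the corresponding 20-byte
 * binary SHA-1 digest, two hex digits per byte.
 */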
static void
digest_a2h(const char *ascii, uint8_t *digest)
{
	unsigned int i;

	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
		digest[i] = dboot_a2h(ascii[i * 2]) << 4;
		digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
	}
}

/*
 * Generate a SHA-1 hash of the first len bytes of image, and compare it with
 * the ASCII-format hash found in the 40-byte buffer at ascii. If they
 * match, return 0, otherwise -1. This works only for images smaller than
 * 4 GB, which should not be a problem.
 */
static int
check_image_hash(uint_t midx)
{
	const char *ascii;
	const void *image;
	size_t len;
	SHA1_CTX ctx;
	uint8_t digest[SHA1_DIGEST_LENGTH];
	uint8_t baseline[SHA1_DIGEST_LENGTH];
	unsigned int i;

	ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
	image = (const void *)(uintptr_t)modules[midx].bm_addr;
	len = (size_t)modules[midx].bm_size;

	digest_a2h(ascii, baseline);

	SHA1Init(&ctx);
	SHA1Update(&ctx, image, len);
	SHA1Final(digest, &ctx);

	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
		if (digest[i] != baseline[i])
			return (-1);
	}

	return (0);
}

static const char *
type_to_str(boot_module_type_t type)
{
	switch (type) {
	case BMT_ROOTFS:
		return ("rootfs");
	case BMT_FILE:
		return ("file");
	case BMT_HASH:
		return ("hash");
	case BMT_ENV:
		return ("environment");
	default:
		return ("unknown");
	}
}

static void
check_images(void)
{
	uint_t i;
	char displayhash[SHA1_ASCII_LENGTH + 1];

	for (i = 0; i < modules_used; i++) {
		if (prom_debug) {
			dboot_printf("module #%d: name %s type %s "
			    "addr %lx size %lx\n",
			    i, (char *)(uintptr_t)modules[i].bm_name,
			    type_to_str(modules[i].bm_type),
			    (ulong_t)modules[i].bm_addr,
			    (ulong_t)modules[i].bm_size);
		}

		if (modules[i].bm_type == BMT_HASH ||
		    modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) {
			DBG_MSG("module has no hash; skipping check\n");
			continue;
		}
		(void) memcpy(displayhash,
		    (void *)(uintptr_t)modules[i].bm_hash,
		    SHA1_ASCII_LENGTH);
		displayhash[SHA1_ASCII_LENGTH] = '\0';
		if (prom_debug) {
			dboot_printf("checking expected hash [%s]: ",
			    displayhash);
		}

		if (check_image_hash(i) != 0)
			dboot_panic("hash mismatch!\n");
		else
			DBG_MSG("OK\n");
	}
}

/*
 * Determine the module's starting address, size, name, and type, and fill the
 * boot_modules structure. This structure is used by the bop code, except for
 * hashes which are checked prior to transferring control to the kernel.
 */
static void
process_module(int midx)
{
	uint32_t mod_start = dboot_multiboot_modstart(midx);
	uint32_t mod_end = dboot_multiboot_modend(midx);
	char *cmdline = dboot_multiboot_modcmdline(midx);
	char *p, *q;

	check_higher(mod_end);
	if (prom_debug) {
		dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
		    midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
	}

	if (mod_start > mod_end) {
		dboot_panic("module #%d: module start address 0x%lx greater "
		    "than end address 0x%lx", midx,
		    (ulong_t)mod_start, (ulong_t)mod_end);
	}

	/*
	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
	 * the address of the last valid byte in a module plus 1 as mod_end.
	 * This is of course a bug; the multiboot specification simply states
	 * that mod_start and mod_end "contain the start and end addresses of
	 * the boot module itself" which is pretty obviously not what GRUB is
	 * doing. However, fixing it requires that not only this code be
	 * changed but also that other code consuming this value and values
	 * derived from it be fixed, and that the kernel and GRUB must either
	 * both have the bug or neither. While there are a lot of combinations
	 * that will work, there are also some that won't, so for simplicity
	 * we'll just cope with the bug. That means we won't actually hash the
	 * byte at mod_end, and we will expect that mod_end for the hash file
	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
	 * hash plus a newline for each module). We set bm_size to the true
	 * correct number of bytes in each module, achieving exactly this.
	 */

	modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
	modules[midx].bm_size = mod_end - mod_start;
	modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
	modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL;
	modules[midx].bm_type = BMT_FILE;

	if (cmdline == NULL) {
		modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
		return;
	}

	p = cmdline;
	modules[midx].bm_name =
	    (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");

	while (p != NULL) {
		q = strsep(&p, " \t\f\n\r");
		if (strncmp(q, "name=", 5) == 0) {
			if (q[5] != '\0' && !isspace(q[5])) {
				modules[midx].bm_name =
				    (native_ptr_t)(uintptr_t)(q + 5);
			}
			continue;
		}

		if (strncmp(q, "type=", 5) == 0) {
			if (q[5] == '\0' || isspace(q[5]))
				continue;
			q += 5;
			if (strcmp(q, "rootfs") == 0) {
				modules[midx].bm_type = BMT_ROOTFS;
			} else if (strcmp(q, "hash") == 0) {
				modules[midx].bm_type = BMT_HASH;
			} else if (strcmp(q, "environment") == 0) {
				modules[midx].bm_type = BMT_ENV;
			} else if (strcmp(q, "file") != 0) {
				dboot_printf("\tmodule #%d: unknown module "
				    "type '%s'; defaulting to 'file'",
				    midx, q);
			}
			continue;
		}

		if (strncmp(q, "hash=", 5) == 0) {
			if (q[5] != '\0' && !isspace(q[5])) {
				modules[midx].bm_hash =
				    (native_ptr_t)(uintptr_t)(q + 5);
			}
			continue;
		}

		dboot_printf("ignoring unknown option '%s'\n", q);
	}
}

/*
 * Backward compatibility: if there are exactly one or two modules, both
 * of type 'file' and neither with an embedded hash value, we have been
 * given the legacy style modules. In this case we need to treat the first
 * module as a rootfs and the second as a hash referencing that module.
 * Otherwise, even if the configuration is invalid, we assume that the
 * operator knows what he's doing or at least isn't being bitten by this
 * interface change.
 */
static void
fixup_modules(void)
{
	if (modules_used == 0 || modules_used > 2)
		return;

	if (modules[0].bm_type != BMT_FILE ||
	    (modules_used > 1 && modules[1].bm_type != BMT_FILE)) {
		return;
	}

	if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL ||
	    (modules_used > 1 &&
	    modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL)) {
		return;
	}

	modules[0].bm_type = BMT_ROOTFS;
	if (modules_used > 1) {
		modules[1].bm_type = BMT_HASH;
		modules[1].bm_name = modules[0].bm_name;
	}
}

/*
 * For modules that do not have assigned hashes but have a separate hash module,
 * find the assigned hash module and set the primary module's bm_hash to point
 * to the hash data from that module. We will then ignore modules of type
 * BMT_HASH from this point forward.
 */
static void
assign_module_hashes(void)
{
	uint_t i, j;

	for (i = 0; i < modules_used; i++) {
		if (modules[i].bm_type == BMT_HASH ||
		    modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
			continue;
		}

		for (j = 0; j < modules_used; j++) {
			if (modules[j].bm_type != BMT_HASH ||
			    strcmp((char *)(uintptr_t)modules[j].bm_name,
			    (char *)(uintptr_t)modules[i].bm_name) != 0) {
				continue;
			}

			if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
				dboot_printf("Short hash module of length "
				    "0x%lx bytes; ignoring\n",
				    (ulong_t)modules[j].bm_size);
			} else {
				modules[i].bm_hash = modules[j].bm_addr;
			}
			break;
		}
	}
}

/*
 * Walk through the module information finding the last used address.
 * The first available address will become the top level page table.
 */
static void
dboot_process_modules(void)
{
	int i, modcount;
	extern char _end[];

	DBG_MSG("\nFinding Modules\n");
	modcount = dboot_multiboot_modcount();
	if (modcount > MAX_BOOT_MODULES) {
		dboot_panic("Too many modules (%d) -- the maximum is %d.",
		    modcount, MAX_BOOT_MODULES);
	}
	/*
	 * search the modules to find the last used address
	 * we'll build the module list while we're walking through here
	 */
	check_higher((paddr_t)(uintptr_t)&_end);
	for (i = 0; i < modcount; ++i) {
		process_module(i);
		modules_used++;
	}
	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
	DBG(bi->bi_modules);
	bi->bi_module_cnt = modcount;
	DBG(bi->bi_module_cnt);

	fixup_modules();
	assign_module_hashes();
	check_images();
}

/*
 * We then build the phys_install memlist from the multiboot information.
 */
static void
dboot_process_mmap(void)
{
	uint64_t start;
	uint64_t end;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	uint32_t lower, upper;
	int i, mmap_entries;

	/*
	 * Walk through the memory map from multiboot and build our memlist
	 * structures. Note these will have native format pointers.
	 */
	DBG_MSG("\nFinding Memory Map\n");
	num_entries = 0;
	num_entries_set = B_FALSE;
	max_mem = 0;
	if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
		for (i = 0; i < mmap_entries; i++) {
			uint32_t type = dboot_loader_mmap_get_type(i);
			start = dboot_loader_mmap_get_base(i);
			end = start + dboot_loader_mmap_get_length(i);

			if (prom_debug)
				dboot_printf("\ttype: %d %" PRIx64 "..%"
				    PRIx64 "\n", type, start, end);

			/*
			 * page align start and end
			 */
			start = (start + page_offset) & ~page_offset;
			end &= ~page_offset;
			if (end <= start)
				continue;

			/*
			 * only type 1 is usable RAM
			 */
			switch (type) {
			case 1:
				if (end > max_mem)
					max_mem = end;
				memlists[memlists_used].addr = start;
				memlists[memlists_used].size = end - start;
				++memlists_used;
				if (memlists_used > MAX_MEMLIST)
					dboot_panic("too many memlists");
				break;
			case 2:
				rsvdmemlists[rsvdmemlists_used].addr = start;
				rsvdmemlists[rsvdmemlists_used].size =
				    end - start;
				++rsvdmemlists_used;
				if (rsvdmemlists_used > MAX_MEMLIST)
					dboot_panic("too many rsvdmemlists");
				break;
			default:
				continue;
			}
		}
		build_pcimemlists();
	} else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
		DBG(lower);
		memlists[memlists_used].addr = 0;
		memlists[memlists_used].size = lower * 1024;
		++memlists_used;
		DBG(upper);
		memlists[memlists_used].addr = 1024 * 1024;
		memlists[memlists_used].size = upper * 1024;
		++memlists_used;

		/*
		 * Old platform - assume I/O space at the end of memory.
		 */
		pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
		pcimemlists[0].next = 0;
		pcimemlists[0].prev = 0;
		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
		DBG(bi->bi_pcimem);
	} else {
		dboot_panic("No memory info from boot loader!!!");
	}

	/*
	 * finish processing the physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved mem lists
	 */
	build_rsvdmemlists();
}

/*
 * The highest address is used as the starting point for dboot's simple
 * memory allocator.
 *
 * Finding the highest address in case of the Multiboot 1 protocol is
 * quite painful in the sense that some information provided by
 * the multiboot info structure points to BIOS data, and some to RAM.
 *
 * The module list was already processed and checked by
 * dboot_process_modules(), so we will check the command line string and
 * the memory map.
 *
 * This list of items to check is based on our current knowledge of
 * allocations made by grub1 and will need to be reviewed if there
 * are updates about the information provided by Multiboot 1.
 *
 * In the case of Multiboot 2, our life is much simpler, as the MB2
 * information tag list is one contiguous chunk of memory.
 */
static paddr_t
dboot_multiboot1_highest_addr(void)
{
	paddr_t addr = (paddr_t)(uintptr_t)NULL;
	char *cmdl = (char *)mb_info->cmdline;

	if (mb_info->flags & MB_INFO_CMDLINE)
		addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));

	if (mb_info->flags & MB_INFO_MEM_MAP)
		addr = MAX(addr,
		    ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
	return (addr);
}

static void
dboot_multiboot_highest_addr(void)
{
	paddr_t addr;

	switch (multiboot_version) {
	case 1:
		addr = dboot_multiboot1_highest_addr();
		if (addr != (paddr_t)(uintptr_t)NULL)
			check_higher(addr);
		break;
	case 2:
		addr = dboot_multiboot2_highest_addr(mb2_info);
		if (addr != (paddr_t)(uintptr_t)NULL)
			check_higher(addr);
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
}

/*
 * Walk the boot loader provided information and find the highest free address.
 */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");
	dboot_process_modules();
	dboot_process_mmap();
	dboot_multiboot_highest_addr();
}

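/*
 * Compare two EFI GUIDs field by field; returns 1 if they are identical,
 * 0 otherwise.
 */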
static int
dboot_same_guids(efi_guid_t *g1, efi_guid_t *g2)
{
	int i;

	if (g1->time_low != g2->time_low)
		return (0);
	if (g1->time_mid != g2->time_mid)
		return (0);
	if (g1->time_hi_and_version != g2->time_hi_and_version)
		return (0);
	if (g1->clock_seq_hi_and_reserved != g2->clock_seq_hi_and_reserved)
		return (0);
	if (g1->clock_seq_low != g2->clock_seq_low)
		return (0);

	for (i = 0; i < 6; i++) {
		if (g1->node_addr[i] != g2->node_addr[i])
			return (0);
	}
	return (1);
}

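/*
 * Walk the EFI configuration table looking for the SMBIOS and ACPI vendor
 * tables, preferring SMBIOS3 over SMBIOS and ACPI 2.0+ over ACPI 1.0.
 */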
static void
process_efi32(EFI_SYSTEM_TABLE32 *efi)
{
	uint32_t entries;
	EFI_CONFIGURATION_TABLE32 *config;
	int i;

	entries = efi->NumberOfTableEntries;
	config = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
	    efi->ConfigurationTable;

	for (i = 0; i < entries; i++) {
		if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
			bi->bi_smbios = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		if (bi->bi_smbios == NULL &&
		    dboot_same_guids(&config[i].VendorGuid, &smbios)) {
			bi->bi_smbios = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		if (bi->bi_acpi_rsdp == NULL &&
		    dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
	}
}

static void
process_efi64(EFI_SYSTEM_TABLE64 *efi)
{
	uint64_t entries;
	EFI_CONFIGURATION_TABLE64 *config;
	int i;

	entries = efi->NumberOfTableEntries;
	config = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
	    efi->ConfigurationTable;

	for (i = 0; i < entries; i++) {
		if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
			bi->bi_smbios = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		if (bi->bi_smbios == NULL &&
		    dboot_same_guids(&config[i].VendorGuid, &smbios)) {
			bi->bi_smbios = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		/* Prefer acpi v2+ over v1. */
		if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		if (bi->bi_acpi_rsdp == NULL &&
		    dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
	}
}

static void
dboot_multiboot_get_fwtables(void)
{
	multiboot_tag_new_acpi_t *nacpitagp;
	multiboot_tag_old_acpi_t *oacpitagp;
	multiboot_tag_efi64_t *efi64tagp = NULL;
	multiboot_tag_efi32_t *efi32tagp = NULL;

	/* no fw tables from multiboot 1 */
	if (multiboot_version != 2)
		return;

	efi64tagp = (multiboot_tag_efi64_t *)
	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_EFI64);
	if (efi64tagp != NULL) {
		bi->bi_uefi_arch = XBI_UEFI_ARCH_64;
		bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
		    efi64tagp->mb_pointer;
		process_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
		    efi64tagp->mb_pointer);
	} else {
		efi32tagp = (multiboot_tag_efi32_t *)
		    dboot_multiboot2_find_tag(mb2_info,
		    MULTIBOOT_TAG_TYPE_EFI32);
		if (efi32tagp != NULL) {
			bi->bi_uefi_arch = XBI_UEFI_ARCH_32;
			bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
			    efi32tagp->mb_pointer;
			process_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
			    efi32tagp->mb_pointer);
		}
	}

	/*
	 * The ACPI RSDP can be found by scanning the BIOS memory areas or
	 * from the EFI system table. The boot loader may pass in the address
	 * it found the ACPI tables at.
	 */
	nacpitagp = (multiboot_tag_new_acpi_t *)
	    dboot_multiboot2_find_tag(mb2_info,
	    MULTIBOOT_TAG_TYPE_ACPI_NEW);
	oacpitagp = (multiboot_tag_old_acpi_t *)
	    dboot_multiboot2_find_tag(mb2_info,
	    MULTIBOOT_TAG_TYPE_ACPI_OLD);

	if (nacpitagp != NULL) {
		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
		    &nacpitagp->mb_rsdp[0];
	} else if (oacpitagp != NULL) {
		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
		    &oacpitagp->mb_rsdp[0];
	}
}

/* print out EFI version string with newline */
static void
dboot_print_efi_version(uint32_t ver)
{
	int rev;

	dboot_printf("%d.", EFI_REV_MAJOR(ver));

	rev = EFI_REV_MINOR(ver);
	if ((rev % 10) != 0) {
		dboot_printf("%d.%d\n", rev / 10, rev % 10);
	} else {
		dboot_printf("%d\n", rev / 10);
	}
}

static void
print_efi32(EFI_SYSTEM_TABLE32 *efi)
{
	uint16_t *data;
	EFI_CONFIGURATION_TABLE32 *conf;
	int i;

	dboot_printf("EFI32 signature: %llx\n",
	    (unsigned long long)efi->Hdr.Signature);
	dboot_printf("EFI system version: ");
	dboot_print_efi_version(efi->Hdr.Revision);
	dboot_printf("EFI system vendor: ");
	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
	for (i = 0; data[i] != 0; i++)
		dboot_printf("%c", (char)data[i]);
	dboot_printf("\nEFI firmware revision: ");
	dboot_print_efi_version(efi->FirmwareRevision);
	dboot_printf("EFI system table number of entries: %d\n",
	    efi->NumberOfTableEntries);
	conf = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
	    efi->ConfigurationTable;
	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
		    conf[i].VendorGuid.time_low,
		    conf[i].VendorGuid.time_mid,
		    conf[i].VendorGuid.time_hi_and_version,
		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
		    conf[i].VendorGuid.clock_seq_low);
		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
		    conf[i].VendorGuid.node_addr[0],
		    conf[i].VendorGuid.node_addr[1],
		    conf[i].VendorGuid.node_addr[2],
		    conf[i].VendorGuid.node_addr[3],
		    conf[i].VendorGuid.node_addr[4],
		    conf[i].VendorGuid.node_addr[5]);
	}
}

static void
print_efi64(EFI_SYSTEM_TABLE64 *efi)
{
	uint16_t *data;
	EFI_CONFIGURATION_TABLE64 *conf;
	int i;

	dboot_printf("EFI64 signature: %llx\n",
	    (unsigned long long)efi->Hdr.Signature);
	dboot_printf("EFI system version: ");
	dboot_print_efi_version(efi->Hdr.Revision);
	dboot_printf("EFI system vendor: ");
	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
	for (i = 0; data[i] != 0; i++)
		dboot_printf("%c", (char)data[i]);
	dboot_printf("\nEFI firmware revision: ");
	dboot_print_efi_version(efi->FirmwareRevision);
	dboot_printf("EFI system table number of entries: %lld\n",
	    efi->NumberOfTableEntries);
	conf = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
	    efi->ConfigurationTable;
	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
		    conf[i].VendorGuid.time_low,
		    conf[i].VendorGuid.time_mid,
		    conf[i].VendorGuid.time_hi_and_version,
		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
		    conf[i].VendorGuid.clock_seq_low);
		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
		    conf[i].VendorGuid.node_addr[0],
		    conf[i].VendorGuid.node_addr[1],
		    conf[i].VendorGuid.node_addr[2],
		    conf[i].VendorGuid.node_addr[3],
		    conf[i].VendorGuid.node_addr[4],
		    conf[i].VendorGuid.node_addr[5]);
	}
}
#endif /* !__xpv */

/*
 * Simple memory allocator, allocates aligned physical memory.
 * Note that startup_kernel() only allocates memory, never frees.
 * Memory usage just grows in an upward direction.
 */
static void *
do_mem_alloc(uint32_t size, uint32_t align)
{
	uint_t i;
	uint64_t best;
	uint64_t start;
	uint64_t end;

	/*
	 * make sure size is a multiple of pagesize
	 */
	size = RNDUP(size, MMU_PAGESIZE);
	next_avail_addr = RNDUP(next_avail_addr, align);

	/*
	 * XXPV fixme joe
	 *
	 * a really large bootarchive that causes you to run out of memory
	 * may cause this to blow up
	 */
	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
	best = (uint64_t)-size;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
#if defined(__xpv)
		start += mfn_base;
#endif
		end = start + memlists[i].size;

		/*
		 * did we find the desired address?
		 */
		if (start <= next_avail_addr && next_avail_addr + size <= end) {
			best = next_avail_addr;
			goto done;
		}

		/*
		 * if not is this address the best so far?
		 */
		if (start > next_avail_addr && start < best &&
		    RNDUP(start, align) + size <= end)
			best = RNDUP(start, align);
	}

	/*
	 * We didn't find exactly the address we wanted, due to going off the
	 * end of a memory region. Return the best found memory address.
	 */
done:
	next_avail_addr = best + size;
#if defined(__xpv)
	if (next_avail_addr > scratch_end)
		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
		    "0x%lx", (ulong_t)next_avail_addr,
		    (ulong_t)scratch_end);
#endif
	(void) memset((void *)(uintptr_t)best, 0, size);
	return ((void *)(uintptr_t)best);
}

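/*
 * Allocate page-aligned memory from the simple allocator above.
 */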
void *
mem_alloc(uint32_t size)
{
	return (do_mem_alloc(size, MMU_PAGESIZE));
}


/*
 * Build page tables to map all of memory used so far as well as the kernel.
 */
static void
build_page_tables(void)
{
	uint32_t psize;
	uint32_t level;
	uint32_t off;
	uint64_t start;
#if !defined(__xpv)
	uint32_t i;
	uint64_t end;
#endif	/* __xpv */

	/*
	 * If we're on metal, we need to create the top level pagetable.
	 */
#if defined(__xpv)
	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
#else	/* __xpv */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
#endif	/* __xpv */
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables
	 */
	bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	bi->bi_pte_to_pt_window =
	    (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

#if defined(__xpv)
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		/* If this is a domU we're done. */
		DBG_MSG("\nPage tables constructed\n");
		return;
	}
#endif	/* __xpv */

	/*
	 * We need 1:1 mappings for the lower 1M of memory to access
	 * BIOS tables used by a couple of drivers during boot.
	 *
	 * The following code works because our simple memory allocator
	 * only grows usage in an upwards direction.
	 *
	 * Note that by this point in boot some mappings for low memory
	 * may already exist because we've already accessed devices in low
	 * memory. (Specifically the video frame buffer and keyboard
	 * status ports.) If we're booting on raw hardware then GRUB
	 * created these mappings for us. If we're booting under a
	 * hypervisor then we went ahead and remapped these devices into
	 * memory allocated within dboot itself.
	 */
1994 if (map_debug)
1995 dboot_printf("1:1 map pa=0..1Meg\n");
1996 for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1997 #if defined(__xpv)
1998 map_ma_at_va(start, start, 0);
1999 #else /* __xpv */
2000 map_pa_at_va(start, start, 0);
2001 #endif /* __xpv */
2002 }
2003
2004 #if !defined(__xpv)
2005
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
		if (start >= next_avail_addr)
			break;
	}

	/*
	 * Map framebuffer memory with PT_NOCACHE set, as this is device
	 * memory and must not be cached.
	 */
	if (bi->bi_framebuffer != NULL) {
		multiboot_tag_framebuffer_t *fb;
		fb = (multiboot_tag_framebuffer_t *)(uintptr_t)
		    bi->bi_framebuffer;

		start = fb->framebuffer_common.framebuffer_addr;
		end = start + fb->framebuffer_common.framebuffer_height *
		    fb->framebuffer_common.framebuffer_pitch;

		pte_bits |= PT_NOCACHE;
		while (start < end) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
		pte_bits &= ~PT_NOCACHE;
	}
#endif /* !__xpv */

	DBG_MSG("\nPage tables constructed\n");
}
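
/*
 * Illustrative sketch, not referenced by dboot: the framebuffer span mapped
 * above covers framebuffer_height scanlines of framebuffer_pitch bytes each.
 * sketch_fb_pages() is a hypothetical helper showing how many 4K pages that
 * span touches when the base address may not be page aligned.
 */
static uint64_t
sketch_fb_pages(uint64_t addr, uint32_t height, uint32_t pitch)
{
	uint64_t bytes = (uint64_t)height * pitch;

	/* first byte rounded down, last byte rounded up, counted in pages */
	return ((P2ROUNDUP(addr + bytes, (uint64_t)MMU_PAGESIZE) -
	    P2ALIGN(addr, (uint64_t)MMU_PAGESIZE)) / MMU_PAGESIZE);
}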

#define	NO_MULTIBOOT \
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://illumos.org/msg/SUNOS-8000-AK for details.\n"

static void
dboot_init_xboot_consinfo(void)
{
	uintptr_t addr;

	/*
	 * The boot info must be 16-byte aligned for the 64-bit kernel ABI,
	 * so round the address up (e.g. 0x1008 rounds up to 0x1010).
	 */
	addr = (uintptr_t)boot_info;
	addr = (addr + 0xf) & ~0xf;
	bi = (struct xboot_info *)addr;

#if !defined(__xpv)
	switch (multiboot_version) {
	case 1:
		dboot_multiboot1_xboot_consinfo();
		break;
	case 2:
		dboot_multiboot2_xboot_consinfo();
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	/*
	 * Lookup the environment module for the console.  The complete
	 * module list will be built after console setup.
	 */
	dboot_find_env();
#endif /* !__xpv */
}

/*
 * Set up basic data from the boot loader.
 * load_addr is part of the a.out kludge set up in dboot_grub.s; the a.out
 * kludge allows a 32-bit boot loader, such as grub1, to load the 32-bit
 * dboot code, which in turn sets up and starts the 64-bit illumos kernel.
 */
static void
dboot_loader_init(void)
{
#if !defined(__xpv)
	mb_info = NULL;
	mb2_info = NULL;

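	/*
	 * The boot loader leaves a magic value in %eax (0x2BADB002 for
	 * multiboot 1, 0x36d76289 for multiboot 2); dboot's entry code
	 * saves it in mb_magic and the info pointer in mb_addr.
	 */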
	switch (mb_magic) {
	case MB_BOOTLOADER_MAGIC:
		multiboot_version = 1;
		mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
#if defined(_BOOT_TARGET_amd64)
		load_addr = mb_header.load_addr;
#endif
		break;

	case MULTIBOOT2_BOOTLOADER_MAGIC:
		multiboot_version = 2;
		mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
		mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
#if defined(_BOOT_TARGET_amd64)
		load_addr = mb2_load_addr;
#endif
		break;

	default:
		dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
		break;
	}
#endif /* !defined(__xpv) */
}

/* Extract the kernel command line from [multi]boot information. */
static char *
dboot_loader_cmdline(void)
{
	char *line = NULL;

#if defined(__xpv)
	line = (char *)xen_info->cmd_line;
#else /* __xpv */

	switch (multiboot_version) {
	case 1:
		if (mb_info->flags & MB_INFO_CMDLINE)
			line = (char *)mb_info->cmdline;
		break;

	case 2:
		line = dboot_multiboot2_cmdline(mb2_info);
		break;

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}

#endif /* __xpv */

	/*
	 * Make sure we have a valid pointer so the string operations
	 * below will not crash.
	 */
	if (line == NULL)
		line = "";

	return (line);
}

static char *
dboot_loader_name(void)
{
#if defined(__xpv)
	return (NULL);
#else /* __xpv */
	multiboot_tag_string_t *tag;

	switch (multiboot_version) {
	case 1:
		/* guard against loaders that don't supply a name */
		if (mb_info->boot_loader_name == 0)
			return (NULL);
		return ((char *)mb_info->boot_loader_name);

	case 2:
		tag = dboot_multiboot2_find_tag(mb2_info,
		    MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
		if (tag == NULL)
			return (NULL);
		return (tag->mb_string);
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}

	return (NULL);
#endif /* __xpv */
}

/*
 * startup_kernel() has a pretty simple job.  It builds pagetables which
 * reflect 1:1 mappings for all memory in use.  It then also adds mappings
 * for the kernel nucleus at the virtual address target_kernel_text, using
 * large page mappings where the hardware supports them.  The page table
 * pages are also accessible at their 1:1 mapped virtual addresses.
 */
/*ARGSUSED*/
void
startup_kernel(void)
{
	char *cmdline;
	char *bootloader;
#if defined(__xpv)
	physdev_set_iopl_t set_iopl;
#endif /* __xpv */

	dboot_loader_init();
	/*
	 * At this point we are executing in 32-bit protected mode, not
	 * real mode; multiboot loaders hand off with paging disabled.
	 */

	bootloader = dboot_loader_name();
	cmdline = dboot_loader_cmdline();

#if defined(__xpv)
	/*
	 * For dom0, before we initialize the console subsystem we'll
	 * need to enable I/O operations, so set the I/O privilege level
	 * to 1.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		set_iopl.iopl = 1;
		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
	}
#endif /* __xpv */

	dboot_init_xboot_consinfo();
	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
	bcons_init(bi);

	prom_debug = (find_boot_prop("prom_debug") != NULL);
	map_debug = (find_boot_prop("map_debug") != NULL);

#if !defined(__xpv)
	dboot_multiboot_get_fwtables();
#endif
	DBG_MSG("\n\nillumos prekernel set: ");
	DBG_MSG(cmdline);
	DBG_MSG("\n");

	if (bootloader != NULL && prom_debug) {
		dboot_printf("Kernel loaded by: %s\n", bootloader);
#if !defined(__xpv)
		dboot_printf("Using multiboot %d boot protocol.\n",
		    multiboot_version);
#endif
	}

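	/*
	 * Old-style GRUB menu entries loaded /platform/i86pc/multiboot;
	 * if "multiboot" still appears on the command line, the entry was
	 * never updated, so stop with instructions rather than fail later.
	 */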
	if (strstr(cmdline, "multiboot") != NULL) {
		dboot_panic(NO_MULTIBOOT);
	}

	DBG((uintptr_t)bi);
#if !defined(__xpv)
	DBG((uintptr_t)mb_info);
	DBG((uintptr_t)mb2_info);
	if (mb2_info != NULL)
		DBG(mb2_info->mbi_total_size);
	DBG(bi->bi_acpi_rsdp);
	DBG(bi->bi_smbios);
	DBG(bi->bi_uefi_arch);
	DBG(bi->bi_uefi_systab);

	if (bi->bi_uefi_systab && prom_debug) {
		if (bi->bi_uefi_arch == XBI_UEFI_ARCH_64) {
			print_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
			    bi->bi_uefi_systab);
		} else {
			print_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
			    bi->bi_uefi_systab);
		}
	}
#endif

	/*
	 * Need the correct target_kernel_text value.
	 */
#if defined(_BOOT_TARGET_amd64)
	target_kernel_text = KERNEL_TEXT_amd64;
#elif defined(__xpv)
	target_kernel_text = KERNEL_TEXT_i386_xpv;
#else
	target_kernel_text = KERNEL_TEXT_i386;
#endif
	DBG(target_kernel_text);

#if defined(__xpv)

	/*
	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
	 */

#if defined(_BOOT_TARGET_amd64)
	/*
	 * 64-bit hypervisor.
	 */
	amd64_support = 1;
	pae_support = 1;

#else /* _BOOT_TARGET_amd64 */

	/*
	 * See if we are running on a PAE hypervisor.
	 */
	{
		xen_capabilities_info_t caps;

		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
			dboot_panic("HYPERVISOR_xen_version(caps) failed");
		caps[sizeof (caps) - 1] = 0;
		if (prom_debug)
			dboot_printf("xen capabilities %s\n", caps);
		if (strstr(caps, "x86_32p") != NULL)
			pae_support = 1;
	}

#endif /* _BOOT_TARGET_amd64 */
	{
		xen_platform_parameters_t p;

		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
			dboot_panic("HYPERVISOR_xen_version(parms) failed");
		DBG(p.virt_start);
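		/*
		 * The hypervisor reserves virtual address space starting at
		 * virt_start; the machine-to-pseudophysical translation
		 * table is mapped at the start of that region.
		 */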
		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
	}

	/*
	 * The hypervisor loads the domain starting at 1Gig; mfn_base is
	 * subtracted later to convert such addresses to pseudo-physical.
	 */
	mfn_base = ONE_GIG;
	DBG(mfn_base);

	/*
	 * Enable writable page table mode for the hypervisor.
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_writable_pagetables) < 0)
		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");

	/*
	 * Check for NX support.
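	 * CPUID leaf 0x80000000 returns the highest supported extended
	 * leaf in %eax; only if leaf 0x80000001 exists can its %edx be
	 * queried for the NX (no-execute) feature bit.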
	 */
	if (pae_support) {
		uint32_t eax = 0x80000000;
		uint32_t edx = get_cpuid_edx(&eax);

		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	}

#if !defined(_BOOT_TARGET_amd64)

	/*
	 * The 32-bit hypervisor uses segmentation to protect itself from
	 * guests. This means when a guest attempts to install a flat 4GB
	 * code or data descriptor the 32-bit hypervisor will protect itself
	 * by silently shrinking the segment such that if the guest attempts
	 * any access where the hypervisor lives a #gp fault is generated.
	 * The problem is that some applications expect a full 4GB flat
	 * segment for their current thread pointer and will use negative
	 * offset segment wrap around to access data.  TLS support in the
	 * Linux brand is one example of this.
	 *
	 * The 32-bit hypervisor can catch the #gp fault in these cases
	 * and emulate the access without passing the #gp fault to the guest,
	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
	 * Seems like this should have been the default.
	 * Either way, we want the hypervisor -- and not Solaris -- to deal
	 * with emulating these accesses.
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_4gb_segments) < 0)
		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
#endif /* !_BOOT_TARGET_amd64 */

#else /* __xpv */

	/*
	 * Use CPUID to detect the MMU features we can enable: leaf 1 %edx
	 * for PSE, PGE and PAE, extended leaf 0x80000001 %edx for long
	 * mode and NX.
	 */
	if (have_cpuid()) {
		uint32_t eax, edx;

		eax = 1;
		edx = get_cpuid_edx(&eax);
		if (edx & CPUID_INTC_EDX_PSE)
			largepage_support = 1;
		if (edx & CPUID_INTC_EDX_PGE)
			pge_support = 1;
		if (edx & CPUID_INTC_EDX_PAE)
			pae_support = 1;

		eax = 0x80000000;
		edx = get_cpuid_edx(&eax);
		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_LM)
				amd64_support = 1;
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	} else {
		dboot_printf("cpuid not supported\n");
	}
#endif /* __xpv */

#if defined(_BOOT_TARGET_amd64)
	if (amd64_support == 0)
		dboot_panic("long mode not supported, rebooting");
	else if (pae_support == 0)
		dboot_panic("long mode, but no PAE; rebooting");
#else
	/*
	 * Allow the command line to override the use of PAE for 32 bit.
	 */
	if (strstr(cmdline, "disablePAE=true") != NULL) {
		pae_support = 0;
		NX_support = 0;
		amd64_support = 0;
	}
#endif

	/*
	 * Initialize the simple memory allocator.
	 */
	init_mem_alloc();

#if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
	/*
	 * Disable PAE on 32-bit hardware with no NX support and less than
	 * 4Gig of memory; PAE buys us nothing in that configuration.
	 */
	if (max_mem < FOUR_GIG && NX_support == 0)
		pae_support = 0;
#endif

	/*
	 * Configure mmu information.
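	 * With PAE, page table entries are 8 bytes, so a 4K table holds
	 * 512 of them and large pages are 2M; without PAE, entries are
	 * 4 bytes, a table holds 1024 and large pages are 4M.  top_level
	 * is the zero-based index of the root table level.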
	 */
	if (pae_support) {
		shift_amt = shift_amt_pae;
		ptes_per_table = 512;
		pte_size = 8;
		lpagesize = TWO_MEG;
#if defined(_BOOT_TARGET_amd64)
		top_level = 3;
#else
		top_level = 2;
#endif
	} else {
		pae_support = 0;
		NX_support = 0;
		shift_amt = shift_amt_nopae;
		ptes_per_table = 1024;
		pte_size = 4;
		lpagesize = FOUR_MEG;
		top_level = 1;
	}

	DBG(pge_support);
	DBG(NX_support);
	DBG(largepage_support);
	DBG(amd64_support);
	DBG(top_level);
	DBG(pte_size);
	DBG(ptes_per_table);
	DBG(lpagesize);

#if defined(__xpv)
	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
#else
	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
#endif

#if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
	/*
	 * For grub, copy the kernel bits from the ELF64 file to their
	 * final location.
	 */
	DBG_MSG("\nAllocating nucleus pages.\n");
	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);

	if (ktext_phys == 0)
		dboot_panic("failed to allocate aligned kernel memory");
	DBG(load_addr);
	if (dboot_elfload64(load_addr) != 0)
		dboot_panic("failed to parse kernel ELF image, rebooting");
#endif

	DBG(ktext_phys);

	/*
	 * Build the page tables: 1:1 mappings plus the kernel nucleus.
	 */
	build_page_tables();

	/*
	 * Return to assembly code to switch to the running kernel.
	 */
	entry_addr_low = (uint32_t)target_kernel_text;
	DBG(entry_addr_low);
	bi->bi_use_largepage = largepage_support;
	bi->bi_use_pae = pae_support;
	bi->bi_use_pge = pge_support;
	bi->bi_use_nx = NX_support;

#if defined(__xpv)

	bi->bi_next_paddr = next_avail_addr - mfn_base;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);

	/*
	 * Unmap unused pages in the start area to make them available
	 * for DMA.
	 */
	while (next_avail_addr < scratch_end) {
		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
		    0, UVMF_INVLPG | UVMF_LOCAL);
		next_avail_addr += MMU_PAGESIZE;
	}

	bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
	DBG((uintptr_t)HYPERVISOR_shared_info);
	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;

#else /* __xpv */

	bi->bi_next_paddr = next_avail_addr;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);
	bi->bi_mb_version = multiboot_version;

	switch (multiboot_version) {
	case 1:
		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info;
		break;
	case 2:
		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info;
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	bi->bi_top_page_table = (uintptr_t)top_page_table;

#endif /* __xpv */

	bi->bi_kseg_size = FOUR_MEG;
	DBG(bi->bi_kseg_size);

#ifndef __xpv
	if (map_debug)
		dump_tables();
#endif

	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
}