1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 *
26 * Copyright 2013 Joyent, Inc. All rights reserved.
27 */
28
29
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/multiboot2.h>
37 #include <sys/multiboot2_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/framebuffer.h>
40 #include <sys/sha1.h>
41 #include <util/string.h>
42 #include <util/strtolctype.h>
43 #include <sys/efi.h>
44
45 #if defined(__xpv)
46
47 #include <sys/hypervisor.h>
48 uintptr_t xen_virt_start;
49 pfn_t *mfn_to_pfn_mapping;
50
51 #else /* !__xpv */
52
53 extern multiboot_header_t mb_header;
54 extern uint32_t mb2_load_addr;
55 extern int have_cpuid(void);
56
57 #endif /* !__xpv */
58
59 #include <sys/inttypes.h>
60 #include <sys/bootinfo.h>
61 #include <sys/mach_mmu.h>
62 #include <sys/boot_console.h>
63
64 #include "dboot_asm.h"
65 #include "dboot_printf.h"
66 #include "dboot_xboot.h"
67 #include "dboot_elfload.h"
68
69 #define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2)
70
71 /*
72 * This file contains code that runs to transition us from either a multiboot
73 * compliant loader (32 bit non-paging) or a XPV domain loader to
74 * regular kernel execution. Its task is to setup the kernel memory image
75 * and page tables.
76 *
77 * The code executes as:
78 * - 32 bits under GRUB (for 32 or 64 bit Solaris)
79 * - a 32 bit program for the 32-bit PV hypervisor
80 * - a 64 bit program for the 64-bit PV hypervisor (at least for now)
81 *
82 * Under the PV hypervisor, we must create mappings for any memory beyond the
83 * initial start of day allocation (such as the kernel itself).
84 *
85 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
87 */
88
/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * This is the target address (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is setup in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation
 */
static paddr_t next_avail_addr = 0;

#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;	/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else /* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 */
uint32_t mb_magic;	/* magic from boot loader */
uint32_t mb_addr;	/* multiboot info package from loader */
int multiboot_version;
multiboot_info_t *mb_info;
multiboot2_info_header_t *mb2_info;
multiboot_tag_mmap_t *mb2_mmap_tagp;
int num_entries;	/* mmap entry count */
boolean_t num_entries_set;	/* is mmap entry count set */
uintptr_t load_addr;
static boot_framebuffer_t framebuffer[2];
static boot_framebuffer_t *fb;

/* can not be automatic variables because of alignment */
static efi_guid_t smbios3 = SMBIOS3_TABLE_GUID;
static efi_guid_t smbios = SMBIOS_TABLE_GUID;
static efi_guid_t acpi2 = EFI_ACPI_TABLE_GUID;
static efi_guid_t acpi1 = ACPI_10_TABLE_GUID;
#endif /* __xpv */

/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;	/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;
int PAT_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 * Each array has a companion "_used" counter holding the number of
 * entries currently filled in.
 */
#define MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * This should match what's in the bootloader. It's arbitrary, but GRUB
 * in particular has limitations on how much space it can use before it
 * stops working properly. This should be enough.
 */
struct boot_modules modules[MAX_BOOT_MODULES];
uint_t modules_used = 0;

#ifdef __xpv
/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 */
typedef struct {
	uint32_t base_addr_low;
	uint32_t base_addr_high;
	uint32_t length_low;
	uint32_t length_high;
	uint32_t type;
} mmap_t;

/*
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 */
#define MAXMAPS 100
static mmap_t map_buffer[MAXMAPS];
#else
typedef mb_memory_map_t mmap_t;
#endif

/*
 * Debugging macros
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;

static char noname[2] = "-";
228
229 /*
230 * Either hypervisor-specific or grub-specific code builds the initial
231 * memlists. This code does the sort/merge/link for final use.
232 */
233 static void
234 sort_physinstall(void)
235 {
236 int i;
237 #if !defined(__xpv)
238 int j;
239 struct boot_memlist tmp;
240
241 /*
242 * Now sort the memlists, in case they weren't in order.
243 * Yeah, this is a bubble sort; small, simple and easy to get right.
244 */
245 DBG_MSG("Sorting phys-installed list\n");
246 for (j = memlists_used - 1; j > 0; --j) {
247 for (i = 0; i < j; ++i) {
248 if (memlists[i].addr < memlists[i + 1].addr)
249 continue;
250 tmp = memlists[i];
251 memlists[i] = memlists[i + 1];
252 memlists[i + 1] = tmp;
253 }
254 }
255
256 /*
257 * Merge any memlists that don't have holes between them.
258 */
259 for (i = 0; i <= memlists_used - 1; ++i) {
260 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
261 continue;
262
263 if (prom_debug)
264 dboot_printf(
265 "merging mem segs %" PRIx64 "...%" PRIx64
266 " w/ %" PRIx64 "...%" PRIx64 "\n",
267 memlists[i].addr,
268 memlists[i].addr + memlists[i].size,
269 memlists[i + 1].addr,
270 memlists[i + 1].addr + memlists[i + 1].size);
271
272 memlists[i].size += memlists[i + 1].size;
273 for (j = i + 1; j < memlists_used - 1; ++j)
274 memlists[j] = memlists[j + 1];
275 --memlists_used;
276 DBG(memlists_used);
277 --i; /* after merging we need to reexamine, so do this */
278 }
279 #endif /* __xpv */
280
281 if (prom_debug) {
282 dboot_printf("\nFinal memlists:\n");
283 for (i = 0; i < memlists_used; ++i) {
284 dboot_printf("\t%d: addr=%" PRIx64 " size=%"
285 PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
286 }
287 }
288
289 /*
290 * link together the memlists with native size pointers
291 */
292 memlists[0].next = 0;
293 memlists[0].prev = 0;
294 for (i = 1; i < memlists_used; ++i) {
295 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
296 memlists[i].next = 0;
297 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
298 }
299 bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
300 DBG(bi->bi_phys_install);
301 }
302
303 /*
304 * build bios reserved memlists
305 */
306 static void
307 build_rsvdmemlists(void)
308 {
309 int i;
310
311 rsvdmemlists[0].next = 0;
312 rsvdmemlists[0].prev = 0;
313 for (i = 1; i < rsvdmemlists_used; ++i) {
314 rsvdmemlists[i].prev =
315 (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
316 rsvdmemlists[i].next = 0;
317 rsvdmemlists[i - 1].next =
318 (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
319 }
320 bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
321 DBG(bi->bi_rsvdmem);
322 }
323
324 #if defined(__xpv)
325
326 /*
327 * halt on the hypervisor after a delay to drain console output
328 */
329 void
330 dboot_halt(void)
331 {
332 uint_t i = 10000;
333
334 while (--i)
335 (void) HYPERVISOR_yield();
336 (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
337 }
338
339 /*
340 * From a machine address, find the corresponding pseudo-physical address.
341 * Pseudo-physical address are contiguous and run from mfn_base in each VM.
342 * Machine addresses are the real underlying hardware addresses.
343 * These are needed for page table entries. Note that this routine is
344 * poorly protected. A bad value of "ma" will cause a page fault.
345 */
346 paddr_t
347 ma_to_pa(maddr_t ma)
348 {
349 ulong_t pgoff = ma & MMU_PAGEOFFSET;
350 ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
351 paddr_t pa;
352
353 if (pfn >= xen_info->nr_pages)
354 return (-(paddr_t)1);
355 pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
356 #ifdef DEBUG
357 if (ma != pa_to_ma(pa))
358 dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
359 "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
360 #endif
361 return (pa);
362 }
363
364 /*
365 * From a pseudo-physical address, find the corresponding machine address.
366 */
367 maddr_t
368 pa_to_ma(paddr_t pa)
369 {
370 pfn_t pfn;
371 ulong_t mfn;
372
373 pfn = mmu_btop(pa - mfn_base);
374 if (pa < mfn_base || pfn >= xen_info->nr_pages)
375 dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
376 mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
377 #ifdef DEBUG
378 if (mfn_to_pfn_mapping[mfn] != pfn)
379 dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
380 pfn, mfn, mfn_to_pfn_mapping[mfn]);
381 #endif
382 return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
383 }
384
385 #endif /* __xpv */
386
387 x86pte_t
388 get_pteval(paddr_t table, uint_t index)
389 {
390 if (pae_support)
391 return (((x86pte_t *)(uintptr_t)table)[index]);
392 return (((x86pte32_t *)(uintptr_t)table)[index]);
393 }
394
395 /*ARGSUSED*/
396 void
397 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
398 {
399 #ifdef __xpv
400 mmu_update_t t;
401 maddr_t mtable = pa_to_ma(table);
402 int retcnt;
403
404 t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
405 t.val = pteval;
406 if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
407 dboot_panic("HYPERVISOR_mmu_update() failed");
408 #else /* __xpv */
409 uintptr_t tab_addr = (uintptr_t)table;
410
411 if (pae_support)
412 ((x86pte_t *)tab_addr)[index] = pteval;
413 else
414 ((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
415 if (level == top_level && level == 2)
416 reload_cr3();
417 #endif /* __xpv */
418 }
419
420 paddr_t
421 make_ptable(x86pte_t *pteval, uint_t level)
422 {
423 paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
424
425 if (level == top_level && level == 2)
426 *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
427 else
428 *pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
429
430 #ifdef __xpv
431 /* Remove write permission to the new page table. */
432 if (HYPERVISOR_update_va_mapping(new_table,
433 *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
434 dboot_panic("HYP_update_va_mapping error");
435 #endif
436
437 if (map_debug)
438 dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
439 PRIx64 "\n", level, (ulong_t)new_table, *pteval);
440 return (new_table);
441 }
442
443 x86pte_t *
444 map_pte(paddr_t table, uint_t index)
445 {
446 return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
447 }
448
/*
 * dump out the contents of page tables...
 *
 * Walks the page tables starting at top_page_table and prints every
 * non-zero entry together with the virtual address it covers. Descending
 * into a lower level table is done without real recursion: the current
 * table and index are saved in save_table[]/save_index[], indexed by
 * level, and the "recursion" label re-enters the loop; when a table is
 * exhausted the saved state of the level above is restored.
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	/* one tab fewer per level below the top; used as tabs + l below */
	char *tabs = tablist + 3 - top_level;
	/*
	 * NOTE(review): pa/pa1 are uint_t while ma_to_pa() is declared to
	 * return paddr_t elsewhere in this file; if paddr_t is wider, large
	 * addresses are truncated in this debug output -- confirm.
	 */
	uint_t pa, pa1;
#if !defined(__xpv)
#define maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		/* empty entries are skipped, but still advance va */
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			/*
			 * Entry refers to a lower level table: remember where
			 * we are and restart the loop on that table. index is
			 * set to -1 so the loop increment yields 0.
			 */
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		/* a run of more than two entries is collapsed to "..." */
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256) /* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	/* finished a lower level table: resume the table one level up */
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
535
536 /*
537 * Add a mapping for the machine page at the given virtual address.
538 */
539 static void
540 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
541 {
542 x86pte_t *ptep;
543 x86pte_t pteval;
544
545 pteval = ma | pte_bits;
546 if (level > 0)
547 pteval |= PT_PAGESIZE;
548 if (va >= target_kernel_text && pge_support)
549 pteval |= PT_GLOBAL;
550
551 if (map_debug && ma != va)
552 dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
553 " pte=0x%" PRIx64 " l=%d\n",
554 (uint64_t)ma, (uint64_t)va, pteval, level);
555
556 #if defined(__xpv)
557 /*
558 * see if we can avoid find_pte() on the hypervisor
559 */
560 if (HYPERVISOR_update_va_mapping(va, pteval,
561 UVMF_INVLPG | UVMF_LOCAL) == 0)
562 return;
563 #endif
564
565 /*
566 * Find the pte that will map this address. This creates any
567 * missing intermediate level page tables
568 */
569 ptep = find_pte(va, NULL, level, 0);
570
571 /*
572 * When paravirtualized, we must use hypervisor calls to modify the
573 * PTE, since paging is active. On real hardware we just write to
574 * the pagetables which aren't in use yet.
575 */
576 #if defined(__xpv)
577 ptep = ptep; /* shut lint up */
578 if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
579 dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
580 " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
581 (uint64_t)va, level, (uint64_t)ma, pteval);
582 #else
583 if (va < 1024 * 1024)
584 pteval |= PT_NOCACHE; /* for video RAM */
585 if (pae_support)
586 *ptep = pteval;
587 else
588 *((x86pte32_t *)ptep) = (x86pte32_t)pteval;
589 #endif
590 }
591
592 /*
593 * Add a mapping for the physical page at the given virtual address.
594 */
595 static void
596 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
597 {
598 map_ma_at_va(pa_to_ma(pa), va, level);
599 }
600
601 /*
602 * This is called to remove start..end from the
603 * possible range of PCI addresses.
604 */
605 const uint64_t pci_lo_limit = 0x00100000ul;
606 const uint64_t pci_hi_limit = 0xfff00000ul;
607 static void
608 exclude_from_pci(uint64_t start, uint64_t end)
609 {
610 int i;
611 int j;
612 struct boot_memlist *ml;
613
614 for (i = 0; i < pcimemlists_used; ++i) {
615 ml = &pcimemlists[i];
616
617 /* delete the entire range? */
618 if (start <= ml->addr && ml->addr + ml->size <= end) {
619 --pcimemlists_used;
620 for (j = i; j < pcimemlists_used; ++j)
621 pcimemlists[j] = pcimemlists[j + 1];
622 --i; /* to revisit the new one at this index */
623 }
624
625 /* split a range? */
626 else if (ml->addr < start && end < ml->addr + ml->size) {
627
628 ++pcimemlists_used;
629 if (pcimemlists_used > MAX_MEMLIST)
630 dboot_panic("too many pcimemlists");
631
632 for (j = pcimemlists_used - 1; j > i; --j)
633 pcimemlists[j] = pcimemlists[j - 1];
634 ml->size = start - ml->addr;
635
636 ++ml;
637 ml->size = (ml->addr + ml->size) - end;
638 ml->addr = end;
639 ++i; /* skip on to next one */
640 }
641
642 /* cut memory off the start? */
643 else if (ml->addr < end && end < ml->addr + ml->size) {
644 ml->size -= end - ml->addr;
645 ml->addr = end;
646 }
647
648 /* cut memory off the end? */
649 else if (ml->addr <= start && start < ml->addr + ml->size) {
650 ml->size = start - ml->addr;
651 }
652 }
653 }
654
655 /*
656 * During memory allocation, find the highest address not used yet.
657 */
658 static void
659 check_higher(paddr_t a)
660 {
661 if (a < next_avail_addr)
662 return;
663 next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
664 DBG(next_avail_addr);
665 }
666
667 static int
668 dboot_loader_mmap_entries(void)
669 {
670 #if !defined(__xpv)
671 if (num_entries_set == B_TRUE)
672 return (num_entries);
673
674 switch (multiboot_version) {
675 case 1:
676 DBG(mb_info->flags);
677 if (mb_info->flags & 0x40) {
678 mb_memory_map_t *mmap;
679
680 DBG(mb_info->mmap_addr);
681 DBG(mb_info->mmap_length);
682 check_higher(mb_info->mmap_addr + mb_info->mmap_length);
683
684 for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
685 (uint32_t)mmap < mb_info->mmap_addr +
686 mb_info->mmap_length;
687 mmap = (mb_memory_map_t *)((uint32_t)mmap +
688 mmap->size + sizeof (mmap->size)))
689 ++num_entries;
690
691 num_entries_set = B_TRUE;
692 }
693 break;
694 case 2:
695 num_entries_set = B_TRUE;
696 num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
697 mb2_mmap_tagp);
698 break;
699 default:
700 dboot_panic("Unknown multiboot version: %d\n",
701 multiboot_version);
702 break;
703 }
704 return (num_entries);
705 #else
706 return (MAXMAPS);
707 #endif
708 }
709
710 static uint32_t
711 dboot_loader_mmap_get_type(int index)
712 {
713 #if !defined(__xpv)
714 mb_memory_map_t *mp, *mpend;
715 int i;
716
717 switch (multiboot_version) {
718 case 1:
719 mp = (mb_memory_map_t *)mb_info->mmap_addr;
720 mpend = (mb_memory_map_t *)
721 (mb_info->mmap_addr + mb_info->mmap_length);
722
723 for (i = 0; mp < mpend && i != index; i++)
724 mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
725 sizeof (mp->size));
726 if (mp >= mpend) {
727 dboot_panic("dboot_loader_mmap_get_type(): index "
728 "out of bounds: %d\n", index);
729 }
730 return (mp->type);
731
732 case 2:
733 return (dboot_multiboot2_mmap_get_type(mb2_info,
734 mb2_mmap_tagp, index));
735
736 default:
737 dboot_panic("Unknown multiboot version: %d\n",
738 multiboot_version);
739 break;
740 }
741 return (0);
742 #else
743 return (map_buffer[index].type);
744 #endif
745 }
746
747 static uint64_t
748 dboot_loader_mmap_get_base(int index)
749 {
750 #if !defined(__xpv)
751 mb_memory_map_t *mp, *mpend;
752 int i;
753
754 switch (multiboot_version) {
755 case 1:
756 mp = (mb_memory_map_t *)mb_info->mmap_addr;
757 mpend = (mb_memory_map_t *)
758 (mb_info->mmap_addr + mb_info->mmap_length);
759
760 for (i = 0; mp < mpend && i != index; i++)
761 mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
762 sizeof (mp->size));
763 if (mp >= mpend) {
764 dboot_panic("dboot_loader_mmap_get_base(): index "
765 "out of bounds: %d\n", index);
766 }
767 return (((uint64_t)mp->base_addr_high << 32) +
768 (uint64_t)mp->base_addr_low);
769
770 case 2:
771 return (dboot_multiboot2_mmap_get_base(mb2_info,
772 mb2_mmap_tagp, index));
773
774 default:
775 dboot_panic("Unknown multiboot version: %d\n",
776 multiboot_version);
777 break;
778 }
779 return (0);
780 #else
781 return (((uint64_t)map_buffer[index].base_addr_high << 32) +
782 (uint64_t)map_buffer[index].base_addr_low);
783 #endif
784 }
785
786 static uint64_t
787 dboot_loader_mmap_get_length(int index)
788 {
789 #if !defined(__xpv)
790 mb_memory_map_t *mp, *mpend;
791 int i;
792
793 switch (multiboot_version) {
794 case 1:
795 mp = (mb_memory_map_t *)mb_info->mmap_addr;
796 mpend = (mb_memory_map_t *)
797 (mb_info->mmap_addr + mb_info->mmap_length);
798
799 for (i = 0; mp < mpend && i != index; i++)
800 mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
801 sizeof (mp->size));
802 if (mp >= mpend) {
803 dboot_panic("dboot_loader_mmap_get_length(): index "
804 "out of bounds: %d\n", index);
805 }
806 return (((uint64_t)mp->length_high << 32) +
807 (uint64_t)mp->length_low);
808
809 case 2:
810 return (dboot_multiboot2_mmap_get_length(mb2_info,
811 mb2_mmap_tagp, index));
812
813 default:
814 dboot_panic("Unknown multiboot version: %d\n",
815 multiboot_version);
816 break;
817 }
818 return (0);
819 #else
820 return (((uint64_t)map_buffer[index].length_high << 32) +
821 (uint64_t)map_buffer[index].length_low);
822 #endif
823 }
824
825 static void
826 build_pcimemlists(void)
827 {
828 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
829 uint64_t start;
830 uint64_t end;
831 int i, num;
832
833 /*
834 * initialize
835 */
836 pcimemlists[0].addr = pci_lo_limit;
837 pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
838 pcimemlists_used = 1;
839
840 num = dboot_loader_mmap_entries();
841 /*
842 * Fill in PCI memlists.
843 */
844 for (i = 0; i < num; ++i) {
845 start = dboot_loader_mmap_get_base(i);
846 end = start + dboot_loader_mmap_get_length(i);
847
848 if (prom_debug)
849 dboot_printf("\ttype: %d %" PRIx64 "..%"
850 PRIx64 "\n", dboot_loader_mmap_get_type(i),
851 start, end);
852
853 /*
854 * page align start and end
855 */
856 start = (start + page_offset) & ~page_offset;
857 end &= ~page_offset;
858 if (end <= start)
859 continue;
860
861 exclude_from_pci(start, end);
862 }
863
864 /*
865 * Finish off the pcimemlist
866 */
867 if (prom_debug) {
868 for (i = 0; i < pcimemlists_used; ++i) {
869 dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
870 PRIx64 "\n", pcimemlists[i].addr,
871 pcimemlists[i].addr + pcimemlists[i].size);
872 }
873 }
874 pcimemlists[0].next = 0;
875 pcimemlists[0].prev = 0;
876 for (i = 1; i < pcimemlists_used; ++i) {
877 pcimemlists[i].prev =
878 (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
879 pcimemlists[i].next = 0;
880 pcimemlists[i - 1].next =
881 (native_ptr_t)(uintptr_t)(pcimemlists + i);
882 }
883 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
884 DBG(bi->bi_pcimem);
885 }
886
887 #if defined(__xpv)
/*
 * Initialize memory allocator stuff from hypervisor-supplied start info.
 *
 * Sets up scratch_end and next_avail_addr, records the (at most one) boot
 * module, derives max_mem from xen_info->nr_pages, builds the single entry
 * phys-install memlist and the reserved memlists, and, for the initial
 * domain only, fetches the machine memory map and builds the PCI memlists.
 */
static void
init_mem_alloc(void)
{
	int	local;	/* variables needed to find start region */
	paddr_t	scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2Mb alignment.  That should be enough
	 * for the page tables we'll need to build.  The nucleus memory is
	 * allocated last and will be outside the addressible range.  We'll
	 * switch to new page tables before we unpack the kernel
	 */
	/* the address of a stack local marks where the stack currently is */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr =
		    (native_ptr_t)(uintptr_t)xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = (native_ptr_t)(uintptr_t)NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	/* total memory is the page count converted to bytes */
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list
		 */
		/* ask for up to MAXMAPS entries, returned in map_buffer[] */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists();
	}
}
974
975 #else /* !__xpv */
976
/*
 * Console information for a multiboot 1 boot: the framebuffer field of
 * the boot framebuffer structure is simply cleared.
 */
static void
dboot_multiboot1_xboot_consinfo(void)
{
	fb->framebuffer = 0;
}
982
983 static void
984 dboot_multiboot2_xboot_consinfo(void)
985 {
986 multiboot_tag_framebuffer_t *fbtag;
987 fbtag = dboot_multiboot2_find_tag(mb2_info,
988 MULTIBOOT_TAG_TYPE_FRAMEBUFFER);
989 fb->framebuffer = (uint64_t)(uintptr_t)fbtag;
990 fb->boot_fb_virt = 0;
991 }
992
993 static int
994 dboot_multiboot_modcount(void)
995 {
996 switch (multiboot_version) {
997 case 1:
998 return (mb_info->mods_count);
999
1000 case 2:
1001 return (dboot_multiboot2_modcount(mb2_info));
1002
1003 default:
1004 dboot_panic("Unknown multiboot version: %d\n",
1005 multiboot_version);
1006 break;
1007 }
1008 return (0);
1009 }
1010
1011 static uint32_t
1012 dboot_multiboot_modstart(int index)
1013 {
1014 switch (multiboot_version) {
1015 case 1:
1016 return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);
1017
1018 case 2:
1019 return (dboot_multiboot2_modstart(mb2_info, index));
1020
1021 default:
1022 dboot_panic("Unknown multiboot version: %d\n",
1023 multiboot_version);
1024 break;
1025 }
1026 return (0);
1027 }
1028
1029 static uint32_t
1030 dboot_multiboot_modend(int index)
1031 {
1032 switch (multiboot_version) {
1033 case 1:
1034 return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);
1035
1036 case 2:
1037 return (dboot_multiboot2_modend(mb2_info, index));
1038
1039 default:
1040 dboot_panic("Unknown multiboot version: %d\n",
1041 multiboot_version);
1042 break;
1043 }
1044 return (0);
1045 }
1046
1047 static char *
1048 dboot_multiboot_modcmdline(int index)
1049 {
1050 switch (multiboot_version) {
1051 case 1:
1052 return ((char *)((mb_module_t *)
1053 mb_info->mods_addr)[index].mod_name);
1054
1055 case 2:
1056 return (dboot_multiboot2_modcmdline(mb2_info, index));
1057
1058 default:
1059 dboot_panic("Unknown multiboot version: %d\n",
1060 multiboot_version);
1061 break;
1062 }
1063 return (0);
1064 }
1065
1066 /*
1067 * Find the modules used by console setup.
1068 * Since we need the console to print early boot messages, the console is set up
1069 * before anything else and therefore we need to pick up the needed modules.
1070 *
1071 * Note, we just will search for and if found, will pass the modules
1072 * to console setup, the proper module list processing will happen later.
1073 * Currenly used modules are boot environment and consoler font.
1074 */
1075 static void
1076 dboot_find_console_modules(void)
1077 {
1078 int i, modcount;
1079 uint32_t mod_start, mod_end;
1080 char *cmdline;
1081
1082 modcount = dboot_multiboot_modcount();
1083 bi->bi_module_cnt = 0;
1084 for (i = 0; i < modcount; ++i) {
1085 cmdline = dboot_multiboot_modcmdline(i);
1086 if (cmdline == NULL)
1087 continue;
1088
1089 if (strstr(cmdline, "type=console-font") != NULL)
1090 modules[bi->bi_module_cnt].bm_type = BMT_FONT;
1091 else if (strstr(cmdline, "type=environment") != NULL)
1092 modules[bi->bi_module_cnt].bm_type = BMT_ENV;
1093 else
1094 continue;
1095
1096 mod_start = dboot_multiboot_modstart(i);
1097 mod_end = dboot_multiboot_modend(i);
1098 modules[bi->bi_module_cnt].bm_addr =
1099 (native_ptr_t)(uintptr_t)mod_start;
1100 modules[bi->bi_module_cnt].bm_size = mod_end - mod_start;
1101 modules[bi->bi_module_cnt].bm_name =
1102 (native_ptr_t)(uintptr_t)NULL;
1103 modules[bi->bi_module_cnt].bm_hash =
1104 (native_ptr_t)(uintptr_t)NULL;
1105 bi->bi_module_cnt++;
1106 }
1107 if (bi->bi_module_cnt != 0)
1108 bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1109 }
1110
1111 static boolean_t
1112 dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
1113 {
1114 boolean_t rv = B_FALSE;
1115
1116 switch (multiboot_version) {
1117 case 1:
1118 if (mb_info->flags & 0x01) {
1119 *lower = mb_info->mem_lower;
1120 *upper = mb_info->mem_upper;
1121 rv = B_TRUE;
1122 }
1123 break;
1124
1125 case 2:
1126 return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));
1127
1128 default:
1129 dboot_panic("Unknown multiboot version: %d\n",
1130 multiboot_version);
1131 break;
1132 }
1133 return (rv);
1134 }
1135
1136 static uint8_t
1137 dboot_a2h(char v)
1138 {
1139 if (v >= 'a')
1140 return (v - 'a' + 0xa);
1141 else if (v >= 'A')
1142 return (v - 'A' + 0xa);
1143 else if (v >= '0')
1144 return (v - '0');
1145 else
1146 dboot_panic("bad ASCII hex character %c\n", v);
1147
1148 return (0);
1149 }
1150
1151 static void
1152 digest_a2h(const char *ascii, uint8_t *digest)
1153 {
1154 unsigned int i;
1155
1156 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1157 digest[i] = dboot_a2h(ascii[i * 2]) << 4;
1158 digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
1159 }
1160 }
1161
1162 /*
1163 * Generate a SHA-1 hash of the first len bytes of image, and compare it with
1164 * the ASCII-format hash found in the 40-byte buffer at ascii. If they
1165 * match, return 0, otherwise -1. This works only for images smaller than
1166 * 4 GB, which should not be a problem.
1167 */
1168 static int
1169 check_image_hash(uint_t midx)
1170 {
1171 const char *ascii;
1172 const void *image;
1173 size_t len;
1174 SHA1_CTX ctx;
1175 uint8_t digest[SHA1_DIGEST_LENGTH];
1176 uint8_t baseline[SHA1_DIGEST_LENGTH];
1177 unsigned int i;
1178
1179 ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
1180 image = (const void *)(uintptr_t)modules[midx].bm_addr;
1181 len = (size_t)modules[midx].bm_size;
1182
1183 digest_a2h(ascii, baseline);
1184
1185 SHA1Init(&ctx);
1186 SHA1Update(&ctx, image, len);
1187 SHA1Final(digest, &ctx);
1188
1189 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1190 if (digest[i] != baseline[i])
1191 return (-1);
1192 }
1193
1194 return (0);
1195 }
1196
1197 static const char *
1198 type_to_str(boot_module_type_t type)
1199 {
1200 switch (type) {
1201 case BMT_ROOTFS:
1202 return ("rootfs");
1203 case BMT_FILE:
1204 return ("file");
1205 case BMT_HASH:
1206 return ("hash");
1207 case BMT_ENV:
1208 return ("environment");
1209 case BMT_FONT:
1210 return ("console-font");
1211 default:
1212 return ("unknown");
1213 }
1214 }
1215
1216 static void
1217 check_images(void)
1218 {
1219 uint_t i;
1220 char displayhash[SHA1_ASCII_LENGTH + 1];
1221
1222 for (i = 0; i < modules_used; i++) {
1223 if (prom_debug) {
1224 dboot_printf("module #%d: name %s type %s "
1225 "addr %lx size %lx\n",
1226 i, (char *)(uintptr_t)modules[i].bm_name,
1227 type_to_str(modules[i].bm_type),
1228 (ulong_t)modules[i].bm_addr,
1229 (ulong_t)modules[i].bm_size);
1230 }
1231
1232 if (modules[i].bm_type == BMT_HASH ||
1233 modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) {
1234 DBG_MSG("module has no hash; skipping check\n");
1235 continue;
1236 }
1237 (void) memcpy(displayhash,
1238 (void *)(uintptr_t)modules[i].bm_hash,
1239 SHA1_ASCII_LENGTH);
1240 displayhash[SHA1_ASCII_LENGTH] = '\0';
1241 if (prom_debug) {
1242 dboot_printf("checking expected hash [%s]: ",
1243 displayhash);
1244 }
1245
1246 if (check_image_hash(i) != 0)
1247 dboot_panic("hash mismatch!\n");
1248 else
1249 DBG_MSG("OK\n");
1250 }
1251 }
1252
1253 /*
1254 * Determine the module's starting address, size, name, and type, and fill the
1255 * boot_modules structure. This structure is used by the bop code, except for
1256 * hashes which are checked prior to transferring control to the kernel.
1257 */
1258 static void
1259 process_module(int midx)
1260 {
1261 uint32_t mod_start = dboot_multiboot_modstart(midx);
1262 uint32_t mod_end = dboot_multiboot_modend(midx);
1263 char *cmdline = dboot_multiboot_modcmdline(midx);
1264 char *p, *q;
1265
1266 check_higher(mod_end);
1267 if (prom_debug) {
1268 dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
1269 midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
1270 }
1271
1272 if (mod_start > mod_end) {
1273 dboot_panic("module #%d: module start address 0x%lx greater "
1274 "than end address 0x%lx", midx,
1275 (ulong_t)mod_start, (ulong_t)mod_end);
1276 }
1277
1278 /*
1279 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
1280 * the address of the last valid byte in a module plus 1 as mod_end.
1281 * This is of course a bug; the multiboot specification simply states
1282 * that mod_start and mod_end "contain the start and end addresses of
1283 * the boot module itself" which is pretty obviously not what GRUB is
1284 * doing. However, fixing it requires that not only this code be
1285 * changed but also that other code consuming this value and values
1286 * derived from it be fixed, and that the kernel and GRUB must either
1287 * both have the bug or neither. While there are a lot of combinations
1288 * that will work, there are also some that won't, so for simplicity
1289 * we'll just cope with the bug. That means we won't actually hash the
1290 * byte at mod_end, and we will expect that mod_end for the hash file
1291 * itself is one greater than some multiple of 41 (40 bytes of ASCII
1292 * hash plus a newline for each module). We set bm_size to the true
1293 * correct number of bytes in each module, achieving exactly this.
1294 */
1295
1296 modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
1297 modules[midx].bm_size = mod_end - mod_start;
1298 modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
1299 modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL;
1300 modules[midx].bm_type = BMT_FILE;
1301
1302 if (cmdline == NULL) {
1303 modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
1304 return;
1305 }
1306
1307 p = cmdline;
1308 modules[midx].bm_name =
1309 (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");
1310
1311 while (p != NULL) {
1312 q = strsep(&p, " \t\f\n\r");
1313 if (strncmp(q, "name=", 5) == 0) {
1314 if (q[5] != '\0' && !isspace(q[5])) {
1315 modules[midx].bm_name =
1316 (native_ptr_t)(uintptr_t)(q + 5);
1317 }
1318 continue;
1319 }
1320
1321 if (strncmp(q, "type=", 5) == 0) {
1322 if (q[5] == '\0' || isspace(q[5]))
1323 continue;
1324 q += 5;
1325 if (strcmp(q, "rootfs") == 0) {
1326 modules[midx].bm_type = BMT_ROOTFS;
1327 } else if (strcmp(q, "hash") == 0) {
1328 modules[midx].bm_type = BMT_HASH;
1329 } else if (strcmp(q, "environment") == 0) {
1330 modules[midx].bm_type = BMT_ENV;
1331 } else if (strcmp(q, "console-font") == 0) {
1332 modules[midx].bm_type = BMT_FONT;
1333 } else if (strcmp(q, "file") != 0) {
1334 dboot_printf("\tmodule #%d: unknown module "
1335 "type '%s'; defaulting to 'file'",
1336 midx, q);
1337 }
1338 continue;
1339 }
1340
1341 if (strncmp(q, "hash=", 5) == 0) {
1342 if (q[5] != '\0' && !isspace(q[5])) {
1343 modules[midx].bm_hash =
1344 (native_ptr_t)(uintptr_t)(q + 5);
1345 }
1346 continue;
1347 }
1348
1349 dboot_printf("ignoring unknown option '%s'\n", q);
1350 }
1351 }
1352
1353 /*
1354 * Backward compatibility: if there are exactly one or two modules, both
1355 * of type 'file' and neither with an embedded hash value, we have been
1356 * given the legacy style modules. In this case we need to treat the first
1357 * module as a rootfs and the second as a hash referencing that module.
1358 * Otherwise, even if the configuration is invalid, we assume that the
1359 * operator knows what he's doing or at least isn't being bitten by this
1360 * interface change.
1361 */
1362 static void
1363 fixup_modules(void)
1364 {
1365 if (modules_used == 0 || modules_used > 2)
1366 return;
1367
1368 if (modules[0].bm_type != BMT_FILE ||
1369 modules_used > 1 && modules[1].bm_type != BMT_FILE) {
1370 return;
1371 }
1372
1373 if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL ||
1374 modules_used > 1 &&
1375 modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1376 return;
1377 }
1378
1379 modules[0].bm_type = BMT_ROOTFS;
1380 if (modules_used > 1) {
1381 modules[1].bm_type = BMT_HASH;
1382 modules[1].bm_name = modules[0].bm_name;
1383 }
1384 }
1385
1386 /*
1387 * For modules that do not have assigned hashes but have a separate hash module,
1388 * find the assigned hash module and set the primary module's bm_hash to point
1389 * to the hash data from that module. We will then ignore modules of type
1390 * BMT_HASH from this point forward.
1391 */
1392 static void
1393 assign_module_hashes(void)
1394 {
1395 uint_t i, j;
1396
1397 for (i = 0; i < modules_used; i++) {
1398 if (modules[i].bm_type == BMT_HASH ||
1399 modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1400 continue;
1401 }
1402
1403 for (j = 0; j < modules_used; j++) {
1404 if (modules[j].bm_type != BMT_HASH ||
1405 strcmp((char *)(uintptr_t)modules[j].bm_name,
1406 (char *)(uintptr_t)modules[i].bm_name) != 0) {
1407 continue;
1408 }
1409
1410 if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1411 dboot_printf("Short hash module of length "
1412 "0x%lx bytes; ignoring\n",
1413 (ulong_t)modules[j].bm_size);
1414 } else {
1415 modules[i].bm_hash = modules[j].bm_addr;
1416 }
1417 break;
1418 }
1419 }
1420 }
1421
1422 /*
1423 * Walk through the module information finding the last used address.
1424 * The first available address will become the top level page table.
1425 */
1426 static void
1427 dboot_process_modules(void)
1428 {
1429 int i, modcount;
1430 extern char _end[];
1431
1432 DBG_MSG("\nFinding Modules\n");
1433 modcount = dboot_multiboot_modcount();
1434 if (modcount > MAX_BOOT_MODULES) {
1435 dboot_panic("Too many modules (%d) -- the maximum is %d.",
1436 modcount, MAX_BOOT_MODULES);
1437 }
1438 /*
1439 * search the modules to find the last used address
1440 * we'll build the module list while we're walking through here
1441 */
1442 check_higher((paddr_t)(uintptr_t)&_end);
1443 for (i = 0; i < modcount; ++i) {
1444 process_module(i);
1445 modules_used++;
1446 }
1447 bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1448 DBG(bi->bi_modules);
1449 bi->bi_module_cnt = modcount;
1450 DBG(bi->bi_module_cnt);
1451
1452 fixup_modules();
1453 assign_module_hashes();
1454 check_images();
1455 }
1456
1457 /*
1458 * We then build the phys_install memlist from the multiboot information.
1459 */
1460 static void
1461 dboot_process_mmap(void)
1462 {
1463 uint64_t start;
1464 uint64_t end;
1465 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
1466 uint32_t lower, upper;
1467 int i, mmap_entries;
1468
1469 /*
1470 * Walk through the memory map from multiboot and build our memlist
1471 * structures. Note these will have native format pointers.
1472 */
1473 DBG_MSG("\nFinding Memory Map\n");
1474 num_entries = 0;
1475 num_entries_set = B_FALSE;
1476 max_mem = 0;
1477 if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
1478 for (i = 0; i < mmap_entries; i++) {
1479 uint32_t type = dboot_loader_mmap_get_type(i);
1480 start = dboot_loader_mmap_get_base(i);
1481 end = start + dboot_loader_mmap_get_length(i);
1482
1483 if (prom_debug)
1484 dboot_printf("\ttype: %d %" PRIx64 "..%"
1485 PRIx64 "\n", type, start, end);
1486
1487 /*
1488 * page align start and end
1489 */
1490 start = (start + page_offset) & ~page_offset;
1491 end &= ~page_offset;
1492 if (end <= start)
1493 continue;
1494
1495 /*
1496 * only type 1 is usable RAM
1497 */
1498 switch (type) {
1499 case 1:
1500 if (end > max_mem)
1501 max_mem = end;
1502 memlists[memlists_used].addr = start;
1503 memlists[memlists_used].size = end - start;
1504 ++memlists_used;
1505 if (memlists_used > MAX_MEMLIST)
1506 dboot_panic("too many memlists");
1507 break;
1508 case 2:
1509 rsvdmemlists[rsvdmemlists_used].addr = start;
1510 rsvdmemlists[rsvdmemlists_used].size =
1511 end - start;
1512 ++rsvdmemlists_used;
1513 if (rsvdmemlists_used > MAX_MEMLIST)
1514 dboot_panic("too many rsvdmemlists");
1515 break;
1516 default:
1517 continue;
1518 }
1519 }
1520 build_pcimemlists();
1521 } else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
1522 DBG(lower);
1523 memlists[memlists_used].addr = 0;
1524 memlists[memlists_used].size = lower * 1024;
1525 ++memlists_used;
1526 DBG(upper);
1527 memlists[memlists_used].addr = 1024 * 1024;
1528 memlists[memlists_used].size = upper * 1024;
1529 ++memlists_used;
1530
1531 /*
1532 * Old platform - assume I/O space at the end of memory.
1533 */
1534 pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
1535 pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1536 pcimemlists[0].next = 0;
1537 pcimemlists[0].prev = 0;
1538 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1539 DBG(bi->bi_pcimem);
1540 } else {
1541 dboot_panic("No memory info from boot loader!!!");
1542 }
1543
1544 /*
1545 * finish processing the physinstall list
1546 */
1547 sort_physinstall();
1548
1549 /*
1550 * build bios reserved mem lists
1551 */
1552 build_rsvdmemlists();
1553 }
1554
1555 /*
1556 * The highest address is used as the starting point for dboot's simple
1557 * memory allocator.
1558 *
1559 * Finding the highest address in case of Multiboot 1 protocol is
1560 * quite painful in the sense that some information provided by
1561 * the multiboot info structure points to BIOS data, and some to RAM.
1562 *
1563 * The module list was processed and checked already by dboot_process_modules(),
1564 * so we will check the command line string and the memory map.
1565 *
1566 * This list of to be checked items is based on our current knowledge of
1567 * allocations made by grub1 and will need to be reviewed if there
1568 * are updates about the information provided by Multiboot 1.
1569 *
1570 * In the case of the Multiboot 2, our life is much simpler, as the MB2
1571 * information tag list is one contiguous chunk of memory.
1572 */
1573 static paddr_t
1574 dboot_multiboot1_highest_addr(void)
1575 {
1576 paddr_t addr = (paddr_t)(uintptr_t)NULL;
1577 char *cmdl = (char *)mb_info->cmdline;
1578
1579 if (mb_info->flags & MB_INFO_CMDLINE)
1580 addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));
1581
1582 if (mb_info->flags & MB_INFO_MEM_MAP)
1583 addr = MAX(addr,
1584 ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
1585 return (addr);
1586 }
1587
1588 static void
1589 dboot_multiboot_highest_addr(void)
1590 {
1591 paddr_t addr;
1592
1593 switch (multiboot_version) {
1594 case 1:
1595 addr = dboot_multiboot1_highest_addr();
1596 if (addr != (paddr_t)(uintptr_t)NULL)
1597 check_higher(addr);
1598 break;
1599 case 2:
1600 addr = dboot_multiboot2_highest_addr(mb2_info);
1601 if (addr != (paddr_t)(uintptr_t)NULL)
1602 check_higher(addr);
1603 break;
1604 default:
1605 dboot_panic("Unknown multiboot version: %d\n",
1606 multiboot_version);
1607 break;
1608 }
1609 }
1610
/*
 * Walk the boot loader provided information and find the highest free address.
 *
 * Three passes are made, in this order: the modules are processed, the
 * memory map is processed, and finally the remaining boot loader data is
 * checked for the highest address in use.
 */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");
	dboot_process_modules();
	dboot_process_mmap();
	dboot_multiboot_highest_addr();
}
1622
1623 static int
1624 dboot_same_guids(efi_guid_t *g1, efi_guid_t *g2)
1625 {
1626 int i;
1627
1628 if (g1->time_low != g2->time_low)
1629 return (0);
1630 if (g1->time_mid != g2->time_mid)
1631 return (0);
1632 if (g1->time_hi_and_version != g2->time_hi_and_version)
1633 return (0);
1634 if (g1->clock_seq_hi_and_reserved != g2->clock_seq_hi_and_reserved)
1635 return (0);
1636 if (g1->clock_seq_low != g2->clock_seq_low)
1637 return (0);
1638
1639 for (i = 0; i < 6; i++) {
1640 if (g1->node_addr[i] != g2->node_addr[i])
1641 return (0);
1642 }
1643 return (1);
1644 }
1645
1646 static void
1647 process_efi32(EFI_SYSTEM_TABLE32 *efi)
1648 {
1649 uint32_t entries;
1650 EFI_CONFIGURATION_TABLE32 *config;
1651 int i;
1652
1653 entries = efi->NumberOfTableEntries;
1654 config = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1655 efi->ConfigurationTable;
1656
1657 for (i = 0; i < entries; i++) {
1658 if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
1659 bi->bi_smbios = (native_ptr_t)(uintptr_t)
1660 config[i].VendorTable;
1661 }
1662 if (bi->bi_smbios == NULL &&
1663 dboot_same_guids(&config[i].VendorGuid, &smbios)) {
1664 bi->bi_smbios = (native_ptr_t)(uintptr_t)
1665 config[i].VendorTable;
1666 }
1667 if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
1668 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1669 config[i].VendorTable;
1670 }
1671 if (bi->bi_acpi_rsdp == NULL &&
1672 dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
1673 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1674 config[i].VendorTable;
1675 }
1676 }
1677 }
1678
1679 static void
1680 process_efi64(EFI_SYSTEM_TABLE64 *efi)
1681 {
1682 uint64_t entries;
1683 EFI_CONFIGURATION_TABLE64 *config;
1684 int i;
1685
1686 entries = efi->NumberOfTableEntries;
1687 config = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1688 efi->ConfigurationTable;
1689
1690 for (i = 0; i < entries; i++) {
1691 if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
1692 bi->bi_smbios = (native_ptr_t)(uintptr_t)
1693 config[i].VendorTable;
1694 }
1695 if (bi->bi_smbios == NULL &&
1696 dboot_same_guids(&config[i].VendorGuid, &smbios)) {
1697 bi->bi_smbios = (native_ptr_t)(uintptr_t)
1698 config[i].VendorTable;
1699 }
1700 /* Prefer acpi v2+ over v1. */
1701 if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
1702 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1703 config[i].VendorTable;
1704 }
1705 if (bi->bi_acpi_rsdp == NULL &&
1706 dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
1707 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1708 config[i].VendorTable;
1709 }
1710 }
1711 }
1712
1713 static void
1714 dboot_multiboot_get_fwtables(void)
1715 {
1716 multiboot_tag_new_acpi_t *nacpitagp;
1717 multiboot_tag_old_acpi_t *oacpitagp;
1718 multiboot_tag_efi64_t *efi64tagp = NULL;
1719 multiboot_tag_efi32_t *efi32tagp = NULL;
1720
1721 /* no fw tables from multiboot 1 */
1722 if (multiboot_version != 2)
1723 return;
1724
1725 efi64tagp = (multiboot_tag_efi64_t *)
1726 dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_EFI64);
1727 if (efi64tagp != NULL) {
1728 bi->bi_uefi_arch = XBI_UEFI_ARCH_64;
1729 bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1730 efi64tagp->mb_pointer;
1731 process_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
1732 efi64tagp->mb_pointer);
1733 } else {
1734 efi32tagp = (multiboot_tag_efi32_t *)
1735 dboot_multiboot2_find_tag(mb2_info,
1736 MULTIBOOT_TAG_TYPE_EFI32);
1737 if (efi32tagp != NULL) {
1738 bi->bi_uefi_arch = XBI_UEFI_ARCH_32;
1739 bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1740 efi32tagp->mb_pointer;
1741 process_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
1742 efi32tagp->mb_pointer);
1743 }
1744 }
1745
1746 /*
1747 * The ACPI RSDP can be found by scanning the BIOS memory areas or
1748 * from the EFI system table. The boot loader may pass in the address
1749 * it found the ACPI tables at.
1750 */
1751 nacpitagp = (multiboot_tag_new_acpi_t *)
1752 dboot_multiboot2_find_tag(mb2_info,
1753 MULTIBOOT_TAG_TYPE_ACPI_NEW);
1754 oacpitagp = (multiboot_tag_old_acpi_t *)
1755 dboot_multiboot2_find_tag(mb2_info,
1756 MULTIBOOT_TAG_TYPE_ACPI_OLD);
1757
1758 if (nacpitagp != NULL) {
1759 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1760 &nacpitagp->mb_rsdp[0];
1761 } else if (oacpitagp != NULL) {
1762 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1763 &oacpitagp->mb_rsdp[0];
1764 }
1765 }
1766
/*
 * Print an EFI version number followed by a newline.
 *
 * The major part is printed first.  The minor part is printed as one digit
 * when it is a multiple of ten and as two dot-separated numbers otherwise.
 */
static void
dboot_print_efi_version(uint32_t ver)
{
	int minor = EFI_REV_MINOR(ver);
	int upper = minor / 10;
	int lower = minor % 10;

	dboot_printf("%d.", EFI_REV_MAJOR(ver));

	if (lower == 0) {
		dboot_printf("%d\n", upper);
		return;
	}

	dboot_printf("%d.%d\n", upper, lower);
}
1782
1783 static void
1784 print_efi32(EFI_SYSTEM_TABLE32 *efi)
1785 {
1786 uint16_t *data;
1787 EFI_CONFIGURATION_TABLE32 *conf;
1788 int i;
1789
1790 dboot_printf("EFI32 signature: %llx\n",
1791 (unsigned long long)efi->Hdr.Signature);
1792 dboot_printf("EFI system version: ");
1793 dboot_print_efi_version(efi->Hdr.Revision);
1794 dboot_printf("EFI system vendor: ");
1795 data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1796 for (i = 0; data[i] != 0; i++)
1797 dboot_printf("%c", (char)data[i]);
1798 dboot_printf("\nEFI firmware revision: ");
1799 dboot_print_efi_version(efi->FirmwareRevision);
1800 dboot_printf("EFI system table number of entries: %d\n",
1801 efi->NumberOfTableEntries);
1802 conf = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1803 efi->ConfigurationTable;
1804 for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1805 dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1806 conf[i].VendorGuid.time_low,
1807 conf[i].VendorGuid.time_mid,
1808 conf[i].VendorGuid.time_hi_and_version,
1809 conf[i].VendorGuid.clock_seq_hi_and_reserved,
1810 conf[i].VendorGuid.clock_seq_low);
1811 dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1812 conf[i].VendorGuid.node_addr[0],
1813 conf[i].VendorGuid.node_addr[1],
1814 conf[i].VendorGuid.node_addr[2],
1815 conf[i].VendorGuid.node_addr[3],
1816 conf[i].VendorGuid.node_addr[4],
1817 conf[i].VendorGuid.node_addr[5]);
1818 }
1819 }
1820
1821 static void
1822 print_efi64(EFI_SYSTEM_TABLE64 *efi)
1823 {
1824 uint16_t *data;
1825 EFI_CONFIGURATION_TABLE64 *conf;
1826 int i;
1827
1828 dboot_printf("EFI64 signature: %llx\n",
1829 (unsigned long long)efi->Hdr.Signature);
1830 dboot_printf("EFI system version: ");
1831 dboot_print_efi_version(efi->Hdr.Revision);
1832 dboot_printf("EFI system vendor: ");
1833 data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1834 for (i = 0; data[i] != 0; i++)
1835 dboot_printf("%c", (char)data[i]);
1836 dboot_printf("\nEFI firmware revision: ");
1837 dboot_print_efi_version(efi->FirmwareRevision);
1838 dboot_printf("EFI system table number of entries: %lld\n",
1839 efi->NumberOfTableEntries);
1840 conf = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1841 efi->ConfigurationTable;
1842 for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1843 dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1844 conf[i].VendorGuid.time_low,
1845 conf[i].VendorGuid.time_mid,
1846 conf[i].VendorGuid.time_hi_and_version,
1847 conf[i].VendorGuid.clock_seq_hi_and_reserved,
1848 conf[i].VendorGuid.clock_seq_low);
1849 dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1850 conf[i].VendorGuid.node_addr[0],
1851 conf[i].VendorGuid.node_addr[1],
1852 conf[i].VendorGuid.node_addr[2],
1853 conf[i].VendorGuid.node_addr[3],
1854 conf[i].VendorGuid.node_addr[4],
1855 conf[i].VendorGuid.node_addr[5]);
1856 }
1857 }
1858 #endif /* !__xpv */
1859
/*
 * Simple memory allocator, allocates aligned physical memory.
 * Note that startup_kernel() only allocates memory, never frees.
 * Memory usage just grows in an upward direction.
 *
 * size is rounded up to a multiple of MMU_PAGESIZE and next_avail_addr is
 * rounded up to align before the search.  The memlists are then scanned for
 * a region that contains [next_avail_addr, next_avail_addr + size); failing
 * that, the lowest region starting above next_avail_addr that can hold the
 * aligned request is used.  The chosen memory is zeroed, next_avail_addr is
 * advanced past it and its address is returned.
 */
static void *
do_mem_alloc(uint32_t size, uint32_t align)
{
	uint_t i;
	uint64_t best;
	uint64_t start;
	uint64_t end;

	/*
	 * make sure size is a multiple of pagesize
	 */
	size = RNDUP(size, MMU_PAGESIZE);
	next_avail_addr = RNDUP(next_avail_addr, align);

	/*
	 * XXPV fixme joe
	 *
	 * a really large bootarchive that causes you to run out of memory
	 * may cause this to blow up
	 *
	 * best starts out as the negated size; it serves as the upper bound
	 * that candidate region starts are compared against in the loop.
	 *
	 * NOTE(review): if no memlist satisfies either test below, best keeps
	 * this initial value and is still used as the allocation address; no
	 * failure check is visible here outside the __xpv build -- confirm.
	 */
	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
	best = (uint64_t)-size;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
#if defined(__xpv)
		start += mfn_base;
#endif
		end = start + memlists[i].size;

		/*
		 * did we find the desired address?
		 */
		if (start <= next_avail_addr && next_avail_addr + size <= end) {
			best = next_avail_addr;
			goto done;
		}

		/*
		 * if not is this address the best so far?
		 */
		if (start > next_avail_addr && start < best &&
		    RNDUP(start, align) + size <= end)
			best = RNDUP(start, align);
	}

	/*
	 * We didn't find exactly the address we wanted, due to going off the
	 * end of a memory region. Return the best found memory address.
	 */
done:
	next_avail_addr = best + size;
#if defined(__xpv)
	if (next_avail_addr > scratch_end)
		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
		    "0x%lx", (ulong_t)next_avail_addr,
		    (ulong_t)scratch_end);
#endif
	/* Callers always receive zero-filled memory. */
	(void) memset((void *)(uintptr_t)best, 0, size);
	return ((void *)(uintptr_t)best);
}
1925
/*
 * Allocate size bytes of memory aligned to MMU_PAGESIZE.  This is a thin
 * wrapper around do_mem_alloc() and returns whatever that routine returns.
 */
void *
mem_alloc(uint32_t size)
{
	return (do_mem_alloc(size, MMU_PAGESIZE));
}
1931
1932
/*
 * Build page tables to map all of memory used so far as well as the kernel.
 *
 * In order, this routine:
 *  - obtains the top level page table (allocated on metal, taken from
 *    xen_info under the hypervisor),
 *  - maps ksize bytes of kernel from ktext_phys at target_kernel_text,
 *  - allocates the one page page-table window for the kernel,
 *  - creates 1:1 mappings for the lowest 1M,
 *  - on metal, creates 1:1 mappings for the memlists up to next_avail_addr
 *    and maps the framebuffer uncached.
 */
static void
build_page_tables(void)
{
	uint32_t psize;
	uint32_t level;
	uint32_t off;
	uint64_t start;
#if !defined(__xpv)
	uint32_t i;
	uint64_t end;
#endif	/* !__xpv */

	/*
	 * If we're on metal, we need to create the top level pagetable.
	 */
#if defined(__xpv)
	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
#else /* __xpv */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
#endif /* __xpv */
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	/* One mapping per psize step of the kernel image. */
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables
	 */
	bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	bi->bi_pte_to_pt_window =
	    (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

#if defined(__xpv)
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		/* If this is a domU we're done. */
		DBG_MSG("\nPage tables constructed\n");
		return;
	}
#endif /* __xpv */

	/*
	 * We need 1:1 mappings for the lower 1M of memory to access
	 * BIOS tables used by a couple of drivers during boot.
	 *
	 * The following code works because our simple memory allocator
	 * only grows usage in an upwards direction.
	 *
	 * Note that by this point in boot some mappings for low memory
	 * may already exist because we've already accessed devices in low
	 * memory.  (Specifically the video frame buffer and keyboard
	 * status ports.)  If we're booting on raw hardware then GRUB
	 * created these mappings for us.  If we're booting under a
	 * hypervisor then we went ahead and remapped these devices into
	 * memory allocated within dboot itself.
	 */
	if (map_debug)
		dboot_printf("1:1 map pa=0..1Meg\n");
	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
#if defined(__xpv)
		map_ma_at_va(start, start, 0);
#else /* __xpv */
		map_pa_at_va(start, start, 0);
#endif /* __xpv */
	}

#if !defined(__xpv)

	/*
	 * 1:1 map every memlist page below next_avail_addr; the walk stops
	 * at the first list in which next_avail_addr is reached.
	 */
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
		if (start >= next_avail_addr)
			break;
	}

	/*
	 * Map framebuffer memory as PT_NOCACHE as this is memory from a
	 * device and therefore must not be cached.
	 */
	if (fb != NULL && fb->framebuffer != 0) {
		multiboot_tag_framebuffer_t *fb_tagp;
		fb_tagp = (multiboot_tag_framebuffer_t *)(uintptr_t)
		    fb->framebuffer;

		/* The mapped range covers height * pitch bytes. */
		start = fb_tagp->framebuffer_common.framebuffer_addr;
		end = start + fb_tagp->framebuffer_common.framebuffer_height *
		    fb_tagp->framebuffer_common.framebuffer_pitch;

		/* VGA text memory is already mapped. */
		if (fb_tagp->framebuffer_common.framebuffer_type !=
		    MULTIBOOT_FRAMEBUFFER_TYPE_EGA_TEXT) {
			uint64_t vaddr;

			/*
			 * For the amd64 target the framebuffer is mapped 1:1;
			 * otherwise the virtual range is taken from
			 * mem_alloc().
			 */
#if defined(_BOOT_TARGET_amd64)
			vaddr = start;
#else
			vaddr = (uintptr_t)mem_alloc(end - start);
#endif
			fb->boot_fb_virt = vaddr;
			if (map_debug) {
				dboot_printf("FB map pa=%" PRIx64 "..%"
				    PRIx64 "\n", start, end);
			}

			/*
			 * pte_bits is adjusted only for the duration of the
			 * framebuffer mapping loop and restored afterwards.
			 */
			pte_bits |= PT_NOCACHE;
			if (PAT_support != 0)
				pte_bits |= PT_PAT_4K;

			while (start < end) {
				map_pa_at_va(start, vaddr, 0);
				start += MMU_PAGESIZE;
				vaddr += MMU_PAGESIZE;
			}
			pte_bits &= ~PT_NOCACHE;
			if (PAT_support != 0)
				pte_bits &= ~PT_PAT_4K;
		}
	}
#endif /* !__xpv */

	DBG_MSG("\nPage tables constructed\n");
}
2083
/*
 * Panic message used by startup_kernel() when the kernel command line
 * contains the string "multiboot".
 */
#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
2090
2091 static void
2092 dboot_init_xboot_consinfo(void)
2093 {
2094 uintptr_t addr;
2095 /*
2096 * boot info must be 16 byte aligned for 64 bit kernel ABI
2097 */
2098 addr = (uintptr_t)boot_info;
2099 addr = (addr + 0xf) & ~0xf;
2100 bi = (struct xboot_info *)addr;
2101
2102 #if !defined(__xpv)
2103 /*
2104 * fb info must be 16 byte aligned for 64 bit kernel ABI
2105 */
2106 addr = (uintptr_t)framebuffer;
2107 addr = (addr + 0xf) & ~0xf;
2108 fb = (boot_framebuffer_t *)addr;
2109 bi->bi_framebuffer = (native_ptr_t)(uintptr_t)fb;
2110
2111 switch (multiboot_version) {
2112 case 1:
2113 dboot_multiboot1_xboot_consinfo();
2114 break;
2115 case 2:
2116 dboot_multiboot2_xboot_consinfo();
2117 break;
2118 default:
2119 dboot_panic("Unknown multiboot version: %d\n",
2120 multiboot_version);
2121 break;
2122 }
2123 /*
2124 * Lookup environment module for the console. Complete module list
2125 * will be built after console setup.
2126 */
2127 dboot_find_console_modules();
2128 #endif
2129 }
2130
2131 /*
2132 * Set up basic data from the boot loader.
2133 * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support
2134 * 32-bit dboot code setup used to set up and start 64-bit kernel.
2135 * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and
2136 * start 64-bit illumos kernel.
2137 */
2138 static void
2139 dboot_loader_init(void)
2140 {
2141 #if !defined(__xpv)
2142 mb_info = NULL;
2143 mb2_info = NULL;
2144
2145 switch (mb_magic) {
2146 case MB_BOOTLOADER_MAGIC:
2147 multiboot_version = 1;
2148 mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
2149 #if defined(_BOOT_TARGET_amd64)
2150 load_addr = mb_header.load_addr;
2151 #endif
2152 break;
2153
2154 case MULTIBOOT2_BOOTLOADER_MAGIC:
2155 multiboot_version = 2;
2156 mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
2157 mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
2158 #if defined(_BOOT_TARGET_amd64)
2159 load_addr = mb2_load_addr;
2160 #endif
2161 break;
2162
2163 default:
2164 dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
2165 break;
2166 }
2167 #endif /* !defined(__xpv) */
2168 }
2169
2170 /* Extract the kernel command line from [multi]boot information. */
2171 static char *
2172 dboot_loader_cmdline(void)
2173 {
2174 char *line = NULL;
2175
2176 #if defined(__xpv)
2177 line = (char *)xen_info->cmd_line;
2178 #else /* __xpv */
2179
2180 switch (multiboot_version) {
2181 case 1:
2182 if (mb_info->flags & MB_INFO_CMDLINE)
2183 line = (char *)mb_info->cmdline;
2184 break;
2185
2186 case 2:
2187 line = dboot_multiboot2_cmdline(mb2_info);
2188 break;
2189
2190 default:
2191 dboot_panic("Unknown multiboot version: %d\n",
2192 multiboot_version);
2193 break;
2194 }
2195
2196 #endif /* __xpv */
2197
2198 /*
2199 * Make sure we have valid pointer so the string operations
2200 * will not crash us.
2201 */
2202 if (line == NULL)
2203 line = "";
2204
2205 return (line);
2206 }
2207
2208 static char *
2209 dboot_loader_name(void)
2210 {
2211 #if defined(__xpv)
2212 return (NULL);
2213 #else /* __xpv */
2214 multiboot_tag_string_t *tag;
2215
2216 switch (multiboot_version) {
2217 case 1:
2218 return ((char *)mb_info->boot_loader_name);
2219
2220 case 2:
2221 tag = dboot_multiboot2_find_tag(mb2_info,
2222 MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
2223 return (tag->mb_string);
2224 default:
2225 dboot_panic("Unknown multiboot version: %d\n",
2226 multiboot_version);
2227 break;
2228 }
2229
2230 return (NULL);
2231 #endif /* __xpv */
2232 }
2233
2234 /*
2235 * startup_kernel has a pretty simple job. It builds pagetables which reflect
2236 * 1:1 mappings for all memory in use. It then also adds mappings for
2237 * the kernel nucleus at virtual address of target_kernel_text using large page
2238 * mappings. The page table pages are also accessible at 1:1 mapped
2239 * virtual addresses.
2240 */
2241 /*ARGSUSED*/
2242 void
2243 startup_kernel(void)
2244 {
2245 char *cmdline;
2246 char *bootloader;
2247 #if defined(__xpv)
2248 physdev_set_iopl_t set_iopl;
2249 #endif /* __xpv */
2250
2251 bcons_init(NULL); /* Set very early console to ttya. */
2252 dboot_loader_init();
2253 /*
2254 * At this point we are executing in a 32 bit real mode.
2255 */
2256
2257 bootloader = dboot_loader_name();
2258 cmdline = dboot_loader_cmdline();
2259
2260 #if defined(__xpv)
2261 /*
2262 * For dom0, before we initialize the console subsystem we'll
2263 * need to enable io operations, so set I/O priveldge level to 1.
2264 */
2265 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
2266 set_iopl.iopl = 1;
2267 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
2268 }
2269 #endif /* __xpv */
2270
2271 dboot_init_xboot_consinfo();
2272 bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
2273 bcons_init(bi); /* Now we can set the real console. */
2274
2275 prom_debug = (find_boot_prop("prom_debug") != NULL);
2276 map_debug = (find_boot_prop("map_debug") != NULL);
2277
2278 #if !defined(__xpv)
2279 dboot_multiboot_get_fwtables();
2280 #endif
2281 DBG_MSG("\n\nillumos prekernel set: ");
2282 DBG_MSG(cmdline);
2283 DBG_MSG("\n");
2284
2285 if (bootloader != NULL && prom_debug) {
2286 dboot_printf("Kernel loaded by: %s\n", bootloader);
2287 #if !defined(__xpv)
2288 dboot_printf("Using multiboot %d boot protocol.\n",
2289 multiboot_version);
2290 #endif
2291 }
2292
2293 if (strstr(cmdline, "multiboot") != NULL) {
2294 dboot_panic(NO_MULTIBOOT);
2295 }
2296
2297 DBG((uintptr_t)bi);
2298 #if !defined(__xpv)
2299 DBG((uintptr_t)mb_info);
2300 DBG((uintptr_t)mb2_info);
2301 if (mb2_info != NULL)
2302 DBG(mb2_info->mbi_total_size);
2303 DBG(bi->bi_acpi_rsdp);
2304 DBG(bi->bi_smbios);
2305 DBG(bi->bi_uefi_arch);
2306 DBG(bi->bi_uefi_systab);
2307
2308 if (bi->bi_uefi_systab && prom_debug) {
2309 if (bi->bi_uefi_arch == XBI_UEFI_ARCH_64) {
2310 print_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
2311 bi->bi_uefi_systab);
2312 } else {
2313 print_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
2314 bi->bi_uefi_systab);
2315 }
2316 }
2317 #endif
2318
2319 /*
2320 * Need correct target_kernel_text value
2321 */
2322 #if defined(_BOOT_TARGET_amd64)
2323 target_kernel_text = KERNEL_TEXT_amd64;
2324 #elif defined(__xpv)
2325 target_kernel_text = KERNEL_TEXT_i386_xpv;
2326 #else
2327 target_kernel_text = KERNEL_TEXT_i386;
2328 #endif
2329 DBG(target_kernel_text);
2330
2331 #if defined(__xpv)
2332
2333 /*
2334 * XXPV Derive this stuff from CPUID / what the hypervisor has enabled
2335 */
2336
2337 #if defined(_BOOT_TARGET_amd64)
2338 /*
2339 * 64-bit hypervisor.
2340 */
2341 amd64_support = 1;
2342 pae_support = 1;
2343
2344 #else /* _BOOT_TARGET_amd64 */
2345
2346 /*
2347 * See if we are running on a PAE Hypervisor
2348 */
2349 {
2350 xen_capabilities_info_t caps;
2351
2352 if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
2353 dboot_panic("HYPERVISOR_xen_version(caps) failed");
2354 caps[sizeof (caps) - 1] = 0;
2355 if (prom_debug)
2356 dboot_printf("xen capabilities %s\n", caps);
2357 if (strstr(caps, "x86_32p") != NULL)
2358 pae_support = 1;
2359 }
2360
2361 #endif /* _BOOT_TARGET_amd64 */
2362 {
2363 xen_platform_parameters_t p;
2364
2365 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
2366 dboot_panic("HYPERVISOR_xen_version(parms) failed");
2367 DBG(p.virt_start);
2368 mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
2369 }
2370
2371 /*
2372 * The hypervisor loads stuff starting at 1Gig
2373 */
2374 mfn_base = ONE_GIG;
2375 DBG(mfn_base);
2376
2377 /*
2378 * enable writable page table mode for the hypervisor
2379 */
2380 if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2381 VMASST_TYPE_writable_pagetables) < 0)
2382 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
2383
2384 /*
2385 * check for NX support
2386 */
2387 if (pae_support) {
2388 uint32_t eax = 0x80000000;
2389 uint32_t edx = get_cpuid_edx(&eax);
2390
2391 if (eax >= 0x80000001) {
2392 eax = 0x80000001;
2393 edx = get_cpuid_edx(&eax);
2394 if (edx & CPUID_AMD_EDX_NX)
2395 NX_support = 1;
2396 }
2397 }
2398
2399 /*
2400 * check for PAT support
2401 */
2402 {
2403 uint32_t eax = 1;
2404 uint32_t edx = get_cpuid_edx(&eax);
2405
2406 if (edx & CPUID_INTC_EDX_PAT)
2407 PAT_support = 1;
2408 }
2409 #if !defined(_BOOT_TARGET_amd64)
2410
2411 /*
2412 * The 32-bit hypervisor uses segmentation to protect itself from
2413 * guests. This means when a guest attempts to install a flat 4GB
2414 * code or data descriptor the 32-bit hypervisor will protect itself
2415 * by silently shrinking the segment such that if the guest attempts
2416 * any access where the hypervisor lives a #gp fault is generated.
2417 * The problem is that some applications expect a full 4GB flat
2418 * segment for their current thread pointer and will use negative
2419 * offset segment wrap around to access data. TLS support in linux
2420 * brand is one example of this.
2421 *
2422 * The 32-bit hypervisor can catch the #gp fault in these cases
2423 * and emulate the access without passing the #gp fault to the guest
2424 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
2425 * Seems like this should have been the default.
2426 * Either way, we want the hypervisor -- and not Solaris -- to deal
2427 * to deal with emulating these accesses.
2428 */
2429 if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2430 VMASST_TYPE_4gb_segments) < 0)
2431 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
2432 #endif /* !_BOOT_TARGET_amd64 */
2433
2434 #else /* __xpv */
2435
2436 /*
2437 * use cpuid to enable MMU features
2438 */
2439 if (have_cpuid()) {
2440 uint32_t eax, edx;
2441
2442 eax = 1;
2443 edx = get_cpuid_edx(&eax);
2444 if (edx & CPUID_INTC_EDX_PSE)
2445 largepage_support = 1;
2446 if (edx & CPUID_INTC_EDX_PGE)
2447 pge_support = 1;
2448 if (edx & CPUID_INTC_EDX_PAE)
2449 pae_support = 1;
2450 if (edx & CPUID_INTC_EDX_PAT)
2451 PAT_support = 1;
2452
2453 eax = 0x80000000;
2454 edx = get_cpuid_edx(&eax);
2455 if (eax >= 0x80000001) {
2456 eax = 0x80000001;
2457 edx = get_cpuid_edx(&eax);
2458 if (edx & CPUID_AMD_EDX_LM)
2459 amd64_support = 1;
2460 if (edx & CPUID_AMD_EDX_NX)
2461 NX_support = 1;
2462 }
2463 } else {
2464 dboot_printf("cpuid not supported\n");
2465 }
2466 #endif /* __xpv */
2467
2468
2469 #if defined(_BOOT_TARGET_amd64)
2470 if (amd64_support == 0)
2471 dboot_panic("long mode not supported, rebooting");
2472 else if (pae_support == 0)
2473 dboot_panic("long mode, but no PAE; rebooting");
2474 #else
2475 /*
2476 * Allow the command line to over-ride use of PAE for 32 bit.
2477 */
2478 if (strstr(cmdline, "disablePAE=true") != NULL) {
2479 pae_support = 0;
2480 NX_support = 0;
2481 amd64_support = 0;
2482 }
2483 #endif
2484
2485 /*
2486 * initialize the simple memory allocator
2487 */
2488 init_mem_alloc();
2489
2490 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
2491 /*
2492 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
2493 */
2494 if (max_mem < FOUR_GIG && NX_support == 0)
2495 pae_support = 0;
2496 #endif
2497
2498 /*
2499 * configure mmu information
2500 */
2501 if (pae_support) {
2502 shift_amt = shift_amt_pae;
2503 ptes_per_table = 512;
2504 pte_size = 8;
2505 lpagesize = TWO_MEG;
2506 #if defined(_BOOT_TARGET_amd64)
2507 top_level = 3;
2508 #else
2509 top_level = 2;
2510 #endif
2511 } else {
2512 pae_support = 0;
2513 NX_support = 0;
2514 shift_amt = shift_amt_nopae;
2515 ptes_per_table = 1024;
2516 pte_size = 4;
2517 lpagesize = FOUR_MEG;
2518 top_level = 1;
2519 }
2520
2521 DBG(PAT_support);
2522 DBG(pge_support);
2523 DBG(NX_support);
2524 DBG(largepage_support);
2525 DBG(amd64_support);
2526 DBG(top_level);
2527 DBG(pte_size);
2528 DBG(ptes_per_table);
2529 DBG(lpagesize);
2530
2531 #if defined(__xpv)
2532 ktext_phys = ONE_GIG; /* from UNIX Mapfile */
2533 #else
2534 ktext_phys = FOUR_MEG; /* from UNIX Mapfile */
2535 #endif
2536
2537 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
2538 /*
2539 * For grub, copy kernel bits from the ELF64 file to final place.
2540 */
2541 DBG_MSG("\nAllocating nucleus pages.\n");
2542 ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
2543
2544 if (ktext_phys == 0)
2545 dboot_panic("failed to allocate aligned kernel memory");
2546 DBG(load_addr);
2547 if (dboot_elfload64(load_addr) != 0)
2548 dboot_panic("failed to parse kernel ELF image, rebooting");
2549 #endif
2550
2551 DBG(ktext_phys);
2552
2553 /*
2554 * Allocate page tables.
2555 */
2556 build_page_tables();
2557
2558 /*
2559 * return to assembly code to switch to running kernel
2560 */
2561 entry_addr_low = (uint32_t)target_kernel_text;
2562 DBG(entry_addr_low);
2563 bi->bi_use_largepage = largepage_support;
2564 bi->bi_use_pae = pae_support;
2565 bi->bi_use_pge = pge_support;
2566 bi->bi_use_nx = NX_support;
2567
2568 #if defined(__xpv)
2569
2570 bi->bi_next_paddr = next_avail_addr - mfn_base;
2571 DBG(bi->bi_next_paddr);
2572 bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2573 DBG(bi->bi_next_vaddr);
2574
2575 /*
2576 * unmap unused pages in start area to make them available for DMA
2577 */
2578 while (next_avail_addr < scratch_end) {
2579 (void) HYPERVISOR_update_va_mapping(next_avail_addr,
2580 0, UVMF_INVLPG | UVMF_LOCAL);
2581 next_avail_addr += MMU_PAGESIZE;
2582 }
2583
2584 bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
2585 DBG((uintptr_t)HYPERVISOR_shared_info);
2586 bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
2587 bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
2588
2589 #else /* __xpv */
2590
2591 bi->bi_next_paddr = next_avail_addr;
2592 DBG(bi->bi_next_paddr);
2593 bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2594 DBG(bi->bi_next_vaddr);
2595 bi->bi_mb_version = multiboot_version;
2596
2597 switch (multiboot_version) {
2598 case 1:
2599 bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info;
2600 break;
2601 case 2:
2602 bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info;
2603 break;
2604 default:
2605 dboot_panic("Unknown multiboot version: %d\n",
2606 multiboot_version);
2607 break;
2608 }
2609 bi->bi_top_page_table = (uintptr_t)top_page_table;
2610
2611 #endif /* __xpv */
2612
2613 bi->bi_kseg_size = FOUR_MEG;
2614 DBG(bi->bi_kseg_size);
2615
2616 #ifndef __xpv
2617 if (map_debug)
2618 dump_tables();
2619 #endif
2620
2621 #ifndef __xpv
2622 /* Update boot info with FB data */
2623 fb->cursor.origin.x = fb_info.cursor.origin.x;
2624 fb->cursor.origin.y = fb_info.cursor.origin.y;
2625 fb->cursor.pos.x = fb_info.cursor.pos.x;
2626 fb->cursor.pos.y = fb_info.cursor.pos.y;
2627 fb->cursor.visible = fb_info.cursor.visible;
2628 #endif
2629
2630 DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
2631 }