big-one New usr/src/uts/sun4u/opl/os/opl.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2018 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 
  26 #include <sys/cpuvar.h>
  27 #include <sys/systm.h>
  28 #include <sys/sysmacros.h>
  29 #include <sys/promif.h>
  30 #include <sys/platform_module.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/errno.h>
  33 #include <sys/machsystm.h>
  34 #include <sys/bootconf.h>
  35 #include <sys/nvpair.h>
  36 #include <sys/kobj.h>
  37 #include <sys/mem_cage.h>
  38 #include <sys/opl.h>
  39 #include <sys/scfd/scfostoescf.h>
  40 #include <sys/cpu_sgnblk_defs.h>
  41 #include <sys/utsname.h>
  42 #include <sys/ddi.h>
  43 #include <sys/sunndi.h>
  44 #include <sys/lgrp.h>
  45 #include <sys/memnode.h>
  46 #include <sys/sysmacros.h>
  47 #include <sys/time.h>
  48 #include <sys/cpu.h>
  49 #include <sys/dumphdr.h>
  50 #include <vm/vm_dep.h>
  51 
  52 int (*opl_get_mem_unum)(int, uint64_t, char *, int, int *);
  53 int (*opl_get_mem_sid)(char *unum, char *buf, int buflen, int *lenp);
  54 int (*opl_get_mem_offset)(uint64_t paddr, uint64_t *offp);
  55 int (*opl_get_mem_addr)(char *unum, char *sid,
  56     uint64_t offset, uint64_t *paddr);
  57 
  58 /* Memory for fcode claims.  16k times # maximum possible IO units */
  59 #define EFCODE_SIZE     (OPL_MAX_BOARDS * OPL_MAX_IO_UNITS_PER_BOARD * 0x4000)
  60 int efcode_size = EFCODE_SIZE;
  61 
  62 #define OPL_MC_MEMBOARD_SHIFT 38        /* Boards on 256BG boundary */
  63 
  64 /* Set the maximum number of boards for DR */
  65 int opl_boards = OPL_MAX_BOARDS;
  66 
  67 void sgn_update_all_cpus(ushort_t, uchar_t, uchar_t);
  68 
  69 extern int tsb_lgrp_affinity;
  70 
  71 int opl_tsb_spares = (OPL_MAX_BOARDS) * (OPL_MAX_PCICH_UNITS_PER_BOARD) *
  72         (OPL_MAX_TSBS_PER_PCICH);
  73 
  74 pgcnt_t opl_startup_cage_size = 0;
  75 
  76 /*
  77  * The length of the delay in seconds in communication with XSCF after
  78  * which the warning message will be logged.
  79  */
  80 uint_t  xscf_connect_delay = 60 * 15;
  81 
  82 static opl_model_info_t opl_models[] = {
  83         { "FF1", OPL_MAX_BOARDS_FF1, FF1, STD_DISPATCH_TABLE },
  84         { "FF2", OPL_MAX_BOARDS_FF2, FF2, STD_DISPATCH_TABLE },
  85         { "DC1", OPL_MAX_BOARDS_DC1, DC1, STD_DISPATCH_TABLE },
  86         { "DC2", OPL_MAX_BOARDS_DC2, DC2, EXT_DISPATCH_TABLE },
  87         { "DC3", OPL_MAX_BOARDS_DC3, DC3, EXT_DISPATCH_TABLE },
  88         { "IKKAKU", OPL_MAX_BOARDS_IKKAKU, IKKAKU, STD_DISPATCH_TABLE },
  89 };
  90 static  int     opl_num_models = sizeof (opl_models)/sizeof (opl_model_info_t);
  91 
  92 /*
  93  * opl_cur_model
  94  */
  95 static  opl_model_info_t *opl_cur_model = NULL;
  96 
  97 static struct memlist *opl_memlist_per_board(struct memlist *ml);
  98 static void post_xscf_msg(char *, int);
  99 static void pass2xscf_thread();
 100 
 101 /*
 102  * Note FF/DC out-of-order instruction engine takes only a
 103  * single cycle to execute each spin loop
 104  * for comparison, Panther takes 6 cycles for same loop
 105  * OPL_BOFF_SPIN = base spin loop, roughly one memory reference time
 106  * OPL_BOFF_TM = approx nsec for OPL sleep instruction (1600 for OPL-C)
 107  * OPL_BOFF_SLEEP = approx number of SPIN iterations to equal one sleep
 108  * OPL_BOFF_MAX_SCALE - scaling factor for max backoff based on active cpus
 109  * Listed values tuned for 2.15GHz to 2.64GHz systems
 110  * Value may change for future systems
 111  */
 112 #define OPL_BOFF_SPIN 7
 113 #define OPL_BOFF_SLEEP 4
 114 #define OPL_BOFF_TM 1600
 115 #define OPL_BOFF_MAX_SCALE 8
 116 
 117 #define OPL_CLOCK_TICK_THRESHOLD        128
 118 #define OPL_CLOCK_TICK_NCPUS            64
 119 
 120 extern int      clock_tick_threshold;
 121 extern int      clock_tick_ncpus;
 122 
 123 int
 124 set_platform_max_ncpus(void)
 125 {
 126         return (OPL_MAX_CPU_PER_BOARD * OPL_MAX_BOARDS);
 127 }
 128 
 129 int
 130 set_platform_tsb_spares(void)
 131 {
 132         return (MIN(opl_tsb_spares, MAX_UPA));
 133 }
 134 
 135 static void
 136 set_model_info()
 137 {
 138         extern int ts_dispatch_extended;
 139         char    name[MAXSYSNAME];
 140         int     i;
 141 
 142         /*
 143          * Get model name from the root node.
 144          *
 145          * We are using the prom device tree since, at this point,
 146          * the Solaris device tree is not yet setup.
 147          */
 148         (void) prom_getprop(prom_rootnode(), "model", (caddr_t)name);
 149 
 150         for (i = 0; i < opl_num_models; i++) {
 151                 if (strncmp(name, opl_models[i].model_name, MAXSYSNAME) == 0) {
 152                         opl_cur_model = &opl_models[i];
 153                         break;
 154                 }
 155         }
 156 
 157         /*
 158          * If model not matched, it's an unknown model.
 159          * Just return.  It will default to standard dispatch tables.
 160          */
 161         if (i == opl_num_models)
 162                 return;
 163 
 164         if ((opl_cur_model->model_cmds & EXT_DISPATCH_TABLE) &&
 165             (ts_dispatch_extended == -1)) {
 166                 /*
 167                  * Based on a platform model, select a dispatch table.
 168                  * Only DC2 and DC3 systems uses the alternate/extended
 169                  * TS dispatch table.
 170                  * IKKAKU, FF1, FF2 and DC1 systems use standard dispatch
 171                  * tables.
 172                  */
 173                 ts_dispatch_extended = 1;
 174         }
 175 
 176 }
 177 
 178 static void
 179 set_max_mmu_ctxdoms()
 180 {
 181         extern uint_t   max_mmu_ctxdoms;
 182         int             max_boards;
 183 
 184         /*
 185          * From the model, get the maximum number of boards
 186          * supported and set the value accordingly. If the model
 187          * could not be determined or recognized, we assume the max value.
 188          */
 189         if (opl_cur_model == NULL)
 190                 max_boards = OPL_MAX_BOARDS;
 191         else
 192                 max_boards = opl_cur_model->model_max_boards;
 193 
 194         /*
 195          * On OPL, cores and MMUs are one-to-one.
 196          */
 197         max_mmu_ctxdoms = OPL_MAX_CORE_UNITS_PER_BOARD * max_boards;
 198 }
 199 
 200 #pragma weak mmu_init_large_pages
 201 
 202 void
 203 set_platform_defaults(void)
 204 {
 205         extern char *tod_module_name;
 206         extern void cpu_sgn_update(ushort_t, uchar_t, uchar_t, int);
 207         extern void mmu_init_large_pages(size_t);
 208 
 209         /* Set the CPU signature function pointer */
 210         cpu_sgn_func = cpu_sgn_update;
 211 
 212         /* Set appropriate tod module for OPL platform */
 213         ASSERT(tod_module_name == NULL);
 214         tod_module_name = "todopl";
 215 
 216         if ((mmu_page_sizes == max_mmu_page_sizes) &&
 217             (mmu_ism_pagesize != DEFAULT_ISM_PAGESIZE)) {
 218                 if (&mmu_init_large_pages)
 219                         mmu_init_large_pages(mmu_ism_pagesize);
 220         }
 221 
 222         tsb_lgrp_affinity = 1;
 223 
 224         set_max_mmu_ctxdoms();
 225 }
 226 
 227 /*
 228  * Convert logical a board number to a physical one.
 229  */
 230 
 231 #define LSBPROP         "board#"
 232 #define PSBPROP         "physical-board#"
 233 
 234 int
 235 opl_get_physical_board(int id)
 236 {
 237         dev_info_t      *root_dip, *dip = NULL;
 238         char            *dname = NULL;
 239         int             circ;
 240 
 241         pnode_t         pnode;
 242         char            pname[MAXSYSNAME] = {0};
 243 
 244         int             lsb_id; /* Logical System Board ID */
 245         int             psb_id; /* Physical System Board ID */
 246 
 247 
 248         /*
 249          * This function is called on early stage of bootup when the
 250          * kernel device tree is not initialized yet, and also
 251          * later on when the device tree is up. We want to try
 252          * the fast track first.
 253          */
 254         root_dip = ddi_root_node();
 255         if (root_dip) {
 256                 /* Get from devinfo node */
 257                 ndi_devi_enter(root_dip, &circ);
 258                 for (dip = ddi_get_child(root_dip); dip;
 259                     dip = ddi_get_next_sibling(dip)) {
 260 
 261                         dname = ddi_node_name(dip);
 262                         if (strncmp(dname, "pseudo-mc", 9) != 0)
 263                                 continue;
 264 
 265                         if ((lsb_id = (int)ddi_getprop(DDI_DEV_T_ANY, dip,
 266                             DDI_PROP_DONTPASS, LSBPROP, -1)) == -1)
 267                                 continue;
 268 
 269                         if (id == lsb_id) {
 270                                 if ((psb_id = (int)ddi_getprop(DDI_DEV_T_ANY,
 271                                     dip, DDI_PROP_DONTPASS, PSBPROP, -1))
 272                                     == -1) {
 273                                         ndi_devi_exit(root_dip, circ);
 274                                         return (-1);
 275                                 } else {
 276                                         ndi_devi_exit(root_dip, circ);
 277                                         return (psb_id);
 278                                 }
 279                         }
 280                 }
 281                 ndi_devi_exit(root_dip, circ);
 282         }
 283 
 284         /*
 285          * We do not have the kernel device tree, or we did not
 286          * find the node for some reason (let's say the kernel
 287          * device tree was modified), let's try the OBP tree.
 288          */
 289         pnode = prom_rootnode();
 290         for (pnode = prom_childnode(pnode); pnode;
 291             pnode = prom_nextnode(pnode)) {
 292 
 293                 if ((prom_getprop(pnode, "name", (caddr_t)pname) == -1) ||
 294                     (strncmp(pname, "pseudo-mc", 9) != 0))
 295                         continue;
 296 
 297                 if (prom_getprop(pnode, LSBPROP, (caddr_t)&lsb_id) == -1)
 298                         continue;
 299 
 300                 if (id == lsb_id) {
 301                         if (prom_getprop(pnode, PSBPROP,
 302                             (caddr_t)&psb_id) == -1) {
 303                                 return (-1);
 304                         } else {
 305                                 return (psb_id);
 306                         }
 307                 }
 308         }
 309 
 310         return (-1);
 311 }
 312 
 313 /*
 314  * For OPL it's possible that memory from two or more successive boards
 315  * will be contiguous across the boards, and therefore represented as a
 316  * single chunk.
 317  * This function splits such chunks down the board boundaries.
 318  */
 319 static struct memlist *
 320 opl_memlist_per_board(struct memlist *ml)
 321 {
 322         uint64_t ssize, low, high, boundary;
 323         struct memlist *head, *tail, *new;
 324 
 325         ssize = (1ull << OPL_MC_MEMBOARD_SHIFT);
 326 
 327         head = tail = NULL;
 328 
 329         for (; ml; ml = ml->ml_next) {
 330                 low  = (uint64_t)ml->ml_address;
 331                 high = low+(uint64_t)(ml->ml_size);
 332                 while (low < high) {
 333                         boundary = roundup(low+1, ssize);
 334                         boundary = MIN(high, boundary);
 335                         new = kmem_zalloc(sizeof (struct memlist), KM_SLEEP);
 336                         new->ml_address = low;
 337                         new->ml_size = boundary - low;
 338                         if (head == NULL)
 339                                 head = new;
 340                         if (tail) {
 341                                 tail->ml_next = new;
 342                                 new->ml_prev = tail;
 343                         }
 344                         tail = new;
 345                         low = boundary;
 346                 }
 347         }
 348         return (head);
 349 }
 350 
 351 void
 352 set_platform_cage_params(void)
 353 {
 354         extern pgcnt_t total_pages;
 355         extern struct memlist *phys_avail;
 356         struct memlist *ml, *tml;
 357 
 358         if (kernel_cage_enable) {
 359                 pgcnt_t preferred_cage_size;
 360 
 361                 preferred_cage_size = MAX(opl_startup_cage_size,
 362                     total_pages / 256);
 363 
 364                 ml = opl_memlist_per_board(phys_avail);
 365 
 366                 /*
 367                  * Note: we are assuming that post has load the
 368                  * whole show in to the high end of memory. Having
 369                  * taken this leap, we copy the whole of phys_avail
 370                  * the glist and arrange for the cage to grow
 371                  * downward (descending pfns).
 372                  */
 373                 kcage_range_init(ml, KCAGE_DOWN, preferred_cage_size);
 374 
 375                 /* free the memlist */
 376                 do {
 377                         tml = ml->ml_next;
 378                         kmem_free(ml, sizeof (struct memlist));
 379                         ml = tml;
 380                 } while (ml != NULL);
 381         }
 382 
 383         if (kcage_on)
 384                 cmn_err(CE_NOTE, "!DR Kernel Cage is ENABLED");
 385         else
 386                 cmn_err(CE_NOTE, "!DR Kernel Cage is DISABLED");
 387 }
 388 
 389 /*ARGSUSED*/
 390 int
 391 plat_cpu_poweron(struct cpu *cp)
 392 {
 393         int (*opl_cpu_poweron)(struct cpu *) = NULL;
 394 
 395         opl_cpu_poweron =
 396             (int (*)(struct cpu *))kobj_getsymvalue("drmach_cpu_poweron", 0);
 397 
 398         if (opl_cpu_poweron == NULL)
 399                 return (ENOTSUP);
 400         else
 401                 return ((opl_cpu_poweron)(cp));
 402 
 403 }
 404 
 405 /*ARGSUSED*/
 406 int
 407 plat_cpu_poweroff(struct cpu *cp)
 408 {
 409         int (*opl_cpu_poweroff)(struct cpu *) = NULL;
 410 
 411         opl_cpu_poweroff =
 412             (int (*)(struct cpu *))kobj_getsymvalue("drmach_cpu_poweroff", 0);
 413 
 414         if (opl_cpu_poweroff == NULL)
 415                 return (ENOTSUP);
 416         else
 417                 return ((opl_cpu_poweroff)(cp));
 418 
 419 }
 420 
 421 int
 422 plat_max_boards(void)
 423 {
 424         /*
 425          * If the model cannot be determined, default to the max value.
 426          * Otherwise, Ikkaku model only supports 1 system board.
 427          */
 428         if ((opl_cur_model != NULL) && (opl_cur_model->model_type == IKKAKU))
 429                 return (OPL_MAX_BOARDS_IKKAKU);
 430         else
 431                 return (OPL_MAX_BOARDS);
 432 }
 433 
 434 int
 435 plat_max_cpu_units_per_board(void)
 436 {
 437         return (OPL_MAX_CPU_PER_BOARD);
 438 }
 439 
 440 int
 441 plat_max_mem_units_per_board(void)
 442 {
 443         return (OPL_MAX_MEM_UNITS_PER_BOARD);
 444 }
 445 
 446 int
 447 plat_max_io_units_per_board(void)
 448 {
 449         return (OPL_MAX_IO_UNITS_PER_BOARD);
 450 }
 451 
 452 int
 453 plat_max_cmp_units_per_board(void)
 454 {
 455         return (OPL_MAX_CMP_UNITS_PER_BOARD);
 456 }
 457 
 458 int
 459 plat_max_core_units_per_board(void)
 460 {
 461         return (OPL_MAX_CORE_UNITS_PER_BOARD);
 462 }
 463 
 464 int
 465 plat_pfn_to_mem_node(pfn_t pfn)
 466 {
 467         return (pfn >> mem_node_pfn_shift);
 468 }
 469 
 470 /* ARGSUSED */
 471 void
 472 plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
 473 {
 474         size_t  elem;
 475         pfn_t   basepfn;
 476         pgcnt_t npgs;
 477         uint64_t        boundary, ssize;
 478         uint64_t        low, high;
 479 
 480         /*
 481          * OPL mem slices are always aligned on a 256GB boundary.
 482          */
 483         mem_node_pfn_shift = OPL_MC_MEMBOARD_SHIFT - MMU_PAGESHIFT;
 484         mem_node_physalign = 0;
 485 
 486         /*
 487          * Boot install lists are arranged <addr, len>, <addr, len>, ...
 488          */
 489         ssize = (1ull << OPL_MC_MEMBOARD_SHIFT);
 490         for (elem = 0; elem < nelems; list++, elem++) {
 491                 low  = list->addr;
 492                 high = low + list->size;
 493                 while (low < high) {
 494                         boundary = roundup(low+1, ssize);
 495                         boundary = MIN(high, boundary);
 496                         basepfn = btop(low);
 497                         npgs = btop(boundary - low);
 498                         mem_node_add_slice(basepfn, basepfn + npgs - 1);
 499                         low = boundary;
 500                 }
 501         }
 502 }
 503 
 504 /*
 505  * Find the CPU associated with a slice at boot-time.
 506  */
 507 void
 508 plat_fill_mc(pnode_t nodeid)
 509 {
 510         int board;
 511         int memnode;
 512         struct {
 513                 uint64_t        addr;
 514                 uint64_t        size;
 515         } mem_range;
 516 
 517         if (prom_getprop(nodeid, "board#", (caddr_t)&board) < 0) {
 518                 panic("Can not find board# property in mc node %x", nodeid);
 519         }
 520         if (prom_getprop(nodeid, "sb-mem-ranges", (caddr_t)&mem_range) < 0) {
 521                 panic("Can not find sb-mem-ranges property in mc node %x",
 522                     nodeid);
 523         }
 524         memnode = mem_range.addr >> OPL_MC_MEMBOARD_SHIFT;
 525         plat_assign_lgrphand_to_mem_node(board, memnode);
 526 }
 527 
 528 /*
 529  * Return the platform handle for the lgroup containing the given CPU
 530  *
 531  * For OPL, lgroup platform handle == board #.
 532  */
 533 
 534 extern int mpo_disabled;
 535 extern lgrp_handle_t lgrp_default_handle;
 536 
 537 lgrp_handle_t
 538 plat_lgrp_cpu_to_hand(processorid_t id)
 539 {
 540         lgrp_handle_t plathand;
 541 
 542         /*
 543          * Return the real platform handle for the CPU until
 544          * such time as we know that MPO should be disabled.
 545          * At that point, we set the "mpo_disabled" flag to true,
 546          * and from that point on, return the default handle.
 547          *
 548          * By the time we know that MPO should be disabled, the
 549          * first CPU will have already been added to a leaf
 550          * lgroup, but that's ok. The common lgroup code will
 551          * double check that the boot CPU is in the correct place,
 552          * and in the case where mpo should be disabled, will move
 553          * it to the root if necessary.
 554          */
 555         if (mpo_disabled) {
 556                 /* If MPO is disabled, return the default (UMA) handle */
 557                 plathand = lgrp_default_handle;
 558         } else
 559                 plathand = (lgrp_handle_t)LSB_ID(id);
 560         return (plathand);
 561 }
 562 
 563 /*
 564  * Platform specific lgroup initialization
 565  */
 566 void
 567 plat_lgrp_init(void)
 568 {
 569         extern uint32_t lgrp_expand_proc_thresh;
 570         extern uint32_t lgrp_expand_proc_diff;
 571         const uint_t m = LGRP_LOADAVG_THREAD_MAX;
 572 
 573         /*
 574          * Set tuneables for the OPL architecture
 575          *
 576          * lgrp_expand_proc_thresh is the threshold load on the set of
 577          * lgroups a process is currently using on before considering
 578          * adding another lgroup to the set.  For Oly-C and Jupiter
 579          * systems, there are four sockets per lgroup. Setting
 580          * lgrp_expand_proc_thresh to add lgroups when the load reaches
 581          * four threads will spread the load when it exceeds one thread
 582          * per socket, optimizing memory bandwidth and L2 cache space.
 583          *
 584          * lgrp_expand_proc_diff determines how much less another lgroup
 585          * must be loaded before shifting the start location of a thread
 586          * to it.
 587          *
 588          * lgrp_loadavg_tolerance is the threshold where two lgroups are
 589          * considered to have different loads.  It is set to be less than
 590          * 1% so that even a small residual load will be considered different
 591          * from no residual load.
 592          *
 593          * We note loadavg values are not precise.
 594          * Every 1/10 of a second loadavg values are reduced by 5%.
 595          * This adjustment can come in the middle of the lgroup selection
 596          * process, and for larger parallel apps with many threads can
 597          * frequently occur between the start of the second thread
 598          * placement and the finish of the last thread placement.
 599          * We also must be careful to not use too small of a threshold
 600          * since the cumulative decay for 1 second idle time is 40%.
 601          * That is, the residual load from completed threads will still
 602          * be 60% one second after the proc goes idle or 8% after 5 seconds.
 603          *
 604          * To allow for lag time in loadavg calculations
 605          * remote thresh = 3.75 * LGRP_LOADAVG_THREAD_MAX
 606          * local thresh  = 0.75 * LGRP_LOADAVG_THREAD_MAX
 607          * tolerance     = 0.0078 * LGRP_LOADAVG_THREAD_MAX
 608          *
 609          * The load placement algorithms consider LGRP_LOADAVG_THREAD_MAX
 610          * as the equivalent of a load of 1. To make the code more compact,
 611          * we set m = LGRP_LOADAVG_THREAD_MAX.
 612          */
 613         lgrp_expand_proc_thresh = (m * 3) + (m >> 1) + (m >> 2);
 614         lgrp_expand_proc_diff = (m >> 1) + (m >> 2);
 615         lgrp_loadavg_tolerance = (m >> 7);
 616 }
 617 
 618 /*
 619  * Platform notification of lgroup (re)configuration changes
 620  */
 621 /*ARGSUSED*/
 622 void
 623 plat_lgrp_config(lgrp_config_flag_t evt, uintptr_t arg)
 624 {
 625         update_membounds_t *umb;
 626         lgrp_config_mem_rename_t lmr;
 627         int sbd, tbd;
 628         lgrp_handle_t hand, shand, thand;
 629         int mnode, snode, tnode;
 630         pfn_t start, end;
 631 
 632         if (mpo_disabled)
 633                 return;
 634 
 635         switch (evt) {
 636 
 637         case LGRP_CONFIG_MEM_ADD:
 638                 /*
 639                  * Establish the lgroup handle to memnode translation.
 640                  */
 641                 umb = (update_membounds_t *)arg;
 642 
 643                 hand = umb->u_board;
 644                 mnode = plat_pfn_to_mem_node(umb->u_base >> MMU_PAGESHIFT);
 645                 plat_assign_lgrphand_to_mem_node(hand, mnode);
 646 
 647                 break;
 648 
 649         case LGRP_CONFIG_MEM_DEL:
 650                 /*
 651                  * Special handling for possible memory holes.
 652                  */
 653                 umb = (update_membounds_t *)arg;
 654                 hand = umb->u_board;
 655                 if ((mnode = plat_lgrphand_to_mem_node(hand)) != -1) {
 656                         if (mem_node_config[mnode].exists) {
 657                                 start = mem_node_config[mnode].physbase;
 658                                 end = mem_node_config[mnode].physmax;
 659                                 mem_node_del_slice(start, end);
 660                         }
 661                 }
 662 
 663                 break;
 664 
 665         case LGRP_CONFIG_MEM_RENAME:
 666                 /*
 667                  * During a DR copy-rename operation, all of the memory
 668                  * on one board is moved to another board -- but the
 669                  * addresses/pfns and memnodes don't change. This means
 670                  * the memory has changed locations without changing identity.
 671                  *
 672                  * Source is where we are copying from and target is where we
 673                  * are copying to.  After source memnode is copied to target
 674                  * memnode, the physical addresses of the target memnode are
 675                  * renamed to match what the source memnode had.  Then target
 676                  * memnode can be removed and source memnode can take its
 677                  * place.
 678                  *
 679                  * To do this, swap the lgroup handle to memnode mappings for
 680                  * the boards, so target lgroup will have source memnode and
 681                  * source lgroup will have empty target memnode which is where
 682                  * its memory will go (if any is added to it later).
 683                  *
 684                  * Then source memnode needs to be removed from its lgroup
 685                  * and added to the target lgroup where the memory was living
 686                  * but under a different name/memnode.  The memory was in the
 687                  * target memnode and now lives in the source memnode with
 688                  * different physical addresses even though it is the same
 689                  * memory.
 690                  */
 691                 sbd = arg & 0xffff;
 692                 tbd = (arg & 0xffff0000) >> 16;
 693                 shand = sbd;
 694                 thand = tbd;
 695                 snode = plat_lgrphand_to_mem_node(shand);
 696                 tnode = plat_lgrphand_to_mem_node(thand);
 697 
 698                 /*
 699                  * Special handling for possible memory holes.
 700                  */
 701                 if (tnode != -1 && mem_node_config[tnode].exists) {
 702                         start = mem_node_config[tnode].physbase;
 703                         end = mem_node_config[tnode].physmax;
 704                         mem_node_del_slice(start, end);
 705                 }
 706 
 707                 plat_assign_lgrphand_to_mem_node(thand, snode);
 708                 plat_assign_lgrphand_to_mem_node(shand, tnode);
 709 
 710                 lmr.lmem_rename_from = shand;
 711                 lmr.lmem_rename_to = thand;
 712 
 713                 /*
 714                  * Remove source memnode of copy rename from its lgroup
 715                  * and add it to its new target lgroup
 716                  */
 717                 lgrp_config(LGRP_CONFIG_MEM_RENAME, (uintptr_t)snode,
 718                     (uintptr_t)&lmr);
 719 
 720                 break;
 721 
 722         default:
 723                 break;
 724         }
 725 }
 726 
 727 /*
 728  * Return latency between "from" and "to" lgroups
 729  *
 730  * This latency number can only be used for relative comparison
 731  * between lgroups on the running system, cannot be used across platforms,
 732  * and may not reflect the actual latency.  It is platform and implementation
 733  * specific, so platform gets to decide its value.  It would be nice if the
 734  * number was at least proportional to make comparisons more meaningful though.
 735  * NOTE: The numbers below are supposed to be load latencies for uncached
 736  * memory divided by 10.
 737  *
 738  */
 739 int
 740 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
 741 {
 742         /*
 743          * Return min remote latency when there are more than two lgroups
 744          * (root and child) and getting latency between two different lgroups
 745          * or root is involved
 746          */
 747         if (lgrp_optimizations() && (from != to ||
 748             from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE))
 749                 return (42);
 750         else
 751                 return (35);
 752 }
 753 
 754 /*
 755  * Return platform handle for root lgroup
 756  */
 757 lgrp_handle_t
 758 plat_lgrp_root_hand(void)
 759 {
 760         if (mpo_disabled)
 761                 return (lgrp_default_handle);
 762 
 763         return (LGRP_DEFAULT_HANDLE);
 764 }
 765 
 766 /*ARGSUSED*/
 767 void
 768 plat_freelist_process(int mnode)
 769 {
 770 }
 771 
 772 void
 773 load_platform_drivers(void)
 774 {
 775         (void) i_ddi_attach_pseudo_node("dr");
 776 }
 777 
 778 /*
 779  * No platform drivers on this platform
 780  */
 781 char *platform_module_list[] = {
 782         (char *)0
 783 };
 784 
 785 /*ARGSUSED*/
 786 void
 787 plat_tod_fault(enum tod_fault_type tod_bad)
 788 {
 789 }
 790 
 791 /*ARGSUSED*/
 792 void
 793 cpu_sgn_update(ushort_t sgn, uchar_t state, uchar_t sub_state, int cpuid)
 794 {
 795         static void (*scf_panic_callback)(int);
 796         static void (*scf_shutdown_callback)(int);
 797 
 798         /*
 799          * This is for notifing system panic/shutdown to SCF.
 800          * In case of shutdown and panic, SCF call back
 801          * function should be called.
 802          *  <SCF call back functions>
 803          *   scf_panic_callb()   : panicsys()->panic_quiesce_hw()
 804          *   scf_shutdown_callb(): halt() or power_down() or reboot_machine()
 805          * cpuid should be -1 and state should be SIGST_EXIT.
 806          */
 807         if (state == SIGST_EXIT && cpuid == -1) {
 808 
 809                 /*
 810                  * find the symbol for the SCF panic callback routine in driver
 811                  */
 812                 if (scf_panic_callback == NULL)
 813                         scf_panic_callback = (void (*)(int))
 814                             modgetsymvalue("scf_panic_callb", 0);
 815                 if (scf_shutdown_callback == NULL)
 816                         scf_shutdown_callback = (void (*)(int))
 817                             modgetsymvalue("scf_shutdown_callb", 0);
 818 
 819                 switch (sub_state) {
 820                 case SIGSUBST_PANIC:
 821                         if (scf_panic_callback == NULL) {
 822                                 cmn_err(CE_NOTE, "!cpu_sgn_update: "
 823                                     "scf_panic_callb not found\n");
 824                                 return;
 825                         }
 826                         scf_panic_callback(SIGSUBST_PANIC);
 827                         break;
 828 
 829                 case SIGSUBST_HALT:
 830                         if (scf_shutdown_callback == NULL) {
 831                                 cmn_err(CE_NOTE, "!cpu_sgn_update: "
 832                                     "scf_shutdown_callb not found\n");
 833                                 return;
 834                         }
 835                         scf_shutdown_callback(SIGSUBST_HALT);
 836                         break;
 837 
 838                 case SIGSUBST_ENVIRON:
 839                         if (scf_shutdown_callback == NULL) {
 840                                 cmn_err(CE_NOTE, "!cpu_sgn_update: "
 841                                     "scf_shutdown_callb not found\n");
 842                                 return;
 843                         }
 844                         scf_shutdown_callback(SIGSUBST_ENVIRON);
 845                         break;
 846 
 847                 case SIGSUBST_REBOOT:
 848                         if (scf_shutdown_callback == NULL) {
 849                                 cmn_err(CE_NOTE, "!cpu_sgn_update: "
 850                                     "scf_shutdown_callb not found\n");
 851                                 return;
 852                         }
 853                         scf_shutdown_callback(SIGSUBST_REBOOT);
 854                         break;
 855                 }
 856         }
 857 }
 858 
 859 /*ARGSUSED*/
 860 int
 861 plat_get_mem_unum(int synd_code, uint64_t flt_addr, int flt_bus_id,
 862     int flt_in_memory, ushort_t flt_status, char *buf, int buflen, int *lenp)
 863 {
 864         /*
 865          * check if it's a Memory error.
 866          */
 867         if (flt_in_memory) {
 868                 if (opl_get_mem_unum != NULL) {
 869                         return (opl_get_mem_unum(synd_code, flt_addr, buf,
 870                             buflen, lenp));
 871                 } else {
 872                         return (ENOTSUP);
 873                 }
 874         } else {
 875                 return (ENOTSUP);
 876         }
 877 }
 878 
 879 /*ARGSUSED*/
 880 int
 881 plat_get_cpu_unum(int cpuid, char *buf, int buflen, int *lenp)
 882 {
 883         int     ret = 0;
 884         int     sb;
 885         int     plen;
 886 
 887         sb = opl_get_physical_board(LSB_ID(cpuid));
 888         if (sb == -1) {
 889                 return (ENXIO);
 890         }
 891 
 892         /*
 893          * opl_cur_model is assigned here
 894          */
 895         if (opl_cur_model == NULL) {
 896                 set_model_info();
 897 
 898                 /*
 899                  * if not matched, return
 900                  */
 901                 if (opl_cur_model == NULL)
 902                         return (ENODEV);
 903         }
 904 
 905         ASSERT((opl_cur_model - opl_models) == (opl_cur_model->model_type));
 906 
 907         switch (opl_cur_model->model_type) {
 908         case FF1:
 909                 plen = snprintf(buf, buflen, "/%s/CPUM%d", "MBU_A",
 910                     CHIP_ID(cpuid) / 2);
 911                 break;
 912 
 913         case FF2:
 914                 plen = snprintf(buf, buflen, "/%s/CPUM%d", "MBU_B",
 915                     (CHIP_ID(cpuid) / 2) + (sb * 2));
 916                 break;
 917 
 918         case DC1:
 919         case DC2:
 920         case DC3:
 921                 plen = snprintf(buf, buflen, "/%s%02d/CPUM%d", "CMU", sb,
 922                     CHIP_ID(cpuid));
 923                 break;
 924 
 925         case IKKAKU:
 926                 plen = snprintf(buf, buflen, "/%s", "MBU_A");
 927                 break;
 928 
 929         default:
 930                 /* This should never happen */
 931                 return (ENODEV);
 932         }
 933 
 934         if (plen >= buflen) {
 935                 ret = ENOSPC;
 936         } else {
 937                 if (lenp)
 938                         *lenp = strlen(buf);
 939         }
 940         return (ret);
 941 }
 942 
 943 void
 944 plat_nodename_set(void)
 945 {
 946         post_xscf_msg((char *)&utsname, sizeof (struct utsname));
 947 }
 948 
 949 caddr_t efcode_vaddr = NULL;
 950 
 951 /*
 952  * Preallocate enough memory for fcode claims.
 953  */
 954 
 955 caddr_t
 956 efcode_alloc(caddr_t alloc_base)
 957 {
 958         caddr_t efcode_alloc_base = (caddr_t)roundup((uintptr_t)alloc_base,
 959             MMU_PAGESIZE);
 960         caddr_t vaddr;
 961 
 962         /*
 963          * allocate the physical memory for the Oberon fcode.
 964          */
 965         if ((vaddr = (caddr_t)BOP_ALLOC(bootops, efcode_alloc_base,
 966             efcode_size, MMU_PAGESIZE)) == NULL)
 967                 cmn_err(CE_PANIC, "Cannot allocate Efcode Memory");
 968 
 969         efcode_vaddr = vaddr;
 970 
 971         return (efcode_alloc_base + efcode_size);
 972 }
 973 
 974 caddr_t
 975 plat_startup_memlist(caddr_t alloc_base)
 976 {
 977         caddr_t tmp_alloc_base;
 978 
 979         tmp_alloc_base = efcode_alloc(alloc_base);
 980         tmp_alloc_base =
 981             (caddr_t)roundup((uintptr_t)tmp_alloc_base, ecache_alignsize);
 982         return (tmp_alloc_base);
 983 }
 984 
 985 /* need to forward declare these */
 986 static void plat_lock_delay(uint_t);
 987 
 988 void
 989 startup_platform(void)
 990 {
 991         if (clock_tick_threshold == 0)
 992                 clock_tick_threshold = OPL_CLOCK_TICK_THRESHOLD;
 993         if (clock_tick_ncpus == 0)
 994                 clock_tick_ncpus = OPL_CLOCK_TICK_NCPUS;
 995         mutex_lock_delay = plat_lock_delay;
 996         mutex_cap_factor = OPL_BOFF_MAX_SCALE;
 997 }
 998 
 999 static uint_t
1000 get_mmu_id(processorid_t cpuid)
1001 {
1002         int pb = opl_get_physical_board(LSB_ID(cpuid));
1003 
1004         if (pb == -1) {
1005                 cmn_err(CE_PANIC,
1006                     "opl_get_physical_board failed (cpu %d LSB %u)",
1007                     cpuid, LSB_ID(cpuid));
1008         }
1009         return (pb * OPL_MAX_COREID_PER_BOARD) + (CHIP_ID(cpuid) *
1010             OPL_MAX_COREID_PER_CMP) + CORE_ID(cpuid);
1011 }
1012 
1013 void
1014 plat_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *info)
1015 {
1016         int     impl;
1017 
1018         impl = cpunodes[cpuid].implementation;
1019         if (IS_OLYMPUS_C(impl) || IS_JUPITER(impl)) {
1020                 info->mmu_idx = get_mmu_id(cpuid);
1021                 info->mmu_nctxs = 8192;
1022         } else {
1023                 cmn_err(CE_PANIC, "Unknown processor %d", impl);
1024         }
1025 }
1026 
1027 int
1028 plat_get_mem_sid(char *unum, char *buf, int buflen, int *lenp)
1029 {
1030         if (opl_get_mem_sid == NULL) {
1031                 return (ENOTSUP);
1032         }
1033         return (opl_get_mem_sid(unum, buf, buflen, lenp));
1034 }
1035 
1036 int
1037 plat_get_mem_offset(uint64_t paddr, uint64_t *offp)
1038 {
1039         if (opl_get_mem_offset == NULL) {
1040                 return (ENOTSUP);
1041         }
1042         return (opl_get_mem_offset(paddr, offp));
1043 }
1044 
1045 int
1046 plat_get_mem_addr(char *unum, char *sid, uint64_t offset, uint64_t *addrp)
1047 {
1048         if (opl_get_mem_addr == NULL) {
1049                 return (ENOTSUP);
1050         }
1051         return (opl_get_mem_addr(unum, sid, offset, addrp));
1052 }
1053 
1054 void
1055 plat_lock_delay(uint_t backoff)
1056 {
1057         int i;
1058         uint_t cnt, remcnt;
1059         int ctr;
1060         hrtime_t delay_start, rem_delay;
1061         /*
1062          * Platform specific lock delay code for OPL
1063          *
1064          * Using staged linear increases in the delay.
1065          * The sleep instruction is the preferred method of delay,
1066          * but is too large of granularity for the initial backoff.
1067          */
1068 
1069         if (backoff < 100) {
1070                 /*
1071                  * If desired backoff is long enough,
1072                  * use sleep for most of it
1073                  */
1074                 for (cnt = backoff;
1075                     cnt >= OPL_BOFF_SLEEP;
1076                     cnt -= OPL_BOFF_SLEEP) {
1077                         cpu_smt_pause();
1078                 }
1079                 /*
1080                  * spin for small remainder of backoff
1081                  */
1082                 for (ctr = cnt * OPL_BOFF_SPIN; ctr; ctr--) {
1083                         mutex_delay_default();
1084                 }
1085         } else {
1086                 /* backoff is large.  Fill it by sleeping */
1087                 delay_start = gethrtime_waitfree();
1088                 cnt = backoff / OPL_BOFF_SLEEP;
1089                 /*
1090                  * use sleep instructions for delay
1091                  */
1092                 for (i = 0; i < cnt; i++) {
1093                         cpu_smt_pause();
1094                 }
1095 
1096                 /*
1097                  * Note: if the other strand executes a sleep instruction,
1098                  * then the sleep ends immediately with a minimum time of
1099                  * 42 clocks.  We check gethrtime to insure we have
1100                  * waited long enough.  And we include both a short
1101                  * spin loop and a sleep for repeated delay times.
1102                  */
1103 
1104                 rem_delay = gethrtime_waitfree() - delay_start;
1105                 while (rem_delay < cnt * OPL_BOFF_TM) {
1106                         remcnt = cnt - (rem_delay / OPL_BOFF_TM);
1107                         for (i = 0; i < remcnt; i++) {
1108                                 cpu_smt_pause();
1109                                 for (ctr = OPL_BOFF_SPIN; ctr; ctr--) {
1110                                         mutex_delay_default();
1111                                 }
1112                         }
1113                         rem_delay = gethrtime_waitfree() - delay_start;
1114                 }
1115         }
1116 }
1117 
1118 /*
1119  * The following code implements asynchronous call to XSCF to setup the
1120  * domain node name.
1121  */
1122 
/* Free a request; its allocation size is NM_LEN() of its len field. */
#define FREE_MSG(msgp)          kmem_free((msgp), NM_LEN((msgp)->len))
1124 
/*
 * The following three macros define all the operations on the request
 * list we are using here, and hide the details of the list
 * implementation from the code.
 *
 * PUSH(m) links request m in at the head of the ctl_msg list.  The body
 * is wrapped in do { } while (0) so that "PUSH(m);" is a single statement
 * and can be used safely in an unbraced if/else.
 */
#define PUSH(m) \
        do { \
                (m)->next = ctl_msg.head; \
                (m)->prev = NULL; \
                if ((m)->next != NULL) \
                        (m)->next->prev = (m); \
                ctl_msg.head = (m); \
        } while (0)
1138 
/*
 * REMOVE(m) unlinks request m from the ctl_msg list, updating
 * ctl_msg.head when m is the first element.  m's own next/prev fields are
 * left unchanged.  Wrapped in do { } while (0) so that "REMOVE(m);" is a
 * single statement and can be used safely in an unbraced if/else.
 */
#define REMOVE(m) \
        do { \
                if ((m)->prev != NULL) \
                        (m)->prev->next = (m)->next; \
                else \
                        ctl_msg.head = (m)->next; \
                if ((m)->next != NULL) \
                        (m)->next->prev = (m)->prev; \
        } while (0)
1148 
/*
 * FREE_THE_TAIL(head) detaches every request that follows head, frees
 * each of them with FREE_MSG(), and leaves head->next NULL.  head itself
 * is not freed.  Wrapped in do { } while (0) so that the invocation is a
 * single statement and can be used safely in an unbraced if/else.
 */
#define FREE_THE_TAIL(head) \
        do { \
                nm_msg_t *n_msg, *m; \
                m = (head)->next; \
                (head)->next = NULL; \
                while (m != NULL) { \
                        n_msg = m->next; \
                        FREE_MSG(m); \
                        m = n_msg; \
                } \
        } while (0)
1160 
/*
 * Call SCF put routine fn with key KEY_ESCF, the fixed arguments 0x01 and
 * 0, a payload size and a payload pointer.
 */
#define SCF_PUTINFO(fn, size, ptr) \
        fn(KEY_ESCF, 0x01, 0, (size), (ptr))
1163 
/*
 * Pass request req to ctl_msg.scf_service_function, store the return
 * code in rc, and evaluate to non-zero when that code is 0.
 */
#define PASS2XSCF(req, rc) \
        (((rc) = SCF_PUTINFO(ctl_msg.scf_service_function, \
            (req)->len, (req)->data)) == 0)
1166 
/*
 * The value of the following macro loosely depends on the
 * value of the "device busy" timeout used in the SCF driver.
 * pass2xscf_thread() multiplies it by the SCF driver's
 * scf_online_wait_rcnt value to obtain the driver delay (in seconds,
 * per the comment in that function) from which the number of send
 * attempts is derived.
 */
#define SCF_DEVBUSY_DELAY       10
1173 
/*
 * The default number of attempts to contact the scf driver
 * if we cannot fetch any information about the timeout value
 * it uses.  pass2xscf_thread() falls back to this value when the
 * scf_online_wait_rcnt symbol cannot be found.
 */

#define REPEATS         4
1181 
/*
 * One queued request: doubly-linked list pointers followed by len bytes
 * of payload.  data[] is declared with one element, and the structure is
 * allocated with NM_LEN(len) bytes so the payload extends past its end.
 */
typedef struct nm_msg nm_msg_t;

struct nm_msg {
        nm_msg_t *next;
        nm_msg_t *prev;
        int len;
        char data[1];
};

/* Allocation size of an nm_msg_t that carries nbytes of payload. */
#define NM_LEN(nbytes)          (sizeof (nm_msg_t) + (nbytes) - 1)
1190 
1191 static struct ctlmsg {
1192         nm_msg_t        *head;
1193         nm_msg_t        *now_serving;
1194         kmutex_t        nm_lock;
1195         kthread_t       *nmt;
1196         int             cnt;
1197         int (*scf_service_function)(uint32_t, uint8_t,
1198                                     uint32_t, uint32_t, void *);
1199 } ctl_msg;
1200 
1201 static void
1202 post_xscf_msg(char *dp, int len)
1203 {
1204         nm_msg_t *msg;
1205 
1206         msg = (nm_msg_t *)kmem_zalloc(NM_LEN(len), KM_SLEEP);
1207 
1208         bcopy(dp, msg->data, len);
1209         msg->len = len;
1210 
1211         mutex_enter(&ctl_msg.nm_lock);
1212         if (ctl_msg.nmt == NULL) {
1213                 ctl_msg.nmt =  thread_create(NULL, 0, pass2xscf_thread,
1214                     NULL, 0, &p0, TS_RUN, minclsyspri);
1215         }
1216 
1217         PUSH(msg);
1218         ctl_msg.cnt++;
1219         mutex_exit(&ctl_msg.nm_lock);
1220 }
1221 
1222 static void
1223 pass2xscf_thread()
1224 {
1225         nm_msg_t *msg;
1226         int ret;
1227         uint_t i, msg_sent, xscf_driver_delay;
1228         static uint_t repeat_cnt;
1229         uint_t *scf_wait_cnt;
1230 
1231         mutex_enter(&ctl_msg.nm_lock);
1232 
1233         /*
1234          * Find the address of the SCF put routine if it's not done yet.
1235          */
1236         if (ctl_msg.scf_service_function == NULL) {
1237                 if ((ctl_msg.scf_service_function =
1238                     (int (*)(uint32_t, uint8_t, uint32_t, uint32_t, void *))
1239                     modgetsymvalue("scf_service_putinfo", 0)) == NULL) {
1240                         cmn_err(CE_NOTE, "pass2xscf_thread: "
1241                             "scf_service_putinfo not found\n");
1242                         ctl_msg.nmt = NULL;
1243                         mutex_exit(&ctl_msg.nm_lock);
1244                         return;
1245                 }
1246         }
1247 
1248         /*
1249          * Calculate the number of attempts to connect XSCF based on the
1250          * scf driver delay (which is
1251          * SCF_DEVBUSY_DELAY*scf_online_wait_rcnt seconds) and the value
1252          * of xscf_connect_delay (the total number of seconds to wait
1253          * till xscf get ready.)
1254          */
1255         if (repeat_cnt == 0) {
1256                 if ((scf_wait_cnt =
1257                     (uint_t *)
1258                     modgetsymvalue("scf_online_wait_rcnt", 0)) == NULL) {
1259                         repeat_cnt = REPEATS;
1260                 } else {
1261 
1262                         xscf_driver_delay = *scf_wait_cnt *
1263                             SCF_DEVBUSY_DELAY;
1264                         repeat_cnt = (xscf_connect_delay/xscf_driver_delay) + 1;
1265                 }
1266         }
1267 
1268         while (ctl_msg.cnt != 0) {
1269 
1270                 /*
1271                  * Take the very last request from the queue,
1272                  */
1273                 ctl_msg.now_serving = ctl_msg.head;
1274                 ASSERT(ctl_msg.now_serving != NULL);
1275 
1276                 /*
1277                  * and discard all the others if any.
1278                  */
1279                 FREE_THE_TAIL(ctl_msg.now_serving);
1280                 ctl_msg.cnt = 1;
1281                 mutex_exit(&ctl_msg.nm_lock);
1282 
1283                 /*
1284                  * Pass the name to XSCF. Note please, we do not hold the
1285                  * mutex while we are doing this.
1286                  */
1287                 msg_sent = 0;
1288                 for (i = 0; i < repeat_cnt; i++) {
1289                         if (PASS2XSCF(ctl_msg.now_serving, ret)) {
1290                                 msg_sent = 1;
1291                                 break;
1292                         } else {
1293                                 if (ret != EBUSY) {
1294                                         cmn_err(CE_NOTE, "pass2xscf_thread:"
1295                                             " unexpected return code"
1296                                             " from scf_service_putinfo():"
1297                                             " %d\n", ret);
1298                                 }
1299                         }
1300                 }
1301 
1302                 if (msg_sent) {
1303 
1304                         /*
1305                          * Remove the request from the list
1306                          */
1307                         mutex_enter(&ctl_msg.nm_lock);
1308                         msg = ctl_msg.now_serving;
1309                         ctl_msg.now_serving = NULL;
1310                         REMOVE(msg);
1311                         ctl_msg.cnt--;
1312                         mutex_exit(&ctl_msg.nm_lock);
1313                         FREE_MSG(msg);
1314                 } else {
1315 
1316                         /*
1317                          * If while we have tried to communicate with
1318                          * XSCF there were any other requests we are
1319                          * going to drop this one and take the latest
1320                          * one.  Otherwise we will try to pass this one
1321                          * again.
1322                          */
1323                         cmn_err(CE_NOTE,
1324                             "pass2xscf_thread: "
1325                             "scf_service_putinfo "
1326                             "not responding\n");
1327                 }
1328                 mutex_enter(&ctl_msg.nm_lock);
1329         }
1330 
1331         /*
1332          * The request queue is empty, exit.
1333          */
1334         ctl_msg.nmt = NULL;
1335         mutex_exit(&ctl_msg.nm_lock);
1336 }