Print this page
    
6062 Workaround broken KVM handling of directed EOIs
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/i86pc/io/pcplusmp/apic_regops.c
          +++ new/usr/src/uts/i86pc/io/pcplusmp/apic_regops.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  
    | 
      ↓ open down ↓ | 
    16 lines elided | 
    
      ↑ open up ↑ | 
  
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  /*
  26   26   * Copyright 2014 Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
       27 + * Copyright (c) 2014 by Delphix. All rights reserved.
  27   28   */
  28   29  
  29   30  #include <sys/cpuvar.h>
  30   31  #include <sys/psm.h>
  31   32  #include <sys/archsystm.h>
  32   33  #include <sys/apic.h>
  33   34  #include <sys/sunddi.h>
  34   35  #include <sys/ddi_impldefs.h>
  35   36  #include <sys/mach_intr.h>
  36   37  #include <sys/sysmacros.h>
  37   38  #include <sys/trap.h>
  38   39  #include <sys/x86_archext.h>
  39   40  #include <sys/privregs.h>
  40   41  #include <sys/psm_common.h>
  41   42  
  42   43  /* Function prototypes of local apic and X2APIC */
  43   44  static uint64_t local_apic_read(uint32_t reg);
  44   45  static void local_apic_write(uint32_t reg, uint64_t value);
  45   46  static int get_local_apic_pri(void);
  46   47  static void local_apic_write_task_reg(uint64_t value);
  47   48  static void local_apic_write_int_cmd(uint32_t cpu_id, uint32_t cmd1);
  48   49  static uint64_t local_x2apic_read(uint32_t msr);
  49   50  static void local_x2apic_write(uint32_t msr, uint64_t value);
  50   51  static int get_local_x2apic_pri(void);
  51   52  static void local_x2apic_write_task_reg(uint64_t value);
  52   53  static void local_x2apic_write_int_cmd(uint32_t cpu_id, uint32_t cmd1);
  53   54  
  54   55  /*
  55   56   * According to the X2APIC specification:
  56   57   *
  57   58   *   xAPIC global enable    X2APIC enable         Description
  58   59   *   (IA32_APIC_BASE[11])   (IA32_APIC_BASE[10])
  
    | 
      ↓ open down ↓ | 
    22 lines elided | 
    
      ↑ open up ↑ | 
  
  59   60   * -----------------------------------------------------------
  60   61   *      0                       0       APIC is disabled
  61   62   *      0                       1       Invalid
  62   63   *      1                       0       APIC is enabled in xAPIC mode
  63   64   *      1                       1       APIC is enabled in X2APIC mode
  64   65   * -----------------------------------------------------------
  65   66   */
  66   67  int     x2apic_enable = 1;
  67   68  apic_mode_t apic_mode = LOCAL_APIC;     /* Default mode is Local APIC */
  68   69  
       70 +/* See apic_directed_EOI_supported().  Currently 3-state variable. */
       71 +volatile int apic_directed_eoi_state = 2;
       72 +
  69   73  /* Uses MMIO (Memory Mapped IO) */
  70   74  static apic_reg_ops_t local_apic_regs_ops = {
  71   75          local_apic_read,
  72   76          local_apic_write,
  73   77          get_local_apic_pri,
  74   78          local_apic_write_task_reg,
  75   79          local_apic_write_int_cmd,
  76   80          apic_send_EOI,
  77   81  };
  78   82  
  79   83  /* X2APIC : Uses RDMSR/WRMSR instructions to access APIC registers */
  80   84  static apic_reg_ops_t x2apic_regs_ops = {
  81   85          local_x2apic_read,
  82   86          local_x2apic_write,
  83   87          get_local_x2apic_pri,
  84   88          local_x2apic_write_task_reg,
  85   89          local_x2apic_write_int_cmd,
  86   90          apic_send_EOI,
  87   91  };
  88   92  
  89   93  int apic_have_32bit_cr8 = 0;
  90   94  
  91   95  /* The default ops is local APIC (Memory Mapped IO) */
  92   96  apic_reg_ops_t *apic_reg_ops = &local_apic_regs_ops;
  93   97  
  94   98  /*
  95   99   * APIC register ops related data sturctures and functions.
  96  100   */
  97  101  void apic_send_EOI();
  98  102  void apic_send_directed_EOI(uint32_t irq);
  99  103  
 100  104  #define X2APIC_ENABLE_BIT       10
 101  105  
 102  106  /*
 103  107   * Local APIC Implementation
 104  108   */
 105  109  static uint64_t
 106  110  local_apic_read(uint32_t reg)
 107  111  {
 108  112          return ((uint32_t)apicadr[reg]);
 109  113  }
 110  114  
 111  115  static void
 112  116  local_apic_write(uint32_t reg, uint64_t value)
 113  117  {
 114  118          apicadr[reg] = (uint32_t)value;
 115  119  }
 116  120  
 117  121  static int
 118  122  get_local_apic_pri(void)
 119  123  {
 120  124  #if defined(__amd64)
 121  125          return ((int)getcr8());
 122  126  #else
 123  127          if (apic_have_32bit_cr8)
 124  128                  return ((int)getcr8());
 125  129          return (apicadr[APIC_TASK_REG]);
 126  130  #endif
 127  131  }
 128  132  
 129  133  static void
 130  134  local_apic_write_task_reg(uint64_t value)
 131  135  {
 132  136  #if defined(__amd64)
 133  137          setcr8((ulong_t)(value >> APIC_IPL_SHIFT));
 134  138  #else
 135  139          if (apic_have_32bit_cr8)
 136  140                  setcr8((ulong_t)(value >> APIC_IPL_SHIFT));
 137  141          else
 138  142                  apicadr[APIC_TASK_REG] = (uint32_t)value;
 139  143  #endif
 140  144  }
 141  145  
 142  146  static void
 143  147  local_apic_write_int_cmd(uint32_t cpu_id, uint32_t cmd1)
 144  148  {
 145  149          apicadr[APIC_INT_CMD2] = cpu_id << APIC_ICR_ID_BIT_OFFSET;
 146  150          apicadr[APIC_INT_CMD1] = cmd1;
 147  151  }
 148  152  
 149  153  /*
 150  154   * X2APIC Implementation.
 151  155   */
 152  156  static uint64_t
 153  157  local_x2apic_read(uint32_t msr)
 154  158  {
 155  159          uint64_t i;
 156  160  
 157  161          i = (uint64_t)(rdmsr(REG_X2APIC_BASE_MSR + (msr >> 2)) & 0xffffffff);
 158  162          return (i);
 159  163  }
 160  164  
 161  165  static void
 162  166  local_x2apic_write(uint32_t msr, uint64_t value)
 163  167  {
 164  168          uint64_t tmp;
 165  169  
 166  170          if (msr != APIC_EOI_REG) {
 167  171                  tmp = rdmsr(REG_X2APIC_BASE_MSR + (msr >> 2));
 168  172                  tmp = (tmp & 0xffffffff00000000) | value;
 169  173          } else {
 170  174                  tmp = 0;
 171  175          }
 172  176  
 173  177          wrmsr((REG_X2APIC_BASE_MSR + (msr >> 2)), tmp);
 174  178  }
 175  179  
 176  180  static int
 177  181  get_local_x2apic_pri(void)
 178  182  {
 179  183          return (rdmsr(REG_X2APIC_BASE_MSR + (APIC_TASK_REG >> 2)));
 180  184  }
 181  185  
 182  186  static void
 183  187  local_x2apic_write_task_reg(uint64_t value)
 184  188  {
 185  189          X2APIC_WRITE(APIC_TASK_REG, value);
 186  190  }
 187  191  
 188  192  static void
 189  193  local_x2apic_write_int_cmd(uint32_t cpu_id, uint32_t cmd1)
 190  194  {
 191  195          wrmsr((REG_X2APIC_BASE_MSR + (APIC_INT_CMD1 >> 2)),
 192  196              (((uint64_t)cpu_id << 32) | cmd1));
 193  197  }
 194  198  
 195  199  /*ARGSUSED*/
 196  200  void
 197  201  apic_send_EOI(uint32_t irq)
 198  202  {
 199  203          apic_reg_ops->apic_write(APIC_EOI_REG, 0);
 200  204  }
 201  205  
 202  206  /*
 203  207   * Support for Directed EOI capability is available in both the xAPIC
 204  208   * and x2APIC mode.
 205  209   */
 206  210  void
 207  211  apic_send_directed_EOI(uint32_t irq)
 208  212  {
 209  213          uchar_t ioapicindex;
 210  214          uchar_t vector;
 211  215          apic_irq_t *apic_irq;
 212  216          short intr_index;
 213  217  
 214  218          /*
 215  219           * Following the EOI to the local APIC unit, perform a directed
 216  220           * EOI to the IOxAPIC generating the interrupt by writing to its
 217  221           * EOI register.
 218  222           *
 219  223           * A broadcast EOI is not generated.
 220  224           */
 221  225          apic_reg_ops->apic_write(APIC_EOI_REG, 0);
 222  226  
 223  227          apic_irq = apic_irq_table[irq];
 224  228          while (apic_irq) {
 225  229                  intr_index = apic_irq->airq_mps_intr_index;
 226  230                  if (intr_index == ACPI_INDEX || intr_index >= 0) {
 227  231                          ioapicindex = apic_irq->airq_ioapicindex;
 228  232                          vector = apic_irq->airq_vector;
 229  233                          ioapic_write_eoi(ioapicindex, vector);
 230  234                  }
 231  235                  apic_irq = apic_irq->airq_next;
 232  236          }
 233  237  }
 234  238  
 235  239  int
 236  240  apic_detect_x2apic(void)
 237  241  {
 238  242          if (x2apic_enable == 0)
 239  243                  return (0);
 240  244  
 241  245          return (is_x86_feature(x86_featureset, X86FSET_X2APIC));
 242  246  }
 243  247  
 244  248  void
 245  249  apic_enable_x2apic(void)
 246  250  {
 247  251          uint64_t apic_base_msr;
 248  252  
 249  253          if (apic_local_mode() == LOCAL_X2APIC) {
 250  254                  /* BIOS apparently has enabled X2APIC */
 251  255                  if (apic_mode != LOCAL_X2APIC)
 252  256                          x2apic_update_psm();
 253  257                  return;
 254  258          }
 255  259  
 256  260          /*
 257  261           * This is the first time we are enabling X2APIC on this CPU
 258  262           */
 259  263          apic_base_msr = rdmsr(REG_APIC_BASE_MSR);
 260  264          apic_base_msr = apic_base_msr | (0x1 << X2APIC_ENABLE_BIT);
 261  265          wrmsr(REG_APIC_BASE_MSR, apic_base_msr);
 262  266  
 263  267          if (apic_mode != LOCAL_X2APIC)
 264  268                  x2apic_update_psm();
 265  269  }
 266  270  
 267  271  /*
 268  272   * Determine which mode the current CPU is in. See the table above.
 269  273   * (IA32_APIC_BASE[11])   (IA32_APIC_BASE[10])
 270  274   */
 271  275  int
 272  276  apic_local_mode(void)
 273  277  {
 274  278          uint64_t apic_base_msr;
 275  279          int bit = ((0x1 << (X2APIC_ENABLE_BIT + 1)) |
 276  280              (0x1 << X2APIC_ENABLE_BIT));
 277  281  
 278  282          apic_base_msr = rdmsr(REG_APIC_BASE_MSR);
 279  283  
 280  284          if ((apic_base_msr & bit) == bit)
 281  285                  return (LOCAL_X2APIC);
 282  286          else
 283  287                  return (LOCAL_APIC);
 284  288  }
 285  289  
 286  290  void
  
    | 
      ↓ open down ↓ | 
    208 lines elided | 
    
      ↑ open up ↑ | 
  
 287  291  apic_set_directed_EOI_handler()
 288  292  {
 289  293          apic_reg_ops->apic_send_eoi = apic_send_directed_EOI;
 290  294  }
 291  295  
 292  296  int
 293  297  apic_directed_EOI_supported()
 294  298  {
 295  299          uint32_t ver;
 296  300  
      301 +        /*
      302 +         * There are some known issues with some versions of Linux KVM and QEMU
      303 +         * where by directed EOIs do not properly function and instead get
      304 +         * coalesced at the hypervisor, causing the host not to see interrupts.
      305 +         * Thus, when the platform is KVM, we would like to disable it by
      306 +         * default, but keep it available otherwise.
      307 +         *
      308 +         * We use a three-state variable (apic_directed_eoi_state) to determine
      309 +         * how we handle directed EOI.
      310 +         *
      311 +         * 0 --> Don't do directed EOI at all.
      312 +         * 1 --> Do directed EOI if available, no matter the HW environment.
      313 +         * 2 --> Don't do directed EOI on KVM, but do it otherwise if available.
      314 +         *
      315 +         * If some grinning weirdo put something else in there, treat it as '2'
      316 +         * (i.e. the current default).
      317 +         *
      318 +         * Note, at this time illumos KVM does not identify as KVM. If it does,
      319 +         * we'll need to do some work to determine if it should be caught by
      320 +         * this or if it should show up as its own value of platform_type.
      321 +         */
      322 +        switch (apic_directed_eoi_state) {
      323 +        case 0:
      324 +                /* Don't do it at all. */
      325 +                return (0);
      326 +        case 1:
      327 +                break;
      328 +        case 2:
      329 +        default:
      330 +                /* Only do it if we aren't on KVM. */
      331 +                if (get_hwenv() == HW_KVM)
      332 +                        return (0);
      333 +                /* FALLTHRU */
      334 +        }
      335 +
 297  336          ver = apic_reg_ops->apic_read(APIC_VERS_REG);
 298  337          if (ver & APIC_DIRECTED_EOI_BIT)
 299  338                  return (1);
 300  339  
 301  340          return (0);
 302  341  }
 303  342  
 304  343  /*
 305  344   * Change apic_reg_ops depending upon the apic_mode.
 306  345   */
 307  346  void
 308  347  apic_change_ops()
 309  348  {
 310  349          if (apic_mode == LOCAL_APIC)
 311  350                  apic_reg_ops = &local_apic_regs_ops;
 312  351          else if (apic_mode == LOCAL_X2APIC)
 313  352                  apic_reg_ops = &x2apic_regs_ops;
 314  353  }
 315  354  
 316  355  /*
 317  356   * Generates an interprocessor interrupt to another CPU when X2APIC mode is
 318  357   * enabled.
 319  358   */
 320  359  void
 321  360  x2apic_send_ipi(int cpun, int ipl)
 322  361  {
 323  362          int vector;
 324  363          ulong_t flag;
 325  364  
 326  365          ASSERT(apic_mode == LOCAL_X2APIC);
 327  366  
 328  367          /*
 329  368           * With X2APIC, Intel relaxed the semantics of the
 330  369           * WRMSR instruction such that references to the X2APIC
 331  370           * MSR registers are no longer serializing instructions.
 332  371           * The code that initiates IPIs assumes that some sort
 333  372           * of memory serialization occurs. The old APIC code
 334  373           * did a write to uncachable memory mapped registers.
 335  374           * Any reference to uncached memory is a serializing
 336  375           * operation. To mimic those semantics here, we do an
 337  376           * atomic operation, which translates to a LOCK OR instruction,
 338  377           * which is serializing.
 339  378           */
 340  379          atomic_or_ulong(&flag, 1);
 341  380  
 342  381          vector = apic_resv_vector[ipl];
 343  382  
 344  383          flag = intr_clear();
 345  384  
 346  385          /*
 347  386           * According to X2APIC specification in section '2.3.5.1' of
 348  387           * Interrupt Command Register Semantics, the semantics of
 349  388           * programming Interrupt Command Register to dispatch an interrupt
 350  389           * is simplified. A single MSR write to the 64-bit ICR is required
 351  390           * for dispatching an interrupt. Specifically with the 64-bit MSR
 352  391           * interface to ICR, system software is not required to check the
 353  392           * status of the delivery status bit prior to writing to the ICR
 354  393           * to send an IPI. With the removal of the Delivery Status bit,
 355  394           * system software no longer has a reason to read the ICR. It remains
 356  395           * readable only to aid in debugging.
 357  396           */
 358  397  #ifdef  DEBUG
 359  398          APIC_AV_PENDING_SET();
 360  399  #endif  /* DEBUG */
 361  400  
 362  401          if ((cpun == psm_get_cpu_id())) {
 363  402                  X2APIC_WRITE(X2APIC_SELF_IPI, vector);
 364  403          } else {
 365  404                  apic_reg_ops->apic_write_int_cmd(
 366  405                      apic_cpus[cpun].aci_local_id, vector);
 367  406          }
 368  407  
 369  408          intr_restore(flag);
 370  409  }
 371  410  
 372  411  /*
 373  412   * Generates IPI to another CPU depending on the local APIC mode.
 374  413   * apic_send_ipi() and x2apic_send_ipi() depends on the configured
 375  414   * mode of the local APIC, but that may not match the actual mode
 376  415   * early in CPU startup.
 377  416   *
 378  417   * Any changes made to this routine must be accompanied by similar
 379  418   * changes to apic_send_ipi().
 380  419   */
 381  420  void
 382  421  apic_common_send_ipi(int cpun, int ipl)
 383  422  {
 384  423          int vector;
 385  424          ulong_t flag;
 386  425          int mode = apic_local_mode();
 387  426  
 388  427          if (mode == LOCAL_X2APIC) {
 389  428                  x2apic_send_ipi(cpun, ipl);
 390  429                  return;
 391  430          }
 392  431  
 393  432          ASSERT(mode == LOCAL_APIC);
 394  433  
 395  434          vector = apic_resv_vector[ipl];
 396  435          ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));
 397  436          flag = intr_clear();
 398  437          while (local_apic_regs_ops.apic_read(APIC_INT_CMD1) & AV_PENDING)
 399  438                  apic_ret();
 400  439          local_apic_regs_ops.apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
 401  440              vector);
 402  441          intr_restore(flag);
 403  442  }
  
    | 
      ↓ open down ↓ | 
    97 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX