patch-2.4.27 linux-2.4.27/arch/x86_64/kernel/bluesmoke.c
Next file: linux-2.4.27/arch/x86_64/kernel/e820.c
Previous file: linux-2.4.27/arch/x86_64/kernel/acpi.c
Back to the patch index
Back to the overall index
- Lines: 649
- Date:
2004-08-07 16:26:04.630345791 -0700
- Orig file:
linux-2.4.26/arch/x86_64/kernel/bluesmoke.c
- Orig date:
2004-02-18 05:36:31.000000000 -0800
diff -urN linux-2.4.26/arch/x86_64/kernel/bluesmoke.c linux-2.4.27/arch/x86_64/kernel/bluesmoke.c
@@ -1,19 +1,25 @@
/*
* Machine check handler.
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Rest from unknown author(s).
+ * Additional K8 decoding and simplification Copyright 2003 Eric Morton, Newisys Inc
+ * Rest from unknown author(s).
*/
+#include <linux/config.h>
+#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/ctype.h>
+#include <linux/pci.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+#include <asm/bluesmoke.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/kdebug.h>
-#include <linux/pci.h>
-#include <linux/timer.h>
+#include <asm/smp.h>
static int mce_disabled __initdata;
static unsigned long mce_cpus;
@@ -25,18 +31,27 @@
static int banks;
static unsigned long ignored_banks, disabled_banks;
+struct notifier_block *mc_notifier_list = NULL;
+EXPORT_SYMBOL(mc_notifier_list);
+
static void generic_machine_check(struct pt_regs * regs, long error_code)
{
int recover=1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
-
+ struct notifier_mc_err mc_err;
+
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if(mcgstl&(1<<0)) /* Recoverable ? */
recover=0;
- printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl);
+ /* Make sure unrecoverable MCEs reach the console */
+ if(recover & 3)
+ oops_in_progress++;
+
+ printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
+ smp_processor_id(), mcgsth, mcgstl);
if (regs && (mcgstl & 2))
printk(KERN_EMERG "RIP <%02lx>:%016lx RSP %016lx\n",
@@ -50,6 +65,10 @@
rdmsr(MSR_IA32_MC0_STATUS+i*4,low, high);
if(high&(1<<31))
{
+ memset(&mc_err, 0x00, sizeof(mc_err));
+ mc_err.cpunum = safe_smp_processor_id();
+ mc_err.banknum = i;
+ mc_err.mci_status = ((u64)high << 32) | low;
if(high&(1<<29))
recover|=1;
if(high&(1<<25))
@@ -59,19 +78,26 @@
if(high&(1<<27))
{
rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
+ mc_err.mci_misc = ((u64)ahigh << 32) | alow;
printk("[%08x%08x]", alow, ahigh);
}
if(high&(1<<26))
{
rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
- printk(" at %08x%08x",
+ mc_err.mci_addr = ((u64)ahigh << 32) | alow;
+ printk(" at %08x%08x",
ahigh, alow);
}
+ rdmsr(MSR_IA32_MC0_CTL+i*4, alow, ahigh);
+ mc_err.mci_ctl = ((u64)ahigh << 32) | alow;
+
printk("\n");
+
/* Clear it */
wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
/* Serialize */
wmb();
+ notifier_call_chain(&mc_notifier_list, X86_VENDOR_INTEL, &mc_err);
}
}
@@ -105,56 +131,51 @@
* K8 machine check.
*/
-static struct pci_dev *find_k8_nb(void)
-{
- struct pci_dev *dev;
- int cpu = smp_processor_id();
- pci_for_each_dev(dev) {
- if (dev->bus->number==0 && PCI_FUNC(dev->devfn)==3 &&
- PCI_SLOT(dev->devfn) == (24+cpu))
- return dev;
- }
- return NULL;
-}
-
+static char *k8bank[] = {
+ "data cache",
+ "instruction cache",
+ "bus unit",
+ "load/store unit",
+ "northbridge"
+};
static char *transaction[] = {
"instruction", "data", "generic", "reserved"
};
static char *cachelevel[] = {
- "level 0", "level 1", "level 2", "level generic"
+ "0", "1", "2", "generic"
};
static char *memtrans[] = {
"generic error", "generic read", "generic write", "data read",
- "data write", "instruction fetch", "prefetch", "snoop",
+ "data write", "instruction fetch", "prefetch", "evict", "snoop",
"?", "?", "?", "?", "?", "?", "?"
};
static char *partproc[] = {
"local node origin", "local node response",
- "local node observed", "generic"
+ "local node observed", "generic participation"
};
static char *timeout[] = {
"request didn't time out",
"request timed out"
};
static char *memoryio[] = {
- "memory access", "res.", "i/o access", "generic"
-};
-static char *extendederr[] = {
- "ecc error",
- "crc error",
- "sync error",
- "mst abort",
- "tgt abort",
- "gart error",
- "rmw error",
- "wdog error",
- "chipkill ecc error",
+ "memory", "res.", "i/o", "generic"
+};
+static char *nbextendederr[] = {
+ "ECC error",
+ "CRC error",
+ "Sync error",
+ "Master abort",
+ "Target abort",
+ "GART error",
+ "RMW error",
+ "Watchdog error",
+ "Chipkill ECC error",
"<9>","<10>","<11>","<12>",
"<13>","<14>","<15>"
-};
+};
static char *highbits[32] = {
- [31] = "previous error lost",
- [30] = "error overflow",
+ [31] = "valid",
+ [30] = "error overflow (multiple errors)",
[29] = "error uncorrected",
[28] = "error enable",
[27] = "misc error valid",
@@ -169,128 +190,290 @@
[11] = "res11",
[10] = "res10",
[9] = "res9",
- [8] = "dram scrub error",
+ [8] = "error found by scrub",
[7] = "res7",
/* 6-4 ht link number of error */
[3] = "res3",
[2] = "res2",
- [1] = "err cpu0",
- [0] = "err cpu1",
+ [1] = "err cpu1",
+ [0] = "err cpu0",
};
-static void check_k8_nb(int header)
-{
- struct pci_dev *nb;
- nb = find_k8_nb();
- if (nb == NULL)
- return;
-
- u32 statuslow, statushigh;
- pci_read_config_dword(nb, 0x48, &statuslow);
- pci_read_config_dword(nb, 0x4c, &statushigh);
- if (!(statushigh & (1<<31)))
- return;
- if (header)
- printk(KERN_ERR "CPU %d: Silent Northbridge MCE\n", smp_processor_id());
- printk(KERN_ERR "Northbridge status %08x:%08x\n",
- statushigh,statuslow);
+static void decode_k8_generic_errcode(unsigned int cpunum, u64 status)
+{
+ unsigned short errcode = status & 0xffff;
+ int i;
- printk(KERN_ERR " Error %s\n", extendederr[(statuslow >> 16) & 0xf]);
+ for (i=0; i<32; i++) {
+ if (i==31 || i==28 || i==26)
+ continue;
+ if (highbits[i] && (status & (1UL<<(i+32)))) {
+ printk(KERN_ERR "CPU%d: bit%d = %s\n", cpunum, i+32, highbits[i]);
+ }
+ }
- unsigned short errcode = statuslow & 0xffff;
- switch ((statuslow >> 16) & 0xF) {
- case 5:
- printk(KERN_ERR " GART TLB error %s %s\n",
- transaction[(errcode >> 2) & 3],
+ if ((errcode & 0xFFF0) == 0x0010) {
+ printk(KERN_ERR "CPU%d: TLB error '%s transaction, level %s'\n",
+ cpunum,
+ transaction[(errcode >> 2) & 3],
cachelevel[errcode & 3]);
- break;
- case 8:
- printk(KERN_ERR " ECC error syndrome %x\n",
- (((statuslow >> 24) & 0xff) << 8) | ((statushigh >> 15) & 0x7f));
- /*FALL THROUGH*/
- default:
- printk(KERN_ERR " bus error %s, %s\n %s\n %s, %s\n",
+ }
+ else if ((errcode & 0xFF00) == 0x0100) {
+ printk(KERN_ERR "CPU%d: memory/cache error '%s mem transaction, %s transaction, level %s'\n",
+ cpunum,
+ memtrans[(errcode >> 4) & 0xf],
+ transaction[(errcode >> 2) & 3],
+ cachelevel[errcode & 3]);
+ }
+ else if ((errcode & 0xF800) == 0x0800) {
+ printk(KERN_ERR "CPU%d: bus error '%s, %s\n %s mem transaction\n %s access, level %s'\n",
+ cpunum,
partproc[(errcode >> 9) & 0x3],
timeout[(errcode >> 8) & 1],
- memtrans[(errcode >> 4) & 0xf],
- memoryio[(errcode >> 2) & 0x3],
- cachelevel[(errcode & 0x3)]);
- /* should only print when it was a HyperTransport related error. */
- printk(KERN_ERR " link number %x\n", (statushigh >> 4) & 3);
+ memtrans[(errcode >> 4) & 0xf],
+ memoryio[(errcode >> 2) & 0x3],
+ cachelevel[(errcode & 0x3)]);
+ }
+}
+
+static void decode_k8_dc_mc(unsigned int cpunum, u64 status)
+{
+ unsigned short exterrcode = (status >> 16) & 0x0f;
+ unsigned short errcode = status & 0xffff;
+
+ if(status&(3UL<<45)) {
+ printk(KERN_ERR "CPU%d: Data cache ECC error (syndrome %x)",
+ cpunum,
+ (u32) (status >> 47) & 0xff);
+ if(status&(1UL<<40)) {
+ printk(" found by scrubber");
+ }
+ printk("\n");
+ }
+
+ if ((errcode & 0xFFF0) == 0x0010) {
+ printk(KERN_ERR "CPU%d: TLB parity error in %s array\n",
+ cpunum,
+ (exterrcode == 0) ? "physical" : "virtual");
+ }
+
+ decode_k8_generic_errcode(cpunum, status);
+}
+
+static void decode_k8_ic_mc(unsigned int cpunum, u64 status)
+{
+ unsigned short exterrcode = (status >> 16) & 0x0f;
+ unsigned short errcode = status & 0xffff;
+
+ if(status&(3UL<<45)) {
+ printk(KERN_ERR "CPU%d: Instruction cache ECC error\n",
+ cpunum);
+ }
+
+ if ((errcode & 0xFFF0) == 0x0010) {
+ printk(KERN_ERR "CPU%d: TLB parity error in %s array\n",
+ cpunum,
+ (exterrcode == 0) ? "physical" : "virtual");
+ }
+
+ decode_k8_generic_errcode(cpunum, status);
+}
+
+static void decode_k8_bu_mc(unsigned int cpunum, u64 status)
+{
+ unsigned short exterrcode = (status >> 16) & 0x0f;
+
+ if(status&(3UL<<45)) {
+ printk(KERN_ERR "CPU%d: L2 cache ECC error\n",
+ cpunum);
+ }
+
+ printk(KERN_ERR "CPU%d: %s array error\n",
+ cpunum,
+ (exterrcode == 0) ? "Bus or cache" : "Cache tag");
+
+ decode_k8_generic_errcode(cpunum, status);
+}
+
+static void decode_k8_ls_mc(unsigned int cpunum, u64 status)
+{
+ decode_k8_generic_errcode(cpunum, status);
+}
+
+static void decode_k8_nb_mc(unsigned int cpunum, u64 status)
+{
+ unsigned short exterrcode = (status >> 16) & 0x0f;
+
+ printk(KERN_ERR "CPU%d: Northbridge %s\n", cpunum, nbextendederr[exterrcode]);
+
+ switch (exterrcode) {
+ case 0:
+ printk(KERN_ERR "CPU%d: ECC syndrome = %x\n",
+ cpunum,
+ (u32) (status >> 47) & 0xff);
break;
- }
-
+ case 8:
+ printk(KERN_ERR "CPU%d: Chipkill ECC syndrome = %x\n",
+ cpunum,
+ (u32) ((((status >> 24) & 0xff) << 8) | ((status >> 47) & 0xff)));
+ break;
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 6:
+ printk(KERN_ERR "CPU%d: link number = %x\n",
+ cpunum,
+ (u32) (status >> 36) & 0x7);
+ break;
+ }
- int i;
- for (i = 0; i < 32; i++) {
- if (i == 26 || i == 28)
- continue;
- if (highbits[i] && (statushigh & (1<<i)))
- printk(KERN_ERR " %s\n", highbits[i]);
+ decode_k8_generic_errcode(cpunum, status);
+}
+
+static void decode_k8_mc(unsigned int banknum, unsigned int cpunum, u64 status)
+{
+ switch(banknum) {
+ case 0:
+ decode_k8_dc_mc(cpunum, status);
+ break;
+ case 1:
+ decode_k8_ic_mc(cpunum, status);
+ break;
+ case 2:
+ decode_k8_bu_mc(cpunum, status);
+ break;
+ case 3:
+ decode_k8_ls_mc(cpunum, status);
+ break;
+ case 4:
+ decode_k8_nb_mc(cpunum, status);
+ break;
}
+}
- if (statushigh & (1<<26)) {
- u32 addrhigh, addrlow;
- pci_read_config_dword(nb, 0x54, &addrhigh);
- pci_read_config_dword(nb, 0x50, &addrlow);
- printk(KERN_ERR " NB error address %08x%08x\n", addrhigh,addrlow);
+static void k8_poll_machine_check(void)
+{
+ int cpunum = safe_smp_processor_id();
+ int banknum;
+ u64 address, status, ctl;
+ struct notifier_mc_err mc_err;
+
+ for(banknum=0; banknum<banks; banknum++) {
+ if ((1UL<<banknum) & ignored_banks)
+ continue;
+
+ rdmsrl(MSR_IA32_MC0_STATUS+banknum*4, status);
+ if(status&(1UL<<63)) {
+ mc_err.cpunum = cpunum;
+ mc_err.banknum = banknum;
+ mc_err.mci_status = status;
+ rdmsrl(MSR_IA32_MC0_ADDR+banknum*4, address);
+ mc_err.mci_addr = address;
+
+ /* Can't write anything but zeros to status, or K8 will GPF */
+ wrmsrl(MSR_IA32_MC0_STATUS+banknum*4, 0UL);
+ printk(KERN_ERR "CPU%d %s polled machine check error status: %016Lx",
+ cpunum,
+ k8bank[banknum],
+ status);
+ if(status&(1UL<<58)) {
+ printk(" at address %016Lx", address);
+ }
+ printk("\n");
+ rdmsrl(MSR_IA32_MC0_CTL+banknum*4, ctl);
+ mc_err.mci_ctl = ctl;
+ notifier_call_chain(&mc_notifier_list, X86_VENDOR_AMD, &mc_err);
+ decode_k8_mc(banknum, cpunum, status);
+ }
}
- statushigh &= ~(1<<31);
- pci_write_config_dword(nb, 0x4c, statushigh);
}
static void k8_machine_check(struct pt_regs * regs, long error_code)
-{
- u64 status, nbstatus;
-
- rdmsrl(MSR_IA32_MCG_STATUS, status);
- if ((status & (1<<2)) == 0) {
- if (!regs)
- check_k8_nb(1);
- return;
+{
+ int norecover=1;
+ u64 addr, status, ctl, mcgst;
+ int banknum;
+ unsigned int cpunum = safe_smp_processor_id();
+ struct notifier_mc_err mc_err;
+
+ rdmsrl(MSR_IA32_MCG_STATUS, mcgst);
+ if(mcgst&(1UL<<0)) { /* Recoverable ? */
+ norecover=0;
}
- printk(KERN_EMERG "CPU %d: Machine Check Exception: %016Lx\n", smp_processor_id(), status);
- if (status & 1)
- printk(KERN_EMERG "MCG_STATUS: unrecoverable\n");
+ /* Make sure unrecoverable MCEs reach the console */
+ if(norecover)
+ oops_in_progress++;
+
+ printk(KERN_EMERG "CPU %d: Machine Check Exception: %016Lx\n", cpunum, mcgst);
+
+ if (regs && (mcgst & (1UL<<1))) {
+ printk(KERN_EMERG "CPU%d: RIP <%02lx>:%016lx RSP %016lx %s\n",
+ cpunum, regs->cs, regs->rip, regs->rsp,
+ (mcgst & 1) ? "" : "!INEXACT!");
+ }
- rdmsrl(MSR_IA32_MC0_STATUS+4*4, nbstatus);
- if ((nbstatus & (1UL<<63)) == 0)
- goto others;
-
- printk(KERN_EMERG "Northbridge Machine Check %s %016lx %lx\n",
- regs ? "exception" : "timer",
- (unsigned long)nbstatus, error_code);
- if (nbstatus & (1UL<<62))
- printk(KERN_EMERG "Lost at least one NB error condition\n");
- if (nbstatus & (1UL<<61))
- printk(KERN_EMERG "Uncorrectable condition\n");
- if (nbstatus & (1UL<57))
- printk(KERN_EMERG "Unrecoverable condition\n");
-
- check_k8_nb(0);
+ for(banknum=0; banknum<banks; banknum++) {
+ if ((1UL<<banknum) & ignored_banks)
+ continue;
+ rdmsrl(MSR_IA32_MC0_STATUS+banknum*4, status);
+ if(status&(1UL<<63)) {
+ memset(&mc_err, 0x00, sizeof(mc_err));
+ mc_err.cpunum = cpunum;
+ mc_err.banknum = banknum;
+ mc_err.mci_status = status;
+ if(status&(1UL<<61)) {
+ norecover|=1; /* uncorrectable */
+ }
+ if(status &(1UL<<57)) {
+ norecover|=2; /* processor context corrupt */
+ }
+ printk(KERN_EMERG "CPU%d %s error status: %016Lx",
+ cpunum,
+ k8bank[banknum],
+ status);
+ if(status&(1UL<<58)) {
+ rdmsrl(MSR_IA32_MC0_ADDR+banknum*4, addr);
+ mc_err.mci_addr = addr;
+ printk(" at address %016Lx", addr);
+ } else if ((banknum==4) && (((status>>16)&0x0f)==7)) {
+ /* NB watchdog, address reg has details but validity bit is not set */
+ rdmsrl(MSR_IA32_MC0_ADDR+banknum*4, addr);
+ mc_err.mci_addr = addr;
+ printk(" error details %016Lx", addr);
+ }
+ printk("\n");
+ rdmsrl(MSR_IA32_MC0_CTL+banknum*4, ctl);
+ mc_err.mci_ctl = ctl;
+ /* Clear it */
+ /* Can't write anything but zeros to status, or K8 will GPF */
+ wrmsrl(MSR_IA32_MC0_STATUS+banknum*4, 0UL);
+ /* Serialize */
+ wmb();
+ notifier_call_chain(&mc_notifier_list, X86_VENDOR_AMD, &mc_err);
+ }
+ }
- if (nbstatus & (1UL<<58)) {
- u64 adr;
- rdmsrl(MSR_IA32_MC0_ADDR+4*4, adr);
- printk(KERN_EMERG "Address: %016lx\n", (unsigned long)adr);
+ if(norecover&2) {
+ panic("CPU context corrupt");
}
-
- wrmsrl(MSR_IA32_MC0_STATUS+4*4, 0);
- wrmsrl(MSR_IA32_MCG_STATUS, 0);
-
- others:
- generic_machine_check(regs, error_code);
-}
+ if(norecover&1) {
+ panic("Unable to continue");
+ }
+ printk(KERN_EMERG "Attempting to continue.\n");
+ mcgst&=~(1UL<<2);
+ wrmsrl(MSR_IA32_MCG_STATUS,mcgst);
+}
static struct timer_list mcheck_timer;
-int mcheck_interval = 30*HZ;
+int mcheck_interval = 60*HZ;
#ifndef CONFIG_SMP
static void mcheck_timer_handler(unsigned long data)
{
- k8_machine_check(NULL,0);
+ k8_poll_machine_check();
mcheck_timer.expires = jiffies + mcheck_interval;
add_timer(&mcheck_timer);
}
@@ -301,13 +484,13 @@
static void mcheck_timer_other(void *data)
{
- k8_machine_check(NULL, 0);
+ k8_poll_machine_check();
}
static void mcheck_timer_dist(void *data)
{
smp_call_function(mcheck_timer_other,0,0,0);
- k8_machine_check(NULL, 0);
+ k8_poll_machine_check();
mcheck_timer.expires = jiffies + mcheck_interval;
add_timer(&mcheck_timer);
}
@@ -334,17 +517,20 @@
rdmsrl(MSR_IA32_MCG_CAP, cap);
banks = cap&0xff;
- machine_check_vector = k8_machine_check;
for (i = 0; i < banks; i++) {
u64 val = ((1UL<<i) & disabled_banks) ? 0 : ~0UL;
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i,0UL);
+ wrmsrl(MSR_IA32_MC0_ADDR+4*i,0UL);
wrmsrl(MSR_IA32_MC0_CTL+4*i, val);
- wrmsrl(MSR_IA32_MC0_STATUS+4*i,0);
}
- if (cap & (1<<8))
- wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
+ /* set up the vector first, before enabling MCG_CTL */
+ machine_check_vector = k8_machine_check;
+ set_in_cr4(X86_CR4_MCE);
- set_in_cr4(X86_CR4_MCE);
+ if (cap & (1<<8)) {
+ wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
+ }
if (mcheck_interval && (smp_processor_id() == 0)) {
init_timer(&mcheck_timer);
@@ -399,7 +585,8 @@
wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", smp_processor_id());
+ printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+ smp_processor_id());
done=1;
}
@@ -419,8 +606,10 @@
case X86_VENDOR_AMD:
if (c->x86 == 15 && !nok8) {
k8_mcheck_init(c);
- break;
+ } else {
+ generic_mcheck_init(c);
}
+ break;
/* FALL THROUGH */
default:
case X86_VENDOR_INTEL:
@@ -447,13 +636,13 @@
char *p;
while ((p = strsep(&str,",")) != NULL) {
if (isdigit(*p))
- mcheck_interval = simple_strtol(p,NULL,0) * HZ;
+ mcheck_interval = simple_strtol(p,NULL,0) * HZ;
else if (!strcmp(p,"off"))
mce_disabled = 1;
else if (!strncmp(p,"enable",6))
- disabled_banks &= ~(1<<simple_strtol(p+6,NULL,0));
+ disabled_banks &= ~(1UL << simple_strtol(p+6,NULL,0));
else if (!strncmp(p,"disable",7))
- disabled_banks |= ~(1<<simple_strtol(p+7,NULL,0));
+ disabled_banks |= 1UL << simple_strtol(p+7,NULL,0);
else if (!strcmp(p,"nok8"))
nok8 = 1;
}
@@ -461,4 +650,4 @@
}
__setup("nomce", mcheck_disable);
-__setup("mce", mcheck_enable);
+__setup("mce=", mcheck_enable);
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)