x86, mce: pass mce info to EDAC for decoding
author Borislav Petkov <borislav.petkov@amd.com>
Fri, 24 Jul 2009 11:51:42 +0000 (13:51 +0200)
committer Borislav Petkov <borislav.petkov@amd.com>
Mon, 14 Sep 2009 16:59:17 +0000 (18:59 +0200)
Move NB decoder along with required defines to EDAC MCE core. Add
registration routines for further decoding of the MCE info in the AMD64
EDAC module.

CC: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
arch/x86/kernel/cpu/mcheck/mce.c
drivers/edac/amd64_edac.c
drivers/edac/amd64_edac.h
drivers/edac/amd64_edac_dbg.c
drivers/edac/edac_mce_amd.c
drivers/edac/edac_mce_amd.h

index 01213048f62f9a1a7e8a58926337c25c67b2711f..b82866f6adf554a0a03bbd637944e44937a77a70 100644 (file)
@@ -183,6 +183,11 @@ void mce_log(struct mce *mce)
        set_bit(0, &mce_need_notify);
 }
 
+/*
+ * Default decoder hook: does nothing. Declared __weak so that a regular
+ * decode_mce() definition elsewhere takes its place.
+ */
+void __weak decode_mce(struct mce *m)
+{
+}
+
 static void print_mce(struct mce *m)
 {
        printk(KERN_EMERG
@@ -205,6 +210,8 @@ static void print_mce(struct mce *m)
        printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
                        m->cpuvendor, m->cpuid, m->time, m->socketid,
                        m->apicid);
+
+       decode_mce(m);
 }
 
 static void print_mce_head(void)
index 82f48ee90f1138af916e4271321ae55a53a23b0f..2080b1e2e8a2f6a5b17e45e8b5dada3b367a1d20 100644 (file)
@@ -2282,8 +2282,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci,
        }
 }
 
-static void amd64_decode_bus_error(struct mem_ctl_info *mci,
-                                  struct err_regs *info, int ecc_type)
+static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
+                                           struct err_regs *info, int ecc_type)
 {
        u32 ec  = ERROR_CODE(info->nbsl);
        u32 xec = EXT_ERROR_CODE(info->nbsl);
@@ -2316,86 +2316,23 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
                edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
 }
 
-void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
-                        int handle_errors)
+void amd64_decode_bus_error(int node_id, struct err_regs *regs,
+                                  int ecc_type)
 {
-       struct amd64_pvt *pvt = mci->pvt_info;
-       int ecc;
-       u32 ec  = ERROR_CODE(regs->nbsl);
-       u32 xec = EXT_ERROR_CODE(regs->nbsl);
-
-       if (!handle_errors)
-               return;
-
-       pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id);
-
-       /*
-        * F10h, revD can disable ErrCpu[3:0] so check that first and also the
-        * value encoding has changed so interpret those differently
-        */
-       if ((boot_cpu_data.x86 == 0x10) &&
-           (boot_cpu_data.x86_model > 8)) {
-               if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
-                       pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
-       } else {
-               pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
-       }
-
-       pr_emerg(" Error: %sorrected",
-                ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
-       pr_cont(", Report Error: %s",
-                ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
-       pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
-               ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
-               ((regs->nbsh & K8_NBSH_PCC)   ? "yes" : "no"));
-
-       /* do the two bits[14:13] together */
-       ecc = regs->nbsh & (0x3 << 13);
-       if (ecc)
-               pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
-
-       pr_cont("\n");
-
-       if (TLB_ERROR(ec)) {
-               /*
-                * GART errors are intended to help graphics driver developers
-                * to detect bad GART PTEs. It is recommended by AMD to disable
-                * GART table walk error reporting by default[1] (currently
-                * being disabled in mce_cpu_quirks()) and according to the
-                * comment in mce_cpu_quirks(), such GART errors can be
-                * incorrectly triggered. We may see these errors anyway and
-                * unless requested by the user, they won't be reported.
-                *
-                * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
-                *     AMD NPT family 0Fh processors
-                */
-               if (!report_gart_errors)
-                       return;
-
-               pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
-                        TT_MSG(ec), LL_MSG(ec));
-       } else if (MEM_ERROR(ec)) {
-               pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
-                        " Cache Level: %s",
-                        RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
-       } else if (BUS_ERROR(ec)) {
-               pr_emerg(" Bus (Link/DRAM) error\n");
-               amd64_decode_bus_error(mci, regs, ecc);
-       } else {
-               /* shouldn't reach here! */
-               amd64_mc_printk(mci, KERN_WARNING,
-                               "%s(): unknown MCE error 0x%x\n", __func__, ec);
-       }
+       struct mem_ctl_info *mci = mci_lookup[node_id];
 
-       pr_emerg("%s.\n", EXT_ERR_MSG(xec));
+       __amd64_decode_bus_error(mci, regs, ecc_type);
 
        /*
         * Check the UE bit of the NB status high register, if set generate some
         * logs. If NOT a GART error, then process the event as a NO-INFO event.
         * If it was a GART error, skip that process.
+        *
+        * FIXME: this should go somewhere else, if at all.
         */
        if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
                edac_mc_handle_ue_no_info(mci, "UE bit is set");
+
 }
 
 /*
@@ -2406,8 +2343,10 @@ static void amd64_check(struct mem_ctl_info *mci)
 {
        struct err_regs regs;
 
-       if (amd64_get_error_info(mci, &regs))
-               amd64_decode_nb_mce(mci, &regs, 1);
+       if (amd64_get_error_info(mci, &regs)) {
+               struct amd64_pvt *pvt = mci->pvt_info;
+               amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
+       }
 }
 
 /*
@@ -3103,6 +3042,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt)
 
        mci_lookup[node_id] = mci;
        pvt_lookup[node_id] = NULL;
+
+       /* register stuff with EDAC MCE */
+       if (report_gart_errors)
+               amd_report_gart_errors(true);
+
+       amd_register_ecc_decoder(amd64_decode_bus_error);
+
        return 0;
 
 err_add_mc:
@@ -3169,6 +3115,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
 
        mci_lookup[pvt->mc_node_id] = NULL;
 
+       /* unregister from EDAC MCE */
+       amd_report_gart_errors(false);
+       amd_unregister_ecc_decoder(amd64_decode_bus_error);
+
        /* Free the EDAC CORE resources */
        edac_mc_free(mci);
 }
index ecab0c9fd14eaa00cf2fdab6eff76e0484dd193a..8ea07e2715dcad85ec1faa03e4921066ce6fb514 100644 (file)
@@ -346,24 +346,8 @@ enum {
 #define K8_NBSL_PP_OBS                 0x2
 #define K8_NBSL_PP_GENERIC             0x3
 
-
-#define K8_NBSH                                0x4C
-
-#define K8_NBSH_VALID_BIT              BIT(31)
-#define K8_NBSH_OVERFLOW               BIT(30)
-#define K8_NBSH_UC_ERR                 BIT(29)
-#define K8_NBSH_ERR_EN                 BIT(28)
-#define K8_NBSH_MISCV                  BIT(27)
-#define K8_NBSH_VALID_ERROR_ADDR       BIT(26)
-#define K8_NBSH_PCC                    BIT(25)
-#define K8_NBSH_ERR_CPU_VAL            BIT(24)
-#define K8_NBSH_CECC                   BIT(14)
-#define K8_NBSH_UECC                   BIT(13)
-#define K8_NBSH_ERR_SCRUBER            BIT(8)
-
 #define EXTRACT_ERR_CPU_MAP(x)         ((x) & 0xF)
 
-
 #define K8_NBEAL                       0x50
 #define K8_NBEAH                       0x54
 #define K8_SCRCTRL                     0x58
@@ -428,23 +412,6 @@ enum amd64_chipset_families {
        F11_CPUS,
 };
 
-/*
- * Structure to hold:
- *
- * 1) dynamically read status and error address HW registers
- * 2) sysfs entered values
- * 3) MCE values
- *
- * Depends on entry into the modules
- */
-struct err_regs {
-       u32 nbcfg;
-       u32 nbsh;
-       u32 nbsl;
-       u32 nbeah;
-       u32 nbeal;
-};
-
 /* Error injection control structure */
 struct error_injection {
        u32     section;
@@ -610,8 +577,5 @@ static inline struct low_ops *family_ops(int index)
 #define F10_MIN_SCRUB_RATE_BITS        0x5
 #define F11_MIN_SCRUB_RATE_BITS        0x6
 
-void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info,
-                       int handle_errors);
-
 int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
                             u64 *hole_offset, u64 *hole_size);
index bcb4e2eba3dc087033260a1dc6a2cd309616a8fb..59cf2cf6e11ec3fc9628b5f6b32aa525ae3a80cf 100644 (file)
@@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
 
                /* Process the Mapping request */
                /* TODO: Add race prevention */
-               amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1);
+               amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
 
                return count;
        }
index 918567e8cfd51ae76a2853325403cf4437c27493..444c2cc4472d4f609af5d881e87134a0807ecfb9 100644 (file)
@@ -1,6 +1,31 @@
 #include <linux/module.h>
 #include "edac_mce_amd.h"
 
+static bool report_gart_errors;
+static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type);
+
+void amd_report_gart_errors(bool v)
+{
+       report_gart_errors = v; /* checked by amd_decode_nb_mce() before logging GART TLB errors */
+}
+EXPORT_SYMBOL_GPL(amd_report_gart_errors);
+
+void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int))
+{
+       nb_bus_decoder = f; /* single slot: overwrites any previously set decoder */
+}
+EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
+
+/*
+ * Drop the registered bus error decoder, if there is one. A mismatch between
+ * @f and the decoder currently registered triggers a warning, but the hook is
+ * cleared regardless.
+ */
+void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int))
+{
+       if (!nb_bus_decoder)
+               return;
+
+       WARN_ON(nb_bus_decoder != f);
+       nb_bus_decoder = NULL;
+}
+EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
+
 /*
  * string representation for the different MCA reported error types, see F3x48
  * or MSR0000_0411.
@@ -102,3 +127,93 @@ const char *ext_msgs[] = {
        "Probe Filter error"                            /* 1_1111b */
 };
 EXPORT_SYMBOL_GPL(ext_msgs);
+
+/*
+ * Decode and log a northbridge machine check.
+ *
+ * @node_id:       node number printed in the report and handed to the
+ *                 registered bus error decoder
+ * @regs:          NB registers; only nbsl and nbsh are read here
+ * @handle_errors: when zero, nothing is decoded or printed
+ *
+ * GART TLB errors are dropped without further output unless
+ * amd_report_gart_errors(true) has been called.
+ */
+void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
+{
+       int ecc;
+       u32 ec  = ERROR_CODE(regs->nbsl);
+       u32 xec = EXT_ERROR_CODE(regs->nbsl);
+
+       if (!handle_errors)
+               return;
+
+       pr_emerg(" Northbridge Error, node %d", node_id);
+
+       /*
+        * F10h, revD can disable ErrCpu[3:0] so check that first and also the
+        * value encoding has changed so interpret those differently
+        */
+       if ((boot_cpu_data.x86 == 0x10) &&
+           (boot_cpu_data.x86_model > 8)) {
+               if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) {
+                       pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
+               } else {
+                       /* no core to report, but the line must still end */
+                       pr_cont("\n");
+               }
+       } else {
+               pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
+       }
+
+       pr_emerg(" Error: %sorrected",
+                ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
+       pr_cont(", Report Error: %s",
+                ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
+       pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
+               ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
+               ((regs->nbsh & K8_NBSH_PCC)   ? "yes" : "no"));
+
+       /*
+        * Do the two bits[14:13] together. They are shifted down so that ecc
+        * is 2 for K8_NBSH_CECC (bit 14) and 1 for K8_NBSH_UECC (bit 13);
+        * the masked-but-unshifted value could never compare equal to 2, so
+        * every ECC error was labelled "UECC".
+        */
+       ecc = (regs->nbsh >> 13) & 0x3;
+       if (ecc)
+               pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
+
+       pr_cont("\n");
+
+       if (TLB_ERROR(ec)) {
+               /*
+                * GART errors are intended to help graphics driver developers
+                * to detect bad GART PTEs. It is recommended by AMD to disable
+                * GART table walk error reporting by default[1] (currently
+                * being disabled in mce_cpu_quirks()) and according to the
+                * comment in mce_cpu_quirks(), such GART errors can be
+                * incorrectly triggered. We may see these errors anyway and
+                * unless requested by the user, they won't be reported.
+                *
+                * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
+                *     AMD NPT family 0Fh processors
+                */
+               if (!report_gart_errors)
+                       return;
+
+               pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
+                        TT_MSG(ec), LL_MSG(ec));
+       } else if (MEM_ERROR(ec)) {
+               /* terminate the line so the extended message starts afresh */
+               pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
+                        " Cache Level: %s\n",
+                        RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
+       } else if (BUS_ERROR(ec)) {
+               pr_emerg(" Bus (Link/DRAM) error\n");
+               /*
+                * NOTE(review): the decoder now receives ecc as 0..3 instead
+                * of the raw masked bits - confirm the registered decoder
+                * compares its ecc_type argument against 1/2.
+                */
+               if (nb_bus_decoder)
+                       nb_bus_decoder(node_id, regs, ecc);
+       } else {
+               /* shouldn't reach here! */
+               pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec);
+       }
+
+       pr_emerg("%s.\n", EXT_ERR_MSG(xec));
+}
+EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
+
+/*
+ * Hand a machine check record to the NB decoder.
+ *
+ * Only records from bank 4 are decoded; all others are ignored. The 64-bit
+ * status and addr values of @m are split into the 32-bit low/high halves of
+ * struct err_regs before amd_decode_nb_mce() is called.
+ */
+void decode_mce(struct mce *m)
+{
+       /*
+        * Zero-initialised because nbcfg is not filled in below, yet the
+        * whole struct is passed on to the registered bus error decoder.
+        */
+       struct err_regs regs = { 0 };
+       int node;
+
+       if (m->bank != 4)
+               return;
+
+       regs.nbsl  = (u32) m->status;
+       regs.nbsh  = (u32)(m->status >> 32);
+       regs.nbeal = (u32) m->addr;
+       regs.nbeah = (u32)(m->addr >> 32);
+       node       = topology_cpu_node_id(m->extcpu);
+
+       amd_decode_nb_mce(node, &regs, 1);
+}
index 39971cdabb5116377cb72cc78b2a88f4b8482a38..9114dc62782bc4dd0326972072dc7b6d24584247 100644 (file)
@@ -1,3 +1,8 @@
+#ifndef _EDAC_MCE_AMD_H
+#define _EDAC_MCE_AMD_H
+
+#include <asm/mce.h>
+
 #define ERROR_CODE(x)                  ((x) & 0xffff)
 #define EXT_ERROR_CODE(x)              (((x) >> 16) & 0x1f)
 #define EXT_ERR_MSG(x)                 ext_msgs[EXT_ERROR_CODE(x)]
 #define PP(x)                          (((x) >> 9) & 0x3)
 #define PP_MSG(x)                      pp_msgs[PP(x)]
 
+#define K8_NBSH                                0x4C
+
+#define K8_NBSH_VALID_BIT              BIT(31)
+#define K8_NBSH_OVERFLOW               BIT(30)
+#define K8_NBSH_UC_ERR                 BIT(29)
+#define K8_NBSH_ERR_EN                 BIT(28)
+#define K8_NBSH_MISCV                  BIT(27)
+#define K8_NBSH_VALID_ERROR_ADDR       BIT(26)
+#define K8_NBSH_PCC                    BIT(25)
+#define K8_NBSH_ERR_CPU_VAL            BIT(24)
+#define K8_NBSH_CECC                   BIT(14)
+#define K8_NBSH_UECC                   BIT(13)
+#define K8_NBSH_ERR_SCRUBER            BIT(8)
+
 extern const char *tt_msgs[];
 extern const char *ll_msgs[];
 extern const char *rrrr_msgs[];
@@ -29,3 +48,22 @@ extern const char *pp_msgs[];
 extern const char *to_msgs[];
 extern const char *ii_msgs[];
 extern const char *ext_msgs[];
+
+/*
+ * relevant NB regs
+ */
+struct err_regs {
+       u32 nbcfg;
+       u32 nbsh;       /* NB status, high 32 bits; tested against the K8_NBSH_* flags */
+       u32 nbsl;       /* NB status, low 32 bits; holds the error and extended error codes */
+       u32 nbeah;      /* NB error address, high 32 bits */
+       u32 nbeal;      /* NB error address, low 32 bits */
+};
+
+
+void amd_report_gart_errors(bool);
+void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int));
+void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int));
+void amd_decode_nb_mce(int, struct err_regs *, int);
+
+#endif /* _EDAC_MCE_AMD_H */