From: Paul Mackerras This patch adds explicit checking for EEH slot isolation events into the PCI config space read path. The change itself would have been minor, except that pci config reads don't have a pointer to a struct pci_dev. Thus, I had to restructure the eeh code to accomodate this, which seems to be a good thing anyway, making it a tad cleaner. Signed-off-by: Linas Vepstas Signed-off-by: Paul Mackerras Signed-off-by: Andrew Morton --- 25-akpm/arch/ppc64/kernel/eeh.c | 89 +++++++++++++++++++------------- 25-akpm/arch/ppc64/kernel/pSeries_pci.c | 24 +++++++- 25-akpm/include/asm-ppc64/eeh.h | 11 +++ 3 files changed, 85 insertions(+), 39 deletions(-) diff -puN arch/ppc64/kernel/eeh.c~ppc64-test-for-eeh-error-in-pci-config-read-path arch/ppc64/kernel/eeh.c --- 25/arch/ppc64/kernel/eeh.c~ppc64-test-for-eeh-error-in-pci-config-read-path 2004-09-01 22:57:05.660541592 -0700 +++ 25-akpm/arch/ppc64/kernel/eeh.c 2004-09-01 22:57:05.668540376 -0700 @@ -348,7 +348,7 @@ void __init pci_addr_cache_build(void) * ths routine does *not* convert I/O BAR addresses (which start * with 0xE...) to phys addresses! */ -static unsigned long eeh_token_to_phys(unsigned long token) +static inline unsigned long eeh_token_to_phys(unsigned long token) { pte_t *ptep; unsigned long pa, vaddr; @@ -365,24 +365,22 @@ static unsigned long eeh_token_to_phys(u } /** - * eeh_check_failure - check if all 1's data is due to EEH slot freeze - * @token i/o token, should be address in the form 0xA.... - * @val value, should be all 1's (XXX why do we need this arg??) + * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze + * @dn device node + * @dev pci device, if known + * + * Check for an EEH failure for the given device node. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze event. This routine + * will query firmware for the EEH status. * - * Check for an eeh failure at the given token address. - * The given value has been read and it should be 1's (0xff, 0xffff or - * 0xffffffff). + * Returns 0 if there has not been an EEH error; otherwise returns + * an error code. * - * Probe to determine if an error actually occurred. If not return val. - * Otherwise panic. - * - * Note this routine might be called in an interrupt context ... + * It is safe to call this routine in an interrupt context. */ -unsigned long eeh_check_failure(void *token, unsigned long val) +int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) { - unsigned long addr; - struct pci_dev *dev; - struct device_node *dn; int ret; int rets[2]; unsigned long flags; @@ -390,30 +388,19 @@ unsigned long eeh_check_failure(void *to __get_cpu_var(total_mmio_ffs)++; if (!eeh_subsystem_enabled) - return val; - - /* Finding the phys addr + pci device; this is pretty quick. */ - addr = eeh_token_to_phys((unsigned long)token); - dev = pci_get_device_by_addr(addr); - if (!dev) - return val; + return 0; - dn = pci_device_to_OF_node(dev); - if (!dn) { - pci_dev_put(dev); - return val; - } + if (!dn) + return 0; /* Access to IO BARs might get this far and still not want checking. */ if (!(dn->eeh_mode & EEH_MODE_SUPPORTED) || dn->eeh_mode & EEH_MODE_NOCHECK) { - pci_dev_put(dev); - return val; + return 0; } if (!dn->eeh_config_addr) { - pci_dev_put(dev); - return val; + return 0; } /* @@ -439,7 +426,7 @@ unsigned long eeh_check_failure(void *to BUID_LO(dn->phb->buid), NULL, 0, virt_to_phys(slot_errbuf), eeh_error_buf_size, - 2 /* Permanent Error */); + 1 /* Temporary Error */); if (log_event == 0) log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, @@ -456,19 +443,53 @@ unsigned long eeh_check_failure(void *to */ if (panic_on_oops) { panic("EEH: MMIO failure (%d) on device:%s %s\n", - rets[0], pci_name(dev), pci_pretty_name(dev)); + rets[0], dn->name, dn->full_name); } else { __get_cpu_var(ignored_failures)++; printk(KERN_INFO "EEH: MMIO failure (%d) on device:%s %s\n", - rets[0], pci_name(dev), pci_pretty_name(dev)); + rets[0], dn->name, dn->full_name); } } else { __get_cpu_var(false_positives)++; } + return 0; +} + +EXPORT_SYMBOL(eeh_dn_check_failure); + +/** + * eeh_check_failure - check if all 1's data is due to EEH slot freeze + * @token i/o token, should be address in the form 0xA.... + * @val value, should be all 1's (XXX why do we need this arg??) + * + * Check for an eeh failure at the given token address. + * Check for an EEH failure at the given token address. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze event. This routine + * will query firmware for the EEH status. + * + * Note this routine is safe to call in an interrupt context. + */ +unsigned long eeh_check_failure(void *token, unsigned long val) +{ + unsigned long addr; + struct pci_dev *dev; + struct device_node *dn; + + /* Finding the phys addr + pci device; this is pretty quick. */ + addr = eeh_token_to_phys((unsigned long)token); + dev = pci_get_device_by_addr(addr); + if (!dev) + return val; + + dn = pci_device_to_OF_node(dev); + eeh_dn_check_failure (dn, dev); + pci_dev_put(dev); return val; } + EXPORT_SYMBOL(eeh_check_failure); struct eeh_early_enable_info { diff -puN arch/ppc64/kernel/pSeries_pci.c~ppc64-test-for-eeh-error-in-pci-config-read-path arch/ppc64/kernel/pSeries_pci.c --- 25/arch/ppc64/kernel/pSeries_pci.c~ppc64-test-for-eeh-error-in-pci-config-read-path 2004-09-01 22:57:05.661541440 -0700 +++ 25-akpm/arch/ppc64/kernel/pSeries_pci.c 2004-09-01 22:57:05.668540376 -0700 @@ -68,7 +68,9 @@ static int rtas_read_config(struct devic int ret; if (!dn) - return -2; + return PCIBIOS_DEVICE_NOT_FOUND; + if (where & (size - 1)) + return PCIBIOS_BAD_REGISTER_NUMBER; addr = (dn->busno << 16) | (dn->devfn << 8) | where; buid = dn->phb->buid; @@ -79,7 +81,15 @@ static int rtas_read_config(struct devic ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size); } *val = returnval; - return ret; + + if (ret) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (returnval == EEH_IO_ERROR_VALUE(size) + && eeh_dn_check_failure (dn, NULL)) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; } static int rtas_pci_read_config(struct pci_bus *bus, @@ -106,7 +116,9 @@ static int rtas_write_config(struct devi int ret; if (!dn) - return -2; + return PCIBIOS_DEVICE_NOT_FOUND; + if (where & (size - 1)) + return PCIBIOS_BAD_REGISTER_NUMBER; addr = (dn->busno << 16) | (dn->devfn << 8) | where; buid = dn->phb->buid; @@ -115,7 +127,11 @@ static int rtas_write_config(struct devi } else { ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val); } - return ret; + + if (ret) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; } static int rtas_pci_write_config(struct pci_bus *bus, diff -puN include/asm-ppc64/eeh.h~ppc64-test-for-eeh-error-in-pci-config-read-path include/asm-ppc64/eeh.h --- 25/include/asm-ppc64/eeh.h~ppc64-test-for-eeh-error-in-pci-config-read-path 2004-09-01 22:57:05.663541136 -0700 +++ 25-akpm/include/asm-ppc64/eeh.h 2004-09-01 22:57:05.669540224 -0700 @@ -44,6 +44,7 @@ struct device_node; extern void __init eeh_init(void); unsigned long eeh_check_failure(void *token, unsigned long val); +int eeh_dn_check_failure (struct device_node *dn, struct pci_dev *dev); void *eeh_ioremap(unsigned long addr, void *vaddr); void __init pci_addr_cache_build(void); @@ -89,7 +90,15 @@ int eeh_set_option(struct pci_dev *dev, */ #define EEH_POSSIBLE_IO_ERROR(val, type) ((val) == (type)~0) -/* The vaddr will equal the addr if EEH checking is disabled for +/* + * Reads from a device which has been isolated by EEH will return + * all 1s. This macro gives an all-1s value of the given size (in + * bytes: 1, 2, or 4) for comparing with the result of a read. + */ +#define EEH_IO_ERROR_VALUE(size) (~0U >> ((4 - (size)) * 8)) + +/* + * The vaddr will equal the addr if EEH checking is disabled for * this device. This is because eeh_ioremap() will not have * remapped to 0xA0, and thus both vaddr and addr will be 0xE0... */ _