GIT ee47602e64ca1c1e08708c85d5b71123c490620f master.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git#test --- Signed-off-by: Andrew Morton --- ia64/Kconfig | 0 arch/ia64/configs/sn2_defconfig | 1 arch/ia64/configs/tiger_defconfig | 1 arch/ia64/configs/zx1_defconfig | 1 arch/ia64/defconfig | 1 arch/ia64/hp/sim/boot/boot_head.S | 31 arch/ia64/kernel/acpi.c | 2 arch/ia64/kernel/asm-offsets.c | 40 - arch/ia64/kernel/entry.S | 6 arch/ia64/kernel/iosapic.c | 2 arch/ia64/kernel/mca.c | 808 ++++++++++++--------- arch/ia64/kernel/mca_asm.S | 1281 +++++++++++++++++----------------- arch/ia64/kernel/mca_drv.c | 37 arch/ia64/kernel/minstate.h | 88 -- arch/ia64/kernel/palinfo.c | 123 +-- arch/ia64/kernel/salinfo.c | 62 - arch/ia64/kernel/unwind.c | 22 arch/ia64/kernel/vmlinux.lds.S | 1 arch/ia64/lib/memcpy_mck.S | 3 arch/ia64/mm/fault.c | 6 arch/ia64/mm/init.c | 15 arch/ia64/pci/pci.c | 2 arch/ia64/sn/kernel/setup.c | 30 arch/ia64/sn/kernel/tiocx.c | 58 - arch/ia64/sn/kernel/xpc.h | 288 +++++++ arch/ia64/sn/kernel/xpc_channel.c | 216 +++-- arch/ia64/sn/kernel/xpc_main.c | 244 ++++-- arch/ia64/sn/kernel/xpc_partition.c | 312 ++++++-- include/asm-ia64/iosapic.h | 4 include/asm-ia64/irq.h | 4 include/asm-ia64/mca.h | 102 +- include/asm-ia64/mca_asm.h | 131 --- include/asm-ia64/ptrace.h | 2 include/asm-ia64/sn/l1.h | 12 include/asm-ia64/sn/sn_feature_sets.h | 57 + include/asm-ia64/sn/sn_sal.h | 79 +- include/asm-ia64/sn/tiocx.h | 3 include/asm-ia64/sn/xp.h | 10 include/asm-ia64/thread_info.h | 2 include/asm-ia64/unwind.h | 7 include/linux/sched.h | 2 kernel/sched.c | 28 42 files changed, 2487 insertions(+), 1637 deletions(-) diff -puN arch/ia64/configs/sn2_defconfig~git-ia64 arch/ia64/configs/sn2_defconfig --- devel/arch/ia64/configs/sn2_defconfig~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/configs/sn2_defconfig 2005-09-07 19:42:41.000000000 -0700 @@ -111,7 +111,6 @@ CONFIG_COMPAT=y CONFIG_IA64_MCA_RECOVERY=y CONFIG_PERFMON=y CONFIG_IA64_PALINFO=y -CONFIG_ACPI_DEALLOCATE_IRQ=y # # Firmware Drivers diff -puN arch/ia64/configs/tiger_defconfig~git-ia64 arch/ia64/configs/tiger_defconfig --- devel/arch/ia64/configs/tiger_defconfig~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/configs/tiger_defconfig 2005-09-07 19:42:41.000000000 -0700 @@ -109,7 +109,6 @@ CONFIG_COMPAT=y CONFIG_IA64_MCA_RECOVERY=y CONFIG_PERFMON=y CONFIG_IA64_PALINFO=y -CONFIG_ACPI_DEALLOCATE_IRQ=y # # Firmware Drivers diff -puN arch/ia64/configs/zx1_defconfig~git-ia64 arch/ia64/configs/zx1_defconfig --- devel/arch/ia64/configs/zx1_defconfig~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/configs/zx1_defconfig 2005-09-07 19:42:41.000000000 -0700 @@ -109,7 +109,6 @@ CONFIG_COMPAT=y CONFIG_IA64_MCA_RECOVERY=y CONFIG_PERFMON=y CONFIG_IA64_PALINFO=y -CONFIG_ACPI_DEALLOCATE_IRQ=y # # Firmware Drivers diff -puN arch/ia64/defconfig~git-ia64 arch/ia64/defconfig --- devel/arch/ia64/defconfig~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/defconfig 2005-09-07 19:42:41.000000000 -0700 @@ -99,7 +99,6 @@ CONFIG_COMPAT=y CONFIG_IA64_MCA_RECOVERY=y CONFIG_PERFMON=y CONFIG_IA64_PALINFO=y -CONFIG_ACPI_DEALLOCATE_IRQ=y # # Firmware Drivers diff -puN arch/ia64/hp/sim/boot/boot_head.S~git-ia64 arch/ia64/hp/sim/boot/boot_head.S --- devel/arch/ia64/hp/sim/boot/boot_head.S~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/hp/sim/boot/boot_head.S 2005-09-07 19:42:41.000000000 -0700 @@ -4,6 +4,7 @@ */ #include +#include .bss .align 16 @@ -49,7 +50,11 @@ GLOBAL_ENTRY(jmp_to_kernel) br.sptk.few b7 END(jmp_to_kernel) - +/* + * r28 contains the index of the PAL function + * r29--31 the args + * Return values in ret0--3 (r8--11) + */ GLOBAL_ENTRY(pal_emulator_static) mov r8=-1 mov r9=256 @@ -62,7 +67,7 @@ GLOBAL_ENTRY(pal_emulator_static) cmp.gtu p6,p7=r9,r28 (p6) br.cond.sptk.few stacked ;; -static: cmp.eq p6,p7=6,r28 /* PAL_PTCE_INFO */ +static: cmp.eq p6,p7=PAL_PTCE_INFO,r28 (p7) br.cond.sptk.few 1f ;; mov r8=0 /* status = 0 */ @@ -70,21 +75,21 @@ static: cmp.eq p6,p7=6,r28 /* PAL_PTCE_ movl r10=0x0000000200000003 /* count[0], count[1] */ movl r11=0x1000000000002000 /* stride[0], stride[1] */ br.cond.sptk.few rp -1: cmp.eq p6,p7=14,r28 /* PAL_FREQ_RATIOS */ +1: cmp.eq p6,p7=PAL_FREQ_RATIOS,r28 (p7) br.cond.sptk.few 1f mov r8=0 /* status = 0 */ movl r9 =0x100000064 /* proc_ratio (1/100) */ movl r10=0x100000100 /* bus_ratio<<32 (1/256) */ movl r11=0x100000064 /* itc_ratio<<32 (1/100) */ ;; -1: cmp.eq p6,p7=19,r28 /* PAL_RSE_INFO */ +1: cmp.eq p6,p7=PAL_RSE_INFO,r28 (p7) br.cond.sptk.few 1f mov r8=0 /* status = 0 */ mov r9=96 /* num phys stacked */ mov r10=0 /* hints */ mov r11=0 br.cond.sptk.few rp -1: cmp.eq p6,p7=1,r28 /* PAL_CACHE_FLUSH */ +1: cmp.eq p6,p7=PAL_CACHE_FLUSH,r28 /* PAL_CACHE_FLUSH */ (p7) br.cond.sptk.few 1f mov r9=ar.lc movl r8=524288 /* flush 512k million cache lines (16MB) */ @@ -102,7 +107,7 @@ static: cmp.eq p6,p7=6,r28 /* PAL_PTCE_ mov ar.lc=r9 mov r8=r0 ;; -1: cmp.eq p6,p7=15,r28 /* PAL_PERF_MON_INFO */ +1: cmp.eq p6,p7=PAL_PERF_MON_INFO,r28 (p7) br.cond.sptk.few 1f mov r8=0 /* status = 0 */ movl r9 =0x08122f04 /* generic=4 width=47 retired=8 cycles=18 */ @@ -138,6 +143,20 @@ static: cmp.eq p6,p7=6,r28 /* PAL_PTCE_ st8 [r29]=r0,16 /* clear remaining bits */ st8 [r18]=r0,16 /* clear remaining bits */ ;; +1: cmp.eq p6,p7=PAL_VM_SUMMARY,r28 +(p7) br.cond.sptk.few 1f + mov r8=0 /* status = 0 */ + movl r9=0x2044040020F1865 /* num_tc_levels=2, num_unique_tcs=4 */ + /* max_itr_entry=64, max_dtr_entry=64 */ + /* hash_tag_id=2, max_pkr=15 */ + /* key_size=24, phys_add_size=50, vw=1 */ + movl r10=0x183C /* rid_size=24, impl_va_msb=60 */ + ;; +1: cmp.eq p6,p7=PAL_MEM_ATTRIB,r28 +(p7) br.cond.sptk.few 1f + mov r8=0 /* status = 0 */ + mov r9=0x80|0x01 /* NatPage|WB */ + ;; 1: br.cond.sptk.few rp stacked: br.ret.sptk.few rp diff -puN arch/ia64/Kconfig~git-ia64 arch/ia64/Kconfig diff -puN arch/ia64/kernel/acpi.c~git-ia64 arch/ia64/kernel/acpi.c --- devel/arch/ia64/kernel/acpi.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/acpi.c 2005-09-07 19:43:10.000000000 -0700 @@ -583,14 +583,12 @@ int acpi_register_gsi(u32 gsi, int edge_ EXPORT_SYMBOL(acpi_register_gsi); -#ifdef CONFIG_ACPI_DEALLOCATE_IRQ void acpi_unregister_gsi(u32 gsi) { iosapic_unregister_intr(gsi); } EXPORT_SYMBOL(acpi_unregister_gsi); -#endif /* CONFIG_ACPI_DEALLOCATE_IRQ */ static int __init acpi_parse_fadt(unsigned long phys_addr, unsigned long size) { diff -puN arch/ia64/kernel/asm-offsets.c~git-ia64 arch/ia64/kernel/asm-offsets.c --- devel/arch/ia64/kernel/asm-offsets.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/asm-offsets.c 2005-09-07 19:42:41.000000000 -0700 @@ -211,17 +211,41 @@ void foo(void) #endif BLANK(); - DEFINE(IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET, - offsetof (struct ia64_mca_cpu, proc_state_dump)); - DEFINE(IA64_MCA_CPU_STACK_OFFSET, - offsetof (struct ia64_mca_cpu, stack)); - DEFINE(IA64_MCA_CPU_STACKFRAME_OFFSET, - offsetof (struct ia64_mca_cpu, stackframe)); - DEFINE(IA64_MCA_CPU_RBSTORE_OFFSET, - offsetof (struct ia64_mca_cpu, rbstore)); + DEFINE(IA64_MCA_CPU_MCA_STACK_OFFSET, + offsetof (struct ia64_mca_cpu, mca_stack)); DEFINE(IA64_MCA_CPU_INIT_STACK_OFFSET, offsetof (struct ia64_mca_cpu, init_stack)); BLANK(); + DEFINE(IA64_SAL_OS_STATE_COMMON_OFFSET, + offsetof (struct ia64_sal_os_state, sal_ra)); + DEFINE(IA64_SAL_OS_STATE_OS_GP_OFFSET, + offsetof (struct ia64_sal_os_state, os_gp)); + DEFINE(IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET, + offsetof (struct ia64_sal_os_state, pal_min_state)); + DEFINE(IA64_SAL_OS_STATE_PROC_STATE_PARAM_OFFSET, + offsetof (struct ia64_sal_os_state, proc_state_param)); + DEFINE(IA64_SAL_OS_STATE_SIZE, + sizeof (struct ia64_sal_os_state)); + DEFINE(IA64_PMSA_GR_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_gr)); + DEFINE(IA64_PMSA_BANK1_GR_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_bank1_gr)); + DEFINE(IA64_PMSA_PR_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_pr)); + DEFINE(IA64_PMSA_BR0_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_br0)); + DEFINE(IA64_PMSA_RSC_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_rsc)); + DEFINE(IA64_PMSA_IIP_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_iip)); + DEFINE(IA64_PMSA_IPSR_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_ipsr)); + DEFINE(IA64_PMSA_IFS_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_ifs)); + DEFINE(IA64_PMSA_XIP_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_xip)); + BLANK(); + /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr)); DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source)); diff -puN arch/ia64/kernel/entry.S~git-ia64 arch/ia64/kernel/entry.S --- devel/arch/ia64/kernel/entry.S~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/entry.S 2005-09-07 19:42:41.000000000 -0700 @@ -204,9 +204,6 @@ GLOBAL_ENTRY(ia64_switch_to) (p6) br.cond.dpnt .map ;; .done: -(p6) ssm psr.ic // if we had to map, reenable the psr.ic bit FIRST!!! - ;; -(p6) srlz.d ld8 sp=[r21] // load kernel stack pointer of new task mov IA64_KR(CURRENT)=in0 // update "current" application register mov r8=r13 // return pointer to previously running task @@ -234,6 +231,9 @@ GLOBAL_ENTRY(ia64_switch_to) mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped... ;; itr.d dtr[r25]=r23 // wire in new mapping... + ssm psr.ic // reenable the psr.ic bit + ;; + srlz.d br.cond.sptk .done END(ia64_switch_to) diff -puN arch/ia64/kernel/iosapic.c~git-ia64 arch/ia64/kernel/iosapic.c --- devel/arch/ia64/kernel/iosapic.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/iosapic.c 2005-09-07 19:42:41.000000000 -0700 @@ -782,7 +782,6 @@ again: return vector; } -#ifdef CONFIG_ACPI_DEALLOCATE_IRQ void iosapic_unregister_intr (unsigned int gsi) { @@ -865,7 +864,6 @@ iosapic_unregister_intr (unsigned int gs spin_unlock(&iosapic_lock); spin_unlock_irqrestore(&idesc->lock, flags); } -#endif /* CONFIG_ACPI_DEALLOCATE_IRQ */ /* * ACPI calls this when it finds an entry for a platform interrupt. diff -puN arch/ia64/kernel/mca_asm.S~git-ia64 arch/ia64/kernel/mca_asm.S --- devel/arch/ia64/kernel/mca_asm.S~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/mca_asm.S 2005-09-07 19:42:41.000000000 -0700 @@ -16,6 +16,9 @@ // 04/11/12 Russ Anderson // Added per cpu MCA/INIT stack save areas. // +// 12/08/05 Keith Owens +// Use per cpu MCA/INIT stacks for all data. +// #include #include @@ -25,96 +28,23 @@ #include #include -/* - * When we get a machine check, the kernel stack pointer is no longer - * valid, so we need to set a new stack pointer. - */ -#define MINSTATE_PHYS /* Make sure stack access is physical for MINSTATE */ - -/* - * Needed for return context to SAL - */ -#define IA64_MCA_SAME_CONTEXT 0 -#define IA64_MCA_COLD_BOOT -2 - -#include "minstate.h" - -/* - * SAL_TO_OS_MCA_HANDOFF_STATE (SAL 3.0 spec) - * 1. GR1 = OS GP - * 2. GR8 = PAL_PROC physical address - * 3. GR9 = SAL_PROC physical address - * 4. GR10 = SAL GP (physical) - * 5. GR11 = Rendez state - * 6. GR12 = Return address to location within SAL_CHECK - */ -#define SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(_tmp) \ - LOAD_PHYSICAL(p0, _tmp, ia64_sal_to_os_handoff_state);; \ - st8 [_tmp]=r1,0x08;; \ - st8 [_tmp]=r8,0x08;; \ - st8 [_tmp]=r9,0x08;; \ - st8 [_tmp]=r10,0x08;; \ - st8 [_tmp]=r11,0x08;; \ - st8 [_tmp]=r12,0x08;; \ - st8 [_tmp]=r17,0x08;; \ - st8 [_tmp]=r18,0x08 - -/* - * OS_MCA_TO_SAL_HANDOFF_STATE (SAL 3.0 spec) - * (p6) is executed if we never entered virtual mode (TLB error) - * (p7) is executed if we entered virtual mode as expected (normal case) - * 1. GR8 = OS_MCA return status - * 2. GR9 = SAL GP (physical) - * 3. GR10 = 0/1 returning same/new context - * 4. GR22 = New min state save area pointer - * returns ptr to SAL rtn save loc in _tmp - */ -#define OS_MCA_TO_SAL_HANDOFF_STATE_RESTORE(_tmp) \ - movl _tmp=ia64_os_to_sal_handoff_state;; \ - DATA_VA_TO_PA(_tmp);; \ - ld8 r8=[_tmp],0x08;; \ - ld8 r9=[_tmp],0x08;; \ - ld8 r10=[_tmp],0x08;; \ - ld8 r22=[_tmp],0x08;; - // now _tmp is pointing to SAL rtn save location - -/* - * COLD_BOOT_HANDOFF_STATE() sets ia64_mca_os_to_sal_state - * imots_os_status=IA64_MCA_COLD_BOOT - * imots_sal_gp=SAL GP - * imots_context=IA64_MCA_SAME_CONTEXT - * imots_new_min_state=Min state save area pointer - * imots_sal_check_ra=Return address to location within SAL_CHECK - * - */ -#define COLD_BOOT_HANDOFF_STATE(sal_to_os_handoff,os_to_sal_handoff,tmp)\ - movl tmp=IA64_MCA_COLD_BOOT; \ - movl sal_to_os_handoff=__pa(ia64_sal_to_os_handoff_state); \ - movl os_to_sal_handoff=__pa(ia64_os_to_sal_handoff_state);; \ - st8 [os_to_sal_handoff]=tmp,8;; \ - ld8 tmp=[sal_to_os_handoff],48;; \ - st8 [os_to_sal_handoff]=tmp,8;; \ - movl tmp=IA64_MCA_SAME_CONTEXT;; \ - st8 [os_to_sal_handoff]=tmp,8;; \ - ld8 tmp=[sal_to_os_handoff],-8;; \ - st8 [os_to_sal_handoff]=tmp,8;; \ - ld8 tmp=[sal_to_os_handoff];; \ - st8 [os_to_sal_handoff]=tmp;; +#include "entry.h" #define GET_IA64_MCA_DATA(reg) \ GET_THIS_PADDR(reg, ia64_mca_data) \ ;; \ ld8 reg=[reg] - .global ia64_os_mca_dispatch - .global ia64_os_mca_dispatch_end - .global ia64_sal_to_os_handoff_state - .global ia64_os_to_sal_handoff_state .global ia64_do_tlb_purge + .global ia64_os_mca_dispatch + .global ia64_os_init_dispatch_monarch + .global ia64_os_init_dispatch_slave .text .align 16 +//StartMain//////////////////////////////////////////////////////////////////// + /* * Just the TLB purge part is moved to a separate function * so we can re-use the code for cpu hotplug code as well @@ -207,34 +137,31 @@ ia64_do_tlb_purge: br.sptk.many b1 ;; -ia64_os_mca_dispatch: +//EndMain////////////////////////////////////////////////////////////////////// + +//StartMain//////////////////////////////////////////////////////////////////// +ia64_os_mca_dispatch: // Serialize all MCA processing mov r3=1;; LOAD_PHYSICAL(p0,r2,ia64_mca_serialize);; ia64_os_mca_spin: - xchg8 r4=[r2],r3;; + xchg4 r4=[r2],r3;; cmp.ne p6,p0=r4,r0 (p6) br ia64_os_mca_spin - // Save the SAL to OS MCA handoff state as defined - // by SAL SPEC 3.0 - // NOTE : The order in which the state gets saved - // is dependent on the way the C-structure - // for ia64_mca_sal_to_os_state_t has been - // defined in include/asm/mca.h - SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(r2) - ;; - - // LOG PROCESSOR STATE INFO FROM HERE ON.. -begin_os_mca_dump: - br ia64_os_mca_proc_state_dump;; - -ia64_os_mca_done_dump: + mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack + LOAD_PHYSICAL(p0,r2,1f) // return address + mov r19=1 // All MCA events are treated as monarch (for now) + br.sptk ia64_state_save // save the state that is not in minstate +1: - LOAD_PHYSICAL(p0,r16,ia64_sal_to_os_handoff_state+56) + GET_IA64_MCA_DATA(r2) + // Using MCA stack, struct ia64_sal_os_state, variable proc_state_param + ;; + add r3=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET+IA64_SAL_OS_STATE_PROC_STATE_PARAM_OFFSET, r2 ;; - ld8 r18=[r16] // Get processor state parameter on existing PALE_CHECK. + ld8 r18=[r3] // Get processor state parameter on existing PALE_CHECK. ;; tbit.nz p6,p7=r18,60 (p7) br.spnt done_tlb_purge_and_reload @@ -323,624 +250,710 @@ ia64_reload_tr: itr.d dtr[r20]=r16 ;; srlz.d - ;; - br.sptk.many done_tlb_purge_and_reload -err: - COLD_BOOT_HANDOFF_STATE(r20,r21,r22) - br.sptk.many ia64_os_mca_done_restore done_tlb_purge_and_reload: - // Setup new stack frame for OS_MCA handling - GET_IA64_MCA_DATA(r2) - ;; - add r3 = IA64_MCA_CPU_STACKFRAME_OFFSET, r2 - add r2 = IA64_MCA_CPU_RBSTORE_OFFSET, r2 - ;; - rse_switch_context(r6,r3,r2);; // RSC management in this new context + // switch to per cpu MCA stack + mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_new_stack +1: + + // everything saved, now we can set the kernel registers + mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_set_kernel_registers +1: + // This must be done in physical mode GET_IA64_MCA_DATA(r2) ;; - add r2 = IA64_MCA_CPU_STACK_OFFSET+IA64_MCA_STACK_SIZE-16, r2 - ;; - mov r12=r2 // establish new stack-pointer + mov r7=r2 // Enter virtual mode from physical mode VIRTUAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_begin, r4) -ia64_os_mca_virtual_begin: + + // This code returns to SAL via SOS r2, in general SAL has no unwind + // data. To get a clean termination when backtracing the C MCA/INIT + // handler, create a dummy return address of 0 in this routine. That + // requires that ia64_os_mca_virtual_begin be a global function. +ENTRY(ia64_os_mca_virtual_begin) + .prologue ASM_UNW_PRLG_RP, ASM_UNW_PRLG_GRSAVE(0) + + mov ar.rsc=3 // set eager mode for C handler + mov r2=r7 // see GET_IA64_MCA_DATA above + ;; // Call virtual mode handler - movl r2=ia64_mca_ucmc_handler;; - mov b6=r2;; - br.call.sptk.many b0=b6;; -.ret0: + alloc r14=ar.pfs,0,1,3,0 + mov loc0=r0 // dummy rp of 0 to terminate backtrace + .body + ;; + DATA_PA_TO_VA(r2,r7) + ;; + add out0=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_PT_REGS_OFFSET, r2 + add out1=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SWITCH_STACK_OFFSET, r2 + add out2=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET, r2 + br.call.sptk.many b0=ia64_mca_handler + // Revert back to physical mode before going back to SAL PHYSICAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_end, r4) ia64_os_mca_virtual_end: - // restore the original stack frame here - GET_IA64_MCA_DATA(r2) +END(ia64_os_mca_virtual_begin) + + // switch back to previous stack + alloc r14=ar.pfs,0,0,0,0 // remove the MCA handler frame + mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_old_stack +1: + + mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_state_restore // restore the SAL state +1: + + mov b0=r12 // SAL_CHECK return address + + // release lock + LOAD_PHYSICAL(p0,r3,ia64_mca_serialize);; + st4.rel [r3]=r0 ;; - add r2 = IA64_MCA_CPU_STACKFRAME_OFFSET, r2 + mov r31=-1 + LOAD_PHYSICAL(p0,r3,ia64_mca_init_leave);; ;; - movl r4=IA64_PSR_MC + st4.rel [r3]=r31 + + br b0 + +//EndMain////////////////////////////////////////////////////////////////////// + +//StartMain//////////////////////////////////////////////////////////////////// + +// +// SAL to OS entry point for INIT on all processors. This has been defined for +// registration purposes with SAL as a part of ia64_mca_init. Monarch and +// slave INIT have identical processing, except for the value of the +// sos->monarch flag in r19. +// + +ia64_os_init_dispatch_monarch: + mov r19=1 // Bow, bow, ye lower middle classes! + br.sptk ia64_os_init_dispatch + +ia64_os_init_dispatch_slave: + mov r19=0 // yeth, mathter + +ia64_os_init_dispatch: + + mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET // use the INIT stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_state_save // save the state that is not in minstate +1: + + // switch to per cpu INIT stack + mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET // use the INIT stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_new_stack +1: + + // everything saved, now we can set the kernel registers + mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET // use the INIT stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_set_kernel_registers +1: + + // This must be done in physical mode + GET_IA64_MCA_DATA(r2) ;; - rse_return_context(r4,r3,r2) // switch from interrupt context for RSE + mov r7=r2 + + // Enter virtual mode from physical mode + VIRTUAL_MODE_ENTER(r2, r3, ia64_os_init_virtual_begin, r4) + + // This code returns to SAL via SOS r2, in general SAL has no unwind + // data. To get a clean termination when backtracing the C MCA/INIT + // handler, create a dummy return address of 0 in this routine. That + // requires that ia64_os_init_virtual_begin be a global function. +ENTRY(ia64_os_init_virtual_begin) + .prologue ASM_UNW_PRLG_RP, ASM_UNW_PRLG_GRSAVE(0) - // let us restore all the registers from our PSI structure - mov r8=gp + mov ar.rsc=3 // set eager mode for C handler + mov r2=r7 // see GET_IA64_MCA_DATA above ;; -begin_os_mca_restore: - br ia64_os_mca_proc_state_restore;; -ia64_os_mca_done_restore: - OS_MCA_TO_SAL_HANDOFF_STATE_RESTORE(r2);; - // branch back to SALE_CHECK - ld8 r3=[r2];; - mov b0=r3;; // SAL_CHECK return address + // Call virtual mode handler + alloc r14=ar.pfs,0,1,3,0 + mov loc0=r0 // dummy rp of 0 to terminate backtrace + .body + ;; + DATA_PA_TO_VA(r2,r7) + ;; + add out0=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_PT_REGS_OFFSET, r2 + add out1=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_SWITCH_STACK_OFFSET, r2 + add out2=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_SOS_OFFSET, r2 + br.call.sptk.many b0=ia64_init_handler - // release lock - movl r3=ia64_mca_serialize;; - DATA_VA_TO_PA(r3);; - st8.rel [r3]=r0 + // Revert back to physical mode before going back to SAL + PHYSICAL_MODE_ENTER(r2, r3, ia64_os_init_virtual_end, r4) +ia64_os_init_virtual_end: - br b0 +END(ia64_os_init_virtual_begin) + + mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET // use the INIT stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_state_restore // restore the SAL state +1: + + // switch back to previous stack + alloc r14=ar.pfs,0,0,0,0 // remove the INIT handler frame + mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET // use the INIT stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_old_stack +1: + + mov b0=r12 // SAL_CHECK return address + ;; + mov r31=-1 + LOAD_PHYSICAL(p0,r3,ia64_mca_init_leave);; ;; -ia64_os_mca_dispatch_end: + st4.rel [r3]=r31 + br b0 + //EndMain////////////////////////////////////////////////////////////////////// +// common defines for the stubs +#define ms r4 +#define regs r5 +#define temp1 r2 /* careful, it overlaps with input registers */ +#define temp2 r3 /* careful, it overlaps with input registers */ +#define temp3 r7 +#define temp4 r14 + //++ // Name: -// ia64_os_mca_proc_state_dump() +// ia64_state_save() // // Stub Description: // -// This stub dumps the processor state during MCHK to a data area +// Save the state that is not in minstate. This is sensitive to the layout of +// struct ia64_sal_os_state in mca.h. +// +// r2 contains the return address, r3 contains either +// IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET. +// +// The OS to SAL section of struct ia64_sal_os_state is set to a default +// value of cold boot (MCA) or warm boot (INIT) and return to the same +// context. ia64_sal_os_state is also used to hold some registers that +// need to be saved and restored across the stack switches. +// +// Most input registers to this stub come from PAL/SAL +// r1 os gp, physical +// r8 pal_proc entry point +// r9 sal_proc entry point +// r10 sal gp +// r11 MCA - rendevzous state, INIT - reason code +// r12 sal return address +// r17 pal min_state +// r18 processor state parameter +// r19 monarch flag, set by the caller of this routine +// +// In addition to the SAL to OS state, this routine saves all the +// registers that appear in struct pt_regs and struct switch_stack, +// excluding those that are already in the PAL minstate area. This +// results in a partial pt_regs and switch_stack, the C code copies the +// remaining registers from PAL minstate to pt_regs and switch_stack. The +// resulting structures contain all the state of the original process when +// MCA/INIT occurred. // //-- -ia64_os_mca_proc_state_dump: -// Save bank 1 GRs 16-31 which will be used by c-language code when we switch -// to virtual addressing mode. - GET_IA64_MCA_DATA(r2) +ia64_state_save: + add regs=MCA_SOS_OFFSET, r3 + add ms=MCA_SOS_OFFSET+8, r3 + mov b0=r2 // save return address + cmp.eq p1,p2=IA64_MCA_CPU_MCA_STACK_OFFSET, r3 + ;; + GET_IA64_MCA_DATA(temp2) + ;; + add temp1=temp2, regs // struct ia64_sal_os_state on MCA or INIT stack + add temp2=temp2, ms // struct ia64_sal_os_state+8 on MCA or INIT stack + ;; + mov regs=temp1 // save the start of sos + st8 [temp1]=r1,16 // os_gp + st8 [temp2]=r8,16 // pal_proc + ;; + st8 [temp1]=r9,16 // sal_proc + st8 [temp2]=r11,16 // rv_rc + mov r11=cr.iipa + ;; + st8 [temp1]=r18,16 // proc_state_param + st8 [temp2]=r19,16 // monarch + mov r6=IA64_KR(CURRENT) + ;; + st8 [temp1]=r12,16 // sal_ra + st8 [temp2]=r10,16 // sal_gp + mov r12=cr.isr + ;; + st8 [temp1]=r17,16 // pal_min_state + st8 [temp2]=r6,16 // prev_IA64_KR_CURRENT + mov r6=cr.ifa + ;; + st8 [temp1]=r0,16 // prev_task, starts off as NULL + st8 [temp2]=r12,16 // cr.isr + mov r12=cr.itir + ;; + st8 [temp1]=r6,16 // cr.ifa + st8 [temp2]=r12,16 // cr.itir + mov r12=cr.iim + ;; + st8 [temp1]=r11,16 // cr.iipa + st8 [temp2]=r12,16 // cr.iim + mov r6=cr.iha +(p1) mov r12=IA64_MCA_COLD_BOOT +(p2) mov r12=IA64_INIT_WARM_BOOT + ;; + st8 [temp1]=r6,16 // cr.iha + st8 [temp2]=r12 // os_status, default is cold boot + mov r6=IA64_MCA_SAME_CONTEXT + ;; + st8 [temp1]=r6 // context, default is same context + + // Save the pt_regs data that is not in minstate. The previous code + // left regs at sos. + add regs=MCA_PT_REGS_OFFSET-MCA_SOS_OFFSET, regs + ;; + add temp1=PT(B6), regs + mov temp3=b6 + mov temp4=b7 + add temp2=PT(B7), regs + ;; + st8 [temp1]=temp3,PT(AR_CSD)-PT(B6) // save b6 + st8 [temp2]=temp4,PT(AR_SSD)-PT(B7) // save b7 + mov temp3=ar.csd + mov temp4=ar.ssd + cover // must be last in group + ;; + st8 [temp1]=temp3,PT(AR_UNAT)-PT(AR_CSD) // save ar.csd + st8 [temp2]=temp4,PT(AR_PFS)-PT(AR_SSD) // save ar.ssd + mov temp3=ar.unat + mov temp4=ar.pfs + ;; + st8 [temp1]=temp3,PT(AR_RNAT)-PT(AR_UNAT) // save ar.unat + st8 [temp2]=temp4,PT(AR_BSPSTORE)-PT(AR_PFS) // save ar.pfs + mov temp3=ar.rnat + mov temp4=ar.bspstore + ;; + st8 [temp1]=temp3,PT(LOADRS)-PT(AR_RNAT) // save ar.rnat + st8 [temp2]=temp4,PT(AR_FPSR)-PT(AR_BSPSTORE) // save ar.bspstore + mov temp3=ar.bsp + ;; + sub temp3=temp3, temp4 // ar.bsp - ar.bspstore + mov temp4=ar.fpsr + ;; + shl temp3=temp3,16 // compute ar.rsc to be used for "loadrs" ;; - add r2 = IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET, r2 + st8 [temp1]=temp3,PT(AR_CCV)-PT(LOADRS) // save loadrs + st8 [temp2]=temp4,PT(F6)-PT(AR_FPSR) // save ar.fpsr + mov temp3=ar.ccv ;; -// save ar.NaT - mov r5=ar.unat // ar.unat + st8 [temp1]=temp3,PT(F7)-PT(AR_CCV) // save ar.ccv + stf.spill [temp2]=f6,PT(F8)-PT(F6) + ;; + stf.spill [temp1]=f7,PT(F9)-PT(F7) + stf.spill [temp2]=f8,PT(F10)-PT(F8) + ;; + stf.spill [temp1]=f9,PT(F11)-PT(F9) + stf.spill [temp2]=f10 + ;; + stf.spill [temp1]=f11 -// save banked GRs 16-31 along with NaT bits - bsw.1;; - st8.spill [r2]=r16,8;; - st8.spill [r2]=r17,8;; - st8.spill [r2]=r18,8;; - st8.spill [r2]=r19,8;; - st8.spill [r2]=r20,8;; - st8.spill [r2]=r21,8;; - st8.spill [r2]=r22,8;; - st8.spill [r2]=r23,8;; - st8.spill [r2]=r24,8;; - st8.spill [r2]=r25,8;; - st8.spill [r2]=r26,8;; - st8.spill [r2]=r27,8;; - st8.spill [r2]=r28,8;; - st8.spill [r2]=r29,8;; - st8.spill [r2]=r30,8;; - st8.spill [r2]=r31,8;; - - mov r4=ar.unat;; - st8 [r2]=r4,8 // save User NaT bits for r16-r31 - mov ar.unat=r5 // restore original unat - bsw.0;; - -//save BRs - add r4=8,r2 // duplicate r2 in r4 - add r6=2*8,r2 // duplicate r2 in r4 - - mov r3=b0 - mov r5=b1 - mov r7=b2;; - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=b3 - mov r5=b4 - mov r7=b5;; - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=b6 - mov r5=b7;; - st8 [r2]=r3,2*8 - st8 [r4]=r5,2*8;; - -cSaveCRs: -// save CRs - add r4=8,r2 // duplicate r2 in r4 - add r6=2*8,r2 // duplicate r2 in r4 - - mov r3=cr.dcr - mov r5=cr.itm - mov r7=cr.iva;; - - st8 [r2]=r3,8*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; // 48 byte rements - - mov r3=cr.pta;; - st8 [r2]=r3,8*8;; // 64 byte rements - -// if PSR.ic=0, reading interruption registers causes an illegal operation fault - mov r3=psr;; - tbit.nz.unc p6,p0=r3,PSR_IC;; // PSI Valid Log bit pos. test -(p6) st8 [r2]=r0,9*8+160 // increment by 232 byte inc. -begin_skip_intr_regs: -(p6) br SkipIntrRegs;; - - add r4=8,r2 // duplicate r2 in r4 - add r6=2*8,r2 // duplicate r2 in r6 - - mov r3=cr.ipsr - mov r5=cr.isr - mov r7=r0;; - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=cr.iip - mov r5=cr.ifa - mov r7=cr.itir;; - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=cr.iipa - mov r5=cr.ifs - mov r7=cr.iim;; - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=cr25;; // cr.iha - st8 [r2]=r3,160;; // 160 byte rement - -SkipIntrRegs: - st8 [r2]=r0,152;; // another 152 byte . - - add r4=8,r2 // duplicate r2 in r4 - add r6=2*8,r2 // duplicate r2 in r6 - - mov r3=cr.lid -// mov r5=cr.ivr // cr.ivr, don't read it - mov r7=cr.tpr;; - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=r0 // cr.eoi => cr67 - mov r5=r0 // cr.irr0 => cr68 - mov r7=r0;; // cr.irr1 => cr69 - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=r0 // cr.irr2 => cr70 - mov r5=r0 // cr.irr3 => cr71 - mov r7=cr.itv;; - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=cr.pmv - mov r5=cr.cmcv;; - st8 [r2]=r3,7*8 - st8 [r4]=r5,7*8;; - - mov r3=r0 // cr.lrr0 => cr80 - mov r5=r0;; // cr.lrr1 => cr81 - st8 [r2]=r3,23*8 - st8 [r4]=r5,23*8;; - - adds r2=25*8,r2;; - -cSaveARs: -// save ARs - add r4=8,r2 // duplicate r2 in r4 - add r6=2*8,r2 // duplicate r2 in r6 - - mov r3=ar.k0 - mov r5=ar.k1 - mov r7=ar.k2;; - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=ar.k3 - mov r5=ar.k4 - mov r7=ar.k5;; - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=ar.k6 - mov r5=ar.k7 - mov r7=r0;; // ar.kr8 - st8 [r2]=r3,10*8 - st8 [r4]=r5,10*8 - st8 [r6]=r7,10*8;; // rement by 72 bytes - - mov r3=ar.rsc - mov ar.rsc=r0 // put RSE in enforced lazy mode - mov r5=ar.bsp - ;; - mov r7=ar.bspstore;; - st8 [r2]=r3,3*8 - st8 [r4]=r5,3*8 - st8 [r6]=r7,3*8;; - - mov r3=ar.rnat;; - st8 [r2]=r3,8*13 // increment by 13x8 bytes - - mov r3=ar.ccv;; - st8 [r2]=r3,8*4 - - mov r3=ar.unat;; - st8 [r2]=r3,8*4 - - mov r3=ar.fpsr;; - st8 [r2]=r3,8*4 - - mov r3=ar.itc;; - st8 [r2]=r3,160 // 160 - - mov r3=ar.pfs;; - st8 [r2]=r3,8 - - mov r3=ar.lc;; - st8 [r2]=r3,8 - - mov r3=ar.ec;; - st8 [r2]=r3 - add r2=8*62,r2 //padding - -// save RRs - mov ar.lc=0x08-1 - movl r4=0x00;; - -cStRR: - dep.z r5=r4,61,3;; - mov r3=rr[r5];; - st8 [r2]=r3,8 - add r4=1,r4 - br.cloop.sptk.few cStRR + // Save the switch_stack data that is not in minstate nor pt_regs. The + // previous code left regs at pt_regs. + add regs=MCA_SWITCH_STACK_OFFSET-MCA_PT_REGS_OFFSET, regs + ;; + add temp1=SW(F2), regs + add temp2=SW(F3), regs + ;; + stf.spill [temp1]=f2,32 + stf.spill [temp2]=f3,32 + ;; + stf.spill [temp1]=f4,32 + stf.spill [temp2]=f5,32 + ;; + stf.spill [temp1]=f12,32 + stf.spill [temp2]=f13,32 + ;; + stf.spill [temp1]=f14,32 + stf.spill [temp2]=f15,32 + ;; + stf.spill [temp1]=f16,32 + stf.spill [temp2]=f17,32 + ;; + stf.spill [temp1]=f18,32 + stf.spill [temp2]=f19,32 + ;; + stf.spill [temp1]=f20,32 + stf.spill [temp2]=f21,32 + ;; + stf.spill [temp1]=f22,32 + stf.spill [temp2]=f23,32 + ;; + stf.spill [temp1]=f24,32 + stf.spill [temp2]=f25,32 ;; -end_os_mca_dump: - br ia64_os_mca_done_dump;; + stf.spill [temp1]=f26,32 + stf.spill [temp2]=f27,32 + ;; + stf.spill [temp1]=f28,32 + stf.spill [temp2]=f29,32 + ;; + stf.spill [temp1]=f30,SW(B2)-SW(F30) + stf.spill [temp2]=f31,SW(B3)-SW(F31) + mov temp3=b2 + mov temp4=b3 + ;; + st8 [temp1]=temp3,16 // save b2 + st8 [temp2]=temp4,16 // save b3 + mov temp3=b4 + mov temp4=b5 + ;; + st8 [temp1]=temp3,SW(AR_LC)-SW(B4) // save b4 + st8 [temp2]=temp4 // save b5 + mov temp3=ar.lc + ;; + st8 [temp1]=temp3 // save ar.lc + + br.sptk b0 //EndStub////////////////////////////////////////////////////////////////////// //++ // Name: -// ia64_os_mca_proc_state_restore() +// ia64_state_restore() // // Stub Description: // -// This is a stub to restore the saved processor state during MCHK +// Restore the SAL/OS state. This is sensitive to the layout of struct +// ia64_sal_os_state in mca.h. +// +// r2 contains the return address, r3 contains either +// IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET. +// +// In addition to the SAL to OS state, this routine restores all the +// registers that appear in struct pt_regs and struct switch_stack, +// excluding those in the PAL minstate area. // //-- -ia64_os_mca_proc_state_restore: +ia64_state_restore: + // Restore the switch_stack data that is not in minstate nor pt_regs. + add regs=MCA_SWITCH_STACK_OFFSET, r3 + mov b0=r2 // save return address + ;; + GET_IA64_MCA_DATA(temp2) + ;; + add regs=temp2, regs + ;; + add temp1=SW(F2), regs + add temp2=SW(F3), regs + ;; + ldf.fill f2=[temp1],32 + ldf.fill f3=[temp2],32 + ;; + ldf.fill f4=[temp1],32 + ldf.fill f5=[temp2],32 + ;; + ldf.fill f12=[temp1],32 + ldf.fill f13=[temp2],32 + ;; + ldf.fill f14=[temp1],32 + ldf.fill f15=[temp2],32 + ;; + ldf.fill f16=[temp1],32 + ldf.fill f17=[temp2],32 + ;; + ldf.fill f18=[temp1],32 + ldf.fill f19=[temp2],32 + ;; + ldf.fill f20=[temp1],32 + ldf.fill f21=[temp2],32 + ;; + ldf.fill f22=[temp1],32 + ldf.fill f23=[temp2],32 + ;; + ldf.fill f24=[temp1],32 + ldf.fill f25=[temp2],32 + ;; + ldf.fill f26=[temp1],32 + ldf.fill f27=[temp2],32 + ;; + ldf.fill f28=[temp1],32 + ldf.fill f29=[temp2],32 + ;; + ldf.fill f30=[temp1],SW(B2)-SW(F30) + ldf.fill f31=[temp2],SW(B3)-SW(F31) + ;; + ld8 temp3=[temp1],16 // restore b2 + ld8 temp4=[temp2],16 // restore b3 + ;; + mov b2=temp3 + mov b3=temp4 + ld8 temp3=[temp1],SW(AR_LC)-SW(B4) // restore b4 + ld8 temp4=[temp2] // restore b5 + ;; + mov b4=temp3 + mov b5=temp4 + ld8 temp3=[temp1] // restore ar.lc + ;; + mov ar.lc=temp3 -// Restore bank1 GR16-31 - GET_IA64_MCA_DATA(r2) + // Restore the pt_regs data that is not in minstate. The previous code + // left regs at switch_stack. + add regs=MCA_PT_REGS_OFFSET-MCA_SWITCH_STACK_OFFSET, regs + ;; + add temp1=PT(B6), regs + add temp2=PT(B7), regs + ;; + ld8 temp3=[temp1],PT(AR_CSD)-PT(B6) // restore b6 + ld8 temp4=[temp2],PT(AR_SSD)-PT(B7) // restore b7 + ;; + mov b6=temp3 + mov b7=temp4 + ld8 temp3=[temp1],PT(AR_UNAT)-PT(AR_CSD) // restore ar.csd + ld8 temp4=[temp2],PT(AR_PFS)-PT(AR_SSD) // restore ar.ssd + ;; + mov ar.csd=temp3 + mov ar.ssd=temp4 + ld8 temp3=[temp1] // restore ar.unat + add temp1=PT(AR_CCV)-PT(AR_UNAT), temp1 + ld8 temp4=[temp2],PT(AR_FPSR)-PT(AR_PFS) // restore ar.pfs ;; - add r2 = IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET, r2 + mov ar.unat=temp3 + mov ar.pfs=temp4 + // ar.rnat, ar.bspstore, loadrs are restore in ia64_old_stack. + ld8 temp3=[temp1],PT(F6)-PT(AR_CCV) // restore ar.ccv + ld8 temp4=[temp2],PT(F7)-PT(AR_FPSR) // restore ar.fpsr + ;; + mov ar.ccv=temp3 + mov ar.fpsr=temp4 + ldf.fill f6=[temp1],PT(F8)-PT(F6) + ldf.fill f7=[temp2],PT(F9)-PT(F7) + ;; + ldf.fill f8=[temp1],PT(F10)-PT(F8) + ldf.fill f9=[temp2],PT(F11)-PT(F9) + ;; + ldf.fill f10=[temp1] + ldf.fill f11=[temp2] -restore_GRs: // restore bank-1 GRs 16-31 - bsw.1;; - add r3=16*8,r2;; // to get to NaT of GR 16-31 - ld8 r3=[r3];; - mov ar.unat=r3;; // first restore NaT - - ld8.fill r16=[r2],8;; - ld8.fill r17=[r2],8;; - ld8.fill r18=[r2],8;; - ld8.fill r19=[r2],8;; - ld8.fill r20=[r2],8;; - ld8.fill r21=[r2],8;; - ld8.fill r22=[r2],8;; - ld8.fill r23=[r2],8;; - ld8.fill r24=[r2],8;; - ld8.fill r25=[r2],8;; - ld8.fill r26=[r2],8;; - ld8.fill r27=[r2],8;; - ld8.fill r28=[r2],8;; - ld8.fill r29=[r2],8;; - ld8.fill r30=[r2],8;; - ld8.fill r31=[r2],8;; - - ld8 r3=[r2],8;; // increment to skip NaT - bsw.0;; - -restore_BRs: - add r4=8,r2 // duplicate r2 in r4 - add r6=2*8,r2;; // duplicate r2 in r4 - - ld8 r3=[r2],3*8 - ld8 r5=[r4],3*8 - ld8 r7=[r6],3*8;; - mov b0=r3 - mov b1=r5 - mov b2=r7;; - - ld8 r3=[r2],3*8 - ld8 r5=[r4],3*8 - ld8 r7=[r6],3*8;; - mov b3=r3 - mov b4=r5 - mov b5=r7;; - - ld8 r3=[r2],2*8 - ld8 r5=[r4],2*8;; - mov b6=r3 - mov b7=r5;; - -restore_CRs: - add r4=8,r2 // duplicate r2 in r4 - add r6=2*8,r2;; // duplicate r2 in r4 - - ld8 r3=[r2],8*8 - ld8 r5=[r4],3*8 - ld8 r7=[r6],3*8;; // 48 byte increments - mov cr.dcr=r3 - mov cr.itm=r5 - mov cr.iva=r7;; - - ld8 r3=[r2],8*8;; // 64 byte increments -// mov cr.pta=r3 - - -// if PSR.ic=1, reading interruption registers causes an illegal operation fault - mov r3=psr;; - tbit.nz.unc p6,p0=r3,PSR_IC;; // PSI Valid Log bit pos. test -(p6) st8 [r2]=r0,9*8+160 // increment by 232 byte inc. - -begin_rskip_intr_regs: -(p6) br rSkipIntrRegs;; - - add r4=8,r2 // duplicate r2 in r4 - add r6=2*8,r2;; // duplicate r2 in r4 - - ld8 r3=[r2],3*8 - ld8 r5=[r4],3*8 - ld8 r7=[r6],3*8;; - mov cr.ipsr=r3 -// mov cr.isr=r5 // cr.isr is read only - - ld8 r3=[r2],3*8 - ld8 r5=[r4],3*8 - ld8 r7=[r6],3*8;; - mov cr.iip=r3 - mov cr.ifa=r5 - mov cr.itir=r7;; - - ld8 r3=[r2],3*8 - ld8 r5=[r4],3*8 - ld8 r7=[r6],3*8;; - mov cr.iipa=r3 - mov cr.ifs=r5 - mov cr.iim=r7 - - ld8 r3=[r2],160;; // 160 byte increment - mov cr.iha=r3 - -rSkipIntrRegs: - ld8 r3=[r2],152;; // another 152 byte inc. - - add r4=8,r2 // duplicate r2 in r4 - add r6=2*8,r2;; // duplicate r2 in r6 - - ld8 r3=[r2],8*3 - ld8 r5=[r4],8*3 - ld8 r7=[r6],8*3;; - mov cr.lid=r3 -// mov cr.ivr=r5 // cr.ivr is read only - mov cr.tpr=r7;; - - ld8 r3=[r2],8*3 - ld8 r5=[r4],8*3 - ld8 r7=[r6],8*3;; -// mov cr.eoi=r3 -// mov cr.irr0=r5 // cr.irr0 is read only -// mov cr.irr1=r7;; // cr.irr1 is read only - - ld8 r3=[r2],8*3 - ld8 r5=[r4],8*3 - ld8 r7=[r6],8*3;; -// mov cr.irr2=r3 // cr.irr2 is read only -// mov cr.irr3=r5 // cr.irr3 is read only - mov cr.itv=r7;; - - ld8 r3=[r2],8*7 - ld8 r5=[r4],8*7;; - mov cr.pmv=r3 - mov cr.cmcv=r5;; - - ld8 r3=[r2],8*23 - ld8 r5=[r4],8*23;; - adds r2=8*23,r2 - adds r4=8*23,r4;; -// mov cr.lrr0=r3 -// mov cr.lrr1=r5 - - adds r2=8*2,r2;; - -restore_ARs: - add r4=8,r2 // duplicate r2 in r4 - add r6=2*8,r2;; // duplicate r2 in r4 - - ld8 r3=[r2],3*8 - ld8 r5=[r4],3*8 - ld8 r7=[r6],3*8;; - mov ar.k0=r3 - mov ar.k1=r5 - mov ar.k2=r7;; - - ld8 r3=[r2],3*8 - ld8 r5=[r4],3*8 - ld8 r7=[r6],3*8;; - mov ar.k3=r3 - mov ar.k4=r5 - mov ar.k5=r7;; - - ld8 r3=[r2],10*8 - ld8 r5=[r4],10*8 - ld8 r7=[r6],10*8;; - mov ar.k6=r3 - mov ar.k7=r5 - ;; - - ld8 r3=[r2],3*8 - ld8 r5=[r4],3*8 - ld8 r7=[r6],3*8;; -// mov ar.rsc=r3 -// mov ar.bsp=r5 // ar.bsp is read only - mov ar.rsc=r0 // make sure that RSE is in enforced lazy mode - ;; - mov ar.bspstore=r7;; - - ld8 r9=[r2],8*13;; - mov ar.rnat=r9 - - mov ar.rsc=r3 - ld8 r3=[r2],8*4;; - mov ar.ccv=r3 - - ld8 r3=[r2],8*4;; - mov ar.unat=r3 - - ld8 r3=[r2],8*4;; - mov ar.fpsr=r3 - - ld8 r3=[r2],160;; // 160 -// mov ar.itc=r3 - - ld8 r3=[r2],8;; - mov ar.pfs=r3 - - ld8 r3=[r2],8;; - mov ar.lc=r3 - - ld8 r3=[r2];; - mov ar.ec=r3 - add r2=8*62,r2;; // padding - -restore_RRs: - mov r5=ar.lc - mov ar.lc=0x08-1 - movl r4=0x00;; -cStRRr: - dep.z r7=r4,61,3 - ld8 r3=[r2],8;; - mov rr[r7]=r3 // what are its access previledges? - add r4=1,r4 - br.cloop.sptk.few cStRRr + // Restore the SAL to OS state. The previous code left regs at pt_regs. + add regs=MCA_SOS_OFFSET-MCA_PT_REGS_OFFSET, regs + ;; + add temp1=IA64_SAL_OS_STATE_COMMON_OFFSET, regs + add temp2=IA64_SAL_OS_STATE_COMMON_OFFSET+8, regs + ;; + ld8 r12=[temp1],16 // sal_ra + ld8 r9=[temp2],16 // sal_gp + ;; + ld8 r22=[temp1],24 // pal_min_state, virtual. skip prev_task + ld8 r21=[temp2],16 // prev_IA64_KR_CURRENT ;; - mov ar.lc=r5 + ld8 temp3=[temp1],16 // cr.isr + ld8 temp4=[temp2],16 // cr.ifa ;; -end_os_mca_restore: - br ia64_os_mca_done_restore;; + mov cr.isr=temp3 + mov cr.ifa=temp4 + ld8 temp3=[temp1],16 // cr.itir + ld8 temp4=[temp2],16 // cr.iipa + ;; + mov cr.itir=temp3 + mov cr.iipa=temp4 + ld8 temp3=[temp1],16 // cr.iim + ld8 temp4=[temp2],16 // cr.iha + ;; + mov cr.iim=temp3 + mov cr.iha=temp4 + dep r22=0,r22,62,2 // pal_min_state, physical, uncached + mov IA64_KR(CURRENT)=r21 + ld8 r8=[temp1] // os_status + ld8 r10=[temp2] // context + + br.sptk b0 //EndStub////////////////////////////////////////////////////////////////////// -// ok, the issue here is that we need to save state information so -// it can be useable by the kernel debugger and show regs routines. -// In order to do this, our best bet is save the current state (plus -// the state information obtain from the MIN_STATE_AREA) into a pt_regs -// format. This way we can pass it on in a useable format. +//++ +// Name: +// ia64_new_stack() // - +// Stub Description: // -// SAL to OS entry point for INIT on the monarch processor -// This has been defined for registration purposes with SAL -// as a part of ia64_mca_init. +// Switch to the MCA/INIT stack. // -// When we get here, the following registers have been -// set by the SAL for our use +// r2 contains the return address, r3 contains either +// IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET. // -// 1. GR1 = OS INIT GP -// 2. GR8 = PAL_PROC physical address -// 3. GR9 = SAL_PROC physical address -// 4. GR10 = SAL GP (physical) -// 5. GR11 = Init Reason -// 0 = Received INIT for event other than crash dump switch -// 1 = Received wakeup at the end of an OS_MCA corrected machine check -// 2 = Received INIT dude to CrashDump switch assertion +// On entry RBS is still on the original stack, this routine switches RBS +// to use the MCA/INIT stack. // -// 6. GR12 = Return address to location within SAL_INIT procedure - +// On entry, sos->pal_min_state is physical, on exit it is virtual. +// +//-- -GLOBAL_ENTRY(ia64_monarch_init_handler) - .prologue - // stash the information the SAL passed to os - SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(r2) +ia64_new_stack: + add regs=MCA_PT_REGS_OFFSET, r3 + add temp2=MCA_SOS_OFFSET+IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET, r3 + mov b0=r2 // save return address + GET_IA64_MCA_DATA(temp1) + invala ;; - SAVE_MIN_WITH_COVER + add temp2=temp2, temp1 // struct ia64_sal_os_state.pal_min_state on MCA or INIT stack + add regs=regs, temp1 // struct pt_regs on MCA or INIT stack ;; - mov r8=cr.ifa - mov r9=cr.isr - adds r3=8,r2 // set up second base pointer + // Address of minstate area provided by PAL is physical, uncacheable. + // Convert to Linux virtual address in region 6 for C code. + ld8 ms=[temp2] // pal_min_state, physical + ;; + dep temp1=-1,ms,62,2 // set region 6 + mov temp3=IA64_RBS_OFFSET-MCA_PT_REGS_OFFSET + ;; + st8 [temp2]=temp1 // pal_min_state, virtual + + add temp4=temp3, regs // start of bspstore on new stack ;; - SAVE_REST + mov ar.bspstore=temp4 // switch RBS to MCA/INIT stack + ;; + flushrs // must be first in group + br.sptk b0 + +//EndStub////////////////////////////////////////////////////////////////////// + + +//++ +// Name: +// ia64_old_stack() +// +// Stub Description: +// +// Switch to the old stack. +// +// r2 contains the return address, r3 contains either +// IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET. +// +// On entry, pal_min_state is virtual, on exit it is physical. +// +// On entry RBS is on the MCA/INIT stack, this routine switches RBS +// back to the previous stack. +// +// The psr is set to all zeroes. SAL return requires either all zeroes or +// just psr.mc set. Leaving psr.mc off allows INIT to be issued if this +// code does not perform correctly. +// +// The dirty registers at the time of the event were flushed to the +// MCA/INIT stack in ia64_pt_regs_save(). Restore the dirty registers +// before reverting to the previous bspstore. +//-- -// ok, enough should be saved at this point to be dangerous, and supply -// information for a dump -// We need to switch to Virtual mode before hitting the C functions. +ia64_old_stack: + add regs=MCA_PT_REGS_OFFSET, r3 + mov b0=r2 // save return address + GET_IA64_MCA_DATA(temp2) + LOAD_PHYSICAL(p0,temp1,1f) + ;; + mov cr.ipsr=r0 + mov cr.ifs=r0 + mov cr.iip=temp1 + ;; + invala + rfi +1: - movl r2=IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN - mov r3=psr // get the current psr, minimum enabled at this point + add regs=regs, temp2 // struct pt_regs on MCA or INIT stack ;; - or r2=r2,r3 + add temp1=PT(LOADRS), regs ;; - movl r3=IVirtual_Switch + ld8 temp2=[temp1],PT(AR_BSPSTORE)-PT(LOADRS) // restore loadrs ;; - mov cr.iip=r3 // short return to set the appropriate bits - mov cr.ipsr=r2 // need to do an rfi to set appropriate bits + ld8 temp3=[temp1],PT(AR_RNAT)-PT(AR_BSPSTORE) // restore ar.bspstore + mov ar.rsc=temp2 ;; - rfi + loadrs + ld8 temp4=[temp1] // restore ar.rnat ;; -IVirtual_Switch: - // - // We should now be running virtual - // - // Let's call the C handler to get the rest of the state info - // - alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!) + mov ar.bspstore=temp3 // back to old stack ;; - adds out0=16,sp // out0 = pointer to pt_regs + mov ar.rnat=temp4 ;; - DO_SAVE_SWITCH_STACK - .body - adds out1=16,sp // out0 = pointer to switch_stack - br.call.sptk.many rp=ia64_init_handler -.ret1: + br.sptk b0 + +//EndStub////////////////////////////////////////////////////////////////////// -return_from_init: - br.sptk return_from_init -END(ia64_monarch_init_handler) +//++ +// Name: +// ia64_set_kernel_registers() // -// SAL to OS entry point for INIT on the slave processor -// This has been defined for registration purposes with SAL -// as a part of ia64_mca_init. +// Stub Description: // +// Set the registers that are required by the C code in order to run on an +// MCA/INIT stack. +// +// r2 contains the return address, r3 contains either +// IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET. +// +//-- + +ia64_set_kernel_registers: + add temp3=MCA_SP_OFFSET, r3 + add temp4=MCA_SOS_OFFSET+IA64_SAL_OS_STATE_OS_GP_OFFSET, r3 + mov b0=r2 // save return address + GET_IA64_MCA_DATA(temp1) + ;; + add temp4=temp4, temp1 // &struct ia64_sal_os_state.os_gp + add r12=temp1, temp3 // kernel stack pointer on MCA/INIT stack + add r13=temp1, r3 // set current to start of MCA/INIT stack + ;; + ld8 r1=[temp4] // OS GP from SAL OS state + ;; + DATA_PA_TO_VA(r1,temp1) + DATA_PA_TO_VA(r12,temp2) + DATA_PA_TO_VA(r13,temp3) + ;; + mov IA64_KR(CURRENT)=r13 + + // FIXME: do I need to wire IA64_KR_CURRENT_STACK and IA64_TR_CURRENT_STACK? + + br.sptk b0 + +//EndStub////////////////////////////////////////////////////////////////////// -GLOBAL_ENTRY(ia64_slave_init_handler) -1: br.sptk 1b -END(ia64_slave_init_handler) +#undef ms +#undef regs +#undef temp1 +#undef temp2 +#undef temp3 +#undef temp4 + + +// Support function for mca.c, it is here to avoid using inline asm. Given the +// address of an rnat slot, if that address is below the current ar.bspstore +// then return the contents of that slot, otherwise return the contents of +// ar.rnat. +GLOBAL_ENTRY(ia64_get_rnat) + alloc r14=ar.pfs,1,0,0,0 + mov ar.rsc=0 + ;; + mov r14=ar.bspstore + ;; + cmp.lt p6,p7=in0,r14 + ;; +(p6) ld8 r8=[in0] +(p7) mov r8=ar.rnat + mov ar.rsc=3 + br.ret.sptk.many rp +END(ia64_get_rnat) diff -puN arch/ia64/kernel/mca.c~git-ia64 arch/ia64/kernel/mca.c --- devel/arch/ia64/kernel/mca.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/mca.c 2005-09-07 19:42:41.000000000 -0700 @@ -48,6 +48,9 @@ * Delete dead variables and functions. * Reorder to remove the need for forward declarations and to consolidate * related code. + * + * 2005-08-12 Keith Owens + * Convert MCA/INIT handlers to use per event stacks and SAL/OS state. */ #include #include @@ -77,6 +80,8 @@ #include #include +#include "entry.h" + #if defined(IA64_MCA_DEBUG_INFO) # define IA64_MCA_DEBUG(fmt...) printk(fmt) #else @@ -84,9 +89,8 @@ #endif /* Used by mca_asm.S */ -ia64_mca_sal_to_os_state_t ia64_sal_to_os_handoff_state; -ia64_mca_os_to_sal_state_t ia64_os_to_sal_handoff_state; -u64 ia64_mca_serialize; +u32 ia64_mca_serialize; +s32 ia64_mca_init_leave = -1; DEFINE_PER_CPU(u64, ia64_mca_data); /* == __per_cpu_mca[smp_processor_id()] */ DEFINE_PER_CPU(u64, ia64_mca_per_cpu_pte); /* PTE to map per-CPU area */ DEFINE_PER_CPU(u64, ia64_mca_pal_pte); /* PTE to map PAL code */ @@ -95,8 +99,10 @@ DEFINE_PER_CPU(u64, ia64_mca_pal_base); unsigned long __per_cpu_mca[NR_CPUS]; /* In mca_asm.S */ -extern void ia64_monarch_init_handler (void); -extern void ia64_slave_init_handler (void); +extern void ia64_os_init_dispatch_monarch (void); +extern void ia64_os_init_dispatch_slave (void); + +static int monarch_cpu = -1; static ia64_mc_info_t ia64_mc_info; @@ -234,7 +240,8 @@ ia64_log_get(int sal_info_type, u8 **buf * This function retrieves a specified error record type from SAL * and wakes up any processes waiting for error records. * - * Inputs : sal_info_type (Type of error record MCA/CMC/CPE/INIT) + * Inputs : sal_info_type (Type of error record MCA/CMC/CPE) + * FIXME: remove MCA and irq_safe. */ static void ia64_mca_log_sal_error_record(int sal_info_type) @@ -242,7 +249,7 @@ ia64_mca_log_sal_error_record(int sal_in u8 *buffer; sal_log_record_header_t *rh; u64 size; - int irq_safe = sal_info_type != SAL_INFO_TYPE_MCA && sal_info_type != SAL_INFO_TYPE_INIT; + int irq_safe = sal_info_type != SAL_INFO_TYPE_MCA; #ifdef IA64_MCA_DEBUG_INFO static const char * const rec_name[] = { "MCA", "INIT", "CMC", "CPE" }; #endif @@ -330,191 +337,6 @@ ia64_mca_cpe_int_handler (int cpe_irq, v #endif /* CONFIG_ACPI */ -static void -show_min_state (pal_min_state_area_t *minstate) -{ - u64 iip = minstate->pmsa_iip + ((struct ia64_psr *)(&minstate->pmsa_ipsr))->ri; - u64 xip = minstate->pmsa_xip + ((struct ia64_psr *)(&minstate->pmsa_xpsr))->ri; - - printk("NaT bits\t%016lx\n", minstate->pmsa_nat_bits); - printk("pr\t\t%016lx\n", minstate->pmsa_pr); - printk("b0\t\t%016lx ", minstate->pmsa_br0); print_symbol("%s\n", minstate->pmsa_br0); - printk("ar.rsc\t\t%016lx\n", minstate->pmsa_rsc); - printk("cr.iip\t\t%016lx ", iip); print_symbol("%s\n", iip); - printk("cr.ipsr\t\t%016lx\n", minstate->pmsa_ipsr); - printk("cr.ifs\t\t%016lx\n", minstate->pmsa_ifs); - printk("xip\t\t%016lx ", xip); print_symbol("%s\n", xip); - printk("xpsr\t\t%016lx\n", minstate->pmsa_xpsr); - printk("xfs\t\t%016lx\n", minstate->pmsa_xfs); - printk("b1\t\t%016lx ", minstate->pmsa_br1); - print_symbol("%s\n", minstate->pmsa_br1); - - printk("\nstatic registers r0-r15:\n"); - printk(" r0- 3 %016lx %016lx %016lx %016lx\n", - 0UL, minstate->pmsa_gr[0], minstate->pmsa_gr[1], minstate->pmsa_gr[2]); - printk(" r4- 7 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_gr[3], minstate->pmsa_gr[4], - minstate->pmsa_gr[5], minstate->pmsa_gr[6]); - printk(" r8-11 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_gr[7], minstate->pmsa_gr[8], - minstate->pmsa_gr[9], minstate->pmsa_gr[10]); - printk("r12-15 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_gr[11], minstate->pmsa_gr[12], - minstate->pmsa_gr[13], minstate->pmsa_gr[14]); - - printk("\nbank 0:\n"); - printk("r16-19 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_bank0_gr[0], minstate->pmsa_bank0_gr[1], - minstate->pmsa_bank0_gr[2], minstate->pmsa_bank0_gr[3]); - printk("r20-23 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_bank0_gr[4], minstate->pmsa_bank0_gr[5], - minstate->pmsa_bank0_gr[6], minstate->pmsa_bank0_gr[7]); - printk("r24-27 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_bank0_gr[8], minstate->pmsa_bank0_gr[9], - minstate->pmsa_bank0_gr[10], minstate->pmsa_bank0_gr[11]); - printk("r28-31 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_bank0_gr[12], minstate->pmsa_bank0_gr[13], - minstate->pmsa_bank0_gr[14], minstate->pmsa_bank0_gr[15]); - - printk("\nbank 1:\n"); - printk("r16-19 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_bank1_gr[0], minstate->pmsa_bank1_gr[1], - minstate->pmsa_bank1_gr[2], minstate->pmsa_bank1_gr[3]); - printk("r20-23 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_bank1_gr[4], minstate->pmsa_bank1_gr[5], - minstate->pmsa_bank1_gr[6], minstate->pmsa_bank1_gr[7]); - printk("r24-27 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_bank1_gr[8], minstate->pmsa_bank1_gr[9], - minstate->pmsa_bank1_gr[10], minstate->pmsa_bank1_gr[11]); - printk("r28-31 %016lx %016lx %016lx %016lx\n", - minstate->pmsa_bank1_gr[12], minstate->pmsa_bank1_gr[13], - minstate->pmsa_bank1_gr[14], minstate->pmsa_bank1_gr[15]); -} - -static void -fetch_min_state (pal_min_state_area_t *ms, struct pt_regs *pt, struct switch_stack *sw) -{ - u64 *dst_banked, *src_banked, bit, shift, nat_bits; - int i; - - /* - * First, update the pt-regs and switch-stack structures with the contents stored - * in the min-state area: - */ - if (((struct ia64_psr *) &ms->pmsa_ipsr)->ic == 0) { - pt->cr_ipsr = ms->pmsa_xpsr; - pt->cr_iip = ms->pmsa_xip; - pt->cr_ifs = ms->pmsa_xfs; - } else { - pt->cr_ipsr = ms->pmsa_ipsr; - pt->cr_iip = ms->pmsa_iip; - pt->cr_ifs = ms->pmsa_ifs; - } - pt->ar_rsc = ms->pmsa_rsc; - pt->pr = ms->pmsa_pr; - pt->r1 = ms->pmsa_gr[0]; - pt->r2 = ms->pmsa_gr[1]; - pt->r3 = ms->pmsa_gr[2]; - sw->r4 = ms->pmsa_gr[3]; - sw->r5 = ms->pmsa_gr[4]; - sw->r6 = ms->pmsa_gr[5]; - sw->r7 = ms->pmsa_gr[6]; - pt->r8 = ms->pmsa_gr[7]; - pt->r9 = ms->pmsa_gr[8]; - pt->r10 = ms->pmsa_gr[9]; - pt->r11 = ms->pmsa_gr[10]; - pt->r12 = ms->pmsa_gr[11]; - pt->r13 = ms->pmsa_gr[12]; - pt->r14 = ms->pmsa_gr[13]; - pt->r15 = ms->pmsa_gr[14]; - dst_banked = &pt->r16; /* r16-r31 are contiguous in struct pt_regs */ - src_banked = ms->pmsa_bank1_gr; - for (i = 0; i < 16; ++i) - dst_banked[i] = src_banked[i]; - pt->b0 = ms->pmsa_br0; - sw->b1 = ms->pmsa_br1; - - /* construct the NaT bits for the pt-regs structure: */ -# define PUT_NAT_BIT(dst, addr) \ - do { \ - bit = nat_bits & 1; nat_bits >>= 1; \ - shift = ((unsigned long) addr >> 3) & 0x3f; \ - dst = ((dst) & ~(1UL << shift)) | (bit << shift); \ - } while (0) - - /* Rotate the saved NaT bits such that bit 0 corresponds to pmsa_gr[0]: */ - shift = ((unsigned long) &ms->pmsa_gr[0] >> 3) & 0x3f; - nat_bits = (ms->pmsa_nat_bits >> shift) | (ms->pmsa_nat_bits << (64 - shift)); - - PUT_NAT_BIT(sw->caller_unat, &pt->r1); - PUT_NAT_BIT(sw->caller_unat, &pt->r2); - PUT_NAT_BIT(sw->caller_unat, &pt->r3); - PUT_NAT_BIT(sw->ar_unat, &sw->r4); - PUT_NAT_BIT(sw->ar_unat, &sw->r5); - PUT_NAT_BIT(sw->ar_unat, &sw->r6); - PUT_NAT_BIT(sw->ar_unat, &sw->r7); - PUT_NAT_BIT(sw->caller_unat, &pt->r8); PUT_NAT_BIT(sw->caller_unat, &pt->r9); - PUT_NAT_BIT(sw->caller_unat, &pt->r10); PUT_NAT_BIT(sw->caller_unat, &pt->r11); - PUT_NAT_BIT(sw->caller_unat, &pt->r12); PUT_NAT_BIT(sw->caller_unat, &pt->r13); - PUT_NAT_BIT(sw->caller_unat, &pt->r14); PUT_NAT_BIT(sw->caller_unat, &pt->r15); - nat_bits >>= 16; /* skip over bank0 NaT bits */ - PUT_NAT_BIT(sw->caller_unat, &pt->r16); PUT_NAT_BIT(sw->caller_unat, &pt->r17); - PUT_NAT_BIT(sw->caller_unat, &pt->r18); PUT_NAT_BIT(sw->caller_unat, &pt->r19); - PUT_NAT_BIT(sw->caller_unat, &pt->r20); PUT_NAT_BIT(sw->caller_unat, &pt->r21); - PUT_NAT_BIT(sw->caller_unat, &pt->r22); PUT_NAT_BIT(sw->caller_unat, &pt->r23); - PUT_NAT_BIT(sw->caller_unat, &pt->r24); PUT_NAT_BIT(sw->caller_unat, &pt->r25); - PUT_NAT_BIT(sw->caller_unat, &pt->r26); PUT_NAT_BIT(sw->caller_unat, &pt->r27); - PUT_NAT_BIT(sw->caller_unat, &pt->r28); PUT_NAT_BIT(sw->caller_unat, &pt->r29); - PUT_NAT_BIT(sw->caller_unat, &pt->r30); PUT_NAT_BIT(sw->caller_unat, &pt->r31); -} - -static void -init_handler_platform (pal_min_state_area_t *ms, - struct pt_regs *pt, struct switch_stack *sw) -{ - struct unw_frame_info info; - - /* if a kernel debugger is available call it here else just dump the registers */ - - /* - * Wait for a bit. On some machines (e.g., HP's zx2000 and zx6000, INIT can be - * generated via the BMC's command-line interface, but since the console is on the - * same serial line, the user will need some time to switch out of the BMC before - * the dump begins. - */ - printk("Delaying for 5 seconds...\n"); - udelay(5*1000000); - show_min_state(ms); - - printk("Backtrace of current task (pid %d, %s)\n", current->pid, current->comm); - fetch_min_state(ms, pt, sw); - unw_init_from_interruption(&info, current, pt, sw); - ia64_do_show_stack(&info, NULL); - -#ifdef CONFIG_SMP - /* read_trylock() would be handy... */ - if (!tasklist_lock.write_lock) - read_lock(&tasklist_lock); -#endif - { - struct task_struct *g, *t; - do_each_thread (g, t) { - if (t == current) - continue; - - printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); - show_stack(t, NULL); - } while_each_thread (g, t); - } -#ifdef CONFIG_SMP - if (!tasklist_lock.write_lock) - read_unlock(&tasklist_lock); -#endif - - printk("\nINIT dump complete. Please reboot now.\n"); - while (1); /* hang city if no debugger */ -} - #ifdef CONFIG_ACPI /* * ia64_mca_register_cpev @@ -657,42 +479,6 @@ ia64_mca_cmc_vector_enable_keventd(void } /* - * ia64_mca_wakeup_ipi_wait - * - * Wait for the inter-cpu interrupt to be sent by the - * monarch processor once it is done with handling the - * MCA. - * - * Inputs : None - * Outputs : None - */ -static void -ia64_mca_wakeup_ipi_wait(void) -{ - int irr_num = (IA64_MCA_WAKEUP_VECTOR >> 6); - int irr_bit = (IA64_MCA_WAKEUP_VECTOR & 0x3f); - u64 irr = 0; - - do { - switch(irr_num) { - case 0: - irr = ia64_getreg(_IA64_REG_CR_IRR0); - break; - case 1: - irr = ia64_getreg(_IA64_REG_CR_IRR1); - break; - case 2: - irr = ia64_getreg(_IA64_REG_CR_IRR2); - break; - case 3: - irr = ia64_getreg(_IA64_REG_CR_IRR3); - break; - } - cpu_relax(); - } while (!(irr & (1UL << irr_bit))) ; -} - -/* * ia64_mca_wakeup * * Send an inter-cpu interrupt to wake-up a particular cpu @@ -757,11 +543,9 @@ ia64_mca_rendez_int_handler(int rendez_i */ ia64_sal_mc_rendez(); - /* Wait for the wakeup IPI from the monarch - * This waiting is done by polling on the wakeup-interrupt - * vector bit in the processor's IRRs - */ - ia64_mca_wakeup_ipi_wait(); + /* Wait for the monarch cpu to exit. */ + while (monarch_cpu != -1) + cpu_relax(); /* spin until monarch leaves */ /* Enable all interrupts */ local_irq_restore(flags); @@ -789,53 +573,13 @@ ia64_mca_wakeup_int_handler(int wakeup_i return IRQ_HANDLED; } -/* - * ia64_return_to_sal_check - * - * This is function called before going back from the OS_MCA handler - * to the OS_MCA dispatch code which finally takes the control back - * to the SAL. - * The main purpose of this routine is to setup the OS_MCA to SAL - * return state which can be used by the OS_MCA dispatch code - * just before going back to SAL. - * - * Inputs : None - * Outputs : None - */ - -static void -ia64_return_to_sal_check(int recover) -{ - - /* Copy over some relevant stuff from the sal_to_os_mca_handoff - * so that it can be used at the time of os_mca_to_sal_handoff - */ - ia64_os_to_sal_handoff_state.imots_sal_gp = - ia64_sal_to_os_handoff_state.imsto_sal_gp; - - ia64_os_to_sal_handoff_state.imots_sal_check_ra = - ia64_sal_to_os_handoff_state.imsto_sal_check_ra; - - if (recover) - ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_CORRECTED; - else - ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_COLD_BOOT; - - /* Default = tell SAL to return to same context */ - ia64_os_to_sal_handoff_state.imots_context = IA64_MCA_SAME_CONTEXT; - - ia64_os_to_sal_handoff_state.imots_new_min_state = - (u64 *)ia64_sal_to_os_handoff_state.pal_min_state; - -} - /* Function pointer for extra MCA recovery */ int (*ia64_mca_ucmc_extension) - (void*,ia64_mca_sal_to_os_state_t*,ia64_mca_os_to_sal_state_t*) + (void*,struct ia64_sal_os_state*) = NULL; int -ia64_reg_MCA_extension(void *fn) +ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *)) { if (ia64_mca_ucmc_extension) return 1; @@ -854,8 +598,324 @@ ia64_unreg_MCA_extension(void) EXPORT_SYMBOL(ia64_reg_MCA_extension); EXPORT_SYMBOL(ia64_unreg_MCA_extension); + +static inline void +copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat) +{ + u64 fslot, tslot, nat; + *tr = *fr; + fslot = ((unsigned long)fr >> 3) & 63; + tslot = ((unsigned long)tr >> 3) & 63; + *tnat &= ~(1UL << tslot); + nat = (fnat >> fslot) & 1; + *tnat |= (nat << tslot); +} + +/* On entry to this routine, we are running on the per cpu stack, see + * mca_asm.h. The original stack has not been touched by this event. Some of + * the original stack's registers will be in the RBS on this stack. This stack + * also contains a partial pt_regs and switch_stack, the rest of the data is in + * PAL minstate. + * + * The first thing to do is modify the original stack to look like a blocked + * task so we can run backtrace on the original task. Also mark the per cpu + * stack as current to ensure that we use the correct task state, it also means + * that we can do backtrace on the MCA/INIT handler code itself. + */ + +static task_t * +ia64_mca_modify_original_stack(struct pt_regs *regs, + const struct switch_stack *sw, + struct ia64_sal_os_state *sos, + const char *type) +{ + char *p, comm[sizeof(current->comm)]; + ia64_va va; + extern char ia64_leave_kernel[]; /* Need asm address, not function descriptor */ + const pal_min_state_area_t *ms = sos->pal_min_state; + task_t *previous_current; + struct pt_regs *old_regs; + struct switch_stack *old_sw; + unsigned size = sizeof(struct pt_regs) + + sizeof(struct switch_stack) + 16; + u64 *old_bspstore, *old_bsp; + u64 *new_bspstore, *new_bsp; + u64 old_unat, old_rnat, new_rnat, nat; + u64 slots, loadrs = regs->loadrs; + u64 r12 = ms->pmsa_gr[12-1], r13 = ms->pmsa_gr[13-1]; + u64 ar_bspstore = regs->ar_bspstore; + u64 ar_bsp = regs->ar_bspstore + (loadrs >> 16); + const u64 *bank; + const char *msg; + int cpu = smp_processor_id(); + + previous_current = curr_task(cpu); + set_curr_task(cpu, current); + if ((p = strchr(current->comm, ' '))) + *p = '\0'; + + /* Best effort attempt to cope with MCA/INIT delivered while in + * physical mode. + */ + regs->cr_ipsr = ms->pmsa_ipsr; + if (ia64_psr(regs)->dt == 0) { + va.l = r12; + if (va.f.reg == 0) { + va.f.reg = 7; + r12 = va.l; + } + va.l = r13; + if (va.f.reg == 0) { + va.f.reg = 7; + r13 = va.l; + } + } + if (ia64_psr(regs)->rt == 0) { + va.l = ar_bspstore; + if (va.f.reg == 0) { + va.f.reg = 7; + ar_bspstore = va.l; + } + va.l = ar_bsp; + if (va.f.reg == 0) { + va.f.reg = 7; + ar_bsp = va.l; + } + } + + /* mca_asm.S ia64_old_stack() cannot assume that the dirty registers + * have been copied to the old stack, the old stack may fail the + * validation tests below. So ia64_old_stack() must restore the dirty + * registers from the new stack. The old and new bspstore probably + * have different alignments, so loadrs calculated on the old bsp + * cannot be used to restore from the new bsp. Calculate a suitable + * loadrs for the new stack and save it in the new pt_regs, where + * ia64_old_stack() can get it. + */ + old_bspstore = (u64 *)ar_bspstore; + old_bsp = (u64 *)ar_bsp; + slots = ia64_rse_num_regs(old_bspstore, old_bsp); + new_bspstore = (u64 *)((u64)current + IA64_RBS_OFFSET); + new_bsp = ia64_rse_skip_regs(new_bspstore, slots); + regs->loadrs = (new_bsp - new_bspstore) * 8 << 16; + + /* Verify the previous stack state before we change it */ + if (user_mode(regs)) { + msg = "occurred in user space"; + goto no_mod; + } + if ((r12 & -KERNEL_STACK_SIZE) != r13) { + msg = "inconsistent r12 and r13"; + goto no_mod; + } + if ((ar_bspstore & -KERNEL_STACK_SIZE) != r13) { + msg = "inconsistent ar.bspstore and r13"; + goto no_mod; + } + va.p = old_bspstore; + if (va.f.reg < 5) { + msg = "old_bspstore is in the wrong region"; + goto no_mod; + } + if ((ar_bsp & -KERNEL_STACK_SIZE) != r13) { + msg = "inconsistent ar.bsp and r13"; + goto no_mod; + } + size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8; + if (ar_bspstore + size > r12) { + msg = "no room for blocked state"; + goto no_mod; + } + + /* Change the comm field on the MCA/INT task to include the pid that + * was interrupted, it makes for easier debugging. If that pid was 0 + * (swapper or nested MCA/INIT) then use the start of the previous comm + * field suffixed with its cpu. + */ + if (previous_current->pid) + snprintf(comm, sizeof(comm), "%s %d", + current->comm, previous_current->pid); + else { + int l; + if ((p = strchr(previous_current->comm, ' '))) + l = p - previous_current->comm; + else + l = strlen(previous_current->comm); + snprintf(comm, sizeof(comm), "%s %*s %d", + current->comm, l, previous_current->comm, + previous_current->thread_info->cpu); + } + memcpy(current->comm, comm, sizeof(current->comm)); + + /* Make the original task look blocked. First stack a struct pt_regs, + * describing the state at the time of interrupt. mca_asm.S built a + * partial pt_regs, copy it and fill in the blanks using minstate. + */ + p = (char *)r12 - sizeof(*regs); + old_regs = (struct pt_regs *)p; + memcpy(old_regs, regs, sizeof(*regs)); + /* If ipsr.ic then use pmsa_{iip,ipsr,ifs}, else use + * pmsa_{xip,xpsr,xfs} + */ + if (ia64_psr(regs)->ic) { + old_regs->cr_iip = ms->pmsa_iip; + old_regs->cr_ipsr = ms->pmsa_ipsr; + old_regs->cr_ifs = ms->pmsa_ifs; + } else { + old_regs->cr_iip = ms->pmsa_xip; + old_regs->cr_ipsr = ms->pmsa_xpsr; + old_regs->cr_ifs = ms->pmsa_xfs; + } + old_regs->pr = ms->pmsa_pr; + old_regs->b0 = ms->pmsa_br0; + old_regs->loadrs = loadrs; + old_regs->ar_rsc = ms->pmsa_rsc; + old_unat = old_regs->ar_unat; + copy_reg(&ms->pmsa_gr[1-1], ms->pmsa_nat_bits, &old_regs->r1, &old_unat); + copy_reg(&ms->pmsa_gr[2-1], ms->pmsa_nat_bits, &old_regs->r2, &old_unat); + copy_reg(&ms->pmsa_gr[3-1], ms->pmsa_nat_bits, &old_regs->r3, &old_unat); + copy_reg(&ms->pmsa_gr[8-1], ms->pmsa_nat_bits, &old_regs->r8, &old_unat); + copy_reg(&ms->pmsa_gr[9-1], ms->pmsa_nat_bits, &old_regs->r9, &old_unat); + copy_reg(&ms->pmsa_gr[10-1], ms->pmsa_nat_bits, &old_regs->r10, &old_unat); + copy_reg(&ms->pmsa_gr[11-1], ms->pmsa_nat_bits, &old_regs->r11, &old_unat); + copy_reg(&ms->pmsa_gr[12-1], ms->pmsa_nat_bits, &old_regs->r12, &old_unat); + copy_reg(&ms->pmsa_gr[13-1], ms->pmsa_nat_bits, &old_regs->r13, &old_unat); + copy_reg(&ms->pmsa_gr[14-1], ms->pmsa_nat_bits, &old_regs->r14, &old_unat); + copy_reg(&ms->pmsa_gr[15-1], ms->pmsa_nat_bits, &old_regs->r15, &old_unat); + if (ia64_psr(old_regs)->bn) + bank = ms->pmsa_bank1_gr; + else + bank = ms->pmsa_bank0_gr; + copy_reg(&bank[16-16], ms->pmsa_nat_bits, &old_regs->r16, &old_unat); + copy_reg(&bank[17-16], ms->pmsa_nat_bits, &old_regs->r17, &old_unat); + copy_reg(&bank[18-16], ms->pmsa_nat_bits, &old_regs->r18, &old_unat); + copy_reg(&bank[19-16], ms->pmsa_nat_bits, &old_regs->r19, &old_unat); + copy_reg(&bank[20-16], ms->pmsa_nat_bits, &old_regs->r20, &old_unat); + copy_reg(&bank[21-16], ms->pmsa_nat_bits, &old_regs->r21, &old_unat); + copy_reg(&bank[22-16], ms->pmsa_nat_bits, &old_regs->r22, &old_unat); + copy_reg(&bank[23-16], ms->pmsa_nat_bits, &old_regs->r23, &old_unat); + copy_reg(&bank[24-16], ms->pmsa_nat_bits, &old_regs->r24, &old_unat); + copy_reg(&bank[25-16], ms->pmsa_nat_bits, &old_regs->r25, &old_unat); + copy_reg(&bank[26-16], ms->pmsa_nat_bits, &old_regs->r26, &old_unat); + copy_reg(&bank[27-16], ms->pmsa_nat_bits, &old_regs->r27, &old_unat); + copy_reg(&bank[28-16], ms->pmsa_nat_bits, &old_regs->r28, &old_unat); + copy_reg(&bank[29-16], ms->pmsa_nat_bits, &old_regs->r29, &old_unat); + copy_reg(&bank[30-16], ms->pmsa_nat_bits, &old_regs->r30, &old_unat); + copy_reg(&bank[31-16], ms->pmsa_nat_bits, &old_regs->r31, &old_unat); + + /* Next stack a struct switch_stack. mca_asm.S built a partial + * switch_stack, copy it and fill in the blanks using pt_regs and + * minstate. + * + * In the synthesized switch_stack, b0 points to ia64_leave_kernel, + * ar.pfs is set to 0. + * + * unwind.c::unw_unwind() does special processing for interrupt frames. + * It checks if the PRED_NON_SYSCALL predicate is set, if the predicate + * is clear then unw_unwind() does _not_ adjust bsp over pt_regs. Not + * that this is documented, of course. Set PRED_NON_SYSCALL in the + * switch_stack on the original stack so it will unwind correctly when + * unwind.c reads pt_regs. + * + * thread.ksp is updated to point to the synthesized switch_stack. + */ + p -= sizeof(struct switch_stack); + old_sw = (struct switch_stack *)p; + memcpy(old_sw, sw, sizeof(*sw)); + old_sw->caller_unat = old_unat; + old_sw->ar_fpsr = old_regs->ar_fpsr; + copy_reg(&ms->pmsa_gr[4-1], ms->pmsa_nat_bits, &old_sw->r4, &old_unat); + copy_reg(&ms->pmsa_gr[5-1], ms->pmsa_nat_bits, &old_sw->r5, &old_unat); + copy_reg(&ms->pmsa_gr[6-1], ms->pmsa_nat_bits, &old_sw->r6, &old_unat); + copy_reg(&ms->pmsa_gr[7-1], ms->pmsa_nat_bits, &old_sw->r7, &old_unat); + old_sw->b0 = (u64)ia64_leave_kernel; + old_sw->b1 = ms->pmsa_br1; + old_sw->ar_pfs = 0; + old_sw->ar_unat = old_unat; + old_sw->pr = old_regs->pr | (1UL << PRED_NON_SYSCALL); + previous_current->thread.ksp = (u64)p - 16; + + /* Finally copy the original stack's registers back to its RBS. + * Registers from ar.bspstore through ar.bsp at the time of the event + * are in the current RBS, copy them back to the original stack. The + * copy must be done register by register because the original bspstore + * and the current one have different alignments, so the saved RNAT + * data occurs at different places. + * + * mca_asm does cover, so the old_bsp already includes all registers at + * the time of MCA/INIT. It also does flushrs, so all registers before + * this function have been written to backing store on the MCA/INIT + * stack. + */ + new_rnat = ia64_get_rnat(ia64_rse_rnat_addr(new_bspstore)); + old_rnat = regs->ar_rnat; + while (slots--) { + if (ia64_rse_is_rnat_slot(new_bspstore)) { + new_rnat = ia64_get_rnat(new_bspstore++); + } + if (ia64_rse_is_rnat_slot(old_bspstore)) { + *old_bspstore++ = old_rnat; + old_rnat = 0; + } + nat = (new_rnat >> ia64_rse_slot_num(new_bspstore)) & 1UL; + old_rnat &= ~(1UL << ia64_rse_slot_num(old_bspstore)); + old_rnat |= (nat << ia64_rse_slot_num(old_bspstore)); + *old_bspstore++ = *new_bspstore++; + } + old_sw->ar_bspstore = (unsigned long)old_bspstore; + old_sw->ar_rnat = old_rnat; + + sos->prev_task = previous_current; + return previous_current; + +no_mod: + printk(KERN_INFO "cpu %d, %s %s, original stack not modified\n", + smp_processor_id(), type, msg); + return previous_current; +} + +/* The monarch/slave interaction is based on monarch_cpu and requires that all + * slaves have entered rendezvous before the monarch leaves. If any cpu has + * not entered rendezvous yet then wait a bit. The assumption is that any + * slave that has not rendezvoused after a reasonable time is never going to do + * so. In this context, slave includes cpus that respond to the MCA rendezvous + * interrupt, as well as cpus that receive the INIT slave event. + */ + +static void +ia64_wait_for_slaves(int monarch) +{ + int c, wait = 0; + for_each_online_cpu(c) { + if (c == monarch) + continue; + if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) { + udelay(1000); /* short wait first */ + wait = 1; + break; + } + } + if (!wait) + return; + for_each_online_cpu(c) { + if (c == monarch) + continue; + if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) { + udelay(5*1000000); /* wait 5 seconds for slaves (arbitrary) */ + break; + } + } +} + +static void +mca_init_leave(int cpu) +{ + while (cmpxchg_acq(&ia64_mca_init_leave, -1, cpu) != -1) + cpu_relax(); +} + /* - * ia64_mca_ucmc_handler + * ia64_mca_handler * * This is uncorrectable machine check handler called from OS_MCA * dispatch code which is in turn called from SAL_CHECK(). @@ -866,16 +926,28 @@ EXPORT_SYMBOL(ia64_unreg_MCA_extension); * further MCA logging is enabled by clearing logs. * Monarch also has the duty of sending wakeup-IPIs to pull the * slave processors out of rendezvous spinloop. - * - * Inputs : None - * Outputs : None */ void -ia64_mca_ucmc_handler(void) +ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, + struct ia64_sal_os_state *sos) { pal_processor_state_info_t *psp = (pal_processor_state_info_t *) - &ia64_sal_to_os_handoff_state.proc_state_param; - int recover; + &sos->proc_state_param; + int recover, cpu = smp_processor_id(); + task_t *previous_current; + + oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ + previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA"); + monarch_cpu = cpu; + ia64_wait_for_slaves(cpu); + + /* Wakeup all the processors which are spinning in the rendezvous loop. + * They will leave SAL, then spin in the OS with interrupts disabled + * until this monarch cpu leaves the MCA handler. That gets control + * back to the OS so we can backtrace the other cpus, backtrace when + * spinning in SAL does not work. + */ + ia64_mca_wakeup_all(); /* Get the MCA error record and log it */ ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA); @@ -883,25 +955,21 @@ ia64_mca_ucmc_handler(void) /* TLB error is only exist in this SAL error record */ recover = (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc)) /* other error recovery */ - || (ia64_mca_ucmc_extension + || (ia64_mca_ucmc_extension && ia64_mca_ucmc_extension( IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA), - &ia64_sal_to_os_handoff_state, - &ia64_os_to_sal_handoff_state)); + sos)); if (recover) { sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA); rh->severity = sal_log_severity_corrected; ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA); + sos->os_status = IA64_MCA_CORRECTED; } - /* - * Wakeup all the processors which are spinning in the rendezvous - * loop. - */ - ia64_mca_wakeup_all(); - /* Return to SAL */ - ia64_return_to_sal_check(recover); + set_curr_task(cpu, previous_current); + mca_init_leave(cpu); + monarch_cpu = -1; } static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd, NULL); @@ -1125,34 +1193,87 @@ ia64_mca_cpe_poll (unsigned long dummy) /* * C portion of the OS INIT handler * - * Called from ia64_monarch_init_handler - * - * Inputs: pointer to pt_regs where processor info was saved. - * - * Returns: - * 0 if SAL must warm boot the System - * 1 if SAL must return to interrupted context using PAL_MC_RESUME + * Called from ia64_os_init_dispatch * + * Inputs: pointer to pt_regs where processor info was saved. SAL/OS state for + * this event. This code is used for both monarch and slave INIT events, see + * sos->monarch. + * + * All INIT events switch to the INIT stack and change the previous process to + * blocked status. If one of the INIT events is the monarch then we are + * probably processing the nmi button/command. Use the monarch cpu to dump all + * the processes. The slave INIT events all spin until the monarch cpu + * returns. We can also get INIT slave events for MCA, in which case the MCA + * process is the monarch. */ + void -ia64_init_handler (struct pt_regs *pt, struct switch_stack *sw) +ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, + struct ia64_sal_os_state *sos) { - pal_min_state_area_t *ms; + task_t *previous_current; + int cpu = smp_processor_id(), c; + struct task_struct *g, *t; - oops_in_progress = 1; /* avoid deadlock in printk, but it makes recovery dodgy */ + oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ console_loglevel = 15; /* make sure printks make it to console */ printk(KERN_INFO "Entered OS INIT handler. PSP=%lx\n", - ia64_sal_to_os_handoff_state.proc_state_param); + sos->proc_state_param); + salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0); + + previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "INIT"); + sos->os_status = IA64_INIT_RESUME; + if (!sos->monarch) { + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_INIT; + while (monarch_cpu == -1) + cpu_relax(); /* spin until monarch enters */ + while (monarch_cpu != -1) + cpu_relax(); /* spin until monarch leaves */ + printk("slave returning %d\n", cpu); + set_curr_task(cpu, previous_current); + mca_init_leave(cpu); + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; + return; + } + + monarch_cpu = cpu; /* - * Address of minstate area provided by PAL is physical, - * uncacheable (bit 63 set). Convert to Linux virtual - * address in region 6. + * Wait for a bit. On some machines (e.g., HP's zx2000 and zx6000, INIT can be + * generated via the BMC's command-line interface, but since the console is on the + * same serial line, the user will need some time to switch out of the BMC before + * the dump begins. */ - ms = (pal_min_state_area_t *)(ia64_sal_to_os_handoff_state.pal_min_state | (6ul<<61)); - - init_handler_platform(ms, pt, sw); /* call platform specific routines */ + printk("Delaying for 5 seconds...\n"); + udelay(5*1000000); + ia64_wait_for_slaves(cpu); + printk(KERN_ERR "Processes interrupted by INIT -"); + for_each_online_cpu(c) { + struct ia64_sal_os_state *s; + t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET); + s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET); + g = s->prev_task; + if (g) { + if (g->pid) + printk(" %d", g->pid); + else + printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g); + } + } + printk("\n\n"); + if (read_trylock(&tasklist_lock)) { + do_each_thread (g, t) { + printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); + show_stack(t, NULL); + } while_each_thread (g, t); + read_unlock(&tasklist_lock); + } + printk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu); + set_curr_task(cpu, previous_current); + mca_init_leave(cpu); + monarch_cpu = -1; + return; } static int __init @@ -1202,6 +1323,34 @@ static struct irqaction mca_cpep_irqacti }; #endif /* CONFIG_ACPI */ +/* Minimal format of the MCA/INIT stacks. The pseudo processes that run on + * these stacks can never sleep, they cannot return from the kernel to user + * space, they do not appear in a normal ps listing. So there is no need to + * format most of the fields. + */ + +static void +format_mca_init_stack(void *mca_data, unsigned long offset, + const char *type, int cpu) +{ + struct task_struct *p = (struct task_struct *)((char *)mca_data + offset); + struct thread_info *ti; + memset(p, 0, KERNEL_STACK_SIZE); + ti = (struct thread_info *)((char *)p + IA64_TASK_SIZE); + ti->flags = _TIF_MCA_INIT; + ti->preempt_count = 1; + ti->task = p; + ti->cpu = cpu; + p->thread_info = ti; + p->state = TASK_UNINTERRUPTIBLE; + __set_bit(cpu, &p->cpus_allowed); + INIT_LIST_HEAD(&p->tasks); + p->parent = p->real_parent = p->group_leader = p; + INIT_LIST_HEAD(&p->children); + INIT_LIST_HEAD(&p->sibling); + strncpy(p->comm, type, sizeof(p->comm)-1); +} + /* Do per-CPU MCA-related initialization. */ void __devinit @@ -1214,19 +1363,28 @@ ia64_mca_cpu_init(void *cpu_data) int cpu; mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu) - * NR_CPUS); + * NR_CPUS + KERNEL_STACK_SIZE); + mca_data = (void *)(((unsigned long)mca_data + + KERNEL_STACK_SIZE - 1) & + (-KERNEL_STACK_SIZE)); for (cpu = 0; cpu < NR_CPUS; cpu++) { + format_mca_init_stack(mca_data, + offsetof(struct ia64_mca_cpu, mca_stack), + "MCA", cpu); + format_mca_init_stack(mca_data, + offsetof(struct ia64_mca_cpu, init_stack), + "INIT", cpu); __per_cpu_mca[cpu] = __pa(mca_data); mca_data += sizeof(struct ia64_mca_cpu); } } - /* - * The MCA info structure was allocated earlier and its - * physical address saved in __per_cpu_mca[cpu]. Copy that - * address * to ia64_mca_data so we can access it as a per-CPU - * variable. - */ + /* + * The MCA info structure was allocated earlier and its + * physical address saved in __per_cpu_mca[cpu]. Copy that + * address * to ia64_mca_data so we can access it as a per-CPU + * variable. + */ __get_cpu_var(ia64_mca_data) = __per_cpu_mca[smp_processor_id()]; /* @@ -1236,11 +1394,11 @@ ia64_mca_cpu_init(void *cpu_data) __get_cpu_var(ia64_mca_per_cpu_pte) = pte_val(mk_pte_phys(__pa(cpu_data), PAGE_KERNEL)); - /* - * Also, stash away a copy of the PAL address and the PTE - * needed to map it. - */ - pal_vaddr = efi_get_pal_addr(); + /* + * Also, stash away a copy of the PAL address and the PTE + * needed to map it. + */ + pal_vaddr = efi_get_pal_addr(); if (!pal_vaddr) return; __get_cpu_var(ia64_mca_pal_base) = @@ -1272,8 +1430,8 @@ ia64_mca_cpu_init(void *cpu_data) void __init ia64_mca_init(void) { - ia64_fptr_t *mon_init_ptr = (ia64_fptr_t *)ia64_monarch_init_handler; - ia64_fptr_t *slave_init_ptr = (ia64_fptr_t *)ia64_slave_init_handler; + ia64_fptr_t *init_hldlr_ptr_monarch = (ia64_fptr_t *)ia64_os_init_dispatch_monarch; + ia64_fptr_t *init_hldlr_ptr_slave = (ia64_fptr_t *)ia64_os_init_dispatch_slave; ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch; int i; s64 rc; @@ -1351,9 +1509,9 @@ ia64_mca_init(void) * XXX - disable SAL checksum by setting size to 0, should be * size of the actual init handler in mca_asm.S. */ - ia64_mc_info.imi_monarch_init_handler = ia64_tpa(mon_init_ptr->fp); + ia64_mc_info.imi_monarch_init_handler = ia64_tpa(init_hldlr_ptr_monarch->fp); ia64_mc_info.imi_monarch_init_handler_size = 0; - ia64_mc_info.imi_slave_init_handler = ia64_tpa(slave_init_ptr->fp); + ia64_mc_info.imi_slave_init_handler = ia64_tpa(init_hldlr_ptr_slave->fp); ia64_mc_info.imi_slave_init_handler_size = 0; IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __FUNCTION__, diff -puN arch/ia64/kernel/mca_drv.c~git-ia64 arch/ia64/kernel/mca_drv.c --- devel/arch/ia64/kernel/mca_drv.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/mca_drv.c 2005-09-07 19:42:41.000000000 -0700 @@ -4,6 +4,8 @@ * * Copyright (C) 2004 FUJITSU LIMITED * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) + * Copyright (C) 2005 Silicon Graphics, Inc + * Copyright (C) 2005 Keith Owens */ #include #include @@ -38,10 +40,6 @@ /* max size of SAL error record (default) */ static int sal_rec_max = 10000; -/* from mca.c */ -static ia64_mca_sal_to_os_state_t *sal_to_os_handoff_state; -static ia64_mca_os_to_sal_state_t *os_to_sal_handoff_state; - /* from mca_drv_asm.S */ extern void *mca_handler_bhhook(void); @@ -316,7 +314,8 @@ init_record_index_pools(void) */ static mca_type_t -is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci) +is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci, + struct ia64_sal_os_state *sos) { pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx); @@ -327,7 +326,7 @@ is_mca_global(peidx_table_t *peidx, pal_ * Therefore it is local MCA when rendezvous has not been requested. * Failed to rendezvous, the system must be down. */ - switch (sal_to_os_handoff_state->imsto_rendez_state) { + switch (sos->rv_rc) { case -1: /* SAL rendezvous unsuccessful */ return MCA_IS_GLOBAL; case 0: /* SAL rendezvous not required */ @@ -388,7 +387,8 @@ is_mca_global(peidx_table_t *peidx, pal_ */ static int -recover_from_read_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci) +recover_from_read_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci, + struct ia64_sal_os_state *sos) { sal_log_mod_error_info_t *smei; pal_min_state_area_t *pmsa; @@ -426,7 +426,7 @@ recover_from_read_error(slidx_table_t *s * setup for resume to bottom half of MCA, * "mca_handler_bhhook" */ - pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61)); + pmsa = sos->pal_min_state; /* pass to bhhook as 1st argument (gr8) */ pmsa->pmsa_gr[8-1] = smei->target_identifier; /* set interrupted return address (but no use) */ @@ -459,7 +459,8 @@ recover_from_read_error(slidx_table_t *s */ static int -recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci) +recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci, + struct ia64_sal_os_state *sos) { int status = 0; pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx); @@ -469,7 +470,7 @@ recover_from_platform_error(slidx_table_ case 1: /* partial read */ case 3: /* full line(cpu) read */ case 9: /* I/O space read */ - status = recover_from_read_error(slidx, peidx, pbci); + status = recover_from_read_error(slidx, peidx, pbci, sos); break; case 0: /* unknown */ case 2: /* partial write */ @@ -508,7 +509,8 @@ recover_from_platform_error(slidx_table_ */ static int -recover_from_processor_error(int platform, slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci) +recover_from_processor_error(int platform, slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci, + struct ia64_sal_os_state *sos) { pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx); @@ -545,7 +547,7 @@ recover_from_processor_error(int platfor * This means "there are some platform errors". */ if (platform) - return recover_from_platform_error(slidx, peidx, pbci); + return recover_from_platform_error(slidx, peidx, pbci, sos); /* * On account of strange SAL error record, we cannot recover. */ @@ -562,8 +564,7 @@ recover_from_processor_error(int platfor static int mca_try_to_recover(void *rec, - ia64_mca_sal_to_os_state_t *sal_to_os_state, - ia64_mca_os_to_sal_state_t *os_to_sal_state) + struct ia64_sal_os_state *sos) { int platform_err; int n_proc_err; @@ -571,10 +572,6 @@ mca_try_to_recover(void *rec, peidx_table_t peidx; pal_bus_check_info_t pbci; - /* handoff state from/to mca.c */ - sal_to_os_handoff_state = sal_to_os_state; - os_to_sal_handoff_state = os_to_sal_state; - /* Make index of SAL error record */ platform_err = mca_make_slidx(rec, &slidx); @@ -597,11 +594,11 @@ mca_try_to_recover(void *rec, *((u64*)&pbci) = peidx_check_info(&peidx, bus_check, 0); /* Check whether MCA is global or not */ - if (is_mca_global(&peidx, &pbci)) + if (is_mca_global(&peidx, &pbci, sos)) return 0; /* Try to recover a processor error */ - return recover_from_processor_error(platform_err, &slidx, &peidx, &pbci); + return recover_from_processor_error(platform_err, &slidx, &peidx, &pbci, sos); } /* diff -puN arch/ia64/kernel/minstate.h~git-ia64 arch/ia64/kernel/minstate.h --- devel/arch/ia64/kernel/minstate.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/minstate.h 2005-09-07 19:42:41.000000000 -0700 @@ -5,73 +5,6 @@ #include "entry.h" /* - * For ivt.s we want to access the stack virtually so we don't have to disable translation - * on interrupts. - * - * On entry: - * r1: pointer to current task (ar.k6) - */ -#define MINSTATE_START_SAVE_MIN_VIRT \ -(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ - ;; \ -(pUStk) mov.m r24=ar.rnat; \ -(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \ -(pKStk) mov r1=sp; /* get sp */ \ - ;; \ -(pUStk) lfetch.fault.excl.nt1 [r22]; \ -(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ -(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ - ;; \ -(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ -(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ - ;; \ -(pUStk) mov r18=ar.bsp; \ -(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ - -#define MINSTATE_END_SAVE_MIN_VIRT \ - bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ - ;; - -/* - * For mca_asm.S we want to access the stack physically since the state is saved before we - * go virtual and don't want to destroy the iip or ipsr. - */ -#define MINSTATE_START_SAVE_MIN_PHYS \ -(pKStk) mov r3=IA64_KR(PER_CPU_DATA);; \ -(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;; \ -(pKStk) ld8 r3 = [r3];; \ -(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;; \ -(pKStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3; \ -(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ -(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \ - ;; \ -(pUStk) mov r24=ar.rnat; \ -(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ -(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ -(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \ - ;; \ -(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ - ;; \ -(pUStk) mov r18=ar.bsp; \ -(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ - -#define MINSTATE_END_SAVE_MIN_PHYS \ - dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \ - ;; - -#ifdef MINSTATE_VIRT -# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT) -# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT -# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT -#endif - -#ifdef MINSTATE_PHYS -# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg -# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS -# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS -#endif - -/* * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves * the minimum state necessary that allows us to turn psr.ic back * on. @@ -97,7 +30,7 @@ * we can pass interruption state as arguments to a handler. */ #define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \ - MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \ + mov r16=IA64_KR(CURRENT); /* M */ \ mov r27=ar.rsc; /* M */ \ mov r20=r1; /* A */ \ mov r25=ar.unat; /* M */ \ @@ -118,7 +51,21 @@ SAVE_IFS; \ cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \ ;; \ - MINSTATE_START_SAVE_MIN \ +(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ + ;; \ +(pUStk) mov.m r24=ar.rnat; \ +(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \ +(pKStk) mov r1=sp; /* get sp */ \ + ;; \ +(pUStk) lfetch.fault.excl.nt1 [r22]; \ +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ +(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ + ;; \ +(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ + ;; \ +(pUStk) mov r18=ar.bsp; \ +(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \ adds r16=PT(CR_IPSR),r1; \ ;; \ @@ -181,7 +128,8 @@ EXTRA; \ movl r1=__gp; /* establish kernel global pointer */ \ ;; \ - MINSTATE_END_SAVE_MIN + bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ + ;; /* * SAVE_REST saves the remainder of pt_regs (with psr.ic on). diff -puN arch/ia64/kernel/palinfo.c~git-ia64 arch/ia64/kernel/palinfo.c --- devel/arch/ia64/kernel/palinfo.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/palinfo.c 2005-09-07 19:42:41.000000000 -0700 @@ -307,11 +307,9 @@ vm_info(char *page) if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) { printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status); - return 0; - } + } else { - - p += sprintf(p, + p += sprintf(p, "Physical Address Space : %d bits\n" "Virtual Address Space : %d bits\n" "Protection Key Registers(PKR) : %d\n" @@ -319,92 +317,99 @@ vm_info(char *page) "Hash Tag ID : 0x%x\n" "Size of RR.rid : %d\n", vm_info_1.pal_vm_info_1_s.phys_add_size, - vm_info_2.pal_vm_info_2_s.impl_va_msb+1, vm_info_1.pal_vm_info_1_s.max_pkr+1, - vm_info_1.pal_vm_info_1_s.key_size, vm_info_1.pal_vm_info_1_s.hash_tag_id, + vm_info_2.pal_vm_info_2_s.impl_va_msb+1, + vm_info_1.pal_vm_info_1_s.max_pkr+1, + vm_info_1.pal_vm_info_1_s.key_size, + vm_info_1.pal_vm_info_1_s.hash_tag_id, vm_info_2.pal_vm_info_2_s.rid_size); + } - if (ia64_pal_mem_attrib(&attrib) != 0) - return 0; - - p += sprintf(p, "Supported memory attributes : "); - sep = ""; - for (i = 0; i < 8; i++) { - if (attrib & (1 << i)) { - p += sprintf(p, "%s%s", sep, mem_attrib[i]); - sep = ", "; + if (ia64_pal_mem_attrib(&attrib) == 0) { + p += sprintf(p, "Supported memory attributes : "); + sep = ""; + for (i = 0; i < 8; i++) { + if (attrib & (1 << i)) { + p += sprintf(p, "%s%s", sep, mem_attrib[i]); + sep = ", "; + } } + p += sprintf(p, "\n"); } - p += sprintf(p, "\n"); if ((status = ia64_pal_vm_page_size(&tr_pages, &vw_pages)) !=0) { printk(KERN_ERR "ia64_pal_vm_page_size=%ld\n", status); - return 0; - } - - p += sprintf(p, - "\nTLB walker : %simplemented\n" - "Number of DTR : %d\n" - "Number of ITR : %d\n" - "TLB insertable page sizes : ", - vm_info_1.pal_vm_info_1_s.vw ? "" : "not ", - vm_info_1.pal_vm_info_1_s.max_dtr_entry+1, - vm_info_1.pal_vm_info_1_s.max_itr_entry+1); + } else { + p += sprintf(p, + "\nTLB walker : %simplemented\n" + "Number of DTR : %d\n" + "Number of ITR : %d\n" + "TLB insertable page sizes : ", + vm_info_1.pal_vm_info_1_s.vw ? "" : "not ", + vm_info_1.pal_vm_info_1_s.max_dtr_entry+1, + vm_info_1.pal_vm_info_1_s.max_itr_entry+1); - p = bitvector_process(p, tr_pages); - p += sprintf(p, "\nTLB purgeable page sizes : "); + p = bitvector_process(p, tr_pages); - p = bitvector_process(p, vw_pages); + p += sprintf(p, "\nTLB purgeable page sizes : "); + p = bitvector_process(p, vw_pages); + } if ((status=ia64_get_ptce(&ptce)) != 0) { printk(KERN_ERR "ia64_get_ptce=%ld\n", status); - return 0; - } - - p += sprintf(p, + } else { + p += sprintf(p, "\nPurge base address : 0x%016lx\n" "Purge outer loop count : %d\n" "Purge inner loop count : %d\n" "Purge outer loop stride : %d\n" "Purge inner loop stride : %d\n", - ptce.base, ptce.count[0], ptce.count[1], ptce.stride[0], ptce.stride[1]); + ptce.base, ptce.count[0], ptce.count[1], + ptce.stride[0], ptce.stride[1]); - p += sprintf(p, + p += sprintf(p, "TC Levels : %d\n" "Unique TC(s) : %d\n", vm_info_1.pal_vm_info_1_s.num_tc_levels, vm_info_1.pal_vm_info_1_s.max_unique_tcs); - for(i=0; i < vm_info_1.pal_vm_info_1_s.num_tc_levels; i++) { - for (j=2; j>0 ; j--) { - tc_pages = 0; /* just in case */ + for(i=0; i < vm_info_1.pal_vm_info_1_s.num_tc_levels; i++) { + for (j=2; j>0 ; j--) { + tc_pages = 0; /* just in case */ - /* even without unification, some levels may not be present */ - if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0) { - continue; - } + /* even without unification, some levels may not be present */ + if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0) { + continue; + } - p += sprintf(p, + p += sprintf(p, "\n%s Translation Cache Level %d:\n" "\tHash sets : %d\n" "\tAssociativity : %d\n" "\tNumber of entries : %d\n" "\tFlags : ", - cache_types[j+tc_info.tc_unified], i+1, tc_info.tc_num_sets, - tc_info.tc_associativity, tc_info.tc_num_entries); - - if (tc_info.tc_pf) p += sprintf(p, "PreferredPageSizeOptimized "); - if (tc_info.tc_unified) p += sprintf(p, "Unified "); - if (tc_info.tc_reduce_tr) p += sprintf(p, "TCReduction"); - - p += sprintf(p, "\n\tSupported page sizes: "); - - p = bitvector_process(p, tc_pages); - - /* when unified date (j=2) is enough */ - if (tc_info.tc_unified) break; + cache_types[j+tc_info.tc_unified], i+1, + tc_info.tc_num_sets, + tc_info.tc_associativity, + tc_info.tc_num_entries); + + if (tc_info.tc_pf) + p += sprintf(p, "PreferredPageSizeOptimized "); + if (tc_info.tc_unified) + p += sprintf(p, "Unified "); + if (tc_info.tc_reduce_tr) + p += sprintf(p, "TCReduction"); + + p += sprintf(p, "\n\tSupported page sizes: "); + + p = bitvector_process(p, tc_pages); + + /* when unified date (j=2) is enough */ + if (tc_info.tc_unified) + break; + } } } p += sprintf(p, "\n"); @@ -440,14 +445,14 @@ register_info(char *page) p += sprintf(p, "\n"); } - if (ia64_pal_rse_info(&phys_stacked, &hints) != 0) return 0; + if (ia64_pal_rse_info(&phys_stacked, &hints) == 0) { p += sprintf(p, "RSE stacked physical registers : %ld\n" "RSE load/store hints : %ld (%s)\n", phys_stacked, hints.ph_data, hints.ph_data < RSE_HINTS_COUNT ? rse_hints[hints.ph_data]: "(??)"); - + } if (ia64_pal_debug_info(&iregs, &dregs)) return 0; diff -puN arch/ia64/kernel/salinfo.c~git-ia64 arch/ia64/kernel/salinfo.c --- devel/arch/ia64/kernel/salinfo.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/salinfo.c 2005-09-07 19:42:41.000000000 -0700 @@ -22,6 +22,11 @@ * * Dec 5 2004 kaos@sgi.com * Standardize which records are cleared automatically. + * + * Aug 18 2005 kaos@sgi.com + * mca.c may not pass a buffer, a NULL buffer just indicates that a new + * record is available in SAL. + * Replace some NR_CPUS by cpus_online, for hotplug cpu. */ #include @@ -193,7 +198,7 @@ shift1_data_saved (struct salinfo_data * * The buffer passed from mca.c points to the output from ia64_log_get. This is * a persistent buffer but its contents can change between the interrupt and * when user space processes the record. Save the record id to identify - * changes. + * changes. If the buffer is NULL then just update the bitmap. */ void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe) @@ -206,27 +211,29 @@ salinfo_log_wakeup(int type, u8 *buffer, BUG_ON(type >= ARRAY_SIZE(salinfo_log_name)); - if (irqsafe) - spin_lock_irqsave(&data_saved_lock, flags); - for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) { - if (!data_saved->buffer) - break; - } - if (i == saved_size) { - if (!data->saved_num) { - shift1_data_saved(data, 0); - data_saved = data->data_saved + saved_size - 1; - } else - data_saved = NULL; - } - if (data_saved) { - data_saved->cpu = smp_processor_id(); - data_saved->id = ((sal_log_record_header_t *)buffer)->id; - data_saved->size = size; - data_saved->buffer = buffer; + if (buffer) { + if (irqsafe) + spin_lock_irqsave(&data_saved_lock, flags); + for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) { + if (!data_saved->buffer) + break; + } + if (i == saved_size) { + if (!data->saved_num) { + shift1_data_saved(data, 0); + data_saved = data->data_saved + saved_size - 1; + } else + data_saved = NULL; + } + if (data_saved) { + data_saved->cpu = smp_processor_id(); + data_saved->id = ((sal_log_record_header_t *)buffer)->id; + data_saved->size = size; + data_saved->buffer = buffer; + } + if (irqsafe) + spin_unlock_irqrestore(&data_saved_lock, flags); } - if (irqsafe) - spin_unlock_irqrestore(&data_saved_lock, flags); if (!test_and_set_bit(smp_processor_id(), &data->cpu_event)) { if (irqsafe) @@ -244,7 +251,7 @@ salinfo_timeout_check(struct salinfo_dat int i; if (!data->open) return; - for (i = 0; i < NR_CPUS; ++i) { + for_each_online_cpu(i) { if (test_bit(i, &data->cpu_event)) { /* double up() is not a problem, user space will see no * records for the additional "events". @@ -291,7 +298,7 @@ retry: n = data->cpu_check; for (i = 0; i < NR_CPUS; i++) { - if (test_bit(n, &data->cpu_event)) { + if (test_bit(n, &data->cpu_event) && cpu_online(n)) { cpu = n; break; } @@ -585,11 +592,10 @@ salinfo_init(void) /* we missed any events before now */ online = 0; - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) { - set_bit(j, &data->cpu_event); - ++online; - } + for_each_online_cpu(j) { + set_bit(j, &data->cpu_event); + ++online; + } sema_init(&data->sem, online); *sdir++ = dir; diff -puN arch/ia64/kernel/unwind.c~git-ia64 arch/ia64/kernel/unwind.c --- devel/arch/ia64/kernel/unwind.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/unwind.c 2005-09-07 19:42:41.000000000 -0700 @@ -2020,28 +2020,6 @@ init_frame_info (struct unw_frame_info * } void -unw_init_from_interruption (struct unw_frame_info *info, struct task_struct *t, - struct pt_regs *pt, struct switch_stack *sw) -{ - unsigned long sof; - - init_frame_info(info, t, sw, pt->r12); - info->cfm_loc = &pt->cr_ifs; - info->unat_loc = &pt->ar_unat; - info->pfs_loc = &pt->ar_pfs; - sof = *info->cfm_loc & 0x7f; - info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sof); - info->ip = pt->cr_iip + ia64_psr(pt)->ri; - info->pt = (unsigned long) pt; - UNW_DPRINT(3, "unwind.%s:\n" - " bsp 0x%lx\n" - " sof 0x%lx\n" - " ip 0x%lx\n", - __FUNCTION__, info->bsp, sof, info->ip); - find_save_locs(info); -} - -void unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct switch_stack *sw) { unsigned long sol; diff -puN arch/ia64/kernel/vmlinux.lds.S~git-ia64 arch/ia64/kernel/vmlinux.lds.S --- devel/arch/ia64/kernel/vmlinux.lds.S~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/vmlinux.lds.S 2005-09-07 19:42:41.000000000 -0700 @@ -166,6 +166,7 @@ SECTIONS __init_end = .; /* The initial task and kernel stack */ + . = ALIGN(KERNEL_STACK_SIZE); .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) } diff -puN arch/ia64/lib/memcpy_mck.S~git-ia64 arch/ia64/lib/memcpy_mck.S --- devel/arch/ia64/lib/memcpy_mck.S~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/lib/memcpy_mck.S 2005-09-07 19:42:41.000000000 -0700 @@ -625,8 +625,11 @@ EK(.ex_handler, (p17) st8 [dst1]=r39,8) clrrrb ;; alloc saved_pfs_stack=ar.pfs,3,3,3,0 + cmp.lt p8,p0=A,r0 sub B = dst0, saved_in0 // how many byte copied so far ;; +(p8) mov A = 0; // A shouldn't be negative, cap it + ;; sub C = A, B sub D = saved_in2, A ;; diff -puN arch/ia64/mm/fault.c~git-ia64 arch/ia64/mm/fault.c --- devel/arch/ia64/mm/fault.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/mm/fault.c 2005-09-07 19:42:41.000000000 -0700 @@ -230,9 +230,6 @@ ia64_do_page_fault (unsigned long addres return; } - if (ia64_done_with_exception(regs)) - return; - /* * Since we have no vma's for region 5, we might get here even if the address is * valid, due to the VHPT walker inserting a non present translation that becomes @@ -243,6 +240,9 @@ ia64_do_page_fault (unsigned long addres if (REGION_NUMBER(address) == 5 && mapped_kernel_page_is_present(address)) return; + if (ia64_done_with_exception(regs)) + return; + /* * Oops. The kernel tried to access some bad page. We'll have to terminate things * with extreme prejudice. diff -puN arch/ia64/mm/init.c~git-ia64 arch/ia64/mm/init.c --- devel/arch/ia64/mm/init.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/mm/init.c 2005-09-07 19:42:41.000000000 -0700 @@ -382,13 +382,22 @@ ia64_mmu_init (void *my_cpu_data) if (impl_va_bits < 51 || impl_va_bits > 61) panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1); + /* + * mapped_space_bits - PAGE_SHIFT is the total number of ptes we need, + * which must fit into "vmlpt_bits - pte_bits" slots. Second half of + * the test makes sure that our mapped space doesn't overlap the + * unimplemented hole in the middle of the region. + */ + if ((mapped_space_bits - PAGE_SHIFT > vmlpt_bits - pte_bits) || + (mapped_space_bits > impl_va_bits - 1)) + panic("Cannot build a big enough virtual-linear page table" + " to cover mapped address space.\n" + " Try using a smaller page size.\n"); + /* place the VMLPT at the end of each page-table mapped region: */ pta = POW2(61) - POW2(vmlpt_bits); - if (POW2(mapped_space_bits) >= pta) - panic("mm/init: overlap between virtually mapped linear page table and " - "mapped kernel space!"); /* * Set the (virtually mapped linear) page table address. Bit * 8 selects between the short and long format, bits 2-7 the diff -puN arch/ia64/pci/pci.c~git-ia64 arch/ia64/pci/pci.c --- devel/arch/ia64/pci/pci.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/pci/pci.c 2005-09-07 19:42:41.000000000 -0700 @@ -498,13 +498,11 @@ pcibios_enable_device (struct pci_dev *d return acpi_pci_irq_enable(dev); } -#ifdef CONFIG_ACPI_DEALLOCATE_IRQ void pcibios_disable_device (struct pci_dev *dev) { acpi_pci_irq_disable(dev); } -#endif /* CONFIG_ACPI_DEALLOCATE_IRQ */ void pcibios_align_resource (void *data, struct resource *res, diff -puN arch/ia64/sn/kernel/setup.c~git-ia64 arch/ia64/sn/kernel/setup.c --- devel/arch/ia64/sn/kernel/setup.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/sn/kernel/setup.c 2005-09-07 19:42:41.000000000 -0700 @@ -49,6 +49,7 @@ #include #include #include +#include #include "xtalk/xwidgetdev.h" #include "xtalk/hubdev.h" #include @@ -97,6 +98,7 @@ EXPORT_SYMBOL(sn_region_size); int sn_prom_type; /* 0=hardware, 1=medusa/realprom, 2=medusa/fakeprom */ short physical_node_map[MAX_PHYSNODE_ID]; +static unsigned long sn_prom_features[MAX_PROM_FEATURE_SETS]; EXPORT_SYMBOL(physical_node_map); @@ -271,7 +273,10 @@ void __init sn_setup(char **cmdline_p) u32 version = sn_sal_rev(); extern void sn_cpu_init(void); - ia64_sn_plat_set_error_handling_features(); + ia64_sn_plat_set_error_handling_features(); // obsolete + ia64_sn_set_os_feature(OSF_MCA_SLV_TO_OS_INIT_SLV); + ia64_sn_set_os_feature(OSF_FEAT_LOG_SBES); + #if defined(CONFIG_VT) && defined(CONFIG_VGA_CONSOLE) /* @@ -314,16 +319,6 @@ void __init sn_setup(char **cmdline_p) printk("SGI SAL version %x.%02x\n", version >> 8, version & 0x00FF); - /* - * Confirm the SAL we're running on is recent enough... - */ - if (version < SN_SAL_MIN_VERSION) { - printk(KERN_ERR "This kernel needs SGI SAL version >= " - "%x.%02x\n", SN_SAL_MIN_VERSION >> 8, - SN_SAL_MIN_VERSION & 0x00FF); - panic("PROM version too old\n"); - } - master_nasid = boot_get_nasid(); status = @@ -480,6 +475,10 @@ void __init sn_cpu_init(void) if (nodepdaindr[0] == NULL) return; + for (i = 0; i < MAX_PROM_FEATURE_SETS; i++) + if (ia64_sn_get_prom_feature_set(i, &sn_prom_features[i]) != 0) + break; + cpuid = smp_processor_id(); cpuphyid = get_sapicid(); @@ -651,3 +650,12 @@ nasid_slice_to_cpuid(int nasid, int slic return -1; } + +int sn_prom_feature_available(int id) +{ + if (id >= BITS_PER_LONG * MAX_PROM_FEATURE_SETS) + return 0; + return test_bit(id, sn_prom_features); +} +EXPORT_SYMBOL(sn_prom_feature_available); + diff -puN arch/ia64/sn/kernel/tiocx.c~git-ia64 arch/ia64/sn/kernel/tiocx.c --- devel/arch/ia64/sn/kernel/tiocx.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/sn/kernel/tiocx.c 2005-09-07 19:42:41.000000000 -0700 @@ -183,11 +183,12 @@ int cx_driver_unregister(struct cx_drv * * @part_num: device's part number * @mfg_num: device's manufacturer number * @hubdev: hub info associated with this device + * @bt: board type of the device * */ int cx_device_register(nasid_t nasid, int part_num, int mfg_num, - struct hubdev_info *hubdev) + struct hubdev_info *hubdev, int bt) { struct cx_dev *cx_dev; @@ -200,6 +201,7 @@ cx_device_register(nasid_t nasid, int pa cx_dev->cx_id.mfg_num = mfg_num; cx_dev->cx_id.nasid = nasid; cx_dev->hubdev = hubdev; + cx_dev->bt = bt; cx_dev->dev.parent = NULL; cx_dev->dev.bus = &tiocx_bus_type; @@ -238,7 +240,8 @@ static int cx_device_reload(struct cx_de { cx_device_unregister(cx_dev); return cx_device_register(cx_dev->cx_id.nasid, cx_dev->cx_id.part_num, - cx_dev->cx_id.mfg_num, cx_dev->hubdev); + cx_dev->cx_id.mfg_num, cx_dev->hubdev, + cx_dev->bt); } static inline uint64_t tiocx_intr_alloc(nasid_t nasid, int widget, @@ -365,26 +368,20 @@ static void tio_corelet_reset(nasid_t na udelay(2000); } -static int tiocx_btchar_get(int nasid) +static int is_fpga_tio(int nasid, int *bt) { - moduleid_t module_id; - geoid_t geoid; - int cnodeid; + int ioboard_type; - cnodeid = nasid_to_cnodeid(nasid); - geoid = cnodeid_get_geoid(cnodeid); - module_id = geo_module(geoid); - return MODULE_GET_BTCHAR(module_id); -} + ioboard_type = ia64_sn_sysctl_ioboard_get(nasid); -static int is_fpga_brick(int nasid) -{ - switch (tiocx_btchar_get(nasid)) { + switch (ioboard_type) { case L1_BRICKTYPE_SA: case L1_BRICKTYPE_ATHENA: - case L1_BRICKTYPE_DAYTONA: + case L1_BOARDTYPE_DAYTONA: + *bt = ioboard_type; return 1; } + return 0; } @@ -407,16 +404,22 @@ static int tiocx_reload(struct cx_dev *c if (bitstream_loaded(nasid)) { uint64_t cx_id; + int rv; - cx_id = - *(volatile uint64_t *)(TIO_SWIN_BASE(nasid, TIOCX_CORELET) + + rv = ia64_sn_sysctl_tio_clock_reset(nasid); + if (rv) { + printk(KERN_ALERT "CX port JTAG reset failed.\n"); + } else { + cx_id = *(volatile uint64_t *) + (TIO_SWIN_BASE(nasid, TIOCX_CORELET) + WIDGET_ID); - part_num = XWIDGET_PART_NUM(cx_id); - mfg_num = XWIDGET_MFG_NUM(cx_id); - DBG("part= 0x%x, mfg= 0x%x\n", part_num, mfg_num); - /* just ignore it if it's a CE */ - if (part_num == TIO_CE_ASIC_PARTNUM) - return 0; + part_num = XWIDGET_PART_NUM(cx_id); + mfg_num = XWIDGET_MFG_NUM(cx_id); + DBG("part= 0x%x, mfg= 0x%x\n", part_num, mfg_num); + /* just ignore it if it's a CE */ + if (part_num == TIO_CE_ASIC_PARTNUM) + return 0; + } } cx_dev->cx_id.part_num = part_num; @@ -436,10 +439,10 @@ static ssize_t show_cxdev_control(struct { struct cx_dev *cx_dev = to_cx_dev(dev); - return sprintf(buf, "0x%x 0x%x 0x%x %d\n", + return sprintf(buf, "0x%x 0x%x 0x%x 0x%x\n", cx_dev->cx_id.nasid, cx_dev->cx_id.part_num, cx_dev->cx_id.mfg_num, - tiocx_btchar_get(cx_dev->cx_id.nasid)); + cx_dev->bt); } static ssize_t store_cxdev_control(struct device *dev, struct device_attribute *attr, const char *buf, @@ -488,11 +491,12 @@ static int __init tiocx_init(void) for (cnodeid = 0; cnodeid < MAX_COMPACT_NODES; cnodeid++) { nasid_t nasid; + int bt; if ((nasid = cnodeid_to_nasid(cnodeid)) < 0) break; /* No more nasids .. bail out of loop */ - if ((nasid & 0x1) && is_fpga_brick(nasid)) { + if ((nasid & 0x1) && is_fpga_tio(nasid, &bt)) { struct hubdev_info *hubdev; struct xwidget_info *widgetp; @@ -512,7 +516,7 @@ static int __init tiocx_init(void) if (cx_device_register (nasid, widgetp->xwi_hwid.part_num, - widgetp->xwi_hwid.mfg_num, hubdev) < 0) + widgetp->xwi_hwid.mfg_num, hubdev, bt) < 0) return -ENXIO; else found_tiocx_device++; diff -puN arch/ia64/sn/kernel/xpc_channel.c~git-ia64 arch/ia64/sn/kernel/xpc_channel.c --- devel/arch/ia64/sn/kernel/xpc_channel.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/sn/kernel/xpc_channel.c 2005-09-07 19:42:41.000000000 -0700 @@ -57,6 +57,7 @@ xpc_initialize_channels(struct xpc_parti spin_lock_init(&ch->lock); sema_init(&ch->msg_to_pull_sema, 1); /* mutex */ + sema_init(&ch->wdisconnect_sema, 0); /* event wait */ atomic_set(&ch->n_on_msg_allocate_wq, 0); init_waitqueue_head(&ch->msg_allocate_wq); @@ -166,6 +167,7 @@ xpc_setup_infrastructure(struct xpc_part xpc_initialize_channels(part, partid); atomic_set(&part->nchannels_active, 0); + atomic_set(&part->nchannels_engaged, 0); /* local_IPI_amo were set to 0 by an earlier memset() */ @@ -555,8 +557,6 @@ xpc_allocate_msgqueues(struct xpc_channe sema_init(&ch->notify_queue[i].sema, 0); } - sema_init(&ch->teardown_sema, 0); /* event wait */ - spin_lock_irqsave(&ch->lock, irq_flags); ch->flags |= XPC_C_SETUP; spin_unlock_irqrestore(&ch->lock, irq_flags); @@ -626,6 +626,55 @@ xpc_process_connect(struct xpc_channel * /* + * Notify those who wanted to be notified upon delivery of their message. + */ +static void +xpc_notify_senders(struct xpc_channel *ch, enum xpc_retval reason, s64 put) +{ + struct xpc_notify *notify; + u8 notify_type; + s64 get = ch->w_remote_GP.get - 1; + + + while (++get < put && atomic_read(&ch->n_to_notify) > 0) { + + notify = &ch->notify_queue[get % ch->local_nentries]; + + /* + * See if the notify entry indicates it was associated with + * a message who's sender wants to be notified. It is possible + * that it is, but someone else is doing or has done the + * notification. + */ + notify_type = notify->type; + if (notify_type == 0 || + cmpxchg(¬ify->type, notify_type, 0) != + notify_type) { + continue; + } + + DBUG_ON(notify_type != XPC_N_CALL); + + atomic_dec(&ch->n_to_notify); + + if (notify->func != NULL) { + dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, " + "msg_number=%ld, partid=%d, channel=%d\n", + (void *) notify, get, ch->partid, ch->number); + + notify->func(reason, ch->partid, ch->number, + notify->key); + + dev_dbg(xpc_chan, "notify->func() returned, " + "notify=0x%p, msg_number=%ld, partid=%d, " + "channel=%d\n", (void *) notify, get, + ch->partid, ch->number); + } + } +} + + +/* * Free up message queues and other stuff that were allocated for the specified * channel. * @@ -669,9 +718,6 @@ xpc_free_msgqueues(struct xpc_channel *c ch->remote_msgqueue = NULL; kfree(ch->notify_queue); ch->notify_queue = NULL; - - /* in case someone is waiting for the teardown to complete */ - up(&ch->teardown_sema); } } @@ -683,7 +729,7 @@ static void xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags) { struct xpc_partition *part = &xpc_partitions[ch->partid]; - u32 ch_flags = ch->flags; + u32 channel_was_connected = (ch->flags & XPC_C_WASCONNECTED); DBUG_ON(!spin_is_locked(&ch->lock)); @@ -701,12 +747,13 @@ xpc_process_disconnect(struct xpc_channe } DBUG_ON(atomic_read(&ch->kthreads_assigned) != 0); - /* it's now safe to free the channel's message queues */ - - xpc_free_msgqueues(ch); - DBUG_ON(ch->flags & XPC_C_SETUP); + if (part->act_state == XPC_P_DEACTIVATING) { + /* can't proceed until the other side disengages from us */ + if (xpc_partition_engaged(1UL << ch->partid)) { + return; + } - if (part->act_state != XPC_P_DEACTIVATING) { + } else { /* as long as the other side is up do the full protocol */ @@ -724,16 +771,33 @@ xpc_process_disconnect(struct xpc_channe } } + /* wake those waiting for notify completion */ + if (atomic_read(&ch->n_to_notify) > 0) { + /* >>> we do callout while holding ch->lock */ + xpc_notify_senders(ch, ch->reason, ch->w_local_GP.put); + } + /* both sides are disconnected now */ - ch->flags = XPC_C_DISCONNECTED; /* clear all flags, but this one */ + /* it's now safe to free the channel's message queues */ + xpc_free_msgqueues(ch); + + /* mark disconnected, clear all other flags except XPC_C_WDISCONNECT */ + ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT)); atomic_dec(&part->nchannels_active); - if (ch_flags & XPC_C_WASCONNECTED) { + if (channel_was_connected) { dev_info(xpc_chan, "channel %d to partition %d disconnected, " "reason=%d\n", ch->number, ch->partid, ch->reason); } + + /* wake the thread that is waiting for this channel to disconnect */ + if (ch->flags & XPC_C_WDISCONNECT) { + spin_unlock_irqrestore(&ch->lock, *irq_flags); + up(&ch->wdisconnect_sema); + spin_lock_irqsave(&ch->lock, *irq_flags); + } } @@ -764,7 +828,7 @@ xpc_process_openclose_IPI(struct xpc_par /* * If RCLOSEREQUEST is set, we're probably waiting for * RCLOSEREPLY. We should find it and a ROPENREQUEST packed - * with this RCLOSEQREUQEST in the IPI_flags. + * with this RCLOSEREQUEST in the IPI_flags. */ if (ch->flags & XPC_C_RCLOSEREQUEST) { @@ -852,7 +916,7 @@ xpc_process_openclose_IPI(struct xpc_par "channel=%d\n", args->msg_size, args->local_nentries, ch->partid, ch->number); - if ((ch->flags & XPC_C_DISCONNECTING) || + if ((ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) || part->act_state == XPC_P_DEACTIVATING) { spin_unlock_irqrestore(&ch->lock, irq_flags); return; @@ -1040,55 +1104,6 @@ xpc_connect_channel(struct xpc_channel * /* - * Notify those who wanted to be notified upon delivery of their message. - */ -static void -xpc_notify_senders(struct xpc_channel *ch, enum xpc_retval reason, s64 put) -{ - struct xpc_notify *notify; - u8 notify_type; - s64 get = ch->w_remote_GP.get - 1; - - - while (++get < put && atomic_read(&ch->n_to_notify) > 0) { - - notify = &ch->notify_queue[get % ch->local_nentries]; - - /* - * See if the notify entry indicates it was associated with - * a message who's sender wants to be notified. It is possible - * that it is, but someone else is doing or has done the - * notification. - */ - notify_type = notify->type; - if (notify_type == 0 || - cmpxchg(¬ify->type, notify_type, 0) != - notify_type) { - continue; - } - - DBUG_ON(notify_type != XPC_N_CALL); - - atomic_dec(&ch->n_to_notify); - - if (notify->func != NULL) { - dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, " - "msg_number=%ld, partid=%d, channel=%d\n", - (void *) notify, get, ch->partid, ch->number); - - notify->func(reason, ch->partid, ch->number, - notify->key); - - dev_dbg(xpc_chan, "notify->func() returned, " - "notify=0x%p, msg_number=%ld, partid=%d, " - "channel=%d\n", (void *) notify, get, - ch->partid, ch->number); - } - } -} - - -/* * Clear some of the msg flags in the local message queue. */ static inline void @@ -1240,6 +1255,7 @@ xpc_process_channel_activity(struct xpc_ u64 IPI_amo, IPI_flags; struct xpc_channel *ch; int ch_number; + u32 ch_flags; IPI_amo = xpc_get_IPI_flags(part); @@ -1266,8 +1282,9 @@ xpc_process_channel_activity(struct xpc_ xpc_process_openclose_IPI(part, ch_number, IPI_flags); } + ch_flags = ch->flags; /* need an atomic snapshot of flags */ - if (ch->flags & XPC_C_DISCONNECTING) { + if (ch_flags & XPC_C_DISCONNECTING) { spin_lock_irqsave(&ch->lock, irq_flags); xpc_process_disconnect(ch, &irq_flags); spin_unlock_irqrestore(&ch->lock, irq_flags); @@ -1278,9 +1295,9 @@ xpc_process_channel_activity(struct xpc_ continue; } - if (!(ch->flags & XPC_C_CONNECTED)) { - if (!(ch->flags & XPC_C_OPENREQUEST)) { - DBUG_ON(ch->flags & XPC_C_SETUP); + if (!(ch_flags & XPC_C_CONNECTED)) { + if (!(ch_flags & XPC_C_OPENREQUEST)) { + DBUG_ON(ch_flags & XPC_C_SETUP); (void) xpc_connect_channel(ch); } else { spin_lock_irqsave(&ch->lock, irq_flags); @@ -1305,8 +1322,8 @@ xpc_process_channel_activity(struct xpc_ /* - * XPC's heartbeat code calls this function to inform XPC that a partition has - * gone down. XPC responds by tearing down the XPartition Communication + * XPC's heartbeat code calls this function to inform XPC that a partition is + * going down. XPC responds by tearing down the XPartition Communication * infrastructure used for the just downed partition. * * XPC's heartbeat code will never call this function and xpc_partition_up() @@ -1314,7 +1331,7 @@ xpc_process_channel_activity(struct xpc_ * at the same time. */ void -xpc_partition_down(struct xpc_partition *part, enum xpc_retval reason) +xpc_partition_going_down(struct xpc_partition *part, enum xpc_retval reason) { unsigned long irq_flags; int ch_number; @@ -1330,12 +1347,11 @@ xpc_partition_down(struct xpc_partition } - /* disconnect all channels associated with the downed partition */ + /* disconnect channels associated with the partition going down */ for (ch_number = 0; ch_number < part->nchannels; ch_number++) { ch = &part->channels[ch_number]; - xpc_msgqueue_ref(ch); spin_lock_irqsave(&ch->lock, irq_flags); @@ -1370,6 +1386,7 @@ xpc_teardown_infrastructure(struct xpc_p * this partition. */ + DBUG_ON(atomic_read(&part->nchannels_engaged) != 0); DBUG_ON(atomic_read(&part->nchannels_active) != 0); DBUG_ON(part->setup_state != XPC_P_SETUP); part->setup_state = XPC_P_WTEARDOWN; @@ -1506,8 +1523,12 @@ xpc_initiate_disconnect(int ch_number) spin_lock_irqsave(&ch->lock, irq_flags); - XPC_DISCONNECT_CHANNEL(ch, xpcUnregistering, + if (!(ch->flags & XPC_C_DISCONNECTED)) { + ch->flags |= XPC_C_WDISCONNECT; + + XPC_DISCONNECT_CHANNEL(ch, xpcUnregistering, &irq_flags); + } spin_unlock_irqrestore(&ch->lock, irq_flags); @@ -1523,8 +1544,9 @@ xpc_initiate_disconnect(int ch_number) /* * To disconnect a channel, and reflect it back to all who may be waiting. * - * >>> An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by - * >>> xpc_free_msgqueues(). + * An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by + * xpc_process_disconnect(), and if set, XPC_C_WDISCONNECT is cleared by + * xpc_disconnect_wait(). * * THE CHANNEL IS TO BE LOCKED BY THE CALLER AND WILL REMAIN LOCKED UPON RETURN. */ @@ -1532,7 +1554,7 @@ void xpc_disconnect_channel(const int line, struct xpc_channel *ch, enum xpc_retval reason, unsigned long *irq_flags) { - u32 flags; + u32 channel_was_connected = (ch->flags & XPC_C_CONNECTED); DBUG_ON(!spin_is_locked(&ch->lock)); @@ -1547,61 +1569,53 @@ xpc_disconnect_channel(const int line, s XPC_SET_REASON(ch, reason, line); - flags = ch->flags; + ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING); /* some of these may not have been set */ ch->flags &= ~(XPC_C_OPENREQUEST | XPC_C_OPENREPLY | XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY | XPC_C_CONNECTING | XPC_C_CONNECTED); - ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING); xpc_IPI_send_closerequest(ch, irq_flags); - if (flags & XPC_C_CONNECTED) { + if (channel_was_connected) { ch->flags |= XPC_C_WASCONNECTED; } + spin_unlock_irqrestore(&ch->lock, *irq_flags); + + /* wake all idle kthreads so they can exit */ if (atomic_read(&ch->kthreads_idle) > 0) { - /* wake all idle kthreads so they can exit */ wake_up_all(&ch->idle_wq); } - spin_unlock_irqrestore(&ch->lock, *irq_flags); - - /* wake those waiting to allocate an entry from the local msg queue */ - if (atomic_read(&ch->n_on_msg_allocate_wq) > 0) { wake_up(&ch->msg_allocate_wq); } - /* wake those waiting for notify completion */ - - if (atomic_read(&ch->n_to_notify) > 0) { - xpc_notify_senders(ch, reason, ch->w_local_GP.put); - } - spin_lock_irqsave(&ch->lock, *irq_flags); } void -xpc_disconnected_callout(struct xpc_channel *ch) +xpc_disconnecting_callout(struct xpc_channel *ch) { /* - * Let the channel's registerer know that the channel is now + * Let the channel's registerer know that the channel is being * disconnected. We don't want to do this if the registerer was never - * informed of a connection being made, unless the disconnect was for - * abnormal reasons. + * informed of a connection being made. */ if (ch->func != NULL) { - dev_dbg(xpc_chan, "ch->func() called, reason=%d, partid=%d, " - "channel=%d\n", ch->reason, ch->partid, ch->number); + dev_dbg(xpc_chan, "ch->func() called, reason=xpcDisconnecting," + " partid=%d, channel=%d\n", ch->partid, ch->number); - ch->func(ch->reason, ch->partid, ch->number, NULL, ch->key); + ch->func(xpcDisconnecting, ch->partid, ch->number, NULL, + ch->key); - dev_dbg(xpc_chan, "ch->func() returned, reason=%d, partid=%d, " - "channel=%d\n", ch->reason, ch->partid, ch->number); + dev_dbg(xpc_chan, "ch->func() returned, reason=" + "xpcDisconnecting, partid=%d, channel=%d\n", + ch->partid, ch->number); } } @@ -1848,7 +1862,7 @@ xpc_send_msg(struct xpc_channel *ch, str xpc_notify_func func, void *key) { enum xpc_retval ret = xpcSuccess; - struct xpc_notify *notify = NULL; // >>> to keep the compiler happy!! + struct xpc_notify *notify = notify; s64 put, msg_number = msg->number; diff -puN arch/ia64/sn/kernel/xpc.h~git-ia64 arch/ia64/sn/kernel/xpc.h --- devel/arch/ia64/sn/kernel/xpc.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/sn/kernel/xpc.h 2005-09-07 19:42:41.000000000 -0700 @@ -57,7 +57,7 @@ #define XPC_NASID_FROM_W_B(_w, _b) (((_w) * 64 + (_b)) * 2) #define XPC_HB_DEFAULT_INTERVAL 5 /* incr HB every x secs */ -#define XPC_HB_CHECK_DEFAULT_TIMEOUT 20 /* check HB every x secs */ +#define XPC_HB_CHECK_DEFAULT_INTERVAL 20 /* check HB every x secs */ /* define the process name of HB checker and the CPU it is pinned to */ #define XPC_HB_CHECK_THREAD_NAME "xpc_hb" @@ -67,11 +67,6 @@ #define XPC_DISCOVERY_THREAD_NAME "xpc_discovery" -#define XPC_HB_ALLOWED(_p, _v) ((_v)->heartbeating_to_mask & (1UL << (_p))) -#define XPC_ALLOW_HB(_p, _v) (_v)->heartbeating_to_mask |= (1UL << (_p)) -#define XPC_DISALLOW_HB(_p, _v) (_v)->heartbeating_to_mask &= (~(1UL << (_p))) - - /* * Reserved Page provided by SAL. * @@ -88,14 +83,38 @@ struct xpc_rsvd_page { u8 version; u8 pad[6]; /* pad to u64 align */ volatile u64 vars_pa; + struct timespec stamp; /* time when reserved page was initialized */ u64 part_nasids[XP_NASID_MASK_WORDS] ____cacheline_aligned; u64 mach_nasids[XP_NASID_MASK_WORDS] ____cacheline_aligned; }; -#define XPC_RP_VERSION _XPC_VERSION(1,0) /* version 1.0 of the reserved page */ #define XPC_RSVD_PAGE_ALIGNED_SIZE \ (L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page))) +#define XPC_RP_VERSION _XPC_VERSION(1,1) /* version 1.1 of the reserved page */ + +#define XPC_SUPPORTS_RP_STAMP(_version) \ + (_version >= _XPC_VERSION(1,1)) + +/* + * compare stamps - the return value is: + * + * < 0, if stamp1 < stamp2 + * = 0, if stamp1 == stamp2 + * > 0, if stamp1 > stamp2 + */ +static inline int +xpc_compare_stamps(struct timespec *stamp1, struct timespec *stamp2) +{ + int ret; + + + if ((ret = stamp1->tv_sec - stamp2->tv_sec) == 0) { + ret = stamp1->tv_nsec - stamp2->tv_nsec; + } + return ret; +} + /* * Define the structures by which XPC variables can be exported to other @@ -121,12 +140,61 @@ struct xpc_vars { u64 vars_part_pa; u64 amos_page_pa; /* paddr of page of AMOs from MSPEC driver */ AMO_t *amos_page; /* vaddr of page of AMOs from MSPEC driver */ - AMO_t *act_amos; /* pointer to the first activation AMO */ }; -#define XPC_V_VERSION _XPC_VERSION(3,0) /* version 3.0 of the cross vars */ #define XPC_VARS_ALIGNED_SIZE (L1_CACHE_ALIGN(sizeof(struct xpc_vars))) +#define XPC_V_VERSION _XPC_VERSION(3,1) /* version 3.1 of the cross vars */ + +#define XPC_SUPPORTS_DISENGAGE_REQUEST(_version) \ + (_version >= _XPC_VERSION(3,1)) + + +static inline int +xpc_hb_allowed(partid_t partid, struct xpc_vars *vars) +{ + return ((vars->heartbeating_to_mask & (1UL << partid)) != 0); +} + +static inline void +xpc_allow_hb(partid_t partid, struct xpc_vars *vars) +{ + u64 old_mask, new_mask; + + do { + old_mask = vars->heartbeating_to_mask; + new_mask = (old_mask | (1UL << partid)); + } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) != + old_mask); +} + +static inline void +xpc_disallow_hb(partid_t partid, struct xpc_vars *vars) +{ + u64 old_mask, new_mask; + + do { + old_mask = vars->heartbeating_to_mask; + new_mask = (old_mask & ~(1UL << partid)); + } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) != + old_mask); +} + + +/* + * The AMOs page consists of a number of AMO variables which are divided into + * four groups, The first two groups are used to identify an IRQ's sender. + * These two groups consist of 64 and 16 AMO variables respectively. The last + * two groups, consisting of just one AMO variable each, are used to identify + * the remote partitions that are currently engaged (from the viewpoint of + * the XPC running on the remote partition). + */ +#define XPC_NOTIFY_IRQ_AMOS 0 +#define XPC_ACTIVATE_IRQ_AMOS (XPC_NOTIFY_IRQ_AMOS + XP_MAX_PARTITIONS) +#define XPC_ENGAGED_PARTITIONS_AMO (XPC_ACTIVATE_IRQ_AMOS + XP_NASID_MASK_WORDS) +#define XPC_DISENGAGE_REQUEST_AMO (XPC_ENGAGED_PARTITIONS_AMO + 1) + + /* * The following structure describes the per partition specific variables. * @@ -358,7 +426,7 @@ struct xpc_channel { void *key; /* pointer to user's key */ struct semaphore msg_to_pull_sema; /* next msg to pull serialization */ - struct semaphore teardown_sema; /* wait for teardown completion */ + struct semaphore wdisconnect_sema; /* wait for channel disconnect */ struct xpc_openclose_args *local_openclose_args; /* args passed on */ /* opening or closing of channel */ @@ -410,6 +478,7 @@ struct xpc_channel { #define XPC_C_DISCONNECTED 0x00002000 /* channel is disconnected */ #define XPC_C_DISCONNECTING 0x00004000 /* channel is being disconnected */ +#define XPC_C_WDISCONNECT 0x00008000 /* waiting for channel disconnect */ @@ -422,6 +491,8 @@ struct xpc_partition { /* XPC HB infrastructure */ + u8 remote_rp_version; /* version# of partition's rsvd pg */ + struct timespec remote_rp_stamp;/* time when rsvd pg was initialized */ u64 remote_rp_pa; /* phys addr of partition's rsvd pg */ u64 remote_vars_pa; /* phys addr of partition's vars */ u64 remote_vars_part_pa; /* phys addr of partition's vars part */ @@ -432,10 +503,14 @@ struct xpc_partition { u32 act_IRQ_rcvd; /* IRQs since activation */ spinlock_t act_lock; /* protect updating of act_state */ u8 act_state; /* from XPC HB viewpoint */ + u8 remote_vars_version; /* version# of partition's vars */ enum xpc_retval reason; /* reason partition is deactivating */ int reason_line; /* line# deactivation initiated from */ int reactivate_nasid; /* nasid in partition to reactivate */ + unsigned long disengage_request_timeout; /* timeout in XPC_TICKS */ + struct timer_list disengage_request_timer; + /* XPC infrastructure referencing and teardown control */ @@ -454,6 +529,7 @@ struct xpc_partition { u8 nchannels; /* #of defined channels supported */ atomic_t nchannels_active; /* #of channels that are not DISCONNECTED */ + atomic_t nchannels_engaged;/* #of channels engaged with remote part */ struct xpc_channel *channels;/* array of channel structures */ void *local_GPs_base; /* base address of kmalloc'd space */ @@ -518,6 +594,7 @@ struct xpc_partition { #define XPC_P_TORNDOWN 0x03 /* infrastructure is torndown */ + /* * struct xpc_partition IPI_timer #of seconds to wait before checking for * dropped IPIs. These occur whenever an IPI amo write doesn't complete until @@ -526,6 +603,13 @@ struct xpc_partition { #define XPC_P_DROPPED_IPI_WAIT (0.25 * HZ) +/* number of seconds to wait for other partitions to disengage */ +#define XPC_DISENGAGE_REQUEST_TIMELIMIT 90 + +/* interval in seconds to print 'waiting disengagement' messages */ +#define XPC_DISENGAGE_PRINTMSG_INTERVAL 10 + + #define XPC_PARTID(_p) ((partid_t) ((_p) - &xpc_partitions[0])) @@ -550,8 +634,6 @@ extern void xpc_activate_partition(struc /* found in xpc_partition.c */ extern int xpc_exiting; -extern int xpc_hb_interval; -extern int xpc_hb_check_interval; extern struct xpc_vars *xpc_vars; extern struct xpc_rsvd_page *xpc_rsvd_page; extern struct xpc_vars_part *xpc_vars_part; @@ -561,6 +643,7 @@ extern struct xpc_rsvd_page *xpc_rsvd_pa extern void xpc_allow_IPI_ops(void); extern void xpc_restrict_IPI_ops(void); extern int xpc_identify_act_IRQ_sender(void); +extern int xpc_partition_disengaged(struct xpc_partition *); extern enum xpc_retval xpc_mark_partition_active(struct xpc_partition *); extern void xpc_mark_partition_inactive(struct xpc_partition *); extern void xpc_discovery(void); @@ -585,8 +668,8 @@ extern void xpc_connected_callout(struct extern void xpc_deliver_msg(struct xpc_channel *); extern void xpc_disconnect_channel(const int, struct xpc_channel *, enum xpc_retval, unsigned long *); -extern void xpc_disconnected_callout(struct xpc_channel *); -extern void xpc_partition_down(struct xpc_partition *, enum xpc_retval); +extern void xpc_disconnecting_callout(struct xpc_channel *); +extern void xpc_partition_going_down(struct xpc_partition *, enum xpc_retval); extern void xpc_teardown_infrastructure(struct xpc_partition *); @@ -674,6 +757,157 @@ xpc_part_ref(struct xpc_partition *part) /* + * This next set of inlines are used to keep track of when a partition is + * potentially engaged in accessing memory belonging to another partition. + */ + +static inline void +xpc_mark_partition_engaged(struct xpc_partition *part) +{ + unsigned long irq_flags; + AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa + + (XPC_ENGAGED_PARTITIONS_AMO * sizeof(AMO_t))); + + + local_irq_save(irq_flags); + + /* set bit corresponding to our partid in remote partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_OR, + (1UL << sn_partition_id)); + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IPIs and AMOs to it until the heartbeat times out. + */ + (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); +} + +static inline void +xpc_mark_partition_disengaged(struct xpc_partition *part) +{ + unsigned long irq_flags; + AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa + + (XPC_ENGAGED_PARTITIONS_AMO * sizeof(AMO_t))); + + + local_irq_save(irq_flags); + + /* clear bit corresponding to our partid in remote partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND, + ~(1UL << sn_partition_id)); + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IPIs and AMOs to it until the heartbeat times out. + */ + (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); +} + +static inline void +xpc_request_partition_disengage(struct xpc_partition *part) +{ + unsigned long irq_flags; + AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa + + (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t))); + + + local_irq_save(irq_flags); + + /* set bit corresponding to our partid in remote partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_OR, + (1UL << sn_partition_id)); + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IPIs and AMOs to it until the heartbeat times out. + */ + (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); +} + +static inline void +xpc_cancel_partition_disengage_request(struct xpc_partition *part) +{ + unsigned long irq_flags; + AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa + + (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t))); + + + local_irq_save(irq_flags); + + /* clear bit corresponding to our partid in remote partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND, + ~(1UL << sn_partition_id)); + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IPIs and AMOs to it until the heartbeat times out. + */ + (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); +} + +static inline u64 +xpc_partition_engaged(u64 partid_mask) +{ + AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO; + + + /* return our partition's AMO variable ANDed with partid_mask */ + return (FETCHOP_LOAD_OP(TO_AMO((u64) &amo->variable), FETCHOP_LOAD) & + partid_mask); +} + +static inline u64 +xpc_partition_disengage_requested(u64 partid_mask) +{ + AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO; + + + /* return our partition's AMO variable ANDed with partid_mask */ + return (FETCHOP_LOAD_OP(TO_AMO((u64) &amo->variable), FETCHOP_LOAD) & + partid_mask); +} + +static inline void +xpc_clear_partition_engaged(u64 partid_mask) +{ + AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO; + + + /* clear bit(s) based on partid_mask in our partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND, + ~partid_mask); +} + +static inline void +xpc_clear_partition_disengage_request(u64 partid_mask) +{ + AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO; + + + /* clear bit(s) based on partid_mask in our partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND, + ~partid_mask); +} + + + +/* * The following set of macros and inlines are used for the sending and * receiving of IPIs (also known as IRQs). There are two flavors of IPIs, * one that is associated with partition activity (SGI_XPC_ACTIVATE) and @@ -722,13 +956,13 @@ xpc_IPI_send(AMO_t *amo, u64 flag, int n * Flag the appropriate AMO variable and send an IPI to the specified node. */ static inline void -xpc_activate_IRQ_send(u64 amos_page, int from_nasid, int to_nasid, +xpc_activate_IRQ_send(u64 amos_page_pa, int from_nasid, int to_nasid, int to_phys_cpuid) { int w_index = XPC_NASID_W_INDEX(from_nasid); int b_index = XPC_NASID_B_INDEX(from_nasid); - AMO_t *amos = (AMO_t *) __va(amos_page + - (XP_MAX_PARTITIONS * sizeof(AMO_t))); + AMO_t *amos = (AMO_t *) __va(amos_page_pa + + (XPC_ACTIVATE_IRQ_AMOS * sizeof(AMO_t))); (void) xpc_IPI_send(&amos[w_index], (1UL << b_index), to_nasid, @@ -756,6 +990,13 @@ xpc_IPI_send_reactivate(struct xpc_parti xpc_vars->act_nasid, xpc_vars->act_phys_cpuid); } +static inline void +xpc_IPI_send_disengage(struct xpc_partition *part) +{ + xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0), + part->remote_act_nasid, part->remote_act_phys_cpuid); +} + /* * IPIs associated with SGI_XPC_NOTIFY IRQ. @@ -903,17 +1144,18 @@ xpc_IPI_send_local_msgrequest(struct xpc * cacheable mapping for the entire region. This will prevent speculative * reading of cached copies of our lines from being issued which will cause * a PI FSB Protocol error to be generated by the SHUB. For XPC, we need 64 - * (XP_MAX_PARTITIONS) AMO variables for message notification (xpc_main.c) - * and an additional 16 AMO variables for partition activation (xpc_hb.c). + * (XP_MAX_PARTITIONS) AMO variables for message notification and an + * additional 16 (XP_NASID_MASK_WORDS) AMO variables for partition activation + * and 2 AMO variables for partition deactivation. */ static inline AMO_t * -xpc_IPI_init(partid_t partid) +xpc_IPI_init(int index) { - AMO_t *part_amo = xpc_vars->amos_page + partid; + AMO_t *amo = xpc_vars->amos_page + index; - xpc_IPI_receive(part_amo); - return part_amo; + (void) xpc_IPI_receive(amo); /* clear AMO variable */ + return amo; } diff -puN arch/ia64/sn/kernel/xpc_main.c~git-ia64 arch/ia64/sn/kernel/xpc_main.c --- devel/arch/ia64/sn/kernel/xpc_main.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/sn/kernel/xpc_main.c 2005-09-07 19:42:41.000000000 -0700 @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -82,11 +83,13 @@ struct device *xpc_chan = &xpc_chan_dbg_ /* systune related variables for /proc/sys directories */ -static int xpc_hb_min = 1; -static int xpc_hb_max = 10; - -static int xpc_hb_check_min = 10; -static int xpc_hb_check_max = 120; +static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL; +static int xpc_hb_min_interval = 1; +static int xpc_hb_max_interval = 10; + +static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL; +static int xpc_hb_check_min_interval = 10; +static int xpc_hb_check_max_interval = 120; static ctl_table xpc_sys_xpc_hb_dir[] = { { @@ -99,7 +102,8 @@ static ctl_table xpc_sys_xpc_hb_dir[] = &proc_dointvec_minmax, &sysctl_intvec, NULL, - &xpc_hb_min, &xpc_hb_max + &xpc_hb_min_interval, + &xpc_hb_max_interval }, { 2, @@ -111,7 +115,8 @@ static ctl_table xpc_sys_xpc_hb_dir[] = &proc_dointvec_minmax, &sysctl_intvec, NULL, - &xpc_hb_check_min, &xpc_hb_check_max + &xpc_hb_check_min_interval, + &xpc_hb_check_max_interval }, {0} }; @@ -148,11 +153,11 @@ static DECLARE_WAIT_QUEUE_HEAD(xpc_act_I static unsigned long xpc_hb_check_timeout; -/* xpc_hb_checker thread exited notification */ -static DECLARE_MUTEX_LOCKED(xpc_hb_checker_exited); +/* used as an indication of when the xpc_hb_checker thread is inactive */ +static DECLARE_MUTEX_LOCKED(xpc_hb_checker_inactive); -/* xpc_discovery thread exited notification */ -static DECLARE_MUTEX_LOCKED(xpc_discovery_exited); +/* used as an indication of when the xpc_discovery thread is inactive */ +static DECLARE_MUTEX_LOCKED(xpc_discovery_inactive); static struct timer_list xpc_hb_timer; @@ -161,6 +166,30 @@ static struct timer_list xpc_hb_timer; static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *); +static int xpc_system_reboot(struct notifier_block *, unsigned long, void *); +static struct notifier_block xpc_reboot_notifier = { + .notifier_call = xpc_system_reboot, +}; + + +/* + * Timer function to enforce the timelimit on the partition disengage request. + */ +static void +xpc_timeout_partition_disengage_request(unsigned long data) +{ + struct xpc_partition *part = (struct xpc_partition *) data; + + + DBUG_ON(XPC_TICKS < part->disengage_request_timeout); + + (void) xpc_partition_disengaged(part); + + DBUG_ON(part->disengage_request_timeout != 0); + DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0); +} + + /* * Notify the heartbeat check thread that an IRQ has been received. */ @@ -214,12 +243,6 @@ xpc_hb_checker(void *ignore) while (!(volatile int) xpc_exiting) { - /* wait for IRQ or timeout */ - (void) wait_event_interruptible(xpc_act_IRQ_wq, - (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) || - jiffies >= xpc_hb_check_timeout || - (volatile int) xpc_exiting)); - dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have " "been received\n", (int) (xpc_hb_check_timeout - jiffies), @@ -240,6 +263,7 @@ xpc_hb_checker(void *ignore) } + /* check for outstanding IRQs */ new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd); if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) { force_IRQ = 0; @@ -257,13 +281,19 @@ xpc_hb_checker(void *ignore) xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ); } + + /* wait for IRQ or timeout */ + (void) wait_event_interruptible(xpc_act_IRQ_wq, + (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) || + jiffies >= xpc_hb_check_timeout || + (volatile int) xpc_exiting)); } dev_dbg(xpc_part, "heartbeat checker is exiting\n"); /* mark this thread as inactive */ - up(&xpc_hb_checker_exited); + up(&xpc_hb_checker_inactive); return 0; } @@ -283,7 +313,7 @@ xpc_initiate_discovery(void *ignore) dev_dbg(xpc_part, "discovery thread is exiting\n"); /* mark this thread as inactive */ - up(&xpc_discovery_exited); + up(&xpc_discovery_inactive); return 0; } @@ -309,7 +339,7 @@ xpc_make_first_contact(struct xpc_partit "partition %d\n", XPC_PARTID(part)); /* wait a 1/4 of a second or so */ - msleep_interruptible(250); + (void) msleep_interruptible(250); if (part->act_state == XPC_P_DEACTIVATING) { return part->reason; @@ -336,7 +366,8 @@ static void xpc_channel_mgr(struct xpc_partition *part) { while (part->act_state != XPC_P_DEACTIVATING || - atomic_read(&part->nchannels_active) > 0) { + atomic_read(&part->nchannels_active) > 0 || + !xpc_partition_disengaged(part)) { xpc_process_channel_activity(part); @@ -360,7 +391,8 @@ xpc_channel_mgr(struct xpc_partition *pa (volatile u64) part->local_IPI_amo != 0 || ((volatile u8) part->act_state == XPC_P_DEACTIVATING && - atomic_read(&part->nchannels_active) == 0))); + atomic_read(&part->nchannels_active) == 0 && + xpc_partition_disengaged(part)))); atomic_set(&part->channel_mgr_requests, 1); // >>> Does it need to wakeup periodically as well? In case we @@ -482,7 +514,7 @@ xpc_activating(void *__partid) return 0; } - XPC_ALLOW_HB(partid, xpc_vars); + xpc_allow_hb(partid, xpc_vars); xpc_IPI_send_activated(part); @@ -492,6 +524,7 @@ xpc_activating(void *__partid) */ (void) xpc_partition_up(part); + xpc_disallow_hb(partid, xpc_vars); xpc_mark_partition_inactive(part); if (part->reason == xpcReactivating) { @@ -704,11 +737,14 @@ xpc_daemonize_kthread(void *args) xpc_kthread_waitmsgs(part, ch); } - if (atomic_dec_return(&ch->kthreads_assigned) == 0 && - ((ch->flags & XPC_C_CONNECTCALLOUT) || - (ch->reason != xpcUnregistering && - ch->reason != xpcOtherUnregistering))) { - xpc_disconnected_callout(ch); + if (atomic_dec_return(&ch->kthreads_assigned) == 0) { + if (ch->flags & XPC_C_CONNECTCALLOUT) { + xpc_disconnecting_callout(ch); + } + if (atomic_dec_return(&part->nchannels_engaged) == 0) { + xpc_mark_partition_disengaged(part); + xpc_IPI_send_disengage(part); + } } @@ -740,6 +776,7 @@ xpc_create_kthreads(struct xpc_channel * unsigned long irq_flags; pid_t pid; u64 args = XPC_PACK_ARGS(ch->partid, ch->number); + struct xpc_partition *part = &xpc_partitions[ch->partid]; while (needed-- > 0) { @@ -770,9 +807,13 @@ xpc_create_kthreads(struct xpc_channel * * kthread. That kthread is responsible for doing the * counterpart to the following before it exits. */ - (void) xpc_part_ref(&xpc_partitions[ch->partid]); + (void) xpc_part_ref(part); xpc_msgqueue_ref(ch); - atomic_inc(&ch->kthreads_assigned); + if (atomic_inc_return(&ch->kthreads_assigned) == 1) { + if (atomic_inc_return(&part->nchannels_engaged) == 1) { + xpc_mark_partition_engaged(part); + } + } ch->kthreads_created++; // >>> temporary debug only!!! } } @@ -781,6 +822,7 @@ xpc_create_kthreads(struct xpc_channel * void xpc_disconnect_wait(int ch_number) { + unsigned long irq_flags; partid_t partid; struct xpc_partition *part; struct xpc_channel *ch; @@ -793,10 +835,13 @@ xpc_disconnect_wait(int ch_number) if (xpc_part_ref(part)) { ch = &part->channels[ch_number]; -// >>> how do we keep from falling into the window between our check and going -// >>> down and coming back up where sema is re-inited? - if (ch->flags & XPC_C_SETUP) { - (void) down(&ch->teardown_sema); + if (ch->flags & XPC_C_WDISCONNECT) { + if (!(ch->flags & XPC_C_DISCONNECTED)) { + (void) down(&ch->wdisconnect_sema); + } + spin_lock_irqsave(&ch->lock, irq_flags); + ch->flags &= ~XPC_C_WDISCONNECT; + spin_unlock_irqrestore(&ch->lock, irq_flags); } xpc_part_deref(part); @@ -806,62 +851,89 @@ xpc_disconnect_wait(int ch_number) static void -xpc_do_exit(void) +xpc_do_exit(enum xpc_retval reason) { partid_t partid; int active_part_count; struct xpc_partition *part; + unsigned long printmsg_time; - /* now it's time to eliminate our heartbeat */ - del_timer_sync(&xpc_hb_timer); - xpc_vars->heartbeating_to_mask = 0; - - /* indicate to others that our reserved page is uninitialized */ - xpc_rsvd_page->vars_pa = 0; - - /* - * Ignore all incoming interrupts. Without interupts the heartbeat - * checker won't activate any new partitions that may come up. - */ - free_irq(SGI_XPC_ACTIVATE, NULL); + /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */ + DBUG_ON(xpc_exiting == 1); /* - * Cause the heartbeat checker and the discovery threads to exit. - * We don't want them attempting to activate new partitions as we - * try to deactivate the existing ones. + * Let the heartbeat checker thread and the discovery thread + * (if one is running) know that they should exit. Also wake up + * the heartbeat checker thread in case it's sleeping. */ xpc_exiting = 1; wake_up_interruptible(&xpc_act_IRQ_wq); - /* wait for the heartbeat checker thread to mark itself inactive */ - down(&xpc_hb_checker_exited); + /* ignore all incoming interrupts */ + free_irq(SGI_XPC_ACTIVATE, NULL); /* wait for the discovery thread to mark itself inactive */ - down(&xpc_discovery_exited); + down(&xpc_discovery_inactive); + + /* wait for the heartbeat checker thread to mark itself inactive */ + down(&xpc_hb_checker_inactive); - msleep_interruptible(300); + /* sleep for a 1/3 of a second or so */ + (void) msleep_interruptible(300); /* wait for all partitions to become inactive */ + printmsg_time = jiffies; + do { active_part_count = 0; for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { part = &xpc_partitions[partid]; - if (part->act_state != XPC_P_INACTIVE) { - active_part_count++; - - XPC_DEACTIVATE_PARTITION(part, xpcUnloading); + if (xpc_partition_disengaged(part) && + part->act_state == XPC_P_INACTIVE) { + continue; } + + active_part_count++; + + XPC_DEACTIVATE_PARTITION(part, reason); } - if (active_part_count) - msleep_interruptible(300); - } while (active_part_count > 0); + if (active_part_count == 0) { + break; + } + + if (jiffies >= printmsg_time) { + dev_info(xpc_part, "waiting for partitions to " + "deactivate/disengage, active count=%d, remote " + "engaged=0x%lx\n", active_part_count, + xpc_partition_engaged(1UL << partid)); + + printmsg_time = jiffies + + (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ); + } + + /* sleep for a 1/3 of a second or so */ + (void) msleep_interruptible(300); + + } while (1); + + DBUG_ON(xpc_partition_engaged(-1UL)); + + + /* indicate to others that our reserved page is uninitialized */ + xpc_rsvd_page->vars_pa = 0; + + /* now it's time to eliminate our heartbeat */ + del_timer_sync(&xpc_hb_timer); + DBUG_ON(xpc_vars->heartbeating_to_mask == 0); + /* take ourselves off of the reboot_notifier_list */ + (void) unregister_reboot_notifier(&xpc_reboot_notifier); /* close down protections for IPI operations */ xpc_restrict_IPI_ops(); @@ -876,6 +948,34 @@ xpc_do_exit(void) } +/* + * This function is called when the system is being rebooted. + */ +static int +xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused) +{ + enum xpc_retval reason; + + + switch (event) { + case SYS_RESTART: + reason = xpcSystemReboot; + break; + case SYS_HALT: + reason = xpcSystemHalt; + break; + case SYS_POWER_OFF: + reason = xpcSystemPoweroff; + break; + default: + reason = xpcSystemGoingDown; + } + + xpc_do_exit(reason); + return NOTIFY_DONE; +} + + int __init xpc_init(void) { @@ -920,6 +1020,12 @@ xpc_init(void) spin_lock_init(&part->act_lock); part->act_state = XPC_P_INACTIVE; XPC_SET_REASON(part, 0, 0); + + init_timer(&part->disengage_request_timer); + part->disengage_request_timer.function = + xpc_timeout_partition_disengage_request; + part->disengage_request_timer.data = (unsigned long) part; + part->setup_state = XPC_P_UNSET; init_waitqueue_head(&part->teardown_wq); atomic_set(&part->references, 0); @@ -976,6 +1082,13 @@ xpc_init(void) } + /* add ourselves to the reboot_notifier_list */ + ret = register_reboot_notifier(&xpc_reboot_notifier); + if (ret != 0) { + dev_warn(xpc_part, "can't register reboot notifier\n"); + } + + /* * Set the beating to other partitions into motion. This is * the last requirement for other partitions' discovery to @@ -997,6 +1110,9 @@ xpc_init(void) /* indicate to others that our reserved page is uninitialized */ xpc_rsvd_page->vars_pa = 0; + /* take ourselves off of the reboot_notifier_list */ + (void) unregister_reboot_notifier(&xpc_reboot_notifier); + del_timer_sync(&xpc_hb_timer); free_irq(SGI_XPC_ACTIVATE, NULL); xpc_restrict_IPI_ops(); @@ -1018,9 +1134,9 @@ xpc_init(void) dev_err(xpc_part, "failed while forking discovery thread\n"); /* mark this new thread as a non-starter */ - up(&xpc_discovery_exited); + up(&xpc_discovery_inactive); - xpc_do_exit(); + xpc_do_exit(xpcUnloading); return -EBUSY; } @@ -1039,7 +1155,7 @@ module_init(xpc_init); void __exit xpc_exit(void) { - xpc_do_exit(); + xpc_do_exit(xpcUnloading); } module_exit(xpc_exit); diff -puN arch/ia64/sn/kernel/xpc_partition.c~git-ia64 arch/ia64/sn/kernel/xpc_partition.c --- devel/arch/ia64/sn/kernel/xpc_partition.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/arch/ia64/sn/kernel/xpc_partition.c 2005-09-07 19:42:41.000000000 -0700 @@ -76,11 +76,6 @@ char ____cacheline_aligned xpc_remote_copy_buffer[XPC_RSVD_PAGE_ALIGNED_SIZE]; -/* systune related variables */ -int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL; -int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_TIMEOUT; - - /* * Given a nasid, get the physical address of the partition's reserved page * for that nasid. This function returns 0 on any error. @@ -239,16 +234,21 @@ xpc_rsvd_page_init(void) xpc_vars->amos_page = amos_page; /* save for next load of XPC */ - /* - * Initialize the activation related AMO variables. - */ - xpc_vars->act_amos = xpc_IPI_init(XP_MAX_PARTITIONS); - for (i = 1; i < XP_NASID_MASK_WORDS; i++) { - xpc_IPI_init(i + XP_MAX_PARTITIONS); + /* initialize the activate IRQ related AMO variables */ + for (i = 0; i < XP_NASID_MASK_WORDS; i++) { + (void) xpc_IPI_init(XPC_ACTIVATE_IRQ_AMOS + i); } + + /* initialize the engaged remote partitions related AMO variables */ + (void) xpc_IPI_init(XPC_ENGAGED_PARTITIONS_AMO); + (void) xpc_IPI_init(XPC_DISENGAGE_REQUEST_AMO); + /* export AMO page's physical address to other partitions */ xpc_vars->amos_page_pa = ia64_tpa((u64) xpc_vars->amos_page); + /* timestamp of when reserved page was initialized */ + rp->stamp = CURRENT_TIME; + /* * This signifies to the remote partition that our reserved * page is initialized. @@ -387,6 +387,11 @@ xpc_check_remote_hb(void) remote_vars = (struct xpc_vars *) xpc_remote_copy_buffer; for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { + + if (xpc_exiting) { + break; + } + if (partid == sn_partition_id) { continue; } @@ -417,7 +422,7 @@ xpc_check_remote_hb(void) if (((remote_vars->heartbeat == part->last_heartbeat) && (remote_vars->kdb_status == 0)) || - !XPC_HB_ALLOWED(sn_partition_id, remote_vars)) { + !xpc_hb_allowed(sn_partition_id, remote_vars)) { XPC_DEACTIVATE_PARTITION(part, xpcNoHeartbeat); continue; @@ -436,23 +441,23 @@ xpc_check_remote_hb(void) */ static enum xpc_retval xpc_get_remote_rp(int nasid, u64 *discovered_nasids, - struct xpc_rsvd_page *remote_rp, u64 *remote_rsvd_page_pa) + struct xpc_rsvd_page *remote_rp, u64 *remote_rp_pa) { int bres, i; /* get the reserved page's physical address */ - *remote_rsvd_page_pa = xpc_get_rsvd_page_pa(nasid, (u64) remote_rp, + *remote_rp_pa = xpc_get_rsvd_page_pa(nasid, (u64) remote_rp, XPC_RSVD_PAGE_ALIGNED_SIZE); - if (*remote_rsvd_page_pa == 0) { + if (*remote_rp_pa == 0) { return xpcNoRsvdPageAddr; } /* pull over the reserved page structure */ - bres = xp_bte_copy(*remote_rsvd_page_pa, ia64_tpa((u64) remote_rp), + bres = xp_bte_copy(*remote_rp_pa, ia64_tpa((u64) remote_rp), XPC_RSVD_PAGE_ALIGNED_SIZE, (BTE_NOTIFY | BTE_WACQUIRE), NULL); if (bres != BTE_SUCCESS) { @@ -524,6 +529,55 @@ xpc_get_remote_vars(u64 remote_vars_pa, /* + * Update the remote partition's info. + */ +static void +xpc_update_partition_info(struct xpc_partition *part, u8 remote_rp_version, + struct timespec *remote_rp_stamp, u64 remote_rp_pa, + u64 remote_vars_pa, struct xpc_vars *remote_vars) +{ + part->remote_rp_version = remote_rp_version; + dev_dbg(xpc_part, " remote_rp_version = 0x%016lx\n", + part->remote_rp_version); + + part->remote_rp_stamp = *remote_rp_stamp; + dev_dbg(xpc_part, " remote_rp_stamp (tv_sec = 0x%lx tv_nsec = 0x%lx\n", + part->remote_rp_stamp.tv_sec, part->remote_rp_stamp.tv_nsec); + + part->remote_rp_pa = remote_rp_pa; + dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", part->remote_rp_pa); + + part->remote_vars_pa = remote_vars_pa; + dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n", + part->remote_vars_pa); + + part->last_heartbeat = remote_vars->heartbeat; + dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n", + part->last_heartbeat); + + part->remote_vars_part_pa = remote_vars->vars_part_pa; + dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n", + part->remote_vars_part_pa); + + part->remote_act_nasid = remote_vars->act_nasid; + dev_dbg(xpc_part, " remote_act_nasid = 0x%x\n", + part->remote_act_nasid); + + part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid; + dev_dbg(xpc_part, " remote_act_phys_cpuid = 0x%x\n", + part->remote_act_phys_cpuid); + + part->remote_amos_page_pa = remote_vars->amos_page_pa; + dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n", + part->remote_amos_page_pa); + + part->remote_vars_version = remote_vars->version; + dev_dbg(xpc_part, " remote_vars_version = 0x%x\n", + part->remote_vars_version); +} + + +/* * Prior code has determine the nasid which generated an IPI. Inspect * that nasid to determine if its partition needs to be activated or * deactivated. @@ -542,8 +596,12 @@ xpc_identify_act_IRQ_req(int nasid) { struct xpc_rsvd_page *remote_rp; struct xpc_vars *remote_vars; - u64 remote_rsvd_page_pa; + u64 remote_rp_pa; u64 remote_vars_pa; + int remote_rp_version; + int reactivate = 0; + int stamp_diff; + struct timespec remote_rp_stamp = { 0, 0 }; partid_t partid; struct xpc_partition *part; enum xpc_retval ret; @@ -553,7 +611,7 @@ xpc_identify_act_IRQ_req(int nasid) remote_rp = (struct xpc_rsvd_page *) xpc_remote_copy_buffer; - ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rsvd_page_pa); + ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa); if (ret != xpcSuccess) { dev_warn(xpc_part, "unable to get reserved page from nasid %d, " "which sent interrupt, reason=%d\n", nasid, ret); @@ -561,6 +619,10 @@ xpc_identify_act_IRQ_req(int nasid) } remote_vars_pa = remote_rp->vars_pa; + remote_rp_version = remote_rp->version; + if (XPC_SUPPORTS_RP_STAMP(remote_rp_version)) { + remote_rp_stamp = remote_rp->stamp; + } partid = remote_rp->partid; part = &xpc_partitions[partid]; @@ -586,44 +648,117 @@ xpc_identify_act_IRQ_req(int nasid) "%ld:0x%lx\n", (int) nasid, (int) partid, part->act_IRQ_rcvd, remote_vars->heartbeat, remote_vars->heartbeating_to_mask); + if (xpc_partition_disengaged(part) && + part->act_state == XPC_P_INACTIVE) { - if (part->act_state == XPC_P_INACTIVE) { + xpc_update_partition_info(part, remote_rp_version, + &remote_rp_stamp, remote_rp_pa, + remote_vars_pa, remote_vars); + + if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) { + if (xpc_partition_disengage_requested(1UL << partid)) { + /* + * Other side is waiting on us to disengage, + * even though we already have. + */ + return; + } + } else { + /* other side doesn't support disengage requests */ + xpc_clear_partition_disengage_request(1UL << partid); + } - part->remote_rp_pa = remote_rsvd_page_pa; - dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", - part->remote_rp_pa); - - part->remote_vars_pa = remote_vars_pa; - dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n", - part->remote_vars_pa); + xpc_activate_partition(part); + return; + } - part->last_heartbeat = remote_vars->heartbeat; - dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n", - part->last_heartbeat); + DBUG_ON(part->remote_rp_version == 0); + DBUG_ON(part->remote_vars_version == 0); - part->remote_vars_part_pa = remote_vars->vars_part_pa; - dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n", - part->remote_vars_part_pa); - - part->remote_act_nasid = remote_vars->act_nasid; - dev_dbg(xpc_part, " remote_act_nasid = 0x%x\n", - part->remote_act_nasid); - - part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid; - dev_dbg(xpc_part, " remote_act_phys_cpuid = 0x%x\n", - part->remote_act_phys_cpuid); - - part->remote_amos_page_pa = remote_vars->amos_page_pa; - dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n", - part->remote_amos_page_pa); + if (!XPC_SUPPORTS_RP_STAMP(part->remote_rp_version)) { + DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(part-> + remote_vars_version)); + + if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) { + DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars-> + version)); + /* see if the other side rebooted */ + if (part->remote_amos_page_pa == + remote_vars->amos_page_pa && + xpc_hb_allowed(sn_partition_id, + remote_vars)) { + /* doesn't look that way, so ignore the IPI */ + return; + } + } - xpc_activate_partition(part); + /* + * Other side rebooted and previous XPC didn't support the + * disengage request, so we don't need to do anything special. + */ + + xpc_update_partition_info(part, remote_rp_version, + &remote_rp_stamp, remote_rp_pa, + remote_vars_pa, remote_vars); + part->reactivate_nasid = nasid; + XPC_DEACTIVATE_PARTITION(part, xpcReactivating); + return; + } + + DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)); + + if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) { + DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version)); + + /* + * Other side rebooted and previous XPC did support the + * disengage request, but the new one doesn't. + */ + + xpc_clear_partition_engaged(1UL << partid); + xpc_clear_partition_disengage_request(1UL << partid); + + xpc_update_partition_info(part, remote_rp_version, + &remote_rp_stamp, remote_rp_pa, + remote_vars_pa, remote_vars); + reactivate = 1; - } else if (part->remote_amos_page_pa != remote_vars->amos_page_pa || - !XPC_HB_ALLOWED(sn_partition_id, remote_vars)) { + } else { + DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version)); + + stamp_diff = xpc_compare_stamps(&part->remote_rp_stamp, + &remote_rp_stamp); + if (stamp_diff != 0) { + DBUG_ON(stamp_diff >= 0); + + /* + * Other side rebooted and the previous XPC did support + * the disengage request, as does the new one. + */ + + DBUG_ON(xpc_partition_engaged(1UL << partid)); + DBUG_ON(xpc_partition_disengage_requested(1UL << + partid)); + + xpc_update_partition_info(part, remote_rp_version, + &remote_rp_stamp, remote_rp_pa, + remote_vars_pa, remote_vars); + reactivate = 1; + } + } + + if (!xpc_partition_disengaged(part)) { + /* still waiting on other side to disengage from us */ + return; + } + if (reactivate) { part->reactivate_nasid = nasid; XPC_DEACTIVATE_PARTITION(part, xpcReactivating); + + } else if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version) && + xpc_partition_disengage_requested(1UL << partid)) { + XPC_DEACTIVATE_PARTITION(part, xpcOtherGoingDown); } } @@ -646,12 +781,16 @@ xpc_identify_act_IRQ_sender(void) struct xpc_rsvd_page *rp = (struct xpc_rsvd_page *) xpc_rsvd_page; - act_amos = xpc_vars->act_amos; + act_amos = xpc_vars->amos_page + XPC_ACTIVATE_IRQ_AMOS; /* scan through act AMO variable looking for non-zero entries */ for (word = 0; word < XP_NASID_MASK_WORDS; word++) { + if (xpc_exiting) { + break; + } + nasid_mask = xpc_IPI_receive(&act_amos[word]); if (nasid_mask == 0) { /* no IRQs from nasids in this variable */ @@ -688,6 +827,55 @@ xpc_identify_act_IRQ_sender(void) /* + * See if the other side has responded to a partition disengage request + * from us. + */ +int +xpc_partition_disengaged(struct xpc_partition *part) +{ + partid_t partid = XPC_PARTID(part); + int disengaged; + + + disengaged = (xpc_partition_engaged(1UL << partid) == 0); + if (part->disengage_request_timeout) { + if (!disengaged) { + if (jiffies < part->disengage_request_timeout) { + /* timelimit hasn't been reached yet */ + return 0; + } + + /* + * Other side hasn't responded to our disengage + * request in a timely fashion, so assume it's dead. + */ + + xpc_clear_partition_engaged(1UL << partid); + disengaged = 1; + } + part->disengage_request_timeout = 0; + + /* cancel the timer function, provided it's not us */ + if (!in_interrupt()) { + del_singleshot_timer_sync(&part-> + disengage_request_timer); + } + + DBUG_ON(part->act_state != XPC_P_DEACTIVATING && + part->act_state != XPC_P_INACTIVE); + if (part->act_state != XPC_P_INACTIVE) { + xpc_wakeup_channel_mgr(part); + } + + if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) { + xpc_cancel_partition_disengage_request(part); + } + } + return disengaged; +} + + +/* * Mark specified partition as active. */ enum xpc_retval @@ -721,7 +909,6 @@ xpc_deactivate_partition(const int line, enum xpc_retval reason) { unsigned long irq_flags; - partid_t partid = XPC_PARTID(part); spin_lock_irqsave(&part->act_lock, irq_flags); @@ -749,17 +936,27 @@ xpc_deactivate_partition(const int line, spin_unlock_irqrestore(&part->act_lock, irq_flags); - XPC_DISALLOW_HB(partid, xpc_vars); + if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) { + xpc_request_partition_disengage(part); + xpc_IPI_send_disengage(part); + + /* set a timelimit on the disengage request */ + part->disengage_request_timeout = jiffies + + (XPC_DISENGAGE_REQUEST_TIMELIMIT * HZ); + part->disengage_request_timer.expires = + part->disengage_request_timeout; + add_timer(&part->disengage_request_timer); + } dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n", partid, reason); - xpc_partition_down(part, reason); + xpc_partition_going_down(part, reason); } /* - * Mark specified partition as active. + * Mark specified partition as inactive. */ void xpc_mark_partition_inactive(struct xpc_partition *part) @@ -792,7 +989,7 @@ xpc_discovery(void) void *remote_rp_base; struct xpc_rsvd_page *remote_rp; struct xpc_vars *remote_vars; - u64 remote_rsvd_page_pa; + u64 remote_rp_pa; u64 remote_vars_pa; int region; int max_regions; @@ -877,7 +1074,7 @@ xpc_discovery(void) /* pull over the reserved page structure */ ret = xpc_get_remote_rp(nasid, discovered_nasids, - remote_rp, &remote_rsvd_page_pa); + remote_rp, &remote_rp_pa); if (ret != xpcSuccess) { dev_dbg(xpc_part, "unable to get reserved page " "from nasid %d, reason=%d\n", nasid, @@ -948,6 +1145,13 @@ xpc_discovery(void) remote_vars->act_nasid, remote_vars->act_phys_cpuid); + if (XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars-> + version)) { + part->remote_amos_page_pa = + remote_vars->amos_page_pa; + xpc_mark_partition_disengaged(part); + xpc_cancel_partition_disengage_request(part); + } xpc_IPI_send_activate(remote_vars); } } diff -puN include/asm-ia64/iosapic.h~git-ia64 include/asm-ia64/iosapic.h --- devel/include/asm-ia64/iosapic.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/iosapic.h 2005-09-07 19:42:41.000000000 -0700 @@ -80,12 +80,9 @@ extern int iosapic_remove (unsigned int #endif /* CONFIG_HOTPLUG */ extern int gsi_to_vector (unsigned int gsi); extern int gsi_to_irq (unsigned int gsi); -extern void iosapic_enable_intr (unsigned int vector); extern int iosapic_register_intr (unsigned int gsi, unsigned long polarity, unsigned long trigger); -#ifdef CONFIG_ACPI_DEALLOCATE_IRQ extern void iosapic_unregister_intr (unsigned int irq); -#endif extern void __init iosapic_override_isa_irq (unsigned int isa_irq, unsigned int gsi, unsigned long polarity, unsigned long trigger); @@ -97,7 +94,6 @@ extern int __init iosapic_register_platf unsigned long trigger); extern unsigned int iosapic_version (char __iomem *addr); -extern void iosapic_pci_fixup (int); #ifdef CONFIG_NUMA extern void __devinit map_iosapic_to_node (unsigned int, int); #endif diff -puN include/asm-ia64/irq.h~git-ia64 include/asm-ia64/irq.h --- devel/include/asm-ia64/irq.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/irq.h 2005-09-07 19:42:41.000000000 -0700 @@ -35,8 +35,4 @@ extern void disable_irq_nosync (unsigned extern void enable_irq (unsigned int); extern void set_irq_affinity_info (unsigned int irq, int dest, int redir); -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* _ASM_IA64_IRQ_H */ diff -puN include/asm-ia64/mca_asm.h~git-ia64 include/asm-ia64/mca_asm.h --- devel/include/asm-ia64/mca_asm.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/mca_asm.h 2005-09-07 19:42:41.000000000 -0700 @@ -8,6 +8,8 @@ * Copyright (C) 2000 David Mosberger-Tang * Copyright (C) 2002 Intel Corp. * Copyright (C) 2002 Jenna Hall + * Copyright (C) 2005 Silicon Graphics, Inc + * Copyright (C) 2005 Keith Owens */ #ifndef _ASM_IA64_MCA_ASM_H #define _ASM_IA64_MCA_ASM_H @@ -207,106 +209,33 @@ ;; /* - * The following offsets capture the order in which the - * RSE related registers from the old context are - * saved onto the new stack frame. - * - * +-----------------------+ - * |NDIRTY [BSP - BSPSTORE]| - * +-----------------------+ - * | RNAT | - * +-----------------------+ - * | BSPSTORE | - * +-----------------------+ - * | IFS | - * +-----------------------+ - * | PFS | - * +-----------------------+ - * | RSC | - * +-----------------------+ <-------- Bottom of new stack frame - */ -#define rse_rsc_offset 0 -#define rse_pfs_offset (rse_rsc_offset+0x08) -#define rse_ifs_offset (rse_pfs_offset+0x08) -#define rse_bspstore_offset (rse_ifs_offset+0x08) -#define rse_rnat_offset (rse_bspstore_offset+0x08) -#define rse_ndirty_offset (rse_rnat_offset+0x08) - -/* - * rse_switch_context - * - * 1. Save old RSC onto the new stack frame - * 2. Save PFS onto new stack frame - * 3. Cover the old frame and start a new frame. - * 4. Save IFS onto new stack frame - * 5. Save the old BSPSTORE on the new stack frame - * 6. Save the old RNAT on the new stack frame - * 7. Write BSPSTORE with the new backing store pointer - * 8. Read and save the new BSP to calculate the #dirty registers - * NOTE: Look at pages 11-10, 11-11 in PRM Vol 2 - */ -#define rse_switch_context(temp,p_stackframe,p_bspstore) \ - ;; \ - mov temp=ar.rsc;; \ - st8 [p_stackframe]=temp,8;; \ - mov temp=ar.pfs;; \ - st8 [p_stackframe]=temp,8; \ - cover ;; \ - mov temp=cr.ifs;; \ - st8 [p_stackframe]=temp,8;; \ - mov temp=ar.bspstore;; \ - st8 [p_stackframe]=temp,8;; \ - mov temp=ar.rnat;; \ - st8 [p_stackframe]=temp,8; \ - mov ar.bspstore=p_bspstore;; \ - mov temp=ar.bsp;; \ - sub temp=temp,p_bspstore;; \ - st8 [p_stackframe]=temp,8;; - -/* - * rse_return_context - * 1. Allocate a zero-sized frame - * 2. Store the number of dirty registers RSC.loadrs field - * 3. Issue a loadrs to insure that any registers from the interrupted - * context which were saved on the new stack frame have been loaded - * back into the stacked registers - * 4. Restore BSPSTORE - * 5. Restore RNAT - * 6. Restore PFS - * 7. Restore IFS - * 8. Restore RSC - * 9. Issue an RFI - */ -#define rse_return_context(psr_mask_reg,temp,p_stackframe) \ - ;; \ - alloc temp=ar.pfs,0,0,0,0; \ - add p_stackframe=rse_ndirty_offset,p_stackframe;; \ - ld8 temp=[p_stackframe];; \ - shl temp=temp,16;; \ - mov ar.rsc=temp;; \ - loadrs;; \ - add p_stackframe=-rse_ndirty_offset+rse_bspstore_offset,p_stackframe;;\ - ld8 temp=[p_stackframe];; \ - mov ar.bspstore=temp;; \ - add p_stackframe=-rse_bspstore_offset+rse_rnat_offset,p_stackframe;;\ - ld8 temp=[p_stackframe];; \ - mov ar.rnat=temp;; \ - add p_stackframe=-rse_rnat_offset+rse_pfs_offset,p_stackframe;; \ - ld8 temp=[p_stackframe];; \ - mov ar.pfs=temp;; \ - add p_stackframe=-rse_pfs_offset+rse_ifs_offset,p_stackframe;; \ - ld8 temp=[p_stackframe];; \ - mov cr.ifs=temp;; \ - add p_stackframe=-rse_ifs_offset+rse_rsc_offset,p_stackframe;; \ - ld8 temp=[p_stackframe];; \ - mov ar.rsc=temp ; \ - mov temp=psr;; \ - or temp=temp,psr_mask_reg;; \ - mov cr.ipsr=temp;; \ - mov temp=ip;; \ - add temp=0x30,temp;; \ - mov cr.iip=temp;; \ - srlz.i;; \ - rfi;; + * The MCA and INIT stacks in struct ia64_mca_cpu look like normal kernel + * stacks, except that the SAL/OS state and a switch_stack are stored near the + * top of the MCA/INIT stack. To support concurrent entry to MCA or INIT, as + * well as MCA over INIT, each event needs its own SAL/OS state. All entries + * are 16 byte aligned. + * + * +---------------------------+ + * | pt_regs | + * +---------------------------+ + * | switch_stack | + * +---------------------------+ + * | SAL/OS state | + * +---------------------------+ + * | 16 byte scratch area | + * +---------------------------+ <-------- SP at start of C MCA handler + * | ..... | + * +---------------------------+ + * | RBS for MCA/INIT handler | + * +---------------------------+ + * | struct task for MCA/INIT | + * +---------------------------+ <-------- Bottom of MCA/INIT stack + */ + +#define ALIGN16(x) ((x)&~15) +#define MCA_PT_REGS_OFFSET ALIGN16(KERNEL_STACK_SIZE-IA64_PT_REGS_SIZE) +#define MCA_SWITCH_STACK_OFFSET ALIGN16(MCA_PT_REGS_OFFSET-IA64_SWITCH_STACK_SIZE) +#define MCA_SOS_OFFSET ALIGN16(MCA_SWITCH_STACK_OFFSET-IA64_SAL_OS_STATE_SIZE) +#define MCA_SP_OFFSET ALIGN16(MCA_SOS_OFFSET-16) #endif /* _ASM_IA64_MCA_ASM_H */ diff -puN include/asm-ia64/mca.h~git-ia64 include/asm-ia64/mca.h --- devel/include/asm-ia64/mca.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/mca.h 2005-09-07 19:42:41.000000000 -0700 @@ -11,8 +11,6 @@ #ifndef _ASM_IA64_MCA_H #define _ASM_IA64_MCA_H -#define IA64_MCA_STACK_SIZE 8192 - #if !defined(__ASSEMBLY__) #include @@ -48,7 +46,8 @@ typedef union cmcv_reg_u { enum { IA64_MCA_RENDEZ_CHECKIN_NOTDONE = 0x0, - IA64_MCA_RENDEZ_CHECKIN_DONE = 0x1 + IA64_MCA_RENDEZ_CHECKIN_DONE = 0x1, + IA64_MCA_RENDEZ_CHECKIN_INIT = 0x2, }; /* Information maintained by the MC infrastructure */ @@ -63,18 +62,42 @@ typedef struct ia64_mc_info_s { } ia64_mc_info_t; -typedef struct ia64_mca_sal_to_os_state_s { - u64 imsto_os_gp; /* GP of the os registered with the SAL */ - u64 imsto_pal_proc; /* PAL_PROC entry point - physical addr */ - u64 imsto_sal_proc; /* SAL_PROC entry point - physical addr */ - u64 imsto_sal_gp; /* GP of the SAL - physical */ - u64 imsto_rendez_state; /* Rendez state information */ - u64 imsto_sal_check_ra; /* Return address in SAL_CHECK while going - * back to SAL from OS after MCA handling. - */ - u64 pal_min_state; /* from PAL in r17 */ - u64 proc_state_param; /* from PAL in r18. See SDV 2:268 11.3.2.1 */ -} ia64_mca_sal_to_os_state_t; +/* Handover state from SAL to OS and vice versa, for both MCA and INIT events. + * Besides the handover state, it also contains some saved registers from the + * time of the event. + * Note: mca_asm.S depends on the precise layout of this structure. + */ + +struct ia64_sal_os_state { + /* SAL to OS, must be at offset 0 */ + u64 os_gp; /* GP of the os registered with the SAL, physical */ + u64 pal_proc; /* PAL_PROC entry point, physical */ + u64 sal_proc; /* SAL_PROC entry point, physical */ + u64 rv_rc; /* MCA - Rendezvous state, INIT - reason code */ + u64 proc_state_param; /* from R18 */ + u64 monarch; /* 1 for a monarch event, 0 for a slave */ + /* common, must follow SAL to OS */ + u64 sal_ra; /* Return address in SAL, physical */ + u64 sal_gp; /* GP of the SAL - physical */ + pal_min_state_area_t *pal_min_state; /* from R17. physical in asm, virtual in C */ + u64 prev_IA64_KR_CURRENT; /* previous value of IA64_KR(CURRENT) */ + struct task_struct *prev_task; /* previous task, NULL if it is not useful */ + /* Some interrupt registers are not saved in minstate, pt_regs or + * switch_stack. Because MCA/INIT can occur when interrupts are + * disabled, we need to save the additional interrupt registers over + * MCA/INIT and resume. + */ + u64 isr; + u64 ifa; + u64 itir; + u64 iipa; + u64 iim; + u64 iha; + /* OS to SAL, must follow common */ + u64 os_status; /* OS status to SAL, enum below */ + u64 context; /* 0 if return to same context + 1 if return to new context */ +}; enum { IA64_MCA_CORRECTED = 0x0, /* Error has been corrected by OS_MCA */ @@ -84,35 +107,21 @@ enum { }; enum { + IA64_INIT_RESUME = 0x0, /* Resume after return from INIT */ + IA64_INIT_WARM_BOOT = -1, /* Warm boot of the system need from SAL */ +}; + +enum { IA64_MCA_SAME_CONTEXT = 0x0, /* SAL to return to same context */ IA64_MCA_NEW_CONTEXT = -1 /* SAL to return to new context */ }; -typedef struct ia64_mca_os_to_sal_state_s { - u64 imots_os_status; /* OS status to SAL as to what happened - * with the MCA handling. - */ - u64 imots_sal_gp; /* GP of the SAL - physical */ - u64 imots_context; /* 0 if return to same context - 1 if return to new context */ - u64 *imots_new_min_state; /* Pointer to structure containing - * new values of registers in the min state - * save area. - */ - u64 imots_sal_check_ra; /* Return address in SAL_CHECK while going - * back to SAL from OS after MCA handling. - */ -} ia64_mca_os_to_sal_state_t; - /* Per-CPU MCA state that is too big for normal per-CPU variables. */ struct ia64_mca_cpu { - u64 stack[IA64_MCA_STACK_SIZE/8]; /* MCA memory-stack */ - u64 proc_state_dump[512]; - u64 stackframe[32]; - u64 rbstore[IA64_MCA_STACK_SIZE/8]; /* MCA reg.-backing store */ + u64 mca_stack[KERNEL_STACK_SIZE/8]; u64 init_stack[KERNEL_STACK_SIZE/8]; -} __attribute__ ((aligned(16))); +}; /* Array of physical addresses of each CPU's MCA area. */ extern unsigned long __per_cpu_mca[NR_CPUS]; @@ -121,12 +130,29 @@ extern void ia64_mca_init(void); extern void ia64_mca_cpu_init(void *); extern void ia64_os_mca_dispatch(void); extern void ia64_os_mca_dispatch_end(void); -extern void ia64_mca_ucmc_handler(void); +extern void ia64_mca_ucmc_handler(struct pt_regs *, struct ia64_sal_os_state *); +extern void ia64_init_handler(struct pt_regs *, + struct switch_stack *, + struct ia64_sal_os_state *); extern void ia64_monarch_init_handler(void); extern void ia64_slave_init_handler(void); extern void ia64_mca_cmc_vector_setup(void); -extern int ia64_reg_MCA_extension(void*); +extern int ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *)); extern void ia64_unreg_MCA_extension(void); +extern u64 ia64_get_rnat(u64 *); + +#else /* __ASSEMBLY__ */ + +#define IA64_MCA_CORRECTED 0x0 /* Error has been corrected by OS_MCA */ +#define IA64_MCA_WARM_BOOT -1 /* Warm boot of the system need from SAL */ +#define IA64_MCA_COLD_BOOT -2 /* Cold boot of the system need from SAL */ +#define IA64_MCA_HALT -3 /* System to be halted by SAL */ + +#define IA64_INIT_RESUME 0x0 /* Resume after return from INIT */ +#define IA64_INIT_WARM_BOOT -1 /* Warm boot of the system need from SAL */ + +#define IA64_MCA_SAME_CONTEXT 0x0 /* SAL to return to same context */ +#define IA64_MCA_NEW_CONTEXT -1 /* SAL to return to new context */ #endif /* !__ASSEMBLY__ */ #endif /* _ASM_IA64_MCA_H */ diff -puN include/asm-ia64/ptrace.h~git-ia64 include/asm-ia64/ptrace.h --- devel/include/asm-ia64/ptrace.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/ptrace.h 2005-09-07 19:42:41.000000000 -0700 @@ -119,7 +119,7 @@ struct pt_regs { unsigned long ar_unat; /* interrupted task's NaT register (preserved) */ unsigned long ar_pfs; /* prev function state */ unsigned long ar_rsc; /* RSE configuration */ - /* The following two are valid only if cr_ipsr.cpl > 0: */ + /* The following two are valid only if cr_ipsr.cpl > 0 || ti->flags & _TIF_MCA_INIT */ unsigned long ar_rnat; /* RSE NaT */ unsigned long ar_bspstore; /* RSE bspstore */ diff -puN include/asm-ia64/sn/l1.h~git-ia64 include/asm-ia64/sn/l1.h --- devel/include/asm-ia64/sn/l1.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/sn/l1.h 2005-09-07 19:42:41.000000000 -0700 @@ -35,4 +35,16 @@ #define L1_BRICKTYPE_ATHENA 0x2b /* + */ #define L1_BRICKTYPE_DAYTONA 0x7a /* z */ +/* board type response codes */ +#define L1_BOARDTYPE_IP69 0x0100 /* CA */ +#define L1_BOARDTYPE_IP63 0x0200 /* CB */ +#define L1_BOARDTYPE_BASEIO 0x0300 /* IB */ +#define L1_BOARDTYPE_PCIE2SLOT 0x0400 /* IC */ +#define L1_BOARDTYPE_PCIX3SLOT 0x0500 /* ID */ +#define L1_BOARDTYPE_PCIXPCIE4SLOT 0x0600 /* IE */ +#define L1_BOARDTYPE_ABACUS 0x0700 /* AB */ +#define L1_BOARDTYPE_DAYTONA 0x0800 /* AD */ +#define L1_BOARDTYPE_INVAL (-1) /* invalid brick type */ + + #endif /* _ASM_IA64_SN_L1_H */ diff -puN /dev/null include/asm-ia64/sn/sn_feature_sets.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/include/asm-ia64/sn/sn_feature_sets.h 2005-09-07 19:42:41.000000000 -0700 @@ -0,0 +1,57 @@ +#ifndef _ASM_IA64_SN_FEATURE_SETS_H +#define _ASM_IA64_SN_FEATURE_SETS_H + +/* + * SN PROM Features + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2005 Silicon Graphics, Inc. All rights reserved. + */ + + +#include +#include + +/* --------------------- PROM Features -----------------------------*/ +extern int sn_prom_feature_available(int id); + +#define MAX_PROM_FEATURE_SETS 2 + +/* + * The following defines features that may or may not be supported by the + * current PROM. The OS uses sn_prom_feature_available(feature) to test for + * the presence of a PROM feature. Down rev (old) PROMs will always test + * "false" for new features. + * + * Use: + * if (sn_prom_feature_available(PRF_FEATURE_XXX)) + * ... + */ + +/* + * Example: feature XXX + */ +#define PRF_FEATURE_XXX 0 + + + +/* --------------------- OS Features -------------------------------*/ + +/* + * The following defines OS features that are optionally present in + * the operating system. + * During boot, PROM is notified of these features via a series of calls: + * + * ia64_sn_set_os_feature(feature1); + * + * Once enabled, a feature cannot be disabled. + * + * By default, features are disabled unless explicitly enabled. + */ +#define OSF_MCA_SLV_TO_OS_INIT_SLV 0 +#define OSF_FEAT_LOG_SBES 1 + +#endif /* _ASM_IA64_SN_FEATURE_SETS_H */ diff -puN include/asm-ia64/sn/sn_sal.h~git-ia64 include/asm-ia64/sn/sn_sal.h --- devel/include/asm-ia64/sn/sn_sal.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/sn/sn_sal.h 2005-09-07 19:42:41.000000000 -0700 @@ -47,6 +47,7 @@ #define SN_SAL_CONSOLE_PUTB 0x02000028 #define SN_SAL_CONSOLE_XMIT_CHARS 0x0200002a #define SN_SAL_CONSOLE_READC 0x0200002b +#define SN_SAL_SYSCTL_OP 0x02000030 #define SN_SAL_SYSCTL_MODID_GET 0x02000031 #define SN_SAL_SYSCTL_GET 0x02000032 #define SN_SAL_SYSCTL_IOBRICK_MODULE_GET 0x02000033 @@ -80,6 +81,9 @@ #define SN_SAL_RESERVED_DO_NOT_USE 0x02000062 #define SN_SAL_IOIF_GET_PCI_TOPOLOGY 0x02000064 +#define SN_SAL_GET_PROM_FEATURE_SET 0x02000065 +#define SN_SAL_SET_OS_FEATURE_SET 0x02000066 + /* * Service-specific constants */ @@ -98,6 +102,13 @@ #define SAL_INTR_FREE 2 /* + * operations available on the generic SN_SAL_SYSCTL_OP + * runtime service + */ +#define SAL_SYSCTL_OP_IOBOARD 0x0001 /* retrieve board type */ +#define SAL_SYSCTL_OP_TIO_JLCK_RST 0x0002 /* issue TIO clock reset */ + +/* * IRouter (i.e. generalized system controller) operations */ #define SAL_IROUTER_OPEN 0 /* open a subchannel */ @@ -118,8 +129,8 @@ /* * Error Handling Features */ -#define SAL_ERR_FEAT_MCA_SLV_TO_OS_INIT_SLV 0x1 -#define SAL_ERR_FEAT_LOG_SBES 0x2 +#define SAL_ERR_FEAT_MCA_SLV_TO_OS_INIT_SLV 0x1 // obsolete +#define SAL_ERR_FEAT_LOG_SBES 0x2 // obsolete #define SAL_ERR_FEAT_MFR_OVERRIDE 0x4 #define SAL_ERR_FEAT_SBE_THRESHOLD 0xffff0000 @@ -152,12 +163,6 @@ sn_sal_rev(void) } /* - * Specify the minimum PROM revsion required for this kernel. - * Note that they're stored in hex format... - */ -#define SN_SAL_MIN_VERSION 0x0404 - -/* * Returns the master console nasid, if the call fails, return an illegal * value. */ @@ -336,7 +341,7 @@ ia64_sn_plat_cpei_handler(void) } /* - * Set Error Handling Features + * Set Error Handling Features (Obsolete) */ static inline u64 ia64_sn_plat_set_error_handling_features(void) @@ -876,6 +881,41 @@ ia64_sn_sysctl_event_init(nasid_t nasid) return (int) rv.v0; } +/* + * Ask the system controller on the specified nasid to reset + * the CX corelet clock. Only valid on TIO nodes. + */ +static inline int +ia64_sn_sysctl_tio_clock_reset(nasid_t nasid) +{ + struct ia64_sal_retval rv; + SAL_CALL_REENTRANT(rv, SN_SAL_SYSCTL_OP, SAL_SYSCTL_OP_TIO_JLCK_RST, + nasid, 0, 0, 0, 0, 0); + if (rv.status != 0) + return (int)rv.status; + if (rv.v0 != 0) + return (int)rv.v0; + + return 0; +} + +/* + * Get the associated ioboard type for a given nasid. + */ +static inline int +ia64_sn_sysctl_ioboard_get(nasid_t nasid) +{ + struct ia64_sal_retval rv; + SAL_CALL_REENTRANT(rv, SN_SAL_SYSCTL_OP, SAL_SYSCTL_OP_IOBOARD, + nasid, 0, 0, 0, 0, 0); + if (rv.v0 != 0) + return (int)rv.v0; + if (rv.v1 != 0) + return (int)rv.v1; + + return 0; +} + /** * ia64_sn_get_fit_compt - read a FIT entry from the PROM header * @nasid: NASID of node to read @@ -1052,4 +1092,25 @@ ia64_sn_is_fake_prom(void) return (rv.status == 0); } +static inline int +ia64_sn_get_prom_feature_set(int set, unsigned long *feature_set) +{ + struct ia64_sal_retval rv; + + SAL_CALL_NOLOCK(rv, SN_SAL_GET_PROM_FEATURE_SET, set, 0, 0, 0, 0, 0, 0); + if (rv.status != 0) + return rv.status; + *feature_set = rv.v0; + return 0; +} + +static inline int +ia64_sn_set_os_feature(int feature) +{ + struct ia64_sal_retval rv; + + SAL_CALL_NOLOCK(rv, SN_SAL_SET_OS_FEATURE_SET, feature, 0, 0, 0, 0, 0, 0); + return rv.status; +} + #endif /* _ASM_IA64_SN_SN_SAL_H */ diff -puN include/asm-ia64/sn/tiocx.h~git-ia64 include/asm-ia64/sn/tiocx.h --- devel/include/asm-ia64/sn/tiocx.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/sn/tiocx.h 2005-09-07 19:42:41.000000000 -0700 @@ -19,6 +19,7 @@ struct cx_id_s { struct cx_dev { struct cx_id_s cx_id; + int bt; /* board/blade type */ void *soft; /* driver specific */ struct hubdev_info *hubdev; struct device dev; @@ -59,7 +60,7 @@ struct cx_drv { extern struct sn_irq_info *tiocx_irq_alloc(nasid_t, int, int, nasid_t, int); extern void tiocx_irq_free(struct sn_irq_info *); extern int cx_device_unregister(struct cx_dev *); -extern int cx_device_register(nasid_t, int, int, struct hubdev_info *); +extern int cx_device_register(nasid_t, int, int, struct hubdev_info *, int); extern int cx_driver_unregister(struct cx_drv *); extern int cx_driver_register(struct cx_drv *); extern uint64_t tiocx_dma_addr(uint64_t addr); diff -puN include/asm-ia64/sn/xp.h~git-ia64 include/asm-ia64/sn/xp.h --- devel/include/asm-ia64/sn/xp.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/sn/xp.h 2005-09-07 19:42:41.000000000 -0700 @@ -217,7 +217,15 @@ enum xpc_retval { xpcInvalidPartid, /* 42: invalid partition ID */ xpcLocalPartid, /* 43: local partition ID */ - xpcUnknownReason /* 44: unknown reason -- must be last in list */ + xpcOtherGoingDown, /* 44: other side going down, reason unknown */ + xpcSystemGoingDown, /* 45: system is going down, reason unknown */ + xpcSystemHalt, /* 46: system is being halted */ + xpcSystemReboot, /* 47: system is being rebooted */ + xpcSystemPoweroff, /* 48: system is being powered off */ + + xpcDisconnecting, /* 49: channel disconnecting (closing) */ + + xpcUnknownReason /* 50: unknown reason -- must be last in list */ }; diff -puN include/asm-ia64/thread_info.h~git-ia64 include/asm-ia64/thread_info.h --- devel/include/asm-ia64/thread_info.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/thread_info.h 2005-09-07 19:42:41.000000000 -0700 @@ -76,6 +76,7 @@ struct thread_info { #define TIF_SIGDELAYED 5 /* signal delayed from MCA/INIT/NMI/PMI context */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 +#define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) @@ -85,6 +86,7 @@ struct thread_info { #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SIGDELAYED (1 << TIF_SIGDELAYED) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) +#define _TIF_MCA_INIT (1 << TIF_MCA_INIT) /* "work to do on user-return" bits */ #define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SIGDELAYED) diff -puN include/asm-ia64/unwind.h~git-ia64 include/asm-ia64/unwind.h --- devel/include/asm-ia64/unwind.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/asm-ia64/unwind.h 2005-09-07 19:42:41.000000000 -0700 @@ -114,13 +114,6 @@ extern void unw_remove_unwind_table (voi */ extern void unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t); -/* - * Prepare to unwind from interruption. The pt-regs and switch-stack structures must have - * be "adjacent" (no state modifications between pt-regs and switch-stack). - */ -extern void unw_init_from_interruption (struct unw_frame_info *info, struct task_struct *t, - struct pt_regs *pt, struct switch_stack *sw); - extern void unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct switch_stack *sw); diff -puN include/linux/sched.h~git-ia64 include/linux/sched.h --- devel/include/linux/sched.h~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/include/linux/sched.h 2005-09-07 19:42:41.000000000 -0700 @@ -895,6 +895,8 @@ extern int task_curr(const task_t *p); extern int idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); extern task_t *idle_task(int cpu); +extern task_t *curr_task(int cpu); +extern void set_curr_task(int cpu, task_t *p); void yield(void); diff -puN kernel/sched.c~git-ia64 kernel/sched.c --- devel/kernel/sched.c~git-ia64 2005-09-07 19:42:39.000000000 -0700 +++ devel-akpm/kernel/sched.c 2005-09-07 19:42:41.000000000 -0700 @@ -3472,6 +3472,34 @@ task_t *idle_task(int cpu) } /** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + */ +task_t *curr_task(int cpu) +{ + return cpu_curr(cpu); +} +EXPORT_SYMBOL_GPL(curr_task); + +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with interrupts disabled, the caller must save the original + * value of the current task (see curr_task() above) and restore that value + * before reenabling interrupts. + */ +void set_curr_task(int cpu, task_t *p) +{ + cpu_curr(cpu) = p; +} +EXPORT_SYMBOL_GPL(set_curr_task); + +/** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. */ _