#!/usr/bin/perl
#
# This file is part of Cygwin.
#
# This software is a copyrighted work licensed under the terms of the
# Cygwin license.  Please consult the file "CYGWIN_LICENSE" for
# details.
#
use strict;
use integer;
use Getopt::Long;

# Forward declaration so "cleanup LIST" below parses with its (@) prototype.
sub cleanup(@);

my $cpu;
my $output_def;
# --cpu=<arch> selects the target CPU, --output-def=<file> the .def output.
GetOptions('cpu=s'=>\$cpu, 'output-def=s'=>\$output_def);

# Package-global flag used by fefunc() so the shared assembly preamble is
# emitted only once, before the first thunk.
$main::first = 0;
if (!defined($cpu) || !defined($output_def)) {
    die "$0: missing required option\n";
}

my $is_aarch64 = $cpu eq 'aarch64';
my $is_x86_64 = $cpu eq 'x86_64';
# FIXME? Do other (non-32 bit) arches on Windows still use symbol prefixes?
my $sym_prefix = '';

# Copy input lines up to and including the EXPORTS keyword, cleaned up.
# NOTE: cleanup() modifies its arguments in place via @_ aliasing, so the
# /exports/ match below runs against the *cleaned* $_.
my @top = ();
while (<>) {
    push(@top, cleanup $_);
    last if /^\s*exports$/oi;
}
# Slurp and clean the remaining lines (the export list proper).
my @in = cleanup <>;

my %sigfe = ();		# real function name -> its _sigfe* wrapper name
my @data = ();		# exports marked DATA
my @nosigfuncs = ();	# NOTE(review): appears unused in this file
my @text = ();		# all remaining (text) export lines, rewritten
for (@in) {
    chomp;
    # "name DATA" lines are collected separately and re-emitted with DATA.
    s/\s+DATA$//o and do {
	push @data, $_;
	next;
    };
    if (/=/o) {
	# Alias line: "alias = func [NOSIGFE|SIGFE|SIGFE_MAYBE]".
	if (s/\s+NOSIGFE\s*$//) {
	    # nothing
	} elsif (s/\s+SIGFE(_MAYBE)?$//) {
	    # Third whitespace token of "alias = func" is the real function;
	    # record its wrapper as _sigfe_func or _sigfe_maybe_func.
	    my $func = (split(' '))[2];
	    my $maybe = (defined($1) ? lc $1 : '') . '_';
	    $sigfe{$func} = '_sigfe' . $maybe . $func;
	}
    } else {
	# Plain line: "func [NOSIGFE|SIGFE|SIGFE_MAYBE]"; default is SIGFE.
	my ($func, $sigfe) = m%^\s*(\S+)(?:\s+((?:NO)?SIGFE(?:_MAYBE)?))?$%o;
	if (defined($sigfe) && $sigfe =~ /^NO/o) {
	    $_ = $func;
	} else {
	    # Record the wrapper and export "func = _sigfe[_maybe]_func".
	    $sigfe ||= 'sigfe';
	    $_ = '_' . lc($sigfe) . '_' . $func;
	    $sigfe{$func} = $_;
	    $_ = $func . ' = ' . $_;
	}
    }
    # Normalize whitespace: single spaces between tokens, none at the ends.
    s/(\S)\s+(\S)/$1 $2/go;
    s/(\S)\s+$/$1/o;
    s/^\s+(\S)/$1/o;
    push @text, $_;
}

# Second pass: redirect "alias = func" to func's sigfe wrapper, if any.
for (@text) {
    my ($alias, $func) = /^(\S+)\s+=\s+(\S+)\s*$/o;
    $_ = $alias . ' = ' . $sigfe{$func}
      if defined($func) && $sigfe{$func};
}

# Write the final .def file: the preamble, then DATA exports, then the
# rewritten text exports.  Lexical filehandles and checked closes so that
# buffered write errors are not silently dropped.
open my $def_fh, '>', $output_def
    or die "$0: couldn't open \"$output_def\" - $!\n";
push @top, (map {$_ . " DATA\n"} @data), (map {$_ . "\n"} @text);
print {$def_fh} @top;
close $def_fh
    or die "$0: error writing \"$output_def\" - $!\n";

# Emit one signal front-end thunk per wrapped function into sigfe.s.
open my $sigfe_fh, '>', 'sigfe.s'
    or die "$0: couldn't open 'sigfe.s' file for writing - $!\n";
for my $k (sort keys %sigfe) {
    print {$sigfe_fh} fefunc($k, $sigfe{$k});
}
close $sigfe_fh
    or die "$0: error writing 'sigfe.s' - $!\n";

# Generate the assembly text for one signal front-end thunk.
# Arguments: the real function name and its wrapper name (e.g.
# "_sigfe_foo" or "_sigfe_maybe_foo").  On the very first call the shared
# support code (_sigfe, _sigfe_maybe, _sigbe, sigdelayed,
# stabilize_sig_stack) plus the longjmp family is prepended to the result.
# Fix: removed the unused local "my $extra;".
sub fefunc {
    my $func = $sym_prefix . shift;
    my $fe = $sym_prefix . shift;
    my $sigfe_func;
    if ($is_x86_64 || $is_aarch64) {
	# Strip the trailing "_<func>" to recover the shared entry point
	# the thunk jumps to, i.e. "_sigfe" or "_sigfe_maybe".
	$sigfe_func = ($fe =~ /^(.*)_${func}$/)[0];
    }
    my $res;
    if ($is_x86_64) {
	$res = <<EOF;
	.extern	$func
	.global	$fe
	.seh_proc $fe
$fe:
	leaq	$func(%rip),%r10
	pushq	%r10
	.seh_pushreg %r10
	.seh_endprologue
	jmp	$sigfe_func
	.seh_endproc

EOF
    }
    # TODO: This is only a stub, it needs to be implemented properly for AArch64.
    if ($is_aarch64) {
	$res = <<EOF;
	.extern $func
	.global $fe
	.seh_proc $fe
$fe:
	sub sp, sp, 16			// allocate stack, 16-byte alligned
	.seh_stackalloc 16		// SEH: describe stack allocation
	.seh_endprologue		// end of prologue for unwinder
	adrp x9, $func			// load page address of func
	add x9, x9, :lo12:$func		// compute full address of func
	str x9, [sp, 0]			// store func pointer on stack
	adrp x9, $sigfe_func		// load page address of sigfe_func
	add x9, x9, :lo12:$sigfe_func	// compute final address of sigfe_func
	br x9				// branch to x9
	.seh_endproc
EOF
    }

    # Emit the shared support code exactly once, before the first thunk.
    if (!$main::first++) {
	if ($is_x86_64) {
	  $res = <<EOF . longjmp () . $res;
	.include "tlsoffsets"
	.text

	.seh_proc _sigfe_maybe
_sigfe_maybe:					# stack is aligned on entry!
	.seh_endprologue
	movq	%gs:8,%r10			# location of bottom of stack
	leaq	_cygtls.initialized(%r10),%r11	# where we will be looking
	cmpq	%r11,%rsp			# stack loc > than tls
	jge	0f				# yep.  we don't have a tls.
	movl	_cygtls.initialized(%r10),%r11d
	cmpl	\$0xc763173f,%r11d		# initialized?
	je	1f
0:	ret
	.seh_endproc

	.seh_proc _sigfe
_sigfe:						# stack is aligned on entry!
	.seh_endprologue
	movq	%gs:8,%r10			# location of bottom of stack
1:	movl	\$1,%r11d
	xchgl	%r11d,_cygtls.stacklock(%r10)	# try to acquire lock
	testl	%r11d,%r11d			# it will be zero
	jz	2f				#  if so
	pause
	jmp	1b				# loop
2:	movq	\$8,%rax			# have the lock, now increment the
	xaddq	%rax,_cygtls.stackptr(%r10)	#  stack pointer and get pointer
	leaq	_sigbe(%rip),%r11		# new place to return to
	xchgq	%r11,8(%rsp)			# exchange with real return value
	movq	%r11,(%rax)			# store real return value on alt stack
	incl	_cygtls.incyg(%r10)
	decl	_cygtls.stacklock(%r10)		# release lock
	popq	%rax				# pop real function address from stack
	jmp	*%rax				# and jmp to it
	.seh_endproc

	.global _sigbe
	.seh_proc _sigbe
_sigbe:						# return here after cygwin syscall
						# stack is aligned on entry!
	.seh_endprologue
	movq	%gs:8,%r10			# address of bottom of tls
1:	movl	\$1,%r11d
	xchgl	%r11d,_cygtls.stacklock(%r10)	# try to acquire lock
	testl	%r11d,%r11d			# it will be zero
	jz	2f				#  if so
	pause
	jmp	1b				#  and loop
2:	movq	\$-8,%r11			# now decrement aux stack
	xaddq	%r11,_cygtls.stackptr(%r10)	#  and get pointer
	movq	-8(%r11),%r11			# get return address from signal stack
	decl	_cygtls.incyg(%r10)
	decl	_cygtls.stacklock(%r10)		# release lock
	jmp	*%r11				# "return" to caller
	.seh_endproc

	.global	sigdelayed
	.seh_proc sigdelayed
sigdelayed:
	pushq	%r10				# used for return address injection
	.seh_pushreg %r10
	pushq	%rbp
	.seh_pushreg %rbp
	movq	%rsp,%rbp
	pushf
	.seh_pushreg %rax			# fake, there's no .seh_pushreg for the flags
	cld					# x86_64 ABI requires direction flag cleared
	# stack is aligned or unaligned on entry!
	# make sure it is aligned from here on
	# We could be called from an interrupted thread which doesn't know
	# about his fate, so save and restore everything and the kitchen sink.
	andq	\$0xffffffffffffffc0,%rsp
	.seh_setframe %rbp,0
	pushq	%r15
	.seh_pushreg %r15
	pushq	%r14
	.seh_pushreg %r14
	pushq	%r13
	.seh_pushreg %r13
	pushq	%r12
	.seh_pushreg %r12
	pushq	%r11
	.seh_pushreg %r11
	pushq	%r9
	.seh_pushreg %r9
	pushq	%r8
	.seh_pushreg %r8
	pushq	%rsi
	.seh_pushreg %rsi
	pushq	%rdi
	.seh_pushreg %rdi
	pushq	%rdx
	.seh_pushreg %rdx
	pushq	%rcx
	.seh_pushreg %rcx
	pushq	%rbx
	.seh_pushreg %rbx
	pushq	%rax
	.seh_pushreg %rax

	# +0x20: indicates if xsave is available
	# +0x24: decrement of the stack to allocate space
	# +0x28: %eax returnd by cpuid (0x0d, 0x00)
	# +0x2c: %edx returnd by cpuid (0x0d, 0x00)
	# +0x30: state save area
	movl	\$1,%eax
	cpuid
	andl	\$0x04000000,%ecx # xsave available?
	jnz	1f
	movl	\$0x248,%ebx # 0x18 for alignment, 0x30 for additional space
	subq	%rbx,%rsp
	movl	%ecx,0x20(%rsp)
	movl	%ebx,0x24(%rsp)
	fxsave64 0x30(%rsp) # x86 CPU with 64-bit mode has fxsave64/fxrstor64
	jmp	2f
1:
	movl	\$0x0d,%eax
	xorl	%ecx,%ecx
	cpuid	# get necessary space for xsave
	movq	%rbx,%rcx
	addq	\$0x48,%rbx # 0x18 for alignment, 0x30 for additional space
	subq	%rbx,%rsp
	movl	%ebx,0x24(%rsp)
	xorq	%rax,%rax
	shrq	\$3,%rcx
	leaq	0x30(%rsp),%rdi
	rep	stosq
	xgetbv	# get XCR0 (ecx is 0 after rep)
	movl	%eax,0x28(%rsp)
	movl	%edx,0x2c(%rsp)
	notl	%ecx # set ecx non-zero
	movl	%ecx,0x20(%rsp)
	xsave64	0x30(%rsp)
2:
	.seh_endprologue

	movq	%gs:8,%r12			# get tls
	movl	_cygtls.saved_errno(%r12),%r15d	# temporarily save saved_errno
	movq	\$_cygtls.start_offset,%rcx	# point to beginning of tls block
	addq	%r12,%rcx			#  and store as first arg to method
	call	_ZN7_cygtls19call_signal_handlerEv	# call handler

1:	movl	\$1,%r11d
	xchgl	%r11d,_cygtls.stacklock(%r12)	# try to acquire lock
	testl	%r11d,%r11d			# it will be zero
	jz	2f				#  if so
	pause
	jmp	1b				#  and loop
2:	testl	%r15d,%r15d			# was saved_errno < 0
	jl	3f				# yup.  ignore it
	movq	_cygtls.errno_addr(%r12),%r11
	movl	%r15d,(%r11)
3:	movq	\$-8,%r11			# now decrement aux stack
	xaddq	%r11,_cygtls.stackptr(%r12)	#  and get pointer
	xorq	%r10,%r10
	xchgq	%r10,-8(%r11)			# get return address from signal stack
	xorl	%r11d,%r11d
	movl	%r11d,_cygtls.incyg(%r12)
	movl	%r11d,_cygtls.stacklock(%r12)	# release lock

	movl	0x20(%rsp),%ecx
	testl	%ecx,%ecx # xsave available?
	jnz	1f
	fxrstor64 0x30(%rsp)
	jmp	2f
1:
	movl	0x28(%rsp),%eax
	movl	0x2c(%rsp),%edx
	xrstor64 0x30(%rsp)
2:
	movl	0x24(%rsp),%ebx
	addq	%rbx,%rsp

	popq	%rax
	popq	%rbx
	popq	%rcx
	popq	%rdx
	popq	%rdi
	popq	%rsi
	popq	%r8
	popq	%r9
	popq	%r11
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	movq	%rbp,%rsp
	subq	\$8, %rsp
	popf
	popq	%rbp
	xchgq	%r10,(%rsp)
	ret
	.seh_endproc
_sigdelayed_end:
	.global _sigdelayed_end

	.seh_proc stabilize_sig_stack
stabilize_sig_stack:
	pushq	%r12
	.seh_pushreg %r12
	subq	\$0x20,%rsp
	.seh_stackalloc 32
	.seh_endprologue
	movq	%gs:8,%r12
1:	movl	\$1,%r10d
	xchgl	%r10d,_cygtls.stacklock(%r12)	# try to acquire lock
	testl	%r10d,%r10d
	jz	2f
	pause
	jmp	1b
2:	incl	_cygtls.incyg(%r12)
	cmpl	\$0,_cygtls.current_sig(%r12)
	jz	3f
	decl	_cygtls.stacklock(%r12)		# release lock
	movq	\$_cygtls.start_offset,%rcx	# point to beginning
	addq	%r12,%rcx			#  of tls block
	call	_ZN7_cygtls19call_signal_handlerEv
	decl	_cygtls.incyg(%r12)
	jmp	1b
3:	decl	_cygtls.incyg(%r12)
	addq	\$0x20,%rsp
	movq	%r12,%r11			# return tls addr in r11
	popq	%r12
	ret
	.seh_endproc
EOF
	}
	# TODO: These are only stubs, they need to be implemented properly for AArch64.
	if ($is_aarch64) {
	  $res = <<EOF . longjmp () . $res;
	.include "tlsoffsets"
	.text

	.seh_proc _sigfe_maybe
_sigfe_maybe:					# stack is aligned on entry!
	.seh_endprologue
	ldr     x10, [x18, #0x8]		// Load TEB pointer in x10
	ldr     x11, =_cygtls.initialized	// Load relative offset of _cygtls.initialized
	add     x11, x10, x11                  	// compute absolute address and store in x11
	cmp     sp, x11				// Compare current stack pointer with TLS location
	b.hs    0f                             	// if sp >= tls, skip TLS logic
	ldr     w12, [x11]                    	// Load the value at _cygtls.initialized (32-bit)
	movz    w13, #0xc763			// Prepare magic value(0xc763173f) lower 16 bits
	movk    w13, #0x173f, lsl #16		// Add upper 16 bits, full value now in w13
	cmp     w12, w13			// Compare loaded value with magic
	b.ne    0f                              // If not equal, not initialized, skip TLS logic
	ret
0:
	ret
	.seh_endproc

    .seh_proc _sigfe
_sigfe:
    .seh_endprologue
    ldr     x10, [x18, #0x8]		// Load TLS base into x10
    mov     w9, #1			// constant value for lock acquisition
0:  ldr     x11, =_cygtls.stacklock	// Load offset of stacklock
    add     x12, x10, x11		// Compute final address of stacklock
    ldaxr   w13, [x12]			// Load current stacklock value atomically
    stlxr   w14, w9, [x12]		// Attempt to store 1 to stacklock atomically
    cbnz    w14, 0b			// Retry if atomic store failed
    cbz     w13, 1f			// If lock was free, proceed
    yield
    b       0b				// Retry acquiring the lock
1:
    ldr     x11, =_cygtls.incyg	// Load offset of incyg
    add     x12, x10, x11		// Compute final address of incyg
    ldr     w9, [x12]			// Load current incyg value
    add     w9, w9, #1			// Increment incyg
    str     w9, [x12]			// Store updated incyg value
    mov     x9, #8			// Set stack frame size increment (8 bytes)
2:  ldr     x11, =_cygtls.stackptr	// Load offset of stack pointer
    add     x12, x10, x11		// Compute final address of stack pointer
    ldaxr   x13, [x12]			// Atomically load current stack pointer
    add     x14, x13, x9		// Compute new stack pointer value
    stlxr   w15, x14, [x12]		// Attempt to update stack pointer atomically
    cbnz    w15, 2b			// Retry if atomic update failed
    str     x30, [x13]                 // Save LR(return address) on stack
    adr     x11, _sigbe		// Load address of _sigbe
    mov     x30, x11                   // Set LR = _sigbe
    ldr     x11, =_cygtls.stacklock	// Load offset of stacklock TLS variable
    add     x12, x10, x11		// Compute final address of stacklock
    ldr     w9, [x12]			// Load current stacklock value
    sub     w9, w9, #1			// Decrement stacklock to release lock
    stlr    w9, [x12]			// Store stacklock value (release lock)
    ldr     x9, [sp], #16              // Pop real func address from stack
    br      x9				// Branch to real function
    .seh_endproc

    .global _sigbe
    .seh_proc _sigbe
_sigbe:
    .seh_endprologue
    ldr     x10, [x18, #0x8]		// Load TLS base into x10
    mov     w9, #1			// Constant value 1 for lock acquisition
3:  ldr     x11, =_cygtls.stacklock	// Load offset of stacklock
    add     x12, x10, x11		// Compute final address of stacklock
    ldaxr   w13, [x12]			// Load current stacklock value atomically
    stlxr   w14, w9, [x12]		// Attempt to set stacklock atomically
    cbnz    w14, 3b			// Retry if failed
    cbz     w13, 4f			// If lock was free, continue
    yield
    b       3b				// Retry acquiring the lock
4:
    mov     x9, #-8			// Set stack pointer decrement value
5:  ldr     x11, =_cygtls.stackptr	// Load offset of stack pointer
    add     x12, x10, x11		// Compute final address of stack pointer
    ldaxr   x13, [x12]			// Load current stack pointer atomically
    add     x14, x13, x9		// Compute new stack pointer value
    stlxr   w15, x14, [x12]		// Attempt to update stack pointer atomically
    cbnz    w15, 5b			// Retry if atomic update failed
    sub     x13, x13, #8               // Compute address where LR was saved
    ldr     x30, [x13]                 // Restore saved LR
    ldr     x11, =_cygtls.incyg	// Load offset of incyg
    add     x12, x10, x11		// Compute final address of incyg
    ldr     w9, [x12]			// Load current incyg value
    sub     w9, w9, #1			// Decrement incyg
    str     w9, [x12]			// Store updated incyg value
    ldr     x11, =_cygtls.stacklock	// Load offset of stacklock
    add     x12, x10, x11		// Compute final address of stacklock
    ldr     w9, [x12]			// Load current stacklock value
    sub     w9, w9, #1			// Decrement stacklock (release lock)
    stlr    w9, [x12]			// Store stacklock
    ret				// Return to caller using restored LR
    .seh_endproc

	.global	sigdelayed
	.seh_proc sigdelayed
sigdelayed:
	stp	x0,  x1,  [sp, #-16]!
	stp	x2,  x3,  [sp, #-16]!
	stp	x4,  x5,  [sp, #-16]!
	stp	x6,  x7,  [sp, #-16]!
	stp	x8,  x9,  [sp, #-16]!
	stp	x10, x11, [sp, #-16]!
	stp	x12, x13, [sp, #-16]!
	stp	x14, x15, [sp, #-16]!
	stp	x16, x17, [sp, #-16]!
	stp	x18, x19, [sp, #-16]!
	.seh_stackalloc 160
	.seh_save_reg x19, 152
	stp	x20, x21, [sp, #-16]!
	.seh_save_regp_x x20, 16
	stp	x22, x23, [sp, #-16]!
	.seh_save_regp_x x22, 16
	stp	x24, x25, [sp, #-16]!
	.seh_save_regp_x x24, 16
	stp	x26, x27, [sp, #-16]!
	.seh_save_regp_x x26, 16
	stp	x28, x29, [sp, #-16]!
	.seh_save_regp_x x28, 16

	mov	x1, sp
	str	x1, [sp, #-16]!

	stp	q0,  q1,  [sp, #-32]!
	stp	q2,  q3,  [sp, #-32]!
	stp	q4,  q5,  [sp, #-32]!
	stp	q6,  q7,  [sp, #-32]!
	stp	q8,  q9,  [sp, #-32]!
	stp	q10, q11, [sp, #-32]!
	stp	q12, q13, [sp, #-32]!
	stp	q14, q15, [sp, #-32]!
	stp	q16, q17, [sp, #-32]!
	stp	q18, q19, [sp, #-32]!
	stp	q20, q21, [sp, #-32]!
	stp	q22, q23, [sp, #-32]!
	stp	q24, q25, [sp, #-32]!
	stp	q26, q27, [sp, #-32]!
	stp	q28, q29, [sp, #-32]!
	stp	q30, q31, [sp, #-32]!

	mrs	x1, fpcr
	mrs	x2, fpsr
	stp	x1, x2, [sp, #-16]!

	.seh_stackalloc 544

	.seh_endprologue

	ldr	x12, [x18, #8]			// get TLS pointer
	ldr	x13, =_cygtls.saved_errno	// get offset to saved_errno
	add	x13, x12, x13			// set x13 to &TLS.saved_errno
	ldr	w19, [x13]			// preserve saved_errno in w19

	ldr	x13, =_cygtls.start_offset	// get offset to beginning of TLS block
	add	x0, x12, x13			// store offset as first arg to method
	bl	_ZN7_cygtls19call_signal_handlerEv	// call handler
	ldr	x12, [x18, #8]			// restore clobbered TLS pointer

	mov	w11, #1				// set w11 to 1 (locked)
	ldr	x13, =_cygtls.stacklock		// get offset to stacklock
	add	x13, x12, x13			// set x13 to &TLS.stacklock
1:
	ldaxr	w14, [x13]			// read lock value with acquire
	cbnz	w14, 2f				// wait if already locked
	stxr	w14, w11, [x13]			// attempt to store 1
	cbnz	w14, 1b				// retry if locking not succeeded
	b	3f				// continue to critical region
2:
	yield					// hint to CPU (spin-wait)
	b	1b				// try again

3:
	tst	w19, w19			// was saved_errno < 0
	blt	4f				// if yes, ignore it
	ldr	x13, =_cygtls.errno_addr	// get offset to errno_addr
	add	x13, x12, x13			// set x13 to &TLS.errno_addr
	ldr	x11, [x13]			// set x11 to TLS->errno_addr
	str	w19, [x11]			// store saved_errno to errno_addr

4:
	ldr	x13, =_cygtls.stackptr		// get offset to stackptr
	add	x13, x12, x13			// set x13 to &TLS.stackptr
5:
	ldxr	x11, [x13]			// get aux stack address
	sub	x11, x11, #8			// decrement aux stack address
	stxr	w14, x11, [x13]			// attempt to store decremented value
	cbnz	w14, 5b				// retry if not succeeded

6:
	ldxr	x30, [x11]			// get return address from signal stack
	stxr	w14, xzr, [x11]			// attempt to clear return address
	cbnz	w14, 6b				// retry if not succeeded

	ldr	x13, =_cygtls.incyg		// get offset to incyg
	add	x13, x12, x13			// set x13 to &TLS.incyg
	str	wzr, [x13]			// set TLS.incyg to 0 (not in cygwin)
	ldr	x13, =_cygtls.stacklock		// get offset to stacklock
	add	x13, x12, x13			// set x13 to &TLS.stacklock
	stlr	wzr, [x13]			// release lock

	.seh_startepilogue

	.seh_stackalloc 544

	ldp	x1, x2, [sp], #16
	msr	fpcr, x1
	msr	fpsr, x2

	ldp	q30, q31, [sp], #32
	ldp	q28, q29, [sp], #32
	ldp	q26, q27, [sp], #32
	ldp	q24, q25, [sp], #32
	ldp	q22, q23, [sp], #32
	ldp	q20, q21, [sp], #32
	ldp	q18, q19, [sp], #32
	ldp	q16, q17, [sp], #32
	ldp	q14, q15, [sp], #32
	ldp	q12, q13, [sp], #32
	ldp	q10, q11, [sp], #32
	ldp	q8,  q9,  [sp], #32
	ldp	q6,  q7,  [sp], #32
	ldp	q4,  q5,  [sp], #32
	ldp	q2,  q3,  [sp], #32
	ldp	q0,  q1,  [sp], #32

	ldr	x1, [sp], #16
	mov	sp, x1

	ldp	x28, x29, [sp], #16
	.seh_save_regp_x x28, 16
	ldp	x26, x27, [sp], #16
	.seh_save_regp_x x26, 16
	ldp	x24, x25, [sp], #16
	.seh_save_regp_x x24, 16
	ldp	x22, x23, [sp], #16
	.seh_save_regp_x x22, 16
	ldp	x20, x21, [sp], #16
	.seh_save_regp_x x20, 16
	ldp	x18, x19, [sp], #16
	.seh_save_reg x19, 152
	.seh_stackalloc 160
	ldp	x16, x17, [sp], #16
	ldp	x14, x15, [sp], #16
	ldp	x12, x13, [sp], #16
	ldp	x10, x11, [sp], #16
	ldp	x8,  x9,  [sp], #16
	ldp	x6,  x7,  [sp], #16
	ldp	x4,  x5,  [sp], #16
	ldp	x2,  x3,  [sp], #16
	ldp	x0,  x1,  [sp], #16

	.seh_endepilogue
	ret
	.seh_endproc
_sigdelayed_end:
	.global _sigdelayed_end
	.seh_proc stabilize_sig_stack
stabilize_sig_stack:
	// prologue
	stp	fp, lr, [sp, #-0x10]!		// save FP and LR registers
	.seh_save_fplr_x 0x10
	mov	fp, sp				// set frame pointer for unwinder
	.seh_set_fp
	.seh_endprologue

	ldr	x10, [x18, #0x8]		// load TLS block base pointer into x10

	// try to acquire the lock
	mov	w9, #1				// value to store (1 == locked)
	ldr	x11, =_cygtls.stacklock		// load the symbol offset
	add	x12, x10, x11			// x12 = tls_base + &stacklock
1:
	ldaxr	w13, [x12]			// load old lock value
	cbnz	w13, 2f				// if already locked, wait
	stxr	w14, w9, [x12]			// attempt to acquire the lock
	cbnz	w14, 1b				// if locking failed, retry
	b	3f				// lock acquired, continue

2:
	yield					// yield to allow other threads to run
	b	1b				// retry acquiring the lock

3:
	// lock acquired, increment incyg counter
	ldr	x11, =_cygtls.incyg		// load the symbol offset
	add	x12, x10, x11			// x12 = tls_base + &incyg
	ldr	w9, [x12]			// load current value of incyg
	add	w9, w9, #1			// increment incyg counter
	str	w9, [x12]			// store back incremented value

	// check current_sig
	ldr	x11, =_cygtls.current_sig	// load the symbol offset
	ldr	w9, [x10, x11]			// load current value of current_sig
	cbz	w9, 4f				// if no current signal, jump to cleanup

	// release lock before calling signal handler
	ldr	x11, =_cygtls.stacklock		// load the symbol offset
	add	x12, x10, x11			// x12 = tls_base + &stacklock
	ldr	w9, [x12]			// load current value of stacklock
	sub	w9, w9, #1			// decrement stacklock
	stlr	w9, [x12]			// store with release semantics

	// prepare arg and call handler
	ldr	x0, =_cygtls.start_offset	// load the symbol offset
	add	x0, x10, x0			// x0 = tls_base + &start_offset
	bl	_ZN7_cygtls19call_signal_handlerEv

	// call may clobber x10, restore TLS base
	ldr	x10, [x18, #0x8]		// reload tls_base, use as return value

	// decrement incyg
	ldr	x11, =_cygtls.incyg		// load the symbol offset
	add	x12, x10, x11			// x12 = tls_base + &incyg
	ldr	w9, [x12]			// load current value of incyg
	sub	w9, w9, #1			// decrement incyg counter
	str	w9, [x12]			// store back decremented value

	// loop to handle another signal
	b	1b

4:
	// no signal to handle, decrement incyg counter
	ldr	x11, =_cygtls.incyg		// load the symbol offset
	add	x12, x10, x11			// x12 = tls_base + &incyg
	ldr	w9, [x12]			// load current value of incyg
	sub	w9, w9, #1			// decrement incyg counter
	str	w9, [x12]			// store back decremented value

	// epilogue
	.seh_startepilogue
	ldp	fp, lr, [sp], #0x10		// restore FP and LR registers
	.seh_save_fplr_x 0x10
	.seh_endepilogue
	ret
	.seh_endproc
EOF
	}
    }
    return $res;
}

# Emit the assembly for the setjmp/longjmp family (sigsetjmp, setjmp,
# siglongjmp, longjmp) for the selected CPU.  Returns the assembly text.
# Fix: return an explicit empty string for unsupported CPUs instead of
# falling off the end (which yielded undef/empty list and would warn when
# concatenated by fefunc()).
sub longjmp {
    if ($is_x86_64) {
	return <<EOF;

	.globl	sigsetjmp
	.seh_proc sigsetjmp
sigsetjmp:
	.seh_endprologue
	movl	%edx,0x100(%rcx)		# store savemask
	testl	%edx,%edx			# savemask != 0?
	je	setjmp				# no, skip fetching sigmask
	pushq	%rcx
	subq	\$0x20,%rsp
	leaq	0x108(%rcx),%r8			# &sigjmp_buf.sigmask
	xorq	%rdx,%rdx			# NULL
	xorl	%ecx,%ecx			# SIG_SETMASK
	call	pthread_sigmask
	addq	\$0x20,%rsp
	popq	%rcx
	jmp	setjmp
	.seh_endproc

	.globl  setjmp
	.seh_proc setjmp
setjmp:
	.seh_endprologue
	# We use the Windows jmp_buf layout with two small twists.
	# - we store the tls stackptr in Frame, MSVCRT stores a second copy
	#   of %rbp in Frame (twice? why?)
	# - we just store %rsp as is, MSVCRT stores %rsp of the caller in Rsp
	movq	%rbx,0x8(%rcx)
	movq	%rsp,0x10(%rcx)
	movq	%rbp,0x18(%rcx)
	movq	%rsi,0x20(%rcx)
	movq	%rdi,0x28(%rcx)
	movq	%r12,0x30(%rcx)
	movq	%r13,0x38(%rcx)
	movq	%r14,0x40(%rcx)
	movq	%r15,0x48(%rcx)
	movq	(%rsp),%r10
	movq	%r10,0x50(%rcx)
	stmxcsr	0x58(%rcx)
	fnstcw	0x5c(%rcx)
	# jmp_buf is potentially unaligned!
	movdqu	%xmm6,0x60(%rcx)
	movdqu	%xmm7,0x70(%rcx)
	movdqu	%xmm8,0x80(%rcx)
	movdqu	%xmm9,0x90(%rcx)
	movdqu	%xmm10,0xa0(%rcx)
	movdqu	%xmm11,0xb0(%rcx)
	movdqu	%xmm12,0xc0(%rcx)
	movdqu	%xmm13,0xd0(%rcx)
	movdqu	%xmm14,0xe0(%rcx)
	movdqu	%xmm15,0xf0(%rcx)
	pushq	%rcx
	.seh_pushreg %rcx
	call	stabilize_sig_stack		# returns tls in r11
	popq	%rcx
	movq	_cygtls.stackptr(%r11),%r10
	movq	%r10,(%rcx)
	decl	_cygtls.stacklock(%r11)		# release lock
	xorl	%eax,%eax
	ret
	.seh_endproc

	.globl	siglongjmp
	.seh_proc siglongjmp
siglongjmp:
	pushq	%rcx
	.seh_pushreg %rcx
	.seh_endprologue
	movl	%edx, %r12d
	movl	0x100(%rcx),%r8d		# savemask
	testl	%r8d,%r8d			# savemask != 0?
	je	1f				# no, jmp to longjmp
	xorq	%r8,%r8				# NULL
	leaq    0x108(%rcx),%rdx		# &sigjmp_buf.sigmask
	xorl	%ecx,%ecx			# SIG_SETMASK
	subq	\$0x20,%rsp
	call	pthread_sigmask
	addq	\$0x20,%rsp
	jmp	1f
	.seh_endproc

	.globl  longjmp
	.seh_proc longjmp
longjmp:
	pushq	%rcx
	.seh_pushreg %rcx
	.seh_endprologue
	movl	%edx,%r12d			# save return value
1:
	call	stabilize_sig_stack		# returns tls in r11
	popq	%rcx
	movl	%r12d,%eax			# restore return value
	movq	(%rcx),%r10			# get old signal stack
	movq	%r10,_cygtls.stackptr(%r11)	# restore
	decl	_cygtls.stacklock(%r11)		# release lock
	xorl	%r10d,%r10d
	movl	%r10d,_cygtls.incyg(%r11)		# we're not in cygwin anymore
	movq	0x8(%rcx),%rbx
	movq	0x10(%rcx),%rsp
	movq	0x18(%rcx),%rbp
	movq	0x20(%rcx),%rsi
	movq	0x28(%rcx),%rdi
	movq	0x30(%rcx),%r12
	movq	0x38(%rcx),%r13
	movq	0x40(%rcx),%r14
	movq	0x48(%rcx),%r15
	movq	0x50(%rcx),%r10
	movq	%r10,(%rsp)
	ldmxcsr	0x58(%rcx)
	fnclex
	fldcw	0x5c(%rcx)
	# jmp_buf is potentially unaligned!
	movdqu	0x60(%rcx),%xmm6
	movdqu	0x70(%rcx),%xmm7
	movdqu	0x80(%rcx),%xmm8
	movdqu	0x90(%rcx),%xmm9
	movdqu	0xa0(%rcx),%xmm10
	movdqu	0xb0(%rcx),%xmm11
	movdqu	0xc0(%rcx),%xmm12
	movdqu	0xd0(%rcx),%xmm13
	movdqu	0xe0(%rcx),%xmm14
	movdqu	0xf0(%rcx),%xmm15
	testl	%eax,%eax
	jne	0f
	incl	%eax
0:	ret
	.seh_endproc
EOF
    }
    if ($is_aarch64) {
	# TODO: These are only stubs, they need to be implemented properly for AArch64.
	return <<EOF;
	.globl	sigsetjmp
	.seh_proc sigsetjmp
sigsetjmp:
	// prologue
	stp		fp, lr, [sp, #-0x10]!	// save FP and LR registers
	mov		fp, sp			// set FP to current SP
	.seh_endprologue
	str	w1, [x0, #0x100]		// buf->savemask = savemask
	cbz     w1, 1f				// If savemask == 0, skip fetching sigmask
	mov     x3, x0                        	// save buf in x3
	sub     sp, sp, #32			// Allocate 32 bytes on stack call
	mov     x0, #0                         	// SIG_SETMASK
	mov     x1, xzr                        	// newmask = NULL
	add     x2, x3, #0x108                 	// &buf->sigmask
	bl      pthread_sigmask
	add     sp, sp, #32
1:
	bl	setjmp
	// epilogue
	ldp	fp, lr, [sp], #0x10		// restore saved FP and LR registers
	ret
	.seh_endproc

	.globl  setjmp
	.seh_proc setjmp
setjmp:
	// prologue
	stp	fp, lr, [sp, #-0x10]!		// save FP and LR registers
	.seh_save_fplr_x 0x10
	mov	fp, sp				// set frame pointer for unwinder
	.seh_set_fp
	.seh_endprologue

	// save callee-saved registers from jump buffer
	stp	x19, x20, [x0, #0x08]		// save x19, x20
	stp	x21, x22, [x0, #0x18]		// save x21, x22
	stp	x23, x24, [x0, #0x28]		// save x23, x24
	stp	x25, x26, [x0, #0x38]		// save x25, x26
	stp	x27, x28, [x0, #0x48]		// save x27, x28
	stp	x29, x30, [x0, #0x58]		// save x29 (FP) and x30 (LR)

	add	x1, sp, #0x10			// get the old stack pointer
	str	x1, [x0, #0x68]			// save SP

	mrs	x1, fpcr			// get fp control register
	str	x1, [x0, #0x70]			// save FPCR
	mrs	x1, fpsr			// get fp status register
	str	x1, [x0, #0x78]			// save FPSR

	// save fp registers (d8-d15)
	stp	d8,  d9,  [x0, #0x80]		// save d8, d9
	stp	d10, d11, [x0, #0x90]		// save d10, d11
	stp	d12, d13, [x0, #0xA0]		// save d12, d13
	stp	d14, d15, [x0, #0xB0]		// save d14, d15

	// save TLS stack pointer
	ldr	x1, [sp, #0x10]			// get the TLS stack pointer
	str	x1, [x0, #0xB8]			// save TLS stack pointer

	str	x0, [sp, #-0x10]!		// save jmp_buf before call
	bl	stabilize_sig_stack		// call stabilize_sig_stack (returns TLS in x10)
	ldr	x0, [sp], #0x10			// restore jmp_buf after call

	// store the stack pointer to jump_buf
	ldr	x2, =_cygtls.stackptr		// load the symbol offset
	add	x2, x10, x2			// x2 = tls_base + &stackptr
	ldr	x3, [x2]			// load current value of stackptr
	str	x3, [x0]			// store stackptr into jmp_buf

	// decrement the stack lock
	ldr	x2, =_cygtls.stacklock		// load the symbol offset
	add	x2, x10, x2			// x2 = tls_base + &stacklock
	ldr	w3, [x2]			// load current stacklock value
	sub	w3, w3, #1			// decrement
	str	w3, [x2]			// store back

	mov	w0, wzr				// return 0

	// epilogue
	.seh_startepilogue
	ldp	fp, lr, [sp], #0x10		// restore saved FP and LR registers
	.seh_save_fplr_x 0x10
	.seh_endepilogue
	ret
	.seh_endproc

	.globl	siglongjmp
	.seh_proc siglongjmp
siglongjmp:
	// prologue
	stp	fp, lr, [sp, #-0x10]!		// save FP and LR registers
	mov	fp, sp				// set FP to current SP
	.seh_endprologue
	mov x19, x1				// save val
	mov x20, x0				// save buf
	ldr     w8, [x20, #0x100]       	// w8 = buf->savemask
	cbz     w8, 1f                  	// if savemask == 0, skip
	sub	sp, sp, #32			// allocate 32 bytes on stack
	mov     x0, #0                  	// SIG_SETMASK
	mov     x1, xzr                 	// newmask = NULL
	add     x2, x20, #0x108         	// &buf->sigmask
	bl      pthread_sigmask

	add     sp, sp, #32			// call frame
1:
	mov	x0, x20				//buf
	mov	x1, x19				//val
	bl longjmp

	// epilogue
	ldp	fp, lr, [sp], #0x10		// restore saved FP and LR registers
	ret
	.seh_endproc

	.globl  longjmp
	.seh_proc longjmp
longjmp:
	// prologue
	.seh_endprologue
1:
	stp	x0, x1, [sp, #-0x10]!		// save function args (jmp_buf and return value)
	bl	stabilize_sig_stack		// call stabilize_sig_stack (returns TLS in x10)
	ldp	x2, x3, [sp], #0x10		// restore jmp_buf and return value to x2 and x3
	ldr	x9, [x2]			// get old signal stack from jump buffer

	// restore stack pointer in TLS
	ldr	x11, =_cygtls.stackptr		// load the symbol offset
	add	x11, x10, x11			// x11 = tls_base + &stackptr
	str	x9, [x11]			// store signal stack into stackptr

	// release lock by decrementing counter
	ldr	x11, =_cygtls.stacklock		// load the symbol offset
	add	x11, x10, x11			// x11 = tls_base + &stacklock
	ldr	w12, [x11]			// load current stacklock value
	sub	w12, w12, #1			// decrement
	str	w12, [x11]			// store back

	// we're not in cygwin anymore, clear "in cygwin" flag
	ldr	x11, =_cygtls.incyg		// load the symbol offset
	add	x11, x10, x11			// x11 = tls_base + &incyg
	mov	w12, wzr			// clear the incyg counter
	str	w12, [x11]			// store back the new value

	// use second argument as the return value
	mov	x0, x3				// move saved return value to x0

	// restore callee-saved registers from jump buffer
	ldp	x19, x20, [x2, #0x08]		// restore x19, x20
	ldp	x21, x22, [x2, #0x18]		// restore x21, x22
	ldp	x23, x24, [x2, #0x28]		// restore x23, x24
	ldp	x25, x26, [x2, #0x38]		// restore x25, x26
	ldp	x27, x28, [x2, #0x48]		// restore x27, x28
	ldp	x29, x30, [x2, #0x58]		// restore x29 (FP) and x30 (LR)
	ldr	x10, [x2, #0x68]		// get saved stack pointer
	mov	sp, x10				// restore stack pointer
	ldr	x10, [x2, #0x70]		// load floating-point control register
	msr	fpcr, x10			// restore FPCR
	ldr	x10, [x2, #0x78]		// load floating-point status register
	msr	fpsr, x10			// restore FPSR

	// restore floating-point registers (d8-d15)
	ldp	d8, d9, [x2, #0x80]		// restore d8, d9
	ldp	d10, d11, [x2, #0x90]		// restore d10, d11
	ldp	d12, d13, [x2, #0xA0]		// restore d12, d13
	ldp	d14, d15, [x2, #0xB0]		// restore d14, d15

	// restore TLS stack pointer
	ldr	x1, [x2, #0xB8]			// get the saved TLS stack pointer
	str	x1, [sp]			// restore TLS stack pointer

	// ensure return value is non-zero (C standard requirement)
	cbnz	x0, 0f				// if return value non-zero, return
	mov	x0, #1				// set return value to 1
0:
	// epilogue
	.seh_startepilogue
	.seh_endepilogue
	ret
	.seh_endproc
EOF
    }
    # Unknown CPU: return an empty string so callers can concatenate safely.
    return '';
}

# Normalize a list of input lines: strip carriage returns, remove "#"
# comments, and drop trailing whitespace (including the newline it
# precedes).  Returns only the lines that are still true (non-empty)
# afterwards.  NOTE: the substitutions act on the caller's values via
# @_ aliasing on purpose -- callers rely on $_ being cleaned in place.
sub cleanup(@) {
    foreach my $line (@_) {
	$line =~ s/\r//g;
	$line =~ s/#.*//g;
	$line =~ s/\s+\n//sg;
    }
    return grep { !/^$/ && $_ } @_;
}
