ARM64 assembly pack: make it Windows-friendly.

"Windows friendliness" means a) unified PIC-ification, unified across
all platforms; b) unified commantary delimiter; c) explicit ldur/stur,
as Visual Studio assembler can't automatically encode ldr/str as
ldur/stur when needed.
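
[Editor's note: the PIC-ification point is easiest to see in the armcap
probes. Every module used to load OPENSSL_armcap_P through a literal pool,
with an __ILP32__ fork on pointer size; each now uses a single adrp/:lo12:
sequence that is position-independent on all platforms. A representative
before/after, condensed from the sha1 diff below:

	// before: literal pool holding OPENSSL_armcap_P-.
	#ifdef	__ILP32__
	ldrsw	x16,.LOPENSSL_armcap_P
	#else
	ldr	x16,.LOPENSSL_armcap_P
	#endif
	adr	x17,.LOPENSSL_armcap_P
	add	x16,x16,x17
	ldr	w16,[x16]

	// after: one PC-relative sequence, PIC everywhere
	adrp	x16,OPENSSL_armcap_P
	ldr	w16,[x16,#:lo12:OPENSSL_armcap_P]
]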

Reviewed-by: Paul Dale <paul.dale@oracle.com>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8256)
Andy Polyakov authored on 2019-02-15 22:16:41 +01:00; committed by Richard Levitte
parent 3405db97e5
commit db42bb440e
8 changed files with 191 additions and 220 deletions

crypto/aes/asm/vpaes-armv8.pl

@@ -150,12 +150,12 @@ my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
$code.=<<___;
##
## _aes_preheat
##
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
##
//
// _aes_preheat
//
// Fills register %r10 -> .aes_consts (so you can -fPIC)
// and %xmm9-%xmm15 as specified below.
//
.type _vpaes_encrypt_preheat,%function
.align 4
_vpaes_encrypt_preheat:
@@ -167,21 +167,21 @@ _vpaes_encrypt_preheat:
ret
.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm9-%xmm15 as in _vpaes_preheat
## (%rdx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
## Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
//
// _aes_encrypt_core
//
// AES-encrypt %xmm0.
//
// Inputs:
// %xmm0 = input
// %xmm9-%xmm15 as in _vpaes_preheat
// (%rdx) = scheduled keys
//
// Output in %xmm0
// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
// Preserves %xmm6 - %xmm8 so you get some local vectors
//
//
.type _vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
@@ -387,11 +387,11 @@ _vpaes_decrypt_preheat:
ret
.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
##
## Decryption core
##
## Same API as encryption core.
##
//
// Decryption core
//
// Same API as encryption core.
//
.type _vpaes_decrypt_core,%function
.align 4
_vpaes_decrypt_core:
@@ -643,11 +643,11 @@ my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
$code.=<<___;
########################################################
## ##
## AES key schedule ##
## ##
########################################################
////////////////////////////////////////////////////////
// //
// AES key schedule //
// //
////////////////////////////////////////////////////////
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
@@ -703,14 +703,14 @@ _vpaes_schedule_core:
b.eq .Lschedule_192
// 128: fall though
##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
//
// .schedule_128
//
// 128-bit specific part of key schedule.
//
// This schedule is really simple, because all its parts
// are accomplished by the subroutines.
//
.Lschedule_128:
mov $inp, #10 // mov \$10, %esi
@@ -721,21 +721,21 @@ _vpaes_schedule_core:
bl _vpaes_schedule_mangle // write output
b .Loop_schedule_128
##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
//
// .aes_schedule_192
//
// 192-bit specific part of key schedule.
//
// The main body of this schedule is the same as the 128-bit
// schedule, but with more smearing. The long, high side is
// stored in %xmm7 as before, and the short, low side is in
// the high bits of %xmm6.
//
// This schedule is somewhat nastier, however, because each
// round produces 192 bits of key material, or 1.5 round keys.
// Therefore, on each cycle we do 2 rounds and produce 3 round
// keys.
//
.align 4
.Lschedule_192:
sub $inp, $inp, #8
@@ -759,16 +759,16 @@ _vpaes_schedule_core:
bl _vpaes_schedule_192_smear
b .Loop_schedule_192
##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
//
// .aes_schedule_256
//
// 256-bit specific part of key schedule.
//
// The structure here is very similar to the 128-bit
// schedule, but with an additional "low side" in
// %xmm6. The low side's rounds are the same as the
// high side's, except no rcon and no rotation.
//
.align 4
.Lschedule_256:
ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
@@ -795,16 +795,16 @@ _vpaes_schedule_core:
b .Loop_schedule_256
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
//
// .aes_schedule_mangle_last
//
// Mangler for last round of key schedule
// Mangles %xmm0
// when encrypting, outputs out(%xmm0) ^ 63
// when decrypting, outputs unskew(%xmm0)
//
// Always called right before return... jumps to cleanup and exits
//
.align 4
.Lschedule_mangle_last:
// schedule last round key from xmm0
@@ -838,20 +838,20 @@ _vpaes_schedule_core:
ret
.size _vpaes_schedule_core,.-_vpaes_schedule_core
##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm13: 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##
//
// .aes_schedule_192_smear
//
// Smear the short, low side in the 192-bit key schedule.
//
// Inputs:
// %xmm7: high side, b a x y
// %xmm6: low side, d c 0 0
// %xmm13: 0
//
// Outputs:
// %xmm6: b+c+d b+c 0 0
// %xmm0: b+c+d b+c b a
//
.type _vpaes_schedule_192_smear,%function
.align 4
_vpaes_schedule_192_smear:
@@ -867,24 +867,24 @@ _vpaes_schedule_192_smear:
ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
##
//
// .aes_schedule_round
//
// Runs one main round of the key schedule on %xmm0, %xmm7
//
// Specifically, runs subbytes on the high dword of %xmm0
// then rotates it by one byte and xors into the low dword of
// %xmm7.
//
// Adds rcon from low byte of %xmm8, then rotates %xmm8 for
// next rcon.
//
// Smears the dwords of %xmm7 by xoring the low into the
// second low, result into third, result into highest.
//
// Returns results in %xmm7 = %xmm0.
// Clobbers %xmm1-%xmm4, %r11.
//
.type _vpaes_schedule_round,%function
.align 4
_vpaes_schedule_round:
@@ -932,15 +932,15 @@ _vpaes_schedule_low_round:
ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11)
##
## Requires that %xmm9 = 0x0F0F... as in preheat
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
//
// .aes_schedule_transform
//
// Linear-transform %xmm0 according to tables at (%r11)
//
// Requires that %xmm9 = 0x0F0F... as in preheat
// Output in %xmm0
// Clobbers %xmm1, %xmm2
//
.type _vpaes_schedule_transform,%function
.align 4
_vpaes_schedule_transform:
@@ -954,29 +954,29 @@ _vpaes_schedule_transform:
ret
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Preserves xmm0
## Clobbers xmm1-xmm5
##
//
// .aes_schedule_mangle
//
// Mangle xmm0 from (basis-transformed) standard version
// to our version.
//
// On encrypt,
// xor with 0x63
// multiply by circulant 0,1,1,1
// apply shiftrows transform
//
// On decrypt,
// xor with 0x63
// multiply by "inverse mixcolumns" circulant E,B,D,9
// deskew
// apply shiftrows transform
//
//
// Writes out to (%rdx), and increments or decrements it
// Keeps track of round number mod 4 in %r8
// Preserves xmm0
// Clobbers xmm1-xmm5
//
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:

crypto/bn/asm/armv8-mont.pl

@@ -197,7 +197,7 @@ bn_mul_mont:
mul $nlo,$nj,$m1 // np[j]*m1
adds $lo1,$lo1,$lo0
umulh $nhi,$nj,$m1
str $lo1,[$tp,#-16] // tp[j-1]
stur $lo1,[$tp,#-16] // tp[j-1]
cbnz $j,.Linner
.Linner_skip:
@@ -253,13 +253,13 @@ bn_mul_mont:
csel $nj,$tj,$aj,lo // did it borrow?
ldr $tj,[$tp],#8
ldr $aj,[$rp],#8
str xzr,[$tp,#-16] // wipe tp
str $nj,[$rp,#-16]
stur xzr,[$tp,#-16] // wipe tp
stur $nj,[$rp,#-16]
cbnz $num,.Lcond_copy
csel $nj,$tj,$aj,lo
str xzr,[$tp,#-8] // wipe tp
str $nj,[$rp,#-8]
stur xzr,[$tp,#-8] // wipe tp
stur $nj,[$rp,#-8]
ldp x19,x20,[x29,#16]
mov sp,x29
@@ -596,7 +596,7 @@ __bn_sqr8x_mont:
ldp $a4,$a5,[$tp,#8*4]
ldp $a6,$a7,[$tp,#8*6]
adds $acc0,$acc0,$a0
ldr $n0,[$rp,#-8*8]
ldur $n0,[$rp,#-8*8]
adcs $acc1,$acc1,$a1
ldp $a0,$a1,[$ap,#8*0]
adcs $acc2,$acc2,$a2
@@ -794,7 +794,7 @@ $code.=<<___;
//adc $carry,xzr,xzr // moved below
cbz $cnt,.Lsqr8x8_post_condition
ldr $n0,[$tp,#-8*8]
ldur $n0,[$tp,#-8*8]
ldp $a0,$a1,[$np,#8*0]
ldp $a2,$a3,[$np,#8*2]
ldp $a4,$a5,[$np,#8*4]
@@ -852,7 +852,7 @@ $code.=<<___;
ldp $a6,$a7,[$tp,#8*6]
cbz $cnt,.Lsqr8x_tail_break
ldr $n0,[$rp,#-8*8]
ldur $n0,[$rp,#-8*8]
adds $acc0,$acc0,$a0
adcs $acc1,$acc1,$a1
ldp $a0,$a1,[$np,#8*0]
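
[Editor's note on the str/ldr → stur/ldur changes in this file (and in
ecp_nistz256 below): the scaled, unsigned-immediate form of ldr/str cannot
encode a negative offset, so GNU as silently substitutes the unscaled
ldur/stur encoding; armasm64 performs no such substitution, hence the diff
spells the encoding out. A minimal illustration, assuming GNU syntax:

	str	x4,[x1,#16]	// positive multiple-of-8 offset: 'str' encodes directly
	stur	x4,[x1,#-16]	// negative offset: only the 9-bit unscaled form fits
]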

crypto/chacha/asm/chacha-armv8.pl

@@ -131,12 +131,6 @@ $code.=<<___;
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lone:
.long 1,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.globl ChaCha20_ctr32
@@ -144,17 +138,13 @@ $code.=<<___;
.align 5
ChaCha20_ctr32:
cbz $len,.Labort
adr @x[0],.LOPENSSL_armcap_P
cmp $len,#192
b.lo .Lshort
#ifdef __ILP32__
ldrsw @x[1],[@x[0]]
#else
ldr @x[1],[@x[0]]
#endif
ldr w17,[@x[1],@x[0]]
adrp x17,OPENSSL_armcap_P
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
tst w17,#ARMV7_NEON
b.ne ChaCha20_neon
b.ne .LChaCha20_neon
.Lshort:
.inst 0xd503233f // paciasp
@@ -380,6 +370,7 @@ $code.=<<___;
.type ChaCha20_neon,%function
.align 5
ChaCha20_neon:
.LChaCha20_neon:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-96]!
add x29,sp,#0
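
[Editor's note: besides the adrp change, ChaCha20_ctr32 now branches to a
new local alias .LChaCha20_neon rather than to the global ChaCha20_neon,
presumably so the intra-module reference stays a plain PC-relative branch
regardless of how the global symbol is bound. The pattern, condensed:

	.globl	ChaCha20_neon
	.type	ChaCha20_neon,%function
	ChaCha20_neon:
	.LChaCha20_neon:		// local alias for in-module references
	...
	b.ne	.LChaCha20_neon	// was: b.ne ChaCha20_neon
]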

crypto/ec/asm/ecp_nistz256-armv8.pl

@@ -1654,7 +1654,7 @@ ecp_nistz256_scatter_w5:
ldp x4,x5,[$inp] // X
ldp x6,x7,[$inp,#16]
str w4,[$out,#64*0-4]
stur w4,[$out,#64*0-4]
lsr x4,x4,#32
str w5,[$out,#64*1-4]
lsr x5,x5,#32
@@ -1670,7 +1670,7 @@ ecp_nistz256_scatter_w5:
ldp x4,x5,[$inp,#32] // Y
ldp x6,x7,[$inp,#48]
str w4,[$out,#64*0-4]
stur w4,[$out,#64*0-4]
lsr x4,x4,#32
str w5,[$out,#64*1-4]
lsr x5,x5,#32
@@ -1686,7 +1686,7 @@ ecp_nistz256_scatter_w5:
ldp x4,x5,[$inp,#64] // Z
ldp x6,x7,[$inp,#80]
str w4,[$out,#64*0-4]
stur w4,[$out,#64*0-4]
lsr x4,x4,#32
str w5,[$out,#64*1-4]
lsr x5,x5,#32

crypto/perlasm/arm-xlate.pl

@@ -103,6 +103,12 @@ my $asciz = sub {
{ ""; }
};
my $adrp = sub {
my ($args,$comment) = split(m|\s*//|,shift);
"\tadrp\t$args\@PAGE";
} if ($flavour =~ /ios64/);
sub range {
my ($r,$sfx,$start,$end) = @_;
@@ -132,6 +138,10 @@ sub expand_line {
$line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;
if ($flavour =~ /ios64/) {
$line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/;
}
return $line;
}
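
[Editor's note: with these two hooks the translator keeps accepting the
unified source form on iOS as well — the new $adrp closure rewrites the
mnemonic and expand_line rewrites the :lo12: operand. Hypothetical
input/output for the ios64 flavour, ignoring the Mach-O underscore
prefixing arm-xlate applies to global names:

	// unified source
	adrp	x16,OPENSSL_armcap_P
	ldr	w16,[x16,#:lo12:OPENSSL_armcap_P]

	// emitted for ios64
	adrp	x16,OPENSSL_armcap_P@PAGE
	ldr	w16,[x16,OPENSSL_armcap_P@PAGEOFF]
]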

crypto/poly1305/asm/poly1305-armv8.pl

@@ -71,17 +71,12 @@ poly1305_init:
csel x0,xzr,x0,eq
b.eq .Lno_key
#ifdef __ILP32__
ldrsw $t1,.LOPENSSL_armcap_P
#else
ldr $t1,.LOPENSSL_armcap_P
#endif
adr $t0,.LOPENSSL_armcap_P
adrp x17,OPENSSL_armcap_P
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
ldp $r0,$r1,[$inp] // load key
mov $s1,#0xfffffffc0fffffff
movk $s1,#0x0fff,lsl#48
ldr w17,[$t0,$t1]
#ifdef __ARMEB__
rev $r0,$r0 // flip bytes
rev $r1,$r1
@@ -93,10 +88,10 @@ poly1305_init:
tst w17,#ARMV7_NEON
adr $d0,poly1305_blocks
adr $r0,poly1305_blocks_neon
adr $d1,poly1305_emit
adr $r1,poly1305_emit_neon
adr $d0,.Lpoly1305_blocks
adr $r0,.Lpoly1305_blocks_neon
adr $d1,.Lpoly1305_emit
adr $r1,.Lpoly1305_emit_neon
csel $d0,$d0,$r0,eq
csel $d1,$d1,$r1,eq
@@ -115,6 +110,7 @@ poly1305_init:
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
ands $len,$len,#-16
b.eq .Lno_data
@@ -179,6 +175,7 @@ poly1305_blocks:
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
ldp $h0,$h1,[$ctx] // load hash base 2^64
ldr $h2,[$ctx,#16]
ldp $t0,$t1,[$nonce] // load nonce
@@ -285,10 +282,11 @@ poly1305_splat:
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
ldr $is_base2_26,[$ctx,#24]
cmp $len,#128
b.hs .Lblocks_neon
cbz $is_base2_26,poly1305_blocks
cbz $is_base2_26,.Lpoly1305_blocks
.Lblocks_neon:
.inst 0xd503233f // paciasp
@@ -431,7 +429,7 @@ poly1305_blocks_neon:
csel $in2,$zeros,$in2,lo
mov x4,#1
str x4,[$ctx,#-24] // set is_base2_26
stur x4,[$ctx,#-24] // set is_base2_26
sub $ctx,$ctx,#48 // restore original $ctx
b .Ldo_neon
@@ -868,6 +866,7 @@ poly1305_blocks_neon:
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
ldr $is_base2_26,[$ctx,#24]
cbz $is_base2_26,poly1305_emit
@@ -920,12 +919,6 @@ poly1305_emit_neon:
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
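
[Editor's note: the .Lpoly1305_* labels serve the same purpose as
.LChaCha20_neon above. poly1305_init materializes function pointers with
adr, and taking the addresses of local aliases keeps those references
module-local. The selection pattern, condensed from the diff:

	adr	$d0,.Lpoly1305_blocks		// scalar implementation
	adr	$r0,.Lpoly1305_blocks_neon	// NEON implementation
	csel	$d0,$d0,$r0,eq			// pick by the ARMV7_NEON test above
]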

crypto/sha/asm/sha1-armv8.pl

@@ -58,10 +58,10 @@ $code.=<<___ if ($i<15 && !($i&1));
lsr @Xx[$i+1],@Xx[$i],#32
___
$code.=<<___ if ($i<14 && !($i&1));
ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
ldur @Xx[$i+2],[$inp,#`($i+2)*4-64`]
___
$code.=<<___ if ($i<14 && ($i&1));
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror @Xx[$i+1],@Xx[$i+1],#32
#else
rev32 @Xx[$i+1],@Xx[$i+1]
@@ -171,23 +171,19 @@ ___
}
$code.=<<___;
#include "arm_arch.h"
#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armcap_P
#endif
.text
.extern OPENSSL_armcap_P
.globl sha1_block_data_order
.type sha1_block_data_order,%function
.align 6
sha1_block_data_order:
#ifdef __ILP32__
ldrsw x16,.LOPENSSL_armcap_P
#else
ldr x16,.LOPENSSL_armcap_P
#endif
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
adrp x16,OPENSSL_armcap_P
ldr w16,[x16,#:lo12:OPENSSL_armcap_P]
tst w16,#ARMV8_SHA1
b.ne .Lv8_entry
@@ -208,7 +204,7 @@ sha1_block_data_order:
movz $K,#0x7999
sub $num,$num,#1
movk $K,#0x5a82,lsl#16
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror @Xx[0],@Xx[0],#32
#else
rev32 @Xx[0],@Xx[0]
@@ -321,15 +317,11 @@ $code.=<<___;
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm OPENSSL_armcap_P,4,4
#endif
___
}}}
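
[Editor's note: two smaller fixes ride along in the SHA files. The
big-endian guards now test __AARCH64EB__, the predefine AArch64 compilers
actually emit (__ARMEB__ is the 32-bit one), and the .comm fallback
definition of OPENSSL_armcap_P is additionally suppressed under _WIN64,
presumably because the variable is provided by the C side there and COFF
has no direct .comm equivalent. The endian guard in context:

	#ifdef	__AARCH64EB__
	ror	@Xx[$i+1],@Xx[$i+1],#32	// halves already big-endian; just swap them
	#else
	rev32	@Xx[$i+1],@Xx[$i+1]	// byte-swap each 32-bit half
	#endif
]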

crypto/sha/asm/sha512-armv8.pl

@@ -188,24 +188,18 @@ ___
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armcap_P
#endif
.text
.extern OPENSSL_armcap_P
.globl $func
.type $func,%function
.align 6
$func:
#ifndef __KERNEL__
# ifdef __ILP32__
ldrsw x16,.LOPENSSL_armcap_P
# else
ldr x16,.LOPENSSL_armcap_P
# endif
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
adrp x16,OPENSSL_armcap_P
ldr w16,[x16,#:lo12:OPENSSL_armcap_P]
___
$code.=<<___ if ($SZ==4);
tst w16,#ARMV8_SHA256
@@ -353,15 +347,6 @@ $code.=<<___ if ($SZ==4);
___
$code.=<<___;
.size .LK$BITS,.-.LK$BITS
#ifndef __KERNEL__
.align 3
.LOPENSSL_armcap_P:
# ifdef __ILP32__
.long OPENSSL_armcap_P-.
# else
.quad OPENSSL_armcap_P-.
# endif
#endif
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
@@ -841,7 +826,7 @@ ___
}
$code.=<<___;
#ifndef __KERNEL__
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm OPENSSL_armcap_P,4,4
#endif
___