
Make it able to run asm code on OpenBSD (arm64)

In order to get asm code running on OpenBSD, we must place
all constants into .rodata sections.

The change to crypto/perlasm/arm-xlate.pl adapts Theo's
changes to the additional assembler variants/flavours we
use for building OpenSSL.
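
As an illustrative sketch (not code from this patch; the label .Lconst, the
register x10 and the function name demo_load are placeholders), the pattern
applied throughout the ARMv8 modules looks like the following. Presumably the
constraint is that OpenBSD maps the text segment execute-only on arm64, so
constants kept in .text cannot be read as data; once they live in .rodata they
can also sit outside the +/-1 MB reach of a single adr, hence the adrp/add
:lo12: pair, which covers a +/-4 GB range:

    // before: the constant pool lives in .text and is reached with adr
        .text
        .align  5
    .Lconst:
        .long   0x01,0x01,0x01,0x01
        .globl  demo_load
        .type   demo_load,%function
    demo_load:
        adr     x10,.Lconst              // PC-relative, +/-1 MB, same segment
        ldr     w0,[x10]                 // faults if .text is execute-only
        ret

    // after: the constant pool moves to .rodata and is reached page-wise
        .text
        .rodata
        .align  5
    .Lconst:
        .long   0x01,0x01,0x01,0x01
        .previous                        // back to the previously active .text
        .globl  demo_load
        .type   demo_load,%function
    demo_load:
        adrp    x10,.Lconst              // 4 KB page containing .Lconst
        add     x10,x10,:lo12:.Lconst    // add the low 12 address bits
        ldr     w0,[x10]                 // plain data load from .rodata
        ret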

Fixes 

Reviewed-by: Hugo Landau <hlandau@devever.net>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/24137)
Theo Buehler 2024-04-15 01:23:36 +02:00 committed by Tomas Mraz
parent 3b7bd871c1
commit c6e65c1f8e
12 changed files with 164 additions and 73 deletions

@@ -107,12 +107,13 @@ my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$code.=<<___;
.rodata
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.previous
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
@@ -139,7 +140,8 @@ $code.=<<___;
tst $bits,#0x3f
b.ne .Lenc_key_abort
adr $ptr,.Lrcon
adrp $ptr,.Lrcon
add $ptr,$ptr,:lo12:.Lrcon
cmp $bits,#192
veor $zero,$zero,$zero

@@ -55,7 +55,7 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\""
$code.=<<___;
#include "arm_arch.h"
.text
.rodata
.type _vpaes_consts,%object
.align 7 // totally strategic alignment
@@ -146,6 +146,9 @@ _vpaes_consts:
.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
.size _vpaes_consts,.-_vpaes_consts
.align 6
.text
___
{
@@ -165,7 +168,8 @@ $code.=<<___;
.type _vpaes_encrypt_preheat,%function
.align 4
_vpaes_encrypt_preheat:
adr x10, .Lk_inv
adrp x10, .Lk_inv
add x10, x10, :lo12:.Lk_inv
movi v17.16b, #0x0f
ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
@@ -193,7 +197,8 @@ _vpaes_encrypt_preheat:
_vpaes_encrypt_core:
mov x9, $key
ldr w8, [$key,#240] // pull rounds
adr x11, .Lk_mc_forward+16
adrp x11, .Lk_mc_forward+16
add x11, x11, :lo12:.Lk_mc_forward+16
// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
@@ -280,7 +285,8 @@ vpaes_encrypt:
_vpaes_encrypt_2x:
mov x9, $key
ldr w8, [$key,#240] // pull rounds
adr x11, .Lk_mc_forward+16
adrp x11, .Lk_mc_forward+16
add x11, x11, :lo12:.Lk_mc_forward+16
// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
@@ -383,9 +389,11 @@ _vpaes_encrypt_2x:
.type _vpaes_decrypt_preheat,%function
.align 4
_vpaes_decrypt_preheat:
adr x10, .Lk_inv
adrp x10, .Lk_inv
add x10, x10, :lo12:.Lk_inv
movi v17.16b, #0x0f
adr x11, .Lk_dipt
adrp x11, .Lk_dipt
add x11, x11, :lo12:.Lk_dipt
ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
@@ -407,10 +415,12 @@ _vpaes_decrypt_core:
// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11
eor x11, x11, #0x30 // xor \$0x30, %r11
adr x10, .Lk_sr
adrp x10, .Lk_sr
add x10, x10, :lo12:.Lk_sr
and x11, x11, #0x30 // and \$0x30, %r11
add x11, x11, x10
adr x10, .Lk_mc_forward+48
adrp x10, .Lk_mc_forward+48
add x10, x10, :lo12:.Lk_mc_forward+48
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
@@ -518,10 +528,12 @@ _vpaes_decrypt_2x:
// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11
eor x11, x11, #0x30 // xor \$0x30, %r11
adr x10, .Lk_sr
adrp x10, .Lk_sr
add x10, x10, :lo12:.Lk_sr
and x11, x11, #0x30 // and \$0x30, %r11
add x11, x11, x10
adr x10, .Lk_mc_forward+48
adrp x10, .Lk_mc_forward+48
add x10, x10, :lo12:.Lk_mc_forward+48
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
@@ -657,14 +669,18 @@ $code.=<<___;
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
adr x10, .Lk_inv
adrp x10, .Lk_inv
add x10, x10, :lo12:.Lk_inv
movi v16.16b, #0x5b // .Lk_s63
adr x11, .Lk_sb1
adrp x11, .Lk_sb1
add x11, x11, :lo12:.Lk_sb1
movi v17.16b, #0x0f // .Lk_s0F
ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt
adr x10, .Lk_dksd
adrp x10, .Lk_dksd
add x10, x10, :lo12:.Lk_dksd
ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1
adr x11, .Lk_mc_forward
adrp x11, .Lk_mc_forward
add x11, x11, :lo12:.Lk_mc_forward
ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
ld1 {v8.2d}, [x10] // .Lk_rcon
@@ -688,7 +704,8 @@ _vpaes_schedule_core:
bl _vpaes_schedule_transform
mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10
adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10
add x10, x10, :lo12:.Lk_sr
add x8, x8, x10
cbnz $dir, .Lschedule_am_decrypting
@@ -814,12 +831,14 @@ _vpaes_schedule_core:
.align 4
.Lschedule_mangle_last:
// schedule last round key from xmm0
adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
add x11, x11, :lo12:.Lk_deskew
cbnz $dir, .Lschedule_mangle_last_dec
// encrypting
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
add x11, x11, :lo12:.Lk_opt
add $out, $out, #32 // add \$32, %rdx
tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute

@@ -1898,6 +1898,7 @@ __bn_mul4x_mont:
___
}
$code.=<<___;
.rodata
.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___

@@ -140,7 +140,7 @@ $code.=<<___;
.extern ChaCha20_ctr32_sve
#endif
.text
.rodata
.align 5
.Lsigma:
@@ -151,6 +151,8 @@ $code.=<<___;
.long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm"
.text
.globl ChaCha20_ctr32_dflt
.type ChaCha20_ctr32_dflt,%function
.align 5
@@ -170,7 +172,8 @@ ChaCha20_ctr32_dflt:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr @x[0],.Lsigma
adrp @x[0],.Lsigma
add @x[0],@x[0],:lo12:.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
@@ -473,7 +476,8 @@ ChaCha20_neon:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr @x[0],.Lsigma
adrp @x[0],.Lsigma
add @x[0],@x[0],:lo12:.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
@@ -884,7 +888,8 @@ ChaCha20_512_neon:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr @x[0],.Lsigma
adrp @x[0],.Lsigma
add @x[0],@x[0],:lo12:.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]

@@ -55,7 +55,7 @@ my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont
$code.=<<___;
#include "arm_arch.h"
.text
.rodata
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
@@ -117,6 +117,8 @@ $code.=<<___;
.quad 0xccd1c8aaee00bc4f
.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.text
// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_to_mont
.type ecp_nistz256_to_mont,%function
@@ -127,12 +129,16 @@ ecp_nistz256_to_mont:
add x29,sp,#0
stp x19,x20,[sp,#16]
ldr $bi,.LRR // bp[0]
adrp $bi,.LRR
ldr $bi,[$bi,:lo12:.LRR] // bp[0]
ldp $a0,$a1,[$ap]
ldp $a2,$a3,[$ap,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adr $bp,.LRR // &bp[0]
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
adrp $bp,.LRR // &bp[0]
add $bp,$bp,:lo12:.LRR
bl __ecp_nistz256_mul_mont
@@ -155,9 +161,12 @@ ecp_nistz256_from_mont:
mov $bi,#1 // bp[0]
ldp $a0,$a1,[$ap]
ldp $a2,$a3,[$ap,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adr $bp,.Lone // &bp[0]
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
adrp $bp,.Lone // &bp[0]
add $bp,$bp,:lo12:.Lone
bl __ecp_nistz256_mul_mont
@@ -181,8 +190,10 @@ ecp_nistz256_mul_mont:
ldr $bi,[$bp] // bp[0]
ldp $a0,$a1,[$ap]
ldp $a2,$a3,[$ap,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
bl __ecp_nistz256_mul_mont
@@ -204,8 +215,10 @@ ecp_nistz256_sqr_mont:
ldp $a0,$a1,[$ap]
ldp $a2,$a3,[$ap,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
bl __ecp_nistz256_sqr_mont
@@ -229,8 +242,10 @@ ecp_nistz256_add:
ldp $t0,$t1,[$bp]
ldp $acc2,$acc3,[$ap,#16]
ldp $t2,$t3,[$bp,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
bl __ecp_nistz256_add
@@ -250,8 +265,10 @@ ecp_nistz256_div_by_2:
ldp $acc0,$acc1,[$ap]
ldp $acc2,$acc3,[$ap,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
bl __ecp_nistz256_div_by_2
@@ -271,8 +288,10 @@ ecp_nistz256_mul_by_2:
ldp $acc0,$acc1,[$ap]
ldp $acc2,$acc3,[$ap,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
mov $t0,$acc0
mov $t1,$acc1
mov $t2,$acc2
@@ -296,8 +315,10 @@ ecp_nistz256_mul_by_3:
ldp $acc0,$acc1,[$ap]
ldp $acc2,$acc3,[$ap,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
mov $t0,$acc0
mov $t1,$acc1
mov $t2,$acc2
@@ -333,8 +354,10 @@ ecp_nistz256_sub:
ldp $acc0,$acc1,[$ap]
ldp $acc2,$acc3,[$ap,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
bl __ecp_nistz256_sub_from
@@ -357,8 +380,10 @@ ecp_nistz256_neg:
mov $acc1,xzr
mov $acc2,xzr
mov $acc3,xzr
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
bl __ecp_nistz256_sub_from
@@ -736,9 +761,11 @@ ecp_nistz256_point_double:
mov $rp_real,$rp
ldp $acc2,$acc3,[$ap,#48]
mov $ap_real,$ap
ldr $poly1,.Lpoly+8
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
mov $t0,$acc0
ldr $poly3,.Lpoly+24
ldr $poly3,[$poly3,#24]
mov $t1,$acc1
ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont
mov $t2,$acc2
@@ -897,8 +924,10 @@ ecp_nistz256_point_add:
mov $rp_real,$rp
mov $ap_real,$ap
mov $bp_real,$bp
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
orr $t0,$a0,$a1
orr $t2,$a2,$a3
orr $in2infty,$t0,$t2
@@ -1151,8 +1180,10 @@ ecp_nistz256_point_add_affine:
mov $rp_real,$rp
mov $ap_real,$ap
mov $bp_real,$bp
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
adrp $poly3,.Lpoly
add $poly3,$poly3,:lo12:.Lpoly
ldr $poly1,[$poly3,#8]
ldr $poly3,[$poly3,#24]
ldp $a0,$a1,[$ap,#64] // in1_z
ldp $a2,$a3,[$ap,#64+16]
@@ -1303,7 +1334,8 @@ $code.=<<___;
stp $acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___ if ($i == 0);
adr $bp_real,.Lone_mont-64
adrp $bp_real,.Lone_mont-64
add $bp_real,$bp_real,:lo12:.Lone_mont-64
___
}
$code.=<<___;
@@ -1354,7 +1386,8 @@ ecp_nistz256_ord_mul_mont:
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
adr $ordk,.Lord
adrp $ordk,.Lord
add $ordk,$ordk,:lo12:.Lord
ldr $bi,[$bp] // bp[0]
ldp $a0,$a1,[$ap]
ldp $a2,$a3,[$ap,#16]
@@ -1497,7 +1530,8 @@ ecp_nistz256_ord_sqr_mont:
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
adr $ordk,.Lord
adrp $ordk,.Lord
add $ordk,$ordk,:lo12:.Lord
ldp $a0,$a1,[$ap]
ldp $a2,$a3,[$ap,#16]

@@ -42,7 +42,8 @@ $code.=<<___;
adc $t4,xzr,xzr
// Load polynomial
adr x2,$mod
adrp x2,$mod
add x2,x2,:lo12:$mod
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
@@ -88,7 +89,8 @@ $code.=<<___;
sbc $t4,xzr,xzr
// Load polynomial
adr x2,$mod
adrp x2,$mod
add x2,x2,:lo12:$mod
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
@@ -134,7 +136,8 @@ $code.=<<___;
lsr $s3,$s3,#1
// Load mod
adr x2,$mod
adrp x2,$mod
add x2,x2,:lo12:$mod
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
@@ -161,7 +164,7 @@ ___
$code.=<<___;
#include "arm_arch.h"
.arch armv8-a
.text
.rodata
.align 5
// The polynomial p
@@ -177,6 +180,8 @@ $code.=<<___;
.Lord_div_2:
.quad 0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
.text
// void bn_rshift1(BN_ULONG *a);
.globl bn_rshift1
.type bn_rshift1,%function
@@ -272,7 +277,8 @@ ecp_sm2p256_mul_by_3:
mov $t3,$s3
// Sub polynomial
adr x2,.Lpoly
adrp x2,.Lpoly
add x2,x2,:lo12:.Lpoly
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
subs $s0,$s0,$s4
@@ -302,7 +308,8 @@ ecp_sm2p256_mul_by_3:
mov $t3,$s3
// Sub polynomial
adr x2,.Lpoly
adrp x2,.Lpoly
add x2,x2,:lo12:.Lpoly
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
subs $s0,$s0,$s4
@@ -508,7 +515,8 @@ $code.=<<___;
mov $s6,$s2
mov $s7,$s3
adr $t0,.Lpoly
adrp $t0,.Lpoly
add $t0,$t0,:lo12:.Lpoly
ldp $t1,$t2,[$t0]
ldp $t3,$t4,[$t0,#16]

@@ -6035,6 +6035,7 @@ ___
}
$code.=<<___;
.rodata
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif

@@ -810,6 +810,7 @@ ___
}
$code.=<<___;
.rodata
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif

@@ -442,7 +442,8 @@ poly1305_blocks_neon:
ldr x30,[sp,#8]
add $in2,$inp,#32
adr $zeros,.Lzeros
adrp $zeros,.Lzeros
add $zeros,$zeros,:lo12:.Lzeros
subs $len,$len,#64
csel $in2,$zeros,$in2,lo
@@ -454,7 +455,8 @@ poly1305_blocks_neon:
.align 4
.Leven_neon:
add $in2,$inp,#32
adr $zeros,.Lzeros
adrp $zeros,.Lzeros
add $zeros,$zeros,:lo12:.Lzeros
subs $len,$len,#64
csel $in2,$zeros,$in2,lo
@@ -937,6 +939,8 @@ poly1305_emit_neon:
ret
.size poly1305_emit_neon,.-poly1305_emit_neon
.rodata
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0

@@ -82,7 +82,7 @@ my @rhotates = ([ 0, 1, 62, 28, 27 ],
$code.=<<___;
#include "arm_arch.h"
.text
.rodata
.align 8 // strategic alignment and padding that allows to use
// address value as loop termination condition...
@@ -123,11 +123,14 @@ my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
my @C = map("x$_", (26,27,28,30));
$code.=<<___;
.text
.type KeccakF1600_int,%function
.align 5
KeccakF1600_int:
AARCH64_SIGN_LINK_REGISTER
adr $C[2],iotas
adrp $C[2],iotas
add $C[2],$C[2],:lo12:iotas
stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
b .Loop
.align 4
@@ -556,7 +559,8 @@ $code.=<<___;
.align 5
KeccakF1600_ce:
mov x9,#24
adr x10,iotas
adrp x10,iotas
add x10,x10,:lo12:iotas
b .Loop_ce
.align 4
.Loop_ce:

@@ -259,7 +259,8 @@ sha1_block_armv8:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
adr x4,.Lconst
adrp x4,.Lconst
add x4,x4,:lo12:.Lconst
eor $E,$E,$E
ld1.32 {$ABCD},[$ctx],#16
ld1.32 {$E}[0],[$ctx]
@@ -319,6 +320,9 @@ $code.=<<___;
ldr x29,[sp],#16
ret
.size sha1_block_armv8,.-sha1_block_armv8
.rodata
.align 6
.Lconst:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19

@@ -235,7 +235,8 @@ $code.=<<___;
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
adr $Ktbl,.LK$BITS
adrp $Ktbl,.LK$BITS
add $Ktbl,$Ktbl,:lo12:.LK$BITS
stp $ctx,$num,[x29,#96]
.Loop:
@@ -285,6 +286,8 @@ $code.=<<___;
ret
.size $func,.-$func
.rodata
.align 6
.type .LK$BITS,%object
.LK$BITS:
@@ -355,6 +358,8 @@ $code.=<<___;
.size .LK$BITS,.-.LK$BITS
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.text
___
if ($SZ==4) {
@@ -376,7 +381,8 @@ sha256_block_armv8:
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adr $Ktbl,.LK256
adrp $Ktbl,.LK256
add $Ktbl,$Ktbl,:lo12:.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
@@ -641,7 +647,8 @@ sha256_block_neon:
mov x29, sp
sub sp,sp,#16*4
adr $Ktbl,.LK256
adrp $Ktbl,.LK256
add $Ktbl,$Ktbl,:lo12:.LK256
add $num,$inp,$num,lsl#6 // len to point at the end of inp
ld1.8 {@X[0]},[$inp], #16
@@ -755,7 +762,8 @@ sha512_block_armv8:
ld1 {@MSG[4]-@MSG[7]},[$inp],#64
ld1.64 {@H[0]-@H[3]},[$ctx] // load context
adr $Ktbl,.LK512
adrp $Ktbl,.LK512
add $Ktbl,$Ktbl,:lo12:.LK512
rev64 @MSG[0],@MSG[0]
rev64 @MSG[1],@MSG[1]