Optimize RSA on armv8
Add a NEON path for RSA on armv8. This optimisation targets Cortex-A72 and Neoverse N1, two cores that are important for infrastructure workloads; other platforms are not impacted.

A72                    old        new   improved
rsa   512 sign      9828.6     9738.7        -1%
rsa   512 verify  121497.2   122367.7         1%
rsa  1024 sign      1818       1816.9         0%
rsa  1024 verify   37175.6    37161.3         0%
rsa  2048 sign       267.3      267.4         0%
rsa  2048 verify   10127.6    10119.6         0%
rsa  3072 sign        86.8       87           0%
rsa  3072 verify    4604.2     4956.2         8%
rsa  4096 sign        38.3       38.5         1%
rsa  4096 verify    2619.8     2972.1        13%
rsa  7680 sign         5          7          40%
rsa  7680 verify     756        929.4        23%
rsa 15360 sign         0.8        1          25%
rsa 15360 verify     190.4      246          29%

N1                     old        new   improved
rsa   512 sign     12599.2    12596.7         0%
rsa   512 verify  148636.1   148656.2         0%
rsa  1024 sign      2150.6     2148.9         0%
rsa  1024 verify   42353.5    42265.2         0%
rsa  2048 sign       305.5      305.3         0%
rsa  2048 verify   11209.7    11205.2         0%
rsa  3072 sign        97.8       98.2         0%
rsa  3072 verify    5061.3     5990.7        18%
rsa  4096 sign        42.8       43           0%
rsa  4096 verify    2867.6     3509.8        22%
rsa  7680 sign         5.5        8.4        53%
rsa  7680 verify     823.5     1058.3        29%
rsa 15360 sign         0.9        1.1        22%
rsa 15360 verify     207        273.9        32%

CustomizedGitHooks: yes
Change-Id: I01c732cc429d793c4eb5ffd27ccd30ff9cebf8af
Jira: SECLIB-540
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/14761)
parent f0f4a46c4f
commit 1064616012
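The crypto/armcap.c hunk below gates the new path at runtime: the NEON Montgomery routine is only selected on Cortex-A72 and Neoverse N1 when NEON is reported. The following is a minimal, standalone C sketch of that gate, not OpenSSL source; the helper names (is_cpu_model, rsa_neonized) and the flag bit values are illustrative stand-ins for the real definitions in crypto/arm_arch.h, while the MIDR_EL1 field layout and part numbers follow Arm's published documentation.

#include <stdint.h>
#include <stdio.h>

#define ARM_CPU_IMP_ARM         0x41   /* implementer code for Arm Ltd. */
#define ARM_CPU_PART_CORTEX_A72 0xD08  /* Cortex-A72 part number */
#define ARM_CPU_PART_N1         0xD0C  /* Neoverse N1 part number */
#define ARMV7_NEON  (1 << 0)           /* placeholder bit; real value in arm_arch.h */
#define ARMV8_CPUID (1 << 7)           /* placeholder bit; real value in arm_arch.h */

static int is_cpu_model(uint32_t midr, uint32_t imp, uint32_t part)
{
    /* MIDR_EL1 layout: implementer in bits [31:24], part number in bits [15:4] */
    return ((midr >> 24) & 0xff) == imp && ((midr >> 4) & 0xfff) == part;
}

static int rsa_neonized(uint32_t armcap, uint32_t midr)
{
    if (!(armcap & ARMV8_CPUID))       /* no MIDR available: stay on the scalar path */
        return 0;
    return (is_cpu_model(midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) ||
            is_cpu_model(midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) &&
           (armcap & ARMV7_NEON);
}

int main(void)
{
    /* Example MIDR with implementer 0x41 and part 0xD0C (Neoverse N1) */
    uint32_t midr = (0x41u << 24) | (0xD0Cu << 4);
    printf("neonized: %d\n", rsa_neonized(ARMV8_CPUID | ARMV7_NEON, midr));
    return 0;
}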
crypto/armcap.c:

@@ -19,6 +19,7 @@
unsigned int OPENSSL_armcap_P = 0;
unsigned int OPENSSL_arm_midr = 0;
unsigned int OPENSSL_armv8_rsa_neonized = 0;

#if __ARM_MAX_ARCH__<7
void OPENSSL_cpuid_setup(void)
@@ -237,6 +238,12 @@ void OPENSSL_cpuid_setup(void)
# ifdef __aarch64__
    if (OPENSSL_armcap_P & ARMV8_CPUID)
        OPENSSL_arm_midr = _armv8_cpuid_probe();

    if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) ||
         MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) &&
        (OPENSSL_armcap_P & ARMV7_NEON)) {
        OPENSSL_armv8_rsa_neonized = 1;
    }
# endif
}
#endif
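For orientation before the armv8-mont.pl changes: bn_mul_mont(rp, ap, bp, np, n0, num) is OpenSSL's Montgomery multiplication primitive, and the NEON routine added below computes the same quantity for larger operands. A sketch of the standard relation, assuming num BN_ULONG (64-bit) limbs:

% Montgomery product computed by bn_mul_mont (standard relation, stated for context)
r \equiv a \cdot b \cdot R^{-1} \pmod{n}, \qquad R = 2^{64 \cdot \mathit{num}}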
crypto/bn/asm/armv8-mont.pl:

@@ -67,16 +67,34 @@ $n0="x4"; # const BN_ULONG *n0,
$num="x5"; # int num);

$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armv8_rsa_neonized
.hidden OPENSSL_armv8_rsa_neonized
#endif
.text

.globl bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
.Lbn_mul_mont:
        tst $num,#3
        b.ne .Lmul_mont
        cmp $num,#32
        b.le .Lscalar_impl
#ifndef __KERNEL__
        adrp x17,OPENSSL_armv8_rsa_neonized
        ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
        cbnz w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
        tst $num,#7
        b.eq __bn_sqr8x_mont
        tst $num,#3
        b.eq __bn_mul4x_mont

.Lmul_mont:
        stp x29,x30,[sp,#-64]!
        add x29,sp,#0
@@ -274,6 +292,369 @@ bn_mul_mont:
.size bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
my ($Z,$Temp)=("v4.16b","v5");
my @ACC=map("v$_",(6..13));
my ($Bi,$Ni,$M0)=map("v$_",(28..30));
my $sBi="s28";
my $sM0="s30";
my $zero="v14";
my $temp="v15";
my $ACCTemp="v16";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
$code.=<<___;
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
        stp x29,x30,[sp,#-80]!
        mov x16,sp
        stp d8,d9,[sp,#16]
        stp d10,d11,[sp,#32]
        stp d12,d13,[sp,#48]
        stp d14,d15,[sp,#64]
        lsl $num,$num,#1
        eor $zero.16b,$zero.16b,$zero.16b

.align 4
.LNEON_8n:
        eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
        sub $toutptr,sp,#128
        eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
        sub $toutptr,$toutptr,$num,lsl#4
        eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
        and $toutptr,$toutptr,#-64
        eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
        mov sp,$toutptr // alloca
        eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
        add $toutptr,$toutptr,#256
        eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
        sub $inner,$num,#8
        eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
        eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b

.LNEON_8n_init:
        st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
        subs $inner,$inner,#8
        st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
        st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
        st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
        bne .LNEON_8n_init

        add $tinptr,sp,#256
        ld1 {$A0.4s,$A1.4s},[$aptr],#32
        add $bnptr,sp,#8
        ldr $sM0,[$n0],#4
        mov $outer,$num
        b .LNEON_8n_outer

.align 4
.LNEON_8n_outer:
        ldr $sBi,[$bptr],#4 // *b++
        uxtl $Bi.4s,$Bi.4h
        add $toutptr,sp,#128
        ld1 {$N0.4s,$N1.4s},[$nptr],#32

        umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
        umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
        umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
        shl $Ni.2d,@ACC[0].2d,#16
        ext $Ni.16b,$Ni.16b,$Ni.16b,#8
        umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
        add $Ni.2d,$Ni.2d,@ACC[0].2d
        umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
        mul $Ni.2s,$Ni.2s,$M0.2s
        umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
        st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
        umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
        uxtl $Ni.4s,$Ni.4h
        umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=0; $i<7;) {
$code.=<<___;
        ldr $sBi,[$bptr],#4 // *b++
        umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
        umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
        uxtl $Bi.4s,$Bi.4h
        umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
        ushr $temp.2d,@ACC[0].2d,#16
        umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
        ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        add @ACC[0].2d,@ACC[0].2d,$temp.2d
        umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
        ushr @ACC[0].2d,@ACC[0].2d,#16
        umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
        umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
        add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
        ins @ACC[1].d[0],$ACCTemp.d[0]
        st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
___
push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
        umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
        ld1 {@ACC[7].2d},[$tinptr],#16
        umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
        umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
        shl $Ni.2d,@ACC[0].2d,#16
        ext $Ni.16b,$Ni.16b,$Ni.16b,#8
        umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
        add $Ni.2d,$Ni.2d,@ACC[0].2d
        umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
        mul $Ni.2s,$Ni.2s,$M0.2s
        umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
        st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
        umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
        uxtl $Ni.4s,$Ni.4h
        umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
        ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
        umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
        ld1 {$A0.4s,$A1.4s},[$aptr],#32
        umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
        umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
        mov $Temp.16b,@ACC[0].16b
        ushr $Temp.2d,$Temp.2d,#16
        ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
        add @ACC[0].2d,@ACC[0].2d,$Temp.2d
        umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
        ushr @ACC[0].2d,@ACC[0].2d,#16
        eor $temp.16b,$temp.16b,$temp.16b
        ins @ACC[0].d[1],$temp.d[0]
        umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
        umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
        add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
        st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
        add $bnptr,sp,#8 // rewind
___
push(@ACC,shift(@ACC));
$code.=<<___;
        sub $inner,$num,#8
        b .LNEON_8n_inner

.align 4
.LNEON_8n_inner:
        subs $inner,$inner,#8
        umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
        ld1 {@ACC[7].2d},[$tinptr]
        umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
        ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
        umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
        ld1 {$N0.4s,$N1.4s},[$nptr],#32
        umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
        b.eq .LInner_jump
        add $tinptr,$tinptr,#16 // don't advance in last iteration
.LInner_jump:
        umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
        umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
        umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
        umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
        ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
        umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
        umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
        umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
        umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
        umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
        umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
        umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
        st1 {@ACC[0].2d},[$toutptr],#16
___
push(@ACC,shift(@ACC));
$code.=<<___;
        umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
        ld1 {@ACC[7].2d},[$tinptr]
        umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
        ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
        umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
        b.eq .LInner_jump$i
        add $tinptr,$tinptr,#16 // don't advance in last iteration
.LInner_jump$i:
        umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
        umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
        umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
        umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
        umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
        b.ne .LInner_after_rewind$i
        sub $aptr,$aptr,$num,lsl#2 // rewind
.LInner_after_rewind$i:
        umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
        ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
        umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
        ld1 {$A0.4s,$A1.4s},[$aptr],#32
        umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
        add $bnptr,sp,#8 // rewind
        umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
        umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
        umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
        umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
        st1 {@ACC[0].2d},[$toutptr],#16
        umlal @ACC[7].2d,$Ni.2s,$N1.s[3]

        bne .LNEON_8n_inner
___
push(@ACC,shift(@ACC));
$code.=<<___;
        add $tinptr,sp,#128
        st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
        eor $N0.16b,$N0.16b,$N0.16b // $N0
        st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
        eor $N1.16b,$N1.16b,$N1.16b // $N1
        st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
        st1 {@ACC[6].2d},[$toutptr]

        subs $outer,$outer,#8
        ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
        ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
        ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
        ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32

        b.eq .LInner_8n_jump_2steps
        sub $nptr,$nptr,$num,lsl#2 // rewind
        b .LNEON_8n_outer

.LInner_8n_jump_2steps:
        add $toutptr,sp,#128
        st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
        mov $Temp.16b,@ACC[0].16b
        ushr $temp.2d,@ACC[0].2d,#16
        ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        st1 {$N0.2d,$N1.2d}, [sp],#32
        add @ACC[0].2d,@ACC[0].2d,$temp.2d
        st1 {$N0.2d,$N1.2d}, [sp],#32
        ushr $temp.2d,@ACC[0].2d,#16
        st1 {$N0.2d,$N1.2d}, [sp],#32
        zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
        ins $temp.d[1],$zero.d[0]

        mov $inner,$num
        b .LNEON_tail_entry

.align 4
.LNEON_tail:
        add @ACC[0].2d,@ACC[0].2d,$temp.2d
        mov $Temp.16b,@ACC[0].16b
        ushr $temp.2d,@ACC[0].2d,#16
        ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
        ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
        add @ACC[0].2d,@ACC[0].2d,$temp.2d
        ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
        ushr $temp.2d,@ACC[0].2d,#16
        ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
        zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
        ins $temp.d[1],$zero.d[0]

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
        add @ACC[1].2d,@ACC[1].2d,$temp.2d
        st1 {@ACC[0].s}[0], [$toutptr],#4
        ushr $temp.2d,@ACC[1].2d,#16
        mov $Temp.16b,@ACC[1].16b
        ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
        add @ACC[1].2d,@ACC[1].2d,$temp.2d
        ushr $temp.2d,@ACC[1].2d,#16
        zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
        ins $temp.d[1],$zero.d[0]
___
push(@ACC,shift(@ACC));
}
push(@ACC,shift(@ACC));
$code.=<<___;
        ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
        subs $inner,$inner,#8
        st1 {@ACC[7].s}[0], [$toutptr],#4
        bne .LNEON_tail

        st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
        sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
        subs $aptr,sp,#0 // clear carry flag
        add $bptr,sp,$num,lsl#2

.LNEON_sub:
        ldp w4,w5,[$aptr],#8
        ldp w6,w7,[$aptr],#8
        ldp w8,w9,[$nptr],#8
        ldp w10,w11,[$nptr],#8
        sbcs w8,w4,w8
        sbcs w9,w5,w9
        sbcs w10,w6,w10
        sbcs w11,w7,w11
        sub x17,$bptr,$aptr
        stp w8,w9,[$rptr],#8
        stp w10,w11,[$rptr],#8
        cbnz x17,.LNEON_sub

        ldr w10, [$aptr] // load top-most bit
        mov x11,sp
        eor v0.16b,v0.16b,v0.16b
        sub x11,$bptr,x11 // this is num*4
        eor v1.16b,v1.16b,v1.16b
        mov $aptr,sp
        sub $rptr,$rptr,x11 // rewind $rptr
        mov $nptr,$bptr // second 3/4th of frame
        sbcs w10,w10,wzr // result is carry flag

.LNEON_copy_n_zap:
        ldp w4,w5,[$aptr],#8
        ldp w6,w7,[$aptr],#8
        ldp w8,w9,[$rptr],#8
        ldp w10,w11,[$rptr]
        sub $rptr,$rptr,#8
        b.cs .LCopy_1
        mov w8,w4
        mov w9,w5
        mov w10,w6
        mov w11,w7
.LCopy_1:
        st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
        st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
        ldp w4,w5,[$aptr],#8
        ldp w6,w7,[$aptr],#8
        stp w8,w9,[$rptr],#8
        stp w10,w11,[$rptr],#8
        sub $aptr,$aptr,#32
        ldp w8,w9,[$rptr],#8
        ldp w10,w11,[$rptr]
        sub $rptr,$rptr,#8
        b.cs .LCopy_2
        mov w8, w4
        mov w9, w5
        mov w10, w6
        mov w11, w7
.LCopy_2:
        st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
        st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
        sub x17,$bptr,$aptr // preserves carry
        stp w8,w9,[$rptr],#8
        stp w10,w11,[$rptr],#8
        cbnz x17,.LNEON_copy_n_zap

        mov sp,x16
        ldp d14,d15,[sp,#64]
        ldp d12,d13,[sp,#48]
        ldp d10,d11,[sp,#32]
        ldp d8,d9,[sp,#16]
        ldr x29,[sp],#80
        ret // bx lr

.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
___
}
{
########################################################################
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.

crypto/bn/build.info:

@@ -177,3 +177,4 @@ INCLUDE[armv4-mont.o]=..
GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl
INCLUDE[armv4-gf2m.o]=..
GENERATE[armv8-mont.S]=asm/armv8-mont.pl
INCLUDE[armv8-mont.o]=..