mirror of
https://github.com/openssl/openssl.git
synced 2024-11-21 01:15:20 +08:00
bn/asm/armv4-mont.pl: boost NEON performance.
Close the performance gap on Cortex-A9, which resulted in further improvement even on other processors. Reviewed-by: Richard Levitte <levitte@openssl.org>
This commit is contained in:
parent
75f648aa06
commit
8eed3289b2
@ -38,6 +38,15 @@
|
||||
# for execution on all NEON-capable processors, because gain on
|
||||
# others outweighs the marginal loss on Cortex-A9.
|
||||
|
||||
# September 2015
|
||||
#
|
||||
# Align Cortex-A9 performance with November 2013 improvements, i.e.
|
||||
# NEON code is now ~20-105% faster than integer-only one on this
|
||||
# processor. But this optimization further improved performance even
|
||||
# on other processors: NEON code path is ~45-180% faster than original
|
||||
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
|
||||
# Snapdragon S4.
|
||||
|
||||
# Command-line handling: the first argument is the flavour, unless it
# already looks like a file name ("name.ext"), in which case it is taken
# as the output name and the flavour is left undefined.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) {
    $output = $flavour;
    undef $flavour;
}
else {
    # Consume arguments until one looks like a file name or none are left.
    do {
        $output = shift;
    } while ($output && $output!~/^\w[\w\-]*\.\w+$/);
}
|
||||
@ -272,19 +281,16 @@ bn_mul_mont:
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
___
|
||||
{
|
||||
# Map a NEON quad-register name to the name of its low doubleword half:
# "qN" -> "d(2*N)", e.g. "q6" -> "d12".  Returns "" when the argument
# contains no q-register name.
# The former empty prototype "()" was dropped: the sub takes one argument,
# and the prototype made every call without the "&" sigil a compile error.
# Existing &Dlo(...) calls behave exactly as before.
sub Dlo {
    my ($qreg) = @_;
    return $qreg=~m|q([1]?[0-9])| ? "d".($1*2) : "";
}
|
||||
# Map a NEON quad-register name to the name of its high doubleword half:
# "qN" -> "d(2*N+1)", e.g. "q6" -> "d13".  Returns "" when the argument
# contains no q-register name.
# The former empty prototype "()" was dropped: the sub takes one argument,
# and the prototype made every call without the "&" sigil a compile error.
# Existing &Dhi(...) calls behave exactly as before.
sub Dhi {
    my ($qreg) = @_;
    return $qreg=~m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
|
||||
|
||||
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
|
||||
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
|
||||
my ($Z,$Temp)=("q4","q5");
|
||||
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
|
||||
my @ACC=map("q$_",(6..13));
|
||||
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
|
||||
my $zero=&Dlo($Z);
|
||||
my $temp=&Dlo($Temp);
|
||||
my $zero="$Z#lo";
|
||||
my $temp="$Temp#lo";
|
||||
|
||||
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
|
||||
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
|
||||
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
|
||||
|
||||
$code.=<<___;
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
@ -300,59 +306,58 @@ bn_mul8x_mont_neon:
|
||||
ldmia ip,{r4-r5} @ load rest of parameter block
|
||||
mov ip,sp
|
||||
|
||||
sub $toutptr,sp,#16
|
||||
cmp $num,#8
|
||||
bhi .LNEON_8n
|
||||
|
||||
@ special case for $num==8, everything is in register bank...
|
||||
|
||||
vld1.32 {${Bi}[0]}, [$bptr,:32]!
|
||||
sub $toutptr,$toutptr,$num,lsl#4
|
||||
veor $zero,$zero,$zero
|
||||
sub $toutptr,sp,$num,lsl#4
|
||||
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
|
||||
and $toutptr,$toutptr,#-64
|
||||
vld1.32 {${M0}[0]}, [$n0,:32]
|
||||
mov sp,$toutptr @ alloca
|
||||
veor $zero,$zero,$zero
|
||||
subs $inner,$num,#8
|
||||
vzip.16 $Bi,$zero
|
||||
|
||||
vmull.u32 $A0xB,$Bi,${A0}[0]
|
||||
vmull.u32 $A1xB,$Bi,${A0}[1]
|
||||
vmull.u32 $A2xB,$Bi,${A1}[0]
|
||||
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
|
||||
vmull.u32 $A3xB,$Bi,${A1}[1]
|
||||
vmull.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vmull.u32 @ACC[1],$Bi,${A0}[1]
|
||||
vmull.u32 @ACC[2],$Bi,${A1}[0]
|
||||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||||
vmull.u32 @ACC[3],$Bi,${A1}[1]
|
||||
|
||||
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
|
||||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||||
veor $zero,$zero,$zero
|
||||
vmul.u32 $Ni,$temp,$M0
|
||||
vmul.u32 $Ni,$Ni,$M0
|
||||
|
||||
vmull.u32 $A4xB,$Bi,${A2}[0]
|
||||
vmull.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vld1.32 {$N0-$N3}, [$nptr]!
|
||||
vmull.u32 $A5xB,$Bi,${A2}[1]
|
||||
vmull.u32 $A6xB,$Bi,${A3}[0]
|
||||
vmull.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vmull.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vzip.16 $Ni,$zero
|
||||
vmull.u32 $A7xB,$Bi,${A3}[1]
|
||||
vmull.u32 @ACC[7],$Bi,${A3}[1]
|
||||
|
||||
bne .LNEON_1st
|
||||
|
||||
@ special case for num=8, everything is in register bank...
|
||||
|
||||
vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
sub $outer,$num,#1
|
||||
vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||||
vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||||
vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
|
||||
vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||||
vmov $Temp,$A0xB
|
||||
vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||||
vmov $A0xB,$A1xB
|
||||
vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||||
vmov $A1xB,$A2xB
|
||||
vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||||
vmov $A2xB,$A3xB
|
||||
vmov $A3xB,$A4xB
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vmov $Temp,@ACC[0]
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vmov @ACC[0],@ACC[1]
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vmov @ACC[1],@ACC[2]
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
vmov @ACC[2],@ACC[3]
|
||||
vmov @ACC[3],@ACC[4]
|
||||
vshr.u64 $temp,$temp,#16
|
||||
vmov $A4xB,$A5xB
|
||||
vmov $A5xB,$A6xB
|
||||
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
|
||||
vmov $A6xB,$A7xB
|
||||
veor $A7xB,$A7xB
|
||||
vmov @ACC[4],@ACC[5]
|
||||
vmov @ACC[5],@ACC[6]
|
||||
vadd.u64 $temp,$temp,$Temp#hi
|
||||
vmov @ACC[6],@ACC[7]
|
||||
veor @ACC[7],@ACC[7]
|
||||
vshr.u64 $temp,$temp,#16
|
||||
|
||||
b .LNEON_outer8
|
||||
@ -362,279 +367,302 @@ bn_mul8x_mont_neon:
|
||||
vld1.32 {${Bi}[0]}, [$bptr,:32]!
|
||||
veor $zero,$zero,$zero
|
||||
vzip.16 $Bi,$zero
|
||||
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
|
||||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
|
||||
|
||||
vmlal.u32 $A0xB,$Bi,${A0}[0]
|
||||
vmlal.u32 $A1xB,$Bi,${A0}[1]
|
||||
vmlal.u32 $A2xB,$Bi,${A1}[0]
|
||||
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
|
||||
vmlal.u32 $A3xB,$Bi,${A1}[1]
|
||||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||||
|
||||
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
|
||||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||||
veor $zero,$zero,$zero
|
||||
subs $outer,$outer,#1
|
||||
vmul.u32 $Ni,$temp,$M0
|
||||
vmul.u32 $Ni,$Ni,$M0
|
||||
|
||||
vmlal.u32 $A4xB,$Bi,${A2}[0]
|
||||
vmlal.u32 $A5xB,$Bi,${A2}[1]
|
||||
vmlal.u32 $A6xB,$Bi,${A3}[0]
|
||||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vzip.16 $Ni,$zero
|
||||
vmlal.u32 $A7xB,$Bi,${A3}[1]
|
||||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||||
|
||||
vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||||
vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||||
vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||||
vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
|
||||
vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||||
vmov $Temp,$A0xB
|
||||
vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||||
vmov $A0xB,$A1xB
|
||||
vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||||
vmov $A1xB,$A2xB
|
||||
vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||||
vmov $A2xB,$A3xB
|
||||
vmov $A3xB,$A4xB
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vmov $Temp,@ACC[0]
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vmov @ACC[0],@ACC[1]
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vmov @ACC[1],@ACC[2]
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
vmov @ACC[2],@ACC[3]
|
||||
vmov @ACC[3],@ACC[4]
|
||||
vshr.u64 $temp,$temp,#16
|
||||
vmov $A4xB,$A5xB
|
||||
vmov $A5xB,$A6xB
|
||||
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
|
||||
vmov $A6xB,$A7xB
|
||||
veor $A7xB,$A7xB
|
||||
vmov @ACC[4],@ACC[5]
|
||||
vmov @ACC[5],@ACC[6]
|
||||
vadd.u64 $temp,$temp,$Temp#hi
|
||||
vmov @ACC[6],@ACC[7]
|
||||
veor @ACC[7],@ACC[7]
|
||||
vshr.u64 $temp,$temp,#16
|
||||
|
||||
bne .LNEON_outer8
|
||||
|
||||
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
|
||||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
|
||||
mov $toutptr,sp
|
||||
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
|
||||
vshr.u64 $temp,@ACC[0]#lo,#16
|
||||
mov $inner,$num
|
||||
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
|
||||
add $tinptr,sp,#16
|
||||
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
|
||||
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
|
||||
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
|
||||
add $tinptr,sp,#96
|
||||
vshr.u64 $temp,@ACC[0]#hi,#16
|
||||
vzip.16 @ACC[0]#lo,@ACC[0]#hi
|
||||
|
||||
b .LNEON_tail2
|
||||
b .LNEON_tail_entry
|
||||
|
||||
.align 4
|
||||
.LNEON_1st:
|
||||
vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||||
vld1.32 {$A0-$A3}, [$aptr]!
|
||||
vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||||
.LNEON_8n:
|
||||
veor @ACC[0],@ACC[0],@ACC[0]
|
||||
sub $toutptr,sp,#128
|
||||
veor @ACC[1],@ACC[1],@ACC[1]
|
||||
sub $toutptr,$toutptr,$num,lsl#4
|
||||
veor @ACC[2],@ACC[2],@ACC[2]
|
||||
and $toutptr,$toutptr,#-64
|
||||
veor @ACC[3],@ACC[3],@ACC[3]
|
||||
mov sp,$toutptr @ alloca
|
||||
veor @ACC[4],@ACC[4],@ACC[4]
|
||||
add $toutptr,$toutptr,#256
|
||||
veor @ACC[5],@ACC[5],@ACC[5]
|
||||
sub $inner,$num,#8
|
||||
veor @ACC[6],@ACC[6],@ACC[6]
|
||||
veor @ACC[7],@ACC[7],@ACC[7]
|
||||
|
||||
.LNEON_8n_init:
|
||||
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
|
||||
subs $inner,$inner,#8
|
||||
vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||||
vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||||
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
|
||||
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
|
||||
vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
|
||||
bne .LNEON_8n_init
|
||||
|
||||
vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||||
vld1.32 {$N0-$N1}, [$nptr]!
|
||||
vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||||
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
|
||||
vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||||
vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||||
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
|
||||
|
||||
vmull.u32 $A0xB,$Bi,${A0}[0]
|
||||
vld1.32 {$N2-$N3}, [$nptr]!
|
||||
vmull.u32 $A1xB,$Bi,${A0}[1]
|
||||
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
|
||||
vmull.u32 $A2xB,$Bi,${A1}[0]
|
||||
vmull.u32 $A3xB,$Bi,${A1}[1]
|
||||
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
|
||||
|
||||
vmull.u32 $A4xB,$Bi,${A2}[0]
|
||||
vmull.u32 $A5xB,$Bi,${A2}[1]
|
||||
vmull.u32 $A6xB,$Bi,${A3}[0]
|
||||
vmull.u32 $A7xB,$Bi,${A3}[1]
|
||||
|
||||
bne .LNEON_1st
|
||||
|
||||
vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||||
add $tinptr,sp,#16
|
||||
vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||||
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
|
||||
vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||||
vld1.64 {$Temp}, [sp,:128]
|
||||
vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||||
sub $outer,$num,#1
|
||||
|
||||
vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||||
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
|
||||
vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||||
vshr.u64 $temp,$temp,#16
|
||||
vld1.64 {$A0xB}, [$tinptr, :128]!
|
||||
vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||||
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
|
||||
vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||||
|
||||
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
|
||||
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
|
||||
veor $Z,$Z,$Z
|
||||
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
|
||||
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
|
||||
vst1.64 {$Z}, [$toutptr,:128]
|
||||
vshr.u64 $temp,$temp,#16
|
||||
|
||||
b .LNEON_outer
|
||||
add $tinptr,sp,#256
|
||||
vld1.32 {$A0-$A3},[$aptr]!
|
||||
add $bnptr,sp,#8
|
||||
vld1.32 {${M0}[0]},[$n0,:32]
|
||||
mov $outer,$num
|
||||
b .LNEON_8n_outer
|
||||
|
||||
.align 4
|
||||
.LNEON_outer:
|
||||
vld1.32 {${Bi}[0]}, [$bptr,:32]!
|
||||
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
|
||||
vld1.32 {$A0-$A3}, [$aptr]!
|
||||
.LNEON_8n_outer:
|
||||
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
|
||||
veor $zero,$zero,$zero
|
||||
mov $toutptr,sp
|
||||
vzip.16 $Bi,$zero
|
||||
add $toutptr,sp,#128
|
||||
vld1.32 {$N0-$N3},[$nptr]!
|
||||
|
||||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||||
veor $zero,$zero,$zero
|
||||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vmul.u32 $Ni,$Ni,$M0
|
||||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
|
||||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vzip.16 $Ni,$zero
|
||||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||||
___
|
||||
for ($i=0; $i<7;) {
|
||||
$code.=<<___;
|
||||
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
veor $temp,$temp,$temp
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vzip.16 $Bi,$temp
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
|
||||
vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
|
||||
___
|
||||
push(@ACC,shift(@ACC)); $i++;
|
||||
$code.=<<___;
|
||||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vld1.64 {@ACC[7]},[$tinptr,:128]!
|
||||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||||
veor $zero,$zero,$zero
|
||||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vmul.u32 $Ni,$Ni,$M0
|
||||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
|
||||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vzip.16 $Ni,$zero
|
||||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
vld1.32 {$A0-$A3},[$aptr]!
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
|
||||
vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
|
||||
add $bnptr,sp,#8 @ rewind
|
||||
___
|
||||
push(@ACC,shift(@ACC));
|
||||
$code.=<<___;
|
||||
sub $inner,$num,#8
|
||||
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
|
||||
b .LNEON_8n_inner
|
||||
|
||||
vmlal.u32 $A0xB,$Bi,${A0}[0]
|
||||
vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
|
||||
vmlal.u32 $A1xB,$Bi,${A0}[1]
|
||||
vmlal.u32 $A2xB,$Bi,${A1}[0]
|
||||
vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
|
||||
vmlal.u32 $A3xB,$Bi,${A1}[1]
|
||||
|
||||
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
|
||||
veor $zero,$zero,$zero
|
||||
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
|
||||
vld1.64 {$A7xB},[$tinptr,:128]!
|
||||
vmul.u32 $Ni,$temp,$M0
|
||||
|
||||
vmlal.u32 $A4xB,$Bi,${A2}[0]
|
||||
vld1.32 {$N0-$N3}, [$nptr]!
|
||||
vmlal.u32 $A5xB,$Bi,${A2}[1]
|
||||
vmlal.u32 $A6xB,$Bi,${A3}[0]
|
||||
vzip.16 $Ni,$zero
|
||||
vmlal.u32 $A7xB,$Bi,${A3}[1]
|
||||
|
||||
.LNEON_inner:
|
||||
vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||||
vld1.32 {$A0-$A3}, [$aptr]!
|
||||
vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||||
subs $inner,$inner,#8
|
||||
vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||||
vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||||
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
|
||||
|
||||
vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||||
vld1.64 {$A0xB}, [$tinptr, :128]!
|
||||
vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||||
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
|
||||
vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||||
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
|
||||
vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||||
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
|
||||
|
||||
vmlal.u32 $A0xB,$Bi,${A0}[0]
|
||||
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
|
||||
vmlal.u32 $A1xB,$Bi,${A0}[1]
|
||||
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
|
||||
vmlal.u32 $A2xB,$Bi,${A1}[0]
|
||||
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
|
||||
vmlal.u32 $A3xB,$Bi,${A1}[1]
|
||||
vld1.32 {$N0-$N3}, [$nptr]!
|
||||
|
||||
vmlal.u32 $A4xB,$Bi,${A2}[0]
|
||||
vld1.64 {$A7xB}, [$tinptr, :128]!
|
||||
vmlal.u32 $A5xB,$Bi,${A2}[1]
|
||||
vmlal.u32 $A6xB,$Bi,${A3}[0]
|
||||
vmlal.u32 $A7xB,$Bi,${A3}[1]
|
||||
|
||||
bne .LNEON_inner
|
||||
|
||||
vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||||
add $tinptr,sp,#16
|
||||
vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||||
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
|
||||
vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||||
vld1.64 {$Temp}, [sp,:128]
|
||||
vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||||
subs $outer,$outer,#1
|
||||
|
||||
vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||||
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
|
||||
vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||||
vld1.64 {$A0xB}, [$tinptr, :128]!
|
||||
vshr.u64 $temp,$temp,#16
|
||||
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
|
||||
vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||||
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
|
||||
vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||||
|
||||
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
|
||||
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
|
||||
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
|
||||
vshr.u64 $temp,$temp,#16
|
||||
|
||||
bne .LNEON_outer
|
||||
|
||||
mov $toutptr,sp
|
||||
mov $inner,$num
|
||||
|
||||
.LNEON_tail:
|
||||
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
|
||||
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
|
||||
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
|
||||
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
|
||||
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
|
||||
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
|
||||
vld1.64 {$A7xB}, [$tinptr, :128]!
|
||||
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
|
||||
|
||||
.LNEON_tail2:
|
||||
vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
|
||||
vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
|
||||
vshr.u64 $temp,`&Dlo("$A1xB")`,#16
|
||||
vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
|
||||
vshr.u64 $temp,`&Dhi("$A1xB")`,#16
|
||||
vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
|
||||
|
||||
vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
|
||||
vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
|
||||
vshr.u64 $temp,`&Dlo("$A2xB")`,#16
|
||||
vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
|
||||
vshr.u64 $temp,`&Dhi("$A2xB")`,#16
|
||||
vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
|
||||
|
||||
vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
|
||||
vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
|
||||
vshr.u64 $temp,`&Dlo("$A3xB")`,#16
|
||||
vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
|
||||
vshr.u64 $temp,`&Dhi("$A3xB")`,#16
|
||||
vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
|
||||
|
||||
vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
|
||||
vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
|
||||
vshr.u64 $temp,`&Dlo("$A4xB")`,#16
|
||||
vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
|
||||
vshr.u64 $temp,`&Dhi("$A4xB")`,#16
|
||||
vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
|
||||
|
||||
vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
|
||||
vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
|
||||
vshr.u64 $temp,`&Dlo("$A5xB")`,#16
|
||||
vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
|
||||
vshr.u64 $temp,`&Dhi("$A5xB")`,#16
|
||||
vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
|
||||
|
||||
vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
|
||||
vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
|
||||
vshr.u64 $temp,`&Dlo("$A6xB")`,#16
|
||||
vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
|
||||
vld1.64 {$A0xB}, [$tinptr, :128]!
|
||||
vshr.u64 $temp,`&Dhi("$A6xB")`,#16
|
||||
vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
|
||||
|
||||
vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
|
||||
vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
|
||||
vshr.u64 $temp,`&Dlo("$A7xB")`,#16
|
||||
vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
|
||||
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
|
||||
vshr.u64 $temp,`&Dhi("$A7xB")`,#16
|
||||
vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
|
||||
.align 4
|
||||
.LNEON_8n_inner:
|
||||
subs $inner,$inner,#8
|
||||
vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
|
||||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vld1.64 {@ACC[7]},[$tinptr,:128]
|
||||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||||
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
|
||||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||||
vld1.32 {$N0-$N3},[$nptr]!
|
||||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||||
it ne
|
||||
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
|
||||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||||
___
|
||||
for ($i=1; $i<8; $i++) {
|
||||
$code.=<<___;
|
||||
vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
vst1.64 {@ACC[0]},[$toutptr,:128]!
|
||||
___
|
||||
push(@ACC,shift(@ACC));
|
||||
$code.=<<___;
|
||||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vld1.64 {@ACC[7]},[$tinptr,:128]
|
||||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||||
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
|
||||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||||
it ne
|
||||
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
|
||||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
it eq
|
||||
subeq $aptr,$aptr,$num,lsl#2 @ rewind
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vld1.32 {$A0-$A3},[$aptr]!
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
add $bnptr,sp,#8 @ rewind
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vst1.64 {@ACC[0]},[$toutptr,:128]!
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
|
||||
bne .LNEON_8n_inner
|
||||
___
|
||||
push(@ACC,shift(@ACC));
|
||||
$code.=<<___;
|
||||
add $tinptr,sp,#128
|
||||
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
|
||||
veor q2,q2,q2 @ $N0-$N1
|
||||
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
|
||||
veor q3,q3,q3 @ $N2-$N3
|
||||
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
|
||||
vst1.64 {@ACC[6]},[$toutptr,:128]
|
||||
|
||||
subs $outer,$outer,#8
|
||||
vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
|
||||
vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
|
||||
vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
|
||||
vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
|
||||
|
||||
itt ne
|
||||
subne $nptr,$nptr,$num,lsl#2 @ rewind
|
||||
bne .LNEON_8n_outer
|
||||
|
||||
add $toutptr,sp,#128
|
||||
vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
|
||||
vshr.u64 $temp,@ACC[0]#lo,#16
|
||||
vst1.64 {q2-q3},[sp,:256]!
|
||||
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
|
||||
vst1.64 {q2-q3}, [sp,:256]!
|
||||
vshr.u64 $temp,@ACC[0]#hi,#16
|
||||
vst1.64 {q2-q3}, [sp,:256]!
|
||||
vzip.16 @ACC[0]#lo,@ACC[0]#hi
|
||||
|
||||
mov $inner,$num
|
||||
b .LNEON_tail_entry
|
||||
|
||||
.align 4
|
||||
.LNEON_tail:
|
||||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
|
||||
vshr.u64 $temp,@ACC[0]#lo,#16
|
||||
vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
|
||||
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
|
||||
vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
|
||||
vshr.u64 $temp,@ACC[0]#hi,#16
|
||||
vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
|
||||
vzip.16 @ACC[0]#lo,@ACC[0]#hi
|
||||
|
||||
.LNEON_tail_entry:
|
||||
___
|
||||
for ($i=1; $i<8; $i++) {
|
||||
$code.=<<___;
|
||||
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
|
||||
vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
|
||||
vshr.u64 $temp,@ACC[1]#lo,#16
|
||||
vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
|
||||
vshr.u64 $temp,@ACC[1]#hi,#16
|
||||
vzip.16 @ACC[1]#lo,@ACC[1]#hi
|
||||
___
|
||||
push(@ACC,shift(@ACC));
|
||||
}
|
||||
push(@ACC,shift(@ACC));
|
||||
$code.=<<___;
|
||||
vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
|
||||
subs $inner,$inner,#8
|
||||
vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
|
||||
bne .LNEON_tail
|
||||
|
||||
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
|
||||
@ -708,8 +736,14 @@ $code.=<<___;
|
||||
#endif
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||||
$code =~ s/\bret\b/bx lr/gm;
|
||||
print $code;
|
||||
# Post-process the generated assembly one line at a time and print it.
foreach (split("\n",$code)) {
    # Expand `...` spans by evaluating the enclosed Perl expression.
    s/\`([^\`]*)\`/eval $1/ge;

    # At most one of the following rewrites is applied to a line, tried in
    # this order:
    #   1. "qN#lo" / "qN#hi"  ->  "d(2N)" / "d(2N+1)"
    #   2. "ret"              ->  "bx lr"
    #   3. literal "bx lr"    ->  raw opcode word
    # A "bx lr" produced by step 2 is therefore never turned into ".word".
    unless (s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge
            || s/\bret\b/bx lr/g) {
        s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
    }

    print $_,"\n";
}
|
||||
|
||||
# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so fail loudly instead of silently leaving truncated assembly.
close STDOUT or die "error closing STDOUT: $!";
|
||||
|
Loading…
Reference in New Issue
Block a user