mirror of
https://github.com/openssl/openssl.git
synced 2025-04-06 20:20:50 +08:00
Optimize chacha20 on aarch64 by SVE2
This patch improves the existing chacha20 SVE patch by using SVE2, an optional architecture feature of aarch64 whose XAR instruction can improve the performance of chacha20. Signed-off-by: Daniel Hu <Daniel.Hu@arm.com> Reviewed-by: Tomas Mraz <tomas@openssl.org> Reviewed-by: Paul Dale <pauli@openssl.org> (Merged from https://github.com/openssl/openssl/pull/18522)
This commit is contained in:
parent
b147b9daf1
commit
bcb52bcc9f
@ -31,17 +31,25 @@ sub AUTOLOAD() # thunk [simplified] x86-style perlasm
|
||||
}
|
||||
|
||||
my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
|
||||
my ($state) = ("x5");
|
||||
my ($veclen_w,$veclen,$blocks) = ("w6","x6","x7");
|
||||
my ($saved_outp) = ("x8");
|
||||
my ($wctr, $xctr) = ("w9", "x9");
|
||||
my @mx=map("z$_",(0..7,16..23));
|
||||
my ($veclen_w,$veclen,$blocks) = ("w5","x5","x6");
|
||||
my ($sve2flag) = ("x7");
|
||||
my ($wctr, $xctr) = ("w8", "x8");
|
||||
my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
|
||||
my ($tmp,$tmpw) = ("x10", "w10");
|
||||
my ($counter) = ("x11");
|
||||
my @K=map("x$_",(12..15,19..22));
|
||||
my @KL=map("w$_",(12..15,19..22));
|
||||
my @mx=map("z$_",(0..15));
|
||||
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
|
||||
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
|
||||
my @xt=map("z$_",(24..31,8..11));
|
||||
my ($rot8) = ("z12");
|
||||
my ($zctr) = ("z13");
|
||||
my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7,$xt8,$xt9,$xt10,$xt11)=@xt;
|
||||
my ($zctr) = ("z16");
|
||||
my @xt=map("z$_",(17..24));
|
||||
my @perm=map("z$_",(25..30));
|
||||
my ($rot8) = ("z31");
|
||||
my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7)=@xt;
|
||||
# in SVE mode we can only use bak0 ~ bak9 (the rest are used as scratch registers)
|
||||
# in SVE2 mode we use all 15 backup registers
|
||||
my ($bak0,$bak1,$bak2,$bak3,$bak4,$bak5,$bak6,$bak7,$bak8,$bak9,$bak10,$bak11,$bak13,$bak14,$bak15)=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],$xt4,$xt5,$xt6,$xt7,$xt0,$xt1,$xt2,$xt3,$rot8);
|
||||
my $debug_encoder=0;
|
||||
|
||||
sub SVE_ADD() {
|
||||
@ -148,8 +156,12 @@ sub SVE_QR_GROUP() {
|
||||
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;
|
||||
|
||||
&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
|
||||
&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
|
||||
&SVE_REV16($d0,$d1,$d2,$d3);
|
||||
if ($have_sve2 == 0) {
|
||||
&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
|
||||
&SVE_REV16($d0,$d1,$d2,$d3);
|
||||
} else {
|
||||
&SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
|
||||
}
|
||||
|
||||
&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
|
||||
if ($have_sve2 == 0) {
|
||||
@ -162,8 +174,12 @@ sub SVE_QR_GROUP() {
|
||||
}
|
||||
|
||||
&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
|
||||
&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
|
||||
&SVE_ROT8($d0,$d1,$d2,$d3);
|
||||
if ($have_sve2 == 0) {
|
||||
&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
|
||||
&SVE_ROT8($d0,$d1,$d2,$d3);
|
||||
} else {
|
||||
&SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
|
||||
}
|
||||
|
||||
&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
|
||||
if ($have_sve2 == 0) {
|
||||
@ -178,26 +194,31 @@ sub SVE_QR_GROUP() {
|
||||
|
||||
sub SVE_INNER_BLOCK() {
|
||||
$code.=<<___;
|
||||
//cbnz $sve2flag, 10f
|
||||
mov $counter,#10
|
||||
1:
|
||||
.align 5
|
||||
___
|
||||
&SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
|
||||
&SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
|
||||
$code.=<<___;
|
||||
// SVE 2 not enabled until hardware available
|
||||
#if 0
|
||||
b 11f
|
||||
10:
|
||||
___
|
||||
# &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
|
||||
# &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
|
||||
$code.=<<___;
|
||||
11:
|
||||
#endif
|
||||
subs $counter,$counter,1
|
||||
b.ne 1b
|
||||
___
|
||||
}
|
||||
|
||||
{{{
|
||||
my ($dlen,$rsize,$tmp) = ("x10","x11","x12");
|
||||
# SVE2_INNER_BLOCK: append the SVE2 flavour of the inner round loop to $code.
#
# Emitted sequence:
#   - "mov $counter,#10" followed by local label "1:" and ".align 5";
#   - two SVE_QR_GROUP expansions, the first over the index quadruples
#     (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15) and the second over
#     (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14);
#   - "subs $counter,$counter,1" / "b.ne 1b", so the generated body runs
#     ten times.
#
# Takes no arguments and is called for its side effect on $code only.
# NOTE(review): the leading 1 passed to SVE_QR_GROUP is presumably its
# $have_sve2 selector (the XAR-based path); SVE_QR_GROUP's argument
# unpacking is not fully visible here -- confirm.
sub SVE2_INNER_BLOCK() {
|
||||
$code.=<<___;
|
||||
mov $counter,#10
|
||||
1:
|
||||
.align 5
|
||||
___
|
||||
&SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
|
||||
&SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
|
||||
$code.=<<___;
|
||||
subs $counter,$counter,1
|
||||
b.ne 1b
|
||||
___
|
||||
}
|
||||
|
||||
sub load() {
|
||||
my $x0 = shift;
|
||||
@ -252,72 +273,75 @@ sub transpose() {
|
||||
my $xd = shift;
|
||||
|
||||
$code.=<<___;
|
||||
zip1 $xt8.s,$xa.s,$xb.s
|
||||
zip2 $xt9.s,$xa.s,$xb.s
|
||||
zip1 $xt10.s,$xc.s,$xd.s
|
||||
zip2 $xt11.s,$xc.s,$xd.s
|
||||
zip1 $xa.d,$xt8.d,$xt10.d
|
||||
zip2 $xb.d,$xt8.d,$xt10.d
|
||||
zip1 $xc.d,$xt9.d,$xt11.d
|
||||
zip2 $xd.d,$xt9.d,$xt11.d
|
||||
zip1 $xt0.s,$xa.s,$xb.s
|
||||
zip2 $xt1.s,$xa.s,$xb.s
|
||||
zip1 $xt2.s,$xc.s,$xd.s
|
||||
zip2 $xt3.s,$xc.s,$xd.s
|
||||
zip1 $xa.d,$xt0.d,$xt2.d
|
||||
zip2 $xb.d,$xt0.d,$xt2.d
|
||||
zip1 $xc.d,$xt1.d,$xt3.d
|
||||
zip2 $xd.d,$xt1.d,$xt3.d
|
||||
___
|
||||
}
|
||||
|
||||
sub add_states() {
|
||||
my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
|
||||
|
||||
sub SVE_ADD_STATES() {
|
||||
$code.=<<___;
|
||||
ldp $tmpw0,$tmpw1,[$state]
|
||||
ldp $tmpw2,$tmpw3,[$state,#8]
|
||||
dup $xt0.s,$tmpw0
|
||||
lsr $tmp1,@K[5],#32
|
||||
dup $xt0.s,@KL[5]
|
||||
dup $xt1.s,$tmpw1
|
||||
dup $xt2.s,$tmpw2
|
||||
dup $xt3.s,$tmpw3
|
||||
ldp $tmpw0,$tmpw1,[$state,#16]
|
||||
ldp $tmpw2,$tmpw3,[$state,#24]
|
||||
add @mx[0].s,@mx[0].s,$xt0.s
|
||||
add @mx[1].s,@mx[1].s,$xt1.s
|
||||
add @mx[2].s,@mx[2].s,$xt2.s
|
||||
add @mx[3].s,@mx[3].s,$xt3.s
|
||||
add @mx[0].s,@mx[0].s,$bak0.s
|
||||
add @mx[1].s,@mx[1].s,$bak1.s
|
||||
add @mx[2].s,@mx[2].s,$bak2.s
|
||||
add @mx[3].s,@mx[3].s,$bak3.s
|
||||
add @mx[4].s,@mx[4].s,$bak4.s
|
||||
add @mx[5].s,@mx[5].s,$bak5.s
|
||||
add @mx[6].s,@mx[6].s,$bak6.s
|
||||
add @mx[7].s,@mx[7].s,$bak7.s
|
||||
add @mx[8].s,@mx[8].s,$bak8.s
|
||||
add @mx[9].s,@mx[9].s,$bak9.s
|
||||
lsr $tmp0,@K[6],#32
|
||||
dup $xt4.s,$tmpw0
|
||||
dup $xt5.s,$tmpw1
|
||||
dup $xt6.s,$tmpw2
|
||||
dup $xt7.s,$tmpw3
|
||||
ldp $tmpw0,$tmpw1,[$state,#32]
|
||||
ldp $tmpw2,$tmpw3,[$state,#40]
|
||||
add @mx[4].s,@mx[4].s,$xt4.s
|
||||
add @mx[5].s,@mx[5].s,$xt5.s
|
||||
add @mx[6].s,@mx[6].s,$xt6.s
|
||||
add @mx[7].s,@mx[7].s,$xt7.s
|
||||
dup $xt0.s,$tmpw0
|
||||
dup $xt1.s,$tmpw1
|
||||
dup $xt2.s,$tmpw2
|
||||
dup $xt3.s,$tmpw3
|
||||
ldp $tmpw0,$tmpw1,[$state,#48]
|
||||
ldp $tmpw2,$tmpw3,[$state,#56]
|
||||
add @mx[8].s,@mx[8].s,$xt0.s
|
||||
add @mx[9].s,@mx[9].s,$xt1.s
|
||||
add @mx[10].s,@mx[10].s,$xt2.s
|
||||
add @mx[11].s,@mx[11].s,$xt3.s
|
||||
dup $xt5.s,$tmpw1
|
||||
dup $xt6.s,$tmpw2
|
||||
dup $xt7.s,$tmpw3
|
||||
lsr $tmp1,@K[7],#32
|
||||
dup $xt5.s,@KL[7]
|
||||
dup $xt6.s,$tmpw1
|
||||
add @mx[10].s,@mx[10].s,$xt0.s
|
||||
add @mx[11].s,@mx[11].s,$xt1.s
|
||||
add @mx[12].s,@mx[12].s,$zctr.s
|
||||
add @mx[13].s,@mx[13].s,$xt5.s
|
||||
add @mx[14].s,@mx[14].s,$xt6.s
|
||||
add @mx[15].s,@mx[15].s,$xt7.s
|
||||
add @mx[13].s,@mx[13].s,$xt4.s
|
||||
add @mx[14].s,@mx[14].s,$xt5.s
|
||||
add @mx[15].s,@mx[15].s,$xt6.s
|
||||
___
|
||||
}
|
||||
|
||||
# SVE2_ADD_STATES: append sixteen SVE vector "add" instructions to $code,
# one per working register @mx[0..15], each operating on .s elements.
#
# @mx[0..11] have $bak0..$bak11 added, @mx[13..15] have $bak13..$bak15
# added, and @mx[12] has $zctr added instead of a backup register (there
# is no $bak12).
#
# Takes no arguments and is called for its side effect on $code only.
# NOTE(review): presumably this is the "add the saved input state back"
# step that follows the rounds, with $zctr holding the per-lane block
# counters -- confirm against the code that fills the $bak registers.
sub SVE2_ADD_STATES() {
|
||||
$code.=<<___;
|
||||
add @mx[0].s,@mx[0].s,$bak0.s
|
||||
add @mx[1].s,@mx[1].s,$bak1.s
|
||||
add @mx[2].s,@mx[2].s,$bak2.s
|
||||
add @mx[3].s,@mx[3].s,$bak3.s
|
||||
add @mx[4].s,@mx[4].s,$bak4.s
|
||||
add @mx[5].s,@mx[5].s,$bak5.s
|
||||
add @mx[6].s,@mx[6].s,$bak6.s
|
||||
add @mx[7].s,@mx[7].s,$bak7.s
|
||||
add @mx[8].s,@mx[8].s,$bak8.s
|
||||
add @mx[9].s,@mx[9].s,$bak9.s
|
||||
add @mx[10].s,@mx[10].s,$bak10.s
|
||||
add @mx[11].s,@mx[11].s,$bak11.s
|
||||
add @mx[12].s,@mx[12].s,$zctr.s
|
||||
add @mx[13].s,@mx[13].s,$bak13.s
|
||||
add @mx[14].s,@mx[14].s,$bak14.s
|
||||
add @mx[15].s,@mx[15].s,$bak15.s
|
||||
___
|
||||
}
|
||||
|
||||
sub SVE_TRANSFORMS() {
|
||||
&add_states();
|
||||
&transpose($xa0,$xb0,$xc0,$xd0);
|
||||
&transpose($xa1,$xb1,$xc1,$xd1);
|
||||
&transpose($xa2,$xb2,$xc2,$xd2);
|
||||
&transpose($xa3,$xb3,$xc3,$xd3);
|
||||
&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
|
||||
&transpose($xa0,$xa1,$xa2,$xa3);
|
||||
&transpose($xb0,$xb1,$xb2,$xb3);
|
||||
&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
|
||||
$code.=<<___;
|
||||
eor $xa0.d,$xa0.d,$xt0.d
|
||||
eor $xa1.d,$xa1.d,$xt1.d
|
||||
@ -330,8 +354,8 @@ $code.=<<___;
|
||||
___
|
||||
&transpose($xc0,$xc1,$xc2,$xc3);
|
||||
&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
|
||||
&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
|
||||
&transpose($xd0,$xd1,$xd2,$xd3);
|
||||
&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
|
||||
$code.=<<___;
|
||||
eor $xc0.d,$xc0.d,$xt0.d
|
||||
eor $xc1.d,$xc1.d,$xt1.d
|
||||
@ -348,73 +372,111 @@ $code.=<<___;
|
||||
incw $zctr.s, ALL, MUL #1
|
||||
___
|
||||
}
|
||||
}}}
|
||||
|
||||
sub SVE_LOAD_STATES() {
|
||||
my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
|
||||
|
||||
$code.=<<___;
|
||||
// FIXME following code are not functionally necessary
|
||||
// but appear to enhance performance
|
||||
#if 1
|
||||
ptrues p2.s,ALL
|
||||
ptrues p2.s,ALL
|
||||
ptrues p2.s,ALL
|
||||
ptrues p2.s,ALL
|
||||
ptrues p2.s,ALL
|
||||
ptrues p2.s,ALL
|
||||
#endif
|
||||
lsr $tmp0,@K[0],#32
|
||||
dup @mx[0].s,@KL[0]
|
||||
dup $bak0.s,@KL[0]
|
||||
dup @mx[1].s,$tmpw0
|
||||
dup $bak1.s,$tmpw0
|
||||
lsr $tmp1,@K[1],#32
|
||||
dup @mx[2].s,@KL[1]
|
||||
dup $bak2.s,@KL[1]
|
||||
dup @mx[3].s,$tmpw1
|
||||
dup $bak3.s,$tmpw1
|
||||
lsr $tmp0,@K[2],#32
|
||||
dup @mx[4].s,@KL[2]
|
||||
dup $bak4.s,@KL[2]
|
||||
dup @mx[5].s,$tmpw0
|
||||
dup $bak5.s,$tmpw0
|
||||
lsr $tmp1,@K[3],#32
|
||||
dup @mx[6].s,@KL[3]
|
||||
dup $bak6.s,@KL[3]
|
||||
dup @mx[7].s,$tmpw1
|
||||
dup $bak7.s,$tmpw1
|
||||
lsr $tmp0,@K[4],#32
|
||||
dup @mx[8].s,@KL[4]
|
||||
dup $bak8.s,@KL[4]
|
||||
dup @mx[9].s,$tmpw0
|
||||
dup $bak9.s,$tmpw0
|
||||
lsr $tmp1,@K[5],#32
|
||||
dup @mx[10].s,@KL[5]
|
||||
dup @mx[11].s,$tmpw1
|
||||
orr @mx[12].d,$zctr.d,$zctr.d
|
||||
lsr $tmp0,@K[6],#32
|
||||
dup @mx[13].s,$tmpw0
|
||||
lsr $tmp1,@K[7],#32
|
||||
dup @mx[14].s,@KL[7]
|
||||
dup @mx[15].s,$tmpw1
|
||||
___
|
||||
}
|
||||
|
||||
sub SVE2_LOAD_STATES() {
|
||||
$code.=<<___;
|
||||
ldp $tmpw0,$tmpw1,[$state]
|
||||
ldp $tmpw2,$tmpw3,[$state,#8]
|
||||
dup @mx[0].s,$tmpw0
|
||||
dup @mx[1].s,$tmpw1
|
||||
dup @mx[2].s,$tmpw2
|
||||
dup @mx[3].s,$tmpw3
|
||||
ldp $tmpw0,$tmpw1,[$state,#16]
|
||||
ldp $tmpw2,$tmpw3,[$state,#24]
|
||||
dup @mx[4].s,$tmpw0
|
||||
dup @mx[5].s,$tmpw1
|
||||
dup @mx[6].s,$tmpw2
|
||||
dup @mx[7].s,$tmpw3
|
||||
ldp $tmpw0,$tmpw1,[$state,#32]
|
||||
ldp $tmpw2,$tmpw3,[$state,#40]
|
||||
dup @mx[8].s,$tmpw0
|
||||
dup @mx[9].s,$tmpw1
|
||||
dup @mx[10].s,$tmpw2
|
||||
dup @mx[11].s,$tmpw3
|
||||
ldp $tmpw0,$tmpw1,[$state, #48]
|
||||
ldp $tmpw2,$tmpw3,[$state,#56]
|
||||
mov @mx[12].s,p0/m,$zctr.s
|
||||
dup @mx[13].s,$tmpw1
|
||||
dup @mx[14].s,$tmpw2
|
||||
dup @mx[15].s,$tmpw3
|
||||
lsr $tmp0,@K[0],#32
|
||||
dup @mx[0].s,@KL[0]
|
||||
dup $bak0.s,@KL[0]
|
||||
dup @mx[1].s,$tmpw0
|
||||
dup $bak1.s,$tmpw0
|
||||
lsr $tmp1,@K[1],#32
|
||||
dup @mx[2].s,@KL[1]
|
||||
dup $bak2.s,@KL[1]
|
||||
dup @mx[3].s,$tmpw1
|
||||
dup $bak3.s,$tmpw1
|
||||
lsr $tmp0,@K[2],#32
|
||||
dup @mx[4].s,@KL[2]
|
||||
dup $bak4.s,@KL[2]
|
||||
dup @mx[5].s,$tmpw0
|
||||
dup $bak5.s,$tmpw0
|
||||
lsr $tmp1,@K[3],#32
|
||||
dup @mx[6].s,@KL[3]
|
||||
dup $bak6.s,@KL[3]
|
||||
dup @mx[7].s,$tmpw1
|
||||
dup $bak7.s,$tmpw1
|
||||
lsr $tmp0,@K[4],#32
|
||||
dup @mx[8].s,@KL[4]
|
||||
dup $bak8.s,@KL[4]
|
||||
dup @mx[9].s,$tmpw0
|
||||
dup $bak9.s,$tmpw0
|
||||
lsr $tmp1,@K[5],#32
|
||||
dup @mx[10].s,@KL[5]
|
||||
dup $bak10.s,@KL[5]
|
||||
dup @mx[11].s,$tmpw1
|
||||
dup $bak11.s,$tmpw1
|
||||
orr @mx[12].d,$zctr.d,$zctr.d
|
||||
lsr $tmp0,@K[6],#32
|
||||
dup @mx[13].s,$tmpw0
|
||||
dup $bak13.s,$tmpw0
|
||||
lsr $tmp1,@K[7],#32
|
||||
dup @mx[14].s,@KL[7]
|
||||
dup $bak14.s,@KL[7]
|
||||
dup @mx[15].s,$tmpw1
|
||||
dup $bak15.s,$tmpw1
|
||||
___
|
||||
}
|
||||
|
||||
sub sve_handle_blocks() {
|
||||
my ($counter) = ("x10");
|
||||
|
||||
&SVE_LOAD_STATES();
|
||||
$code.=<<___;
|
||||
mov $counter,#10
|
||||
.align 5
|
||||
1:
|
||||
cbz $sve2flag,.sve_inner
|
||||
___
|
||||
|
||||
&SVE_INNER_BLOCK();
|
||||
&SVE2_LOAD_STATES();
|
||||
&SVE2_INNER_BLOCK();
|
||||
&SVE2_ADD_STATES();
|
||||
$code.=<<___;
|
||||
subs $counter,$counter,1
|
||||
b.ne 1b
|
||||
b .fini_inner
|
||||
.sve_inner:
|
||||
___
|
||||
&SVE_LOAD_STATES();
|
||||
&SVE_INNER_BLOCK();
|
||||
&SVE_ADD_STATES();
|
||||
$code.=<<___;
|
||||
.fini_inner:
|
||||
___
|
||||
&SVE_TRANSFORMS();
|
||||
}
|
||||
|
||||
sub chacha20_process() {
|
||||
my ($counter) = ("x10");
|
||||
my ($tmpw) = ("w11");
|
||||
|
||||
$code.=<<___;
|
||||
.align 5
|
||||
.Loop:
|
||||
@ -430,27 +492,18 @@ ___
|
||||
}
|
||||
|
||||
{{{
|
||||
my ($tmp,$tmpw) = ("x10", "w10");
|
||||
my ($tmpw0,$tmpw1) = ("w11", "w12");
|
||||
my ($ptr) = ("x13");
|
||||
|
||||
$code.=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
.arch armv8-a
|
||||
|
||||
#if 0
|
||||
.extern OPENSSL_armcap_P
|
||||
.hidden OPENSSL_armcap_P
|
||||
#endif
|
||||
|
||||
.text
|
||||
.align 5
|
||||
.Lchacha20_consts:
|
||||
.word 0x61707865
|
||||
.word 0x3320646e
|
||||
.word 0x79622d32
|
||||
.word 0x6b206574
|
||||
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
|
||||
.Lrot8:
|
||||
.word 0x02010003,0x04040404,0x02010003,0x04040404
|
||||
.globl ChaCha20_ctr32_sve
|
||||
@ -458,49 +511,55 @@ $code.=<<___;
|
||||
.align 5
|
||||
ChaCha20_ctr32_sve:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
mov $tmp, #64
|
||||
whilelo p0.s,xzr,$tmp
|
||||
cntp $veclen,p0,p0.s
|
||||
// run Neon if we only have 128-bit SVE
|
||||
// in the future, we need to check SVE2
|
||||
cmp $veclen,4
|
||||
b.le .Lreturn
|
||||
cntw $veclen, ALL, MUL #1
|
||||
lsr $blocks,$len,#6
|
||||
cmp $blocks,$veclen
|
||||
b.lt .Lreturn
|
||||
stp d8,d9,[sp,-48]!
|
||||
stp d10,d11,[sp,16]
|
||||
stp d12,d13,[sp,32]
|
||||
sub sp,sp,#64
|
||||
adr $tmp,.Lchacha20_consts
|
||||
ld1 {v0.4s},[$tmp]
|
||||
adr $tmp,.Lrot8
|
||||
ldp $tmpw0,$tmpw1,[$tmp]
|
||||
ld1 {v1.4s,v2.4s},[$key]
|
||||
ld1 {v3.4s},[$ctr]
|
||||
ldr $wctr,[$ctr]
|
||||
index $zctr.s,$wctr,1
|
||||
index $rot8.s,$tmpw0,$tmpw1
|
||||
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[sp]
|
||||
mov $state,sp
|
||||
#if 0
|
||||
// SVE2 code not enabled until we have hardware
|
||||
// for verification
|
||||
mov $sve2flag,0
|
||||
adrp $tmp,OPENSSL_armcap_P
|
||||
ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
|
||||
tst $tmpw,#ARMV8_SVE2
|
||||
b.eq 1f
|
||||
mov $sve2flag,1
|
||||
b 2f
|
||||
1:
|
||||
cmp $veclen,4
|
||||
b.le .Lreturn
|
||||
adr $tmp,.Lrot8
|
||||
ldp $tmpw0,$tmpw1,[$tmp]
|
||||
index $rot8.s,$tmpw0,$tmpw1
|
||||
2:
|
||||
stp d8,d9,[sp,-96]!
|
||||
stp d10,d11,[sp,16]
|
||||
stp d12,d13,[sp,32]
|
||||
stp d14,d15,[sp,48]
|
||||
stp x19,x20,[sp,64]
|
||||
stp x21,x22,[sp,80]
|
||||
adr $tmp,.Lchacha20_consts
|
||||
ldp @K[0],@K[1],[$tmp]
|
||||
ldp @K[2],@K[3],[$key]
|
||||
ldp @K[4],@K[5],[$key, 16]
|
||||
ldp @K[6],@K[7],[$ctr]
|
||||
ldr $wctr,[$ctr]
|
||||
index $zctr.s,$wctr,1
|
||||
ptrues p0.s,ALL
|
||||
#ifdef __AARCH64EB__
|
||||
ror @K[2],@K[2],#32
|
||||
ror @K[3],@K[3],#32
|
||||
ror @K[4],@K[4],#32
|
||||
ror @K[5],@K[5],#32
|
||||
ror @K[6],@K[6],#32
|
||||
ror @K[7],@K[7],#32
|
||||
#endif
|
||||
___
|
||||
&chacha20_process();
|
||||
$code.=<<___;
|
||||
add sp,sp,#64
|
||||
ldp d10,d11,[sp,16]
|
||||
ldp d12,d13,[sp,32]
|
||||
ldp d8,d9,[sp],48
|
||||
ldp d14,d15,[sp,48]
|
||||
ldp x19,x20,[sp,64]
|
||||
ldp x21,x22,[sp,80]
|
||||
ldp d8,d9,[sp],96
|
||||
str $wctr,[$ctr]
|
||||
and $len,$len,#63
|
||||
add $len,$len,$blocks,lsl #6
|
||||
@ -514,6 +573,7 @@ ___
|
||||
########################################
|
||||
{
|
||||
my %opcode_unpred = (
|
||||
"movprfx" => 0x0420BC00,
|
||||
"eor" => 0x04a03000,
|
||||
"add" => 0x04200000,
|
||||
"orr" => 0x04603000,
|
||||
@ -528,6 +588,7 @@ my %opcode_unpred = (
|
||||
"index" => 0x04204C00,
|
||||
"mov" => 0x05203800,
|
||||
"dup" => 0x05203800,
|
||||
"cntw" => 0x04A0E000,
|
||||
"tbl" => 0x05203000);
|
||||
|
||||
my %opcode_imm_unpred = (
|
||||
@ -564,6 +625,7 @@ my %opcode_pred = (
|
||||
"st4w" => 0xE570E000,
|
||||
"st1w" => 0xE500E000,
|
||||
"ld1w" => 0xA540A000,
|
||||
"ld1rw" => 0x8540C000,
|
||||
"revh" => 0x05258000);
|
||||
|
||||
my %tsize = (
|
||||
@ -740,6 +802,10 @@ sub sve_pred {
|
||||
if ($addr =~ m/x([0-9]+)\s*/o) {
|
||||
$xn = $1;
|
||||
}
|
||||
|
||||
if ($mnemonic =~m/ld1r[bhwd]/o) {
|
||||
$size = 0;
|
||||
}
|
||||
if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
|
||||
return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
|
||||
} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
|
||||
@ -810,8 +876,14 @@ sub sve_other {
|
||||
} elsif ($arg =~ m/x([0-9]+)/o) {
|
||||
return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
|
||||
}
|
||||
} elsif ($mnemonic =~ /cnt[bhdw]/) {
|
||||
if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
|
||||
return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
|
||||
}
|
||||
} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
|
||||
return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
|
||||
} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
|
||||
return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
|
||||
}
|
||||
sprintf "%s // fail to parse", $inst;
|
||||
}
|
||||
@ -834,9 +906,10 @@ foreach(split("\n",$code)) {
|
||||
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
|
||||
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
|
||||
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
|
||||
s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
|
||||
s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
|
||||
s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
|
||||
s/\b(cntp|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
|
||||
s/\b(movprfx|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
|
||||
print $_,"\n";
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user