Optimize chacha20 on aarch64 with SVE2

This patch improves the existing ChaCha20 SVE implementation by using SVE2,
an optional AArch64 architecture feature. SVE2 provides the XAR
(exclusive-OR and rotate) instruction, which combines the XOR and rotate
steps of the ChaCha20 quarter round and so improves performance.

Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/18522)
Daniel Hu 2022-05-25 10:23:40 +01:00 committed by Pauli
parent b147b9daf1
commit bcb52bcc9f
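For reference, the kernel being optimized is the ChaCha20 quarter round, in
which each step is an add, an XOR and a rotate; SVE2's XAR instruction
computes the (right-)rotate of the XOR of its two source vectors in a single
operation. Below is a minimal scalar C sketch, not part of the patch, whose
function names and layout are illustrative only; it marks the XOR/rotate
pairs that the patched perlasm fuses into XAR.

#include <stdint.h>

/* Rotate a 32-bit word left by n bits. */
static inline uint32_t rotl32(uint32_t x, int n)
{
    return (x << n) | (x >> (32 - n));
}

/* Reference ChaCha20 quarter round.  Each "x ^= y; x = rotl32(x, n);"
 * pair is the work that one SVE2 XAR instruction can perform across a
 * whole vector of state words (XAR rotates the XOR of its operands). */
static void chacha20_quarter_round(uint32_t *a, uint32_t *b,
                                   uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = rotl32(*d, 16);  /* XOR + rotate -> XAR */
    *c += *d; *b ^= *c; *b = rotl32(*b, 12);
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);   /* XOR + rotate -> XAR */
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}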


@@ -31,17 +31,25 @@ sub AUTOLOAD() # thunk [simplified] x86-style perlasm
}
my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($state) = ("x5");
my ($veclen_w,$veclen,$blocks) = ("w6","x6","x7");
my ($saved_outp) = ("x8");
my ($wctr, $xctr) = ("w9", "x9");
my @mx=map("z$_",(0..7,16..23));
my ($veclen_w,$veclen,$blocks) = ("w5","x5","x6");
my ($sve2flag) = ("x7");
my ($wctr, $xctr) = ("w8", "x8");
my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
my ($tmp,$tmpw) = ("x10", "w10");
my ($counter) = ("x11");
my @K=map("x$_",(12..15,19..22));
my @KL=map("w$_",(12..15,19..22));
my @mx=map("z$_",(0..15));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my @xt=map("z$_",(24..31,8..11));
my ($rot8) = ("z12");
my ($zctr) = ("z13");
my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7,$xt8,$xt9,$xt10,$xt11)=@xt;
my ($zctr) = ("z16");
my @xt=map("z$_",(17..24));
my @perm=map("z$_",(25..30));
my ($rot8) = ("z31");
my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7)=@xt;
# in SVE mode we can only use bak0 ~ bak9 (the rest used as scratch register)
# in SVE2 we use all 15 backup register
my ($bak0,$bak1,$bak2,$bak3,$bak4,$bak5,$bak6,$bak7,$bak8,$bak9,$bak10,$bak11,$bak13,$bak14,$bak15)=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],$xt4,$xt5,$xt6,$xt7,$xt0,$xt1,$xt2,$xt3,$rot8);
my $debug_encoder=0;
sub SVE_ADD() {
@@ -148,8 +156,12 @@ sub SVE_QR_GROUP() {
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;
&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
&SVE_REV16($d0,$d1,$d2,$d3);
if ($have_sve2 == 0) {
&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
&SVE_REV16($d0,$d1,$d2,$d3);
} else {
&SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
}
&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
if ($have_sve2 == 0) {
@@ -162,8 +174,12 @@ sub SVE_QR_GROUP() {
}
&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
&SVE_ROT8($d0,$d1,$d2,$d3);
if ($have_sve2 == 0) {
&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
&SVE_ROT8($d0,$d1,$d2,$d3);
} else {
&SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
}
&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
if ($have_sve2 == 0) {
@@ -178,26 +194,31 @@ sub SVE_QR_GROUP() {
sub SVE_INNER_BLOCK() {
$code.=<<___;
//cbnz $sve2flag, 10f
mov $counter,#10
1:
.align 5
___
&SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
&SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
// SVE 2 not enabled until hardware available
#if 0
b 11f
10:
___
# &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
# &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
11:
#endif
subs $counter,$counter,1
b.ne 1b
___
}
{{{
my ($dlen,$rsize,$tmp) = ("x10","x11","x12");
sub SVE2_INNER_BLOCK() {
$code.=<<___;
mov $counter,#10
1:
.align 5
___
&SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
&SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
subs $counter,$counter,1
b.ne 1b
___
}
sub load() {
my $x0 = shift;
@@ -252,72 +273,75 @@ sub transpose() {
my $xd = shift;
$code.=<<___;
zip1 $xt8.s,$xa.s,$xb.s
zip2 $xt9.s,$xa.s,$xb.s
zip1 $xt10.s,$xc.s,$xd.s
zip2 $xt11.s,$xc.s,$xd.s
zip1 $xa.d,$xt8.d,$xt10.d
zip2 $xb.d,$xt8.d,$xt10.d
zip1 $xc.d,$xt9.d,$xt11.d
zip2 $xd.d,$xt9.d,$xt11.d
zip1 $xt0.s,$xa.s,$xb.s
zip2 $xt1.s,$xa.s,$xb.s
zip1 $xt2.s,$xc.s,$xd.s
zip2 $xt3.s,$xc.s,$xd.s
zip1 $xa.d,$xt0.d,$xt2.d
zip2 $xb.d,$xt0.d,$xt2.d
zip1 $xc.d,$xt1.d,$xt3.d
zip2 $xd.d,$xt1.d,$xt3.d
___
}
sub add_states() {
my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
sub SVE_ADD_STATES() {
$code.=<<___;
ldp $tmpw0,$tmpw1,[$state]
ldp $tmpw2,$tmpw3,[$state,#8]
dup $xt0.s,$tmpw0
lsr $tmp1,@K[5],#32
dup $xt0.s,@KL[5]
dup $xt1.s,$tmpw1
dup $xt2.s,$tmpw2
dup $xt3.s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#16]
ldp $tmpw2,$tmpw3,[$state,#24]
add @mx[0].s,@mx[0].s,$xt0.s
add @mx[1].s,@mx[1].s,$xt1.s
add @mx[2].s,@mx[2].s,$xt2.s
add @mx[3].s,@mx[3].s,$xt3.s
add @mx[0].s,@mx[0].s,$bak0.s
add @mx[1].s,@mx[1].s,$bak1.s
add @mx[2].s,@mx[2].s,$bak2.s
add @mx[3].s,@mx[3].s,$bak3.s
add @mx[4].s,@mx[4].s,$bak4.s
add @mx[5].s,@mx[5].s,$bak5.s
add @mx[6].s,@mx[6].s,$bak6.s
add @mx[7].s,@mx[7].s,$bak7.s
add @mx[8].s,@mx[8].s,$bak8.s
add @mx[9].s,@mx[9].s,$bak9.s
lsr $tmp0,@K[6],#32
dup $xt4.s,$tmpw0
dup $xt5.s,$tmpw1
dup $xt6.s,$tmpw2
dup $xt7.s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#32]
ldp $tmpw2,$tmpw3,[$state,#40]
add @mx[4].s,@mx[4].s,$xt4.s
add @mx[5].s,@mx[5].s,$xt5.s
add @mx[6].s,@mx[6].s,$xt6.s
add @mx[7].s,@mx[7].s,$xt7.s
dup $xt0.s,$tmpw0
dup $xt1.s,$tmpw1
dup $xt2.s,$tmpw2
dup $xt3.s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#48]
ldp $tmpw2,$tmpw3,[$state,#56]
add @mx[8].s,@mx[8].s,$xt0.s
add @mx[9].s,@mx[9].s,$xt1.s
add @mx[10].s,@mx[10].s,$xt2.s
add @mx[11].s,@mx[11].s,$xt3.s
dup $xt5.s,$tmpw1
dup $xt6.s,$tmpw2
dup $xt7.s,$tmpw3
lsr $tmp1,@K[7],#32
dup $xt5.s,@KL[7]
dup $xt6.s,$tmpw1
add @mx[10].s,@mx[10].s,$xt0.s
add @mx[11].s,@mx[11].s,$xt1.s
add @mx[12].s,@mx[12].s,$zctr.s
add @mx[13].s,@mx[13].s,$xt5.s
add @mx[14].s,@mx[14].s,$xt6.s
add @mx[15].s,@mx[15].s,$xt7.s
add @mx[13].s,@mx[13].s,$xt4.s
add @mx[14].s,@mx[14].s,$xt5.s
add @mx[15].s,@mx[15].s,$xt6.s
___
}
sub SVE2_ADD_STATES() {
$code.=<<___;
add @mx[0].s,@mx[0].s,$bak0.s
add @mx[1].s,@mx[1].s,$bak1.s
add @mx[2].s,@mx[2].s,$bak2.s
add @mx[3].s,@mx[3].s,$bak3.s
add @mx[4].s,@mx[4].s,$bak4.s
add @mx[5].s,@mx[5].s,$bak5.s
add @mx[6].s,@mx[6].s,$bak6.s
add @mx[7].s,@mx[7].s,$bak7.s
add @mx[8].s,@mx[8].s,$bak8.s
add @mx[9].s,@mx[9].s,$bak9.s
add @mx[10].s,@mx[10].s,$bak10.s
add @mx[11].s,@mx[11].s,$bak11.s
add @mx[12].s,@mx[12].s,$zctr.s
add @mx[13].s,@mx[13].s,$bak13.s
add @mx[14].s,@mx[14].s,$bak14.s
add @mx[15].s,@mx[15].s,$bak15.s
___
}
sub SVE_TRANSFORMS() {
&add_states();
&transpose($xa0,$xb0,$xc0,$xd0);
&transpose($xa1,$xb1,$xc1,$xd1);
&transpose($xa2,$xb2,$xc2,$xd2);
&transpose($xa3,$xb3,$xc3,$xd3);
&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
&transpose($xa0,$xa1,$xa2,$xa3);
&transpose($xb0,$xb1,$xb2,$xb3);
&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
$code.=<<___;
eor $xa0.d,$xa0.d,$xt0.d
eor $xa1.d,$xa1.d,$xt1.d
@@ -330,8 +354,8 @@ $code.=<<___;
___
&transpose($xc0,$xc1,$xc2,$xc3);
&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
&transpose($xd0,$xd1,$xd2,$xd3);
&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
$code.=<<___;
eor $xc0.d,$xc0.d,$xt0.d
eor $xc1.d,$xc1.d,$xt1.d
@@ -348,73 +372,111 @@ $code.=<<___;
incw $zctr.s, ALL, MUL #1
___
}
}}}
sub SVE_LOAD_STATES() {
my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
$code.=<<___;
// FIXME following code are not functionally necessary
// but appear to enhance performance
#if 1
ptrues p2.s,ALL
ptrues p2.s,ALL
ptrues p2.s,ALL
ptrues p2.s,ALL
ptrues p2.s,ALL
ptrues p2.s,ALL
#endif
lsr $tmp0,@K[0],#32
dup @mx[0].s,@KL[0]
dup $bak0.s,@KL[0]
dup @mx[1].s,$tmpw0
dup $bak1.s,$tmpw0
lsr $tmp1,@K[1],#32
dup @mx[2].s,@KL[1]
dup $bak2.s,@KL[1]
dup @mx[3].s,$tmpw1
dup $bak3.s,$tmpw1
lsr $tmp0,@K[2],#32
dup @mx[4].s,@KL[2]
dup $bak4.s,@KL[2]
dup @mx[5].s,$tmpw0
dup $bak5.s,$tmpw0
lsr $tmp1,@K[3],#32
dup @mx[6].s,@KL[3]
dup $bak6.s,@KL[3]
dup @mx[7].s,$tmpw1
dup $bak7.s,$tmpw1
lsr $tmp0,@K[4],#32
dup @mx[8].s,@KL[4]
dup $bak8.s,@KL[4]
dup @mx[9].s,$tmpw0
dup $bak9.s,$tmpw0
lsr $tmp1,@K[5],#32
dup @mx[10].s,@KL[5]
dup @mx[11].s,$tmpw1
orr @mx[12].d,$zctr.d,$zctr.d
lsr $tmp0,@K[6],#32
dup @mx[13].s,$tmpw0
lsr $tmp1,@K[7],#32
dup @mx[14].s,@KL[7]
dup @mx[15].s,$tmpw1
___
}
sub SVE2_LOAD_STATES() {
$code.=<<___;
ldp $tmpw0,$tmpw1,[$state]
ldp $tmpw2,$tmpw3,[$state,#8]
dup @mx[0].s,$tmpw0
dup @mx[1].s,$tmpw1
dup @mx[2].s,$tmpw2
dup @mx[3].s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#16]
ldp $tmpw2,$tmpw3,[$state,#24]
dup @mx[4].s,$tmpw0
dup @mx[5].s,$tmpw1
dup @mx[6].s,$tmpw2
dup @mx[7].s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#32]
ldp $tmpw2,$tmpw3,[$state,#40]
dup @mx[8].s,$tmpw0
dup @mx[9].s,$tmpw1
dup @mx[10].s,$tmpw2
dup @mx[11].s,$tmpw3
ldp $tmpw0,$tmpw1,[$state, #48]
ldp $tmpw2,$tmpw3,[$state,#56]
mov @mx[12].s,p0/m,$zctr.s
dup @mx[13].s,$tmpw1
dup @mx[14].s,$tmpw2
dup @mx[15].s,$tmpw3
lsr $tmp0,@K[0],#32
dup @mx[0].s,@KL[0]
dup $bak0.s,@KL[0]
dup @mx[1].s,$tmpw0
dup $bak1.s,$tmpw0
lsr $tmp1,@K[1],#32
dup @mx[2].s,@KL[1]
dup $bak2.s,@KL[1]
dup @mx[3].s,$tmpw1
dup $bak3.s,$tmpw1
lsr $tmp0,@K[2],#32
dup @mx[4].s,@KL[2]
dup $bak4.s,@KL[2]
dup @mx[5].s,$tmpw0
dup $bak5.s,$tmpw0
lsr $tmp1,@K[3],#32
dup @mx[6].s,@KL[3]
dup $bak6.s,@KL[3]
dup @mx[7].s,$tmpw1
dup $bak7.s,$tmpw1
lsr $tmp0,@K[4],#32
dup @mx[8].s,@KL[4]
dup $bak8.s,@KL[4]
dup @mx[9].s,$tmpw0
dup $bak9.s,$tmpw0
lsr $tmp1,@K[5],#32
dup @mx[10].s,@KL[5]
dup $bak10.s,@KL[5]
dup @mx[11].s,$tmpw1
dup $bak11.s,$tmpw1
orr @mx[12].d,$zctr.d,$zctr.d
lsr $tmp0,@K[6],#32
dup @mx[13].s,$tmpw0
dup $bak13.s,$tmpw0
lsr $tmp1,@K[7],#32
dup @mx[14].s,@KL[7]
dup $bak14.s,@KL[7]
dup @mx[15].s,$tmpw1
dup $bak15.s,$tmpw1
___
}
sub sve_handle_blocks() {
my ($counter) = ("x10");
&SVE_LOAD_STATES();
$code.=<<___;
mov $counter,#10
.align 5
1:
cbz $sve2flag,.sve_inner
___
&SVE_INNER_BLOCK();
&SVE2_LOAD_STATES();
&SVE2_INNER_BLOCK();
&SVE2_ADD_STATES();
$code.=<<___;
subs $counter,$counter,1
b.ne 1b
b .fini_inner
.sve_inner:
___
&SVE_LOAD_STATES();
&SVE_INNER_BLOCK();
&SVE_ADD_STATES();
$code.=<<___;
.fini_inner:
___
&SVE_TRANSFORMS();
}
sub chacha20_process() {
my ($counter) = ("x10");
my ($tmpw) = ("w11");
$code.=<<___;
.align 5
.Loop:
@@ -430,27 +492,18 @@ ___
}
{{{
my ($tmp,$tmpw) = ("x10", "w10");
my ($tmpw0,$tmpw1) = ("w11", "w12");
my ($ptr) = ("x13");
$code.=<<___;
#include "arm_arch.h"
.arch armv8-a
#if 0
.extern OPENSSL_armcap_P
.hidden OPENSSL_armcap_P
#endif
.text
.align 5
.Lchacha20_consts:
.word 0x61707865
.word 0x3320646e
.word 0x79622d32
.word 0x6b206574
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lrot8:
.word 0x02010003,0x04040404,0x02010003,0x04040404
.globl ChaCha20_ctr32_sve
@@ -458,49 +511,55 @@ $code.=<<___;
.align 5
ChaCha20_ctr32_sve:
AARCH64_VALID_CALL_TARGET
mov $tmp, #64
whilelo p0.s,xzr,$tmp
cntp $veclen,p0,p0.s
// run Neon if we only have 128-bit SVE
// in the future, we need to check SVE2
cmp $veclen,4
b.le .Lreturn
cntw $veclen, ALL, MUL #1
lsr $blocks,$len,#6
cmp $blocks,$veclen
b.lt .Lreturn
stp d8,d9,[sp,-48]!
stp d10,d11,[sp,16]
stp d12,d13,[sp,32]
sub sp,sp,#64
adr $tmp,.Lchacha20_consts
ld1 {v0.4s},[$tmp]
adr $tmp,.Lrot8
ldp $tmpw0,$tmpw1,[$tmp]
ld1 {v1.4s,v2.4s},[$key]
ld1 {v3.4s},[$ctr]
ldr $wctr,[$ctr]
index $zctr.s,$wctr,1
index $rot8.s,$tmpw0,$tmpw1
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[sp]
mov $state,sp
#if 0
// SVE2 code not enabled until we have hardware
// for verification
mov $sve2flag,0
adrp $tmp,OPENSSL_armcap_P
ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
tst $tmpw,#ARMV8_SVE2
b.eq 1f
mov $sve2flag,1
b 2f
1:
cmp $veclen,4
b.le .Lreturn
adr $tmp,.Lrot8
ldp $tmpw0,$tmpw1,[$tmp]
index $rot8.s,$tmpw0,$tmpw1
2:
stp d8,d9,[sp,-96]!
stp d10,d11,[sp,16]
stp d12,d13,[sp,32]
stp d14,d15,[sp,48]
stp x19,x20,[sp,64]
stp x21,x22,[sp,80]
adr $tmp,.Lchacha20_consts
ldp @K[0],@K[1],[$tmp]
ldp @K[2],@K[3],[$key]
ldp @K[4],@K[5],[$key, 16]
ldp @K[6],@K[7],[$ctr]
ldr $wctr,[$ctr]
index $zctr.s,$wctr,1
ptrues p0.s,ALL
#ifdef __AARCH64EB__
ror @K[2],@K[2],#32
ror @K[3],@K[3],#32
ror @K[4],@K[4],#32
ror @K[5],@K[5],#32
ror @K[6],@K[6],#32
ror @K[7],@K[7],#32
#endif
___
&chacha20_process();
$code.=<<___;
add sp,sp,#64
ldp d10,d11,[sp,16]
ldp d12,d13,[sp,32]
ldp d8,d9,[sp],48
ldp d14,d15,[sp,48]
ldp x19,x20,[sp,64]
ldp x21,x22,[sp,80]
ldp d8,d9,[sp],96
str $wctr,[$ctr]
and $len,$len,#63
add $len,$len,$blocks,lsl #6
@@ -514,6 +573,7 @@ ___
########################################
{
my %opcode_unpred = (
"movprfx" => 0x0420BC00,
"eor" => 0x04a03000,
"add" => 0x04200000,
"orr" => 0x04603000,
@@ -528,6 +588,7 @@ my %opcode_unpred = (
"index" => 0x04204C00,
"mov" => 0x05203800,
"dup" => 0x05203800,
"cntw" => 0x04A0E000,
"tbl" => 0x05203000);
my %opcode_imm_unpred = (
@@ -564,6 +625,7 @@ my %opcode_pred = (
"st4w" => 0xE570E000,
"st1w" => 0xE500E000,
"ld1w" => 0xA540A000,
"ld1rw" => 0x8540C000,
"revh" => 0x05258000);
my %tsize = (
@@ -740,6 +802,10 @@ sub sve_pred {
if ($addr =~ m/x([0-9]+)\s*/o) {
$xn = $1;
}
if ($mnemonic =~m/ld1r[bhwd]/o) {
$size = 0;
}
if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
@@ -810,8 +876,14 @@ sub sve_other {
} elsif ($arg =~ m/x([0-9]+)/o) {
return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
}
} elsif ($mnemonic =~ /cnt[bhdw]/) {
if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
}
} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
}
sprintf "%s // fail to parse", $inst;
}
@@ -834,9 +906,10 @@ foreach(split("\n",$code)) {
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
s/\b(cntp|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
s/\b(movprfx|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
print $_,"\n";
}