diff --git a/crypto/chacha/asm/chacha-armv8-sve.pl b/crypto/chacha/asm/chacha-armv8-sve.pl
index 6080414e0d..dfc4548a4f 100755
--- a/crypto/chacha/asm/chacha-armv8-sve.pl
+++ b/crypto/chacha/asm/chacha-armv8-sve.pl
@@ -31,17 +31,25 @@ sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
 }
 
 my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
-my ($state) = ("x5");
-my ($veclen_w,$veclen,$blocks) = ("w6","x6","x7");
-my ($saved_outp) = ("x8");
-my ($wctr, $xctr) = ("w9", "x9");
-my @mx=map("z$_",(0..7,16..23));
+my ($veclen_w,$veclen,$blocks) = ("w5","x5","x6");
+my ($sve2flag) = ("x7");
+my ($wctr, $xctr) = ("w8", "x8");
+my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
+my ($tmp,$tmpw) = ("x10", "w10");
+my ($counter) = ("x11");
+my @K=map("x$_",(12..15,19..22));
+my @KL=map("w$_",(12..15,19..22));
+my @mx=map("z$_",(0..15));
 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
-my @xt=map("z$_",(24..31,8..11));
-my ($rot8) = ("z12");
-my ($zctr) = ("z13");
-my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7,$xt8,$xt9,$xt10,$xt11)=@xt;
+my ($zctr) = ("z16");
+my @xt=map("z$_",(17..24));
+my @perm=map("z$_",(25..30));
+my ($rot8) = ("z31");
+my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7)=@xt;
+# In SVE mode only bak0..bak9 can be used (the rest are needed as
+# scratch registers); in SVE2 mode all 15 backup registers are usable.
+my ($bak0,$bak1,$bak2,$bak3,$bak4,$bak5,$bak6,$bak7,$bak8,$bak9,$bak10,$bak11,$bak13,$bak14,$bak15)=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],$xt4,$xt5,$xt6,$xt7,$xt0,$xt1,$xt2,$xt3,$rot8);
 my $debug_encoder=0;
 
 sub SVE_ADD() {
@@ -148,8 +156,12 @@ sub SVE_QR_GROUP() {
 	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;
 
 	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
-	&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
-	&SVE_REV16($d0,$d1,$d2,$d3);
+	if ($have_sve2 == 0) {
+		&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+		&SVE_REV16($d0,$d1,$d2,$d3);
+	} else {
+		&SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+	}
 
 	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
 	if ($have_sve2 == 0) {
@@ -162,8 +174,12 @@ sub SVE_QR_GROUP() {
 	}
 
 	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
-	&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
-	&SVE_ROT8($d0,$d1,$d2,$d3);
+	if ($have_sve2 == 0) {
+		&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+		&SVE_ROT8($d0,$d1,$d2,$d3);
+	} else {
+		&SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+	}
 
 	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
 	if ($have_sve2 == 0) {
@@ -178,26 +194,31 @@ sub SVE_QR_GROUP() {
 
 sub SVE_INNER_BLOCK() {
 $code.=<<___;
-	//cbnz	$sve2flag, 10f
+	mov	$counter,#10
+1:
+.align	5
 ___
 	&SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
 	&SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
 $code.=<<___;
-	// SVE 2 not enabled until hardware available
-#if 0
-	b 11f
-10:
-___
-#	&SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
-#	&SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
-$code.=<<___;
-11:
-#endif
+	subs	$counter,$counter,1
+	b.ne	1b
 ___
 }
 
-{{{
-my ($dlen,$rsize,$tmp) = ("x10","x11","x12");
+sub SVE2_INNER_BLOCK() {
+$code.=<<___;
+	mov	$counter,#10
+1:
+.align	5
+___
+	&SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
+	&SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+$code.=<<___;
+	subs	$counter,$counter,1
+	b.ne	1b
+___
+}
 
 sub load() {
 	my $x0 = shift;
@@ -252,72 +273,75 @@ sub transpose() {
 	my $xd = shift;
 $code.=<<___;
-	zip1	$xt8.s,$xa.s,$xb.s
-	zip2	$xt9.s,$xa.s,$xb.s
-	zip1	$xt10.s,$xc.s,$xd.s
-	zip2	$xt11.s,$xc.s,$xd.s
-	zip1	$xa.d,$xt8.d,$xt10.d
-	zip2	$xb.d,$xt8.d,$xt10.d
-	zip1	$xc.d,$xt9.d,$xt11.d
-	zip2	$xd.d,$xt9.d,$xt11.d
+	zip1	$xt0.s,$xa.s,$xb.s
+	zip2	$xt1.s,$xa.s,$xb.s
+	zip1	$xt2.s,$xc.s,$xd.s
+	zip2	$xt3.s,$xc.s,$xd.s
+	zip1	$xa.d,$xt0.d,$xt2.d
+	zip2	$xb.d,$xt0.d,$xt2.d
+	zip1	$xc.d,$xt1.d,$xt3.d
+	zip2	$xd.d,$xt1.d,$xt3.d
 ___
 }
 
-sub add_states() {
-	my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
-
+sub SVE_ADD_STATES() {
 $code.=<<___;
-	ldp	$tmpw0,$tmpw1,[$state]
-	ldp	$tmpw2,$tmpw3,[$state,#8]
-	dup	$xt0.s,$tmpw0
+	lsr	$tmp1,@K[5],#32
+	dup	$xt0.s,@KL[5]
 	dup	$xt1.s,$tmpw1
-	dup	$xt2.s,$tmpw2
-	dup	$xt3.s,$tmpw3
-	ldp	$tmpw0,$tmpw1,[$state,#16]
-	ldp	$tmpw2,$tmpw3,[$state,#24]
-	add	@mx[0].s,@mx[0].s,$xt0.s
-	add	@mx[1].s,@mx[1].s,$xt1.s
-	add	@mx[2].s,@mx[2].s,$xt2.s
-	add	@mx[3].s,@mx[3].s,$xt3.s
+	add	@mx[0].s,@mx[0].s,$bak0.s
+	add	@mx[1].s,@mx[1].s,$bak1.s
+	add	@mx[2].s,@mx[2].s,$bak2.s
+	add	@mx[3].s,@mx[3].s,$bak3.s
+	add	@mx[4].s,@mx[4].s,$bak4.s
+	add	@mx[5].s,@mx[5].s,$bak5.s
+	add	@mx[6].s,@mx[6].s,$bak6.s
+	add	@mx[7].s,@mx[7].s,$bak7.s
+	add	@mx[8].s,@mx[8].s,$bak8.s
+	add	@mx[9].s,@mx[9].s,$bak9.s
+	lsr	$tmp0,@K[6],#32
 	dup	$xt4.s,$tmpw0
-	dup	$xt5.s,$tmpw1
-	dup	$xt6.s,$tmpw2
-	dup	$xt7.s,$tmpw3
-	ldp	$tmpw0,$tmpw1,[$state,#32]
-	ldp	$tmpw2,$tmpw3,[$state,#40]
-	add	@mx[4].s,@mx[4].s,$xt4.s
-	add	@mx[5].s,@mx[5].s,$xt5.s
-	add	@mx[6].s,@mx[6].s,$xt6.s
-	add	@mx[7].s,@mx[7].s,$xt7.s
-	dup	$xt0.s,$tmpw0
-	dup	$xt1.s,$tmpw1
-	dup	$xt2.s,$tmpw2
-	dup	$xt3.s,$tmpw3
-	ldp	$tmpw0,$tmpw1,[$state,#48]
-	ldp	$tmpw2,$tmpw3,[$state,#56]
-	add	@mx[8].s,@mx[8].s,$xt0.s
-	add	@mx[9].s,@mx[9].s,$xt1.s
-	add	@mx[10].s,@mx[10].s,$xt2.s
-	add	@mx[11].s,@mx[11].s,$xt3.s
-	dup	$xt5.s,$tmpw1
-	dup	$xt6.s,$tmpw2
-	dup	$xt7.s,$tmpw3
+	lsr	$tmp1,@K[7],#32
+	dup	$xt5.s,@KL[7]
+	dup	$xt6.s,$tmpw1
+	add	@mx[10].s,@mx[10].s,$xt0.s
+	add	@mx[11].s,@mx[11].s,$xt1.s
 	add	@mx[12].s,@mx[12].s,$zctr.s
-	add	@mx[13].s,@mx[13].s,$xt5.s
-	add	@mx[14].s,@mx[14].s,$xt6.s
-	add	@mx[15].s,@mx[15].s,$xt7.s
+	add	@mx[13].s,@mx[13].s,$xt4.s
+	add	@mx[14].s,@mx[14].s,$xt5.s
+	add	@mx[15].s,@mx[15].s,$xt6.s
+___
+}
+
+sub SVE2_ADD_STATES() {
+$code.=<<___;
+	add	@mx[0].s,@mx[0].s,$bak0.s
+	add	@mx[1].s,@mx[1].s,$bak1.s
+	add	@mx[2].s,@mx[2].s,$bak2.s
+	add	@mx[3].s,@mx[3].s,$bak3.s
+	add	@mx[4].s,@mx[4].s,$bak4.s
+	add	@mx[5].s,@mx[5].s,$bak5.s
+	add	@mx[6].s,@mx[6].s,$bak6.s
+	add	@mx[7].s,@mx[7].s,$bak7.s
+	add	@mx[8].s,@mx[8].s,$bak8.s
+	add	@mx[9].s,@mx[9].s,$bak9.s
+	add	@mx[10].s,@mx[10].s,$bak10.s
+	add	@mx[11].s,@mx[11].s,$bak11.s
+	add	@mx[12].s,@mx[12].s,$zctr.s
+	add	@mx[13].s,@mx[13].s,$bak13.s
+	add	@mx[14].s,@mx[14].s,$bak14.s
+	add	@mx[15].s,@mx[15].s,$bak15.s
 ___
 }
 
 sub SVE_TRANSFORMS() {
-	&add_states();
 	&transpose($xa0,$xb0,$xc0,$xd0);
 	&transpose($xa1,$xb1,$xc1,$xd1);
 	&transpose($xa2,$xb2,$xc2,$xd2);
 	&transpose($xa3,$xb3,$xc3,$xd3);
-	&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
 	&transpose($xa0,$xa1,$xa2,$xa3);
 	&transpose($xb0,$xb1,$xb2,$xb3);
+	&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
 $code.=<<___;
 	eor	$xa0.d,$xa0.d,$xt0.d
 	eor	$xa1.d,$xa1.d,$xt1.d
@@ -330,8 +354,8 @@
 ___
 	&transpose($xc0,$xc1,$xc2,$xc3);
 	&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
-	&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
 	&transpose($xd0,$xd1,$xd2,$xd3);
+	&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
 $code.=<<___;
 	eor	$xc0.d,$xc0.d,$xt0.d
 	eor	$xc1.d,$xc1.d,$xt1.d
@@ -348,73 +372,111 @@ $code.=<<___;
 	incw	$zctr.s, ALL, MUL #1
 ___
 }
-}}}
 
 sub SVE_LOAD_STATES() {
-	my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
("w10","w11","w12","w13"); - $code.=<<___; - // FIXME following code are not functionally necessary - // but appear to enhance performance -#if 1 - ptrues p2.s,ALL - ptrues p2.s,ALL - ptrues p2.s,ALL - ptrues p2.s,ALL - ptrues p2.s,ALL - ptrues p2.s,ALL -#endif + lsr $tmp0,@K[0],#32 + dup @mx[0].s,@KL[0] + dup $bak0.s,@KL[0] + dup @mx[1].s,$tmpw0 + dup $bak1.s,$tmpw0 + lsr $tmp1,@K[1],#32 + dup @mx[2].s,@KL[1] + dup $bak2.s,@KL[1] + dup @mx[3].s,$tmpw1 + dup $bak3.s,$tmpw1 + lsr $tmp0,@K[2],#32 + dup @mx[4].s,@KL[2] + dup $bak4.s,@KL[2] + dup @mx[5].s,$tmpw0 + dup $bak5.s,$tmpw0 + lsr $tmp1,@K[3],#32 + dup @mx[6].s,@KL[3] + dup $bak6.s,@KL[3] + dup @mx[7].s,$tmpw1 + dup $bak7.s,$tmpw1 + lsr $tmp0,@K[4],#32 + dup @mx[8].s,@KL[4] + dup $bak8.s,@KL[4] + dup @mx[9].s,$tmpw0 + dup $bak9.s,$tmpw0 + lsr $tmp1,@K[5],#32 + dup @mx[10].s,@KL[5] + dup @mx[11].s,$tmpw1 + orr @mx[12].d,$zctr.d,$zctr.d + lsr $tmp0,@K[6],#32 + dup @mx[13].s,$tmpw0 + lsr $tmp1,@K[7],#32 + dup @mx[14].s,@KL[7] + dup @mx[15].s,$tmpw1 ___ +} + +sub SVE2_LOAD_STATES() { $code.=<<___; - ldp $tmpw0,$tmpw1,[$state] - ldp $tmpw2,$tmpw3,[$state,#8] - dup @mx[0].s,$tmpw0 - dup @mx[1].s,$tmpw1 - dup @mx[2].s,$tmpw2 - dup @mx[3].s,$tmpw3 - ldp $tmpw0,$tmpw1,[$state,#16] - ldp $tmpw2,$tmpw3,[$state,#24] - dup @mx[4].s,$tmpw0 - dup @mx[5].s,$tmpw1 - dup @mx[6].s,$tmpw2 - dup @mx[7].s,$tmpw3 - ldp $tmpw0,$tmpw1,[$state,#32] - ldp $tmpw2,$tmpw3,[$state,#40] - dup @mx[8].s,$tmpw0 - dup @mx[9].s,$tmpw1 - dup @mx[10].s,$tmpw2 - dup @mx[11].s,$tmpw3 - ldp $tmpw0,$tmpw1,[$state, #48] - ldp $tmpw2,$tmpw3,[$state,#56] - mov @mx[12].s,p0/m,$zctr.s - dup @mx[13].s,$tmpw1 - dup @mx[14].s,$tmpw2 - dup @mx[15].s,$tmpw3 + lsr $tmp0,@K[0],#32 + dup @mx[0].s,@KL[0] + dup $bak0.s,@KL[0] + dup @mx[1].s,$tmpw0 + dup $bak1.s,$tmpw0 + lsr $tmp1,@K[1],#32 + dup @mx[2].s,@KL[1] + dup $bak2.s,@KL[1] + dup @mx[3].s,$tmpw1 + dup $bak3.s,$tmpw1 + lsr $tmp0,@K[2],#32 + dup @mx[4].s,@KL[2] + dup $bak4.s,@KL[2] + dup @mx[5].s,$tmpw0 + dup $bak5.s,$tmpw0 + lsr $tmp1,@K[3],#32 + dup @mx[6].s,@KL[3] + dup $bak6.s,@KL[3] + dup @mx[7].s,$tmpw1 + dup $bak7.s,$tmpw1 + lsr $tmp0,@K[4],#32 + dup @mx[8].s,@KL[4] + dup $bak8.s,@KL[4] + dup @mx[9].s,$tmpw0 + dup $bak9.s,$tmpw0 + lsr $tmp1,@K[5],#32 + dup @mx[10].s,@KL[5] + dup $bak10.s,@KL[5] + dup @mx[11].s,$tmpw1 + dup $bak11.s,$tmpw1 + orr @mx[12].d,$zctr.d,$zctr.d + lsr $tmp0,@K[6],#32 + dup @mx[13].s,$tmpw0 + dup $bak13.s,$tmpw0 + lsr $tmp1,@K[7],#32 + dup @mx[14].s,@KL[7] + dup $bak14.s,@KL[7] + dup @mx[15].s,$tmpw1 + dup $bak15.s,$tmpw1 ___ } sub sve_handle_blocks() { - my ($counter) = ("x10"); - - &SVE_LOAD_STATES(); $code.=<<___; - mov $counter,#10 -.align 5 -1: + cbz $sve2flag,.sve_inner ___ - - &SVE_INNER_BLOCK(); + &SVE2_LOAD_STATES(); + &SVE2_INNER_BLOCK(); + &SVE2_ADD_STATES(); $code.=<<___; - subs $counter,$counter,1 - b.ne 1b + b .fini_inner +.sve_inner: +___ + &SVE_LOAD_STATES(); + &SVE_INNER_BLOCK(); + &SVE_ADD_STATES(); +$code.=<<___; +.fini_inner: ___ &SVE_TRANSFORMS(); } sub chacha20_process() { - my ($counter) = ("x10"); - my ($tmpw) = ("w11"); - $code.=<<___; .align 5 .Loop: @@ -430,27 +492,18 @@ ___ } {{{ -my ($tmp,$tmpw) = ("x10", "w10"); -my ($tmpw0,$tmpw1) = ("w11", "w12"); -my ($ptr) = ("x13"); - $code.=<<___; #include "arm_arch.h" .arch armv8-a -#if 0 .extern OPENSSL_armcap_P .hidden OPENSSL_armcap_P -#endif .text .align 5 .Lchacha20_consts: - .word 0x61707865 - .word 0x3320646e - .word 0x79622d32 - .word 0x6b206574 +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lrot8: .word 
 
 .globl	ChaCha20_ctr32_sve
@@ -458,49 +511,55 @@
 .align	5
 ChaCha20_ctr32_sve:
 	AARCH64_VALID_CALL_TARGET
-	mov	$tmp, #64
-	whilelo	p0.s,xzr,$tmp
-	cntp	$veclen,p0,p0.s
-	// run Neon if we only have 128-bit SVE
-	// in the future, we need to check SVE2
-	cmp	$veclen,4
-	b.le	.Lreturn
+	cntw	$veclen, ALL, MUL #1
 	lsr	$blocks,$len,#6
 	cmp	$blocks,$veclen
 	b.lt	.Lreturn
-	stp	d8,d9,[sp,-48]!
-	stp	d10,d11,[sp,16]
-	stp	d12,d13,[sp,32]
-	sub	sp,sp,#64
-	adr	$tmp,.Lchacha20_consts
-	ld1	{v0.4s},[$tmp]
-	adr	$tmp,.Lrot8
-	ldp	$tmpw0,$tmpw1,[$tmp]
-	ld1	{v1.4s,v2.4s},[$key]
-	ld1	{v3.4s},[$ctr]
-	ldr	$wctr,[$ctr]
-	index	$zctr.s,$wctr,1
-	index	$rot8.s,$tmpw0,$tmpw1
-	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[sp]
-	mov	$state,sp
-#if 0
-	// SVE2 code not enabled until we have hardware
-	// for verification
 	mov	$sve2flag,0
 	adrp	$tmp,OPENSSL_armcap_P
 	ldr	$tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
 	tst	$tmpw,#ARMV8_SVE2
 	b.eq	1f
 	mov	$sve2flag,1
+	b	2f
 1:
+	cmp	$veclen,4
+	b.le	.Lreturn
+	adr	$tmp,.Lrot8
+	ldp	$tmpw0,$tmpw1,[$tmp]
+	index	$rot8.s,$tmpw0,$tmpw1
+2:
+	stp	d8,d9,[sp,-96]!
+	stp	d10,d11,[sp,16]
+	stp	d12,d13,[sp,32]
+	stp	d14,d15,[sp,48]
+	stp	x19,x20,[sp,64]
+	stp	x21,x22,[sp,80]
+	adr	$tmp,.Lchacha20_consts
+	ldp	@K[0],@K[1],[$tmp]
+	ldp	@K[2],@K[3],[$key]
+	ldp	@K[4],@K[5],[$key, 16]
+	ldp	@K[6],@K[7],[$ctr]
+	ldr	$wctr,[$ctr]
+	index	$zctr.s,$wctr,1
+	ptrues	p0.s,ALL
+#ifdef	__AARCH64EB__
+	ror	@K[2],@K[2],#32
+	ror	@K[3],@K[3],#32
+	ror	@K[4],@K[4],#32
+	ror	@K[5],@K[5],#32
+	ror	@K[6],@K[6],#32
+	ror	@K[7],@K[7],#32
 #endif
 ___
 	&chacha20_process();
 $code.=<<___;
-	add	sp,sp,#64
 	ldp	d10,d11,[sp,16]
 	ldp	d12,d13,[sp,32]
-	ldp	d8,d9,[sp],48
+	ldp	d14,d15,[sp,48]
+	ldp	x19,x20,[sp,64]
+	ldp	x21,x22,[sp,80]
+	ldp	d8,d9,[sp],96
 	str	$wctr,[$ctr]
 	and	$len,$len,#63
 	add	$len,$len,$blocks,lsl #6
@@ -514,6 +573,7 @@
 ___
 }
 
 ########################################
 {
 my %opcode_unpred = (
+	"movprfx" => 0x0420BC00,
 	"eor" => 0x04a03000,
 	"add" => 0x04200000,
 	"orr" => 0x04603000,
@@ -528,6 +588,7 @@ my %opcode_unpred = (
 	"index" => 0x04204C00,
 	"mov" => 0x05203800,
 	"dup" => 0x05203800,
+	"cntw" => 0x04A0E000,
 	"tbl" => 0x05203000);
 
 my %opcode_imm_unpred = (
@@ -564,6 +625,7 @@ my %opcode_pred = (
 	"st4w" => 0xE570E000,
 	"st1w" => 0xE500E000,
 	"ld1w" => 0xA540A000,
+	"ld1rw" => 0x8540C000,
 	"revh" => 0x05258000);
 
 my %tsize = (
@@ -740,6 +802,10 @@ sub sve_pred {
 		if ($addr =~ m/x([0-9]+)\s*/o) {
 			$xn = $1;
 		}
+
+		if ($mnemonic =~ m/ld1r[bhwd]/o) {
+			$size = 0;
+		}
 		if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
 			return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
 		} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
@@ -810,8 +876,14 @@ sub sve_other {
 		} elsif ($arg =~ m/x([0-9]+)/o) {
 			return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
 		}
+	} elsif ($mnemonic =~ /cnt[bhdw]/) {
+		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
+			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
+		}
 	} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
 		return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
+	} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
+		return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
 	}
 	sprintf "%s // fail to parse", $inst;
 }
@@ -834,9 +906,10 @@ foreach(split("\n",$code)) {
 	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
 	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
 	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
+	s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
 	s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
 	s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
-	s/\b(cntp|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
+	s/\b(movprfx|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
 	print $_,"\n";
 }
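
A note on the new SVE2 branches in SVE_QR_GROUP(): SVE2's xar instruction ("exclusive OR and rotate right") fuses each quarter-round's eor + rotate pair into one instruction, replacing the rev16/tbl rotate tricks of the plain-SVE path; a rotate-left by n is a rotate-right by 32-n on 32-bit lanes, so the conversion to the right-rotate immediate is presumably done inside the existing &SVE2_XAR helper (the patch passes the left-rotate amounts 16 and 8). The 12- and 7-bit rotate steps already branch on $have_sve2 in the surrounding context lines. A scalar Perl model of one quarter-round, written for this review only (rotl32/quarter_round are ad-hoc names, not helpers from this script):

	# One ChaCha quarter-round on 32-bit words; each "xor, then
	# rotate-left" pair is what a single xar performs per lane
	# (expressed in hardware as a rotate-right by 32-n).
	sub rotl32 {
		my ($v, $n) = @_;
		return (($v << $n) | ($v >> (32 - $n))) & 0xffffffff;
	}

	sub quarter_round {
		my ($a, $b, $c, $d) = @_;
		$a = ($a + $b) & 0xffffffff; $d = rotl32($d ^ $a, 16); # xar #16
		$c = ($c + $d) & 0xffffffff; $b = rotl32($b ^ $c, 12); # xar #20
		$a = ($a + $b) & 0xffffffff; $d = rotl32($d ^ $a, 8);  # xar #24
		$c = ($c + $d) & 0xffffffff; $b = rotl32($b ^ $c, 7);  # xar #25
		return ($a, $b, $c, $d);
	}

The two steps the patch converts (16 and 8) are exactly the ones the plain-SVE path handles with SVE_REV16 and the .Lrot8 tbl table, which is also why the non-SVE2 branch of the prologue is the only one that still loads $rot8.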
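On the counter handling: index $zctr.s,$wctr,1 seeds lane i of z16 with ctr+i, row 12 of the working state is copied from it (orr @mx[12].d,$zctr.d,$zctr.d) and added back from it in the *_ADD_STATES subs, and incw $zctr.s, ALL, MUL #1 at the end of SVE_TRANSFORMS advances every lane by one vector's worth of blocks. A lane-level sketch, with the vector width fixed at 8 lanes purely for illustration (the real code takes it from cntw):

	# Per-lane block counters for one batch (assumes 256-bit SVE,
	# i.e. 8 x 32-bit lanes).
	my $lanes = 8;
	my $wctr  = 0;                                    # counter word from [$ctr]
	my @zctr  = map { $wctr + $_ } 0 .. $lanes - 1;   # index z16.s,$wctr,1
	# ... $lanes blocks are generated with these counters ...
	@zctr = map { ($_ + $lanes) & 0xffffffff } @zctr; # incw z16.s,ALL,MUL #1
	$wctr += $lanes;                                  # scalar copy kept in step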
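The hand-encoder additions can be spot-checked arithmetically. The new cnt[bhdw] path in sve_other() assembles opcode | Rd | pattern<<5 | (mul-1)<<16; assuming the script's existing %pattern table maps ALL to 31, the function's opening instruction should encode as below (a sanity-check sketch, not code from the patch):

	# Spot-check of the new cntw entry.
	my $base    = 0x04A0E000;  # %opcode_unpred{"cntw"}
	my $rd      = 5;           # x5, i.e. $veclen
	my $pattern = 31;          # ALL
	my $mul     = 1;
	my $inst = $base | $rd | ($pattern << 5) | (($mul - 1) << 16);
	printf "cntw x5, ALL, MUL #1 -> .inst 0x%08x\n", $inst; # 0x04a0e3e5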