mirror of
https://github.com/openssl/openssl.git
synced 2025-03-31 20:10:45 +08:00
sha/asm/keccak1600-ppc64.pl: Load data in 8 byte chunks on little endian
We currently load data byte by byte so that it can be byte-swapped on big-endian systems. On little-endian systems we can simply use 8-byte loads instead. A SHAKE128 benchmark runs 10% faster on POWER9 with this patch applied.
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8455)
This commit is contained in:
parent
90c311315c
commit
8f9842fd03
@ -51,6 +51,16 @@ if ($flavour =~ /64/) {
|
||||
$PUSH ="std";
|
||||
} else { die "nonsense $flavour"; }
|
||||
|
||||
# Endianness switch: 1 when $flavour ends in "le", 0 otherwise.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;
|
||||
|
||||
# Select how SHA3_absorb fetches each input dword ("*inp++"):
#   $DWORD_LE_LOAD - instruction text spliced into the generated code at
#                    every load site; the loaded value is then XORed from r0.
#   $LE_LOAD_SIZE  - amount SHA3_absorb subtracts from r4 up front
#                    ("prepare for ldu or lbzu").
if ($LITTLE_ENDIAN) {
|
||||
# Little endian: one 8-byte load-with-update straight into r0.
$DWORD_LE_LOAD = "ldu r0,8(r3)";
|
||||
$LE_LOAD_SIZE = "8";
|
||||
} else {
|
||||
# Otherwise call the dword_le_load helper, which is only emitted when
# !$LITTLE_ENDIAN; per the commit description it loads byte by byte in
# order to byteswap the data.
$DWORD_LE_LOAD = "bl dword_le_load";
|
||||
$LE_LOAD_SIZE = "1";
|
||||
}
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
|
||||
@ -384,7 +394,9 @@ KeccakF1600:
|
||||
.byte 0,12,4,1,0x80,18,1,0
|
||||
.long 0
|
||||
.size KeccakF1600,.-KeccakF1600
|
||||
|
||||
___
|
||||
if (!$LITTLE_ENDIAN) {
|
||||
$code.=<<___;
|
||||
.type dword_le_load,\@function
|
||||
.align 5
|
||||
dword_le_load:
|
||||
@ -408,7 +420,10 @@ dword_le_load:
|
||||
.byte 0,12,0x14,0,0,0,1,0
|
||||
.long 0
|
||||
.size dword_le_load,.-dword_le_load
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
.globl SHA3_absorb
|
||||
.type SHA3_absorb,\@function
|
||||
.align 5
|
||||
@ -436,7 +451,7 @@ SHA3_absorb:
|
||||
$PUSH r0,`$FRAME+$LRSAVE`($sp)
|
||||
|
||||
bl PICmeup
|
||||
subi r4,r4,1 ; prepare for lbzu
|
||||
subi r4,r4,$LE_LOAD_SIZE ; prepare for ldu or lbzu
|
||||
subi r12,r12,8 ; prepare for ldu
|
||||
|
||||
$PUSH r3,`$LOCALS+0*$SIZE_T`($sp) ; save A[][]
|
||||
@ -487,79 +502,79 @@ SHA3_absorb:
|
||||
srwi r5,r5,3
|
||||
$PUSH r4,`$LOCALS+2*$SIZE_T`($sp) ; save len
|
||||
mtctr r5
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[0][0],$A[0][0],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[0][1],$A[0][1],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[0][2],$A[0][2],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[0][3],$A[0][3],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[0][4],$A[0][4],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[1][0],$A[1][0],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[1][1],$A[1][1],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[1][2],$A[1][2],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[1][3],$A[1][3],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[1][4],$A[1][4],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[2][0],$A[2][0],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[2][1],$A[2][1],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[2][2],$A[2][2],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[2][3],$A[2][3],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[2][4],$A[2][4],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[3][0],$A[3][0],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[3][1],$A[3][1],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[3][2],$A[3][2],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[3][3],$A[3][3],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[3][4],$A[3][4],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[4][0],$A[4][0],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[4][1],$A[4][1],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[4][2],$A[4][2],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[4][3],$A[4][3],r0
|
||||
bdz .Lprocess_block
|
||||
bl dword_le_load ; *inp++
|
||||
$DWORD_LE_LOAD ; *inp++
|
||||
xor $A[4][4],$A[4][4],r0
|
||||
|
||||
.Lprocess_block:
|
||||
|
Loading…
x
Reference in New Issue
Block a user