mirror of
https://github.com/openssl/openssl.git
synced 2024-12-15 06:01:37 +08:00
8f9842fd03
We currently load data byte by byte in order to byteswap it on big endian. On little endian we can just do 8 byte loads. A SHAKE128 benchmark runs 10% faster on POWER9 with this patch applied. Reviewed-by: Paul Dale <pauli@openssl.org> Reviewed-by: Tomas Mraz <tomas@openssl.org> (Merged from https://github.com/openssl/openssl/pull/8455)
778 lines
19 KiB
Raku
Executable File
778 lines
19 KiB
Raku
Executable File
#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PPC64.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT implementation that works on
# *any* PPC64. Then PowerISA 2.07 adds 2x64-bit vector rotate, and
# it's possible to achieve performance better than below, but that is
# naturally an option only for POWER8 and successors...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# PPC970/G5	14.0/+130%
# POWER7	9.7/+110%
# POWER8	10.6/+100%
# POWER9	8.2/+66%
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
#	much better (but watch out for them generating code specific
#	to processor they execute on).
|
|
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
$flavour = $#ARGV >= 0 && $ARGV[0]      !~ m|\.|     ? shift : undef;

# Only flavours containing "64" are accepted; anything else (including a
# missing flavour) aborts the script.
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} else { die "nonsense $flavour"; }

# A flavour ending in "le" selects the little-endian variant.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;

# How SHA3_absorb fetches one 64-bit little-endian lane of input into r0:
# little-endian targets use a single update-form load, everything else
# calls the byte-by-byte dword_le_load helper.  $LE_LOAD_SIZE is the bias
# subtracted from the input pointer up front so that the first
# update-form access (displacement 8 resp. 1) hits the first input byte.
if ($LITTLE_ENDIAN) {
	$DWORD_LE_LOAD = "ldu	r0,8(r3)";
	$LE_LOAD_SIZE = "8";
} else {
	$DWORD_LE_LOAD = "bl	dword_le_load";
	$LE_LOAD_SIZE = "1";
}

# Directory this script was started from (with its trailing separator),
# or "" when $0 carries no directory part.  The explicit fallback keeps a
# stale capture from an earlier match from ever leaking into $dir.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in the shared
# perlasm directory.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.
open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
|
|
# Stack frame layout shared by KeccakF1600 and SHA3_absorb.
#   $LOCALS .. : pointer-sized slots (state pointer, inp, len, bsz,
#                iotas pointer)
#   $TEMP   .. : four 8-byte spill slots used by KeccakF1600_int
#   top        : save area for r14-r31
$FRAME=24*$SIZE_T+6*$SIZE_T+32;
$LOCALS=6*$SIZE_T;
$TEMP=$LOCALS+6*$SIZE_T;

my $sp	="r1";

# The 5x5 state lives entirely in registers: row i occupies five
# consecutive registers starting at r7, r12, r17, r22 and r27.
my @A = map { my $first = $_; [ map { "r".($first+$_) } (0..4) ] }
	    (7, 12, 17, 22, 27);
   $A[1][1] = "r6";			# r13 is reserved

# Scratch registers.
my @C = map("r$_", (0,3,4,5));

# Rotation amounts applied in the Rho step, indexed like @A.
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);
|
|
# KeccakF1600_int: the permutation proper.  It expects the whole state in
# the @A registers and the iotas pointer (biased by -8) in the
# LOCALS+4*SIZE_T stack slot; that pointer is advanced by 8 on every
# round and written back.  The four TEMP slots are used to spill
# A[0..3][4] while their registers double as extra scratch ($C[4..7]).
$code.=<<___;
.text

.type	KeccakF1600_int,\@function
.align	5
KeccakF1600_int:
	li	r0,24				; 24 rounds, counted in CTR
	mtctr	r0
	b	.Loop
.align	4
.Loop:
	xor	$C[0],$A[0][0],$A[1][0]		; Theta
	std	$A[0][4],`$TEMP+0`($sp)
	xor	$C[1],$A[0][1],$A[1][1]
	std	$A[1][4],`$TEMP+8`($sp)
	xor	$C[2],$A[0][2],$A[1][2]
	std	$A[2][4],`$TEMP+16`($sp)
	xor	$C[3],$A[0][3],$A[1][3]
	std	$A[3][4],`$TEMP+24`($sp)
___
	# A[0..3][4] are now safe on the stack, so their registers can
	# serve as four more temporaries until they are reloaded below.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
	$C[6]=$A[2][4];
	$C[7]=$A[3][4];
$code.=<<___;
	xor	$C[4],$A[0][4],$A[1][4]
	xor	$C[0],$C[0],$A[2][0]
	xor	$C[1],$C[1],$A[2][1]
	xor	$C[2],$C[2],$A[2][2]
	xor	$C[3],$C[3],$A[2][3]
	xor	$C[4],$C[4],$A[2][4]
	xor	$C[0],$C[0],$A[3][0]
	xor	$C[1],$C[1],$A[3][1]
	xor	$C[2],$C[2],$A[3][2]
	xor	$C[3],$C[3],$A[3][3]
	xor	$C[4],$C[4],$A[3][4]
	xor	$C[0],$C[0],$A[4][0]
	xor	$C[2],$C[2],$A[4][2]
	xor	$C[1],$C[1],$A[4][1]
	xor	$C[3],$C[3],$A[4][3]
	rotldi	$C[5],$C[2],1
	xor	$C[4],$C[4],$A[4][4]
	rotldi	$C[6],$C[3],1
	xor	$C[5],$C[5],$C[0]
	rotldi	$C[7],$C[4],1

	xor	$A[0][1],$A[0][1],$C[5]
	xor	$A[1][1],$A[1][1],$C[5]
	xor	$A[2][1],$A[2][1],$C[5]
	xor	$A[3][1],$A[3][1],$C[5]
	xor	$A[4][1],$A[4][1],$C[5]

	rotldi	$C[5],$C[0],1
	xor	$C[6],$C[6],$C[1]
	xor	$C[2],$C[2],$C[7]
	rotldi	$C[7],$C[1],1
	xor	$C[3],$C[3],$C[5]
	xor	$C[4],$C[4],$C[7]

	xor	$C[1], $A[0][2],$C[6]			;mr	$C[1],$A[0][2]
	xor	$A[1][2],$A[1][2],$C[6]
	xor	$A[2][2],$A[2][2],$C[6]
	xor	$A[3][2],$A[3][2],$C[6]
	xor	$A[4][2],$A[4][2],$C[6]

	xor	$A[0][0],$A[0][0],$C[4]
	xor	$A[1][0],$A[1][0],$C[4]
	xor	$A[2][0],$A[2][0],$C[4]
	xor	$A[3][0],$A[3][0],$C[4]
	xor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers go back to being A[0..3][4].
	$C[4]=undef;
	$C[5]=undef;
	$C[6]=undef;
	$C[7]=undef;
$code.=<<___;
	ld	$A[0][4],`$TEMP+0`($sp)
	xor	$C[0], $A[0][3],$C[2]			;mr	$C[0],$A[0][3]
	ld	$A[1][4],`$TEMP+8`($sp)
	xor	$A[1][3],$A[1][3],$C[2]
	ld	$A[2][4],`$TEMP+16`($sp)
	xor	$A[2][3],$A[2][3],$C[2]
	ld	$A[3][4],`$TEMP+24`($sp)
	xor	$A[3][3],$A[3][3],$C[2]
	xor	$A[4][3],$A[4][3],$C[2]

	xor	$C[2], $A[0][4],$C[3]			;mr	$C[2],$A[0][4]
	xor	$A[1][4],$A[1][4],$C[3]
	xor	$A[2][4],$A[2][4],$C[3]
	xor	$A[3][4],$A[3][4],$C[3]
	xor	$A[4][4],$A[4][4],$C[3]

	mr	$C[3],$A[0][1]			; Rho+Pi
	rotldi	$A[0][1],$A[1][1],$rhotates[1][1]
	;mr	$C[1],$A[0][2]
	rotldi	$A[0][2],$A[2][2],$rhotates[2][2]
	;mr	$C[0],$A[0][3]
	rotldi	$A[0][3],$A[3][3],$rhotates[3][3]
	;mr	$C[2],$A[0][4]
	rotldi	$A[0][4],$A[4][4],$rhotates[4][4]

	rotldi	$A[1][1],$A[1][4],$rhotates[1][4]
	rotldi	$A[2][2],$A[2][3],$rhotates[2][3]
	rotldi	$A[3][3],$A[3][2],$rhotates[3][2]
	rotldi	$A[4][4],$A[4][1],$rhotates[4][1]

	rotldi	$A[1][4],$A[4][2],$rhotates[4][2]
	rotldi	$A[2][3],$A[3][4],$rhotates[3][4]
	rotldi	$A[3][2],$A[2][1],$rhotates[2][1]
	rotldi	$A[4][1],$A[1][3],$rhotates[1][3]

	rotldi	$A[4][2],$A[2][4],$rhotates[2][4]
	rotldi	$A[3][4],$A[4][3],$rhotates[4][3]
	rotldi	$A[2][1],$A[1][2],$rhotates[1][2]
	rotldi	$A[1][3],$A[3][1],$rhotates[3][1]

	rotldi	$A[2][4],$A[4][0],$rhotates[4][0]
	rotldi	$A[4][3],$A[3][0],$rhotates[3][0]
	rotldi	$A[1][2],$A[2][0],$rhotates[2][0]
	rotldi	$A[3][1],$A[1][0],$rhotates[1][0]

	rotldi	$A[1][0],$C[0],$rhotates[0][3]
	rotldi	$A[2][0],$C[3],$rhotates[0][1]
	rotldi	$A[3][0],$C[2],$rhotates[0][4]
	rotldi	$A[4][0],$C[1],$rhotates[0][2]

	andc	$C[0],$A[0][2],$A[0][1]		; Chi+Iota
	andc	$C[1],$A[0][3],$A[0][2]
	andc	$C[2],$A[0][0],$A[0][4]
	andc	$C[3],$A[0][1],$A[0][0]
	xor	$A[0][0],$A[0][0],$C[0]
	andc	$C[0],$A[0][4],$A[0][3]
	xor	$A[0][1],$A[0][1],$C[1]
	 ld	$C[1],`$LOCALS+4*$SIZE_T`($sp)
	xor	$A[0][3],$A[0][3],$C[2]
	xor	$A[0][4],$A[0][4],$C[3]
	xor	$A[0][2],$A[0][2],$C[0]
	 ldu	$C[3],8($C[1])			; Iota[i++]

	andc	$C[0],$A[1][2],$A[1][1]
	 std	$C[1],`$LOCALS+4*$SIZE_T`($sp)
	andc	$C[1],$A[1][3],$A[1][2]
	andc	$C[2],$A[1][0],$A[1][4]
	 xor	$A[0][0],$A[0][0],$C[3]		; A[0][0] ^= Iota
	andc	$C[3],$A[1][1],$A[1][0]
	xor	$A[1][0],$A[1][0],$C[0]
	andc	$C[0],$A[1][4],$A[1][3]
	xor	$A[1][1],$A[1][1],$C[1]
	xor	$A[1][3],$A[1][3],$C[2]
	xor	$A[1][4],$A[1][4],$C[3]
	xor	$A[1][2],$A[1][2],$C[0]

	andc	$C[0],$A[2][2],$A[2][1]
	andc	$C[1],$A[2][3],$A[2][2]
	andc	$C[2],$A[2][0],$A[2][4]
	andc	$C[3],$A[2][1],$A[2][0]
	xor	$A[2][0],$A[2][0],$C[0]
	andc	$C[0],$A[2][4],$A[2][3]
	xor	$A[2][1],$A[2][1],$C[1]
	xor	$A[2][3],$A[2][3],$C[2]
	xor	$A[2][4],$A[2][4],$C[3]
	xor	$A[2][2],$A[2][2],$C[0]

	andc	$C[0],$A[3][2],$A[3][1]
	andc	$C[1],$A[3][3],$A[3][2]
	andc	$C[2],$A[3][0],$A[3][4]
	andc	$C[3],$A[3][1],$A[3][0]
	xor	$A[3][0],$A[3][0],$C[0]
	andc	$C[0],$A[3][4],$A[3][3]
	xor	$A[3][1],$A[3][1],$C[1]
	xor	$A[3][3],$A[3][3],$C[2]
	xor	$A[3][4],$A[3][4],$C[3]
	xor	$A[3][2],$A[3][2],$C[0]

	andc	$C[0],$A[4][2],$A[4][1]
	andc	$C[1],$A[4][3],$A[4][2]
	andc	$C[2],$A[4][0],$A[4][4]
	andc	$C[3],$A[4][1],$A[4][0]
	xor	$A[4][0],$A[4][0],$C[0]
	andc	$C[0],$A[4][4],$A[4][3]
	xor	$A[4][1],$A[4][1],$C[1]
	xor	$A[4][3],$A[4][3],$C[2]
	xor	$A[4][4],$A[4][4],$C[3]
	xor	$A[4][2],$A[4][2],$C[0]

	bdnz	.Loop

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int
___

{
# KeccakF1600: wrapper around KeccakF1600_int for a state held in memory.
# r3 points at the 25 64-bit lanes; they are loaded into the @A registers,
# permuted and written back.  The repetitive parts of the routine are
# generated here instead of being spelled out by hand.

# r14..r31 are saved at the top of the frame, r31 in the highest slot.
my @saved  = (14..31);
my $slot   = sub { $FRAME - $SIZE_T*(32 - $_[0]) };
my $save_regs    = join("\n", map { sprintf("\t%s\tr%d,%d(%s)",
					    $PUSH, $_, $slot->($_), $sp) } @saved);
my $restore_regs = join("\n", map { sprintf("\t%s\tr%d,%d(%s)",
					    $POP,  $_, $slot->($_), $sp) } @saved);

# Lane A[i][j] sits at byte offset 8*(5*i+j) from r3.
my @lanes  = map { @$_ } @A;
my $load_state  = join("\n", map { sprintf("\tld\t%s,%d(r3)",
					   $lanes[$_], 8*$_) } (0..$#lanes));
my $store_state = join("\n", map { sprintf("\tstd\t%s,%d(r3)",
					   $lanes[$_], 8*$_) } (0..$#lanes));

$code.=<<___;

.type	KeccakF1600,\@function
.align	5
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
$save_regs
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)
	;$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)
	;$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)
	;$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

	; load A[5][5]
$load_state

	bl	KeccakF1600_int

	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
	; return A[5][5]
$store_state

	$POP	r0,`$FRAME+$LRSAVE`($sp)
$restore_regs
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600
___
}
# dword_le_load is only emitted for non-little-endian builds, where a
# 64-bit little-endian lane has to be assembled one byte at a time.
#
# In:  r3 = address of the byte *before* the lane to read
# Out: r0 = the lane, byte at r3+1 in the least significant position
#      r3 advanced by 8 (via the final update-form load)
# Clobbers r4 and r5, so callers must not keep live values there.
if (!$LITTLE_ENDIAN) {
$code.=<<___;
.type	dword_le_load,\@function
.align	5
dword_le_load:
	lbz	r0,1(r3)
	lbz	r4,2(r3)
	lbz	r5,3(r3)
	insrdi	r0,r4,8,48
	lbz	r4,4(r3)
	insrdi	r0,r5,8,40
	lbz	r5,5(r3)
	insrdi	r0,r4,8,32
	lbz	r4,6(r3)
	insrdi	r0,r5,8,24
	lbz	r5,7(r3)
	insrdi	r0,r4,8,16
	lbzu	r4,8(r3)
	insrdi	r0,r5,8,8
	insrdi	r0,r4,8,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	dword_le_load,.-dword_le_load
___
}
|
|
{
# SHA3_absorb: r3 = state A[5][5], r4 = inp, r5 = len, r6 = bsz.
# While at least bsz bytes remain, bsz/8 little-endian lanes of input are
# XORed into the state and KeccakF1600_int is run.  The number of bytes
# left over (less than bsz) is returned in r3.
#
# The repetitive parts of the routine are generated here rather than
# written out by hand; the emitted instruction stream is unchanged.

# r14..r31 are saved at the top of the frame, r31 in the highest slot.
my @saved  = (14..31);
my $slot   = sub { $FRAME - $SIZE_T*(32 - $_[0]) };
my $save_regs    = join("\n", map { sprintf("\t%s\tr%d,%d(%s)",
					    $PUSH, $_, $slot->($_), $sp) } @saved);
my $restore_regs = join("\n", map { sprintf("\t%s\tr%d,%d(%s)",
					    $POP,  $_, $slot->($_), $sp) } @saved);

# Lane A[i][j] sits at byte offset 8*(5*i+j) from r3.
my @lanes  = map { @$_ } @A;
my $load_state  = join("\n", map { sprintf("\tld\t%s,%d(r3)",
					   $lanes[$_], 8*$_) } (0..$#lanes));
my $store_state = join("\n", map { sprintf("\tstd\t%s,%d(r3)",
					   $lanes[$_], 8*$_) } (0..$#lanes));

# One "fetch lane, XOR into state" step per lane.  CTR holds bsz/8, and a
# bdz after every step but the last leaves the sequence as soon as a full
# block has been consumed.
my @steps;
foreach my $lane (@lanes) {
	push @steps, "\tbdz\t.Lprocess_block" if (@steps);
	push @steps, "\t$DWORD_LE_LOAD",	# *inp++
		     "\txor\t$lane,$lane,r0";
}
my $absorb_block = join("\n", @steps);

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
$save_regs
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r4,r4,$LE_LOAD_SIZE	; prepare for ldu or lbzu
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)	; save A[][]
	$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)	; save inp
	$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)	; save len
	$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)	; save bsz
	mr	r0,r6				; r6 is about to become A[1][1]
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

	; load A[5][5]
$load_state

	mr	r3,r4
	mr	r4,r5
	mr	r5,r0

	b	.Loop_absorb

.align	4
.Loop_absorb:
	$UCMP	r4,r5				; len < bsz?
	blt	.Labsorbed

	sub	r4,r4,r5			; len -= bsz
	srwi	r5,r5,3
	$PUSH	r4,`$LOCALS+2*$SIZE_T`($sp)	; save len
	mtctr	r5
$absorb_block

.Lprocess_block:
	$PUSH	r3,`$LOCALS+1*$SIZE_T`($sp)	; save inp

	bl	KeccakF1600_int

	$POP	r0,`$LOCALS+4*$SIZE_T`($sp)	; pull iotas[24]
	$POP	r5,`$LOCALS+3*$SIZE_T`($sp)	; restore bsz
	$POP	r4,`$LOCALS+2*$SIZE_T`($sp)	; restore len
	$POP	r3,`$LOCALS+1*$SIZE_T`($sp)	; restore inp
	addic	r0,r0,`-8*24`			; rewind iotas
	$PUSH	r0,`$LOCALS+4*$SIZE_T`($sp)

	b	.Loop_absorb

.align	4
.Labsorbed:
	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
	; return A[5][5]
$store_state

	mr	r3,r4				; return value
	$POP	r0,`$FRAME+$LRSAVE`($sp)
$restore_regs
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
}
{
# SHA3_squeeze: r3 = state A[5][5], r4 = out, r5 = len, r6 = bsz.
# Copies len bytes of state to out, least significant byte of each lane
# first, calling KeccakF1600 each time bsz bytes have been produced and
# more output is still wanted.
#
# The four arguments are kept in r28-r31 because KeccakF1600 preserves
# r14-r31; r6 serves as the "bytes left in this block" counter and is
# reloaded from $bsz after every permutation.
my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
	$STU	$sp,`-10*$SIZE_T`($sp)
	mflr	r0
	$PUSH	r28,`6*$SIZE_T`($sp)
	$PUSH	r29,`7*$SIZE_T`($sp)
	$PUSH	r30,`8*$SIZE_T`($sp)
	$PUSH	r31,`9*$SIZE_T`($sp)
	$PUSH	r0,`10*$SIZE_T+$LRSAVE`($sp)

	mr	$A_flat,r3
	subi	r3,r3,8			; prepare for ldu
	subi	$out,r4,1		; prepare for stbu
	mr	$len,r5
	mr	$bsz,r6
	; A zero-length request must not reach the tail loop: mtctr with 0
	; followed by bdnz would wrap the counter and overrun the output.
	${UCMP}i $len,0
	beq	.Lsqueeze_done
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldu	r0,8(r3)
	${UCMP}i $len,8
	blt	.Lsqueeze_tail

	stb	r0,1($out)
	srdi	r0,r0,8
	stb	r0,2($out)
	srdi	r0,r0,8
	stb	r0,3($out)
	srdi	r0,r0,8
	stb	r0,4($out)
	srdi	r0,r0,8
	stb	r0,5($out)
	srdi	r0,r0,8
	stb	r0,6($out)
	srdi	r0,r0,8
	stb	r0,7($out)
	srdi	r0,r0,8
	stbu	r0,8($out)

	subic.	$len,$len,8
	beq	.Lsqueeze_done

	subic.	r6,r6,8
	bgt	.Loop_squeeze

	mr	r3,$A_flat
	bl	KeccakF1600
	subi	r3,$A_flat,8		; prepare for ldu
	mr	r6,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	mtctr	$len			; 1..7 bytes left
.Loop_tail:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail

.Lsqueeze_done:
	$POP	r0,`10*$SIZE_T+$LRSAVE`($sp)
	$POP	r28,`6*$SIZE_T`($sp)
	$POP	r29,`7*$SIZE_T`($sp)
	$POP	r30,`8*$SIZE_T`($sp)
	$POP	r31,`9*$SIZE_T`($sp)
	mtlr	r0
	addi	$sp,$sp,`10*$SIZE_T`
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,4,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
|
|
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
#
# PICmeup returns the address of the iotas table in r12 without any
# relocation: it reads its own address through the link register and adds
# a fixed distance.  The routine is padded with .space so that the table
# starts exactly 64 bytes after the PICmeup label; the `64-8` constant and
# the `64-9*4` padding must be kept in step with the number of
# instructions and tag words emitted before the padding.
$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___
|
|
# Replace every `expression` in the accumulated text with its value as
# computed by Perl, then hand the result to the translator pipe.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# Closing the pipe is what reports a failed write or a failed translator.
close STDOUT or die "error closing STDOUT: $!";