#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
# whirlpool_block for x86_64.
# 2500 cycles per 64-byte input block on AMD64, which is *identical*
# to 32-bit MMX version executed on same CPU. So why did I bother?
# Well, it's faster than gcc 3.3.2 generated code by over 50%, and
# over 80% faster than PathScale 1.4, an "ambitious" commercial
# compiler. Furthermore it surpasses gcc 3.4.3 by 170% and Sun Studio
# 10 - by 360%[!]... What is it with x86_64 compilers? It's not the
# first example when they fail to generate more optimal code, when
# I believe they had *all* chances to...
# Note that register and stack frame layout are virtually identical
# to 32-bit MMX version, except that %r8-15 are used instead of
# %mm0-7. You can even notice that K[i] and S[i] are loaded to
# %eax:%ebx as pair of 32-bit values and not as single 64-bit one.
# This is done in order to avoid 64-bit shift penalties on Intel
# EM64T core. Speaking of which! I bet it's possible to improve
# Opteron performance by compressing the table to 2KB and replacing
# unaligned references with complementary rotations [which would
# incidentally replace lea instructions], but it would definitely
# just "kill" EM64T, because it has only 1 shifter/rotator [against
# 3 on Opteron] and which is *unacceptably* slow with 64-bit
# operand.
# Command line: [flavour] [output]. A lone argument containing a dot is
# taken to be the output file name, leaving the flavour undefined.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets are recognised by flavour or by an .asm output file name;
# $win64 later gates emission of the SEH handler.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of this script's path (including the trailing separator).
# Take $1 only when the match succeeded, so a stale capture can never leak in.
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything we print through the translator script. The translator
# path is quoted so that directories containing spaces survive the shell.
# A failure to start the pipe is fatal; a translator that starts but then
# fails is only detectable when the pipe is closed.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
# The generated text is printed to STDOUT, so STDOUT must be the pipe;
# repeating this alias is harmless if it is also done elsewhere.
*STDOUT=*OUT;
# Append one ".byte" directive to the global $code buffer, listing the
# arguments separated by commas. Numeric arguments are stringified by Perl,
# so hex literals appear in decimal in the output.
# NOTE(review): the empty prototype is retained for interface stability;
# the visible caller uses the & sigil, which skips prototype checking.
sub L() {
    my $bytes = join(',', @_);
    $code .= ".byte $bytes\n";
}
# Append one ".byte" directive to the global $code buffer that lists the
# arguments twice in a row (the same comma-separated sequence, repeated).
# NOTE(review): the empty prototype is retained for interface stability;
# it only works for callers that use the & sigil.
sub LL(){
    my $half = join(',', @_);
    $code .= ".byte $half,$half\n";
}
# Entry code of the block function: save the callee-saved registers, carve
# out a 64-byte-aligned scratch frame and stash the three arguments in it.
.globl $func
.type $func,\@function,3
.align 16
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
# Remember the stack pointer as it is right after the six pushes; it is
# stored in the frame below and used later to find those registers again.
mov %rsp,%r11
sub \$128+40,%rsp
and \$-64,%rsp
# Frame layout: 0..63(%rsp) is the K area and 64..127(%rsp) the S area
# (filled by the loops below); the bookkeeping slots start at 128(%rsp).
lea 128(%rsp),%r10
mov %rdi,0(%r10) # save parameter block
mov %rsi,8(%r10)
mov %rdx,16(%r10)
mov %r11,32(%r10) # saved stack pointer
# rbx takes over as the pointer to the bookkeeping slots, rbp as table base.
mov %r10,%rbx
lea $table(%rip),%rbp
xor %rcx,%rcx
xor %rdx,%rdx
# Load the eight 64-bit state words, copy them to K, mix in the input block
# and copy the result to S. The register list used here is defined outside
# this view.
for($i=0;$i<8;$i++) { $code.="mov $i*8(%rdi),@mm[$i]\n"; } # L=H
for($i=0;$i<8;$i++) { $code.="mov @mm[$i],$i*8(%rsp)\n"; } # K=L
for($i=0;$i<8;$i++) { $code.="xor $i*8(%rsi),@mm[$i]\n"; } # L^=inp
for($i=0;$i<8;$i++) { $code.="mov @mm[$i],64+$i*8(%rsp)\n"; } # S=L
# rsi doubles as the round index: start at zero and record it in the frame.
xor %rsi,%rsi
mov %rsi,24(%rbx) # zero round counter
jmp .Lround
.align 16
# One round (presumably the .Lround target; the label is not visible in this
# view). rsi holds the round index and selects the round constant, which is
# read at offset 4096 from the table base.
mov 4096(%rbp,%rsi,8),@mm[0] # rc[r]
# Fetch the first K word as two 32-bit halves and split off its two low bytes.
mov 0(%rsp),%eax
mov 4(%rsp),%ebx
movz %al,%ecx
movz %ah,%edx
# First unrolled pass, 8 iterations over the K area at 0(%rsp). Every byte
# of the current word selects a 16-byte table entry (the lea doubles the
# index and the addressing mode scales it by 8), which is read at byte
# offsets 0,7,6,5,4,3,2,1. In the first iteration the destinations mm[1]
# through mm[7] are written with mov instead of xor, so they are initialised
# rather than accumulated; mm[0] always uses xor because it already holds
# the round constant. The load of the next word in the last iteration uses
# offset 64, which is the first word of the S area.
for($i=0;$i<8;$i++) {
my $func = ($i==0)? "mov" : "xor";
shr \$16,%eax
lea (%rcx,%rcx),%rsi
movz %al,%ecx
lea (%rdx,%rdx),%rdi
movz %ah,%edx
xor 0(%rbp,%rsi,8),@mm[0]
$func 7(%rbp,%rdi,8),@mm[1]
mov $i*8+8(%rsp),%eax # ($i+1)*8
lea (%rcx,%rcx),%rsi
movz %bl,%ecx
lea (%rdx,%rdx),%rdi
movz %bh,%edx
$func 6(%rbp,%rsi,8),@mm[2]
$func 5(%rbp,%rdi,8),@mm[3]
shr \$16,%ebx
lea (%rcx,%rcx),%rsi
movz %bl,%ecx
lea (%rdx,%rdx),%rdi
movz %bh,%edx
$func 4(%rbp,%rsi,8),@mm[4]
$func 3(%rbp,%rdi,8),@mm[5]
mov $i*8+8+4(%rsp),%ebx # ($i+1)*8+4
lea (%rcx,%rcx),%rsi
movz %al,%ecx
lea (%rdx,%rdx),%rdi
movz %ah,%edx
$func 2(%rbp,%rsi,8),@mm[6]
$func 1(%rbp,%rdi,8),@mm[7]
for($i=0;$i<8;$i++) { $code.="mov @mm[$i],$i*8(%rsp)\n"; } # K=L
# Second unrolled pass: the same lookup pattern over the S area at 64(%rsp),
# always accumulating with xor. The two backtick-quoted lines are Perl
# snippets expanded by the eval substitution at the end of the script; they
# emit the next-word loads only while i<7, so the loads for i=7 (offsets 128
# and 132, past the S area) are omitted.
for($i=0;$i<8;$i++) {
shr \$16,%eax
lea (%rcx,%rcx),%rsi
movz %al,%ecx
lea (%rdx,%rdx),%rdi
movz %ah,%edx
xor 0(%rbp,%rsi,8),@mm[0]
xor 7(%rbp,%rdi,8),@mm[1]
`"mov 64+$i*8+8(%rsp),%eax" if($i<7);` # 64+($i+1)*8
lea (%rcx,%rcx),%rsi
movz %bl,%ecx
lea (%rdx,%rdx),%rdi
movz %bh,%edx
xor 6(%rbp,%rsi,8),@mm[2]
xor 5(%rbp,%rdi,8),@mm[3]
shr \$16,%ebx
lea (%rcx,%rcx),%rsi
movz %bl,%ecx
lea (%rdx,%rdx),%rdi
movz %bh,%edx
xor 4(%rbp,%rsi,8),@mm[4]
xor 3(%rbp,%rdi,8),@mm[5]
`"mov 64+$i*8+8+4(%rsp),%ebx" if($i<7);` # 64+($i+1)*8+4
lea (%rcx,%rcx),%rsi
movz %al,%ecx
lea (%rdx,%rdx),%rdi
movz %ah,%edx
xor 2(%rbp,%rsi,8),@mm[6]
xor 1(%rbp,%rdi,8),@mm[7]
# Round bookkeeping: ebx was used as scratch above, so rbx is pointed back at
# the slots that start at 128(%rsp); the counter is incremented and the loop
# is left once it reaches 10.
lea 128(%rsp),%rbx
mov 24(%rbx),%rsi # pull round counter
add \$1,%rsi
cmp \$10,%rsi
je .Lroundsdone
mov %rsi,24(%rbx) # update round counter
for($i=0;$i<8;$i++) { $code.="mov @mm[$i],64+$i*8(%rsp)\n"; } # S=L
jmp .Lround
.align 16
# After the last round (presumably the .Lroundsdone target; the label is not
# visible in this view): reload the saved arguments, fold the input block and
# the previous state into the result and store it back as the new state.
mov 0(%rbx),%rdi # reload argument block
mov 8(%rbx),%rsi
mov 16(%rbx),%rax
for($i=0;$i<8;$i++) { $code.="xor $i*8(%rsi),@mm[$i]\n"; } # L^=inp
for($i=0;$i<8;$i++) { $code.="xor $i*8(%rdi),@mm[$i]\n"; } # L^=H
for($i=0;$i<8;$i++) { $code.="mov @mm[$i],$i*8(%rdi)\n"; } # H=L
# Advance to the next 64-byte block; stop when the block count reaches zero,
# otherwise record the new pointer and count and start over.
lea 64(%rsi),%rsi # inp+=64
sub \$1,%rax # num--
jz .Lalldone
mov %rsi,8(%rbx) # update parameter block
mov %rax,16(%rbx)
jmp .Louterloop
# Epilogue: fetch the stack pointer recorded on entry, reload the six pushed
# registers from it (reverse of the push order) and release the frame by
# stepping the stack pointer over those 48 bytes.
mov 32(%rbx),%rsi # restore saved pointer
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.size $func,.-$func
# Lookup table, declared as a 64-byte-aligned data object. The round code
# addresses it through rbp and reads the round constants at offset 4096.
.align 64
.type $table,\@object
&L(0x18,0x23,0xc6,0xe8,0x87,0xb8,0x01,0x4f); # rc[ROUNDS]
# Win64 only: emit an exception handler for the block function together with
# the .pdata and .xdata records that reference it.
if ($win64) {
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
sub \$64,%rsp
# Use the faulting Rip to decide whether the frame of the block function is
# fully set up: before .Lprologue or at/after .Lepilogue the register
# recovery below is skipped.
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
lea .Lepilogue(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
# Inside the body: the entry-time stack pointer sits in the slot at 128+32
# from the aligned frame base, matching the layout built by the function
# entry code. Adding 48 steps over the six pushed registers, which are then
# read back at negative offsets and written into the context record.
mov 128+32(%rax),%rax # pull saved stack pointer
lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
# Common tail (presumably the .Lin_prologue target; the label is not visible
# in this view): pick up rdi and rsi from 8 and 16 bytes above the recovered
# stack pointer and write Rsp, Rsi and Rdi into the context record.
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
# Copy the updated context into the dispatcher's context record, 154
# quadwords in total.
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
# Call RtlVirtualUnwind: the first four arguments go in rcx, rdx, r8 and r9,
# the remaining four are stored on the stack from 32(%rsp) upward.
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
# Report ExceptionContinueSearch and undo the handler's own frame.
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
.size se_handler,.-se_handler
# Function table entry: begin, end and unwind-info addresses of the block
# function.
.section .pdata
.align 4
.rva .LSEH_begin_$func
.rva .LSEH_end_$func
.rva .LSEH_info_$func
# Unwind-info record that references se_handler.
.section .xdata
.align 8
.byte 9,0,0,0
.rva se_handler
# Expand every backtick-quoted fragment of the generated text by evaluating
# it as Perl and substituting the result; conditional instructions in the
# unrolled loops are written that way.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered write errors, and a non-zero exit of a program on the other end
# of a piped STDOUT, only become visible when the handle is closed. Treat
# either as fatal so that a truncated or untranslated output file cannot
# pass silently. For a piped handle whose program merely exited non-zero,
# $! is 0 and the status is in $?.
close STDOUT
    or die "error closing STDOUT: " . ($! ? $! : "exit status $?");