mirror of
https://github.com/openssl/openssl.git
synced 2025-01-18 13:44:20 +08:00
crypto/sha/asm/sha1-x86_64.pl update:
+5% on Atom Silvermont, up to +8% improvement of legacy code. Harmonize sha1-586.pl and aesni-sha1-x86_64.pl with sha1-x86_64.pl.
This commit is contained in:
parent
30ea570f0f
commit
b217ca63b1
@ -21,24 +21,24 @@
|
||||
# subroutine:
|
||||
#
|
||||
# AES-128-CBC +SHA1 stitch gain
|
||||
# Westmere 3.77[+5.5] 9.26 6.66 +39%
|
||||
# Sandy Bridge 5.05[+5.0(6.2)] 10.06(11.21) 5.98(7.01) +68%(+60%)
|
||||
# Westmere 3.77[+5.3] 9.07 6.55 +38%
|
||||
# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
|
||||
# Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
|
||||
# Haswell 4.43[+3.6(4.1)] 8.00(8.55) 4.55(5.21) +75%(+64%)
|
||||
# Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
|
||||
# Bulldozer 5.77[+6.0] 11.72 6.37 +84%
|
||||
#
|
||||
# AES-192-CBC
|
||||
# Westmere 4.51 10.00 6.91 +45%
|
||||
# Sandy Bridge 6.05 11.06(12.21) 6.11(7.18) +81%(+70%)
|
||||
# Westmere 4.51 9.81 6.80 +44%
|
||||
# Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%)
|
||||
# Ivy Bridge 6.05 10.65 6.07 +75%
|
||||
# Haswell 5.29 8.86(9.42) 5.32(5.32) +67%(+77%)
|
||||
# Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%)
|
||||
# Bulldozer 6.89 12.84 6.96 +84%
|
||||
#
|
||||
# AES-256-CBC
|
||||
# Westmere 5.25 10.74 7.24 +48%
|
||||
# Sandy Bridge 7.05 12.06(13.21) 7.12(7.63) +69%(+73%)
|
||||
# Westmere 5.25 10.55 7.21 +46%
|
||||
# Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
|
||||
# Ivy Bridge 7.05 11.65 7.12 +64%
|
||||
# Haswell 6.19 9.76(10.3) 6.21(6.25) +57%(+65%)
|
||||
# Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
|
||||
# Bulldozer 8.00 13.95 8.25 +69%
|
||||
#
|
||||
# (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for
|
||||
@ -230,11 +230,11 @@ $code.=<<___;
|
||||
movdqu 32($inp),@X[-2&7]
|
||||
movdqu 48($inp),@X[-1&7]
|
||||
pshufb @Tx[2],@X[-4&7] # byte swap
|
||||
add \$64,$inp
|
||||
pshufb @Tx[2],@X[-3&7]
|
||||
pshufb @Tx[2],@X[-2&7]
|
||||
pshufb @Tx[2],@X[-1&7]
|
||||
add \$64,$inp
|
||||
paddd @Tx[1],@X[-4&7] # add K_00_19
|
||||
pshufb @Tx[2],@X[-1&7]
|
||||
paddd @Tx[1],@X[-3&7]
|
||||
paddd @Tx[1],@X[-2&7]
|
||||
movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
|
||||
@ -297,74 +297,75 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
|
||||
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
|
||||
my ($a,$b,$c,$d,$e);
|
||||
|
||||
&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa(@X[0],@X[-3&7]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
|
||||
eval(shift(@insns));
|
||||
&movdqa (@Tx[0],@X[-1&7]);
|
||||
&punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
|
||||
&paddd (@Tx[1],@X[-1&7]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&paddd (@Tx[1],@X[-1&7]);
|
||||
&punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns));
|
||||
&psrldq (@Tx[0],4); # "X[-3]", 3 dwords
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
eval(shift(@insns)); # ror
|
||||
&pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&movdqa (@Tx[2],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
&movdqa (@Tx[0],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
|
||||
&paddd (@X[0],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&psrld (@Tx[0],31);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns));
|
||||
&movdqa (@Tx[1],@Tx[2]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&psrld (@Tx[2],30);
|
||||
&por (@X[0],@Tx[0]); # "X[0]"<<<=1
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
&por (@X[0],@Tx[0]); # "X[0]"<<<=1
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pslld (@Tx[1],2);
|
||||
&pxor (@X[0],@Tx[2]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
|
||||
&pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
|
||||
|
||||
foreach (@insns) { eval; } # remaining instructions [if any]
|
||||
|
||||
@ -375,27 +376,30 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
|
||||
sub Xupdate_ssse3_32_79()
|
||||
{ use integer;
|
||||
my $body = shift;
|
||||
my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
|
||||
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
|
||||
my ($a,$b,$c,$d,$e);
|
||||
|
||||
&pshufd (@Tx[0],@X[-2&7],0xee) if ($Xi==8); # was &movdqa (@Tx[0],@X[-1&7])
|
||||
eval(shift(@insns)); # body_20_39
|
||||
eval(shift(@insns)) if ($Xi==8);
|
||||
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
|
||||
&punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
|
||||
eval(shift(@insns)) if ($Xi==8);
|
||||
eval(shift(@insns)); # body_20_39
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)) if (@insns[1] =~ /_ror/);
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_ror/);
|
||||
&punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
|
||||
&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
|
||||
eval(shift(@insns));
|
||||
if ($Xi%5) {
|
||||
&movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
|
||||
} else { # ... or load next one
|
||||
&movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
|
||||
}
|
||||
&paddd (@Tx[1],@X[-1&7]);
|
||||
eval(shift(@insns)); # ror
|
||||
&paddd (@Tx[1],@X[-1&7]);
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
|
||||
@ -403,28 +407,30 @@ sub Xupdate_ssse3_32_79()
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_ror/);
|
||||
|
||||
&movdqa (@Tx[0],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # body_20_39
|
||||
|
||||
&pslld (@X[0],2);
|
||||
eval(shift(@insns)); # body_20_39
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&psrld (@Tx[0],30);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
eval(shift(@insns));
|
||||
|
||||
&por (@X[0],@Tx[0]); # "X[0]"<<<=2
|
||||
eval(shift(@insns)); # body_20_39
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # body_20_39
|
||||
eval(shift(@insns)) if (@insns[1] =~ /_rol/);
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_rol/);
|
||||
&pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
@ -445,11 +451,12 @@ sub Xuplast_ssse3_80()
|
||||
my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
|
||||
my ($a,$b,$c,$d,$e);
|
||||
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&paddd (@Tx[1],@X[-1&7]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
|
||||
@ -481,9 +488,12 @@ sub Xloop_ssse3()
|
||||
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&pshufb (@X[($Xi-3)&7],@Tx[2]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&paddd (@X[($Xi-4)&7],@Tx[1]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
@ -492,6 +502,8 @@ sub Xloop_ssse3()
|
||||
&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&psubd (@X[($Xi-4)&7],@Tx[1]);
|
||||
|
||||
foreach (@insns) { eval; }
|
||||
|
@ -93,8 +93,9 @@
|
||||
# Westmere 7.3 5.5/+33% -
|
||||
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73%
|
||||
# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53%
|
||||
# Haswell 6.5 4.3/+51% 4.1(**)/+58%
|
||||
# Bulldozer 11.6 6.0/+92%
|
||||
# VIA Nano 10.6 7.4/+43%
|
||||
# VIA Nano 10.6 7.5/+41%
|
||||
#
|
||||
# (*) Loop is 1056 instructions long and expected result is ~8.25.
|
||||
# It remains a mystery [to me] why ILP is limited to 1.7.
|
||||
@ -512,7 +513,7 @@ my $_ror=sub { &ror(@_) };
|
||||
&mov (@T[1],$C);
|
||||
&psubd (@X[-2&7],@X[3]);
|
||||
&xor (@T[1],$D);
|
||||
&movdqa (@X[0],@X[-3&7]);
|
||||
&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
|
||||
&and (@T[0],@T[1]);
|
||||
&jmp (&label("loop"));
|
||||
|
||||
@ -539,76 +540,77 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
|
||||
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
|
||||
my ($a,$b,$c,$d,$e);
|
||||
|
||||
eval(shift(@insns)); # ror
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
|
||||
&punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
|
||||
&movdqa (@X[2],@X[-1&7]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&paddd (@X[3],@X[-1&7]);
|
||||
&movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns));
|
||||
&psrldq (@X[2],4); # "X[-3]", 3 dwords
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
|
||||
&pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
&movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&movdqa (@X[4],@X[0]);
|
||||
&movdqa (@X[2],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
&movdqa (@X[2],@X[0]);
|
||||
eval(shift(@insns));
|
||||
|
||||
&pslldq (@X[4],12); # "X[0]"<<96, extract one dword
|
||||
&paddd (@X[0],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&psrld (@X[2],31);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
&movdqa (@X[3],@X[4]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&psrld (@X[4],30);
|
||||
&por (@X[0],@X[2]); # "X[0]"<<<=1
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
&por (@X[0],@X[2]); # "X[0]"<<<=1
|
||||
eval(shift(@insns));
|
||||
&movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pslld (@X[3],2);
|
||||
&pxor (@X[0],@X[4]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
&pxor (@X[0],@X[4]);
|
||||
&movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
|
||||
&movdqa (@X[1],@X[-2&7]) if ($Xi<7);
|
||||
&pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7])
|
||||
&pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
@ -623,10 +625,9 @@ sub Xupdate_ssse3_32_79()
|
||||
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
|
||||
my ($a,$b,$c,$d,$e);
|
||||
|
||||
&movdqa (@X[2],@X[-1&7]) if ($Xi==8);
|
||||
eval(shift(@insns)); # body_20_39
|
||||
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
|
||||
&palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
|
||||
&punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8)
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
@ -635,13 +636,14 @@ sub Xupdate_ssse3_32_79()
|
||||
&movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_rol/);
|
||||
if ($Xi%5) {
|
||||
&movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
|
||||
} else { # ... or load next one
|
||||
&movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
|
||||
}
|
||||
&paddd (@X[3],@X[-1&7]);
|
||||
eval(shift(@insns)); # ror
|
||||
&paddd (@X[3],@X[-1&7]);
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
|
||||
@ -656,6 +658,7 @@ sub Xupdate_ssse3_32_79()
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_rol/);
|
||||
|
||||
&pslld (@X[0],2);
|
||||
eval(shift(@insns)); # body_20_39
|
||||
@ -667,6 +670,8 @@ sub Xupdate_ssse3_32_79()
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)) if (@insns[1] =~ /_rol/);
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_rol/);
|
||||
|
||||
&por (@X[0],@X[2]); # "X[0]"<<<=2
|
||||
eval(shift(@insns)); # body_20_39
|
||||
@ -677,7 +682,7 @@ sub Xupdate_ssse3_32_79()
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
&movdqa (@X[3],@X[0]) if ($Xi<19);
|
||||
&pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0])
|
||||
eval(shift(@insns));
|
||||
|
||||
foreach (@insns) { eval; } # remaining instructions
|
||||
@ -691,6 +696,12 @@ sub Xuplast_ssse3_80()
|
||||
my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
|
||||
my ($a,$b,$c,$d,$e);
|
||||
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&paddd (@X[3],@X[-1&7]);
|
||||
eval(shift(@insns));
|
||||
@ -728,9 +739,16 @@ sub Xloop_ssse3()
|
||||
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&pshufb (@X[($Xi-3)&7],@X[2]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&paddd (@X[($Xi-4)&7],@X[3]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
@ -739,6 +757,8 @@ sub Xloop_ssse3()
|
||||
&movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&psubd (@X[($Xi-4)&7],@X[3]);
|
||||
|
||||
foreach (@insns) { eval; }
|
||||
@ -816,6 +836,64 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
|
||||
'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
|
||||
);
|
||||
}
|
||||
######
|
||||
# bodyx_00_19: round body for the ((c^d)&b)^d rounds written with
# rorx/andn instead of ror/rol.
#
# Returns a list of strings.  Each string is a fragment of Perl that
# emits instruction(s) when eval'ed; presumably the fragments are consumed
# one at a time by the Xupdate_*/Xloop_* schedulers, the way they consume
# the other round bodies (TODO confirm - no caller of bodyx_* is visible
# in this view).
#
# $rx is bumped on every call that hands out this body; once it has
# reached 19 the call is forwarded to bodyx_20_39 instead.
#
# The first fragment unpacks @V into ($a,$b,$c,$d,$e) and picks one of two
# rorx forms depending on $j.  The last fragment increments $j and moves
# the last element of @V and of @T to the front, so the next round sees
# the registers in shifted roles.
#
# Fragments that lack a trailing ';' are valid only because they are
# separate list elements; do not join them to another statement with '.'.
#
# NOTE(review): rorx rotates right.  Assuming @T[1] holds the previous
# round's @T[0], 'rorx (@T[0],$a,5)' followed by 'rorx ($b,@T[1],7)' adds
# up to a rotate by 12, not the 2 promised by the '$b>>>2' comments - it
# looks like 5 was meant to be 27 (i.e. a<<<5).  Confirm before relying
# on this body.
sub bodyx_00_19 () { # ((c^d)&b)^d
|
||||
# on start @T[0]=(b&c)^(~b&d), $e+=X[]+K
|
||||
return &bodyx_20_39() if ($rx==19); $rx++;
|
||||
(
|
||||
'($a,$b,$c,$d,$e)=@V;'.
|
||||
|
||||
'&rorx ($b,$b,2) if ($j==0);'. # $b>>>2
|
||||
'&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2
|
||||
'&lea ($e,&DWP(0,$e,@T[0]));',
|
||||
'&rorx (@T[0],$a,5);',
|
||||
|
||||
'&andn (@T[1],$a,$c);',
|
||||
'&and ($a,$b)',
|
||||
'&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer
|
||||
|
||||
'&xor (@T[1],$a)',
|
||||
'&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
|
||||
);
|
||||
}
|
||||
|
||||
# bodyx_20_39: round body for the b^d^c rounds written with rorx.
#
# Returns a list of strings.  Each string is a fragment of Perl that
# emits instruction(s) when eval'ed; presumably consumed one at a time by
# the Xupdate_*/Xloop_* schedulers like the other round bodies (TODO
# confirm - no caller of bodyx_* is visible in this view).
#
# $rx is bumped on every call that hands out this body; once it has
# reached 39 the call is forwarded to bodyx_40_59 instead.
#
# The first fragment unpacks @V into ($a,$b,$c,$d,$e) and adds either
# @T[0] (only when $j is 19) or $b to $e.  The three fragments guarded by
# 'if ($j<79)' prepare the following round and therefore emit nothing
# once $j has reached 79.  The last fragment increments $j and moves the
# last element of @V and of @T to the front.
#
# NOTE(review): rorx rotates right.  Assuming @T[1] holds the previous
# round's @T[0], 'rorx (@T[0],$a,5)' followed by 'rorx ($b,@T[1],7)' adds
# up to a rotate by 12, not the 2 promised by the '$b>>>2' comment - it
# looks like 5 was meant to be 27 (i.e. a<<<5).  Confirm before relying
# on this body.
sub bodyx_20_39 () { # b^d^c
|
||||
# on start $b=b^c^d
|
||||
return &bodyx_40_59() if ($rx==39); $rx++;
|
||||
(
|
||||
'($a,$b,$c,$d,$e)=@V;'.
|
||||
|
||||
'&add ($e,($j==19?@T[0]:$b))',
|
||||
'&rorx ($b,@T[1],7);', # $b>>>2
|
||||
'&rorx (@T[0],$a,5);',
|
||||
|
||||
'&xor ($a,$b) if ($j<79);',
|
||||
'&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer
|
||||
'&xor ($a,$c) if ($j<79);',
|
||||
'&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
|
||||
);
|
||||
}
|
||||
|
||||
# bodyx_40_59: round body for the ((b^c)&(c^d))^c rounds written with
# rorx.
#
# Returns a list of ten strings.  Each string is a fragment of Perl that
# emits instruction(s) when eval'ed; presumably consumed one at a time by
# the Xupdate_*/Xloop_* schedulers like the other round bodies (TODO
# confirm - no caller of bodyx_* is visible in this view).
#
# $rx is bumped on every call that hands out this body; once it has
# reached 59 the call is forwarded to bodyx_20_39 instead.
#
# The first fragment unpacks @V into ($a,$b,$c,$d,$e).  The last fragment
# emits the final xor and then increments $j and moves the last element
# of @V and of @T to the front, so the next round sees the registers in
# shifted roles.
#
# NOTE(review): rorx rotates right.  Assuming @T[1] holds the previous
# round's @T[0], 'rorx (@T[0],$a,5)' followed by 'rorx ($b,@T[1],7)' adds
# up to a rotate by 12, not the 2 promised by the '$b>>>2' comment - it
# looks like 5 was meant to be 27 (i.e. a<<<5).  Confirm before relying
# on this body.
sub bodyx_40_59 () {	# ((b^c)&(c^d))^c
	# on start $b=((b^c)&(c^d))^c
	return &bodyx_20_39() if ($rx==59); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&rorx (@T[0],$a,5)',
	'&lea ($e,&DWP(0,$e,$b))',
	'&rorx ($b,@T[1],7)',	# $b>>>2
	'&add ($d,&DWP(4*(($j+1)&15),"esp"))',	# X[]+K xfer

	'&mov (@T[1],$c)',
	'&xor ($a,$b)',	# b^c for next round
	'&xor (@T[1],$b)',	# c^d for next round

	'&and ($a,@T[1])',
	'&add ($e,@T[0])',
	# The ';' after the xor is required: this fragment is glued to the
	# bookkeeping code with '.', and without the separator the eval'ed
	# text would read '&xor ($a,$b)$j++;', which does not parse.
	'&xor ($a,$b);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
|
||||
|
||||
&set_label("loop",16);
|
||||
&Xupdate_ssse3_16_31(\&body_00_19);
|
||||
@ -855,9 +933,10 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
|
||||
&mov (&DWP(12,@T[1]),$D);
|
||||
&xor ($B,$D);
|
||||
&mov (&DWP(16,@T[1]),$E);
|
||||
&and ($B,@T[0]);
|
||||
&movdqa (@X[0],@X[-3&7]);
|
||||
&xchg ($B,@T[0]);
|
||||
&mov (@T[1],@T[0]);
|
||||
&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
|
||||
&and (@T[0],$B);
|
||||
&mov ($B,$T[1]);
|
||||
|
||||
&jmp (&label("loop"));
|
||||
|
||||
@ -1226,9 +1305,10 @@ sub Xtail_avx()
|
||||
&mov (&DWP(8,@T[1]),$C);
|
||||
&xor ($B,$D);
|
||||
&mov (&DWP(12,@T[1]),$D);
|
||||
&and ($B,@T[0]);
|
||||
&mov (&DWP(16,@T[1]),$E);
|
||||
&xchg ($B,@T[0]);
|
||||
&mov (@T[1],@T[0]);
|
||||
&and (@T[0],$B);
|
||||
&mov ($B,@T[1]);
|
||||
|
||||
&jmp (&label("loop"));
|
||||
|
||||
|
@ -62,16 +62,20 @@
|
||||
# CPU clock cycles spent to process single byte (less is better).
|
||||
#
|
||||
# x86_64 SSSE3 AVX[2]
|
||||
# P4 9.8 -
|
||||
# Opteron 6.65 -
|
||||
# Core2 6.70 6.05/+11% -
|
||||
# Westmere 7.08 5.44/+30% -
|
||||
# Sandy Bridge 7.93 6.16/+28% 4.99/+59%
|
||||
# Ivy Bridge 6.30 4.63/+36% 4.60/+37%
|
||||
# Haswell 5.98 4.12/+45% 3.57/+67%
|
||||
# Bulldozer 10.9 5.95/+82%
|
||||
# VIA Nano 10.2 7.46/+37%
|
||||
# Atom 11.0 9.61/+14%
|
||||
# P4 9.05 -
|
||||
# Opteron 6.26 -
|
||||
# Core2 6.55 6.05/+8% -
|
||||
# Westmere 6.73 5.30/+27% -
|
||||
# Sandy Bridge 7.70 6.10/+26% 4.99/+54%
|
||||
# Ivy Bridge 6.06 4.67/+30% 4.60/+32%
|
||||
# Haswell 5.45 4.15/+31% 3.57/+53%
|
||||
# Bulldozer 9.11 5.95/+53%
|
||||
# VIA Nano 9.32 7.15/+30%
|
||||
# Atom [10.5?] [9.23?]/+14%
|
||||
# Silvermont 13.1(*) 9.37/+40%
|
||||
#
|
||||
# (*) obviously suboptimal result, nothing was done about it,
|
||||
# because SSSE3 code is compiled unconditionally;
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
@ -114,7 +118,7 @@ $num="%r10";
|
||||
$t0="%eax";
|
||||
$t1="%ebx";
|
||||
$t2="%ecx";
|
||||
@xi=("%edx","%ebp");
|
||||
@xi=("%edx","%ebp","%r14d");
|
||||
$A="%esi";
|
||||
$B="%edi";
|
||||
$C="%r11d";
|
||||
@ -129,42 +133,40 @@ my $j=$i+1;
|
||||
$code.=<<___ if ($i==0);
|
||||
mov `4*$i`($inp),$xi[0]
|
||||
bswap $xi[0]
|
||||
mov $xi[0],`4*$i`(%rsp)
|
||||
___
|
||||
$code.=<<___ if ($i<15);
|
||||
mov $c,$t0
|
||||
mov `4*$j`($inp),$xi[1]
|
||||
mov $d,$t0
|
||||
mov $xi[0],`4*$i`(%rsp)
|
||||
mov $a,$t2
|
||||
xor $d,$t0
|
||||
bswap $xi[1]
|
||||
xor $c,$t0
|
||||
rol \$5,$t2
|
||||
lea 0x5a827999($xi[0],$e),$e
|
||||
and $b,$t0
|
||||
mov $xi[1],`4*$j`(%rsp)
|
||||
lea 0x5a827999($xi[0],$e),$e
|
||||
add $t2,$e
|
||||
xor $d,$t0
|
||||
rol \$30,$b
|
||||
add $t0,$e
|
||||
___
|
||||
$code.=<<___ if ($i>=15);
|
||||
mov `4*($j%16)`(%rsp),$xi[1]
|
||||
mov $c,$t0
|
||||
xor `4*($j%16)`(%rsp),$xi[1]
|
||||
mov $d,$t0
|
||||
mov $xi[0],`4*($i%16)`(%rsp)
|
||||
mov $a,$t2
|
||||
xor `4*(($j+2)%16)`(%rsp),$xi[1]
|
||||
xor $d,$t0
|
||||
xor $c,$t0
|
||||
rol \$5,$t2
|
||||
xor `4*(($j+8)%16)`(%rsp),$xi[1]
|
||||
and $b,$t0
|
||||
lea 0x5a827999($xi[0],$e),$e
|
||||
xor `4*(($j+13)%16)`(%rsp),$xi[1]
|
||||
xor $d,$t0
|
||||
rol \$1,$xi[1]
|
||||
add $t2,$e
|
||||
rol \$30,$b
|
||||
mov $xi[1],`4*($j%16)`(%rsp)
|
||||
xor $d,$t0
|
||||
add $t2,$e
|
||||
rol \$1,$xi[1]
|
||||
add $t0,$e
|
||||
___
|
||||
unshift(@xi,pop(@xi));
|
||||
push(@xi,shift(@xi));
|
||||
}
|
||||
|
||||
sub BODY_20_39 {
|
||||
@ -172,62 +174,58 @@ my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
|
||||
$code.=<<___ if ($i<79);
|
||||
mov `4*($j%16)`(%rsp),$xi[1]
|
||||
mov $c,$t0
|
||||
xor `4*($j%16)`(%rsp),$xi[1]
|
||||
mov $b,$t0
|
||||
`"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)`
|
||||
mov $a,$t2
|
||||
xor `4*(($j+2)%16)`(%rsp),$xi[1]
|
||||
xor $b,$t0
|
||||
rol \$5,$t2
|
||||
lea $K($xi[0],$e),$e
|
||||
xor `4*(($j+8)%16)`(%rsp),$xi[1]
|
||||
xor $d,$t0
|
||||
rol \$5,$t2
|
||||
xor `4*(($j+8)%16)`(%rsp),$xi[1]
|
||||
lea $K($xi[0],$e),$e
|
||||
xor $c,$t0
|
||||
add $t2,$e
|
||||
xor `4*(($j+13)%16)`(%rsp),$xi[1]
|
||||
rol \$30,$b
|
||||
add $t0,$e
|
||||
rol \$1,$xi[1]
|
||||
___
|
||||
$code.=<<___ if ($i<76);
|
||||
mov $xi[1],`4*($j%16)`(%rsp)
|
||||
___
|
||||
$code.=<<___ if ($i==79);
|
||||
mov $c,$t0
|
||||
mov $b,$t0
|
||||
mov $a,$t2
|
||||
xor $b,$t0
|
||||
xor $d,$t0
|
||||
lea $K($xi[0],$e),$e
|
||||
rol \$5,$t2
|
||||
xor $d,$t0
|
||||
xor $c,$t0
|
||||
add $t2,$e
|
||||
rol \$30,$b
|
||||
add $t0,$e
|
||||
___
|
||||
unshift(@xi,pop(@xi));
|
||||
push(@xi,shift(@xi));
|
||||
}
|
||||
|
||||
sub BODY_40_59 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
$code.=<<___;
|
||||
mov `4*($j%16)`(%rsp),$xi[1]
|
||||
mov $c,$t0
|
||||
mov $c,$t1
|
||||
xor `4*($j%16)`(%rsp),$xi[1]
|
||||
mov $d,$t0
|
||||
mov $xi[0],`4*($i%16)`(%rsp)
|
||||
mov $d,$t1
|
||||
xor `4*(($j+2)%16)`(%rsp),$xi[1]
|
||||
and $d,$t0
|
||||
and $c,$t0
|
||||
mov $a,$t2
|
||||
xor `4*(($j+8)%16)`(%rsp),$xi[1]
|
||||
xor $d,$t1
|
||||
lea 0x8f1bbcdc($xi[0],$e),$e
|
||||
xor $c,$t1
|
||||
rol \$5,$t2
|
||||
xor `4*(($j+13)%16)`(%rsp),$xi[1]
|
||||
add $t0,$e
|
||||
and $b,$t1
|
||||
rol \$1,$xi[1]
|
||||
add $t1,$e
|
||||
rol \$30,$b
|
||||
mov $xi[1],`4*($j%16)`(%rsp)
|
||||
and $b,$t1
|
||||
add $t2,$e
|
||||
rol \$30,$b
|
||||
add $t1,$e
|
||||
___
|
||||
unshift(@xi,pop(@xi));
|
||||
push(@xi,shift(@xi));
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
@ -261,17 +259,18 @@ $code.=<<___;
|
||||
|
||||
.align 16
|
||||
.Lialu:
|
||||
mov %rsp,%rax
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
mov %rsp,%r11
|
||||
push %r14
|
||||
mov %rdi,$ctx # reassigned argument
|
||||
sub \$`8+16*4`,%rsp
|
||||
mov %rsi,$inp # reassigned argument
|
||||
and \$-64,%rsp
|
||||
mov %rdx,$num # reassigned argument
|
||||
mov %r11,`16*4`(%rsp)
|
||||
mov %rax,`16*4`(%rsp)
|
||||
.Lprologue:
|
||||
|
||||
mov 0($ctx),$A
|
||||
@ -305,11 +304,12 @@ $code.=<<___;
|
||||
jnz .Lloop
|
||||
|
||||
mov `16*4`(%rsp),%rsi
|
||||
mov (%rsi),%r13
|
||||
mov 8(%rsi),%r12
|
||||
mov 16(%rsi),%rbp
|
||||
mov 24(%rsi),%rbx
|
||||
lea 32(%rsi),%rsp
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lepilogue:
|
||||
ret
|
||||
.size sha1_block_data_order,.-sha1_block_data_order
|
||||
@ -389,11 +389,11 @@ $code.=<<___;
|
||||
movdqu 32($inp),@X[-2&7]
|
||||
movdqu 48($inp),@X[-1&7]
|
||||
pshufb @X[2],@X[-4&7] # byte swap
|
||||
add \$64,$inp
|
||||
pshufb @X[2],@X[-3&7]
|
||||
pshufb @X[2],@X[-2&7]
|
||||
pshufb @X[2],@X[-1&7]
|
||||
add \$64,$inp
|
||||
paddd @Tx[1],@X[-4&7] # add K_00_19
|
||||
pshufb @X[2],@X[-1&7]
|
||||
paddd @Tx[1],@X[-3&7]
|
||||
paddd @Tx[1],@X[-2&7]
|
||||
movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
|
||||
@ -418,74 +418,75 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
|
||||
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
|
||||
my ($a,$b,$c,$d,$e);
|
||||
|
||||
&movdqa (@X[0],@X[-3&7]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
|
||||
eval(shift(@insns));
|
||||
&movdqa (@Tx[0],@X[-1&7]);
|
||||
&palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
|
||||
&paddd (@Tx[1],@X[-1&7]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&paddd (@Tx[1],@X[-1&7]);
|
||||
&punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns));
|
||||
&psrldq (@Tx[0],4); # "X[-3]", 3 dwords
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
eval(shift(@insns)); # ror
|
||||
&pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&movdqa (@Tx[2],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
&movdqa (@Tx[0],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
|
||||
&paddd (@X[0],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&psrld (@Tx[0],31);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns));
|
||||
&movdqa (@Tx[1],@Tx[2]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&psrld (@Tx[2],30);
|
||||
&por (@X[0],@Tx[0]); # "X[0]"<<<=1
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
&por (@X[0],@Tx[0]); # "X[0]"<<<=1
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pslld (@Tx[1],2);
|
||||
&pxor (@X[0],@Tx[2]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
|
||||
&pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
|
||||
|
||||
foreach (@insns) { eval; } # remaining instructions [if any]
|
||||
|
||||
@ -499,24 +500,27 @@ sub Xupdate_ssse3_32_79()
|
||||
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
|
||||
my ($a,$b,$c,$d,$e);
|
||||
|
||||
&movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
|
||||
eval(shift(@insns)); # body_20_39
|
||||
eval(shift(@insns)) if ($Xi==8);
|
||||
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
|
||||
&palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
|
||||
eval(shift(@insns)) if ($Xi==8);
|
||||
eval(shift(@insns)); # body_20_39
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)) if (@insns[1] =~ /_ror/);
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_ror/);
|
||||
&punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
|
||||
&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
|
||||
eval(shift(@insns));
|
||||
if ($Xi%5) {
|
||||
&movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
|
||||
} else { # ... or load next one
|
||||
&movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
|
||||
}
|
||||
&paddd (@Tx[1],@X[-1&7]);
|
||||
eval(shift(@insns)); # ror
|
||||
&paddd (@Tx[1],@X[-1&7]);
|
||||
eval(shift(@insns));
|
||||
|
||||
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
|
||||
@ -524,29 +528,31 @@ sub Xupdate_ssse3_32_79()
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_ror/);
|
||||
|
||||
&movdqa (@Tx[0],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # body_20_39
|
||||
|
||||
&pslld (@X[0],2);
|
||||
eval(shift(@insns)); # body_20_39
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&psrld (@Tx[0],30);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # ror
|
||||
eval(shift(@insns));
|
||||
|
||||
&por (@X[0],@Tx[0]); # "X[0]"<<<=2
|
||||
eval(shift(@insns)); # body_20_39
|
||||
eval(shift(@insns));
|
||||
&movdqa (@Tx[1],@X[0]) if ($Xi<19);
|
||||
eval(shift(@insns)); # body_20_39
|
||||
eval(shift(@insns)) if (@insns[1] =~ /_rol/);
|
||||
eval(shift(@insns)) if (@insns[0] =~ /_rol/);
|
||||
&pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns)); # rol
|
||||
eval(shift(@insns));
|
||||
@ -566,11 +572,12 @@ sub Xuplast_ssse3_80()
|
||||
my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
|
||||
my ($a,$b,$c,$d,$e);
|
||||
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&paddd (@Tx[1],@X[-1&7]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
|
||||
@ -602,10 +609,12 @@ sub Xloop_ssse3()
|
||||
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&pshufb (@X[($Xi-3)&7],@X[2]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&paddd (@X[($Xi-4)&7],@Tx[1]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
@ -614,6 +623,8 @@ sub Xloop_ssse3()
|
||||
&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&psubd (@X[($Xi-4)&7],@Tx[1]);
|
||||
|
||||
foreach (@insns) { eval; }
|
||||
@ -1680,16 +1691,17 @@ se_handler:
|
||||
jae .Lcommon_seh_tail
|
||||
|
||||
mov `16*4`(%rax),%rax # pull saved stack pointer
|
||||
lea 32(%rax),%rax
|
||||
|
||||
mov -8(%rax),%rbx
|
||||
mov -16(%rax),%rbp
|
||||
mov -24(%rax),%r12
|
||||
mov -32(%rax),%r13
|
||||
mov -40(%rax),%r14
|
||||
mov %rbx,144($context) # restore context->Rbx
|
||||
mov %rbp,160($context) # restore context->Rbp
|
||||
mov %r12,216($context) # restore context->R12
|
||||
mov %r13,224($context) # restore context->R13
|
||||
mov %r14,232($context) # restore context->R14
|
||||
|
||||
jmp .Lcommon_seh_tail
|
||||
.size se_handler,.-se_handler
|
||||
|
Loading…
Reference in New Issue
Block a user