mirror of
https://github.com/openssl/openssl.git
synced 2025-01-18 13:44:20 +08:00
[aesni|sha*]-mb-x86_64.pl: add data prefetching.
This commit is contained in:
parent
3ef477c69f
commit
3847d15d6b
@ -15,8 +15,8 @@
|
|||||||
# asymptotic measured
|
# asymptotic measured
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
# Westmere 5.00/4=1.25 5.13/4=1.28
|
# Westmere 5.00/4=1.25 5.13/4=1.28
|
||||||
# Atom 15.0/4=3.75 15.7/4=3.93
|
# Atom 15.0/4=3.75 ?15.7/4=3.93
|
||||||
# Sandy Bridge 5.06/4=1.27 5.15/4=1.29
|
# Sandy Bridge 5.06/4=1.27 5.18/4=1.29
|
||||||
# Ivy Bridge 5.06/4=1.27 5.14/4=1.29
|
# Ivy Bridge 5.06/4=1.27 5.14/4=1.29
|
||||||
# Haswell 4.44/4=1.11 4.44/4=1.11
|
# Haswell 4.44/4=1.11 4.44/4=1.11
|
||||||
# Bulldozer 5.75/4=1.44 5.76/4=1.44
|
# Bulldozer 5.75/4=1.44 5.76/4=1.44
|
||||||
@ -27,8 +27,8 @@
|
|||||||
#
|
#
|
||||||
# asymptotic measured
|
# asymptotic measured
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
# Sandy Bridge 5.06/8=0.64 7.05/8=0.88(*)
|
# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
|
||||||
# Ivy Bridge 5.06/8=0.64 7.02/8=0.88(*)
|
# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
|
||||||
# Haswell 5.00/8=0.63 5.00/8=0.63
|
# Haswell 5.00/8=0.63 5.00/8=0.63
|
||||||
# Bulldozer 5.75/8=0.72 5.77/8=0.72
|
# Bulldozer 5.75/8=0.72 5.77/8=0.72
|
||||||
#
|
#
|
||||||
@ -188,7 +188,11 @@ $code.=<<___;
|
|||||||
sub $offset,$sink
|
sub $offset,$sink
|
||||||
|
|
||||||
aesenc $rndkey1,@out[0]
|
aesenc $rndkey1,@out[0]
|
||||||
|
prefetcht0 31(@inptr[0],$offset) # prefetch input
|
||||||
|
prefetcht0 31(@inptr[1],$offset)
|
||||||
aesenc $rndkey1,@out[1]
|
aesenc $rndkey1,@out[1]
|
||||||
|
prefetcht0 31(@inptr[2],$offset)
|
||||||
|
prefetcht0 31(@inptr[3],$offset)
|
||||||
aesenc $rndkey1,@out[2]
|
aesenc $rndkey1,@out[2]
|
||||||
aesenc $rndkey1,@out[3]
|
aesenc $rndkey1,@out[3]
|
||||||
movups 0x30-0x78($key),$rndkey1
|
movups 0x30-0x78($key),$rndkey1
|
||||||
@ -199,8 +203,8 @@ $code.=<<___;
|
|||||||
cmp `32+4*$i`(%rsp),$one
|
cmp `32+4*$i`(%rsp),$one
|
||||||
aesenc $rndkey,@out[0]
|
aesenc $rndkey,@out[0]
|
||||||
aesenc $rndkey,@out[1]
|
aesenc $rndkey,@out[1]
|
||||||
cmovge $sink,@inptr[$i] # cancel input
|
|
||||||
aesenc $rndkey,@out[2]
|
aesenc $rndkey,@out[2]
|
||||||
|
cmovge $sink,@inptr[$i] # cancel input
|
||||||
cmovg $sink,@outptr[$i] # sink output
|
cmovg $sink,@outptr[$i] # sink output
|
||||||
aesenc $rndkey,@out[3]
|
aesenc $rndkey,@out[3]
|
||||||
movups `0x40+16*$i-0x78`($key),$rndkey
|
movups `0x40+16*$i-0x78`($key),$rndkey
|
||||||
@ -209,7 +213,11 @@ ___
|
|||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
movdqa $counters,$mask
|
movdqa $counters,$mask
|
||||||
aesenc $rndkey0,@out[0]
|
aesenc $rndkey0,@out[0]
|
||||||
|
prefetcht0 15(@outptr[0],$offset) # prefetch output
|
||||||
|
prefetcht0 15(@outptr[1],$offset)
|
||||||
aesenc $rndkey0,@out[1]
|
aesenc $rndkey0,@out[1]
|
||||||
|
prefetcht0 15(@outptr[2],$offset)
|
||||||
|
prefetcht0 15(@outptr[3],$offset)
|
||||||
aesenc $rndkey0,@out[2]
|
aesenc $rndkey0,@out[2]
|
||||||
aesenc $rndkey0,@out[3]
|
aesenc $rndkey0,@out[3]
|
||||||
movups 0x80-0x78($key),$rndkey0
|
movups 0x80-0x78($key),$rndkey0
|
||||||
@ -260,13 +268,15 @@ $code.=<<___;
|
|||||||
aesenc $rndkey0,@out[2]
|
aesenc $rndkey0,@out[2]
|
||||||
aesenc $rndkey0,@out[3]
|
aesenc $rndkey0,@out[3]
|
||||||
movups 0xe0-0x78($key),$rndkey0
|
movups 0xe0-0x78($key),$rndkey0
|
||||||
|
jmp .Lenc4x_tail
|
||||||
|
|
||||||
|
.align 32
|
||||||
.Lenc4x_tail:
|
.Lenc4x_tail:
|
||||||
aesenc $rndkey1,@out[0]
|
aesenc $rndkey1,@out[0]
|
||||||
aesenc $rndkey1,@out[1]
|
aesenc $rndkey1,@out[1]
|
||||||
aesenc $rndkey1,@out[2]
|
aesenc $rndkey1,@out[2]
|
||||||
movdqu (@inptr[0],$offset),@inp[0]
|
|
||||||
aesenc $rndkey1,@out[3]
|
aesenc $rndkey1,@out[3]
|
||||||
|
movdqu (@inptr[0],$offset),@inp[0]
|
||||||
movdqu 0x10-0x78($key),$rndkey1
|
movdqu 0x10-0x78($key),$rndkey1
|
||||||
|
|
||||||
aesenclast $rndkey0,@out[0]
|
aesenclast $rndkey0,@out[0]
|
||||||
@ -426,7 +436,11 @@ $code.=<<___;
|
|||||||
sub $offset,$sink
|
sub $offset,$sink
|
||||||
|
|
||||||
aesdec $rndkey1,@out[0]
|
aesdec $rndkey1,@out[0]
|
||||||
|
prefetcht0 31(@inptr[0],$offset) # prefetch input
|
||||||
|
prefetcht0 31(@inptr[1],$offset)
|
||||||
aesdec $rndkey1,@out[1]
|
aesdec $rndkey1,@out[1]
|
||||||
|
prefetcht0 31(@inptr[2],$offset)
|
||||||
|
prefetcht0 31(@inptr[3],$offset)
|
||||||
aesdec $rndkey1,@out[2]
|
aesdec $rndkey1,@out[2]
|
||||||
aesdec $rndkey1,@out[3]
|
aesdec $rndkey1,@out[3]
|
||||||
movups 0x30-0x78($key),$rndkey1
|
movups 0x30-0x78($key),$rndkey1
|
||||||
@ -447,7 +461,11 @@ ___
|
|||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
movdqa $counters,$mask
|
movdqa $counters,$mask
|
||||||
aesdec $rndkey0,@out[0]
|
aesdec $rndkey0,@out[0]
|
||||||
|
prefetcht0 15(@outptr[0],$offset) # prefetch output
|
||||||
|
prefetcht0 15(@outptr[1],$offset)
|
||||||
aesdec $rndkey0,@out[1]
|
aesdec $rndkey0,@out[1]
|
||||||
|
prefetcht0 15(@outptr[2],$offset)
|
||||||
|
prefetcht0 15(@outptr[3],$offset)
|
||||||
aesdec $rndkey0,@out[2]
|
aesdec $rndkey0,@out[2]
|
||||||
aesdec $rndkey0,@out[3]
|
aesdec $rndkey0,@out[3]
|
||||||
movups 0x80-0x78($key),$rndkey0
|
movups 0x80-0x78($key),$rndkey0
|
||||||
@ -498,7 +516,9 @@ $code.=<<___;
|
|||||||
aesdec $rndkey0,@out[2]
|
aesdec $rndkey0,@out[2]
|
||||||
aesdec $rndkey0,@out[3]
|
aesdec $rndkey0,@out[3]
|
||||||
movups 0xe0-0x78($key),$rndkey0
|
movups 0xe0-0x78($key),$rndkey0
|
||||||
|
jmp .Ldec4x_tail
|
||||||
|
|
||||||
|
.align 32
|
||||||
.Ldec4x_tail:
|
.Ldec4x_tail:
|
||||||
aesdec $rndkey1,@out[0]
|
aesdec $rndkey1,@out[0]
|
||||||
aesdec $rndkey1,@out[1]
|
aesdec $rndkey1,@out[1]
|
||||||
@ -512,12 +532,12 @@ $code.=<<___;
|
|||||||
movdqu 0x20-0x78($key),$rndkey0
|
movdqu 0x20-0x78($key),$rndkey0
|
||||||
|
|
||||||
aesdeclast @inp[0],@out[0]
|
aesdeclast @inp[0],@out[0]
|
||||||
movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
|
|
||||||
aesdeclast @inp[1],@out[1]
|
aesdeclast @inp[1],@out[1]
|
||||||
|
movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
|
||||||
movdqu -16(@inptr[1],$offset),@inp[1]
|
movdqu -16(@inptr[1],$offset),@inp[1]
|
||||||
aesdeclast @inp[2],@out[2]
|
aesdeclast @inp[2],@out[2]
|
||||||
movdqu -16(@inptr[2],$offset),@inp[2]
|
|
||||||
aesdeclast @inp[3],@out[3]
|
aesdeclast @inp[3],@out[3]
|
||||||
|
movdqu -16(@inptr[2],$offset),@inp[2]
|
||||||
movdqu -16(@inptr[3],$offset),@inp[3]
|
movdqu -16(@inptr[3],$offset),@inp[3]
|
||||||
|
|
||||||
movups @out[0],-16(@outptr[0],$offset)
|
movups @out[0],-16(@outptr[0],$offset)
|
||||||
@ -682,7 +702,13 @@ $code.=<<___ if ($i);
|
|||||||
___
|
___
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
vaesenc $rndkey,@out[1],@out[1]
|
vaesenc $rndkey,@out[1],@out[1]
|
||||||
|
prefetcht0 31(@ptr[$i]) # prefetch input
|
||||||
vaesenc $rndkey,@out[2],@out[2]
|
vaesenc $rndkey,@out[2],@out[2]
|
||||||
|
___
|
||||||
|
$code.=<<___ if ($i>1);
|
||||||
|
prefetcht0 15(@ptr[$i-2]) # prefetch output
|
||||||
|
___
|
||||||
|
$code.=<<___;
|
||||||
vaesenc $rndkey,@out[3],@out[3]
|
vaesenc $rndkey,@out[3],@out[3]
|
||||||
lea (@ptr[$i],$offset),$offset
|
lea (@ptr[$i],$offset),$offset
|
||||||
cmovge %rsp,@ptr[$i] # cancel input
|
cmovge %rsp,@ptr[$i] # cancel input
|
||||||
@ -703,6 +729,8 @@ ___
|
|||||||
}
|
}
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
vmovdqu 32(%rsp),$counters
|
vmovdqu 32(%rsp),$counters
|
||||||
|
prefetcht0 15(@ptr[$i-2]) # prefetch output
|
||||||
|
prefetcht0 15(@ptr[$i-1])
|
||||||
cmp \$11,$rounds
|
cmp \$11,$rounds
|
||||||
jb .Lenc8x_tail
|
jb .Lenc8x_tail
|
||||||
|
|
||||||
@ -958,7 +986,13 @@ $code.=<<___ if ($i);
|
|||||||
___
|
___
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
vaesdec $rndkey,@out[1],@out[1]
|
vaesdec $rndkey,@out[1],@out[1]
|
||||||
|
prefetcht0 31(@ptr[$i]) # prefetch input
|
||||||
vaesdec $rndkey,@out[2],@out[2]
|
vaesdec $rndkey,@out[2],@out[2]
|
||||||
|
___
|
||||||
|
$code.=<<___ if ($i>1);
|
||||||
|
prefetcht0 15(@ptr[$i-2]) # prefetch output
|
||||||
|
___
|
||||||
|
$code.=<<___;
|
||||||
vaesdec $rndkey,@out[3],@out[3]
|
vaesdec $rndkey,@out[3],@out[3]
|
||||||
lea (@ptr[$i],$offset),$offset
|
lea (@ptr[$i],$offset),$offset
|
||||||
cmovge %rsp,@ptr[$i] # cancel input
|
cmovge %rsp,@ptr[$i] # cancel input
|
||||||
@ -979,6 +1013,8 @@ ___
|
|||||||
}
|
}
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
vmovdqu 32(%rsp),$counters
|
vmovdqu 32(%rsp),$counters
|
||||||
|
prefetcht0 15(@ptr[$i-2]) # prefetch output
|
||||||
|
prefetcht0 15(@ptr[$i-1])
|
||||||
cmp \$11,$rounds
|
cmp \$11,$rounds
|
||||||
jb .Ldec8x_tail
|
jb .Ldec8x_tail
|
||||||
|
|
||||||
|
@ -14,20 +14,21 @@
|
|||||||
#
|
#
|
||||||
# this +aesni(i) sha1 aesni-sha1 gain(iv)
|
# this +aesni(i) sha1 aesni-sha1 gain(iv)
|
||||||
# -------------------------------------------------------------------
|
# -------------------------------------------------------------------
|
||||||
# Westmere(ii) 10.4/n +1.28=3.88(n=4) 5.44 6.58 +70%
|
# Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68%
|
||||||
# Atom(ii) 18.9/n +3.93=8.66(n=4) 10.0 14.0 +62%
|
# Atom(ii) 18.9?/n +3.93=8.66(n=4) 10.0 14.0 +62%
|
||||||
# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
|
# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
|
||||||
# Ivy Bridge (8.03 +5.14=13.2)/n 4.60 5.54 +68%
|
# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68%
|
||||||
# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
|
# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
|
||||||
# Bulldozer (9.75 +5.76=15.5)/n 5.95 6.37 +64%
|
# Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64%
|
||||||
#
|
#
|
||||||
# (i) multi-block CBC encrypt with 128-bit key;
|
# (i) multi-block CBC encrypt with 128-bit key;
|
||||||
# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
|
# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
|
||||||
# because of lower AES-NI instruction throughput;
|
# because of lower AES-NI instruction throughput;
|
||||||
# (iii) "this" is for n=8, when we gather twice as much data, result
|
# (iii) "this" is for n=8, when we gather twice as much data, result
|
||||||
# for n=4 is 7.98+4.44=12.4;
|
# for n=4 is 8.00+4.44=12.4;
|
||||||
# (iv) improvement coefficients in real-life application are somewhat
|
# (iv) presented improvement coefficients are asymptotic limits and
|
||||||
# lower and range from 30% to 100% (on Haswell);
|
# in real-life application are somewhat lower, e.g. for 2KB
|
||||||
|
# fragments they range from 30% to 100% (on Haswell);
|
||||||
|
|
||||||
$flavour = shift;
|
$flavour = shift;
|
||||||
$output = shift;
|
$output = shift;
|
||||||
@ -80,6 +81,14 @@ $Tbl="%rbp";
|
|||||||
@Xi=map("%xmm$_",(10..14));
|
@Xi=map("%xmm$_",(10..14));
|
||||||
$K="%xmm15";
|
$K="%xmm15";
|
||||||
|
|
||||||
|
if (1) {
|
||||||
|
# Atom-specific optimization aiming to eliminate pshufb with high
|
||||||
|
# registers [and thus get rid of 48 cycles accumulated penalty]
|
||||||
|
@Xi=map("%xmm$_",(0..4));
|
||||||
|
($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
|
||||||
|
@V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
|
||||||
|
}
|
||||||
|
|
||||||
$REG_SZ=16;
|
$REG_SZ=16;
|
||||||
|
|
||||||
sub Xi_off {
|
sub Xi_off {
|
||||||
@ -139,8 +148,8 @@ $code.=<<___ if ($i<14); # just load input
|
|||||||
|
|
||||||
psrld \$2,$b
|
psrld \$2,$b
|
||||||
paddd $t2,$e # e+=rol(a,5)
|
paddd $t2,$e # e+=rol(a,5)
|
||||||
movd `4*$j-16*4`(@ptr[2]),$t2
|
|
||||||
pshufb $tx,@Xi[1]
|
pshufb $tx,@Xi[1]
|
||||||
|
movd `4*$j-16*4`(@ptr[2]),$t2
|
||||||
por $t1,$b # b=rol(b,30)
|
por $t1,$b # b=rol(b,30)
|
||||||
___
|
___
|
||||||
$code.=<<___ if ($i==14); # just load input
|
$code.=<<___ if ($i==14); # just load input
|
||||||
@ -152,6 +161,7 @@ $code.=<<___ if ($i==14); # just load input
|
|||||||
movdqa $b,$t1
|
movdqa $b,$t1
|
||||||
movdqa $b,$t0
|
movdqa $b,$t0
|
||||||
pslld \$5,$t2
|
pslld \$5,$t2
|
||||||
|
prefetcht0 63(@ptr[0])
|
||||||
pandn $d,$t1
|
pandn $d,$t1
|
||||||
pand $c,$t0
|
pand $c,$t0
|
||||||
punpckldq $t3,@Xi[1]
|
punpckldq $t3,@Xi[1]
|
||||||
@ -162,14 +172,17 @@ $code.=<<___ if ($i==14); # just load input
|
|||||||
psrld \$27,$t3
|
psrld \$27,$t3
|
||||||
pxor $t1,$t0 # Ch(b,c,d)
|
pxor $t1,$t0 # Ch(b,c,d)
|
||||||
movdqa $b,$t1
|
movdqa $b,$t1
|
||||||
|
prefetcht0 63(@ptr[1])
|
||||||
|
|
||||||
por $t3,$t2 # rol(a,5)
|
por $t3,$t2 # rol(a,5)
|
||||||
pslld \$30,$t1
|
pslld \$30,$t1
|
||||||
paddd $t0,$e # e+=Ch(b,c,d)
|
paddd $t0,$e # e+=Ch(b,c,d)
|
||||||
|
prefetcht0 63(@ptr[2])
|
||||||
|
|
||||||
psrld \$2,$b
|
psrld \$2,$b
|
||||||
paddd $t2,$e # e+=rol(a,5)
|
paddd $t2,$e # e+=rol(a,5)
|
||||||
pshufb $tx,@Xi[1]
|
pshufb $tx,@Xi[1]
|
||||||
|
prefetcht0 63(@ptr[3])
|
||||||
por $t1,$b # b=rol(b,30)
|
por $t1,$b # b=rol(b,30)
|
||||||
___
|
___
|
||||||
$code.=<<___ if ($i>=13 && $i<15);
|
$code.=<<___ if ($i>=13 && $i<15);
|
||||||
@ -382,12 +395,12 @@ $code.=<<___;
|
|||||||
movdqu 0x60($ctx),$D
|
movdqu 0x60($ctx),$D
|
||||||
movdqu 0x80($ctx),$E
|
movdqu 0x80($ctx),$E
|
||||||
movdqa 0x60($Tbl),$tx # pbswap_mask
|
movdqa 0x60($Tbl),$tx # pbswap_mask
|
||||||
|
movdqa -0x20($Tbl),$K # K_00_19
|
||||||
jmp .Loop
|
jmp .Loop
|
||||||
|
|
||||||
.align 32
|
.align 32
|
||||||
.Loop:
|
.Loop:
|
||||||
___
|
___
|
||||||
$code.=" movdqa -0x20($Tbl),$K\n"; # K_00_19
|
|
||||||
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
|
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
|
||||||
$code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39
|
$code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39
|
||||||
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||||
@ -434,6 +447,7 @@ $code.=<<___;
|
|||||||
|
|
||||||
movdqa @Xi[0],(%rbx) # save counters
|
movdqa @Xi[0],(%rbx) # save counters
|
||||||
movdqa 0x60($Tbl),$tx # pbswap_mask
|
movdqa 0x60($Tbl),$tx # pbswap_mask
|
||||||
|
movdqa -0x20($Tbl),$K # K_00_19
|
||||||
dec $num
|
dec $num
|
||||||
jnz .Loop
|
jnz .Loop
|
||||||
|
|
||||||
@ -551,6 +565,7 @@ $code.=<<___ if ($i<14);
|
|||||||
___
|
___
|
||||||
$code.=<<___ if ($i==14);
|
$code.=<<___ if ($i==14);
|
||||||
vpaddd $K,$e,$e # e+=K_00_19
|
vpaddd $K,$e,$e # e+=K_00_19
|
||||||
|
prefetcht0 63(@ptr[0])
|
||||||
vpslld \$5,$a,$t2
|
vpslld \$5,$a,$t2
|
||||||
vpandn $d,$b,$t1
|
vpandn $d,$b,$t1
|
||||||
vpand $c,$b,$t0
|
vpand $c,$b,$t0
|
||||||
@ -559,14 +574,17 @@ $code.=<<___ if ($i==14);
|
|||||||
vpaddd @Xi[0],$e,$e # e+=X[i]
|
vpaddd @Xi[0],$e,$e # e+=X[i]
|
||||||
$vpack $t3,@Xi[1],@Xi[1]
|
$vpack $t3,@Xi[1],@Xi[1]
|
||||||
vpsrld \$27,$a,$t3
|
vpsrld \$27,$a,$t3
|
||||||
|
prefetcht0 63(@ptr[1])
|
||||||
vpxor $t1,$t0,$t0 # Ch(b,c,d)
|
vpxor $t1,$t0,$t0 # Ch(b,c,d)
|
||||||
|
|
||||||
vpslld \$30,$b,$t1
|
vpslld \$30,$b,$t1
|
||||||
vpor $t3,$t2,$t2 # rol(a,5)
|
vpor $t3,$t2,$t2 # rol(a,5)
|
||||||
|
prefetcht0 63(@ptr[2])
|
||||||
vpaddd $t0,$e,$e # e+=Ch(b,c,d)
|
vpaddd $t0,$e,$e # e+=Ch(b,c,d)
|
||||||
|
|
||||||
vpsrld \$2,$b,$b
|
vpsrld \$2,$b,$b
|
||||||
vpaddd $t2,$e,$e # e+=rol(a,5)
|
vpaddd $t2,$e,$e # e+=rol(a,5)
|
||||||
|
prefetcht0 63(@ptr[3])
|
||||||
vpshufb $tx,@Xi[1],@Xi[1]
|
vpshufb $tx,@Xi[1],@Xi[1]
|
||||||
vpor $t1,$b,$b # b=rol(b,30)
|
vpor $t1,$b,$b # b=rol(b,30)
|
||||||
___
|
___
|
||||||
@ -580,6 +598,7 @@ $code.=<<___ if ($i>=15); # apply Xupdate
|
|||||||
vpaddd $K,$e,$e # e+=K_00_19
|
vpaddd $K,$e,$e # e+=K_00_19
|
||||||
vpslld \$5,$a,$t2
|
vpslld \$5,$a,$t2
|
||||||
vpandn $d,$b,$t1
|
vpandn $d,$b,$t1
|
||||||
|
`"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
|
||||||
vpand $c,$b,$t0
|
vpand $c,$b,$t0
|
||||||
|
|
||||||
vmovdqa @Xi[0],`&Xi_off($i)`
|
vmovdqa @Xi[0],`&Xi_off($i)`
|
||||||
@ -588,14 +607,17 @@ $code.=<<___ if ($i>=15); # apply Xupdate
|
|||||||
vpsrld \$27,$a,$t3
|
vpsrld \$27,$a,$t3
|
||||||
vpxor $t1,$t0,$t0 # Ch(b,c,d)
|
vpxor $t1,$t0,$t0 # Ch(b,c,d)
|
||||||
vpxor @Xi[3],@Xi[1],@Xi[1]
|
vpxor @Xi[3],@Xi[1],@Xi[1]
|
||||||
|
`"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
|
||||||
|
|
||||||
vpslld \$30,$b,$t1
|
vpslld \$30,$b,$t1
|
||||||
vpor $t3,$t2,$t2 # rol(a,5)
|
vpor $t3,$t2,$t2 # rol(a,5)
|
||||||
vpaddd $t0,$e,$e # e+=Ch(b,c,d)
|
vpaddd $t0,$e,$e # e+=Ch(b,c,d)
|
||||||
|
`"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
|
||||||
vpsrld \$31,@Xi[1],$tx
|
vpsrld \$31,@Xi[1],$tx
|
||||||
vpaddd @Xi[1],@Xi[1],@Xi[1]
|
vpaddd @Xi[1],@Xi[1],@Xi[1]
|
||||||
|
|
||||||
vpsrld \$2,$b,$b
|
vpsrld \$2,$b,$b
|
||||||
|
`"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
|
||||||
vpaddd $t2,$e,$e # e+=rol(a,5)
|
vpaddd $t2,$e,$e # e+=rol(a,5)
|
||||||
vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1]
|
vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1]
|
||||||
vpor $t1,$b,$b # b=rol(b,30)
|
vpor $t1,$b,$b # b=rol(b,30)
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
# this +aesni(i) sha256 aesni-sha256 gain(iv)
|
# this +aesni(i) sha256 aesni-sha256 gain(iv)
|
||||||
# -------------------------------------------------------------------
|
# -------------------------------------------------------------------
|
||||||
# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
|
# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
|
||||||
# Atom(ii) 39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93%
|
# Atom(ii) ?39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93%
|
||||||
# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
|
# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
|
||||||
# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
|
# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
|
||||||
# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
|
# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
|
||||||
@ -27,8 +27,9 @@
|
|||||||
# AES-NI-SHA256 stitch for these processors;
|
# AES-NI-SHA256 stitch for these processors;
|
||||||
# (iii) "this" is for n=8, when we gather twice as much data, result
|
# (iii) "this" is for n=8, when we gather twice as much data, result
|
||||||
# for n=4 is 20.3+4.44=24.7;
|
# for n=4 is 20.3+4.44=24.7;
|
||||||
# (iv) improvement coefficients in real-life application are somewhat
|
# (iv) presented improvement coefficients are asymptotic limits and
|
||||||
# lower and range from 75% to 130% (on Haswell);
|
# in real-life application are somewhat lower, e.g. for 2KB
|
||||||
|
# fragments they range from 75% to 130% (on Haswell);
|
||||||
|
|
||||||
$flavour = shift;
|
$flavour = shift;
|
||||||
$output = shift;
|
$output = shift;
|
||||||
@ -135,6 +136,7 @@ $code.=<<___;
|
|||||||
|
|
||||||
psrld \$25-11,$t2
|
psrld \$25-11,$t2
|
||||||
movdqa $e,$t1
|
movdqa $e,$t1
|
||||||
|
`"prefetcht0 63(@ptr[0])" if ($i==15)`
|
||||||
pxor $t3,$sigma
|
pxor $t3,$sigma
|
||||||
movdqa $e,$axb # borrow $axb
|
movdqa $e,$axb # borrow $axb
|
||||||
pslld \$26-21,$t3
|
pslld \$26-21,$t3
|
||||||
@ -142,6 +144,7 @@ $code.=<<___;
|
|||||||
pand $f,$axb
|
pand $f,$axb
|
||||||
pxor $t2,$sigma
|
pxor $t2,$sigma
|
||||||
|
|
||||||
|
`"prefetcht0 63(@ptr[1])" if ($i==15)`
|
||||||
movdqa $a,$t2
|
movdqa $a,$t2
|
||||||
pxor $t3,$sigma # Sigma1(e)
|
pxor $t3,$sigma # Sigma1(e)
|
||||||
movdqa $a,$t3
|
movdqa $a,$t3
|
||||||
@ -153,6 +156,7 @@ $code.=<<___;
|
|||||||
pslld \$10,$t3
|
pslld \$10,$t3
|
||||||
pxor $a,$axb # a^b, b^c in next round
|
pxor $a,$axb # a^b, b^c in next round
|
||||||
|
|
||||||
|
`"prefetcht0 63(@ptr[2])" if ($i==15)`
|
||||||
psrld \$13,$sigma
|
psrld \$13,$sigma
|
||||||
pxor $t3,$t2
|
pxor $t3,$t2
|
||||||
paddd $t1,$Xi # Xi+=Ch(e,f,g)
|
paddd $t1,$Xi # Xi+=Ch(e,f,g)
|
||||||
@ -160,6 +164,7 @@ $code.=<<___;
|
|||||||
pand $axb,$bxc
|
pand $axb,$bxc
|
||||||
pxor $sigma,$t2
|
pxor $sigma,$t2
|
||||||
|
|
||||||
|
`"prefetcht0 63(@ptr[3])" if ($i==15)`
|
||||||
psrld \$22-13,$sigma
|
psrld \$22-13,$sigma
|
||||||
pxor $t3,$t2
|
pxor $t3,$t2
|
||||||
movdqa $b,$h
|
movdqa $b,$h
|
||||||
@ -465,30 +470,38 @@ $code.=<<___;
|
|||||||
|
|
||||||
vpsrld \$25,$e,$t2
|
vpsrld \$25,$e,$t2
|
||||||
vpxor $t3,$sigma,$sigma
|
vpxor $t3,$sigma,$sigma
|
||||||
|
`"prefetcht0 63(@ptr[0])" if ($i==15)`
|
||||||
vpslld \$7,$e,$t3
|
vpslld \$7,$e,$t3
|
||||||
vpandn $g,$e,$t1
|
vpandn $g,$e,$t1
|
||||||
vpand $f,$e,$axb # borrow $axb
|
vpand $f,$e,$axb # borrow $axb
|
||||||
|
`"prefetcht0 63(@ptr[1])" if ($i==15)`
|
||||||
vpxor $t2,$sigma,$sigma
|
vpxor $t2,$sigma,$sigma
|
||||||
|
|
||||||
vpsrld \$2,$a,$h # borrow $h
|
vpsrld \$2,$a,$h # borrow $h
|
||||||
vpxor $t3,$sigma,$sigma # Sigma1(e)
|
vpxor $t3,$sigma,$sigma # Sigma1(e)
|
||||||
|
`"prefetcht0 63(@ptr[2])" if ($i==15)`
|
||||||
vpslld \$30,$a,$t2
|
vpslld \$30,$a,$t2
|
||||||
vpxor $axb,$t1,$t1 # Ch(e,f,g)
|
vpxor $axb,$t1,$t1 # Ch(e,f,g)
|
||||||
vpxor $a,$b,$axb # a^b, b^c in next round
|
vpxor $a,$b,$axb # a^b, b^c in next round
|
||||||
|
`"prefetcht0 63(@ptr[3])" if ($i==15)`
|
||||||
vpxor $t2,$h,$h
|
vpxor $t2,$h,$h
|
||||||
vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
|
vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
|
||||||
|
|
||||||
vpsrld \$13,$a,$t2
|
vpsrld \$13,$a,$t2
|
||||||
|
`"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
|
||||||
vpslld \$19,$a,$t3
|
vpslld \$19,$a,$t3
|
||||||
vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
|
vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
|
||||||
vpand $axb,$bxc,$bxc
|
vpand $axb,$bxc,$bxc
|
||||||
|
`"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
|
||||||
vpxor $t2,$h,$sigma
|
vpxor $t2,$h,$sigma
|
||||||
|
|
||||||
vpsrld \$22,$a,$t2
|
vpsrld \$22,$a,$t2
|
||||||
vpxor $t3,$sigma,$sigma
|
vpxor $t3,$sigma,$sigma
|
||||||
|
`"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
|
||||||
vpslld \$10,$a,$t3
|
vpslld \$10,$a,$t3
|
||||||
vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
|
vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
|
||||||
vpaddd $Xi,$d,$d # d+=Xi
|
vpaddd $Xi,$d,$d # d+=Xi
|
||||||
|
`"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
|
||||||
vpxor $t2,$sigma,$sigma
|
vpxor $t2,$sigma,$sigma
|
||||||
vpxor $t3,$sigma,$sigma # Sigma0(a)
|
vpxor $t3,$sigma,$sigma # Sigma0(a)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user