Optimize AES-ECB mode in OpenSSL for both aarch64 and aarch32

AES-ECB mode can be optimized by interleaving cipher operations on
several blocks and loop unrolling. Interleaving needs an ideal
unrolling factor; here we adopt the same factor as aes-cbc,
which is described as below:
    If the number of blocks is >= 5, select 5 blocks as one iteration; every
    loop, decrease the number of blocks by 5.
    If 3 <= remaining blocks < 5, select 3 blocks as one iteration; every
    loop, decrease the number of blocks by 3.
    If remaining blocks < 3, treat them as tail blocks.
The detailed implementation has a small adjustment to squeeze
code space.
With this approach, for small sizes such as 16 bytes the performance is
similar to before, but for big sizes such as 16K bytes the performance
improves a lot, even reaching 100%; for some architectures such as the A57,
the improvement even exceeds 100%. The following table lists the
encryption performance data on aarch64, taking the A72 and A57 as examples.
Performance values are in cycles per byte, presented as a
comparison of values, listed below:

A72:
                            Before optimization     After optimization  Improve
evp-aes-128-ecb@16          17.26538237             16.82663866         2.61%
evp-aes-128-ecb@64          5.50528499              5.222637557         5.41%
evp-aes-128-ecb@256         2.632700213             1.908442892         37.95%
evp-aes-128-ecb@1024        1.876102047             1.078018868         74.03%
evp-aes-128-ecb@8192        1.6550392               0.853982929         93.80%
evp-aes-128-ecb@16384       1.636871283             0.847623957         93.11%
evp-aes-192-ecb@16          17.73104961             17.09692468         3.71%
evp-aes-192-ecb@64          5.78984398              5.418545192         6.85%
evp-aes-192-ecb@256         2.872005308             2.081815274         37.96%
evp-aes-192-ecb@1024        2.083226672             1.25095642          66.53%
evp-aes-192-ecb@8192        1.831992057             0.995916251         83.95%
evp-aes-192-ecb@16384       1.821590009             0.993820525         83.29%
evp-aes-256-ecb@16          18.0606306              17.96963317         0.51%
evp-aes-256-ecb@64          6.19651997              5.762465812         7.53%
evp-aes-256-ecb@256         3.176991394             2.24642538          41.42%
evp-aes-256-ecb@1024        2.385991919             1.396018192         70.91%
evp-aes-256-ecb@8192        2.147862636             1.142222597         88.04%
evp-aes-256-ecb@16384       2.131361787             1.135944617         87.63%

A57:
                            Before optimization     After optimization  Improve
evp-aes-128-ecb@16          18.61045121             18.36456218         1.34%
evp-aes-128-ecb@64          6.438628994             5.467959461         17.75%
evp-aes-128-ecb@256         2.957452881             1.97238604          49.94%
evp-aes-128-ecb@1024        2.117096219             1.099665054         92.52%
evp-aes-128-ecb@8192        1.868385973             0.837440804         123.11%
evp-aes-128-ecb@16384       1.853078526             0.822420027         125.32%
evp-aes-192-ecb@16          19.07021756             18.50018552         3.08%
evp-aes-192-ecb@64          6.672351486             5.696088921         17.14%
evp-aes-192-ecb@256         3.260427769             2.131449916         52.97%
evp-aes-192-ecb@1024        2.410522832             1.250529718         92.76%
evp-aes-192-ecb@8192        2.17921605              0.973225504         123.92%
evp-aes-192-ecb@16384       2.162250997             0.95919871          125.42%
evp-aes-256-ecb@16          19.3008384              19.12743654         0.91%
evp-aes-256-ecb@64          6.992950658             5.92149541          18.09%
evp-aes-256-ecb@256         3.576361743             2.287619504         56.34%
evp-aes-256-ecb@1024        2.726671027             1.381267599         97.40%
evp-aes-256-ecb@8192        2.493583657             1.110959913         124.45%
evp-aes-256-ecb@16384       2.473916816             1.099967073         124.91%

Change-Id: Iccd23d972e0d52d22dc093f4c208f69c9d5a0ca7

Reviewed-by: Shane Lontis <shane.lontis@oracle.com>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/10518)
This commit is contained in:
XiaokangQian 2019-11-07 02:36:45 +00:00 committed by Richard Levitte
parent ef1e59ed83
commit 2ff16afc17
6 changed files with 855 additions and 2 deletions

View File

@ -384,6 +384,836 @@ ___
&gen_block("en");
&gen_block("de");
}}}
# Performance in cycles per byte.
# Processed with AES-ECB different key size.
# It shows the value before and after optimization as below:
# (before/after):
#
# AES-128-ECB AES-192-ECB AES-256-ECB
# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
# Optimization is implemented by loop unrolling and interleaving.
# Commonly we choose the unrolling factor of 5; if the input
# data size is smaller than 5 blocks, but not smaller than 3 blocks,
# we choose 3 as the unrolling factor.
# If the input data size dsize >= 5*16 bytes, then take 5 blocks
# as one iteration; every loop the remaining size lsize -= 5*16.
# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration;
# every loop lsize -= 3*16.
# If lsize < 3*16 bytes, treat them as the tail, and interleave the
# AES instructions for the (at most) two blocks.
# There is one special case: if the original input data size dsize
# = 16 bytes, we treat it separately to improve
# performance: one independent code block without LR/FP load and
# store, just like what the original ECB implementation does.
{{{
# Register aliases for the ECB path: x0-x3 carry the C arguments
# (in, out, length, key), w4 is the encrypt/decrypt flag; the q
# registers hold data blocks, temporaries and preloaded round keys.
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
### q7 last round key
### q10-q15 q7 Last 7 round keys
### q8-q9 preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
# 64-bit mode has 32 SIMD registers, so blocks 3 and 4 of the 5x
# interleave get dedicated registers; 32-bit mode never uses them.
if ($flavour =~ /64/) {
($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}
# Function entry plus the aarch64-only fast path for exactly one
# 16-byte block: no frame setup, no LR/FP spill, straight-line
# single-block encrypt/decrypt as in the original implementation.
$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
subs $len,$len,#16
// Original input data size bigger than 16, jump to big size processing.
b.ne .Lecb_big_size
vld1.8 {$dat0},[$inp]
cmp $enc,#0 // en- or decrypting?
ldr $rounds,[$key,#240]
vld1.32 {q5-q6},[$key],#32 // load key schedule...
b.eq .Lecb_small_dec
aese $dat0,q5
aesmc $dat0,$dat0
vld1.32 {q8-q9},[$key],#32 // load key schedule...
aese $dat0,q6
aesmc $dat0,$dat0
subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
b.eq .Lecb_128_enc
// AES-192/256: consume the extra middle round keys two at a time.
.Lecb_round_loop:
aese $dat0,q8
aesmc $dat0,$dat0
vld1.32 {q8},[$key],#16 // load key schedule...
aese $dat0,q9
aesmc $dat0,$dat0
vld1.32 {q9},[$key],#16 // load key schedule...
subs $rounds,$rounds,#2 // bias
b.gt .Lecb_round_loop
// Last 8 rounds are fully unrolled; final round omits MixColumns
// and is followed by the last-round-key XOR.
.Lecb_128_enc:
vld1.32 {q10-q11},[$key],#32 // load key schedule...
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat0,q9
aesmc $dat0,$dat0
vld1.32 {q12-q13},[$key],#32 // load key schedule...
aese $dat0,q10
aesmc $dat0,$dat0
aese $dat0,q11
aesmc $dat0,$dat0
vld1.32 {q14-q15},[$key],#32 // load key schedule...
aese $dat0,q12
aesmc $dat0,$dat0
aese $dat0,q13
aesmc $dat0,$dat0
vld1.32 {$rndlast},[$key]
aese $dat0,q14
aesmc $dat0,$dat0
aese $dat0,q15
veor $dat0,$dat0,$rndlast
vst1.8 {$dat0},[$out]
b .Lecb_Final_abort
// Single-block decrypt mirror of the path above (aesd/aesimc).
.Lecb_small_dec:
aesd $dat0,q5
aesimc $dat0,$dat0
vld1.32 {q8-q9},[$key],#32 // load key schedule...
aesd $dat0,q6
aesimc $dat0,$dat0
subs $rounds,$rounds,#10 // bias
b.eq .Lecb_128_dec
.Lecb_dec_round_loop:
aesd $dat0,q8
aesimc $dat0,$dat0
vld1.32 {q8},[$key],#16 // load key schedule...
aesd $dat0,q9
aesimc $dat0,$dat0
vld1.32 {q9},[$key],#16 // load key schedule...
subs $rounds,$rounds,#2 // bias
b.gt .Lecb_dec_round_loop
.Lecb_128_dec:
vld1.32 {q10-q11},[$key],#32 // load key schedule...
aesd $dat0,q8
aesimc $dat0,$dat0
aesd $dat0,q9
aesimc $dat0,$dat0
vld1.32 {q12-q13},[$key],#32 // load key schedule...
aesd $dat0,q10
aesimc $dat0,$dat0
aesd $dat0,q11
aesimc $dat0,$dat0
vld1.32 {q14-q15},[$key],#32 // load key schedule...
aesd $dat0,q12
aesimc $dat0,$dat0
aesd $dat0,q13
aesimc $dat0,$dat0
vld1.32 {$rndlast},[$key]
aesd $dat0,q14
aesimc $dat0,$dat0
aesd $dat0,q15
veor $dat0,$dat0,$rndlast
vst1.8 {$dat0},[$out]
b .Lecb_Final_abort
.Lecb_big_size:
___
# Multi-block ("big size") path: per-flavour stack prologue, then
# shared setup that preloads the first two and last seven round keys
# and primes the first 1-3 data blocks for the encrypt path.
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
mov ip,sp
stmdb sp!,{r4-r8,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load remaining args
subs $len,$len,#16
___
$code.=<<___;
mov $step,#16
b.lo .Lecb_done
cclr $step,eq
cmp $enc,#0 // en- or decrypting?
ldr $rounds,[$key,#240]
and $len,$len,#-16
vld1.8 {$dat},[$inp],$step
vld1.32 {q8-q9},[$key] // load key schedule...
sub $rounds,$rounds,#6
add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
sub $rounds,$rounds,#2
vld1.32 {q10-q11},[$key_],#32
vld1.32 {q12-q13},[$key_],#32
vld1.32 {q14-q15},[$key_],#32
vld1.32 {$rndlast},[$key_]
add $key_,$key,#32
mov $cnt,$rounds
b.eq .Lecb_dec
vld1.8 {$dat1},[$inp],#16
subs $len,$len,#32 // bias
add $cnt,$rounds,#2
vorr $in1,$dat1,$dat1
vorr $dat2,$dat1,$dat1
vorr $dat1,$dat,$dat
b.lo .Lecb_enc_tail
vorr $dat1,$in1,$in1
vld1.8 {$dat2},[$inp],#16
___
# 64-bit only: encrypt 5 blocks per iteration, interleaving the AES
# rounds of all 5 to hide instruction latency; falls through to the
# 3x loop or the 4-block tail when fewer than 5 blocks remain.
$code.=<<___ if ($flavour =~ /64/);
cmp $len,#32
b.lo .Loop3x_ecb_enc
vld1.8 {$dat3},[$inp],#16
vld1.8 {$dat4},[$inp],#16
sub $len,$len,#32 // bias
mov $cnt,$rounds
.Loop5x_ecb_enc:
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat2,q8
aesmc $dat2,$dat2
aese $dat3,q8
aesmc $dat3,$dat3
aese $dat4,q8
aesmc $dat4,$dat4
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
aese $dat2,q9
aesmc $dat2,$dat2
aese $dat3,q9
aesmc $dat3,$dat3
aese $dat4,q9
aesmc $dat4,$dat4
vld1.32 {q9},[$key_],#16
b.gt .Loop5x_ecb_enc
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat2,q8
aesmc $dat2,$dat2
aese $dat3,q8
aesmc $dat3,$dat3
aese $dat4,q8
aesmc $dat4,$dat4
cmp $len,#0x40 // because .Lecb_enc_tail4x
sub $len,$len,#0x50
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
aese $dat2,q9
aesmc $dat2,$dat2
aese $dat3,q9
aesmc $dat3,$dat3
aese $dat4,q9
aesmc $dat4,$dat4
csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
mov $key_,$key
aese $dat0,q10
aesmc $dat0,$dat0
aese $dat1,q10
aesmc $dat1,$dat1
aese $dat2,q10
aesmc $dat2,$dat2
aese $dat3,q10
aesmc $dat3,$dat3
aese $dat4,q10
aesmc $dat4,$dat4
add $inp,$inp,x6 // $inp is adjusted in such way that
// at exit from the loop $dat1-$dat4
// are loaded with last "words"
add x6,$len,#0x60 // because .Lecb_enc_tail4x
aese $dat0,q11
aesmc $dat0,$dat0
aese $dat1,q11
aesmc $dat1,$dat1
aese $dat2,q11
aesmc $dat2,$dat2
aese $dat3,q11
aesmc $dat3,$dat3
aese $dat4,q11
aesmc $dat4,$dat4
aese $dat0,q12
aesmc $dat0,$dat0
aese $dat1,q12
aesmc $dat1,$dat1
aese $dat2,q12
aesmc $dat2,$dat2
aese $dat3,q12
aesmc $dat3,$dat3
aese $dat4,q12
aesmc $dat4,$dat4
aese $dat0,q13
aesmc $dat0,$dat0
aese $dat1,q13
aesmc $dat1,$dat1
aese $dat2,q13
aesmc $dat2,$dat2
aese $dat3,q13
aesmc $dat3,$dat3
aese $dat4,q13
aesmc $dat4,$dat4
aese $dat0,q14
aesmc $dat0,$dat0
aese $dat1,q14
aesmc $dat1,$dat1
aese $dat2,q14
aesmc $dat2,$dat2
aese $dat3,q14
aesmc $dat3,$dat3
aese $dat4,q14
aesmc $dat4,$dat4
// Final round: interleave the next 5 input loads with the last aese.
aese $dat0,q15
vld1.8 {$in0},[$inp],#16
aese $dat1,q15
vld1.8 {$in1},[$inp],#16
aese $dat2,q15
vld1.8 {$in2},[$inp],#16
aese $dat3,q15
vld1.8 {$in3},[$inp],#16
aese $dat4,q15
vld1.8 {$in4},[$inp],#16
cbz x6,.Lecb_enc_tail4x
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
veor $tmp0,$rndlast,$dat0
vorr $dat0,$in0,$in0
veor $tmp1,$rndlast,$dat1
vorr $dat1,$in1,$in1
veor $tmp2,$rndlast,$dat2
vorr $dat2,$in2,$in2
veor $tmp3,$rndlast,$dat3
vorr $dat3,$in3,$in3
veor $tmp4,$rndlast,$dat4
vst1.8 {$tmp0},[$out],#16
vorr $dat4,$in4,$in4
vst1.8 {$tmp1},[$out],#16
mov $cnt,$rounds
vst1.8 {$tmp2},[$out],#16
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vst1.8 {$tmp3},[$out],#16
vst1.8 {$tmp4},[$out],#16
b.hs .Loop5x_ecb_enc
// Fewer than 5 blocks left: restore the biased count and dispatch
// to the 3x loop or the tail.
add $len,$len,#0x50
cbz $len,.Lecb_done
add $cnt,$rounds,#2
subs $len,$len,#0x30
vorr $dat0,$in2,$in2
vorr $dat1,$in3,$in3
vorr $dat2,$in4,$in4
b.lo .Lecb_enc_tail
b .Loop3x_ecb_enc
.align 4
.Lecb_enc_tail4x:
veor $tmp1,$rndlast,$dat1
veor $tmp2,$rndlast,$dat2
veor $tmp3,$rndlast,$dat3
veor $tmp4,$rndlast,$dat4
vst1.8 {$tmp1},[$out],#16
vst1.8 {$tmp2},[$out],#16
vst1.8 {$tmp3},[$out],#16
vst1.8 {$tmp4},[$out],#16
b .Lecb_done
.align 4
___
# Shared 32/64-bit path: encrypt 3 blocks per iteration, then handle
# the 1-2 block tail with two-way interleaving.
$code.=<<___;
.Loop3x_ecb_enc:
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat2,q8
aesmc $dat2,$dat2
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
aese $dat2,q9
aesmc $dat2,$dat2
vld1.32 {q9},[$key_],#16
b.gt .Loop3x_ecb_enc
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat2,q8
aesmc $dat2,$dat2
subs $len,$len,#0x30
mov.lo x6,$len // x6, $cnt, is zero at this point
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
aese $dat2,q9
aesmc $dat2,$dat2
add $inp,$inp,x6 // $inp is adjusted in such way that
// at exit from the loop $dat1-$dat2
// are loaded with last "words"
mov $key_,$key
aese $dat0,q12
aesmc $dat0,$dat0
aese $dat1,q12
aesmc $dat1,$dat1
aese $dat2,q12
aesmc $dat2,$dat2
vld1.8 {$in0},[$inp],#16
aese $dat0,q13
aesmc $dat0,$dat0
aese $dat1,q13
aesmc $dat1,$dat1
aese $dat2,q13
aesmc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
aese $dat0,q14
aesmc $dat0,$dat0
aese $dat1,q14
aesmc $dat1,$dat1
aese $dat2,q14
aesmc $dat2,$dat2
vld1.8 {$in2},[$inp],#16
aese $dat0,q15
aese $dat1,q15
aese $dat2,q15
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
add $cnt,$rounds,#2
veor $tmp0,$rndlast,$dat0
veor $tmp1,$rndlast,$dat1
veor $dat2,$dat2,$rndlast
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vst1.8 {$tmp0},[$out],#16
vorr $dat0,$in0,$in0
vst1.8 {$tmp1},[$out],#16
vorr $dat1,$in1,$in1
vst1.8 {$dat2},[$out],#16
vorr $dat2,$in2,$in2
b.hs .Loop3x_ecb_enc
cmn $len,#0x30
b.eq .Lecb_done
nop
// Tail: 1 or 2 blocks, processed in $dat1/$dat2 ($dat2 holds the
// final block; .Lecb_enc_one writes only it).
.Lecb_enc_tail:
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat2,q8
aesmc $dat2,$dat2
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat1,q9
aesmc $dat1,$dat1
aese $dat2,q9
aesmc $dat2,$dat2
vld1.32 {q9},[$key_],#16
b.gt .Lecb_enc_tail
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat2,q8
aesmc $dat2,$dat2
aese $dat1,q9
aesmc $dat1,$dat1
aese $dat2,q9
aesmc $dat2,$dat2
aese $dat1,q12
aesmc $dat1,$dat1
aese $dat2,q12
aesmc $dat2,$dat2
cmn $len,#0x20
aese $dat1,q13
aesmc $dat1,$dat1
aese $dat2,q13
aesmc $dat2,$dat2
aese $dat1,q14
aesmc $dat1,$dat1
aese $dat2,q14
aesmc $dat2,$dat2
aese $dat1,q15
aese $dat2,q15
b.eq .Lecb_enc_one
veor $tmp1,$rndlast,$dat1
veor $tmp2,$rndlast,$dat2
vst1.8 {$tmp1},[$out],#16
vst1.8 {$tmp2},[$out],#16
b .Lecb_done
.Lecb_enc_one:
veor $tmp1,$rndlast,$dat2
vst1.8 {$tmp1},[$out],#16
b .Lecb_done
___
# Decrypt entry: prime the first 1-3 blocks, mirroring the encrypt
# setup at the end of the common prologue.
$code.=<<___;
.align 5
.Lecb_dec:
vld1.8 {$dat1},[$inp],#16
subs $len,$len,#32 // bias
add $cnt,$rounds,#2
vorr $in1,$dat1,$dat1
vorr $dat2,$dat1,$dat1
vorr $dat1,$dat,$dat
b.lo .Lecb_dec_tail
vorr $dat1,$in1,$in1
vld1.8 {$dat2},[$inp],#16
___
# 64-bit only: decrypt 5 blocks per iteration; exact mirror of the
# 5x encrypt loop using aesd/aesimc.
$code.=<<___ if ($flavour =~ /64/);
cmp $len,#32
b.lo .Loop3x_ecb_dec
vld1.8 {$dat3},[$inp],#16
vld1.8 {$dat4},[$inp],#16
sub $len,$len,#32 // bias
mov $cnt,$rounds
.Loop5x_ecb_dec:
aesd $dat0,q8
aesimc $dat0,$dat0
aesd $dat1,q8
aesimc $dat1,$dat1
aesd $dat2,q8
aesimc $dat2,$dat2
aesd $dat3,q8
aesimc $dat3,$dat3
aesd $dat4,q8
aesimc $dat4,$dat4
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aesd $dat0,q9
aesimc $dat0,$dat0
aesd $dat1,q9
aesimc $dat1,$dat1
aesd $dat2,q9
aesimc $dat2,$dat2
aesd $dat3,q9
aesimc $dat3,$dat3
aesd $dat4,q9
aesimc $dat4,$dat4
vld1.32 {q9},[$key_],#16
b.gt .Loop5x_ecb_dec
aesd $dat0,q8
aesimc $dat0,$dat0
aesd $dat1,q8
aesimc $dat1,$dat1
aesd $dat2,q8
aesimc $dat2,$dat2
aesd $dat3,q8
aesimc $dat3,$dat3
aesd $dat4,q8
aesimc $dat4,$dat4
cmp $len,#0x40 // because .Lecb_tail4x
sub $len,$len,#0x50
aesd $dat0,q9
aesimc $dat0,$dat0
aesd $dat1,q9
aesimc $dat1,$dat1
aesd $dat2,q9
aesimc $dat2,$dat2
aesd $dat3,q9
aesimc $dat3,$dat3
aesd $dat4,q9
aesimc $dat4,$dat4
csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
mov $key_,$key
aesd $dat0,q10
aesimc $dat0,$dat0
aesd $dat1,q10
aesimc $dat1,$dat1
aesd $dat2,q10
aesimc $dat2,$dat2
aesd $dat3,q10
aesimc $dat3,$dat3
aesd $dat4,q10
aesimc $dat4,$dat4
add $inp,$inp,x6 // $inp is adjusted in such way that
// at exit from the loop $dat1-$dat4
// are loaded with last "words"
add x6,$len,#0x60 // because .Lecb_tail4x
aesd $dat0,q11
aesimc $dat0,$dat0
aesd $dat1,q11
aesimc $dat1,$dat1
aesd $dat2,q11
aesimc $dat2,$dat2
aesd $dat3,q11
aesimc $dat3,$dat3
aesd $dat4,q11
aesimc $dat4,$dat4
aesd $dat0,q12
aesimc $dat0,$dat0
aesd $dat1,q12
aesimc $dat1,$dat1
aesd $dat2,q12
aesimc $dat2,$dat2
aesd $dat3,q12
aesimc $dat3,$dat3
aesd $dat4,q12
aesimc $dat4,$dat4
aesd $dat0,q13
aesimc $dat0,$dat0
aesd $dat1,q13
aesimc $dat1,$dat1
aesd $dat2,q13
aesimc $dat2,$dat2
aesd $dat3,q13
aesimc $dat3,$dat3
aesd $dat4,q13
aesimc $dat4,$dat4
aesd $dat0,q14
aesimc $dat0,$dat0
aesd $dat1,q14
aesimc $dat1,$dat1
aesd $dat2,q14
aesimc $dat2,$dat2
aesd $dat3,q14
aesimc $dat3,$dat3
aesd $dat4,q14
aesimc $dat4,$dat4
// Final round: interleave the next 5 input loads with the last aesd.
aesd $dat0,q15
vld1.8 {$in0},[$inp],#16
aesd $dat1,q15
vld1.8 {$in1},[$inp],#16
aesd $dat2,q15
vld1.8 {$in2},[$inp],#16
aesd $dat3,q15
vld1.8 {$in3},[$inp],#16
aesd $dat4,q15
vld1.8 {$in4},[$inp],#16
cbz x6,.Lecb_tail4x
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
veor $tmp0,$rndlast,$dat0
vorr $dat0,$in0,$in0
veor $tmp1,$rndlast,$dat1
vorr $dat1,$in1,$in1
veor $tmp2,$rndlast,$dat2
vorr $dat2,$in2,$in2
veor $tmp3,$rndlast,$dat3
vorr $dat3,$in3,$in3
veor $tmp4,$rndlast,$dat4
vst1.8 {$tmp0},[$out],#16
vorr $dat4,$in4,$in4
vst1.8 {$tmp1},[$out],#16
mov $cnt,$rounds
vst1.8 {$tmp2},[$out],#16
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vst1.8 {$tmp3},[$out],#16
vst1.8 {$tmp4},[$out],#16
b.hs .Loop5x_ecb_dec
// Fewer than 5 blocks left: restore the biased count and dispatch
// to the 3x loop or the tail.
add $len,$len,#0x50
cbz $len,.Lecb_done
add $cnt,$rounds,#2
subs $len,$len,#0x30
vorr $dat0,$in2,$in2
vorr $dat1,$in3,$in3
vorr $dat2,$in4,$in4
b.lo .Lecb_dec_tail
b .Loop3x_ecb_dec
.align 4
.Lecb_tail4x:
veor $tmp1,$rndlast,$dat1
veor $tmp2,$rndlast,$dat2
veor $tmp3,$rndlast,$dat3
veor $tmp4,$rndlast,$dat4
vst1.8 {$tmp1},[$out],#16
vst1.8 {$tmp2},[$out],#16
vst1.8 {$tmp3},[$out],#16
vst1.8 {$tmp4},[$out],#16
b .Lecb_done
.align 4
___
# Shared 32/64-bit path: decrypt 3 blocks per iteration, then the
# 1-2 block tail; ends at the common .Lecb_done label.
$code.=<<___;
.Loop3x_ecb_dec:
aesd $dat0,q8
aesimc $dat0,$dat0
aesd $dat1,q8
aesimc $dat1,$dat1
aesd $dat2,q8
aesimc $dat2,$dat2
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aesd $dat0,q9
aesimc $dat0,$dat0
aesd $dat1,q9
aesimc $dat1,$dat1
aesd $dat2,q9
aesimc $dat2,$dat2
vld1.32 {q9},[$key_],#16
b.gt .Loop3x_ecb_dec
aesd $dat0,q8
aesimc $dat0,$dat0
aesd $dat1,q8
aesimc $dat1,$dat1
aesd $dat2,q8
aesimc $dat2,$dat2
subs $len,$len,#0x30
mov.lo x6,$len // x6, $cnt, is zero at this point
aesd $dat0,q9
aesimc $dat0,$dat0
aesd $dat1,q9
aesimc $dat1,$dat1
aesd $dat2,q9
aesimc $dat2,$dat2
add $inp,$inp,x6 // $inp is adjusted in such way that
// at exit from the loop $dat1-$dat2
// are loaded with last "words"
mov $key_,$key
aesd $dat0,q12
aesimc $dat0,$dat0
aesd $dat1,q12
aesimc $dat1,$dat1
aesd $dat2,q12
aesimc $dat2,$dat2
vld1.8 {$in0},[$inp],#16
aesd $dat0,q13
aesimc $dat0,$dat0
aesd $dat1,q13
aesimc $dat1,$dat1
aesd $dat2,q13
aesimc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
aesd $dat0,q14
aesimc $dat0,$dat0
aesd $dat1,q14
aesimc $dat1,$dat1
aesd $dat2,q14
aesimc $dat2,$dat2
vld1.8 {$in2},[$inp],#16
aesd $dat0,q15
aesd $dat1,q15
aesd $dat2,q15
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
add $cnt,$rounds,#2
veor $tmp0,$rndlast,$dat0
veor $tmp1,$rndlast,$dat1
veor $dat2,$dat2,$rndlast
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vst1.8 {$tmp0},[$out],#16
vorr $dat0,$in0,$in0
vst1.8 {$tmp1},[$out],#16
vorr $dat1,$in1,$in1
vst1.8 {$dat2},[$out],#16
vorr $dat2,$in2,$in2
b.hs .Loop3x_ecb_dec
cmn $len,#0x30
b.eq .Lecb_done
nop
// Tail: 1 or 2 blocks, processed in $dat1/$dat2 ($dat2 holds the
// final block; .Lecb_dec_one writes only it).
.Lecb_dec_tail:
aesd $dat1,q8
aesimc $dat1,$dat1
aesd $dat2,q8
aesimc $dat2,$dat2
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aesd $dat1,q9
aesimc $dat1,$dat1
aesd $dat2,q9
aesimc $dat2,$dat2
vld1.32 {q9},[$key_],#16
b.gt .Lecb_dec_tail
aesd $dat1,q8
aesimc $dat1,$dat1
aesd $dat2,q8
aesimc $dat2,$dat2
aesd $dat1,q9
aesimc $dat1,$dat1
aesd $dat2,q9
aesimc $dat2,$dat2
aesd $dat1,q12
aesimc $dat1,$dat1
aesd $dat2,q12
aesimc $dat2,$dat2
cmn $len,#0x20
aesd $dat1,q13
aesimc $dat1,$dat1
aesd $dat2,q13
aesimc $dat2,$dat2
aesd $dat1,q14
aesimc $dat1,$dat1
aesd $dat2,q14
aesimc $dat2,$dat2
aesd $dat1,q15
aesd $dat2,q15
b.eq .Lecb_dec_one
veor $tmp1,$rndlast,$dat1
veor $tmp2,$rndlast,$dat2
vst1.8 {$tmp1},[$out],#16
vst1.8 {$tmp2},[$out],#16
b .Lecb_done
.Lecb_dec_one:
veor $tmp1,$rndlast,$dat2
vst1.8 {$tmp1},[$out],#16
.Lecb_done:
___
# Close the register-alias scope; emit per-flavour epilogues
# (restore callee-saved registers) and the size directive.
}
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldr x29,[sp],#16
___
$code.=<<___ if ($flavour =~ /64/);
.Lecb_Final_abort:
ret
___
$code.=<<___;
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");

View File

@ -89,6 +89,7 @@ void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len,
# define HWAES_encrypt aes_v8_encrypt
# define HWAES_decrypt aes_v8_decrypt
# define HWAES_cbc_encrypt aes_v8_cbc_encrypt
# define HWAES_ecb_encrypt aes_v8_ecb_encrypt
# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
# endif
# endif
@ -411,6 +412,9 @@ void HWAES_decrypt(const unsigned char *in, unsigned char *out,
void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, const int enc);
void HWAES_ecb_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
const int enc);
void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
size_t len, const AES_KEY *key,
const unsigned char ivec[16]);

View File

@ -29,6 +29,10 @@ typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], int enc);
typedef void (*ecb128_f) (const unsigned char *in, unsigned char *out,
size_t len, const void *key,
int enc);
typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out,
size_t blocks, const void *key,
const unsigned char ivec[16]);

View File

@ -29,6 +29,10 @@ static int cipher_hw_aes_initkey(PROV_CIPHER_CTX *dat,
# ifdef HWAES_cbc_encrypt
if (dat->mode == EVP_CIPH_CBC_MODE)
dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
# endif
# ifdef HWAES_ecb_encrypt
if (dat->mode == EVP_CIPH_ECB_MODE)
dat->stream.ecb = (ecb128_f)HWAES_ecb_encrypt;
# endif
} else
#endif
@ -64,6 +68,11 @@ static int cipher_hw_aes_initkey(PROV_CIPHER_CTX *dat,
dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
else
# endif
# ifdef HWAES_ecb_encrypt
if (dat->mode == EVP_CIPH_ECB_MODE)
dat->stream.ecb = (ecb128_f)HWAES_ecb_encrypt;
else
# endif
# ifdef HWAES_ctr32_encrypt_blocks
if (dat->mode == EVP_CIPH_CTR_MODE)
dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;

View File

@ -34,8 +34,13 @@ int cipher_hw_generic_ecb(PROV_CIPHER_CTX *dat, unsigned char *out,
if (len < bl)
return 1;
for (i = 0, len -= bl; i <= len; i += bl)
(*dat->block) (in + i, out + i, dat->ks);
if (dat->stream.ecb) {
(*dat->stream.ecb) (in, out, len, dat->ks, dat->enc);
}
else {
for (i = 0, len -= bl; i <= len; i += bl)
(*dat->block) (in + i, out + i, dat->ks);
}
return 1;
}

View File

@ -37,6 +37,7 @@ struct prov_cipher_ctx_st {
union {
cbc128_f cbc;
ctr128_f ctr;
ecb128_f ecb;
} stream;
unsigned int mode;