mirror of
https://github.com/openssl/openssl.git
synced 2025-01-12 13:36:28 +08:00
15b7175f55
This patch implements the SM4 optimization for ARM processors, using the SM4 HW instructions, which are an optional feature of the crypto extension for AArch64 v8.

Tested on some modern ARM micro-architectures with SM4 support, the performance uplift is around 8X~40X over the existing C implementation in openssl. Algorithms that can be parallelized (like CTR, ECB, CBC decryption) are on the higher end, while algorithms like CBC encryption are on the lower end (due to inter-block dependency).

Perf data on Yitian-710 2.75GHz hardware, before and after optimization:

Before:
type      16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes  16384 bytes
SM4-CTR  105787.80k   107837.87k   108380.84k   108462.08k   108549.46k   108554.92k
SM4-ECB  111924.58k   118173.76k   119776.00k   120093.70k   120264.02k   120274.94k
SM4-CBC  106428.09k   109190.98k   109674.33k   109774.51k   109827.41k   109827.41k

After (7.4x - 36.6x faster):
type      16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes  16384 bytes
SM4-CTR  781979.02k  2432994.28k  3437753.86k  3834177.88k  3963715.58k  3974556.33k
SM4-ECB  937590.69k  2941689.02k  3945751.81k  4328655.87k  4459181.40k  4468692.31k
SM4-CBC  890639.88k  1027746.58k  1050621.78k  1056696.66k  1058613.93k  1058701.31k

Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17455)
636 lines
16 KiB
Raku
Executable File
636 lines
16 KiB
Raku
Executable File
#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements hardware-accelerated SM4 for aarch64
# Oct 2021
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl post-processor: first next to this script, then in
# ../../perlasm relative to it.  $dir is the directory part of $0 including
# the trailing separator (empty when $0 has no directory component).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# $xlate is quoted like the other paths so that a source tree located in a
# directory whose name contains spaces still works.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Symbol prefix shared by every generated function.
$prefix="sm4_v8";
# v0-v7 hold the eight round-key vectors in every encrypt/decrypt routine.
my @rks=map("v$_",(0..7));

# rev32(dst, src)
#
# Append to $code a byte reversal of every 32-bit lane of vector register
# `src` into vector register `dst`.  The instruction is wrapped in
# "#ifndef __ARMEB__", so it is assembled only when that macro is undefined.
# NOTE(review): this relies on __ARMEB__ being defined for big-endian
# AArch64 builds (e.g. by arm_arch.h) -- confirm for the targeted toolchains.
#
# The sub takes two arguments, so it must not carry an empty "()" prototype;
# the previous prototype only went unnoticed because callers use "&rev32(...)",
# which bypasses prototype checking.
sub rev32 {
	my ($dst, $src) = @_;
$code.=<<___;
#ifndef __ARMEB__
	rev32	$dst.16b,$src.16b
#endif
___
}

# enc_blk(data)
#
# Append to $code the instruction sequence that runs one block, held in
# vector register `data`, through the cipher: one sm4e per round-key
# register (@rks[0] .. @rks[7], in that order), followed by rev64 + ext #8,
# which together reverse the order of the four 32-bit words of the result.
#
# The same sequence serves encryption and decryption; the direction is
# decided by which key schedule the caller loaded into @rks.
#
# The sub takes one argument, so the former empty "()" prototype was wrong
# (it was only harmless because callers use the "&enc_blk(...)" form).
sub enc_blk {
	my ($data) = @_;

	# eight dependent sm4e steps, one per round-key vector
	$code .= "\tsm4e\t$data.4s,$_.4s\n" for @rks[0..7];

$code.=<<___;
	rev64	$data.4S,$data.4S
	ext	$data.16b,$data.16b,$data.16b,#8
___
}

# enc_4blks(data0, data1, data2, data3)
#
# Four-block variant of enc_blk: append to $code the sm4e sequence for four
# independent blocks, interleaved so that consecutive instructions work on
# different registers.  For each of @rks[0] .. @rks[6] the four blocks are
# processed back to back; the last step (@rks[7]) is interleaved with the
# rev64 + ext #8 word-order reversal of each block.
#
# The sub takes four arguments, so the former empty "()" prototype was wrong
# (it was only harmless because callers use the "&enc_4blks(...)" form).
sub enc_4blks {
	my ($data0, $data1, $data2, $data3) = @_;

	# first seven round-key vectors: same instruction on all four blocks
	for my $rk (@rks[0..6]) {
		$code .= "\tsm4e\t$_.4s,$rk.4s\n"
			for ($data0, $data1, $data2, $data3);
		$code .= "\n";
	}

	# last round-key vector, overlapped with the final word reversal
$code.=<<___;
	sm4e	$data0.4s,@rks[7].4s
	rev64	$data0.4S,$data0.4S
	sm4e	$data1.4s,@rks[7].4s
	ext	$data0.16b,$data0.16b,$data0.16b,#8
	rev64	$data1.4S,$data1.4S
	sm4e	$data2.4s,@rks[7].4s
	ext	$data1.16b,$data1.16b,$data1.16b,#8
	rev64	$data2.4S,$data2.4S
	sm4e	$data3.4s,@rks[7].4s
	ext	$data2.16b,$data2.16b,$data2.16b,#8
	rev64	$data3.4S,$data3.4S
	ext	$data3.16b,$data3.16b,$data3.16b,#8
___
}

# Start of the generated assembly: architecture header and text section.
$code=<<___;
#include "arm_arch.h"
.arch	armv8-a+crypto
.text
___

# Key-schedule constant tables, addressed with "adr" by the set_*_key
# routines below:
#   .Lck - 32 words, loaded as eight 4-word vectors and fed to sm4ekey
#   .Lfk -  4 words, XORed into the user key before key expansion
{{{
$code.=<<___;
.align	6
.Lck:
	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
	.long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
___
}}}

# ${prefix}_set_encrypt_key
#
# Generated routine arguments: x0 = pointer to the 16-byte user key,
# x1 = pointer to the output key schedule (8 vectors, 128 bytes).
#
# The user key is loaded, byte-swapped per word on little-endian builds,
# XORed with .Lfk, and then expanded by a chain of eight sm4ekey
# instructions, each consuming the previous key vector and one vector of
# .Lck constants.  The eight resulting vectors are stored in order.
{{{
my ($key,$keys)=("x0","x1");
my ($tmp)=("x2");
my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7));
my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
my ($fkconst) = ("v24");
$code.=<<___;
.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{$key0.4s},[$key]
	adr	$tmp,.Lfk
	ld1	{$fkconst.4s},[$tmp]
	adr	$tmp,.Lck
	ld1	{$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
___
	&rev32($key0, $key0);
$code.=<<___;
	ld1	{$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
	eor	$key0.16b,$key0.16b,$fkconst.16b
	sm4ekey	$key0.4S,$key0.4S,$const0.4S
	sm4ekey	$key1.4S,$key0.4S,$const1.4S
	sm4ekey	$key2.4S,$key1.4S,$const2.4S
	sm4ekey	$key3.4S,$key2.4S,$const3.4S
	sm4ekey	$key4.4S,$key3.4S,$const4.4S
	st1	{$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64
	sm4ekey	$key5.4S,$key4.4S,$const5.4S
	sm4ekey	$key6.4S,$key5.4S,$const6.4S
	sm4ekey	$key7.4S,$key6.4S,$const7.4S
	st1	{$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys]
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

# ${prefix}_set_decrypt_key
#
# Generated routine arguments: x0 = pointer to the 16-byte user key,
# x1 = pointer to the output key schedule (8 vectors, 128 bytes).
#
# Same expansion as set_encrypt_key, but the schedule is stored reversed:
# the register names below are assigned in the opposite direction
# ($key0 is v7 ... $key7 is v0), so the final st1 of {$key7..$key4} and
# {$key3..$key0} writes v0..v7 with the last-derived vector first, and each
# vector additionally has its four words reversed (rev64 + ext #8).
{{{
my ($key,$keys)=("x0","x1");
my ($tmp)=("x2");
my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7));
my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
my ($fkconst) = ("v24");
$code.=<<___;
.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{$key0.4s},[$key]
	adr	$tmp,.Lfk
	ld1	{$fkconst.4s},[$tmp]
	adr	$tmp, .Lck
	ld1	{$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
___
	&rev32($key0, $key0);
$code.=<<___;
	ld1	{$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
	eor	$key0.16b, $key0.16b,$fkconst.16b
	sm4ekey	$key0.4S,$key0.4S,$const0.4S
	sm4ekey	$key1.4S,$key0.4S,$const1.4S
	sm4ekey	$key2.4S,$key1.4S,$const2.4S
	rev64	$key0.4s,$key0.4s
	rev64	$key1.4s,$key1.4s
	ext	$key0.16b,$key0.16b,$key0.16b,#8
	ext	$key1.16b,$key1.16b,$key1.16b,#8
	sm4ekey	$key3.4S,$key2.4S,$const3.4S
	sm4ekey	$key4.4S,$key3.4S,$const4.4S
	rev64	$key2.4s,$key2.4s
	rev64	$key3.4s,$key3.4s
	ext	$key2.16b,$key2.16b,$key2.16b,#8
	ext	$key3.16b,$key3.16b,$key3.16b,#8
	sm4ekey	$key5.4S,$key4.4S,$const5.4S
	sm4ekey	$key6.4S,$key5.4S,$const6.4S
	rev64	$key4.4s,$key4.4s
	rev64	$key5.4s,$key5.4s
	ext	$key4.16b,$key4.16b,$key4.16b,#8
	ext	$key5.16b,$key5.16b,$key5.16b,#8
	sm4ekey	$key7.4S,$key6.4S,$const7.4S
	rev64	$key6.4s, $key6.4s
	rev64	$key7.4s, $key7.4s
	ext	$key6.16b,$key6.16b,$key6.16b,#8
	ext	$key7.16b,$key7.16b,$key7.16b,#8
	st1	{$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64
	st1	{$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys]
	ret
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
# gen_block(dir)
#
# Append to $code a single-block routine named ${prefix}_${dir}crypt
# (called below with "en" and "de").  Generated routine arguments:
# x0 = input block, x1 = output block, x2 = key schedule.
#
# Both variants emit the same instructions; only the symbol name differs,
# so the direction depends on which key schedule the caller passes in x2.
#
# The sub takes one argument, so the former empty "()" prototype was wrong
# (it was only harmless because callers use the "&gen_block(...)" form).
sub gen_block {
	my ($dir) = @_;		# lexical; shadows the script-level $dir
	my ($inp,$out,$rk)=map("x$_",(0..2));
	my ($data)=("v16");
$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{$data.4s},[$inp]
	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
___
	&rev32($data,$data);
	&enc_blk($data);
	&rev32($data,$data);
$code.=<<___;
	st1	{$data.4s},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}

&gen_block("en");
&gen_block("de");
}}}

# ${prefix}_ecb_encrypt
#
# Generated routine arguments: x0 = input, x1 = output, x2 = length in
# bytes, x3 = key schedule.  $enc (w4) is declared for reference only; the
# generated code never reads it.
#
# Structure of the generated code:
#   - while at least 128 bytes remain: process 8 blocks per iteration
#   - else while at least 64 bytes remain: process 4 blocks
#   - remaining whole 16-byte blocks are processed one at a time
# A trailing partial block (fewer than 16 bytes) is left untouched.
{{{
my ($inp,$out,$len,$rk)=map("x$_",(0..3));
my ($enc) = ("w4");
my @dat=map("v$_",(16..23));
$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
1:
	cmp	$len,#64
	b.lt	1f
	ld1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
	cmp	$len,#128
	b.lt	2f
	ld1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
	// 8 blocks
___
	&rev32($_,$_) for @dat[0..7];
	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
	&enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
	&rev32($_,$_) for @dat[0..5];
$code.=<<___;
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
___
	# the last two swaps are emitted after the first store
	&rev32($_,$_) for @dat[6,7];
$code.=<<___;
	st1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
	subs	$len,$len,#128
	b.gt	1b
	ret
	// 4 blocks
2:
___
	&rev32($_,$_) for @dat[0..3];
	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
	&rev32($_,$_) for @dat[0..3];
$code.=<<___;
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	subs	$len,$len,#64
	b.gt	1b
1:
	subs	$len,$len,#16
	b.lt	1f
	ld1	{@dat[0].4s},[$inp],#16
___
	# none of the instructions emitted here touch the condition flags, so
	# the b.ne below still tests the result of the subs above
	&rev32(@dat[0],@dat[0]);
	&enc_blk(@dat[0]);
	&rev32(@dat[0],@dat[0]);
$code.=<<___;
	st1	{@dat[0].4s},[$out],#16
	b.ne	1b
1:
	ret
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

# ${prefix}_cbc_encrypt
#
# Generated routine arguments: x0 = input, x1 = output, x2 = length in
# bytes, x3 = key schedule, x4 = pointer to the 16-byte IV, w5 = direction
# (non-zero: encrypt, zero: decrypt -> branch to .Ldec).
#
# v8 carries the chaining value, so d8/d9 are pushed on entry and popped at
# the common exit (label 3), where the chaining value is also written back
# through x4.
#
# Encryption is serial: each block is XORed with the previous ciphertext
# before being run through enc_blk, four blocks per iteration while at
# least 64 bytes remain, then one block at a time.
# Decryption keeps an untouched copy of the ciphertext in @in, runs 8 or 4
# blocks through enc_4blks in parallel, then XORs each result with the
# preceding ciphertext block (the IV for the first one).
{{{
my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
my ($enc) = ("w5");
my @dat=map("v$_",(16..23));
my @in=map("v$_",(24..31));
my ($ivec) = ("v8");
$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	stp	d8,d9,[sp, #-16]!

	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
	ld1	{$ivec.4s},[$ivp]
	cmp	$enc,#0
	b.eq	.Ldec
1:
	cmp	$len, #64
	b.lt	1f
	ld1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
___
	# The chaining XORs below use the previous block before it has been
	# swapped back, i.e. while both operands are in the swapped layout.
	&rev32($_,$_) for @dat[1,0,2,3];
	&enc_blk(@dat[0]);
$code.=<<___;
	eor	@dat[1].16b,@dat[1].16b,@dat[0].16b
___
	&enc_blk(@dat[1]);
	&rev32(@dat[0],@dat[0]);
$code.=<<___;
	eor	@dat[2].16b,@dat[2].16b,@dat[1].16b
___
	&enc_blk(@dat[2]);
	&rev32(@dat[1],@dat[1]);
$code.=<<___;
	eor	@dat[3].16b,@dat[3].16b,@dat[2].16b
___
	&enc_blk(@dat[3]);
	&rev32(@dat[2],@dat[2]);
	&rev32(@dat[3],@dat[3]);
$code.=<<___;
	mov	$ivec.16b,@dat[3].16b
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	subs	$len,$len,#64
	b.ne	1b
1:
	subs	$len,$len,#16
	b.lt	3f
	ld1	{@dat[0].4s},[$inp],#16
	eor	$ivec.16b,$ivec.16b,@dat[0].16b
___
	# single-block encryption works in place on the chaining register
	&rev32($ivec,$ivec);
	&enc_blk($ivec);
	&rev32($ivec,$ivec);
$code.=<<___;
	st1	{$ivec.16b},[$out],#16
	b.ne	1b
	b	3f
.Ldec:
1:
	cmp	$len, #64
	b.lt	1f
	ld1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp]
	ld1	{@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
	cmp	$len,#128
	b.lt	2f
	// 8 blocks mode
	ld1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp]
	ld1	{@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
___
	&rev32($_,$_) for @dat[0..7];
	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
	&enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
	&rev32($_,$_) for @dat[0..7];
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
	eor	@dat[1].16b,@dat[1].16b,@in[0].16b
	eor	@dat[2].16b,@dat[2].16b,@in[1].16b
	mov	$ivec.16b,@in[7].16b
	eor	@dat[3].16b,@dat[3].16b,@in[2].16b
	eor	@dat[4].16b,@dat[4].16b,@in[3].16b
	eor	@dat[5].16b,@dat[5].16b,@in[4].16b
	eor	@dat[6].16b,@dat[6].16b,@in[5].16b
	eor	@dat[7].16b,@dat[7].16b,@in[6].16b
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	st1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
	subs	$len,$len,#128
	b.gt	1b
	b	3f
	// 4 blocks mode
2:
___
	&rev32($_,$_) for @dat[0..3];
	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
	&rev32($_,$_) for @dat[0..3];
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
	eor	@dat[1].16b,@dat[1].16b,@in[0].16b
	mov	$ivec.16b,@in[3].16b
	eor	@dat[2].16b,@dat[2].16b,@in[1].16b
	eor	@dat[3].16b,@dat[3].16b,@in[2].16b
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	subs	$len,$len,#64
	b.gt	1b
1:
	subs	$len,$len,#16
	b.lt	3f
	ld1	{@dat[0].4s},[$inp],#16
	mov	@in[0].16b,@dat[0].16b
___
	&rev32(@dat[0],@dat[0]);
	&enc_blk(@dat[0]);
	&rev32(@dat[0],@dat[0]);
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
	mov	$ivec.16b,@in[0].16b
	st1	{@dat[0].16b},[$out],#16
	b.ne	1b
3:
	// save back IV
	st1	{$ivec.16b},[$ivp]
	ldp	d8,d9,[sp],#16
	ret
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

# ${prefix}_ctr32_encrypt_blocks
#
# Generated routine arguments: x0 = input, x1 = output, x2 = number of
# 16-byte blocks (not bytes), x3 = key schedule, x4 = pointer to the
# 16-byte counter block.
#
# The counter block is loaded into v8 (hence the d8/d9 save/restore) and
# byte-swapped per word once; its last 32-bit word is mirrored in w5 and
# incremented with 32-bit arithmetic for each block.  Because the counter
# is kept in the swapped layout, the key-stream blocks are fed to
# enc_blk/enc_4blks without a leading rev32 and swapped back afterwards,
# before being XORed with the input.
#
# Blocks are handled 8 at a time, then 4, then singly.  The counter block
# is never written back through x4.
{{{
my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
my ($ctr)=("w5");
my @dat=map("v$_",(16..23));
my @in=map("v$_",(24..31));
my ($ivec)=("v8");
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	stp	d8,d9,[sp, #-16]!

	ld1	{$ivec.4s},[$ivp]
	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
___
	&rev32($ivec,$ivec);
$code.=<<___;
	mov	$ctr,$ivec.s[3]
1:
	cmp	$len,#4
	b.lt	1f
	ld1	{@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
	mov	@dat[0].16b,$ivec.16b
	mov	@dat[1].16b,$ivec.16b
	mov	@dat[2].16b,$ivec.16b
	mov	@dat[3].16b,$ivec.16b
	add	$ctr,$ctr,#1
	mov	@dat[1].s[3],$ctr
	add	$ctr,$ctr,#1
	mov	@dat[2].s[3],$ctr
	add	$ctr,$ctr,#1
	mov	@dat[3].s[3],$ctr
	cmp	$len,#8
	b.lt	2f
	ld1	{@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
	mov	@dat[4].16b,$ivec.16b
	mov	@dat[5].16b,$ivec.16b
	mov	@dat[6].16b,$ivec.16b
	mov	@dat[7].16b,$ivec.16b
	add	$ctr,$ctr,#1
	mov	@dat[4].s[3],$ctr
	add	$ctr,$ctr,#1
	mov	@dat[5].s[3],$ctr
	add	$ctr,$ctr,#1
	mov	@dat[6].s[3],$ctr
	add	$ctr,$ctr,#1
	mov	@dat[7].s[3],$ctr
___
	# 8-block path
	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
	&enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
	&rev32($_,$_) for @dat[0..7];
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,@in[0].16b
	eor	@dat[1].16b,@dat[1].16b,@in[1].16b
	eor	@dat[2].16b,@dat[2].16b,@in[2].16b
	eor	@dat[3].16b,@dat[3].16b,@in[3].16b
	eor	@dat[4].16b,@dat[4].16b,@in[4].16b
	eor	@dat[5].16b,@dat[5].16b,@in[5].16b
	eor	@dat[6].16b,@dat[6].16b,@in[6].16b
	eor	@dat[7].16b,@dat[7].16b,@in[7].16b
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	st1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
	subs	$len,$len,#8
	b.eq	3f
	add	$ctr,$ctr,#1
	mov	$ivec.s[3],$ctr
	b	1b
2:
___
	# 4-block path
	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
	&rev32($_,$_) for @dat[0..3];
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,@in[0].16b
	eor	@dat[1].16b,@dat[1].16b,@in[1].16b
	eor	@dat[2].16b,@dat[2].16b,@in[2].16b
	eor	@dat[3].16b,@dat[3].16b,@in[3].16b
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	subs	$len,$len,#4
	b.eq	3f
	add	$ctr,$ctr,#1
	mov	$ivec.s[3],$ctr
	b	1b
1:
	subs	$len,$len,#1
	b.lt	3f
	mov	@dat[0].16b,$ivec.16b
	ld1	{@in[0].4s},[$inp],#16
___
	# single-block path; the b.eq below still tests the subs above, since
	# nothing emitted in between touches the condition flags
	&enc_blk(@dat[0]);
	&rev32(@dat[0],@dat[0]);
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,@in[0].16b
	st1	{@dat[0].4s},[$out],#16
	b.eq	3f
	add	$ctr,$ctr,#1
	mov	$ivec.s[3],$ctr
	b	1b
3:
	ldp	d8,d9,[sp],#16
	ret
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}

########################################
{   # Base encodings; register numbers are ORed in by unsm4().
    my %opcode = (
	"sm4e"		=> 0xcec08400,
	"sm4ekey"	=> 0xce60c800);

    # unsm4(mnemonic, operands)
    #
    # Turn an SM4 instruction into a raw ".inst 0x........" directive so the
    # output assembles with toolchains that do not know the mnemonics.  The
    # first register number goes into bits 0-4, the second into bits 5-9 and
    # the optional third into bits 16-20.  The original text is kept as a
    # trailing comment.
    #
    # If the mnemonic is not in %opcode, or the operands cannot be parsed,
    # the instruction text is returned unchanged.  Previously such input
    # produced an empty (or garbage) replacement, so the caller's s///e
    # silently dropped the instruction from the output.
    sub unsm4 {
	my ($mnemonic,$arg)=@_;

	return "$mnemonic\t$arg" unless exists $opcode{$mnemonic};

	my ($rd,$rn,$rm) =
	    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/
	    or return "$mnemonic\t$arg";

	# two-operand form (sm4e): no third register field
	$rm = 0 unless defined $rm;

	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16),
			$mnemonic,$arg;
    }
}

# Copy this script's leading comment block (licence header) into the output,
# rewriting the leading "#" of each line to "//".  The shebang line is
# skipped, blank lines are copied, and copying stops at the first line that
# is neither a comment nor blank.  This is best-effort: if the script cannot
# be reopened the header is simply omitted.
if (open my $self, '<', $0) {
    while (<$self>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
    }
    close $self;
}

# Emit the accumulated assembly line by line.
foreach(split("\n",$code)) {
	# evaluate any `...` expression embedded in the line
	s/\`([^\`]*)\`/eval($1)/ge;

	# replace SM4 mnemonics with raw .inst encodings
	s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge;
	print $_,"\n";
}

# STDOUT is the pipe to arm-xlate.pl; close reports a failure of that process.
close STDOUT or die "error closing STDOUT: $!";