openssl/crypto/arm64cpuid.pl
Daniel Hu 15b7175f55 SM4 optimization for ARM by HW instruction
This patch implements the SM4 optimization for ARM processor,
using SM4 HW instruction, which is an optional feature of
crypto extension for aarch64 V8.

Tested on some modern ARM micro-architectures with SM4 support, the
performance uplift can be observed around 8X~40X over existing
C implementation in openssl. Algorithms that can be parallelized
(like CTR, ECB, CBC decryption) are on higher end, with algorithm
like CBC encryption on lower end (due to inter-block dependency)

Perf data on Yitian-710 2.75GHz hardware, before and after optimization:

Before:
  type      16 bytes     64 bytes    256 bytes    1024 bytes   8192 bytes  16384 bytes
  SM4-CTR  105787.80k   107837.87k   108380.84k   108462.08k   108549.46k   108554.92k
  SM4-ECB  111924.58k   118173.76k   119776.00k   120093.70k   120264.02k   120274.94k
  SM4-CBC  106428.09k   109190.98k   109674.33k   109774.51k   109827.41k   109827.41k

After (7.4x - 36.6x faster):
  type      16 bytes     64 bytes    256 bytes    1024 bytes   8192 bytes  16384 bytes
  SM4-CTR  781979.02k  2432994.28k  3437753.86k  3834177.88k  3963715.58k  3974556.33k
  SM4-ECB  937590.69k  2941689.02k  3945751.81k  4328655.87k  4459181.40k  4468692.31k
  SM4-CBC  890639.88k  1027746.58k  1050621.78k  1056696.66k  1058613.93k  1058701.31k

Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17455)
2022-01-18 11:52:14 +01:00

244 lines
5.0 KiB
Perl
Executable File

#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
$code.=<<___;
#include "arm_arch.h"
.text
.arch armv8-a+crypto
.align 5
.globl _armv7_neon_probe
.type _armv7_neon_probe,%function
_armv7_neon_probe:
AARCH64_VALID_CALL_TARGET
orr v15.16b, v15.16b, v15.16b
ret
.size _armv7_neon_probe,.-_armv7_neon_probe
.globl _armv7_tick
.type _armv7_tick,%function
_armv7_tick:
AARCH64_VALID_CALL_TARGET
#ifdef __APPLE__
mrs x0, CNTPCT_EL0
#else
mrs x0, CNTVCT_EL0
#endif
ret
.size _armv7_tick,.-_armv7_tick
.globl _armv8_aes_probe
.type _armv8_aes_probe,%function
_armv8_aes_probe:
AARCH64_VALID_CALL_TARGET
aese v0.16b, v0.16b
ret
.size _armv8_aes_probe,.-_armv8_aes_probe
.globl _armv8_sha1_probe
.type _armv8_sha1_probe,%function
_armv8_sha1_probe:
AARCH64_VALID_CALL_TARGET
sha1h s0, s0
ret
.size _armv8_sha1_probe,.-_armv8_sha1_probe
.globl _armv8_sha256_probe
.type _armv8_sha256_probe,%function
_armv8_sha256_probe:
AARCH64_VALID_CALL_TARGET
sha256su0 v0.4s, v0.4s
ret
.size _armv8_sha256_probe,.-_armv8_sha256_probe
.globl _armv8_pmull_probe
.type _armv8_pmull_probe,%function
_armv8_pmull_probe:
AARCH64_VALID_CALL_TARGET
pmull v0.1q, v0.1d, v0.1d
ret
.size _armv8_pmull_probe,.-_armv8_pmull_probe
.globl _armv8_sm4_probe
.type _armv8_sm4_probe,%function
_armv8_sm4_probe:
AARCH64_VALID_CALL_TARGET
.long 0xcec08400 // sm4e v0.4s, v0.4s
ret
.size _armv8_sm4_probe,.-_armv8_sm4_probe
.globl _armv8_sha512_probe
.type _armv8_sha512_probe,%function
_armv8_sha512_probe:
AARCH64_VALID_CALL_TARGET
.long 0xcec08000 // sha512su0 v0.2d,v0.2d
ret
.size _armv8_sha512_probe,.-_armv8_sha512_probe
.globl _armv8_cpuid_probe
.type _armv8_cpuid_probe,%function
_armv8_cpuid_probe:
AARCH64_VALID_CALL_TARGET
mrs x0, midr_el1
ret
.size _armv8_cpuid_probe,.-_armv8_cpuid_probe
.globl _armv8_sm3_probe
.type _armv8_sm3_probe,%function
_armv8_sm3_probe:
AARCH64_VALID_CALL_TARGET
.long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s
ret
.size _armv8_sm3_probe,.-_armv8_sm3_probe
.globl OPENSSL_cleanse
.type OPENSSL_cleanse,%function
.align 5
OPENSSL_cleanse:
AARCH64_VALID_CALL_TARGET
cbz x1,.Lret // len==0?
cmp x1,#15
b.hi .Lot // len>15
nop
.Little:
strb wzr,[x0],#1 // store byte-by-byte
subs x1,x1,#1
b.ne .Little
.Lret: ret
.align 4
.Lot: tst x0,#7
b.eq .Laligned // inp is aligned
strb wzr,[x0],#1 // store byte-by-byte
sub x1,x1,#1
b .Lot
.align 4
.Laligned:
str xzr,[x0],#8 // store word-by-word
sub x1,x1,#8
tst x1,#-8
b.ne .Laligned // len>=8
cbnz x1,.Little // len!=0?
ret
.size OPENSSL_cleanse,.-OPENSSL_cleanse
.globl CRYPTO_memcmp
.type CRYPTO_memcmp,%function
.align 4
CRYPTO_memcmp:
AARCH64_VALID_CALL_TARGET
eor w3,w3,w3
cbz x2,.Lno_data // len==0?
cmp x2,#16
b.ne .Loop_cmp
ldp x8,x9,[x0]
ldp x10,x11,[x1]
eor x8,x8,x10
eor x9,x9,x11
orr x8,x8,x9
mov x0,#1
cmp x8,#0
csel x0,xzr,x0,eq
ret
.align 4
.Loop_cmp:
ldrb w4,[x0],#1
ldrb w5,[x1],#1
eor w4,w4,w5
orr w3,w3,w4
subs x2,x2,#1
b.ne .Loop_cmp
.Lno_data:
neg w0,w3
lsr w0,w0,#31
ret
.size CRYPTO_memcmp,.-CRYPTO_memcmp
.globl _armv8_rng_probe
.type _armv8_rng_probe,%function
_armv8_rng_probe:
mrs x0, s3_3_c2_c4_0 // rndr
mrs x0, s3_3_c2_c4_1 // rndrrs
ret
.size _armv8_rng_probe,.-_armv8_rng_probe
___
sub gen_random {
my $rdop = shift;
my $rand_reg = $rdop eq "rndr" ? "s3_3_c2_c4_0" : "s3_3_c2_c4_1";
print<<___;
// Fill buffer with Randomly Generated Bytes
// inputs: char * in x0 - Pointer to buffer
// size_t in x1 - Number of bytes to write to buffer
// outputs: size_t in x0 - Number of bytes successfully written to buffer
.globl OPENSSL_${rdop}_asm
.type OPENSSL_${rdop}_asm,%function
.align 4
OPENSSL_${rdop}_asm:
mov x2,xzr
mov x3,xzr
.align 4
.Loop_${rdop}:
cmp x1,#0
b.eq .${rdop}_done
mov x3,xzr
mrs x3,$rand_reg
b.eq .${rdop}_done
cmp x1,#8
b.lt .Loop_single_byte_${rdop}
str x3,[x0]
add x0,x0,#8
add x2,x2,#8
subs x1,x1,#8
b.ge .Loop_${rdop}
.align 4
.Loop_single_byte_${rdop}:
strb w3,[x0]
lsr x3,x3,#8
add x2,x2,#1
add x0,x0,#1
subs x1,x1,#1
b.gt .Loop_single_byte_${rdop}
.align 4
.${rdop}_done:
mov x0,x2
ret
.size OPENSSL_${rdop}_asm,.-OPENSSL_${rdop}_asm
___
}
gen_random("rndr");
gen_random("rndrrs");
print $code;
close STDOUT or die "error closing STDOUT: $!";