openssl/crypto/arm_arch.h
Xu Yizhou 88c53cf17d Apply SM4 optimization patch to Kunpeng-920
In the ideal scenario, performance can reach up to 2.2X.
But in single block input or CFB/OFB mode, CBC encryption,
performance could drop about 50%.

Perf data on Kunpeng-920 2.6GHz hardware, before and after optimization:

Before:
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
SM4-CTR 75318.96k 79089.62k 79736.15k 79934.12k 80325.44k 80068.61k
SM4-ECB 80211.39k 84998.36k 86472.28k 87024.93k 87144.80k 86862.51k
SM4-GCM 72156.19k 82012.08k 83848.02k 84322.65k 85103.65k 84896.43k
SM4-CBC 77956.13k 80638.81k 81976.17k 81606.31k 82078.91k 81750.70k
SM4-CFB 78078.20k 81054.87k 81841.07k 82396.38k 82203.99k 82236.76k
SM4-OFB 78282.76k 82074.03k 82765.74k 82989.06k 83200.68k 83487.17k

After:
type    16 bytes  64 bytes   256 bytes  1024 bytes 8192 bytes 16384 bytes
SM4-CTR 35678.07k 120687.25k 176632.27k 177192.62k 177586.18k 178295.18k
SM4-ECB 35540.32k 122628.07k 175067.90k 178007.84k 178298.88k 178328.92k
SM4-GCM 34215.75k 116720.50k 170275.16k 171770.88k 172714.21k 172272.30k
SM4-CBC 35645.60k 36544.86k  36515.50k  36732.15k  36618.24k  36629.16k
SM4-CFB 35528.14k 35690.99k  35954.86k  35843.42k  35809.18k  35809.96k
SM4-OFB 35563.55k 35853.56k  35963.05k  36203.52k  36233.85k  36307.82k

Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>

Reviewed-by: Hugo Landau <hlandau@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/19547)
2022-11-02 08:45:10 +11:00

197 lines
6.7 KiB
C

/*
* Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#ifndef OSSL_CRYPTO_ARM_ARCH_H
# define OSSL_CRYPTO_ARM_ARCH_H
# if !defined(__ARM_ARCH__)
# if defined(__CC_ARM)
# define __ARM_ARCH__ __TARGET_ARCH_ARM
# if defined(__BIG_ENDIAN)
# define __ARMEB__
# else
# define __ARMEL__
# endif
# elif defined(__GNUC__)
# if defined(__aarch64__)
# define __ARM_ARCH__ 8
/*
* Why doesn't gcc define __ARM_ARCH__? Instead it defines
* bunch of below macros. See all_architectures[] table in
* gcc/config/arm/arm.c. On a side note it defines
* __ARMEL__/__ARMEB__ for little-/big-endian.
*/
# elif defined(__ARM_ARCH)
# define __ARM_ARCH__ __ARM_ARCH
# elif defined(__ARM_ARCH_8A__)
# define __ARM_ARCH__ 8
# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
defined(__ARM_ARCH_7EM__)
# define __ARM_ARCH__ 7
# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \
defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \
defined(__ARM_ARCH_6T2__)
# define __ARM_ARCH__ 6
# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \
defined(__ARM_ARCH_5TEJ__)
# define __ARM_ARCH__ 5
# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
# define __ARM_ARCH__ 4
# else
# error "unsupported ARM architecture"
# endif
# endif
# endif
# if !defined(__ARM_MAX_ARCH__)
# define __ARM_MAX_ARCH__ __ARM_ARCH__
# endif
# if __ARM_MAX_ARCH__<__ARM_ARCH__
# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__"
# elif __ARM_MAX_ARCH__!=__ARM_ARCH__
# if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__)
# error "can't build universal big-endian binary"
# endif
# endif
# ifndef __ASSEMBLER__
extern unsigned int OPENSSL_armcap_P;
extern unsigned int OPENSSL_arm_midr;
extern unsigned int OPENSSL_armv8_rsa_neonized;
# endif
# define ARMV7_NEON (1<<0)
# define ARMV7_TICK (1<<1)
# define ARMV8_AES (1<<2)
# define ARMV8_SHA1 (1<<3)
# define ARMV8_SHA256 (1<<4)
# define ARMV8_PMULL (1<<5)
# define ARMV8_SHA512 (1<<6)
# define ARMV8_CPUID (1<<7)
# define ARMV8_RNG (1<<8)
# define ARMV8_SM3 (1<<9)
# define ARMV8_SM4 (1<<10)
# define ARMV8_SHA3 (1<<11)
# define ARMV8_UNROLL8_EOR3 (1<<12)
# define ARMV8_SVE (1<<13)
# define ARMV8_SVE2 (1<<14)
/*
* MIDR_EL1 system register
*
* 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0
* | | | | | | |
* |RES0 | Implementer | Variant | Arch | PartNum |Revision|
* |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________|
*
*/
# define ARM_CPU_IMP_ARM 0x41
# define HISI_CPU_IMP 0x48
# define ARM_CPU_PART_CORTEX_A72 0xD08
# define ARM_CPU_PART_N1 0xD0C
# define ARM_CPU_PART_V1 0xD40
# define ARM_CPU_PART_N2 0xD49
# define HISI_CPU_PART_KP920 0xD01
# define MIDR_PARTNUM_SHIFT 4
# define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT)
# define MIDR_PARTNUM(midr) \
(((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT)
# define MIDR_IMPLEMENTER_SHIFT 24
# define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT)
# define MIDR_IMPLEMENTER(midr) \
(((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT)
# define MIDR_ARCHITECTURE_SHIFT 16
# define MIDR_ARCHITECTURE_MASK (0xfU << MIDR_ARCHITECTURE_SHIFT)
# define MIDR_ARCHITECTURE(midr) \
(((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT)
# define MIDR_CPU_MODEL_MASK \
(MIDR_IMPLEMENTER_MASK | \
MIDR_PARTNUM_MASK | \
MIDR_ARCHITECTURE_MASK)
# define MIDR_CPU_MODEL(imp, partnum) \
(((imp) << MIDR_IMPLEMENTER_SHIFT) | \
(0xfU << MIDR_ARCHITECTURE_SHIFT) | \
((partnum) << MIDR_PARTNUM_SHIFT))
# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \
(((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum))
#if defined(__ASSEMBLER__)
/*
* Support macros for
* - Armv8.3-A Pointer Authentication and
* - Armv8.5-A Branch Target Identification
* features which require emitting a .note.gnu.property section with the
* appropriate architecture-dependent feature bits set.
* Read more: "ELF for the Arm® 64-bit Architecture"
*/
# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */
# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */
# else
# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */
# define AARCH64_VALID_CALL_TARGET
# endif
# if defined(__ARM_FEATURE_PAC_DEFAULT) && \
(__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */
# define GNU_PROPERTY_AARCH64_POINTER_AUTH \
(1 << 1) /* Has Pointer Authentication */
# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */
# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */
# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \
(__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */
# define GNU_PROPERTY_AARCH64_POINTER_AUTH \
(1 << 1) /* Has Pointer Authentication */
# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */
# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */
# else
# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */
# if GNU_PROPERTY_AARCH64_BTI != 0
# define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET
# else
# define AARCH64_SIGN_LINK_REGISTER
# endif
# define AARCH64_VALIDATE_LINK_REGISTER
# endif
# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0
.pushsection .note.gnu.property, "a";
.balign 8;
.long 4;
.long 0x10;
.long 0x5;
.asciz "GNU";
.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
.long 4;
.long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
.long 0;
.popsection;
# endif
# endif /* defined __ASSEMBLER__ */
# define IS_CPU_SUPPORT_UNROLL8_EOR3() \
(OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3)
#endif