openssl/include/crypto/aes_platform.h

451 lines
22 KiB
C
Raw Normal View History

/*
* Copyright 2019-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#ifndef OSSL_AES_PLATFORM_H
# define OSSL_AES_PLATFORM_H
# pragma once
# include "openssl/aes.h"
# ifdef VPAES_ASM
/*
 * Prototypes for the vpaes_* routines; they are only declared when the
 * build defines VPAES_ASM.
 */
int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
                          AES_KEY *key);
int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
                          AES_KEY *key);

void vpaes_encrypt(const unsigned char *in, unsigned char *out,
                   const AES_KEY *key);
void vpaes_decrypt(const unsigned char *in, unsigned char *out,
                   const AES_KEY *key);

void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
                       size_t length, const AES_KEY *key,
                       unsigned char *ivec, int enc);
# endif /* VPAES_ASM */
# ifdef BSAES_ASM
/*
 * Prototypes for the bsaes_* bulk routines (CBC, CTR32 and XTS); they are
 * only declared when the build defines BSAES_ASM.
 */
void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
                       size_t length, const AES_KEY *key,
                       unsigned char ivec[16], int enc);

void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
                                size_t len, const AES_KEY *key,
                                const unsigned char ivec[16]);

void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
                       size_t len, const AES_KEY *key1,
                       const AES_KEY *key2, const unsigned char iv[16]);
void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
                       size_t len, const AES_KEY *key1,
                       const AES_KEY *key2, const unsigned char iv[16]);
# endif /* BSAES_ASM */
# ifdef AES_CTR_ASM
/* CTR32 routine; only declared when the build defines AES_CTR_ASM. */
void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
                       size_t blocks, const AES_KEY *key,
                       const unsigned char ivec[AES_BLOCK_SIZE]);
# endif /* AES_CTR_ASM */
# ifdef AES_XTS_ASM
/*
 * XTS routines taking two key schedules (key1, key2) and a 16-byte iv;
 * only declared when the build defines AES_XTS_ASM.
 */
void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
                     size_t len, const AES_KEY *key1,
                     const AES_KEY *key2, const unsigned char iv[16]);
void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
                     size_t len, const AES_KEY *key1,
                     const AES_KEY *key2, const unsigned char iv[16]);
# endif /* AES_XTS_ASM */
# if defined(OPENSSL_CPUID_OBJ)
# if (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
/* PowerPC: capability bits are tested against OPENSSL_ppccap_P. */
#  include "ppc_arch.h"
#  ifdef VPAES_ASM
#   define VPAES_CAPABLE            (OPENSSL_ppccap_P & PPC_ALTIVEC)
#  endif
#  define HWAES_CAPABLE             (OPENSSL_ppccap_P & PPC_CRYPTO207)
/* Route the generic HWAES_* names to the aes_p8_* routines. */
#  define HWAES_set_encrypt_key      aes_p8_set_encrypt_key
#  define HWAES_set_decrypt_key      aes_p8_set_decrypt_key
#  define HWAES_encrypt              aes_p8_encrypt
#  define HWAES_decrypt              aes_p8_decrypt
#  define HWAES_cbc_encrypt          aes_p8_cbc_encrypt
#  define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
#  define HWAES_xts_encrypt          aes_p8_xts_encrypt
#  define HWAES_xts_decrypt          aes_p8_xts_decrypt
# endif /* PPC */
# if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
# include "arm_arch.h"
# if __ARM_MAX_ARCH__>=7
/* Both BSAES and VPAES are gated on the ARMV7_NEON bit of OPENSSL_armcap_P. */
# ifdef BSAES_ASM
#  define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
# endif
# ifdef VPAES_ASM
#  define VPAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
# endif
/* Hardware AES is usable when the ARMV8_AES bit is set in OPENSSL_armcap_P. */
# define HWAES_CAPABLE         (OPENSSL_armcap_P & ARMV8_AES)
/* Route the generic HWAES_* names to the aes_v8_* routines. */
# define HWAES_set_encrypt_key aes_v8_set_encrypt_key
# define HWAES_set_decrypt_key aes_v8_set_decrypt_key
# define HWAES_encrypt         aes_v8_encrypt
# define HWAES_decrypt         aes_v8_decrypt
# define HWAES_cbc_encrypt     aes_v8_cbc_encrypt
Optimize AES-ECB mode in OpenSSL for both aarch64 and aarch32. AES-ECB mode can be optimized by interleaving cipher operations on several blocks and by loop unrolling. Interleaving needs an ideal unrolling factor; here we adopt the same factor as aes-cbc, described below: If the number of blocks is > 5, select 5 blocks as one iteration; every loop, decrease the number of blocks by 5. If 3 < remaining blocks < 5, select 3 blocks as one iteration; every loop, decrease the number of blocks by 3. If remaining blocks < 3, treat them as tail blocks. The detailed implementation has a few adjustments to save code space. In this way, for small sizes such as 16 bytes the performance is similar to before, but for big sizes such as 16k bytes the performance improves a lot, even reaching 100%; for some arches such as A57 the improvement even exceeds 100%. The following table lists the encryption performance data on aarch64, taking a72 and a57 as examples. Performance values are in cycles per byte and are presented as a comparison of values. 
List them as below: A72: Before optimization After optimization Improve evp-aes-128-ecb@16 17.26538237 16.82663866 2.61% evp-aes-128-ecb@64 5.50528499 5.222637557 5.41% evp-aes-128-ecb@256 2.632700213 1.908442892 37.95% evp-aes-128-ecb@1024 1.876102047 1.078018868 74.03% evp-aes-128-ecb@8192 1.6550392 0.853982929 93.80% evp-aes-128-ecb@16384 1.636871283 0.847623957 93.11% evp-aes-192-ecb@16 17.73104961 17.09692468 3.71% evp-aes-192-ecb@64 5.78984398 5.418545192 6.85% evp-aes-192-ecb@256 2.872005308 2.081815274 37.96% evp-aes-192-ecb@1024 2.083226672 1.25095642 66.53% evp-aes-192-ecb@8192 1.831992057 0.995916251 83.95% evp-aes-192-ecb@16384 1.821590009 0.993820525 83.29% evp-aes-256-ecb@16 18.0606306 17.96963317 0.51% evp-aes-256-ecb@64 6.19651997 5.762465812 7.53% evp-aes-256-ecb@256 3.176991394 2.24642538 41.42% evp-aes-256-ecb@1024 2.385991919 1.396018192 70.91% evp-aes-256-ecb@8192 2.147862636 1.142222597 88.04% evp-aes-256-ecb@16384 2.131361787 1.135944617 87.63% A57: Before optimization After optimization Improve evp-aes-128-ecb@16 18.61045121 18.36456218 1.34% evp-aes-128-ecb@64 6.438628994 5.467959461 17.75% evp-aes-128-ecb@256 2.957452881 1.97238604 49.94% evp-aes-128-ecb@1024 2.117096219 1.099665054 92.52% evp-aes-128-ecb@8192 1.868385973 0.837440804 123.11% evp-aes-128-ecb@16384 1.853078526 0.822420027 125.32% evp-aes-192-ecb@16 19.07021756 18.50018552 3.08% evp-aes-192-ecb@64 6.672351486 5.696088921 17.14% evp-aes-192-ecb@256 3.260427769 2.131449916 52.97% evp-aes-192-ecb@1024 2.410522832 1.250529718 92.76% evp-aes-192-ecb@8192 2.17921605 0.973225504 123.92% evp-aes-192-ecb@16384 2.162250997 0.95919871 125.42% evp-aes-256-ecb@16 19.3008384 19.12743654 0.91% evp-aes-256-ecb@64 6.992950658 5.92149541 18.09% evp-aes-256-ecb@256 3.576361743 2.287619504 56.34% evp-aes-256-ecb@1024 2.726671027 1.381267599 97.40% evp-aes-256-ecb@8192 2.493583657 1.110959913 124.45% evp-aes-256-ecb@16384 2.473916816 1.099967073 124.91% Change-Id: 
Iccd23d972e0d52d22dc093f4c208f69c9d5a0ca7 Reviewed-by: Shane Lontis <shane.lontis@oracle.com> Reviewed-by: Richard Levitte <levitte@openssl.org> (Merged from https://github.com/openssl/openssl/pull/10518)
2019-11-07 10:36:45 +08:00
/* Alias the generic HWAES_ecb_encrypt name to aes_v8_ecb_encrypt. */
# define HWAES_ecb_encrypt aes_v8_ecb_encrypt
Optimize AES-XTS mode in OpenSSL for aarch64. AES-XTS mode can be optimized by interleaving cipher operations on several blocks and by loop unrolling. Interleaving needs an ideal unrolling factor; here we adopt the same factor as aes-cbc, described below: If the number of blocks is > 5, select 5 blocks as one iteration; every loop, decrease the number of blocks by 5. If remaining blocks < 5, treat them as tail blocks. The detailed implementation has a few adjustments to save code space. In this way, for small sizes such as 16 bytes the performance is similar to before, but for big sizes such as 16k bytes the performance improves a lot, even reaching a 2x uplift; for some arches such as A57 the improvement even exceeds a 2x uplift. We collected performance data on many different micro-archs such as thunderx2, ampere-emag, a72, a75, a57, a53 and N1, all of which reach a 0.5-2x uplift. The following table lists the encryption performance data on aarch64, taking a72, a75, a57, a53 and N1 as examples. Performance values are in cycles per byte and are presented as a comparison of values. 
List them as below: A72: Before optimization After optimization Improve evp-aes-128-xts@16 8.899913518 5.949087263 49.60% evp-aes-128-xts@64 4.525512668 3.389141845 33.53% evp-aes-128-xts@256 3.502906908 1.633573479 114.43% evp-aes-128-xts@1024 3.174210419 1.155952639 174.60% evp-aes-128-xts@8192 3.053019303 1.028134888 196.95% evp-aes-128-xts@16384 3.025292462 1.02021169 196.54% evp-aes-256-xts@16 9.971105023 6.754233758 47.63% evp-aes-256-xts@64 4.931479093 3.786527393 30.24% evp-aes-256-xts@256 3.746788153 1.943975947 92.74% evp-aes-256-xts@1024 3.401743802 1.477394648 130.25% evp-aes-256-xts@8192 3.278769327 1.32950421 146.62% evp-aes-256-xts@16384 3.27093296 1.325276257 146.81% A75: Before optimization After optimization Improve evp-aes-128-xts@16 8.397965173 5.126839098 63.80% evp-aes-128-xts@64 4.176860631 2.59817764 60.76% evp-aes-128-xts@256 3.069126585 1.284561028 138.92% evp-aes-128-xts@1024 2.805962699 0.932754655 200.83% evp-aes-128-xts@8192 2.725820131 0.829820397 228.48% evp-aes-128-xts@16384 2.71521905 0.823251591 229.82% evp-aes-256-xts@16 11.24790935 7.383914448 52.33% evp-aes-256-xts@64 5.294128847 3.048641998 73.66% evp-aes-256-xts@256 3.861649617 1.570359905 145.91% evp-aes-256-xts@1024 3.537646797 1.200493533 194.68% evp-aes-256-xts@8192 3.435353012 1.085345319 216.52% evp-aes-256-xts@16384 3.437952563 1.097963822 213.12% A57: Before optimization After optimization Improve evp-aes-128-xts@16 10.57455446 7.165438012 47.58% evp-aes-128-xts@64 5.418185447 3.721241202 45.60% evp-aes-128-xts@256 3.855184592 1.747145379 120.66% evp-aes-128-xts@1024 3.477199757 1.253049735 177.50% evp-aes-128-xts@8192 3.36768104 1.091943159 208.41% evp-aes-128-xts@16384 3.360373443 1.088942789 208.59% evp-aes-256-xts@16 12.54559459 8.745489036 43.45% evp-aes-256-xts@64 6.542808937 4.326387568 51.23% evp-aes-256-xts@256 4.62668822 2.119908754 118.25% evp-aes-256-xts@1024 4.161716505 1.557335554 167.23% evp-aes-256-xts@8192 4.032462227 1.377749511 192.68% 
evp-aes-256-xts@16384 4.023293877 1.371558933 193.34% A53: Before optimization After optimization Improve evp-aes-128-xts@16 18.07842135 13.96980808 29.40% evp-aes-128-xts@64 7.933818397 6.07159276 30.70% evp-aes-128-xts@256 5.264604704 2.611155744 101.60% evp-aes-128-xts@1024 4.606660117 1.722713454 167.40% evp-aes-128-xts@8192 4.405160115 1.454379201 202.90% evp-aes-128-xts@16384 4.401592028 1.442279392 205.20% evp-aes-256-xts@16 20.07084054 16.00803726 25.40% evp-aes-256-xts@64 9.192647294 6.883876732 33.50% evp-aes-256-xts@256 6.336143161 3.108140452 103.90% evp-aes-256-xts@1024 5.62502952 2.097960651 168.10% evp-aes-256-xts@8192 5.412085608 1.807294191 199.50% evp-aes-256-xts@16384 5.403062591 1.790135764 201.80% N1: Before optimization After optimization Improve evp-aes-128-xts@16 6.48147613 4.209415473 53.98% evp-aes-128-xts@64 2.847744115 1.950757468 45.98% evp-aes-128-xts@256 2.085711968 1.061903238 96.41% evp-aes-128-xts@1024 1.842014669 0.798486302 130.69% evp-aes-128-xts@8192 1.760449052 0.713853939 146.61% evp-aes-128-xts@16384 1.760763546 0.707702009 148.80% evp-aes-256-xts@16 7.264142817 5.265970454 37.94% evp-aes-256-xts@64 3.251356212 2.41176323 34.81% evp-aes-256-xts@256 2.380488469 1.342095742 77.37% evp-aes-256-xts@1024 2.08853022 1.041718215 100.49% evp-aes-256-xts@8192 2.027432668 0.944571334 114.64% evp-aes-256-xts@16384 2.00740782 0.941991415 113.10% Add more XTS test cases to cover the cipher stealing mode and cases of different number of blocks. CustomizedGitHooks: yes Change-Id: I93ee31b2575e1413764e27b599af62994deb4c96 Reviewed-by: Paul Dale <paul.dale@oracle.com> Reviewed-by: Tomas Mraz <tmraz@fedoraproject.org> (Merged from https://github.com/openssl/openssl/pull/11399)
2020-03-13 11:27:34 +08:00
/* The XTS aliases are only provided when __ARM_MAX_ARCH__ is at least 8. */
# if __ARM_MAX_ARCH__ >= 8
#  define HWAES_xts_encrypt aes_v8_xts_encrypt
#  define HWAES_xts_decrypt aes_v8_xts_decrypt
# endif
# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
/* Requires both the ARMV8_PMULL and the ARMV8_AES capability bits. */
# define AES_PMULL_CAPABLE \
    ((OPENSSL_armcap_P & ARMV8_PMULL) && (OPENSSL_armcap_P & ARMV8_AES))
/* NOTE(review): presumably byte thresholds for the bulk GCM code - confirm. */
# define AES_GCM_ENC_BYTES 512
# define AES_GCM_DEC_BYTES 512
# if __ARM_MAX_ARCH__ >= 8
/* The generic GCM entry points resolve to the armv8_aes_gcm_* routines. */
#  define AES_gcm_encrypt armv8_aes_gcm_encrypt
#  define AES_gcm_decrypt armv8_aes_gcm_decrypt
/*
 * True when gctx uses aes_v8_ctr32_encrypt_blocks as its CTR routine and
 * gcm_ghash_v8 as its GHASH routine.
 */
#  define AES_GCM_ASM(gctx) \
    ((gctx)->ctr == aes_v8_ctr32_encrypt_blocks && \
     (gctx)->gcm.ghash == gcm_ghash_v8)

/* Per-key-size encrypt kernels. */
size_t aes_gcm_enc_128_kernel(const uint8_t *plaintext,
                              uint64_t plaintext_length,
                              uint8_t *ciphertext, uint64_t *Xi,
                              unsigned char ivec[16], const void *key);
size_t aes_gcm_enc_192_kernel(const uint8_t *plaintext,
                              uint64_t plaintext_length,
                              uint8_t *ciphertext, uint64_t *Xi,
                              unsigned char ivec[16], const void *key);
size_t aes_gcm_enc_256_kernel(const uint8_t *plaintext,
                              uint64_t plaintext_length,
                              uint8_t *ciphertext, uint64_t *Xi,
                              unsigned char ivec[16], const void *key);

/* Per-key-size decrypt kernels. */
size_t aes_gcm_dec_128_kernel(const uint8_t *ciphertext,
                              uint64_t plaintext_length,
                              uint8_t *plaintext, uint64_t *Xi,
                              unsigned char ivec[16], const void *key);
size_t aes_gcm_dec_192_kernel(const uint8_t *ciphertext,
                              uint64_t plaintext_length,
                              uint8_t *plaintext, uint64_t *Xi,
                              unsigned char ivec[16], const void *key);
size_t aes_gcm_dec_256_kernel(const uint8_t *ciphertext,
                              uint64_t plaintext_length,
                              uint8_t *plaintext, uint64_t *Xi,
                              unsigned char ivec[16], const void *key);

/* Targets of the AES_gcm_encrypt / AES_gcm_decrypt aliases above. */
size_t armv8_aes_gcm_encrypt(const unsigned char *in, unsigned char *out,
                             size_t len, const void *key,
                             unsigned char ivec[16], u64 *Xi);
size_t armv8_aes_gcm_decrypt(const unsigned char *in, unsigned char *out,
                             size_t len, const void *key,
                             unsigned char ivec[16], u64 *Xi);

/* GHASH routine that AES_GCM_ASM() compares against. */
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
# endif
# endif
# endif /* OPENSSL_CPUID_OBJ */
# if defined(AES_ASM) && \
     (defined(__x86_64) || defined(__x86_64__) || \
      defined(_M_AMD64) || defined(_M_X64))
/* x86-64 with AES_ASM: the CBC-HMAC-SHA check tests OPENSSL_ia32cap_P[1]. */
#  define AES_CBC_HMAC_SHA_CAPABLE   1
#  define AESNI_CBC_HMAC_SHA_CAPABLE (OPENSSL_ia32cap_P[1] & (1 << (57 - 32)))
# endif
# if defined(AES_ASM) && !defined(I386_ONLY) && ( \
((defined(__i386) || defined(__i386__) || \
defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
defined(__x86_64) || defined(__x86_64__) || \
defined(_M_AMD64) || defined(_M_X64) )
/* AES-NI section */
/* Capability checks test individual bits of OPENSSL_ia32cap_P[1]. */
# define AESNI_CAPABLE  (OPENSSL_ia32cap_P[1] & (1 << (57 - 32)))
# ifdef VPAES_ASM
#  define VPAES_CAPABLE (OPENSSL_ia32cap_P[1] & (1 << (41 - 32)))
# endif
# ifdef BSAES_ASM
#  define BSAES_CAPABLE (OPENSSL_ia32cap_P[1] & (1 << (41 - 32)))
# endif
/* NOTE(review): presumably byte thresholds for the bulk GCM code - confirm. */
# define AES_GCM_ENC_BYTES 32
# define AES_GCM_DEC_BYTES 16
/* Key setup. */
int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
                          AES_KEY *key);
int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
                          AES_KEY *key);

/* Single-block routines. */
void aesni_encrypt(const unsigned char *in, unsigned char *out,
                   const AES_KEY *key);
void aesni_decrypt(const unsigned char *in, unsigned char *out,
                   const AES_KEY *key);

/* ECB and CBC; enc selects the direction. */
void aesni_ecb_encrypt(const unsigned char *in, unsigned char *out,
                       size_t length, const AES_KEY *key, int enc);
void aesni_cbc_encrypt(const unsigned char *in, unsigned char *out,
                       size_t length, const AES_KEY *key,
                       unsigned char *ivec, int enc);

# ifndef OPENSSL_NO_OCB
/* OCB; omitted when OPENSSL_NO_OCB is defined. */
void aesni_ocb_encrypt(const unsigned char *in, unsigned char *out,
                       size_t blocks, const void *key,
                       size_t start_block_num,
                       unsigned char offset_i[16],
                       const unsigned char L_[][16],
                       unsigned char checksum[16]);
void aesni_ocb_decrypt(const unsigned char *in, unsigned char *out,
                       size_t blocks, const void *key,
                       size_t start_block_num,
                       unsigned char offset_i[16],
                       const unsigned char L_[][16],
                       unsigned char checksum[16]);
# endif /* OPENSSL_NO_OCB */

/* CTR32. */
void aesni_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
                                size_t blocks, const void *key,
                                const unsigned char *ivec);

/* XTS with two key schedules. */
void aesni_xts_encrypt(const unsigned char *in, unsigned char *out,
                       size_t length,
                       const AES_KEY *key1, const AES_KEY *key2,
                       const unsigned char iv[16]);
void aesni_xts_decrypt(const unsigned char *in, unsigned char *out,
                       size_t length,
                       const AES_KEY *key1, const AES_KEY *key2,
                       const unsigned char iv[16]);

/* CCM64; cmac is a 16-byte buffer passed without const. */
void aesni_ccm64_encrypt_blocks(const unsigned char *in, unsigned char *out,
                                size_t blocks, const void *key,
                                const unsigned char ivec[16],
                                unsigned char cmac[16]);
void aesni_ccm64_decrypt_blocks(const unsigned char *in, unsigned char *out,
                                size_t blocks, const void *key,
                                const unsigned char ivec[16],
                                unsigned char cmac[16]);
# if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
/* Bulk GCM routines and the GHASH routine, declared for x86-64 only. */
size_t aesni_gcm_encrypt(const unsigned char *in, unsigned char *out,
                         size_t len, const void *key,
                         unsigned char ivec[16], u64 *Xi);
size_t aesni_gcm_decrypt(const unsigned char *in, unsigned char *out,
                         size_t len, const void *key,
                         unsigned char ivec[16], u64 *Xi);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *in,
                   size_t len);
/* The generic GCM entry points resolve to the aesni_gcm_* routines. */
# define AES_gcm_encrypt aesni_gcm_encrypt
# define AES_gcm_decrypt aesni_gcm_decrypt
/*
 * True when ctx uses aesni_ctr32_encrypt_blocks as its CTR routine and
 * gcm_ghash_avx as its GHASH routine.  The parameter is parenthesized so
 * that arguments such as `&obj` or `p + 1` expand correctly.
 */
# define AES_GCM_ASM(ctx) \
    ((ctx)->ctr == aesni_ctr32_encrypt_blocks && \
     (ctx)->gcm.ghash == gcm_ghash_avx)
# endif
# elif defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
/* Fujitsu SPARC64 X support */
extern unsigned int OPENSSL_sparcv9cap_P[];
# include "sparc_arch.h"
/* SPARC_AES_CAPABLE tests word 1 of the capability vector, HWAES_CAPABLE word 0. */
# define SPARC_AES_CAPABLE          (OPENSSL_sparcv9cap_P[1] & CFR_AES)
# define HWAES_CAPABLE              (OPENSSL_sparcv9cap_P[0] & SPARCV9_FJAESX)
/* Route the generic HWAES_* names to the aes_fx_* routines. */
# define HWAES_set_encrypt_key      aes_fx_set_encrypt_key
# define HWAES_set_decrypt_key      aes_fx_set_decrypt_key
# define HWAES_encrypt              aes_fx_encrypt
# define HWAES_decrypt              aes_fx_decrypt
# define HWAES_cbc_encrypt          aes_fx_cbc_encrypt
# define HWAES_ctr32_encrypt_blocks aes_fx_ctr32_encrypt_blocks
/* T4 key setup and single-block routines. */
void aes_t4_set_encrypt_key(const unsigned char *key, int bits,
                            AES_KEY *ks);
void aes_t4_set_decrypt_key(const unsigned char *key, int bits,
                            AES_KEY *ks);
void aes_t4_encrypt(const unsigned char *in, unsigned char *out,
                    const AES_KEY *key);
void aes_t4_decrypt(const unsigned char *in, unsigned char *out,
                    const AES_KEY *key);

/*
 * Why the bulk routines below are specific to a key length:
 * a SPARC T4 core runs up to 8 threads that share the core's resources.
 * Keeping as much key material as possible in registers minimizes
 * references to the shared memory interface and the number of
 * instructions in inner loops [much needed on T4].  Routines that were
 * not key-length specific would need conditional branches either in the
 * inner loops or at subroutine entry.  The former is hardly acceptable,
 * and the latter grows the code to about the size of the separate
 * key-length specific subroutines anyway.
 */

/* CBC, per key length and direction. */
void aes128_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
                           size_t len, const AES_KEY *key,
                           unsigned char *ivec);
void aes128_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
                           size_t len, const AES_KEY *key,
                           unsigned char *ivec);
void aes192_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
                           size_t len, const AES_KEY *key,
                           unsigned char *ivec);
void aes192_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
                           size_t len, const AES_KEY *key,
                           unsigned char *ivec);
void aes256_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
                           size_t len, const AES_KEY *key,
                           unsigned char *ivec);
void aes256_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
                           size_t len, const AES_KEY *key,
                           unsigned char *ivec);

/* CTR32, per key length. */
void aes128_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
                             size_t blocks, const AES_KEY *key,
                             unsigned char *ivec);
void aes192_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
                             size_t blocks, const AES_KEY *key,
                             unsigned char *ivec);
void aes256_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
                             size_t blocks, const AES_KEY *key,
                             unsigned char *ivec);

/* XTS, declared for the 128- and 256-bit variants only. */
void aes128_t4_xts_encrypt(const unsigned char *in, unsigned char *out,
                           size_t blocks, const AES_KEY *key1,
                           const AES_KEY *key2, const unsigned char *ivec);
void aes128_t4_xts_decrypt(const unsigned char *in, unsigned char *out,
                           size_t blocks, const AES_KEY *key1,
                           const AES_KEY *key2, const unsigned char *ivec);
void aes256_t4_xts_encrypt(const unsigned char *in, unsigned char *out,
                           size_t blocks, const AES_KEY *key1,
                           const AES_KEY *key2, const unsigned char *ivec);
void aes256_t4_xts_decrypt(const unsigned char *in, unsigned char *out,
                           size_t blocks, const AES_KEY *key1,
                           const AES_KEY *key2, const unsigned char *ivec);
# elif defined(OPENSSL_CPUID_OBJ) && defined(__s390__)
/* IBM S390X support */
# include "s390x_arch.h"
/*
 * Convert key size to function code: [16,24,32] -> [18,19,20].
 * (keylen << 3) is the key length in bits; subtracting 128 and shifting
 * right by 6 gives 0, 1 or 2, which is then added to S390X_AES_128.
 */
# define S390X_AES_FC(keylen) (S390X_AES_128 + ((((keylen) << 3) - 128) >> 6))
/* Most modes of operation need km for partial block processing. */
# define S390X_aes_128_CAPABLE \
    (OPENSSL_s390xcap_P.km[0] & S390X_CAPBIT(S390X_AES_128))
# define S390X_aes_192_CAPABLE \
    (OPENSSL_s390xcap_P.km[0] & S390X_CAPBIT(S390X_AES_192))
# define S390X_aes_256_CAPABLE \
    (OPENSSL_s390xcap_P.km[0] & S390X_CAPBIT(S390X_AES_256))

/* CBC: always reported capable here; checked by callee. */
# define S390X_aes_128_cbc_CAPABLE 1 /* checked by callee */
# define S390X_aes_192_cbc_CAPABLE 1
# define S390X_aes_256_cbc_CAPABLE 1

/* ECB: same condition as the base km check. */
# define S390X_aes_128_ecb_CAPABLE S390X_aes_128_CAPABLE
# define S390X_aes_192_ecb_CAPABLE S390X_aes_192_CAPABLE
# define S390X_aes_256_ecb_CAPABLE S390X_aes_256_CAPABLE

/* OFB: base km check plus the matching kmo bit. */
# define S390X_aes_128_ofb_CAPABLE \
    (S390X_aes_128_CAPABLE && \
     (OPENSSL_s390xcap_P.kmo[0] & S390X_CAPBIT(S390X_AES_128)))
# define S390X_aes_192_ofb_CAPABLE \
    (S390X_aes_192_CAPABLE && \
     (OPENSSL_s390xcap_P.kmo[0] & S390X_CAPBIT(S390X_AES_192)))
# define S390X_aes_256_ofb_CAPABLE \
    (S390X_aes_256_CAPABLE && \
     (OPENSSL_s390xcap_P.kmo[0] & S390X_CAPBIT(S390X_AES_256)))

/* CFB: base km check plus the matching kmf bit. */
# define S390X_aes_128_cfb_CAPABLE \
    (S390X_aes_128_CAPABLE && \
     (OPENSSL_s390xcap_P.kmf[0] & S390X_CAPBIT(S390X_AES_128)))
# define S390X_aes_192_cfb_CAPABLE \
    (S390X_aes_192_CAPABLE && \
     (OPENSSL_s390xcap_P.kmf[0] & S390X_CAPBIT(S390X_AES_192)))
# define S390X_aes_256_cfb_CAPABLE \
    (S390X_aes_256_CAPABLE && \
     (OPENSSL_s390xcap_P.kmf[0] & S390X_CAPBIT(S390X_AES_256)))

/* CFB8: only the kmf bit is tested. */
# define S390X_aes_128_cfb8_CAPABLE \
    (OPENSSL_s390xcap_P.kmf[0] & S390X_CAPBIT(S390X_AES_128))
# define S390X_aes_192_cfb8_CAPABLE \
    (OPENSSL_s390xcap_P.kmf[0] & S390X_CAPBIT(S390X_AES_192))
# define S390X_aes_256_cfb8_CAPABLE \
    (OPENSSL_s390xcap_P.kmf[0] & S390X_CAPBIT(S390X_AES_256))

/* CFB1: hard-wired to 0. */
# define S390X_aes_128_cfb1_CAPABLE 0
# define S390X_aes_192_cfb1_CAPABLE 0
# define S390X_aes_256_cfb1_CAPABLE 0

/* CTR: always reported capable here; checked by callee. */
# define S390X_aes_128_ctr_CAPABLE 1 /* checked by callee */
# define S390X_aes_192_ctr_CAPABLE 1
# define S390X_aes_256_ctr_CAPABLE 1

/* XTS: only 128 and 256 are defined; checked by callee. */
# define S390X_aes_128_xts_CAPABLE 1 /* checked by callee */
# define S390X_aes_256_xts_CAPABLE 1

/* GCM: base km check plus the matching kma bit. */
# define S390X_aes_128_gcm_CAPABLE \
    (S390X_aes_128_CAPABLE && \
     (OPENSSL_s390xcap_P.kma[0] & S390X_CAPBIT(S390X_AES_128)))
# define S390X_aes_192_gcm_CAPABLE \
    (S390X_aes_192_CAPABLE && \
     (OPENSSL_s390xcap_P.kma[0] & S390X_CAPBIT(S390X_AES_192)))
# define S390X_aes_256_gcm_CAPABLE \
    (S390X_aes_256_CAPABLE && \
     (OPENSSL_s390xcap_P.kma[0] & S390X_CAPBIT(S390X_AES_256)))

/* CCM: base km check plus the matching kmac bit. */
# define S390X_aes_128_ccm_CAPABLE \
    (S390X_aes_128_CAPABLE && \
     (OPENSSL_s390xcap_P.kmac[0] & S390X_CAPBIT(S390X_AES_128)))
# define S390X_aes_192_ccm_CAPABLE \
    (S390X_aes_192_CAPABLE && \
     (OPENSSL_s390xcap_P.kmac[0] & S390X_CAPBIT(S390X_AES_192)))
# define S390X_aes_256_ccm_CAPABLE \
    (S390X_aes_256_CAPABLE && \
     (OPENSSL_s390xcap_P.kmac[0] & S390X_CAPBIT(S390X_AES_256)))
/* NOTE(review): presumably the AAD-present bit of the CCM flags byte - confirm. */
# define S390X_CCM_AAD_FLAG 0x40
# ifndef OPENSSL_NO_OCB
/* The OCB capability macros are hard-wired to 0. */
#  define S390X_aes_128_ocb_CAPABLE 0
#  define S390X_aes_192_ocb_CAPABLE 0
#  define S390X_aes_256_ocb_CAPABLE 0
# endif /* OPENSSL_NO_OCB */
# ifndef OPENSSL_NO_SIV
/* The SIV capability macros are hard-wired to 0. */
#  define S390X_aes_128_siv_CAPABLE 0
#  define S390X_aes_192_siv_CAPABLE 0
#  define S390X_aes_256_siv_CAPABLE 0
# endif /* OPENSSL_NO_SIV */
/*
 * S390X_AES_FC() is defined once, near the top of this S390X section;
 * it is intentionally not repeated here so there is a single definition
 * to maintain.
 */
# endif
# if defined(HWAES_CAPABLE)
/*
 * Prototypes under the generic HWAES_* names.  Where a platform section
 * has #defined one of these names as an alias, the declaration applies
 * to the aliased routine.
 */
int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits,
                          AES_KEY *key);
int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits,
                          AES_KEY *key);

void HWAES_encrypt(const unsigned char *in, unsigned char *out,
                   const AES_KEY *key);
void HWAES_decrypt(const unsigned char *in, unsigned char *out,
                   const AES_KEY *key);

void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
                       size_t length, const AES_KEY *key,
                       unsigned char *ivec, const int enc);
Optimize AES-ECB mode in OpenSSL for both aarch64 and aarch32. AES-ECB mode can be optimized by interleaving cipher operations on several blocks and by loop unrolling. Interleaving needs an ideal unrolling factor; here we adopt the same factor as aes-cbc, described below: If the number of blocks is > 5, select 5 blocks as one iteration; every loop, decrease the number of blocks by 5. If 3 < remaining blocks < 5, select 3 blocks as one iteration; every loop, decrease the number of blocks by 3. If remaining blocks < 3, treat them as tail blocks. The detailed implementation has a few adjustments to save code space. In this way, for small sizes such as 16 bytes the performance is similar to before, but for big sizes such as 16k bytes the performance improves a lot, even reaching 100%; for some arches such as A57 the improvement even exceeds 100%. The following table lists the encryption performance data on aarch64, taking a72 and a57 as examples. Performance values are in cycles per byte and are presented as a comparison of values. 
List them as below: A72: Before optimization After optimization Improve evp-aes-128-ecb@16 17.26538237 16.82663866 2.61% evp-aes-128-ecb@64 5.50528499 5.222637557 5.41% evp-aes-128-ecb@256 2.632700213 1.908442892 37.95% evp-aes-128-ecb@1024 1.876102047 1.078018868 74.03% evp-aes-128-ecb@8192 1.6550392 0.853982929 93.80% evp-aes-128-ecb@16384 1.636871283 0.847623957 93.11% evp-aes-192-ecb@16 17.73104961 17.09692468 3.71% evp-aes-192-ecb@64 5.78984398 5.418545192 6.85% evp-aes-192-ecb@256 2.872005308 2.081815274 37.96% evp-aes-192-ecb@1024 2.083226672 1.25095642 66.53% evp-aes-192-ecb@8192 1.831992057 0.995916251 83.95% evp-aes-192-ecb@16384 1.821590009 0.993820525 83.29% evp-aes-256-ecb@16 18.0606306 17.96963317 0.51% evp-aes-256-ecb@64 6.19651997 5.762465812 7.53% evp-aes-256-ecb@256 3.176991394 2.24642538 41.42% evp-aes-256-ecb@1024 2.385991919 1.396018192 70.91% evp-aes-256-ecb@8192 2.147862636 1.142222597 88.04% evp-aes-256-ecb@16384 2.131361787 1.135944617 87.63% A57: Before optimization After optimization Improve evp-aes-128-ecb@16 18.61045121 18.36456218 1.34% evp-aes-128-ecb@64 6.438628994 5.467959461 17.75% evp-aes-128-ecb@256 2.957452881 1.97238604 49.94% evp-aes-128-ecb@1024 2.117096219 1.099665054 92.52% evp-aes-128-ecb@8192 1.868385973 0.837440804 123.11% evp-aes-128-ecb@16384 1.853078526 0.822420027 125.32% evp-aes-192-ecb@16 19.07021756 18.50018552 3.08% evp-aes-192-ecb@64 6.672351486 5.696088921 17.14% evp-aes-192-ecb@256 3.260427769 2.131449916 52.97% evp-aes-192-ecb@1024 2.410522832 1.250529718 92.76% evp-aes-192-ecb@8192 2.17921605 0.973225504 123.92% evp-aes-192-ecb@16384 2.162250997 0.95919871 125.42% evp-aes-256-ecb@16 19.3008384 19.12743654 0.91% evp-aes-256-ecb@64 6.992950658 5.92149541 18.09% evp-aes-256-ecb@256 3.576361743 2.287619504 56.34% evp-aes-256-ecb@1024 2.726671027 1.381267599 97.40% evp-aes-256-ecb@8192 2.493583657 1.110959913 124.45% evp-aes-256-ecb@16384 2.473916816 1.099967073 124.91% Change-Id: 
Iccd23d972e0d52d22dc093f4c208f69c9d5a0ca7 Reviewed-by: Shane Lontis <shane.lontis@oracle.com> Reviewed-by: Richard Levitte <levitte@openssl.org> (Merged from https://github.com/openssl/openssl/pull/10518)
2019-11-07 10:36:45 +08:00
/* ECB; enc selects the direction. */
void HWAES_ecb_encrypt(const unsigned char *in, unsigned char *out,
                       size_t length, const AES_KEY *key,
                       const int enc);

/* CTR32; note that key is passed as const void * here. */
void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
                                size_t len, const void *key,
                                const unsigned char ivec[16]);

/* XTS with two key schedules. */
void HWAES_xts_encrypt(const unsigned char *inp, unsigned char *out,
                       size_t len, const AES_KEY *key1,
                       const AES_KEY *key2, const unsigned char iv[16]);
void HWAES_xts_decrypt(const unsigned char *inp, unsigned char *out,
                       size_t len, const AES_KEY *key1,
                       const AES_KEY *key2, const unsigned char iv[16]);
# ifndef OPENSSL_NO_OCB
/*
 * OCB hooks.  If a platform section has already #defined HWAES_ocb_encrypt
 * or HWAES_ocb_decrypt, declare the routine under that name; otherwise
 * define the name as a null ocb128_f pointer.
 */
#  ifdef HWAES_ocb_encrypt
void HWAES_ocb_encrypt(const unsigned char *in, unsigned char *out,
                       size_t blocks, const void *key,
                       size_t start_block_num,
                       unsigned char offset_i[16],
                       const unsigned char L_[][16],
                       unsigned char checksum[16]);
#  else
#   define HWAES_ocb_encrypt ((ocb128_f)NULL)
#  endif
#  ifdef HWAES_ocb_decrypt
void HWAES_ocb_decrypt(const unsigned char *in, unsigned char *out,
                       size_t blocks, const void *key,
                       size_t start_block_num,
                       unsigned char offset_i[16],
                       const unsigned char L_[][16],
                       unsigned char checksum[16]);
#  else
#   define HWAES_ocb_decrypt ((ocb128_f)NULL)
#  endif
# endif /* OPENSSL_NO_OCB */
# endif /* HWAES_CAPABLE */
#endif /* OSSL_AES_PLATFORM_H */