mirror of
https://github.com/openssl/openssl.git
synced 2025-03-31 20:10:45 +08:00
evp/e_chacha20_poly1305.c: further improve small-fragment TLS performance.
Improvement coefficients vary with TLS fragment length and platform: on most Intel processors the maximum improvement is ~50%, while on Ryzen it is 80%. The "secret" is a new dedicated ChaCha20_128 code path and vectorized xor helpers. Reviewed-by: Rich Salz <rsalz@openssl.org> (Merged from https://github.com/openssl/openssl/pull/6638)
This commit is contained in:
parent
2ce71b6027
commit
0edb109f97
@ -196,14 +196,23 @@ static int chacha20_poly1305_init_key(EVP_CIPHER_CTX *ctx,
|
||||
}
|
||||
|
||||
# if !defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
|
||||
# if defined(POLY1305_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
|
||||
defined(_M_AMD64) || defined(_M_X64))
|
||||
# define XOR128_HELPERS
|
||||
void *xor128_encrypt_n_pad(void *out, const void *inp, void *otp, size_t len);
|
||||
void *xor128_decrypt_n_pad(void *out, const void *inp, void *otp, size_t len);
|
||||
static const unsigned char zero[4 * CHACHA_BLK_SIZE] = { 0 };
|
||||
# else
|
||||
static const unsigned char zero[2 * CHACHA_BLK_SIZE] = { 0 };
|
||||
# endif
|
||||
|
||||
static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
|
||||
const unsigned char *in, size_t len)
|
||||
{
|
||||
EVP_CHACHA_AEAD_CTX *actx = aead_data(ctx);
|
||||
size_t i, tail, tohash_len, plen = actx->tls_payload_length;
|
||||
unsigned char *buf, *tohash, *ctr, storage[2 * CHACHA_BLK_SIZE + 32];
|
||||
size_t tail, tohash_len, buf_len, plen = actx->tls_payload_length;
|
||||
unsigned char *buf, *tohash, *ctr, storage[sizeof(zero) + 32];
|
||||
|
||||
if (len != plen + POLY1305_BLOCK_SIZE)
|
||||
return -1;
|
||||
@ -212,9 +221,11 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
|
||||
ctr = buf + CHACHA_BLK_SIZE;
|
||||
tohash = buf + CHACHA_BLK_SIZE - POLY1305_BLOCK_SIZE;
|
||||
|
||||
if (plen <= CHACHA_BLK_SIZE) {
|
||||
# ifdef XOR128_HELPERS
|
||||
if (plen <= 3 * CHACHA_BLK_SIZE) {
|
||||
actx->key.counter[0] = 0;
|
||||
ChaCha20_ctr32(buf, zero, 2 * CHACHA_BLK_SIZE, actx->key.key.d,
|
||||
buf_len = (plen + 2 * CHACHA_BLK_SIZE - 1) & (0 - CHACHA_BLK_SIZE);
|
||||
ChaCha20_ctr32(buf, zero, buf_len, actx->key.key.d,
|
||||
actx->key.counter);
|
||||
Poly1305_Init(POLY1305_ctx(actx), buf);
|
||||
actx->key.partial_len = 0;
|
||||
@ -223,6 +234,31 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
|
||||
actx->len.aad = EVP_AEAD_TLS1_AAD_LEN;
|
||||
actx->len.text = plen;
|
||||
|
||||
if (plen) {
|
||||
if (ctx->encrypt)
|
||||
ctr = xor128_encrypt_n_pad(out, in, ctr, plen);
|
||||
else
|
||||
ctr = xor128_decrypt_n_pad(out, in, ctr, plen);
|
||||
|
||||
in += plen;
|
||||
out += plen;
|
||||
tohash_len = (size_t)(ctr - tohash);
|
||||
}
|
||||
}
|
||||
# else
|
||||
if (plen <= CHACHA_BLK_SIZE) {
|
||||
size_t i;
|
||||
|
||||
actx->key.counter[0] = 0;
|
||||
ChaCha20_ctr32(buf, zero, (buf_len = 2 * CHACHA_BLK_SIZE),
|
||||
actx->key.key.d, actx->key.counter);
|
||||
Poly1305_Init(POLY1305_ctx(actx), buf);
|
||||
actx->key.partial_len = 0;
|
||||
memcpy(tohash, actx->tls_aad, POLY1305_BLOCK_SIZE);
|
||||
tohash_len = POLY1305_BLOCK_SIZE;
|
||||
actx->len.aad = EVP_AEAD_TLS1_AAD_LEN;
|
||||
actx->len.text = plen;
|
||||
|
||||
if (ctx->encrypt) {
|
||||
for (i = 0; i < plen; i++) {
|
||||
out[i] = ctr[i] ^= in[i];
|
||||
@ -242,10 +278,12 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
|
||||
memset(ctr + i, 0, tail);
|
||||
ctr += i + tail;
|
||||
tohash_len += i + tail;
|
||||
} else {
|
||||
}
|
||||
# endif
|
||||
else {
|
||||
actx->key.counter[0] = 0;
|
||||
ChaCha20_ctr32(buf, zero, CHACHA_BLK_SIZE, actx->key.key.d,
|
||||
actx->key.counter);
|
||||
ChaCha20_ctr32(buf, zero, (buf_len = CHACHA_BLK_SIZE),
|
||||
actx->key.key.d, actx->key.counter);
|
||||
Poly1305_Init(POLY1305_ctx(actx), buf);
|
||||
actx->key.counter[0] = 1;
|
||||
actx->key.partial_len = 0;
|
||||
@ -300,7 +338,7 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
|
||||
}
|
||||
|
||||
Poly1305_Update(POLY1305_ctx(actx), tohash, tohash_len);
|
||||
OPENSSL_cleanse(buf, 2 * CHACHA_BLK_SIZE);
|
||||
OPENSSL_cleanse(buf, buf_len);
|
||||
Poly1305_Final(POLY1305_ctx(actx), ctx->encrypt ? actx->tag
|
||||
: tohash);
|
||||
|
||||
|
@ -3753,6 +3753,110 @@ poly1305_emit_base2_44:
|
||||
.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
|
||||
___
|
||||
} } }
|
||||
|
||||
{ # chacha20-poly1305 helpers
|
||||
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
|
||||
("%rdi","%rsi","%rdx","%rcx"); # Unix order
|
||||
$code.=<<___;
|
||||
# xor128_encrypt_n_pad(out, inp, otp, len)
#
# XORs len bytes of inp with the pad at otp, writes the result to out and
# also back into otp, then zero-fills otp up to the next multiple of 16
# bytes when len is not a multiple of 16. Returns the advanced otp pointer.
#
# Requirements visible from the code:
#  - otp is accessed with aligned SSE operations (pxor from memory, movdqa),
#    so it has to be 16-byte aligned;
#  - len must be non-zero: with len == 0 control reaches the byte loop with
#    a zero counter, which then wraps around instead of terminating.
# NOTE(review): the zero fill looks like the 16-byte padding of the
# Poly1305 AEAD input - confirm against the caller.
.globl xor128_encrypt_n_pad
|
||||
.type xor128_encrypt_n_pad,\@abi-omnipotent
|
||||
.align 16
|
||||
xor128_encrypt_n_pad:
|
||||
sub $otp,$inp # inp becomes an offset relative to otp
|
||||
sub $otp,$out # out becomes an offset relative to otp
|
||||
mov $len,%r10 # put len aside
|
||||
shr \$4,$len # len / 16
|
||||
jz .Ltail_enc # fewer than 16 bytes: byte loop only
|
||||
nop # NOTE(review): presumably alignment filler for the loop - confirm
|
||||
.Loop_enc_xmm:
|
||||
movdqu ($inp,$otp),%xmm0 # unaligned load of 16 input bytes
|
||||
pxor ($otp),%xmm0 # xor with 16 pad bytes
|
||||
movdqu %xmm0,($out,$otp) # store result to out
|
||||
movdqa %xmm0,($otp) # and replace the pad with the result
|
||||
lea 16($otp),$otp # advance; inp and out follow via the offsets
|
||||
dec $len
|
||||
jnz .Loop_enc_xmm
|
||||
|
||||
and \$15,%r10 # len % 16
|
||||
jz .Ldone_enc # no partial block left
|
||||
|
||||
.Ltail_enc:
|
||||
mov \$16,$len
|
||||
sub %r10,$len # number of zero bytes to append: 16 - len % 16
|
||||
xor %eax,%eax
|
||||
.Loop_enc_byte:
|
||||
mov ($inp,$otp),%al # one input byte
|
||||
xor ($otp),%al # xor with one pad byte
|
||||
mov %al,($out,$otp) # store result to out
|
||||
mov %al,($otp) # and into the pad buffer
|
||||
lea 1($otp),$otp
|
||||
dec %r10
|
||||
jnz .Loop_enc_byte
|
||||
|
||||
xor %eax,%eax # al = 0 for the zero fill
|
||||
.Loop_enc_pad:
|
||||
mov %al,($otp) # zero-fill the rest of the 16-byte block
|
||||
lea 1($otp),$otp
|
||||
dec $len
|
||||
jnz .Loop_enc_pad
|
||||
|
||||
.Ldone_enc:
|
||||
mov $otp,%rax # return the advanced otp pointer
|
||||
ret
|
||||
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
|
||||
|
||||
# xor128_decrypt_n_pad(out, inp, otp, len)
#
# XORs len bytes of inp with the pad at otp and writes the result to out.
# Unlike the encrypt variant, the pad buffer is overwritten with the input
# bytes rather than with the XOR result. otp is then zero-filled up to the
# next multiple of 16 bytes when len is not a multiple of 16. Returns the
# advanced otp pointer.
#
# Requirements visible from the code:
#  - otp is accessed with movdqa, so it has to be 16-byte aligned;
#  - len must be non-zero: with len == 0 control reaches the byte loop with
#    a zero counter, which then wraps around instead of terminating.
.globl xor128_decrypt_n_pad
|
||||
.type xor128_decrypt_n_pad,\@abi-omnipotent
|
||||
.align 16
|
||||
xor128_decrypt_n_pad:
|
||||
sub $otp,$inp # inp becomes an offset relative to otp
|
||||
sub $otp,$out # out becomes an offset relative to otp
|
||||
mov $len,%r10 # put len aside
|
||||
shr \$4,$len # len / 16
|
||||
jz .Ltail_dec # fewer than 16 bytes: byte loop only
|
||||
nop # NOTE(review): presumably alignment filler for the loop - confirm
|
||||
.Loop_dec_xmm:
|
||||
movdqu ($inp,$otp),%xmm0 # unaligned load of 16 input bytes
|
||||
movdqa ($otp),%xmm1 # aligned load of 16 pad bytes
|
||||
pxor %xmm0,%xmm1 # xmm1 = input xor pad
|
||||
movdqu %xmm1,($out,$otp) # store result to out
|
||||
movdqa %xmm0,($otp) # replace the pad with the input bytes
|
||||
lea 16($otp),$otp # advance; inp and out follow via the offsets
|
||||
dec $len
|
||||
jnz .Loop_dec_xmm
|
||||
|
||||
pxor %xmm1,%xmm1 # clear the register that held the last output block
|
||||
and \$15,%r10 # len % 16
|
||||
jz .Ldone_dec # no partial block left
|
||||
|
||||
.Ltail_dec:
|
||||
mov \$16,$len
|
||||
sub %r10,$len # number of zero bytes to append: 16 - len % 16
|
||||
xor %eax,%eax
|
||||
xor %r11,%r11
|
||||
.Loop_dec_byte:
|
||||
mov ($inp,$otp),%r11b # one input byte
|
||||
mov ($otp),%al # one pad byte
|
||||
xor %r11b,%al # al = input xor pad
|
||||
mov %al,($out,$otp) # store result to out
|
||||
mov %r11b,($otp) # replace the pad byte with the input byte
|
||||
lea 1($otp),$otp
|
||||
dec %r10
|
||||
jnz .Loop_dec_byte
|
||||
|
||||
xor %eax,%eax # al = 0 for the zero fill
|
||||
.Loop_dec_pad:
|
||||
mov %al,($otp) # zero-fill the rest of the 16-byte block
|
||||
lea 1($otp),$otp
|
||||
dec $len
|
||||
jnz .Loop_dec_pad
|
||||
|
||||
.Ldone_dec:
|
||||
mov $otp,%rax # return the advanced otp pointer
|
||||
ret
|
||||
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.align 64
|
||||
.Lconst:
|
||||
|
Loading…
x
Reference in New Issue
Block a user