#!/usr/bin/env perl
# Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

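# Usage (inferred from the argument handling below, as with the other
# perlasm scripts): perl bsaes-armv8.pl [flavour] [output.S]
# The flavour is any leading argument without a dot; the output file is a
# trailing argument with an extension. Both are passed straight through to
# arm-xlate.pl, which receives the assembly after __END__ on its stdin.
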
use strict;

my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $xlate;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my $code = data();
print $code;

close STDOUT or die "error closing STDOUT: $!"; # enforce flush

sub data
{
    local $/;
    return <DATA>;
}

__END__
// Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.

// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did, and there is little to be gained by wrapping it
// up in Perl, so it is presented as pure assembly.


#include "crypto/arm_arch.h"

.text

.extern AES_cbc_encrypt
.extern AES_encrypt
.extern AES_decrypt

.type _bsaes_decrypt8,%function
.align 4
// On entry:
// x9 -> key (previously expanded using _bsaes_key_convert)
// x10 = number of rounds
// v0-v7 input data
// On exit:
// x9-x11 corrupted
// other general-purpose registers preserved
// v0-v7 output data
// v11-v15 preserved
// other SIMD registers corrupted
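// Note: the eight blocks in v0-v7 are processed in bit-sliced form. The
// ushr/eor/and/shl ladder below performs the bit-interleaving transpose
// (after it, each of v0-v7 holds one bit position from every byte of all
// eight blocks), and the matching inverse transform at .Ldec_done converts
// the state back to ordinary byte order.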
_bsaes_decrypt8:
|
|
ldr q8, [x9], #16
|
|
adr x11, .LM0ISR
|
|
movi v9.16b, #0x55
|
|
ldr q10, [x11], #16
|
|
movi v16.16b, #0x33
|
|
movi v17.16b, #0x0f
|
|
sub x10, x10, #1
|
|
eor v0.16b, v0.16b, v8.16b
|
|
eor v1.16b, v1.16b, v8.16b
|
|
eor v2.16b, v2.16b, v8.16b
|
|
eor v4.16b, v4.16b, v8.16b
|
|
eor v3.16b, v3.16b, v8.16b
|
|
eor v5.16b, v5.16b, v8.16b
|
|
tbl v0.16b, {v0.16b}, v10.16b
|
|
tbl v1.16b, {v1.16b}, v10.16b
|
|
tbl v2.16b, {v2.16b}, v10.16b
|
|
tbl v4.16b, {v4.16b}, v10.16b
|
|
eor v6.16b, v6.16b, v8.16b
|
|
eor v7.16b, v7.16b, v8.16b
|
|
tbl v3.16b, {v3.16b}, v10.16b
|
|
tbl v5.16b, {v5.16b}, v10.16b
|
|
tbl v6.16b, {v6.16b}, v10.16b
|
|
ushr v8.2d, v0.2d, #1
|
|
tbl v7.16b, {v7.16b}, v10.16b
|
|
ushr v10.2d, v4.2d, #1
|
|
ushr v18.2d, v2.2d, #1
|
|
eor v8.16b, v8.16b, v1.16b
|
|
ushr v19.2d, v6.2d, #1
|
|
eor v10.16b, v10.16b, v5.16b
|
|
eor v18.16b, v18.16b, v3.16b
|
|
and v8.16b, v8.16b, v9.16b
|
|
eor v19.16b, v19.16b, v7.16b
|
|
and v10.16b, v10.16b, v9.16b
|
|
and v18.16b, v18.16b, v9.16b
|
|
eor v1.16b, v1.16b, v8.16b
|
|
shl v8.2d, v8.2d, #1
|
|
and v9.16b, v19.16b, v9.16b
|
|
eor v5.16b, v5.16b, v10.16b
|
|
shl v10.2d, v10.2d, #1
|
|
eor v3.16b, v3.16b, v18.16b
|
|
shl v18.2d, v18.2d, #1
|
|
eor v0.16b, v0.16b, v8.16b
|
|
shl v8.2d, v9.2d, #1
|
|
eor v7.16b, v7.16b, v9.16b
|
|
eor v4.16b, v4.16b, v10.16b
|
|
eor v2.16b, v2.16b, v18.16b
|
|
ushr v9.2d, v1.2d, #2
|
|
eor v6.16b, v6.16b, v8.16b
|
|
ushr v8.2d, v0.2d, #2
|
|
ushr v10.2d, v5.2d, #2
|
|
ushr v18.2d, v4.2d, #2
|
|
eor v9.16b, v9.16b, v3.16b
|
|
eor v8.16b, v8.16b, v2.16b
|
|
eor v10.16b, v10.16b, v7.16b
|
|
eor v18.16b, v18.16b, v6.16b
|
|
and v9.16b, v9.16b, v16.16b
|
|
and v8.16b, v8.16b, v16.16b
|
|
and v10.16b, v10.16b, v16.16b
|
|
and v16.16b, v18.16b, v16.16b
|
|
eor v3.16b, v3.16b, v9.16b
|
|
shl v9.2d, v9.2d, #2
|
|
eor v2.16b, v2.16b, v8.16b
|
|
shl v8.2d, v8.2d, #2
|
|
eor v7.16b, v7.16b, v10.16b
|
|
shl v10.2d, v10.2d, #2
|
|
eor v6.16b, v6.16b, v16.16b
|
|
shl v16.2d, v16.2d, #2
|
|
eor v1.16b, v1.16b, v9.16b
|
|
eor v0.16b, v0.16b, v8.16b
|
|
eor v5.16b, v5.16b, v10.16b
|
|
eor v4.16b, v4.16b, v16.16b
|
|
ushr v8.2d, v3.2d, #4
|
|
ushr v9.2d, v2.2d, #4
|
|
ushr v10.2d, v1.2d, #4
|
|
ushr v16.2d, v0.2d, #4
|
|
eor v8.16b, v8.16b, v7.16b
|
|
eor v9.16b, v9.16b, v6.16b
|
|
eor v10.16b, v10.16b, v5.16b
|
|
eor v16.16b, v16.16b, v4.16b
|
|
and v8.16b, v8.16b, v17.16b
|
|
and v9.16b, v9.16b, v17.16b
|
|
and v10.16b, v10.16b, v17.16b
|
|
and v16.16b, v16.16b, v17.16b
|
|
eor v7.16b, v7.16b, v8.16b
|
|
shl v8.2d, v8.2d, #4
|
|
eor v6.16b, v6.16b, v9.16b
|
|
shl v9.2d, v9.2d, #4
|
|
eor v5.16b, v5.16b, v10.16b
|
|
shl v10.2d, v10.2d, #4
|
|
eor v4.16b, v4.16b, v16.16b
|
|
shl v16.2d, v16.2d, #4
|
|
eor v3.16b, v3.16b, v8.16b
|
|
eor v2.16b, v2.16b, v9.16b
|
|
eor v1.16b, v1.16b, v10.16b
|
|
eor v0.16b, v0.16b, v16.16b
|
|
b .Ldec_sbox
|
|
.align 4
|
|
.Ldec_loop:
|
|
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
|
|
ldp q8, q9, [x9], #32
|
|
eor v0.16b, v16.16b, v0.16b
|
|
ldr q10, [x9], #16
|
|
eor v1.16b, v17.16b, v1.16b
|
|
ldr q16, [x9], #16
|
|
eor v2.16b, v18.16b, v2.16b
|
|
eor v3.16b, v19.16b, v3.16b
|
|
eor v4.16b, v8.16b, v4.16b
|
|
eor v5.16b, v9.16b, v5.16b
|
|
eor v6.16b, v10.16b, v6.16b
|
|
eor v7.16b, v16.16b, v7.16b
|
|
tbl v0.16b, {v0.16b}, v28.16b
|
|
tbl v1.16b, {v1.16b}, v28.16b
|
|
tbl v2.16b, {v2.16b}, v28.16b
|
|
tbl v3.16b, {v3.16b}, v28.16b
|
|
tbl v4.16b, {v4.16b}, v28.16b
|
|
tbl v5.16b, {v5.16b}, v28.16b
|
|
tbl v6.16b, {v6.16b}, v28.16b
|
|
tbl v7.16b, {v7.16b}, v28.16b
|
|
.Ldec_sbox:
|
|
eor v1.16b, v1.16b, v4.16b
|
|
eor v3.16b, v3.16b, v4.16b
|
|
subs x10, x10, #1
|
|
eor v4.16b, v4.16b, v7.16b
|
|
eor v2.16b, v2.16b, v7.16b
|
|
eor v1.16b, v1.16b, v6.16b
|
|
eor v6.16b, v6.16b, v4.16b
|
|
eor v2.16b, v2.16b, v5.16b
|
|
eor v0.16b, v0.16b, v1.16b
|
|
eor v7.16b, v7.16b, v6.16b
|
|
eor v8.16b, v6.16b, v2.16b
|
|
and v9.16b, v4.16b, v6.16b
|
|
eor v10.16b, v2.16b, v6.16b
|
|
eor v3.16b, v3.16b, v0.16b
|
|
eor v5.16b, v5.16b, v0.16b
|
|
eor v16.16b, v7.16b, v4.16b
|
|
eor v17.16b, v4.16b, v0.16b
|
|
and v18.16b, v0.16b, v2.16b
|
|
eor v19.16b, v7.16b, v4.16b
|
|
eor v1.16b, v1.16b, v3.16b
|
|
eor v20.16b, v3.16b, v0.16b
|
|
eor v21.16b, v5.16b, v2.16b
|
|
eor v22.16b, v3.16b, v7.16b
|
|
and v8.16b, v17.16b, v8.16b
|
|
orr v17.16b, v3.16b, v5.16b
|
|
eor v23.16b, v1.16b, v6.16b
|
|
eor v24.16b, v20.16b, v16.16b
|
|
eor v25.16b, v1.16b, v5.16b
|
|
orr v26.16b, v20.16b, v21.16b
|
|
and v20.16b, v20.16b, v21.16b
|
|
and v27.16b, v7.16b, v1.16b
|
|
eor v21.16b, v21.16b, v23.16b
|
|
orr v28.16b, v16.16b, v23.16b
|
|
orr v29.16b, v22.16b, v25.16b
|
|
eor v26.16b, v26.16b, v8.16b
|
|
and v16.16b, v16.16b, v23.16b
|
|
and v22.16b, v22.16b, v25.16b
|
|
and v21.16b, v24.16b, v21.16b
|
|
eor v8.16b, v28.16b, v8.16b
|
|
eor v23.16b, v5.16b, v2.16b
|
|
eor v24.16b, v1.16b, v6.16b
|
|
eor v16.16b, v16.16b, v22.16b
|
|
eor v22.16b, v3.16b, v0.16b
|
|
eor v25.16b, v29.16b, v21.16b
|
|
eor v21.16b, v26.16b, v21.16b
|
|
eor v8.16b, v8.16b, v20.16b
|
|
eor v26.16b, v23.16b, v24.16b
|
|
eor v16.16b, v16.16b, v20.16b
|
|
eor v28.16b, v22.16b, v19.16b
|
|
eor v20.16b, v25.16b, v20.16b
|
|
eor v9.16b, v21.16b, v9.16b
|
|
eor v8.16b, v8.16b, v18.16b
|
|
eor v18.16b, v5.16b, v1.16b
|
|
eor v21.16b, v16.16b, v17.16b
|
|
eor v16.16b, v16.16b, v17.16b
|
|
eor v17.16b, v20.16b, v27.16b
|
|
eor v20.16b, v3.16b, v7.16b
|
|
eor v25.16b, v9.16b, v8.16b
|
|
eor v27.16b, v0.16b, v4.16b
|
|
and v29.16b, v9.16b, v17.16b
|
|
eor v30.16b, v8.16b, v29.16b
|
|
eor v31.16b, v21.16b, v29.16b
|
|
eor v29.16b, v21.16b, v29.16b
|
|
bsl v30.16b, v17.16b, v21.16b
|
|
bsl v31.16b, v9.16b, v8.16b
|
|
bsl v16.16b, v30.16b, v29.16b
|
|
bsl v21.16b, v29.16b, v30.16b
|
|
eor v8.16b, v31.16b, v30.16b
|
|
and v1.16b, v1.16b, v31.16b
|
|
and v9.16b, v16.16b, v31.16b
|
|
and v6.16b, v6.16b, v30.16b
|
|
eor v16.16b, v17.16b, v21.16b
|
|
and v4.16b, v4.16b, v30.16b
|
|
eor v17.16b, v8.16b, v30.16b
|
|
and v21.16b, v24.16b, v8.16b
|
|
eor v9.16b, v9.16b, v25.16b
|
|
and v19.16b, v19.16b, v8.16b
|
|
eor v24.16b, v30.16b, v16.16b
|
|
eor v25.16b, v30.16b, v16.16b
|
|
and v7.16b, v7.16b, v17.16b
|
|
and v10.16b, v10.16b, v16.16b
|
|
eor v29.16b, v9.16b, v16.16b
|
|
eor v30.16b, v31.16b, v9.16b
|
|
and v0.16b, v24.16b, v0.16b
|
|
and v9.16b, v18.16b, v9.16b
|
|
and v2.16b, v25.16b, v2.16b
|
|
eor v10.16b, v10.16b, v6.16b
|
|
eor v18.16b, v29.16b, v16.16b
|
|
and v5.16b, v30.16b, v5.16b
|
|
eor v24.16b, v8.16b, v29.16b
|
|
and v25.16b, v26.16b, v29.16b
|
|
and v26.16b, v28.16b, v29.16b
|
|
eor v8.16b, v8.16b, v29.16b
|
|
eor v17.16b, v17.16b, v18.16b
|
|
eor v5.16b, v1.16b, v5.16b
|
|
and v23.16b, v24.16b, v23.16b
|
|
eor v21.16b, v21.16b, v25.16b
|
|
eor v19.16b, v19.16b, v26.16b
|
|
eor v0.16b, v4.16b, v0.16b
|
|
and v3.16b, v17.16b, v3.16b
|
|
eor v1.16b, v9.16b, v1.16b
|
|
eor v9.16b, v25.16b, v23.16b
|
|
eor v5.16b, v5.16b, v21.16b
|
|
eor v2.16b, v6.16b, v2.16b
|
|
and v6.16b, v8.16b, v22.16b
|
|
eor v3.16b, v7.16b, v3.16b
|
|
and v8.16b, v20.16b, v18.16b
|
|
eor v10.16b, v10.16b, v9.16b
|
|
eor v0.16b, v0.16b, v19.16b
|
|
eor v9.16b, v1.16b, v9.16b
|
|
eor v1.16b, v2.16b, v21.16b
|
|
eor v3.16b, v3.16b, v19.16b
|
|
and v16.16b, v27.16b, v16.16b
|
|
eor v17.16b, v26.16b, v6.16b
|
|
eor v6.16b, v8.16b, v7.16b
|
|
eor v7.16b, v1.16b, v9.16b
|
|
eor v1.16b, v5.16b, v3.16b
|
|
eor v2.16b, v10.16b, v3.16b
|
|
eor v4.16b, v16.16b, v4.16b
|
|
eor v8.16b, v6.16b, v17.16b
|
|
eor v5.16b, v9.16b, v3.16b
|
|
eor v9.16b, v0.16b, v1.16b
|
|
eor v6.16b, v7.16b, v1.16b
|
|
eor v0.16b, v4.16b, v17.16b
|
|
eor v4.16b, v8.16b, v7.16b
|
|
eor v7.16b, v9.16b, v2.16b
|
|
eor v8.16b, v3.16b, v0.16b
|
|
eor v7.16b, v7.16b, v5.16b
|
|
eor v3.16b, v4.16b, v7.16b
|
|
eor v4.16b, v7.16b, v0.16b
|
|
eor v7.16b, v8.16b, v3.16b
|
|
bcc .Ldec_done
|
|
ext v8.16b, v0.16b, v0.16b, #8
|
|
ext v9.16b, v1.16b, v1.16b, #8
|
|
ldr q28, [x11] // load from .LISR in common case (x10 > 0)
|
|
ext v10.16b, v6.16b, v6.16b, #8
|
|
ext v16.16b, v3.16b, v3.16b, #8
|
|
ext v17.16b, v5.16b, v5.16b, #8
|
|
ext v18.16b, v4.16b, v4.16b, #8
|
|
eor v8.16b, v8.16b, v0.16b
|
|
eor v9.16b, v9.16b, v1.16b
|
|
eor v10.16b, v10.16b, v6.16b
|
|
eor v16.16b, v16.16b, v3.16b
|
|
eor v17.16b, v17.16b, v5.16b
|
|
ext v19.16b, v2.16b, v2.16b, #8
|
|
ext v20.16b, v7.16b, v7.16b, #8
|
|
eor v18.16b, v18.16b, v4.16b
|
|
eor v6.16b, v6.16b, v8.16b
|
|
eor v8.16b, v2.16b, v10.16b
|
|
eor v4.16b, v4.16b, v9.16b
|
|
eor v2.16b, v19.16b, v2.16b
|
|
eor v9.16b, v20.16b, v7.16b
|
|
eor v0.16b, v0.16b, v16.16b
|
|
eor v1.16b, v1.16b, v16.16b
|
|
eor v6.16b, v6.16b, v17.16b
|
|
eor v8.16b, v8.16b, v16.16b
|
|
eor v7.16b, v7.16b, v18.16b
|
|
eor v4.16b, v4.16b, v16.16b
|
|
eor v2.16b, v3.16b, v2.16b
|
|
eor v1.16b, v1.16b, v17.16b
|
|
eor v3.16b, v5.16b, v9.16b
|
|
eor v5.16b, v8.16b, v17.16b
|
|
eor v7.16b, v7.16b, v17.16b
|
|
ext v8.16b, v0.16b, v0.16b, #12
|
|
ext v9.16b, v6.16b, v6.16b, #12
|
|
ext v10.16b, v4.16b, v4.16b, #12
|
|
ext v16.16b, v1.16b, v1.16b, #12
|
|
ext v17.16b, v5.16b, v5.16b, #12
|
|
ext v18.16b, v7.16b, v7.16b, #12
|
|
eor v0.16b, v0.16b, v8.16b
|
|
eor v6.16b, v6.16b, v9.16b
|
|
eor v4.16b, v4.16b, v10.16b
|
|
ext v19.16b, v2.16b, v2.16b, #12
|
|
ext v20.16b, v3.16b, v3.16b, #12
|
|
eor v1.16b, v1.16b, v16.16b
|
|
eor v5.16b, v5.16b, v17.16b
|
|
eor v7.16b, v7.16b, v18.16b
|
|
eor v2.16b, v2.16b, v19.16b
|
|
eor v16.16b, v16.16b, v0.16b
|
|
eor v3.16b, v3.16b, v20.16b
|
|
eor v17.16b, v17.16b, v4.16b
|
|
eor v10.16b, v10.16b, v6.16b
|
|
ext v0.16b, v0.16b, v0.16b, #8
|
|
eor v9.16b, v9.16b, v1.16b
|
|
ext v1.16b, v1.16b, v1.16b, #8
|
|
eor v8.16b, v8.16b, v3.16b
|
|
eor v16.16b, v16.16b, v3.16b
|
|
eor v18.16b, v18.16b, v5.16b
|
|
eor v19.16b, v19.16b, v7.16b
|
|
ext v21.16b, v5.16b, v5.16b, #8
|
|
ext v5.16b, v7.16b, v7.16b, #8
|
|
eor v7.16b, v20.16b, v2.16b
|
|
ext v4.16b, v4.16b, v4.16b, #8
|
|
ext v20.16b, v3.16b, v3.16b, #8
|
|
eor v17.16b, v17.16b, v3.16b
|
|
ext v2.16b, v2.16b, v2.16b, #8
|
|
eor v3.16b, v10.16b, v3.16b
|
|
ext v10.16b, v6.16b, v6.16b, #8
|
|
eor v0.16b, v0.16b, v8.16b
|
|
eor v1.16b, v1.16b, v16.16b
|
|
eor v5.16b, v5.16b, v18.16b
|
|
eor v3.16b, v3.16b, v4.16b
|
|
eor v7.16b, v20.16b, v7.16b
|
|
eor v6.16b, v2.16b, v19.16b
|
|
eor v4.16b, v21.16b, v17.16b
|
|
eor v2.16b, v10.16b, v9.16b
|
|
bne .Ldec_loop
|
|
ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
|
|
b .Ldec_loop
|
|
.align 4
|
|
.Ldec_done:
|
|
ushr v8.2d, v0.2d, #1
|
|
movi v9.16b, #0x55
|
|
ldr q10, [x9]
|
|
ushr v16.2d, v2.2d, #1
|
|
movi v17.16b, #0x33
|
|
ushr v18.2d, v6.2d, #1
|
|
movi v19.16b, #0x0f
|
|
eor v8.16b, v8.16b, v1.16b
|
|
ushr v20.2d, v3.2d, #1
|
|
eor v16.16b, v16.16b, v7.16b
|
|
eor v18.16b, v18.16b, v4.16b
|
|
and v8.16b, v8.16b, v9.16b
|
|
eor v20.16b, v20.16b, v5.16b
|
|
and v16.16b, v16.16b, v9.16b
|
|
and v18.16b, v18.16b, v9.16b
|
|
shl v21.2d, v8.2d, #1
|
|
eor v1.16b, v1.16b, v8.16b
|
|
and v8.16b, v20.16b, v9.16b
|
|
eor v7.16b, v7.16b, v16.16b
|
|
shl v9.2d, v16.2d, #1
|
|
eor v4.16b, v4.16b, v18.16b
|
|
shl v16.2d, v18.2d, #1
|
|
eor v0.16b, v0.16b, v21.16b
|
|
shl v18.2d, v8.2d, #1
|
|
eor v5.16b, v5.16b, v8.16b
|
|
eor v2.16b, v2.16b, v9.16b
|
|
eor v6.16b, v6.16b, v16.16b
|
|
ushr v8.2d, v1.2d, #2
|
|
eor v3.16b, v3.16b, v18.16b
|
|
ushr v9.2d, v0.2d, #2
|
|
ushr v16.2d, v7.2d, #2
|
|
ushr v18.2d, v2.2d, #2
|
|
eor v8.16b, v8.16b, v4.16b
|
|
eor v9.16b, v9.16b, v6.16b
|
|
eor v16.16b, v16.16b, v5.16b
|
|
eor v18.16b, v18.16b, v3.16b
|
|
and v8.16b, v8.16b, v17.16b
|
|
and v9.16b, v9.16b, v17.16b
|
|
and v16.16b, v16.16b, v17.16b
|
|
and v17.16b, v18.16b, v17.16b
|
|
eor v4.16b, v4.16b, v8.16b
|
|
shl v8.2d, v8.2d, #2
|
|
eor v6.16b, v6.16b, v9.16b
|
|
shl v9.2d, v9.2d, #2
|
|
eor v5.16b, v5.16b, v16.16b
|
|
shl v16.2d, v16.2d, #2
|
|
eor v3.16b, v3.16b, v17.16b
|
|
shl v17.2d, v17.2d, #2
|
|
eor v1.16b, v1.16b, v8.16b
|
|
eor v0.16b, v0.16b, v9.16b
|
|
eor v7.16b, v7.16b, v16.16b
|
|
eor v2.16b, v2.16b, v17.16b
|
|
ushr v8.2d, v4.2d, #4
|
|
ushr v9.2d, v6.2d, #4
|
|
ushr v16.2d, v1.2d, #4
|
|
ushr v17.2d, v0.2d, #4
|
|
eor v8.16b, v8.16b, v5.16b
|
|
eor v9.16b, v9.16b, v3.16b
|
|
eor v16.16b, v16.16b, v7.16b
|
|
eor v17.16b, v17.16b, v2.16b
|
|
and v8.16b, v8.16b, v19.16b
|
|
and v9.16b, v9.16b, v19.16b
|
|
and v16.16b, v16.16b, v19.16b
|
|
and v17.16b, v17.16b, v19.16b
|
|
eor v5.16b, v5.16b, v8.16b
|
|
shl v8.2d, v8.2d, #4
|
|
eor v3.16b, v3.16b, v9.16b
|
|
shl v9.2d, v9.2d, #4
|
|
eor v7.16b, v7.16b, v16.16b
|
|
shl v16.2d, v16.2d, #4
|
|
eor v2.16b, v2.16b, v17.16b
|
|
shl v17.2d, v17.2d, #4
|
|
eor v4.16b, v4.16b, v8.16b
|
|
eor v6.16b, v6.16b, v9.16b
|
|
eor v7.16b, v7.16b, v10.16b
|
|
eor v1.16b, v1.16b, v16.16b
|
|
eor v2.16b, v2.16b, v10.16b
|
|
eor v0.16b, v0.16b, v17.16b
|
|
eor v4.16b, v4.16b, v10.16b
|
|
eor v6.16b, v6.16b, v10.16b
|
|
eor v3.16b, v3.16b, v10.16b
|
|
eor v5.16b, v5.16b, v10.16b
|
|
eor v1.16b, v1.16b, v10.16b
|
|
eor v0.16b, v0.16b, v10.16b
|
|
ret
|
|
.size _bsaes_decrypt8,.-_bsaes_decrypt8

.type _bsaes_const,%object
.align 6
_bsaes_const:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad 0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d

.LM0_bigendian:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad 0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad 0x090d01050c000408, 0x03070b0f060a0e02

.align 6
.size _bsaes_const,.-_bsaes_const

.type _bsaes_encrypt8,%function
.align 4
// On entry:
// x9 -> key (previously expanded using _bsaes_key_convert)
// x10 = number of rounds
// v0-v7 input data
// On exit:
// x9-x11 corrupted
// other general-purpose registers preserved
// v0-v7 output data
// v11-v15 preserved
// other SIMD registers corrupted
_bsaes_encrypt8:
ldr q8, [x9], #16
adr x11, .LM0SR
ldr q9, [x11], #16
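// _bsaes_encrypt8_alt is an alternative entry point used by
// ossl_bsaes_ctr32_encrypt_blocks: the caller preloads v8 with the adjusted
// round 0 key and v9 with the .LREVM0SR permutation (see the constant block
// above), so execution on that path skips the loads above.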
_bsaes_encrypt8_alt:
|
|
eor v0.16b, v0.16b, v8.16b
|
|
eor v1.16b, v1.16b, v8.16b
|
|
sub x10, x10, #1
|
|
eor v2.16b, v2.16b, v8.16b
|
|
eor v4.16b, v4.16b, v8.16b
|
|
eor v3.16b, v3.16b, v8.16b
|
|
eor v5.16b, v5.16b, v8.16b
|
|
tbl v0.16b, {v0.16b}, v9.16b
|
|
tbl v1.16b, {v1.16b}, v9.16b
|
|
tbl v2.16b, {v2.16b}, v9.16b
|
|
tbl v4.16b, {v4.16b}, v9.16b
|
|
eor v6.16b, v6.16b, v8.16b
|
|
eor v7.16b, v7.16b, v8.16b
|
|
tbl v3.16b, {v3.16b}, v9.16b
|
|
tbl v5.16b, {v5.16b}, v9.16b
|
|
tbl v6.16b, {v6.16b}, v9.16b
|
|
ushr v8.2d, v0.2d, #1
|
|
movi v10.16b, #0x55
|
|
tbl v7.16b, {v7.16b}, v9.16b
|
|
ushr v9.2d, v4.2d, #1
|
|
movi v16.16b, #0x33
|
|
ushr v17.2d, v2.2d, #1
|
|
eor v8.16b, v8.16b, v1.16b
|
|
movi v18.16b, #0x0f
|
|
ushr v19.2d, v6.2d, #1
|
|
eor v9.16b, v9.16b, v5.16b
|
|
eor v17.16b, v17.16b, v3.16b
|
|
and v8.16b, v8.16b, v10.16b
|
|
eor v19.16b, v19.16b, v7.16b
|
|
and v9.16b, v9.16b, v10.16b
|
|
and v17.16b, v17.16b, v10.16b
|
|
eor v1.16b, v1.16b, v8.16b
|
|
shl v8.2d, v8.2d, #1
|
|
and v10.16b, v19.16b, v10.16b
|
|
eor v5.16b, v5.16b, v9.16b
|
|
shl v9.2d, v9.2d, #1
|
|
eor v3.16b, v3.16b, v17.16b
|
|
shl v17.2d, v17.2d, #1
|
|
eor v0.16b, v0.16b, v8.16b
|
|
shl v8.2d, v10.2d, #1
|
|
eor v7.16b, v7.16b, v10.16b
|
|
eor v4.16b, v4.16b, v9.16b
|
|
eor v2.16b, v2.16b, v17.16b
|
|
ushr v9.2d, v1.2d, #2
|
|
eor v6.16b, v6.16b, v8.16b
|
|
ushr v8.2d, v0.2d, #2
|
|
ushr v10.2d, v5.2d, #2
|
|
ushr v17.2d, v4.2d, #2
|
|
eor v9.16b, v9.16b, v3.16b
|
|
eor v8.16b, v8.16b, v2.16b
|
|
eor v10.16b, v10.16b, v7.16b
|
|
eor v17.16b, v17.16b, v6.16b
|
|
and v9.16b, v9.16b, v16.16b
|
|
and v8.16b, v8.16b, v16.16b
|
|
and v10.16b, v10.16b, v16.16b
|
|
and v16.16b, v17.16b, v16.16b
|
|
eor v3.16b, v3.16b, v9.16b
|
|
shl v9.2d, v9.2d, #2
|
|
eor v2.16b, v2.16b, v8.16b
|
|
shl v8.2d, v8.2d, #2
|
|
eor v7.16b, v7.16b, v10.16b
|
|
shl v10.2d, v10.2d, #2
|
|
eor v6.16b, v6.16b, v16.16b
|
|
shl v16.2d, v16.2d, #2
|
|
eor v1.16b, v1.16b, v9.16b
|
|
eor v0.16b, v0.16b, v8.16b
|
|
eor v5.16b, v5.16b, v10.16b
|
|
eor v4.16b, v4.16b, v16.16b
|
|
ushr v8.2d, v3.2d, #4
|
|
ushr v9.2d, v2.2d, #4
|
|
ushr v10.2d, v1.2d, #4
|
|
ushr v16.2d, v0.2d, #4
|
|
eor v8.16b, v8.16b, v7.16b
|
|
eor v9.16b, v9.16b, v6.16b
|
|
eor v10.16b, v10.16b, v5.16b
|
|
eor v16.16b, v16.16b, v4.16b
|
|
and v8.16b, v8.16b, v18.16b
|
|
and v9.16b, v9.16b, v18.16b
|
|
and v10.16b, v10.16b, v18.16b
|
|
and v16.16b, v16.16b, v18.16b
|
|
eor v7.16b, v7.16b, v8.16b
|
|
shl v8.2d, v8.2d, #4
|
|
eor v6.16b, v6.16b, v9.16b
|
|
shl v9.2d, v9.2d, #4
|
|
eor v5.16b, v5.16b, v10.16b
|
|
shl v10.2d, v10.2d, #4
|
|
eor v4.16b, v4.16b, v16.16b
|
|
shl v16.2d, v16.2d, #4
|
|
eor v3.16b, v3.16b, v8.16b
|
|
eor v2.16b, v2.16b, v9.16b
|
|
eor v1.16b, v1.16b, v10.16b
|
|
eor v0.16b, v0.16b, v16.16b
|
|
b .Lenc_sbox
|
|
.align 4
|
|
.Lenc_loop:
|
|
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
|
|
ldp q8, q9, [x9], #32
|
|
eor v0.16b, v16.16b, v0.16b
|
|
ldr q10, [x9], #16
|
|
eor v1.16b, v17.16b, v1.16b
|
|
ldr q16, [x9], #16
|
|
eor v2.16b, v18.16b, v2.16b
|
|
eor v3.16b, v19.16b, v3.16b
|
|
eor v4.16b, v8.16b, v4.16b
|
|
eor v5.16b, v9.16b, v5.16b
|
|
eor v6.16b, v10.16b, v6.16b
|
|
eor v7.16b, v16.16b, v7.16b
|
|
tbl v0.16b, {v0.16b}, v28.16b
|
|
tbl v1.16b, {v1.16b}, v28.16b
|
|
tbl v2.16b, {v2.16b}, v28.16b
|
|
tbl v3.16b, {v3.16b}, v28.16b
|
|
tbl v4.16b, {v4.16b}, v28.16b
|
|
tbl v5.16b, {v5.16b}, v28.16b
|
|
tbl v6.16b, {v6.16b}, v28.16b
|
|
tbl v7.16b, {v7.16b}, v28.16b
|
|
.Lenc_sbox:
|
|
eor v5.16b, v5.16b, v6.16b
|
|
eor v3.16b, v3.16b, v0.16b
|
|
subs x10, x10, #1
|
|
eor v2.16b, v2.16b, v1.16b
|
|
eor v5.16b, v5.16b, v0.16b
|
|
eor v8.16b, v3.16b, v7.16b
|
|
eor v6.16b, v6.16b, v2.16b
|
|
eor v7.16b, v7.16b, v5.16b
|
|
eor v8.16b, v8.16b, v4.16b
|
|
eor v3.16b, v6.16b, v3.16b
|
|
eor v4.16b, v4.16b, v5.16b
|
|
eor v6.16b, v1.16b, v5.16b
|
|
eor v2.16b, v2.16b, v7.16b
|
|
eor v1.16b, v8.16b, v1.16b
|
|
eor v8.16b, v7.16b, v4.16b
|
|
eor v9.16b, v3.16b, v0.16b
|
|
eor v10.16b, v7.16b, v6.16b
|
|
eor v16.16b, v5.16b, v3.16b
|
|
eor v17.16b, v6.16b, v2.16b
|
|
eor v18.16b, v5.16b, v1.16b
|
|
eor v19.16b, v2.16b, v4.16b
|
|
eor v20.16b, v1.16b, v0.16b
|
|
orr v21.16b, v8.16b, v9.16b
|
|
orr v22.16b, v10.16b, v16.16b
|
|
eor v23.16b, v8.16b, v17.16b
|
|
eor v24.16b, v9.16b, v18.16b
|
|
and v19.16b, v19.16b, v20.16b
|
|
orr v20.16b, v17.16b, v18.16b
|
|
and v8.16b, v8.16b, v9.16b
|
|
and v9.16b, v17.16b, v18.16b
|
|
and v17.16b, v23.16b, v24.16b
|
|
and v10.16b, v10.16b, v16.16b
|
|
eor v16.16b, v21.16b, v19.16b
|
|
eor v18.16b, v20.16b, v19.16b
|
|
and v19.16b, v2.16b, v1.16b
|
|
and v20.16b, v6.16b, v5.16b
|
|
eor v21.16b, v22.16b, v17.16b
|
|
eor v9.16b, v9.16b, v10.16b
|
|
eor v10.16b, v16.16b, v17.16b
|
|
eor v16.16b, v18.16b, v8.16b
|
|
and v17.16b, v4.16b, v0.16b
|
|
orr v18.16b, v7.16b, v3.16b
|
|
eor v21.16b, v21.16b, v8.16b
|
|
eor v8.16b, v9.16b, v8.16b
|
|
eor v9.16b, v10.16b, v19.16b
|
|
eor v10.16b, v3.16b, v0.16b
|
|
eor v16.16b, v16.16b, v17.16b
|
|
eor v17.16b, v5.16b, v1.16b
|
|
eor v19.16b, v21.16b, v20.16b
|
|
eor v20.16b, v8.16b, v18.16b
|
|
eor v8.16b, v8.16b, v18.16b
|
|
eor v18.16b, v7.16b, v4.16b
|
|
eor v21.16b, v9.16b, v16.16b
|
|
eor v22.16b, v6.16b, v2.16b
|
|
and v23.16b, v9.16b, v19.16b
|
|
eor v24.16b, v10.16b, v17.16b
|
|
eor v25.16b, v0.16b, v1.16b
|
|
eor v26.16b, v7.16b, v6.16b
|
|
eor v27.16b, v18.16b, v22.16b
|
|
eor v28.16b, v3.16b, v5.16b
|
|
eor v29.16b, v16.16b, v23.16b
|
|
eor v30.16b, v20.16b, v23.16b
|
|
eor v23.16b, v20.16b, v23.16b
|
|
eor v31.16b, v4.16b, v2.16b
|
|
bsl v29.16b, v19.16b, v20.16b
|
|
bsl v30.16b, v9.16b, v16.16b
|
|
bsl v8.16b, v29.16b, v23.16b
|
|
bsl v20.16b, v23.16b, v29.16b
|
|
eor v9.16b, v30.16b, v29.16b
|
|
and v5.16b, v5.16b, v30.16b
|
|
and v8.16b, v8.16b, v30.16b
|
|
and v1.16b, v1.16b, v29.16b
|
|
eor v16.16b, v19.16b, v20.16b
|
|
and v2.16b, v2.16b, v29.16b
|
|
eor v19.16b, v9.16b, v29.16b
|
|
and v17.16b, v17.16b, v9.16b
|
|
eor v8.16b, v8.16b, v21.16b
|
|
and v20.16b, v22.16b, v9.16b
|
|
eor v21.16b, v29.16b, v16.16b
|
|
eor v22.16b, v29.16b, v16.16b
|
|
and v23.16b, v25.16b, v16.16b
|
|
and v6.16b, v6.16b, v19.16b
|
|
eor v25.16b, v8.16b, v16.16b
|
|
eor v29.16b, v30.16b, v8.16b
|
|
and v4.16b, v21.16b, v4.16b
|
|
and v8.16b, v28.16b, v8.16b
|
|
and v0.16b, v22.16b, v0.16b
|
|
eor v21.16b, v23.16b, v1.16b
|
|
eor v22.16b, v9.16b, v25.16b
|
|
eor v9.16b, v9.16b, v25.16b
|
|
eor v23.16b, v25.16b, v16.16b
|
|
and v3.16b, v29.16b, v3.16b
|
|
and v24.16b, v24.16b, v25.16b
|
|
and v25.16b, v27.16b, v25.16b
|
|
and v10.16b, v22.16b, v10.16b
|
|
and v9.16b, v9.16b, v18.16b
|
|
eor v18.16b, v19.16b, v23.16b
|
|
and v19.16b, v26.16b, v23.16b
|
|
eor v3.16b, v5.16b, v3.16b
|
|
eor v17.16b, v17.16b, v24.16b
|
|
eor v10.16b, v24.16b, v10.16b
|
|
and v16.16b, v31.16b, v16.16b
|
|
eor v20.16b, v20.16b, v25.16b
|
|
eor v9.16b, v25.16b, v9.16b
|
|
eor v4.16b, v2.16b, v4.16b
|
|
and v7.16b, v18.16b, v7.16b
|
|
eor v18.16b, v19.16b, v6.16b
|
|
eor v5.16b, v8.16b, v5.16b
|
|
eor v0.16b, v1.16b, v0.16b
|
|
eor v1.16b, v21.16b, v10.16b
|
|
eor v8.16b, v3.16b, v17.16b
|
|
eor v2.16b, v16.16b, v2.16b
|
|
eor v3.16b, v6.16b, v7.16b
|
|
eor v6.16b, v18.16b, v9.16b
|
|
eor v4.16b, v4.16b, v20.16b
|
|
eor v10.16b, v5.16b, v10.16b
|
|
eor v0.16b, v0.16b, v17.16b
|
|
eor v9.16b, v2.16b, v9.16b
|
|
eor v3.16b, v3.16b, v20.16b
|
|
eor v7.16b, v6.16b, v1.16b
|
|
eor v5.16b, v8.16b, v4.16b
|
|
eor v6.16b, v10.16b, v1.16b
|
|
eor v2.16b, v4.16b, v0.16b
|
|
eor v4.16b, v3.16b, v10.16b
|
|
eor v9.16b, v9.16b, v7.16b
|
|
eor v3.16b, v0.16b, v5.16b
|
|
eor v0.16b, v1.16b, v4.16b
|
|
eor v1.16b, v4.16b, v8.16b
|
|
eor v4.16b, v9.16b, v5.16b
|
|
eor v6.16b, v6.16b, v3.16b
|
|
bcc .Lenc_done
|
|
ext v8.16b, v0.16b, v0.16b, #12
|
|
ext v9.16b, v4.16b, v4.16b, #12
|
|
ldr q28, [x11]
|
|
ext v10.16b, v6.16b, v6.16b, #12
|
|
ext v16.16b, v1.16b, v1.16b, #12
|
|
ext v17.16b, v3.16b, v3.16b, #12
|
|
ext v18.16b, v7.16b, v7.16b, #12
|
|
eor v0.16b, v0.16b, v8.16b
|
|
eor v4.16b, v4.16b, v9.16b
|
|
eor v6.16b, v6.16b, v10.16b
|
|
ext v19.16b, v2.16b, v2.16b, #12
|
|
ext v20.16b, v5.16b, v5.16b, #12
|
|
eor v1.16b, v1.16b, v16.16b
|
|
eor v3.16b, v3.16b, v17.16b
|
|
eor v7.16b, v7.16b, v18.16b
|
|
eor v2.16b, v2.16b, v19.16b
|
|
eor v16.16b, v16.16b, v0.16b
|
|
eor v5.16b, v5.16b, v20.16b
|
|
eor v17.16b, v17.16b, v6.16b
|
|
eor v10.16b, v10.16b, v4.16b
|
|
ext v0.16b, v0.16b, v0.16b, #8
|
|
eor v9.16b, v9.16b, v1.16b
|
|
ext v1.16b, v1.16b, v1.16b, #8
|
|
eor v8.16b, v8.16b, v5.16b
|
|
eor v16.16b, v16.16b, v5.16b
|
|
eor v18.16b, v18.16b, v3.16b
|
|
eor v19.16b, v19.16b, v7.16b
|
|
ext v3.16b, v3.16b, v3.16b, #8
|
|
ext v7.16b, v7.16b, v7.16b, #8
|
|
eor v20.16b, v20.16b, v2.16b
|
|
ext v6.16b, v6.16b, v6.16b, #8
|
|
ext v21.16b, v5.16b, v5.16b, #8
|
|
eor v17.16b, v17.16b, v5.16b
|
|
ext v2.16b, v2.16b, v2.16b, #8
|
|
eor v10.16b, v10.16b, v5.16b
|
|
ext v22.16b, v4.16b, v4.16b, #8
|
|
eor v0.16b, v0.16b, v8.16b
|
|
eor v1.16b, v1.16b, v16.16b
|
|
eor v5.16b, v7.16b, v18.16b
|
|
eor v4.16b, v3.16b, v17.16b
|
|
eor v3.16b, v6.16b, v10.16b
|
|
eor v7.16b, v21.16b, v20.16b
|
|
eor v6.16b, v2.16b, v19.16b
|
|
eor v2.16b, v22.16b, v9.16b
|
|
bne .Lenc_loop
|
|
ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
|
|
b .Lenc_loop
|
|
.align 4
|
|
.Lenc_done:
|
|
ushr v8.2d, v0.2d, #1
|
|
movi v9.16b, #0x55
|
|
ldr q10, [x9]
|
|
ushr v16.2d, v3.2d, #1
|
|
movi v17.16b, #0x33
|
|
ushr v18.2d, v4.2d, #1
|
|
movi v19.16b, #0x0f
|
|
eor v8.16b, v8.16b, v1.16b
|
|
ushr v20.2d, v2.2d, #1
|
|
eor v16.16b, v16.16b, v7.16b
|
|
eor v18.16b, v18.16b, v6.16b
|
|
and v8.16b, v8.16b, v9.16b
|
|
eor v20.16b, v20.16b, v5.16b
|
|
and v16.16b, v16.16b, v9.16b
|
|
and v18.16b, v18.16b, v9.16b
|
|
shl v21.2d, v8.2d, #1
|
|
eor v1.16b, v1.16b, v8.16b
|
|
and v8.16b, v20.16b, v9.16b
|
|
eor v7.16b, v7.16b, v16.16b
|
|
shl v9.2d, v16.2d, #1
|
|
eor v6.16b, v6.16b, v18.16b
|
|
shl v16.2d, v18.2d, #1
|
|
eor v0.16b, v0.16b, v21.16b
|
|
shl v18.2d, v8.2d, #1
|
|
eor v5.16b, v5.16b, v8.16b
|
|
eor v3.16b, v3.16b, v9.16b
|
|
eor v4.16b, v4.16b, v16.16b
|
|
ushr v8.2d, v1.2d, #2
|
|
eor v2.16b, v2.16b, v18.16b
|
|
ushr v9.2d, v0.2d, #2
|
|
ushr v16.2d, v7.2d, #2
|
|
ushr v18.2d, v3.2d, #2
|
|
eor v8.16b, v8.16b, v6.16b
|
|
eor v9.16b, v9.16b, v4.16b
|
|
eor v16.16b, v16.16b, v5.16b
|
|
eor v18.16b, v18.16b, v2.16b
|
|
and v8.16b, v8.16b, v17.16b
|
|
and v9.16b, v9.16b, v17.16b
|
|
and v16.16b, v16.16b, v17.16b
|
|
and v17.16b, v18.16b, v17.16b
|
|
eor v6.16b, v6.16b, v8.16b
|
|
shl v8.2d, v8.2d, #2
|
|
eor v4.16b, v4.16b, v9.16b
|
|
shl v9.2d, v9.2d, #2
|
|
eor v5.16b, v5.16b, v16.16b
|
|
shl v16.2d, v16.2d, #2
|
|
eor v2.16b, v2.16b, v17.16b
|
|
shl v17.2d, v17.2d, #2
|
|
eor v1.16b, v1.16b, v8.16b
|
|
eor v0.16b, v0.16b, v9.16b
|
|
eor v7.16b, v7.16b, v16.16b
|
|
eor v3.16b, v3.16b, v17.16b
|
|
ushr v8.2d, v6.2d, #4
|
|
ushr v9.2d, v4.2d, #4
|
|
ushr v16.2d, v1.2d, #4
|
|
ushr v17.2d, v0.2d, #4
|
|
eor v8.16b, v8.16b, v5.16b
|
|
eor v9.16b, v9.16b, v2.16b
|
|
eor v16.16b, v16.16b, v7.16b
|
|
eor v17.16b, v17.16b, v3.16b
|
|
and v8.16b, v8.16b, v19.16b
|
|
and v9.16b, v9.16b, v19.16b
|
|
and v16.16b, v16.16b, v19.16b
|
|
and v17.16b, v17.16b, v19.16b
|
|
eor v5.16b, v5.16b, v8.16b
|
|
shl v8.2d, v8.2d, #4
|
|
eor v2.16b, v2.16b, v9.16b
|
|
shl v9.2d, v9.2d, #4
|
|
eor v7.16b, v7.16b, v16.16b
|
|
shl v16.2d, v16.2d, #4
|
|
eor v3.16b, v3.16b, v17.16b
|
|
shl v17.2d, v17.2d, #4
|
|
eor v6.16b, v6.16b, v8.16b
|
|
eor v4.16b, v4.16b, v9.16b
|
|
eor v7.16b, v7.16b, v10.16b
|
|
eor v1.16b, v1.16b, v16.16b
|
|
eor v3.16b, v3.16b, v10.16b
|
|
eor v0.16b, v0.16b, v17.16b
|
|
eor v6.16b, v6.16b, v10.16b
|
|
eor v4.16b, v4.16b, v10.16b
|
|
eor v2.16b, v2.16b, v10.16b
|
|
eor v5.16b, v5.16b, v10.16b
|
|
eor v1.16b, v1.16b, v10.16b
|
|
eor v0.16b, v0.16b, v10.16b
|
|
ret
|
|
.size _bsaes_encrypt8,.-_bsaes_encrypt8

.type _bsaes_key_convert,%function
.align 4
// On entry:
// x9 -> input key (big-endian)
// x10 = number of rounds
// x17 -> output key (native endianness)
// On exit:
// x9, x10 corrupted
// x11 -> .LM0_bigendian
// x17 -> last quadword of output key
// other general-purpose registers preserved
// v2-v6 preserved
// v7.16b[] = 0x63
// v8-v14 preserved
// v15 = last round key (converted to native endianness)
// other SIMD registers corrupted
_bsaes_key_convert:
|
|
#ifdef __AARCH64EL__
|
|
adr x11, .LM0_littleendian
|
|
#else
|
|
adr x11, .LM0_bigendian
|
|
#endif
|
|
ldr q0, [x9], #16 // load round 0 key
|
|
ldr q1, [x11] // .LM0
|
|
ldr q15, [x9], #16 // load round 1 key
|
|
|
|
movi v7.16b, #0x63 // compose .L63
|
|
movi v16.16b, #0x01 // bit masks
|
|
movi v17.16b, #0x02
|
|
movi v18.16b, #0x04
|
|
movi v19.16b, #0x08
|
|
movi v20.16b, #0x10
|
|
movi v21.16b, #0x20
|
|
movi v22.16b, #0x40
|
|
movi v23.16b, #0x80
|
|
|
|
#ifdef __AARCH64EL__
|
|
rev32 v0.16b, v0.16b
|
|
#endif
|
|
sub x10, x10, #1
|
|
str q0, [x17], #16 // save round 0 key
|
|
|
|
.align 4
|
|
.Lkey_loop:
|
|
tbl v0.16b, {v15.16b}, v1.16b
|
|
ldr q15, [x9], #16 // load next round key
|
|
|
|
eor v0.16b, v0.16b, v7.16b
|
|
cmtst v24.16b, v0.16b, v16.16b
|
|
cmtst v25.16b, v0.16b, v17.16b
|
|
cmtst v26.16b, v0.16b, v18.16b
|
|
cmtst v27.16b, v0.16b, v19.16b
|
|
cmtst v28.16b, v0.16b, v20.16b
|
|
cmtst v29.16b, v0.16b, v21.16b
|
|
cmtst v30.16b, v0.16b, v22.16b
|
|
cmtst v31.16b, v0.16b, v23.16b
|
|
sub x10, x10, #1
|
|
st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
|
|
st1 {v28.16b-v31.16b}, [x17], #64
|
|
cbnz x10, .Lkey_loop
|
|
|
|
// don't save last round key
|
|
#ifdef __AARCH64EL__
|
|
rev32 v15.16b, v15.16b
|
|
adr x11, .LM0_bigendian
|
|
#endif
|
|
ret
|
|
.size _bsaes_key_convert,.-_bsaes_key_convert

.globl ossl_bsaes_cbc_encrypt
.type ossl_bsaes_cbc_encrypt,%function
.align 4
// On entry:
// x0 -> input ciphertext
// x1 -> output plaintext
// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
// x3 -> key
// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
// w5 must be == 0
// On exit:
// Output plaintext filled in
// Initialisation vector overwritten with last quadword of ciphertext
// No output registers, usual AAPCS64 register preservation
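// A C-level view consistent with the register usage above (a sketch; the
// authoritative declaration lives in the C headers, and enc must be 0):
//   void ossl_bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
//                               size_t length, const AES_KEY *key,
//                               unsigned char ivec[16], int enc);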
ossl_bsaes_cbc_encrypt:
|
|
cmp x2, #128
|
|
bhs .Lcbc_do_bsaes
|
|
b AES_cbc_encrypt
|
|
.Lcbc_do_bsaes:
|
|
|
|
// it is up to the caller to make sure we are called with enc == 0
|
|
|
|
stp x29, x30, [sp, #-48]!
|
|
stp d8, d9, [sp, #16]
|
|
stp d10, d15, [sp, #32]
|
|
lsr x2, x2, #4 // len in 16 byte blocks
|
|
|
|
ldr w15, [x3, #240] // get # of rounds
|
|
mov x14, sp
|
|
|
|
// allocate the key schedule on the stack
|
|
add x17, sp, #96
|
|
sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
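// (The arithmetic: the converted schedule is one 16-byte round-0 key,
// (rounds-1) bit-sliced inner round keys of 128 bytes each, plus the
// 16-byte final round key stored later by the caller, i.e.
// rounds*128 - 96 bytes, which is what the "lsl #7 ... less 96" above
// allocates.)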
|
|
|
|
// populate the key schedule
|
|
mov x9, x3 // pass key
|
|
mov x10, x15 // pass # of rounds
|
|
mov sp, x17 // sp is sp
|
|
bl _bsaes_key_convert
|
|
ldr q6, [sp]
|
|
str q15, [x17] // save last round key
|
|
eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
|
|
str q6, [sp]
|
|
|
|
ldr q15, [x4] // load IV
|
|
b .Lcbc_dec_loop
|
|
|
|
.align 4
|
|
.Lcbc_dec_loop:
|
|
subs x2, x2, #0x8
|
|
bmi .Lcbc_dec_loop_finish
|
|
|
|
ldr q0, [x0], #16 // load input
|
|
mov x9, sp // pass the key
|
|
ldr q1, [x0], #16
|
|
mov x10, x15
|
|
ldr q2, [x0], #16
|
|
ldr q3, [x0], #16
|
|
ldr q4, [x0], #16
|
|
ldr q5, [x0], #16
|
|
ldr q6, [x0], #16
|
|
ldr q7, [x0], #-7*16
|
|
|
|
bl _bsaes_decrypt8
|
|
|
|
ldr q16, [x0], #16 // reload input
|
|
eor v0.16b, v0.16b, v15.16b // ^= IV
|
|
eor v1.16b, v1.16b, v16.16b
|
|
str q0, [x1], #16 // write output
|
|
ldr q0, [x0], #16
|
|
str q1, [x1], #16
|
|
ldr q1, [x0], #16
|
|
eor v1.16b, v4.16b, v1.16b
|
|
ldr q4, [x0], #16
|
|
eor v2.16b, v2.16b, v4.16b
|
|
eor v0.16b, v6.16b, v0.16b
|
|
ldr q4, [x0], #16
|
|
str q0, [x1], #16
|
|
str q1, [x1], #16
|
|
eor v0.16b, v7.16b, v4.16b
|
|
ldr q1, [x0], #16
|
|
str q2, [x1], #16
|
|
ldr q2, [x0], #16
|
|
ldr q15, [x0], #16
|
|
str q0, [x1], #16
|
|
eor v0.16b, v5.16b, v2.16b
|
|
eor v1.16b, v3.16b, v1.16b
|
|
str q1, [x1], #16
|
|
str q0, [x1], #16
|
|
|
|
b .Lcbc_dec_loop
|
|
|
|
.Lcbc_dec_loop_finish:
|
|
adds x2, x2, #8
|
|
beq .Lcbc_dec_done
|
|
|
|
ldr q0, [x0], #16 // load input
|
|
cmp x2, #2
|
|
blo .Lcbc_dec_one
|
|
ldr q1, [x0], #16
|
|
mov x9, sp // pass the key
|
|
mov x10, x15
|
|
beq .Lcbc_dec_two
|
|
ldr q2, [x0], #16
|
|
cmp x2, #4
|
|
blo .Lcbc_dec_three
|
|
ldr q3, [x0], #16
|
|
beq .Lcbc_dec_four
|
|
ldr q4, [x0], #16
|
|
cmp x2, #6
|
|
blo .Lcbc_dec_five
|
|
ldr q5, [x0], #16
|
|
beq .Lcbc_dec_six
|
|
ldr q6, [x0], #-6*16
|
|
|
|
bl _bsaes_decrypt8
|
|
|
|
ldr q5, [x0], #16 // reload input
|
|
eor v0.16b, v0.16b, v15.16b // ^= IV
|
|
ldr q8, [x0], #16
|
|
ldr q9, [x0], #16
|
|
ldr q10, [x0], #16
|
|
str q0, [x1], #16 // write output
|
|
ldr q0, [x0], #16
|
|
eor v1.16b, v1.16b, v5.16b
|
|
ldr q5, [x0], #16
|
|
eor v6.16b, v6.16b, v8.16b
|
|
ldr q15, [x0]
|
|
eor v4.16b, v4.16b, v9.16b
|
|
eor v2.16b, v2.16b, v10.16b
|
|
str q1, [x1], #16
|
|
eor v0.16b, v7.16b, v0.16b
|
|
str q6, [x1], #16
|
|
eor v1.16b, v3.16b, v5.16b
|
|
str q4, [x1], #16
|
|
str q2, [x1], #16
|
|
str q0, [x1], #16
|
|
str q1, [x1]
|
|
b .Lcbc_dec_done
|
|
.align 4
|
|
.Lcbc_dec_six:
|
|
sub x0, x0, #0x60
|
|
bl _bsaes_decrypt8
|
|
ldr q3, [x0], #16 // reload input
|
|
eor v0.16b, v0.16b, v15.16b // ^= IV
|
|
ldr q5, [x0], #16
|
|
ldr q8, [x0], #16
|
|
ldr q9, [x0], #16
|
|
str q0, [x1], #16 // write output
|
|
ldr q0, [x0], #16
|
|
eor v1.16b, v1.16b, v3.16b
|
|
ldr q15, [x0]
|
|
eor v3.16b, v6.16b, v5.16b
|
|
eor v4.16b, v4.16b, v8.16b
|
|
eor v2.16b, v2.16b, v9.16b
|
|
str q1, [x1], #16
|
|
eor v0.16b, v7.16b, v0.16b
|
|
str q3, [x1], #16
|
|
str q4, [x1], #16
|
|
str q2, [x1], #16
|
|
str q0, [x1]
|
|
b .Lcbc_dec_done
|
|
.align 4
|
|
.Lcbc_dec_five:
|
|
sub x0, x0, #0x50
|
|
bl _bsaes_decrypt8
|
|
ldr q3, [x0], #16 // reload input
|
|
eor v0.16b, v0.16b, v15.16b // ^= IV
|
|
ldr q5, [x0], #16
|
|
ldr q7, [x0], #16
|
|
ldr q8, [x0], #16
|
|
str q0, [x1], #16 // write output
|
|
ldr q15, [x0]
|
|
eor v0.16b, v1.16b, v3.16b
|
|
eor v1.16b, v6.16b, v5.16b
|
|
eor v3.16b, v4.16b, v7.16b
|
|
str q0, [x1], #16
|
|
eor v0.16b, v2.16b, v8.16b
|
|
str q1, [x1], #16
|
|
str q3, [x1], #16
|
|
str q0, [x1]
|
|
b .Lcbc_dec_done
|
|
.align 4
|
|
.Lcbc_dec_four:
|
|
sub x0, x0, #0x40
|
|
bl _bsaes_decrypt8
|
|
ldr q2, [x0], #16 // reload input
|
|
eor v0.16b, v0.16b, v15.16b // ^= IV
|
|
ldr q3, [x0], #16
|
|
ldr q5, [x0], #16
|
|
str q0, [x1], #16 // write output
|
|
ldr q15, [x0]
|
|
eor v0.16b, v1.16b, v2.16b
|
|
eor v1.16b, v6.16b, v3.16b
|
|
eor v2.16b, v4.16b, v5.16b
|
|
str q0, [x1], #16
|
|
str q1, [x1], #16
|
|
str q2, [x1]
|
|
b .Lcbc_dec_done
|
|
.align 4
|
|
.Lcbc_dec_three:
|
|
sub x0, x0, #0x30
|
|
bl _bsaes_decrypt8
|
|
ldr q2, [x0], #16 // reload input
|
|
eor v0.16b, v0.16b, v15.16b // ^= IV
|
|
ldr q3, [x0], #16
|
|
ldr q15, [x0]
|
|
str q0, [x1], #16 // write output
|
|
eor v0.16b, v1.16b, v2.16b
|
|
eor v1.16b, v6.16b, v3.16b
|
|
str q0, [x1], #16
|
|
str q1, [x1]
|
|
b .Lcbc_dec_done
|
|
.align 4
|
|
.Lcbc_dec_two:
|
|
sub x0, x0, #0x20
|
|
bl _bsaes_decrypt8
|
|
ldr q2, [x0], #16 // reload input
|
|
eor v0.16b, v0.16b, v15.16b // ^= IV
|
|
ldr q15, [x0]
|
|
str q0, [x1], #16 // write output
|
|
eor v0.16b, v1.16b, v2.16b
|
|
str q0, [x1]
|
|
b .Lcbc_dec_done
|
|
.align 4
|
|
.Lcbc_dec_one:
|
|
sub x0, x0, #0x10
|
|
stp x1, x4, [sp, #-32]!
|
|
str x14, [sp, #16]
|
|
mov v8.16b, v15.16b
|
|
mov v15.16b, v0.16b
|
|
mov x2, x3
|
|
bl AES_decrypt
|
|
ldr x14, [sp, #16]
|
|
ldp x1, x4, [sp], #32
|
|
ldr q0, [x1] // load result
|
|
eor v0.16b, v0.16b, v8.16b // ^= IV
|
|
str q0, [x1] // write output
|
|
|
|
.align 4
|
|
.Lcbc_dec_done:
|
|
movi v0.16b, #0
|
|
movi v1.16b, #0
|
|
.Lcbc_dec_bzero:// wipe key schedule [if any]
|
|
stp q0, q1, [sp], #32
|
|
cmp sp, x14
|
|
bne .Lcbc_dec_bzero
|
|
str q15, [x4] // return IV
|
|
ldp d8, d9, [sp, #16]
|
|
ldp d10, d15, [sp, #32]
|
|
ldp x29, x30, [sp], #48
|
|
ret
|
|
.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt

.globl ossl_bsaes_ctr32_encrypt_blocks
.type ossl_bsaes_ctr32_encrypt_blocks,%function
.align 4
// On entry:
// x0 -> input text (whole 16-byte blocks)
// x1 -> output text (whole 16-byte blocks)
// x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
// x3 -> key
// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
// On exit:
// Output text filled in
// No output registers, usual AAPCS64 register preservation
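// A C-level view consistent with the register usage above (a sketch; the
// authoritative declaration lives in the C headers; x2 is a block count):
//   void ossl_bsaes_ctr32_encrypt_blocks(const unsigned char *in,
//                                        unsigned char *out, size_t blocks,
//                                        const AES_KEY *key,
//                                        const unsigned char ivec[16]);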
ossl_bsaes_ctr32_encrypt_blocks:
|
|
|
|
cmp x2, #8 // use plain AES for
|
|
blo .Lctr_enc_short // small sizes
|
|
|
|
stp x29, x30, [sp, #-80]!
|
|
stp d8, d9, [sp, #16]
|
|
stp d10, d11, [sp, #32]
|
|
stp d12, d13, [sp, #48]
|
|
stp d14, d15, [sp, #64]
|
|
|
|
ldr w15, [x3, #240] // get # of rounds
|
|
mov x14, sp
|
|
|
|
// allocate the key schedule on the stack
|
|
add x17, sp, #96
|
|
sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
|
|
|
|
// populate the key schedule
|
|
mov x9, x3 // pass key
|
|
mov x10, x15 // pass # of rounds
|
|
mov sp, x17 // sp is sp
|
|
bl _bsaes_key_convert
|
|
eor v7.16b, v7.16b, v15.16b // fix up last round key
|
|
str q7, [x17] // save last round key
|
|
|
|
ldr q0, [x4] // load counter
|
|
add x13, x11, #.LREVM0SR-.LM0_bigendian
|
|
ldr q4, [sp] // load round0 key
|
|
|
|
movi v8.4s, #1 // compose 1<<96
|
|
movi v9.16b, #0
|
|
rev32 v15.16b, v0.16b
|
|
rev32 v0.16b, v0.16b
|
|
ext v11.16b, v9.16b, v8.16b, #4
|
|
rev32 v4.16b, v4.16b
|
|
add v12.4s, v11.4s, v11.4s // compose 2<<96
|
|
str q4, [sp] // save adjusted round0 key
|
|
add v13.4s, v11.4s, v12.4s // compose 3<<96
|
|
add v14.4s, v12.4s, v12.4s // compose 4<<96
|
|
b .Lctr_enc_loop
|
|
|
|
.align 4
|
|
.Lctr_enc_loop:
|
|
// Intermix prologue from _bsaes_encrypt8 to use the opportunity
|
|
// to flip byte order in 32-bit counter
|
|
|
|
add v1.4s, v15.4s, v11.4s // +1
|
|
add x9, sp, #0x10 // pass next round key
|
|
add v2.4s, v15.4s, v12.4s // +2
|
|
ldr q9, [x13] // .LREVM0SR
|
|
ldr q8, [sp] // load round0 key
|
|
add v3.4s, v15.4s, v13.4s // +3
|
|
mov x10, x15 // pass rounds
|
|
sub x11, x13, #.LREVM0SR-.LSR // pass constants
|
|
add v6.4s, v2.4s, v14.4s
|
|
add v4.4s, v15.4s, v14.4s // +4
|
|
add v7.4s, v3.4s, v14.4s
|
|
add v15.4s, v4.4s, v14.4s // next counter
|
|
add v5.4s, v1.4s, v14.4s
|
|
|
|
bl _bsaes_encrypt8_alt
|
|
|
|
subs x2, x2, #8
|
|
blo .Lctr_enc_loop_done
|
|
|
|
ldr q16, [x0], #16
|
|
ldr q17, [x0], #16
|
|
eor v1.16b, v1.16b, v17.16b
|
|
ldr q17, [x0], #16
|
|
eor v0.16b, v0.16b, v16.16b
|
|
eor v4.16b, v4.16b, v17.16b
|
|
str q0, [x1], #16
|
|
ldr q16, [x0], #16
|
|
str q1, [x1], #16
|
|
mov v0.16b, v15.16b
|
|
str q4, [x1], #16
|
|
ldr q1, [x0], #16
|
|
eor v4.16b, v6.16b, v16.16b
|
|
eor v1.16b, v3.16b, v1.16b
|
|
ldr q3, [x0], #16
|
|
eor v3.16b, v7.16b, v3.16b
|
|
ldr q6, [x0], #16
|
|
eor v2.16b, v2.16b, v6.16b
|
|
ldr q6, [x0], #16
|
|
eor v5.16b, v5.16b, v6.16b
|
|
str q4, [x1], #16
|
|
str q1, [x1], #16
|
|
str q3, [x1], #16
|
|
str q2, [x1], #16
|
|
str q5, [x1], #16
|
|
|
|
bne .Lctr_enc_loop
|
|
b .Lctr_enc_done
|
|
|
|
.align 4
|
|
.Lctr_enc_loop_done:
|
|
add x2, x2, #8
|
|
ldr q16, [x0], #16 // load input
|
|
eor v0.16b, v0.16b, v16.16b
|
|
str q0, [x1], #16 // write output
|
|
cmp x2, #2
|
|
blo .Lctr_enc_done
|
|
ldr q17, [x0], #16
|
|
eor v1.16b, v1.16b, v17.16b
|
|
str q1, [x1], #16
|
|
beq .Lctr_enc_done
|
|
ldr q18, [x0], #16
|
|
eor v4.16b, v4.16b, v18.16b
|
|
str q4, [x1], #16
|
|
cmp x2, #4
|
|
blo .Lctr_enc_done
|
|
ldr q19, [x0], #16
|
|
eor v6.16b, v6.16b, v19.16b
|
|
str q6, [x1], #16
|
|
beq .Lctr_enc_done
|
|
ldr q20, [x0], #16
|
|
eor v3.16b, v3.16b, v20.16b
|
|
str q3, [x1], #16
|
|
cmp x2, #6
|
|
blo .Lctr_enc_done
|
|
ldr q21, [x0], #16
|
|
eor v7.16b, v7.16b, v21.16b
|
|
str q7, [x1], #16
|
|
beq .Lctr_enc_done
|
|
ldr q22, [x0]
|
|
eor v2.16b, v2.16b, v22.16b
|
|
str q2, [x1], #16
|
|
|
|
.Lctr_enc_done:
|
|
movi v0.16b, #0
|
|
movi v1.16b, #0
|
|
.Lctr_enc_bzero: // wipe key schedule [if any]
|
|
stp q0, q1, [sp], #32
|
|
cmp sp, x14
|
|
bne .Lctr_enc_bzero
|
|
|
|
ldp d8, d9, [sp, #16]
|
|
ldp d10, d11, [sp, #32]
|
|
ldp d12, d13, [sp, #48]
|
|
ldp d14, d15, [sp, #64]
|
|
ldp x29, x30, [sp], #80
|
|
ret
|
|
|
|
.Lctr_enc_short:
|
|
stp x29, x30, [sp, #-96]!
|
|
stp x19, x20, [sp, #16]
|
|
stp x21, x22, [sp, #32]
|
|
str x23, [sp, #48]
|
|
|
|
mov x19, x0 // copy arguments
|
|
mov x20, x1
|
|
mov x21, x2
|
|
mov x22, x3
|
|
ldr w23, [x4, #12] // load counter .LSW
|
|
ldr q1, [x4] // load whole counter value
|
|
#ifdef __AARCH64EL__
|
|
rev w23, w23
|
|
#endif
|
|
str q1, [sp, #80] // copy counter value
|
|
|
|
.Lctr_enc_short_loop:
|
|
add x0, sp, #80 // input counter value
|
|
add x1, sp, #64 // output on the stack
|
|
mov x2, x22 // key
|
|
|
|
bl AES_encrypt
|
|
|
|
ldr q0, [x19], #16 // load input
|
|
ldr q1, [sp, #64] // load encrypted counter
|
|
add x23, x23, #1
|
|
#ifdef __AARCH64EL__
|
|
rev w0, w23
|
|
str w0, [sp, #80+12] // next counter value
|
|
#else
|
|
str w23, [sp, #80+12] // next counter value
|
|
#endif
|
|
eor v0.16b, v0.16b, v1.16b
|
|
str q0, [x20], #16 // store output
|
|
subs x21, x21, #1
|
|
bne .Lctr_enc_short_loop
|
|
|
|
movi v0.16b, #0
|
|
movi v1.16b, #0
|
|
stp q0, q1, [sp, #64]
|
|
|
|
ldr x23, [sp, #48]
|
|
ldp x21, x22, [sp, #32]
|
|
ldp x19, x20, [sp, #16]
|
|
ldp x29, x30, [sp], #96
|
|
ret
|
|
.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks

.globl ossl_bsaes_xts_encrypt
.type ossl_bsaes_xts_encrypt,%function
.align 4
// On entry:
// x0 -> input plaintext
// x1 -> output ciphertext
// x2 = length of text in bytes (must be at least 16)
// x3 -> key1 (used to encrypt the XORed plaintext blocks)
// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
// x5 -> 16-byte initial vector (typically, sector number)
// On exit:
// Output ciphertext filled in
// No output registers, usual AAPCS64 register preservation
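// A C-level view consistent with the register usage above (a sketch; the
// authoritative declaration lives in the C headers):
//   void ossl_bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
//                               size_t len, const AES_KEY *key1,
//                               const AES_KEY *key2,
//                               const unsigned char iv[16]);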
ossl_bsaes_xts_encrypt:
// Stack layout:
// sp ->
//        nrounds*128-96 bytes: key schedule
// x19 ->
//        16 bytes: frame record
//        4*16 bytes: tweak storage across _bsaes_encrypt8
//        6*8 bytes: storage for 5 callee-saved general-purpose registers
//        8*8 bytes: storage for 8 callee-saved SIMD registers
stp x29, x30, [sp, #-192]!
|
|
stp x19, x20, [sp, #80]
|
|
stp x21, x22, [sp, #96]
|
|
str x23, [sp, #112]
|
|
stp d8, d9, [sp, #128]
|
|
stp d10, d11, [sp, #144]
|
|
stp d12, d13, [sp, #160]
|
|
stp d14, d15, [sp, #176]
|
|
|
|
mov x19, sp
|
|
mov x20, x0
|
|
mov x21, x1
|
|
mov x22, x2
|
|
mov x23, x3
|
|
|
|
// generate initial tweak
|
|
sub sp, sp, #16
|
|
mov x0, x5 // iv[]
|
|
mov x1, sp
|
|
mov x2, x4 // key2
|
|
bl AES_encrypt
|
|
ldr q11, [sp], #16
|
|
|
|
ldr w1, [x23, #240] // get # of rounds
|
|
// allocate the key schedule on the stack
|
|
add x17, sp, #96
|
|
sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes
|
|
|
|
// populate the key schedule
|
|
mov x9, x23 // pass key
|
|
mov x10, x1 // pass # of rounds
|
|
mov sp, x17
|
|
bl _bsaes_key_convert
|
|
eor v15.16b, v15.16b, v7.16b // fix up last round key
|
|
str q15, [x17] // save last round key
|
|
|
|
subs x22, x22, #0x80
|
|
blo .Lxts_enc_short
|
|
b .Lxts_enc_loop
|
|
|
|
.align 4
|
|
.Lxts_enc_loop:
|
|
ldr q8, .Lxts_magic
|
|
mov x10, x1 // pass rounds
|
|
add x2, x19, #16
|
|
ldr q0, [x20], #16
|
|
sshr v1.2d, v11.2d, #63
|
|
mov x9, sp // pass key schedule
|
|
ldr q6, .Lxts_magic+16
|
|
add v2.2d, v11.2d, v11.2d
|
|
cmtst v3.2d, v11.2d, v6.2d
|
|
and v1.16b, v1.16b, v8.16b
|
|
ext v1.16b, v1.16b, v1.16b, #8
|
|
and v3.16b, v3.16b, v8.16b
|
|
ldr q4, [x20], #16
|
|
eor v12.16b, v2.16b, v1.16b
|
|
eor v1.16b, v4.16b, v12.16b
|
|
eor v0.16b, v0.16b, v11.16b
|
|
cmtst v2.2d, v12.2d, v6.2d
|
|
add v4.2d, v12.2d, v12.2d
|
|
add x0, x19, #16
|
|
ext v3.16b, v3.16b, v3.16b, #8
|
|
and v2.16b, v2.16b, v8.16b
|
|
eor v13.16b, v4.16b, v3.16b
|
|
ldr q3, [x20], #16
|
|
ext v4.16b, v2.16b, v2.16b, #8
|
|
eor v2.16b, v3.16b, v13.16b
|
|
ldr q3, [x20], #16
|
|
add v5.2d, v13.2d, v13.2d
|
|
cmtst v7.2d, v13.2d, v6.2d
|
|
and v7.16b, v7.16b, v8.16b
|
|
ldr q9, [x20], #16
|
|
ext v7.16b, v7.16b, v7.16b, #8
|
|
ldr q10, [x20], #16
|
|
eor v14.16b, v5.16b, v4.16b
|
|
ldr q16, [x20], #16
|
|
add v4.2d, v14.2d, v14.2d
|
|
eor v3.16b, v3.16b, v14.16b
|
|
eor v15.16b, v4.16b, v7.16b
|
|
add v5.2d, v15.2d, v15.2d
|
|
ldr q7, [x20], #16
|
|
cmtst v4.2d, v14.2d, v6.2d
|
|
and v17.16b, v4.16b, v8.16b
|
|
cmtst v18.2d, v15.2d, v6.2d
|
|
eor v4.16b, v9.16b, v15.16b
|
|
ext v9.16b, v17.16b, v17.16b, #8
|
|
eor v9.16b, v5.16b, v9.16b
|
|
add v17.2d, v9.2d, v9.2d
|
|
and v18.16b, v18.16b, v8.16b
|
|
eor v5.16b, v10.16b, v9.16b
|
|
str q9, [x2], #16
|
|
ext v10.16b, v18.16b, v18.16b, #8
|
|
cmtst v9.2d, v9.2d, v6.2d
|
|
and v9.16b, v9.16b, v8.16b
|
|
eor v10.16b, v17.16b, v10.16b
|
|
cmtst v17.2d, v10.2d, v6.2d
|
|
eor v6.16b, v16.16b, v10.16b
|
|
str q10, [x2], #16
|
|
ext v9.16b, v9.16b, v9.16b, #8
|
|
add v10.2d, v10.2d, v10.2d
|
|
eor v9.16b, v10.16b, v9.16b
|
|
str q9, [x2], #16
|
|
eor v7.16b, v7.16b, v9.16b
|
|
add v9.2d, v9.2d, v9.2d
|
|
and v8.16b, v17.16b, v8.16b
|
|
ext v8.16b, v8.16b, v8.16b, #8
|
|
eor v8.16b, v9.16b, v8.16b
|
|
str q8, [x2] // next round tweak
|
|
|
|
bl _bsaes_encrypt8
|
|
|
|
ldr q8, [x0], #16
|
|
eor v0.16b, v0.16b, v11.16b
|
|
eor v1.16b, v1.16b, v12.16b
|
|
ldr q9, [x0], #16
|
|
eor v4.16b, v4.16b, v13.16b
|
|
eor v6.16b, v6.16b, v14.16b
|
|
ldr q10, [x0], #16
|
|
eor v3.16b, v3.16b, v15.16b
|
|
subs x22, x22, #0x80
|
|
str q0, [x21], #16
|
|
ldr q11, [x0] // next round tweak
|
|
str q1, [x21], #16
|
|
eor v0.16b, v7.16b, v8.16b
|
|
eor v1.16b, v2.16b, v9.16b
|
|
str q4, [x21], #16
|
|
eor v2.16b, v5.16b, v10.16b
|
|
str q6, [x21], #16
|
|
str q3, [x21], #16
|
|
str q0, [x21], #16
|
|
str q1, [x21], #16
|
|
str q2, [x21], #16
|
|
bpl .Lxts_enc_loop
|
|
|
|
.Lxts_enc_short:
|
|
adds x22, x22, #0x70
|
|
bmi .Lxts_enc_done
|
|
|
|
ldr q8, .Lxts_magic
|
|
sshr v1.2d, v11.2d, #63
|
|
add v2.2d, v11.2d, v11.2d
|
|
ldr q9, .Lxts_magic+16
|
|
subs x22, x22, #0x10
|
|
ldr q0, [x20], #16
|
|
and v1.16b, v1.16b, v8.16b
|
|
cmtst v3.2d, v11.2d, v9.2d
|
|
ext v1.16b, v1.16b, v1.16b, #8
|
|
and v3.16b, v3.16b, v8.16b
|
|
eor v12.16b, v2.16b, v1.16b
|
|
ext v1.16b, v3.16b, v3.16b, #8
|
|
add v2.2d, v12.2d, v12.2d
|
|
cmtst v3.2d, v12.2d, v9.2d
|
|
eor v13.16b, v2.16b, v1.16b
|
|
and v22.16b, v3.16b, v8.16b
|
|
bmi .Lxts_enc_1
|
|
|
|
ext v2.16b, v22.16b, v22.16b, #8
|
|
add v3.2d, v13.2d, v13.2d
|
|
ldr q1, [x20], #16
|
|
cmtst v4.2d, v13.2d, v9.2d
|
|
subs x22, x22, #0x10
|
|
eor v14.16b, v3.16b, v2.16b
|
|
and v23.16b, v4.16b, v8.16b
|
|
bmi .Lxts_enc_2
|
|
|
|
ext v3.16b, v23.16b, v23.16b, #8
|
|
add v4.2d, v14.2d, v14.2d
|
|
ldr q2, [x20], #16
|
|
cmtst v5.2d, v14.2d, v9.2d
|
|
eor v0.16b, v0.16b, v11.16b
|
|
subs x22, x22, #0x10
|
|
eor v15.16b, v4.16b, v3.16b
|
|
and v24.16b, v5.16b, v8.16b
|
|
bmi .Lxts_enc_3
|
|
|
|
ext v4.16b, v24.16b, v24.16b, #8
|
|
add v5.2d, v15.2d, v15.2d
|
|
ldr q3, [x20], #16
|
|
cmtst v6.2d, v15.2d, v9.2d
|
|
eor v1.16b, v1.16b, v12.16b
|
|
subs x22, x22, #0x10
|
|
eor v16.16b, v5.16b, v4.16b
|
|
and v25.16b, v6.16b, v8.16b
|
|
bmi .Lxts_enc_4
|
|
|
|
ext v5.16b, v25.16b, v25.16b, #8
|
|
add v6.2d, v16.2d, v16.2d
|
|
add x0, x19, #16
|
|
cmtst v7.2d, v16.2d, v9.2d
|
|
ldr q4, [x20], #16
|
|
eor v2.16b, v2.16b, v13.16b
|
|
str q16, [x0], #16
|
|
subs x22, x22, #0x10
|
|
eor v17.16b, v6.16b, v5.16b
|
|
and v26.16b, v7.16b, v8.16b
|
|
bmi .Lxts_enc_5
|
|
|
|
ext v7.16b, v26.16b, v26.16b, #8
|
|
add v18.2d, v17.2d, v17.2d
|
|
ldr q5, [x20], #16
|
|
eor v3.16b, v3.16b, v14.16b
|
|
str q17, [x0], #16
|
|
subs x22, x22, #0x10
|
|
eor v18.16b, v18.16b, v7.16b
|
|
bmi .Lxts_enc_6
|
|
|
|
ldr q6, [x20], #16
|
|
eor v4.16b, v4.16b, v15.16b
|
|
eor v5.16b, v5.16b, v16.16b
|
|
str q18, [x0] // next round tweak
|
|
mov x9, sp // pass key schedule
|
|
mov x10, x1
|
|
add x0, x19, #16
|
|
sub x22, x22, #0x10
|
|
eor v6.16b, v6.16b, v17.16b
|
|
|
|
bl _bsaes_encrypt8
|
|
|
|
ldr q16, [x0], #16
|
|
eor v0.16b, v0.16b, v11.16b
|
|
eor v1.16b, v1.16b, v12.16b
|
|
ldr q17, [x0], #16
|
|
eor v4.16b, v4.16b, v13.16b
|
|
eor v6.16b, v6.16b, v14.16b
|
|
eor v3.16b, v3.16b, v15.16b
|
|
ldr q11, [x0] // next round tweak
|
|
str q0, [x21], #16
|
|
str q1, [x21], #16
|
|
eor v0.16b, v7.16b, v16.16b
|
|
eor v1.16b, v2.16b, v17.16b
|
|
str q4, [x21], #16
|
|
str q6, [x21], #16
|
|
str q3, [x21], #16
|
|
str q0, [x21], #16
|
|
str q1, [x21], #16
|
|
b .Lxts_enc_done
|
|
|
|
.align 4
|
|
.Lxts_enc_6:
|
|
eor v4.16b, v4.16b, v15.16b
|
|
eor v5.16b, v5.16b, v16.16b
|
|
mov x9, sp // pass key schedule
|
|
mov x10, x1 // pass rounds
|
|
add x0, x19, #16
|
|
|
|
bl _bsaes_encrypt8
|
|
|
|
ldr q16, [x0], #16
|
|
eor v0.16b, v0.16b, v11.16b
|
|
eor v1.16b, v1.16b, v12.16b
|
|
eor v4.16b, v4.16b, v13.16b
|
|
eor v6.16b, v6.16b, v14.16b
|
|
ldr q11, [x0] // next round tweak
|
|
eor v3.16b, v3.16b, v15.16b
|
|
str q0, [x21], #16
|
|
str q1, [x21], #16
|
|
eor v0.16b, v7.16b, v16.16b
|
|
str q4, [x21], #16
|
|
str q6, [x21], #16
|
|
str q3, [x21], #16
|
|
str q0, [x21], #16
|
|
b .Lxts_enc_done
|
|
|
|
.align 4
|
|
.Lxts_enc_5:
|
|
eor v3.16b, v3.16b, v14.16b
|
|
eor v4.16b, v4.16b, v15.16b
|
|
mov x9, sp // pass key schedule
|
|
mov x10, x1 // pass rounds
|
|
add x0, x19, #16
|
|
|
|
bl _bsaes_encrypt8
|
|
|
|
eor v0.16b, v0.16b, v11.16b
|
|
eor v1.16b, v1.16b, v12.16b
|
|
ldr q11, [x0] // next round tweak
|
|
eor v4.16b, v4.16b, v13.16b
|
|
eor v6.16b, v6.16b, v14.16b
|
|
eor v3.16b, v3.16b, v15.16b
|
|
str q0, [x21], #16
|
|
str q1, [x21], #16
|
|
str q4, [x21], #16
|
|
str q6, [x21], #16
|
|
str q3, [x21], #16
|
|
b .Lxts_enc_done
|
|
|
|
.align 4
|
|
.Lxts_enc_4:
|
|
eor v2.16b, v2.16b, v13.16b
|
|
eor v3.16b, v3.16b, v14.16b
|
|
mov x9, sp // pass key schedule
|
|
mov x10, x1 // pass rounds
|
|
add x0, x19, #16
|
|
|
|
bl _bsaes_encrypt8
|
|
|
|
eor v0.16b, v0.16b, v11.16b
|
|
eor v1.16b, v1.16b, v12.16b
|
|
eor v4.16b, v4.16b, v13.16b
|
|
eor v6.16b, v6.16b, v14.16b
|
|
mov v11.16b, v15.16b // next round tweak
|
|
str q0, [x21], #16
|
|
str q1, [x21], #16
|
|
str q4, [x21], #16
|
|
str q6, [x21], #16
|
|
b .Lxts_enc_done
|
|
|
|
.align 4
|
|
.Lxts_enc_3:
|
|
eor v1.16b, v1.16b, v12.16b
|
|
eor v2.16b, v2.16b, v13.16b
|
|
mov x9, sp // pass key schedule
|
|
mov x10, x1 // pass rounds
|
|
add x0, x19, #16
|
|
|
|
bl _bsaes_encrypt8
|
|
|
|
eor v0.16b, v0.16b, v11.16b
|
|
eor v1.16b, v1.16b, v12.16b
|
|
eor v4.16b, v4.16b, v13.16b
|
|
mov v11.16b, v14.16b // next round tweak
|
|
str q0, [x21], #16
|
|
str q1, [x21], #16
|
|
str q4, [x21], #16
|
|
b .Lxts_enc_done
|
|
|
|
.align 4
|
|
.Lxts_enc_2:
|
|
eor v0.16b, v0.16b, v11.16b
|
|
eor v1.16b, v1.16b, v12.16b
|
|
mov x9, sp // pass key schedule
|
|
mov x10, x1 // pass rounds
|
|
add x0, x19, #16
|
|
|
|
bl _bsaes_encrypt8
|
|
|
|
eor v0.16b, v0.16b, v11.16b
|
|
eor v1.16b, v1.16b, v12.16b
|
|
mov v11.16b, v13.16b // next round tweak
|
|
str q0, [x21], #16
|
|
str q1, [x21], #16
|
|
b .Lxts_enc_done
|
|
|
|
.align 4
|
|
.Lxts_enc_1:
|
|
eor v0.16b, v0.16b, v11.16b
|
|
sub x0, sp, #16
|
|
sub x1, sp, #16
|
|
mov x2, x23
|
|
mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
|
|
mov v14.d[0], v12.d[1]
|
|
str q0, [sp, #-16]!
|
|
|
|
bl AES_encrypt
|
|
|
|
ldr q0, [sp], #16
|
|
trn1 v13.2d, v11.2d, v13.2d
|
|
trn1 v11.2d, v12.2d, v14.2d // next round tweak
|
|
eor v0.16b, v0.16b, v13.16b
|
|
str q0, [x21], #16
|
|
|
|
.Lxts_enc_done:
|
|
adds x22, x22, #0x10
|
|
beq .Lxts_enc_ret
|
|
|
|
sub x6, x21, #0x10
|
|
// Penultimate plaintext block produces final ciphertext part-block
|
|
// plus remaining part of final plaintext block. Move ciphertext part
|
|
// to final position and re-use penultimate ciphertext block buffer to
|
|
// construct final plaintext block
|
|
.Lxts_enc_steal:
|
|
ldrb w0, [x20], #1
|
|
ldrb w1, [x21, #-0x10]
|
|
strb w0, [x21, #-0x10]
|
|
strb w1, [x21], #1
|
|
|
|
subs x22, x22, #1
|
|
bhi .Lxts_enc_steal
|
|
|
|
// Finally encrypt the penultimate ciphertext block using the
|
|
// last tweak
|
|
ldr q0, [x6]
|
|
eor v0.16b, v0.16b, v11.16b
|
|
str q0, [sp, #-16]!
|
|
mov x0, sp
|
|
mov x1, sp
|
|
mov x2, x23
|
|
mov x21, x6
|
|
mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
|
|
|
|
bl AES_encrypt
|
|
|
|
trn1 v11.2d, v11.2d, v13.2d
|
|
ldr q0, [sp], #16
|
|
eor v0.16b, v0.16b, v11.16b
|
|
str q0, [x21]
|
|
|
|
.Lxts_enc_ret:
|
|
|
|
movi v0.16b, #0
|
|
movi v1.16b, #0
|
|
.Lxts_enc_bzero: // wipe key schedule
|
|
stp q0, q1, [sp], #32
|
|
cmp sp, x19
|
|
bne .Lxts_enc_bzero
|
|
|
|
ldp x19, x20, [sp, #80]
|
|
ldp x21, x22, [sp, #96]
|
|
ldr x23, [sp, #112]
|
|
ldp d8, d9, [sp, #128]
|
|
ldp d10, d11, [sp, #144]
|
|
ldp d12, d13, [sp, #160]
|
|
ldp d14, d15, [sp, #176]
|
|
ldp x29, x30, [sp], #192
|
|
ret
|
|
.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

// The assembler doesn't seem capable of de-duplicating these when expressed
// using `ldr qd,=` syntax, so assign a symbolic address
.align 5
.Lxts_magic:
.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000
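// These constants drive the XTS tweak update, i.e. multiplication by x in
// GF(2^128) with reduction polynomial x^128 + x^7 + x^2 + x + 1: the first
// 16 bytes {1, 0x87} are the per-lane carry values folded in after each
// doubling, and the 0x4000000000000000 lanes are used with cmtst to detect
// the bit that becomes the carry on the following doubling.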

.globl ossl_bsaes_xts_decrypt
.type ossl_bsaes_xts_decrypt,%function
.align 4
// On entry:
// x0 -> input ciphertext
// x1 -> output plaintext
// x2 = length of text in bytes (must be at least 16)
// x3 -> key1 (used to decrypt the XORed ciphertext blocks)
// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
// x5 -> 16-byte initial vector (typically, sector number)
// On exit:
// Output plaintext filled in
// No output registers, usual AAPCS64 register preservation
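// A C-level view consistent with the register usage above (a sketch; the
// authoritative declaration lives in the C headers):
//   void ossl_bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
//                               size_t len, const AES_KEY *key1,
//                               const AES_KEY *key2,
//                               const unsigned char iv[16]);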
ossl_bsaes_xts_decrypt:
// Stack layout:
// sp ->
//        nrounds*128-96 bytes: key schedule
// x19 ->
//        16 bytes: frame record
//        4*16 bytes: tweak storage across _bsaes_decrypt8
//        6*8 bytes: storage for 5 callee-saved general-purpose registers
//        8*8 bytes: storage for 8 callee-saved SIMD registers
stp x29, x30, [sp, #-192]!
|
|
stp x19, x20, [sp, #80]
|
|
stp x21, x22, [sp, #96]
|
|
str x23, [sp, #112]
|
|
stp d8, d9, [sp, #128]
|
|
stp d10, d11, [sp, #144]
|
|
stp d12, d13, [sp, #160]
|
|
stp d14, d15, [sp, #176]
|
|
|
|
mov x19, sp
|
|
mov x20, x0
|
|
mov x21, x1
|
|
mov x22, x2
|
|
mov x23, x3
|
|
|
|
// generate initial tweak
|
|
sub sp, sp, #16
|
|
mov x0, x5 // iv[]
|
|
mov x1, sp
|
|
mov x2, x4 // key2
|
|
bl AES_encrypt
|
|
ldr q11, [sp], #16
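	// v11 now holds the running tweak (the initial vector encrypted under key2)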

	ldr w1, [x23, #240] // get # of rounds
	// allocate the key schedule on the stack
	add x17, sp, #96
	sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes

	// populate the key schedule
	mov x9, x23 // pass key
	mov x10, x1 // pass # of rounds
	mov sp, x17
	bl _bsaes_key_convert
	ldr q6, [sp]
	str q15, [x17] // save last round key
	eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
	str q6, [sp]

	sub x30, x22, #0x10
	tst x22, #0xf // if not multiple of 16
	csel x22, x30, x22, ne // subtract another 16 bytes
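	// the block held back here is handled by the ciphertext-stealing code
	// at .Lxts_dec_done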
	subs x22, x22, #0x80

	blo .Lxts_dec_short
	b .Lxts_dec_loop

.align 4
.Lxts_dec_loop:
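	// Process eight blocks per iteration: starting from the current tweak in
	// v11, generate the further tweaks by repeated multiplication by x in
	// GF(2^128) (the first five stay in v11-v15, the rest are spilled to the
	// tweak area at x19+16), XOR each ciphertext block with its tweak, run
	// the bit-sliced _bsaes_decrypt8, then XOR the outputs with the same
	// tweaks again before storing the plaintext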
	ldr q8, .Lxts_magic
	mov x10, x1 // pass rounds
	add x2, x19, #16
	ldr q0, [x20], #16
	sshr v1.2d, v11.2d, #63
	mov x9, sp // pass key schedule
	ldr q6, .Lxts_magic+16
	add v2.2d, v11.2d, v11.2d
	cmtst v3.2d, v11.2d, v6.2d
	and v1.16b, v1.16b, v8.16b
	ext v1.16b, v1.16b, v1.16b, #8
	and v3.16b, v3.16b, v8.16b
	ldr q4, [x20], #16
	eor v12.16b, v2.16b, v1.16b
	eor v1.16b, v4.16b, v12.16b
	eor v0.16b, v0.16b, v11.16b
	cmtst v2.2d, v12.2d, v6.2d
	add v4.2d, v12.2d, v12.2d
	add x0, x19, #16
	ext v3.16b, v3.16b, v3.16b, #8
	and v2.16b, v2.16b, v8.16b
	eor v13.16b, v4.16b, v3.16b
	ldr q3, [x20], #16
	ext v4.16b, v2.16b, v2.16b, #8
	eor v2.16b, v3.16b, v13.16b
	ldr q3, [x20], #16
	add v5.2d, v13.2d, v13.2d
	cmtst v7.2d, v13.2d, v6.2d
	and v7.16b, v7.16b, v8.16b
	ldr q9, [x20], #16
	ext v7.16b, v7.16b, v7.16b, #8
	ldr q10, [x20], #16
	eor v14.16b, v5.16b, v4.16b
	ldr q16, [x20], #16
	add v4.2d, v14.2d, v14.2d
	eor v3.16b, v3.16b, v14.16b
	eor v15.16b, v4.16b, v7.16b
	add v5.2d, v15.2d, v15.2d
	ldr q7, [x20], #16
	cmtst v4.2d, v14.2d, v6.2d
	and v17.16b, v4.16b, v8.16b
	cmtst v18.2d, v15.2d, v6.2d
	eor v4.16b, v9.16b, v15.16b
	ext v9.16b, v17.16b, v17.16b, #8
	eor v9.16b, v5.16b, v9.16b
	add v17.2d, v9.2d, v9.2d
	and v18.16b, v18.16b, v8.16b
	eor v5.16b, v10.16b, v9.16b
	str q9, [x2], #16
	ext v10.16b, v18.16b, v18.16b, #8
	cmtst v9.2d, v9.2d, v6.2d
	and v9.16b, v9.16b, v8.16b
	eor v10.16b, v17.16b, v10.16b
	cmtst v17.2d, v10.2d, v6.2d
	eor v6.16b, v16.16b, v10.16b
	str q10, [x2], #16
	ext v9.16b, v9.16b, v9.16b, #8
	add v10.2d, v10.2d, v10.2d
	eor v9.16b, v10.16b, v9.16b
	str q9, [x2], #16
	eor v7.16b, v7.16b, v9.16b
	add v9.2d, v9.2d, v9.2d
	and v8.16b, v17.16b, v8.16b
	ext v8.16b, v8.16b, v8.16b, #8
	eor v8.16b, v9.16b, v8.16b
	str q8, [x2] // next round tweak

	bl _bsaes_decrypt8

	eor v6.16b, v6.16b, v13.16b
	eor v0.16b, v0.16b, v11.16b
	ldr q8, [x0], #16
	eor v7.16b, v7.16b, v8.16b
	str q0, [x21], #16
	eor v0.16b, v1.16b, v12.16b
	ldr q1, [x0], #16
	eor v1.16b, v3.16b, v1.16b
	subs x22, x22, #0x80
	eor v2.16b, v2.16b, v15.16b
	eor v3.16b, v4.16b, v14.16b
	ldr q4, [x0], #16
	str q0, [x21], #16
	ldr q11, [x0] // next round tweak
	eor v0.16b, v5.16b, v4.16b
	str q6, [x21], #16
	str q3, [x21], #16
	str q2, [x21], #16
	str q7, [x21], #16
	str q1, [x21], #16
	str q0, [x21], #16
	bpl .Lxts_dec_loop

.Lxts_dec_short:
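	// Fewer than eight full blocks remain: gather them one at a time,
	// generating one more tweak per block, then branch to the tail that
	// decrypts however many were collected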
	adds x22, x22, #0x70
	bmi .Lxts_dec_done

	ldr q8, .Lxts_magic
	sshr v1.2d, v11.2d, #63
	add v2.2d, v11.2d, v11.2d
	ldr q9, .Lxts_magic+16
	subs x22, x22, #0x10
	ldr q0, [x20], #16
	and v1.16b, v1.16b, v8.16b
	cmtst v3.2d, v11.2d, v9.2d
	ext v1.16b, v1.16b, v1.16b, #8
	and v3.16b, v3.16b, v8.16b
	eor v12.16b, v2.16b, v1.16b
	ext v1.16b, v3.16b, v3.16b, #8
	add v2.2d, v12.2d, v12.2d
	cmtst v3.2d, v12.2d, v9.2d
	eor v13.16b, v2.16b, v1.16b
	and v22.16b, v3.16b, v8.16b
	bmi .Lxts_dec_1

	ext v2.16b, v22.16b, v22.16b, #8
	add v3.2d, v13.2d, v13.2d
	ldr q1, [x20], #16
	cmtst v4.2d, v13.2d, v9.2d
	subs x22, x22, #0x10
	eor v14.16b, v3.16b, v2.16b
	and v23.16b, v4.16b, v8.16b
	bmi .Lxts_dec_2

	ext v3.16b, v23.16b, v23.16b, #8
	add v4.2d, v14.2d, v14.2d
	ldr q2, [x20], #16
	cmtst v5.2d, v14.2d, v9.2d
	eor v0.16b, v0.16b, v11.16b
	subs x22, x22, #0x10
	eor v15.16b, v4.16b, v3.16b
	and v24.16b, v5.16b, v8.16b
	bmi .Lxts_dec_3

	ext v4.16b, v24.16b, v24.16b, #8
	add v5.2d, v15.2d, v15.2d
	ldr q3, [x20], #16
	cmtst v6.2d, v15.2d, v9.2d
	eor v1.16b, v1.16b, v12.16b
	subs x22, x22, #0x10
	eor v16.16b, v5.16b, v4.16b
	and v25.16b, v6.16b, v8.16b
	bmi .Lxts_dec_4

	ext v5.16b, v25.16b, v25.16b, #8
	add v6.2d, v16.2d, v16.2d
	add x0, x19, #16
	cmtst v7.2d, v16.2d, v9.2d
	ldr q4, [x20], #16
	eor v2.16b, v2.16b, v13.16b
	str q16, [x0], #16
	subs x22, x22, #0x10
	eor v17.16b, v6.16b, v5.16b
	and v26.16b, v7.16b, v8.16b
	bmi .Lxts_dec_5

	ext v7.16b, v26.16b, v26.16b, #8
	add v18.2d, v17.2d, v17.2d
	ldr q5, [x20], #16
	eor v3.16b, v3.16b, v14.16b
	str q17, [x0], #16
	subs x22, x22, #0x10
	eor v18.16b, v18.16b, v7.16b
	bmi .Lxts_dec_6

	ldr q6, [x20], #16
	eor v4.16b, v4.16b, v15.16b
	eor v5.16b, v5.16b, v16.16b
	str q18, [x0] // next round tweak
	mov x9, sp // pass key schedule
	mov x10, x1
	add x0, x19, #16
	sub x22, x22, #0x10
	eor v6.16b, v6.16b, v17.16b

	bl _bsaes_decrypt8

	ldr q16, [x0], #16
	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	ldr q17, [x0], #16
	eor v6.16b, v6.16b, v13.16b
	eor v4.16b, v4.16b, v14.16b
	eor v2.16b, v2.16b, v15.16b
	ldr q11, [x0] // next round tweak
	str q0, [x21], #16
	str q1, [x21], #16
	eor v0.16b, v7.16b, v16.16b
	eor v1.16b, v3.16b, v17.16b
	str q6, [x21], #16
	str q4, [x21], #16
	str q2, [x21], #16
	str q0, [x21], #16
	str q1, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_6:
	eor v4.16b, v4.16b, v15.16b
	eor v5.16b, v5.16b, v16.16b
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16

	bl _bsaes_decrypt8

	ldr q16, [x0], #16
	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	eor v6.16b, v6.16b, v13.16b
	eor v4.16b, v4.16b, v14.16b
	ldr q11, [x0] // next round tweak
	eor v2.16b, v2.16b, v15.16b
	str q0, [x21], #16
	str q1, [x21], #16
	eor v0.16b, v7.16b, v16.16b
	str q6, [x21], #16
	str q4, [x21], #16
	str q2, [x21], #16
	str q0, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_5:
	eor v3.16b, v3.16b, v14.16b
	eor v4.16b, v4.16b, v15.16b
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16

	bl _bsaes_decrypt8

	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	ldr q11, [x0] // next round tweak
	eor v6.16b, v6.16b, v13.16b
	eor v4.16b, v4.16b, v14.16b
	eor v2.16b, v2.16b, v15.16b
	str q0, [x21], #16
	str q1, [x21], #16
	str q6, [x21], #16
	str q4, [x21], #16
	str q2, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_4:
	eor v2.16b, v2.16b, v13.16b
	eor v3.16b, v3.16b, v14.16b
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16

	bl _bsaes_decrypt8

	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	eor v6.16b, v6.16b, v13.16b
	eor v4.16b, v4.16b, v14.16b
	mov v11.16b, v15.16b // next round tweak
	str q0, [x21], #16
	str q1, [x21], #16
	str q6, [x21], #16
	str q4, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_3:
	eor v1.16b, v1.16b, v12.16b
	eor v2.16b, v2.16b, v13.16b
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16

	bl _bsaes_decrypt8

	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	eor v6.16b, v6.16b, v13.16b
	mov v11.16b, v14.16b // next round tweak
	str q0, [x21], #16
	str q1, [x21], #16
	str q6, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_2:
	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16

	bl _bsaes_decrypt8

	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	mov v11.16b, v13.16b // next round tweak
	str q0, [x21], #16
	str q1, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_1:
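	// Only one block to process: decrypt it in place on the stack with the
	// scalar AES_decrypt rather than going through the bit-sliced path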
	eor v0.16b, v0.16b, v11.16b
	sub x0, sp, #16
	sub x1, sp, #16
	mov x2, x23
	mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
	mov v14.d[0], v12.d[1]
	str q0, [sp, #-16]!

	bl AES_decrypt

	ldr q0, [sp], #16
	trn1 v13.2d, v11.2d, v13.2d
	trn1 v11.2d, v12.2d, v14.2d // next round tweak
	eor v0.16b, v0.16b, v13.16b
	str q0, [x21], #16

.Lxts_dec_done:
	adds x22, x22, #0x10
	beq .Lxts_dec_ret
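
	// For the stolen tail, decryption uses the tweaks in the opposite order
	// to encryption: the last full ciphertext block is decrypted with the
	// following tweak (computed below), and the block rebuilt from the
	// partial tail with the current tweak still in v11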

	// calculate one round of extra tweak for the stolen ciphertext
	ldr q8, .Lxts_magic
	sshr v6.2d, v11.2d, #63
	and v6.16b, v6.16b, v8.16b
	add v12.2d, v11.2d, v11.2d
	ext v6.16b, v6.16b, v6.16b, #8
	eor v12.16b, v12.16b, v6.16b

	// perform the final decryption with the last tweak value
	ldr q0, [x20], #16
	eor v0.16b, v0.16b, v12.16b
	str q0, [sp, #-16]!
	mov x0, sp
	mov x1, sp
	mov x2, x23
	mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
	mov v14.d[0], v12.d[1]

	bl AES_decrypt

	trn1 v12.2d, v12.2d, v14.2d
	trn1 v11.2d, v11.2d, v13.2d
	ldr q0, [sp], #16
	eor v0.16b, v0.16b, v12.16b
	str q0, [x21]

	mov x6, x21
	// Penultimate ciphertext block produces final plaintext part-block
	// plus remaining part of final ciphertext block. Move plaintext part
	// to final position and re-use penultimate plaintext block buffer to
	// construct final ciphertext block
.Lxts_dec_steal:
	ldrb w1, [x21]
	ldrb w0, [x20], #1
	strb w1, [x21, #0x10]
	strb w0, [x21], #1

	subs x22, x22, #1
	bhi .Lxts_dec_steal

	// Finally decrypt the penultimate plaintext block using the
	// penultimate tweak
	ldr q0, [x6]
	eor v0.16b, v0.16b, v11.16b
	str q0, [sp, #-16]!
	mov x0, sp
	mov x1, sp
	mov x2, x23
	mov x21, x6

	bl AES_decrypt

	trn1 v11.2d, v11.2d, v13.2d
	ldr q0, [sp], #16
	eor v0.16b, v0.16b, v11.16b
	str q0, [x21]

.Lxts_dec_ret:

	movi v0.16b, #0
	movi v1.16b, #0
.Lxts_dec_bzero: // wipe key schedule
	stp q0, q1, [sp], #32
	cmp sp, x19
	bne .Lxts_dec_bzero

	ldp x19, x20, [sp, #80]
	ldp x21, x22, [sp, #96]
	ldr x23, [sp, #112]
	ldp d8, d9, [sp, #128]
	ldp d10, d11, [sp, #144]
	ldp d12, d13, [sp, #160]
	ldp d14, d15, [sp, #176]
	ldp x29, x30, [sp], #192
	ret
.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt