mirror of
https://github.com/openssl/openssl.git
synced 2025-01-18 13:44:20 +08:00
32be631ca1
If one of the perlasm xlate drivers crashes, OpenSSL's build will currently swallow the error and silently truncate the output to however far the driver got. This will hopefully fail to build, but better to check such things. Handle this by checking for errors when closing STDOUT (which is a pipe to the xlate driver). Reviewed-by: Richard Levitte <levitte@openssl.org> Reviewed-by: Tim Hudson <tjh@openssl.org> Reviewed-by: Tomas Mraz <tmraz@fedoraproject.org> (Merged from https://github.com/openssl/openssl/pull/10883)
5723 lines
272 KiB
Perl
Executable File
5723 lines
272 KiB
Perl
Executable File
#! /usr/bin/env perl
|
|
# Copyright 2019 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
#
|
|
#========================================================================
|
|
# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
|
|
# derived from https://github.com/ARM-software/AArch64cryptolib, original
|
|
# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
|
|
# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
|
|
# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
|
|
#========================================================================
|
|
#
|
|
# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
|
|
#
|
|
# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
|
|
#
|
|
# ____________________________________________________
|
|
# | |
|
|
# | PRE |
|
|
# |____________________________________________________|
|
|
# | | | |
|
|
# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
|
|
# |________________|________________|__________________|
|
|
# | | | |
|
|
# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
|
|
# |________________|________________|__________________|
|
|
# | | | |
|
|
# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
|
|
# |________________|________________|__________________|
|
|
# | | | |
|
|
# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
|
|
# |________________|____(mostly)____|__________________|
|
|
# | |
|
|
# | MODULO |
|
|
# |____________________________________________________|
|
|
#
|
|
# PRE:
|
|
# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0
|
|
# EXT low_acc, low_acc, low_acc, #8
|
|
# EOR res_curr (4k+0), res_curr (4k+0), low_acc
|
|
#
|
|
# CTR block:
|
|
# Increment and byte reverse counter in scalar registers and transfer to SIMD registers
|
|
# REV ctr32, rev_ctr32
|
|
# ORR ctr64, constctr96_top32, ctr32, LSL #32
|
|
# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
|
|
# INS ctr_next.d[1], ctr64X
|
|
# ADD rev_ctr32, #1
|
|
#
|
|
# AES block:
|
|
# Do AES encryption/decryption on CTR block X and EOR it with input block X. A 256-bit key (round keys k0..k14) is used below as an example.
|
|
# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
|
|
# Given we are very constrained in our ASIMD registers this is quite important
|
|
#
|
|
# Encrypt:
|
|
# LDR input_low, [ input_ptr ], #8
|
|
# LDR input_high, [ input_ptr ], #8
|
|
# EOR input_low, k14_low
|
|
# EOR input_high, k14_high
|
|
# INS res_curr.d[0], input_low
|
|
# INS res_curr.d[1], input_high
|
|
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k13
|
|
# EOR res_curr, res_curr, ctr_curr
|
|
# ST1 { res_curr.16b }, [ output_ptr ], #16
|
|
#
|
|
# Decrypt:
|
|
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k13
|
|
# LDR res_curr, [ input_ptr ], #16
|
|
# EOR res_curr, res_curr, ctr_curr
|
|
# MOV output_low, res_curr.d[0]
|
|
# MOV output_high, res_curr.d[1]
|
|
# EOR output_low, k14_low
|
|
# EOR output_high, k14_high
|
|
# STP output_low, output_high, [ output_ptr ], #16
|
|
#
|
|
# GHASH block X:
|
|
# do 128b karatsuba polynomial multiplication on block
|
|
# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
|
|
#
|
|
# multiplication:
|
|
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
|
|
#
|
|
# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
|
|
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
|
|
#
|
|
# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
|
|
# multiplying with "twisted" powers of H
|
|
#
|
|
# Note: We can PMULL directly into the acc_x in first GHASH of the loop
|
|
# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
|
|
# path latency dominates the performance
|
|
#
|
|
# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
|
|
# than indicated here
|
|
# REV64 res_curr, res_curr
|
|
# INS t_m.d[0], res_curr.d[1]
|
|
# EOR t_m.8B, t_m.8B, res_curr.8B
|
|
# PMULL2 t_h, res_curr, HX
|
|
# PMULL t_l, res_curr, HX
|
|
# PMULL t_m, t_m, HX_k
|
|
# EOR acc_h, acc_h, t_h
|
|
# EOR acc_l, acc_l, t_l
|
|
# EOR acc_m, acc_m, t_m
|
|
#
|
|
# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
|
|
# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
|
|
# with a reversed constant
|
|
# EOR acc_m, acc_m, acc_h
|
|
# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
|
|
# PMULL t_mod, acc_h, mod_constant
|
|
# EXT acc_h, acc_h, acc_h, #8
|
|
# EOR acc_m, acc_m, acc_h
|
|
# EOR acc_m, acc_m, t_mod
|
|
# PMULL acc_h, acc_m, mod_constant
|
|
# EXT acc_m, acc_m, acc_m, #8
|
|
# EOR acc_l, acc_l, acc_h
|
|
# EOR acc_l, acc_l, acc_m
|
|
|
|
# Command line: [flavour] [output-file]
#   $output  is the last argument if it looks like a file name (it ends in
#            ".<ext>"); otherwise it is left undefined.
#   $flavour is the first remaining argument if it contains no "."; otherwise
#            it is left undefined.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator: first in the directory this script was
# invoked from, then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator,
# run under the same perl interpreter ($^X).  Abort immediately if the pipe
# cannot be opened rather than carrying on with an unusable output handle.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
|
|
|
|
# Argument block: the first four incoming arguments are used directly from
# x0..x3.
($input_ptr,$bit_length,$output_ptr,$current_tag)=map("x$_",(0..3));
# The kernel prologue copies x4 into x16 and x5 into x8; the generated code
# refers to those copies under these names.
($counter,$cc)=("x16","x8");
|
|
|
|
{
|
|
# Symbolic names for the registers used by the generated kernels.  Several
# names below intentionally map to the same physical register; each such
# overlap is called out so that edits to the assembly templates do not
# clobber a live value by accident.

# General-purpose registers x4..x7: input end pointers and the scalar
# low/high halves of block 0.
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
|

# x19..x24: scalar low/high halves of blocks 1..3.
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
|

# The output_* names reuse exactly the same registers as the input_* names
# (x19..x24 here, x6/x7 for block 0 below).
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
|

my ($output_l0,$output_h0)=map("x$_",(6..7));
|

|

# Counter handling and the last round key in scalar registers, x9..x15.
# $ctr32w (w9) carries the same register number as $ctr32x (x9).
my $ctr32w="w9";
|

my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15));
|

# w11/w12 carry the same register numbers as $ctr96_t32x (x11) and
# $rctr32x (x12).
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
|

|

# Vector registers v0..v3 hold the four counter blocks and v4..v7 the four
# result blocks; the same registers are named again as .16b, d and q views.
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
|

my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
|

my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
|

my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
|

|

# GHASH accumulators (high, mid, low) in v9..v11.
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
|

my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
|

my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
|

|

# Hash key powers h1..h4 in v12..v15, followed by $h12k (v16) and
# $h34k (v17).
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
|

my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
|

my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
|

|

# Temporaries.  $t0 lives in v8, which is also $t4 and $mod_constant below.
my $t0="v8";
|

my $t0d="d8";
|

|

# $t1..$t3 live in v28..v30, which are reused as $t5, $t8 and $t9 below.
my ($t1,$t2,$t3)=map("v$_",(28..30));
|

my ($t1d,$t2d,$t3d)=map("d$_",(28..30));
|

|

# $t4 shares v8 with $t0; $t5 shares v28 with $t1; $t6 is v31 (also $mod_t).
my $t4="v8";
|

my $t4d="d8";
|

my $t5="v28";
|

my $t5d="d28";
|

my $t6="v31";
|

my $t6d="d31";
|

|

# $t7 shares v4 with $res0/$ctr_t0; $t8 shares v29 with $t2; $t9 shares v30
# with $t3.
my $t7="v4";
|

my $t7d="d4";
|

my $t8="v29";
|

my $t8d="d29";
|

my $t9="v30";
|

my $t9d="d30";
|

|

# $ctr_t0..$ctr_t3 are further names for v4..v7, the same registers as
# $res0..$res3.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
|

my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
|

my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
|

|

# Reduction constant in v8 (shared with $t0/$t4) and its scratch register
# v31 (shared with $t6).
my $mod_constantd="d8";
|

my $mod_constant="v8";
|

my $mod_t="v31";
|

|

# Round keys rk0..rk9 in v18..v27, as .16b and q views.
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
|

my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
|

# Additional views of the rk2 (v20), rk3 (v21) and rk4 (v22) registers.
my $rk2q1="v20.1q";
|

my $rk3q1="v21.1q";
|

my $rk4v="v22";
|

my $rk4d="d22";
|
|
|
|
# Start the generated assembly text: include arm_arch.h and open an
# "#if __ARM_MAX_ARCH__>=8" guard.  The matching #endif is not in this part
# of the file -- presumably it is emitted at the end of the output.
$code=<<___;
|

#include "arm_arch.h"
|

|

#if __ARM_MAX_ARCH__>=8
|

___
|

# When the flavour string contains "64", select the ARMv8-A crypto extension
# and switch to the text section.
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
|

# For every other flavour, emit the NEON FPU directive, Thumb-2 or ARM mode
# selection, an INST(a,b,c,d) macro whose byte order differs between the two
# modes, and the text section directive.
# NOTE(review): $_byte is interpolated into the heredoc below but is not
# assigned anywhere in the visible part of this file -- confirm where it is
# expected to be set.
$code.=<<___ if ($flavour !~ /64/);
|

.fpu neon
|

#ifdef __thumb2__
|

.syntax unified
|

.thumb
|

# define INST(a,b,c,d) $_byte c,0xef,a,b
|

#else
|

.code 32
|

# define INST(a,b,c,d) $_byte a,b,c,0xf2
|

#endif
|

|

.text
|

___
|
|
|
|
#########################################################################################
|
|
# size_t aes_gcm_enc_128_kernel(const unsigned char *in,
|
|
# size_t len,
|
|
# unsigned char *out,
|
|
# u64 *Xi,
|

# unsigned char ivec[16],
|

# const void *key);
|
|
#
|
|
$code.=<<___;
|
|
.global aes_gcm_enc_128_kernel
|
|
.type aes_gcm_enc_128_kernel,%function
|
|
.align 4
|
|
aes_gcm_enc_128_kernel:
|
|
cbz x1, .L128_enc_ret
|
|
stp x19, x20, [sp, #-112]!
|
|
mov x16, x4
|
|
mov x8, x5
|
|
stp x21, x22, [sp, #16]
|
|
stp x23, x24, [sp, #32]
|
|
stp d8, d9, [sp, #48]
|
|
stp d10, d11, [sp, #64]
|
|
stp d12, d13, [sp, #80]
|
|
stp d14, d15, [sp, #96]
|
|
|
|
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
|
|
ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
|
|
|
|
ld1 {$acc_lb}, [$current_tag]
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
mov $len, $main_end_input_ptr
|
|
|
|
ldr $rk9q, [$cc, #144] @ load rk9
|
|
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
|
|
lsr $rctr32x, $ctr96_t32x, #32
|
|
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
ext $h4b, $h4b, $h4b, #8
|
|
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 1
|
|
rev $rctr32w, $rctr32w @ rev_ctr32
|
|
|
|
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
|
|
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
|
|
ldr $rk0q, [$cc, #0] @ load rk0
|
|
|
|
rev $ctr32w, $rctr32w @ CTR block 1
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 1
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 3
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
|
|
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 1
|
|
rev $ctr32w, $rctr32w @ CTR block 2
|
|
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 2
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 2
|
|
rev $ctr32w, $rctr32w @ CTR block 3
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
|
|
ldr $rk1q, [$cc, #16] @ load rk1
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 3
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 3
|
|
|
|
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
ext $h3b, $h3b, $h3b, #8
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
ldr $rk2q, [$cc, #32] @ load rk2
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
ext $h1b, $h1b, $h1b, #8
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
ldr $rk8q, [$cc, #128] @ load rk8
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
ldr $rk3q, [$cc, #48] @ load rk3
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
ldr $rk6q, [$cc, #96] @ load rk6
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
ldr $rk7q, [$cc, #112] @ load rk7
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
ldr $rk5q, [$cc, #80] @ load rk5
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
ext $h2b, $h2b, $h2b, #8
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
ldr $rk4q, [$cc, #64] @ load rk4
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
|
|
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
|
|
aese $ctr2b, $rk9 @ AES block 2 - round 9
|
|
|
|
aese $ctr0b, $rk9 @ AES block 0 - round 9
|
|
|
|
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
|
|
|
|
aese $ctr1b, $rk9 @ AES block 1 - round 9
|
|
|
|
aese $ctr3b, $rk9 @ AES block 3 - round 9
|
|
b.ge .L128_enc_tail @ handle tail
|
|
|
|
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
|
|
|
|
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
|
|
|
|
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
|
|
|
|
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
|
|
|
|
eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
|
|
eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
|
|
|
|
eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
|
|
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
|
|
|
|
eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
|
|
eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
|
|
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
|
|
|
|
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
|
|
eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
|
|
|
|
eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
|
|
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
|
|
|
|
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
|
|
eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
|
|
rev $ctr32w, $rctr32w @ CTR block 4
|
|
|
|
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
|
|
|
|
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4
|
|
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4
|
|
rev $ctr32w, $rctr32w @ CTR block 5
|
|
|
|
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 5
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 5
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 5
|
|
|
|
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
|
|
rev $ctr32w, $rctr32w @ CTR block 6
|
|
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
|
|
|
|
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 6
|
|
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
|
|
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 6
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 6
|
|
rev $ctr32w, $rctr32w @ CTR block 7
|
|
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
|
|
|
|
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
|
|
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
|
|
b.ge .L128_enc_prepretail @ do prepretail
|
|
|
|
.L128_enc_main_loop: @ main loop start
|
|
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
|
|
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
|
|
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+8
|
|
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
|
|
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
|
|
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
|
|
eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
|
|
eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
|
|
|
|
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
|
|
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
|
|
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+9
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
|
|
|
|
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+10
|
|
|
|
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
|
|
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
|
|
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
|
|
|
|
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
|
|
|
|
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
|
|
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+11
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
|
|
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
|
|
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
|
|
b.lt .L128_enc_main_loop
|
|
|
|
.L128_enc_prepretail: @ PREPRETAIL
|
|
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
|
|
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
|
|
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
|
|
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
|
|
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
eor $acc_mb, $acc_mb, $acc_lb
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
eor $acc_mb, $acc_mb, $t1.16b
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
eor $acc_mb, $acc_mb, $acc_hb
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
|
|
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
eor $acc_lb, $acc_lb, $t1.16b
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
|
|
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
|
|
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
|
|
|
|
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
|
|
eor $acc_lb, $acc_lb, $acc_mb
|
|
|
|
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
|
|
.L128_enc_tail: @ TAIL
|
|
|
|
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
|
|
|
|
cmp $main_end_input_ptr, #48
|
|
|
|
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
|
|
eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
|
|
|
|
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
|
|
|
|
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
|
|
|
|
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
|
|
|
|
b.gt .L128_enc_blocks_more_than_3
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
movi $acc_l.8b, #0
|
|
mov $ctr3b, $ctr2b
|
|
|
|
cmp $main_end_input_ptr, #32
|
|
mov $ctr2b, $ctr1b
|
|
movi $acc_h.8b, #0
|
|
|
|
movi $acc_m.8b, #0
|
|
b.gt .L128_enc_blocks_more_than_2
|
|
|
|
mov $ctr3b, $ctr1b
|
|
cmp $main_end_input_ptr, #16
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
b.gt .L128_enc_blocks_more_than_1
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
b .L128_enc_blocks_less_than_1
|
|
.L128_enc_blocks_more_than_3: @ blocks left > 3
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
|
|
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-3 block
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
|
|
eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
|
|
|
|
fmov $res1d, $input_l0 @ AES final-2 block - mov low
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
|
|
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
|
|
|
|
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
|
|
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
|
|
.L128_enc_blocks_more_than_2: @ blocks left > 2
|
|
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-2 block
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
|
|
|
|
fmov $res1d, $input_l0 @ AES final-1 block - mov low
|
|
eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
|
|
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
|
|
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
.L128_enc_blocks_more_than_1: @ blocks left > 1
|
|
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-1 block
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
|
|
eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
|
|
|
|
fmov $res1d, $input_l0 @ AES final block - mov low
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
fmov $res1.d[1], $input_h0 @ AES final block - mov high
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
|
|
eor $res1b, $res1b, $ctr3b @ AES final block - result
|
|
|
|
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
|
|
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
.L128_enc_blocks_less_than_1: @ blocks left <= 1
|
|
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
|
|
|
|
mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
|
|
sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
|
|
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
|
|
lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
|
|
cmp $bit_length, #64
|
|
|
|
csel $input_l0, $rk10_l, $rk10_h, lt
|
|
csel $input_h0, $rk10_h, xzr, lt
|
|
|
|
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
|
|
|
|
fmov $ctr0.d[1], $input_h0
|
|
|
|
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
|
|
rev64 $res0b, $res1b @ GHASH final block
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
mov $t0d, $res0.d[1] @ GHASH final block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
|
|
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
|
|
rev $ctr32w, $rctr32w
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
|
|
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
|
|
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
|
|
|
|
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
|
|
st1 { $res1b}, [$output_ptr] @ store all 16B
|
|
|
|
str $ctr32w, [$counter, #12] @ store the updated counter
|
|
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
mov x0, $len
|
|
st1 { $acc_l.16b }, [$current_tag]
|
|
ldp x21, x22, [sp, #16]
|
|
ldp x23, x24, [sp, #32]
|
|
ldp d8, d9, [sp, #48]
|
|
ldp d10, d11, [sp, #64]
|
|
ldp d12, d13, [sp, #80]
|
|
ldp d14, d15, [sp, #96]
|
|
ldp x19, x20, [sp], #112
|
|
ret
|
|
|
|
.L128_enc_ret:
|
|
mov w0, #0x0
|
|
ret
|
|
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
|
|
___
|
|
|
|
#########################################################################################
|
|
# size_t aes_gcm_dec_128_kernel(const unsigned char *in,
|
|
# size_t len,
|
|
# unsigned char *out,
|
|
# const void *key,
|
|
# unsigned char ivec[16],
|
|
# u64 *Xi);
|
|
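#
# NOTE(review): the code below computes byte_len as $bit_length >> 3, so the
# length argument appears to be a bit count rather than a byte count.  The
# prologue also copies x4 into x16 and x5 into x8 before the counter block is
# read through $counter and the round keys through $cc; if those names map to
# x16 and x8 (their definitions are outside this part of the file), the key
# schedule would be the 6th argument rather than the 4th as listed above.
# Confirm against the C declaration and the call site.
#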
|
|
$code.=<<___;
|
|
.global aes_gcm_dec_128_kernel
|
|
.type aes_gcm_dec_128_kernel,%function
|
|
.align 4
|
|
aes_gcm_dec_128_kernel:
|
|
cbz x1, .L128_dec_ret
|
|
stp x19, x20, [sp, #-112]!
|
|
mov x16, x4
|
|
mov x8, x5
|
|
stp x21, x22, [sp, #16]
|
|
stp x23, x24, [sp, #32]
|
|
stp d8, d9, [sp, #48]
|
|
stp d10, d11, [sp, #64]
|
|
stp d12, d13, [sp, #80]
|
|
stp d14, d15, [sp, #96]
|
|
|
|
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
mov $len, $main_end_input_ptr
|
|
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
|
|
|
|
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
ldr $rk0q, [$cc, #0] @ load rk0
|
|
|
|
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
|
|
|
|
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
ext $h2b, $h2b, $h2b, #8
|
|
|
|
lsr $rctr32x, $ctr96_t32x, #32
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 2
|
|
|
|
ldr $rk1q, [$cc, #16] @ load rk1
|
|
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
|
|
rev $rctr32w, $rctr32w @ rev_ctr32
|
|
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 1
|
|
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
rev $ctr32w, $rctr32w @ CTR block 1
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
|
|
ldr $rk2q, [$cc, #32] @ load rk2
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 1
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 1
|
|
rev $ctr32w, $rctr32w @ CTR block 2
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 2
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 2
|
|
rev $ctr32w, $rctr32w @ CTR block 3
|
|
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 3
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 3
|
|
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 3
|
|
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
ldr $rk3q, [$cc, #48] @ load rk3
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
ldr $rk6q, [$cc, #96] @ load rk6
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
ldr $rk7q, [$cc, #112] @ load rk7
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
ldr $rk4q, [$cc, #64] @ load rk4
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
ld1 { $acc_lb}, [$current_tag]
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
ldr $rk5q, [$cc, #80] @ load rk5
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
ldr $rk9q, [$cc, #144] @ load rk9
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
ext $h3b, $h3b, $h3b, #8
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
ldr $rk8q, [$cc, #128] @ load rk8
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
ext $h1b, $h1b, $h1b, #8
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
|
|
|
|
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
ext $h4b, $h4b, $h4b, #8
|
|
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
|
|
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
|
|
|
|
aese $ctr2b, $rk9 @ AES block 2 - round 9
|
|
|
|
aese $ctr3b, $rk9 @ AES block 3 - round 9
|
|
|
|
aese $ctr0b, $rk9 @ AES block 0 - round 9
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
|
|
|
|
aese $ctr1b, $rk9 @ AES block 1 - round 9
|
|
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
|
|
b.ge .L128_dec_tail @ handle tail
|
|
|
|
ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
|
|
|
|
ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
|
|
|
|
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
|
|
ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
|
|
|
|
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
|
|
rev64 $res0b, $res0b @ GHASH block 0
|
|
rev $ctr32w, $rctr32w @ CTR block 4
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4
|
|
ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
|
|
|
|
rev64 $res1b, $res1b @ GHASH block 1
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
|
|
|
|
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
|
|
|
|
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
|
|
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4
|
|
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4
|
|
rev $ctr32w, $rctr32w @ CTR block 5
|
|
eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
|
|
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 5
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 5
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 5
|
|
rev $ctr32w, $rctr32w @ CTR block 6
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 6
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
|
|
|
|
eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
|
|
eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
|
|
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
|
|
|
|
eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
|
|
|
|
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
|
|
b.ge .L128_dec_prepretail @ do prepretail
|
|
|
|
.L128_dec_main_loop: @ main loop start - each pass loads 64 bytes (four 16B blocks) of ciphertext
|
|
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
|
|
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+7
|
|
|
|
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+8
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
|
|
rev64 $res1b, $res1b @ GHASH block 4k+5
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
|
|
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+9
|
|
|
|
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
|
|
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
|
|
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
|
|
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
|
|
|
|
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
|
|
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
|
|
|
|
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
|
|
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
|
|
rev64 $res0b, $res0b @ GHASH block 4k+4
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
|
|
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+10
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
|
|
|
|
eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
|
|
|
|
eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
|
|
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
|
|
b.lt .L128_dec_main_loop @ loop while input_ptr < main_end_input_ptr (flags from LOOP CONTROL cmp above)
|
|
|
|
.L128_dec_prepretail: @ PREPRETAIL
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
|
|
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
|
|
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+7
|
|
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
|
|
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
|
|
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
|
|
|
|
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
|
|
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
|
|
|
|
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
|
|
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
|
|
|
|
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
.L128_dec_tail: @ TAIL
|
|
|
|
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
|
|
|
|
eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
|
|
|
|
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
|
|
|
|
cmp $main_end_input_ptr, #48
|
|
|
|
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
|
|
|
|
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
|
|
b.gt .L128_dec_blocks_more_than_3
|
|
|
|
mov $ctr3b, $ctr2b
|
|
sub $rctr32w, $rctr32w, #1
|
|
movi $acc_l.8b, #0
|
|
|
|
movi $acc_h.8b, #0
|
|
mov $ctr2b, $ctr1b
|
|
|
|
movi $acc_m.8b, #0
|
|
cmp $main_end_input_ptr, #32
|
|
b.gt .L128_dec_blocks_more_than_2
|
|
|
|
cmp $main_end_input_ptr, #16
|
|
|
|
mov $ctr3b, $ctr1b
|
|
sub $rctr32w, $rctr32w, #1
|
|
b.gt .L128_dec_blocks_more_than_1
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
b .L128_dec_blocks_less_than_1
|
|
.L128_dec_blocks_more_than_3: @ blocks left > 3
|
|
rev64 $res0b, $res1b @ GHASH final-3 block
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
|
|
eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
|
|
mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
|
|
|
|
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
|
|
eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
|
|
.L128_dec_blocks_more_than_2: @ blocks left > 2
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-2 block
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
|
|
eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
|
|
.L128_dec_blocks_more_than_1: @ blocks left > 1
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-1 block
|
|
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
|
|
|
|
eor $ctr0b, $res1b, $ctr3b @ AES final block - result
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
|
|
mov $output_l0, $ctr0.d[0] @ AES final block - mov low
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES final block - mov high
|
|
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
|
|
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
|
|
|
|
eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
.L128_dec_blocks_less_than_1: @ blocks left <= 1
|
|
|
|
mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
|
|
mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
|
|
sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
|
|
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
|
|
lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
|
|
cmp $bit_length, #64
|
|
|
|
csel $ctr96_b64x, $rk10_h, xzr, lt
|
|
csel $ctr32x, $rk10_l, $rk10_h, lt
|
|
|
|
fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
|
|
|
|
mov $ctr0.d[1], $ctr96_b64x
|
|
|
|
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
|
|
rev64 $res0b, $res1b @ GHASH final block
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
|
|
|
|
and $output_h0, $output_h0, $ctr96_b64x
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
mov $t0d, $res0.d[1] @ GHASH final block - mid
|
|
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
|
|
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
|
|
and $output_l0, $output_l0, $ctr32x
|
|
|
|
rev $ctr32w, $rctr32w
|
|
|
|
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
|
|
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
orr $output_l0, $output_l0, $end_input_ptr
|
|
str $ctr32w, [$counter, #12] @ store the updated counter
|
|
|
|
orr $output_h0, $output_h0, $main_end_input_ptr
|
|
stp $output_l0, $output_h0, [$output_ptr]
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
|
|
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
mov x0, $len
|
|
st1 { $acc_l.16b }, [$current_tag]
|
|
|
|
ldp x21, x22, [sp, #16]
|
|
ldp x23, x24, [sp, #32]
|
|
ldp d8, d9, [sp, #48]
|
|
ldp d10, d11, [sp, #64]
|
|
ldp d12, d13, [sp, #80]
|
|
ldp d14, d15, [sp, #96]
|
|
ldp x19, x20, [sp], #112
|
|
ret
|
|
|
|
.L128_dec_ret:
|
|
mov w0, #0x0
|
|
ret
|
|
.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
|
|
___
|
|
}
|
|
|
|
{
|
|
# General-purpose register aliases for the aes_gcm_enc_192_kernel template
# that follows.  Several names map to the same physical register.
#
# x4..x7: end_input_ptr=x4, main_end_input_ptr=x5, input_l0=x6, input_h0=x7.
# The kernel prologue copies x4 and x5 to x16 and x8 before these aliases
# are written.
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
|
|
# x19..x24: low/high 64-bit halves of input blocks 1-3.  The kernel prologue
# pushes x19-x24 on the stack before they are used.
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
|
|
# Same x19..x24 registers again under output_* names (aliases of
# input_l1..input_h3, not additional registers).
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
|
|
# x6/x7 again: output_l0/output_h0 alias input_l0/input_h0.
my ($output_l0,$output_h0)=map("x$_",(6..7));
|
|
|
|
# w9 is the 32-bit view of x9 ($ctr32x, declared just below).  The template
# fills it with "rev $ctr32w, $rctr32w" when building each counter block.
my $ctr32w="w9";
|
|
# x9..x15 in order: ctr32x=x9, ctr96_b64x=x10, ctr96_t32x=x11, rctr32x=x12,
# rk12_l=x13, rk12_h=x14, len=x15.
#  - ctr96_b64x/ctr96_t32x are loaded as a pair from [$counter].
#  - rk12_l/rk12_h are loaded as a pair from [$cc, #192] (round key 12) and
#    XORed into the plaintext halves in general-purpose registers.
#  - len receives the byte length (bit length shifted right by 3).
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
|
|
# 32-bit views of x11 ($ctr96_t32x) and x12 ($rctr32x).  $rctr32w is the
# byte-reversed counter word that the template increments with "add".
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
|
|
|
|
# NEON register aliases for the aes_gcm_enc_192_kernel template.
# v0-v3 hold the four AES counter blocks (ctr0..ctr3); v4-v7 hold the four
# result blocks (res0..res3) that are stored and then fed to GHASH.
# The *b, bare, *d and *q names below are only different operand spellings
# (v<n>.16b, v<n>, d<n>, q<n>) of those same registers.
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
|
|
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
|
|
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
|
|
# q views exist only for the result registers: res0q..res3q = q4..q7.
my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
|
|
|
|
# GHASH accumulator, split into high/mid/low parts: acc_h=v9, acc_m=v10,
# acc_l=v11 (with .16b and d<n> spellings).  The kernel prologue saves
# d8-d15, which covers the low halves of these registers.
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
|
|
my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
|
|
my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
|
|
|
|
# Hash-key registers: h1..h4 = v12..v15, h12k = v16, h34k = v17.
# h1..h4 are loaded from fixed offsets of $current_tag; h12k and h34k are
# derived from them in the template with trn1/trn2/eor (the inline comments
# there label them "h2k | h1k" and "h4k | h3k").
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
|
|
# q and .16b spellings are provided for h1..h4 only (q12..q15, v12..v15).
my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
|
|
my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
|
|
|
|
# Scratch-register aliases for the GHASH part of the template.  Many of these
# names share a physical register, so writing one clobbers the others:
#   v8  : t0, t5, mod_constant
#   v30 : t1, t4, t9
#   v31 : t2, t6, mod_t
#   v4  : t3   (also res0 / ctr_t0)
#   v5  : t7   (also res1 / ctr_t1)
#   v6  : t8   (also res2 / ctr_t2)
my $t0="v8";
|
|
my $t0d="d8";
|
|
# t3 lives in v4, the register that also carries res0.
my $t3="v4";
|
|
my $t3d="d4";
|
|
|
|
|
# t1=v30, t2=v31 (and their d<n> spellings).
my ($t1,$t2)=map("v$_",(30..31));
|
|
my ($t1d,$t2d)=map("d$_",(30..31));
|
|
|
|
# t4 reuses v30 (same register as t1).
my $t4="v30";
|
|
my $t4d="d30";
|
|
# t5 reuses v8 (same register as t0).
my $t5="v8";
|
|
my $t5d="d8";
|
|
# t6 reuses v31 (same register as t2).
my $t6="v31";
|
|
my $t6d="d31";
|
|
|
|
# t7 and t8 live in v5 and v6, the registers that also carry res1 and res2.
my $t7="v5";
|
|
my $t7d="d5";
|
|
my $t8="v6";
|
|
my $t8d="d6";
|
|
# t9 reuses v30 (same register as t1 and t4).
my $t9="v30";
|
|
my $t9d="d30";
|
|
|
|
# ctr_t0..ctr_t3 = v4..v7, i.e. the same registers as res0..res3.  The
# template moves the plaintext (already XORed with rk12 in general-purpose
# registers) into them with fmov, then XORs with the counter-block output to
# form the result in place.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
|
|
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
|
|
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
|
|
|
|
# mod_constant (v8/d8, shared with t0 and t5) is built in the template with
# "movi ... #0xc2" followed by "shl ... #56" and used as the pmull multiplier
# in the MODULO steps.
my $mod_constantd="d8";
|
|
my $mod_constant="v8";
|
|
# mod_t (v31, shared with t2 and t6) receives the first MODULO pmull result.
my $mod_t="v31";
|
|
|
|
# AES round-key registers for the 192-bit kernel: rk0..rk11 = v18..v29 in the
# .16b spelling used as the aese operand.  (Round key 12 is not kept in a
# vector register; it is held in the general-purpose pair rk12_l/rk12_h.)
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
|
|
# q<n> spellings of the same registers, used by the "ldr" instructions that
# load each round key from [$cc, #16*N].
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
|
|
# Extra spellings of v20, v21 and v22, i.e. the registers that hold rk2, rk3
# and rk4.  NOTE(review): their use in this kernel lies outside the visible
# part of the template; presumably they serve as GHASH scratch operands in
# the tail once those round keys are no longer needed - confirm.
my $rk2q1="v20.1q";
|
|
my $rk3q1="v21.1q";
|
|
my $rk4v="v22";
|
|
my $rk4d="d22";
|
|
|
|
#########################################################################################
|
|
# size_t aes_gcm_enc_192_kernel(const unsigned char *in,
|
|
# size_t len,
|
|
# unsigned char *out,
|
|
# const void *key,
|
|
# unsigned char ivec[16],
|
|
# u64 *Xi);
|
|
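#
# NOTE(review): the body below shifts $bit_length right by 3 to obtain the
# byte count, and moves x4/x5 into x16/x8 before using them - confirm that
# the unit of len and the order of the last three parameters in the
# prototype above match the C declaration.
#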
|
|
$code.=<<___;
|
|
.global aes_gcm_enc_192_kernel
|
|
.type aes_gcm_enc_192_kernel,%function
|
|
.align 4
|
|
aes_gcm_enc_192_kernel:
|
|
cbz x1, .L192_enc_ret
|
|
stp x19, x20, [sp, #-112]!
|
|
mov x16, x4
|
|
mov x8, x5
|
|
stp x21, x22, [sp, #16]
|
|
stp x23, x24, [sp, #32]
|
|
stp d8, d9, [sp, #48]
|
|
stp d10, d11, [sp, #64]
|
|
stp d12, d13, [sp, #80]
|
|
stp d14, d15, [sp, #96]
|
|
|
|
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
|
|
|
|
ldr $rk5q, [$cc, #80] @ load rk5
|
|
|
|
ldr $rk4q, [$cc, #64] @ load rk4
|
|
|
|
ldr $rk8q, [$cc, #128] @ load rk8
|
|
|
|
lsr $rctr32x, $ctr96_t32x, #32
|
|
ldr $rk6q, [$cc, #96] @ load rk6
|
|
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
|
|
|
|
ldr $rk7q, [$cc, #112] @ load rk7
|
|
rev $rctr32w, $rctr32w @ rev_ctr32
|
|
|
|
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 3
|
|
|
|
rev $ctr32w, $rctr32w @ CTR block 1
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 1
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 1
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
|
|
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 1
|
|
rev $ctr32w, $rctr32w @ CTR block 2
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 2
|
|
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 2
|
|
rev $ctr32w, $rctr32w @ CTR block 3
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
|
|
ldr $rk0q, [$cc, #0] @ load rk0
|
|
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 3
|
|
|
|
ldr $rk3q, [$cc, #48] @ load rk3
|
|
|
|
ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
|
|
|
|
ldr $rk1q, [$cc, #16] @ load rk1
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
ld1 { $acc_lb}, [$current_tag]
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
ldr $rk11q, [$cc, #176] @ load rk11
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
ext $h4b, $h4b, $h4b, #8
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
ldr $rk2q, [$cc, #32] @ load rk2
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
ldr $rk10q, [$cc, #160] @ load rk10
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
ext $h1b, $h1b, $h1b, #8
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
ldr $rk9q, [$cc, #144] @ load rk9
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
ext $h3b, $h3b, $h3b, #8
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
ext $h2b, $h2b, $h2b, #8
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
|
|
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
mov $len, $main_end_input_ptr
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
|
|
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
|
|
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
|
|
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
|
|
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
|
|
|
|
aese $ctr2b, $rk11 @ AES block 2 - round 11
|
|
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
|
|
aese $ctr1b, $rk11 @ AES block 1 - round 11
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
|
|
|
|
aese $ctr0b, $rk11 @ AES block 0 - round 11
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 3
|
|
|
|
aese $ctr3b, $rk11 @ AES block 3 - round 11
|
|
b.ge .L192_enc_tail @ handle tail
|
|
|
|
rev $ctr32w, $rctr32w @ CTR block 4
|
|
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
|
|
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
|
|
|
|
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
|
|
|
|
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
|
|
eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
|
|
|
|
eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
|
|
eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
|
|
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
|
|
|
|
eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
|
|
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
|
|
|
|
eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
|
|
eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
|
|
|
|
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
|
|
eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
|
|
|
|
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
|
|
|
|
eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
|
|
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4
|
|
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4
|
|
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4
|
|
rev $ctr32w, $rctr32w @ CTR block 5
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 5
|
|
|
|
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
|
|
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
|
|
|
|
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
|
|
|
|
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 5
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
|
|
|
|
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 5
|
|
rev $ctr32w, $rctr32w @ CTR block 6
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 6
|
|
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 6
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 6
|
|
rev $ctr32w, $rctr32w @ CTR block 7
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
|
|
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
|
|
|
|
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
|
|
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
|
|
b.ge .L192_enc_prepretail @ do prepretail
|
|
|
|
.L192_enc_main_loop: @ main loop start
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
|
|
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
|
|
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
|
|
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
|
|
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+8
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
|
|
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
|
|
|
|
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
|
|
|
|
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
|
|
|
|
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+9
|
|
|
|
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
|
|
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
|
|
|
|
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
|
|
|
|
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+10
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
|
|
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
|
|
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
|
|
|
|
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
|
|
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
|
|
|
|
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+11
|
|
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
|
|
|
|
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
|
|
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
|
|
b.lt .L192_enc_main_loop
|
|
|
|
.L192_enc_prepretail: @ PREPRETAIL
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
|
|
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
eor $acc_mb, $acc_mb, $acc_lb
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
|
|
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
eor $acc_mb, $acc_mb, $t1.16b
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
eor $acc_mb, $acc_mb, $acc_hb
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
|
|
|
|
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
|
|
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
|
|
eor $acc_lb, $acc_lb, $t1.16b
|
|
|
|
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
|
|
|
|
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
|
|
|
|
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
|
|
|
|
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
|
|
eor $acc_lb, $acc_lb, $acc_mb
|
|
.L192_enc_tail: @ TAIL
|
|
|
|
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
|
|
|
|
eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
|
|
eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
|
|
|
|
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
|
|
|
|
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
|
|
cmp $main_end_input_ptr, #48
|
|
|
|
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
|
|
|
|
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
b.gt .L192_enc_blocks_more_than_3
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
movi $acc_m.8b, #0
|
|
|
|
mov $ctr3b, $ctr2b
|
|
movi $acc_h.8b, #0
|
|
cmp $main_end_input_ptr, #32
|
|
|
|
mov $ctr2b, $ctr1b
|
|
movi $acc_l.8b, #0
|
|
b.gt .L192_enc_blocks_more_than_2
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
|
|
mov $ctr3b, $ctr1b
|
|
cmp $main_end_input_ptr, #16
|
|
b.gt .L192_enc_blocks_more_than_1
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
b .L192_enc_blocks_less_than_1
|
|
.L192_enc_blocks_more_than_3: @ blocks left > 3
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
|
|
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-3 block
|
|
|
|
eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
|
|
fmov $res1d, $input_l0 @ AES final-2 block - mov low
|
|
|
|
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
|
|
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
|
|
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
|
|
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
|
|
.L192_enc_blocks_more_than_2: @ blocks left > 2
|
|
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-2 block
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
|
|
|
|
fmov $res1d, $input_l0 @ AES final-1 block - mov low
|
|
|
|
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
|
|
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
|
|
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
.L192_enc_blocks_more_than_1: @ blocks left > 1
|
|
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
|
|
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-1 block
|
|
|
|
eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
|
|
fmov $res1d, $input_l0 @ AES final block - mov low
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
fmov $res1.d[1], $input_h0 @ AES final block - mov high
|
|
|
|
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
|
|
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
|
|
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
|
|
eor $res1b, $res1b, $ctr3b @ AES final block - result
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
.L192_enc_blocks_less_than_1: @ blocks left <= 1
|
|
|
|
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
|
|
rev $ctr32w, $rctr32w
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
|
|
sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
|
|
|
|
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
|
|
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
|
|
lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
|
|
cmp $bit_length, #64
|
|
|
|
csel $input_l0, $rk12_l, $rk12_h, lt
|
|
csel $input_h0, $rk12_h, xzr, lt
|
|
|
|
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
|
|
|
|
fmov $ctr0.d[1], $input_h0
|
|
|
|
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
|
|
rev64 $res0b, $res1b @ GHASH final block
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
mov $t0d, $res0.d[1] @ GHASH final block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
|
|
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
|
|
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
|
|
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
|
|
str $ctr32w, [$counter, #12] @ store the updated counter
|
|
|
|
st1 { $res1b}, [$output_ptr] @ store all 16B
|
|
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
mov x0, $len
|
|
st1 { $acc_l.16b }, [$current_tag]
|
|
|
|
ldp x21, x22, [sp, #16]
|
|
ldp x23, x24, [sp, #32]
|
|
ldp d8, d9, [sp, #48]
|
|
ldp d10, d11, [sp, #64]
|
|
ldp d12, d13, [sp, #80]
|
|
ldp d14, d15, [sp, #96]
|
|
ldp x19, x20, [sp], #112
|
|
ret
|
|
|
|
.L192_enc_ret:
|
|
mov w0, #0x0
|
|
ret
|
|
.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
|
|
___
|
|
|
|
#########################################################################################
|
|
# size_t aes_gcm_dec_192_kernel(const unsigned char *in,
|
|
# size_t len,
|
|
# unsigned char *out,
|
|
# const void *key,
|
|
# unsigned char ivec[16],
|
|
# u64 *Xi);
|
|
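#
# NOTE(review): the kernel body derives its byte count by shifting the length
# value right by 3 (see the "byte_len" computation below), so `len` is
# presumably a bit count rather than a byte count -- confirm against callers.
# A zero x1 (the second argument) branches to .L192_dec_ret before any
# callee-saved register is pushed on the stack.
#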
|
|
$code.=<<___;
|
|
.global aes_gcm_dec_192_kernel
|
|
.type aes_gcm_dec_192_kernel,%function
|
|
.align 4
|
|
aes_gcm_dec_192_kernel:
|
|
cbz x1, .L192_dec_ret
|
|
stp x19, x20, [sp, #-112]!
|
|
mov x16, x4
|
|
mov x8, x5
|
|
stp x21, x22, [sp, #16]
|
|
stp x23, x24, [sp, #32]
|
|
stp d8, d9, [sp, #48]
|
|
stp d10, d11, [sp, #64]
|
|
stp d12, d13, [sp, #80]
|
|
stp d14, d15, [sp, #96]
|
|
|
|
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
|
|
|
|
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
|
|
|
|
ldr $rk0q, [$cc, #0] @ load rk0
|
|
|
|
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
mov $len, $main_end_input_ptr
|
|
ldr $rk2q, [$cc, #32] @ load rk2
|
|
|
|
lsr $rctr32x, $ctr96_t32x, #32
|
|
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 3
|
|
|
|
rev $rctr32w, $rctr32w @ rev_ctr32
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 1
|
|
|
|
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
|
|
ldr $rk1q, [$cc, #16] @ load rk1
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
rev $ctr32w, $rctr32w @ CTR block 1
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 1
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
|
|
ldr $rk3q, [$cc, #48] @ load rk3
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 1
|
|
rev $ctr32w, $rctr32w @ CTR block 2
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 2
|
|
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 2
|
|
rev $ctr32w, $rctr32w @ CTR block 3
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
|
|
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 3
|
|
|
|
ldr $rk8q, [$cc, #128] @ load rk8
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
ldr $rk11q, [$cc, #176] @ load rk11
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
ext $h4b, $h4b, $h4b, #8
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
ext $h2b, $h2b, $h2b, #8
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
ext $h3b, $h3b, $h3b, #8
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
ext $h1b, $h1b, $h1b, #8
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
ldr $rk10q, [$cc, #160] @ load rk10
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
ldr $rk9q, [$cc, #144] @ load rk9
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
ldr $rk7q, [$cc, #112] @ load rk7
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
ldr $rk4q, [$cc, #64] @ load rk4
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
ld1 { $acc_lb}, [$current_tag]
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 3
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
ldr $rk5q, [$cc, #80] @ load rk5
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
ldr $rk6q, [$cc, #96] @ load rk6
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
|
|
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
|
|
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
|
|
|
|
aese $ctr3b, $rk11 @ AES block 3 - round 11
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
|
|
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
|
|
|
|
aese $ctr2b, $rk11 @ AES block 2 - round 11
|
|
|
|
aese $ctr1b, $rk11 @ AES block 1 - round 11
|
|
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
|
|
|
|
aese $ctr0b, $rk11 @ AES block 0 - round 11
|
|
b.ge .L192_dec_tail @ handle tail
|
|
|
|
ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
|
|
|
|
ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
|
|
|
|
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
|
|
|
|
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
|
|
rev $ctr32w, $rctr32w @ CTR block 4
|
|
ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
|
|
|
|
ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
|
|
|
|
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
|
|
|
|
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
|
|
|
|
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
|
|
rev64 $res0b, $res0b @ GHASH block 0
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4
|
|
rev64 $res1b, $res1b @ GHASH block 1
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
|
|
eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4
|
|
rev $ctr32w, $rctr32w @ CTR block 5
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 5
|
|
eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 5
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 5
|
|
eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
|
|
|
|
rev $ctr32w, $rctr32w @ CTR block 6
|
|
eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
|
|
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
|
|
|
|
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 6
|
|
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
|
|
b.ge .L192_dec_prepretail @ do prepretail
|
|
|
|
.L192_dec_main_loop: @ main loop start
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
|
|
|
|
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
|
|
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+7
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
|
|
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
|
|
eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
|
|
|
|
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
|
|
ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+8
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
|
|
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
|
|
eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
|
|
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
|
|
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
|
|
|
|
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
|
|
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
|
|
rev64 $res1b, $res1b @ GHASH block 4k+5
|
|
|
|
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
|
|
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
|
|
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
|
|
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+9
|
|
|
|
eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
|
|
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
|
|
eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+10
|
|
eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
|
|
|
|
eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
|
|
rev64 $res0b, $res0b @ GHASH block 4k+4
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
|
|
|
|
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
|
|
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
|
|
b.lt .L192_dec_main_loop
|
|
|
|
.L192_dec_prepretail: @ PREPRETAIL
|
|
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+7
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
|
|
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
|
|
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3
|
|
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
|
|
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
|
|
|
|
aese $ctr0b, $rk11
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
|
|
|
|
aese $ctr2b, $rk11
|
|
|
|
aese $ctr1b, $rk11
|
|
|
|
aese $ctr3b, $rk11
|
|
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
.L192_dec_tail: @ TAIL
|
|
|
|
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
|
|
|
|
eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
|
|
|
|
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
|
|
|
|
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
|
|
cmp $main_end_input_ptr, #48
|
|
|
|
eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
|
|
|
|
eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
|
|
b.gt .L192_dec_blocks_more_than_3
|
|
|
|
movi $acc_l.8b, #0
|
|
movi $acc_h.8b, #0
|
|
|
|
mov $ctr3b, $ctr2b
|
|
mov $ctr2b, $ctr1b
|
|
sub $rctr32w, $rctr32w, #1
|
|
|
|
movi $acc_m.8b, #0
|
|
cmp $main_end_input_ptr, #32
|
|
b.gt .L192_dec_blocks_more_than_2
|
|
|
|
mov $ctr3b, $ctr1b
|
|
cmp $main_end_input_ptr, #16
|
|
sub $rctr32w, $rctr32w, #1
|
|
|
|
b.gt .L192_dec_blocks_more_than_1
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
b .L192_dec_blocks_less_than_1
|
|
.L192_dec_blocks_more_than_3: @ blocks left > 3
|
|
rev64 $res0b, $res1b @ GHASH final-3 block
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
|
|
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
|
|
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
|
|
|
|
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
|
|
eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
|
|
eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
|
|
.L192_dec_blocks_more_than_2: @ blocks left > 2
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-2 block
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
|
|
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
|
|
|
|
eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
.L192_dec_blocks_more_than_1: @ blocks left > 1
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-1 block
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
|
|
eor $ctr0b, $res1b, $ctr3b @ AES final block - result
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
|
|
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
mov $output_h0, $ctr0.d[1] @ AES final block - mov high
|
|
|
|
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
mov $output_l0, $ctr0.d[0] @ AES final block - mov low
|
|
|
|
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
|
|
|
|
eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
|
|
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
.L192_dec_blocks_less_than_1: @ blocks left <= 1
|
|
|
|
mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
|
|
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
|
|
sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
|
|
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
|
|
|
|
lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
|
|
cmp $bit_length, #64
|
|
|
|
csel $ctr32x, $rk12_l, $rk12_h, lt
|
|
csel $ctr96_b64x, $rk12_h, xzr, lt
|
|
|
|
fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
|
|
and $output_l0, $output_l0, $ctr32x
|
|
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
|
|
|
|
orr $output_l0, $output_l0, $end_input_ptr
|
|
mov $ctr0.d[1], $ctr96_b64x
|
|
|
|
rev $ctr32w, $rctr32w
|
|
|
|
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
str $ctr32w, [$counter, #12] @ store the updated counter
|
|
|
|
rev64 $res0b, $res1b @ GHASH final block
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
|
|
|
|
and $output_h0, $output_h0, $ctr96_b64x
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
mov $t0d, $res0.d[1] @ GHASH final block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
|
|
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
|
|
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
orr $output_h0, $output_h0, $main_end_input_ptr
|
|
stp $output_l0, $output_h0, [$output_ptr]
|
|
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
|
|
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
mov x0, $len
|
|
st1 { $acc_l.16b }, [$current_tag]
|
|
|
|
ldp x21, x22, [sp, #16]
|
|
ldp x23, x24, [sp, #32]
|
|
ldp d8, d9, [sp, #48]
|
|
ldp d10, d11, [sp, #64]
|
|
ldp d12, d13, [sp, #80]
|
|
ldp d14, d15, [sp, #96]
|
|
ldp x19, x20, [sp], #112
|
|
ret
|
|
|
|
.L192_dec_ret:
|
|
mov w0, #0x0
|
|
ret
|
|
.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
|
|
___
|
|
}
|
|
|
|
{
|
|
# Register names used by the aes_gcm_enc_256_kernel template that follows.
# Every value is just the textual name of an AArch64 register; many names
# intentionally resolve to the same register, so two aliases of one register
# must never be live at the same time in the generated code.

# General-purpose registers: input pointers and plaintext words.
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=qw(x4 x5 x6 x7);
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=qw(x19 x20 x21 x22 x23 x24);
# The output_* names reuse exactly the registers of the matching input_* names.
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=qw(x19 x20 x21 x22 x23 x24);
my ($output_l0,$output_h0)=qw(x6 x7);

# Counter handling and the two 64-bit halves of the last round key.
# w9/w11/w12 are the 32-bit views of x9/x11/x12.
my $ctr32w="w9";
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=qw(x9 x10 x11 x12 x13 x14 x15);
my ($ctr96_t32w,$rctr32w)=qw(w11 w12);

# v0-v3 hold the counter blocks and v4-v7 the results, each exposed under
# the .16b, bare vector, d and (results only) q spellings.
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map "v$_.16b", 0..7;
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map "v$_", 0..7;
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map "d$_", 0..7;
my ($res0q,$res1q,$res2q,$res3q)=map "q$_", 4..7;

# GHASH accumulators (high, mid, low) in v9-v11.
my ($acc_hb,$acc_mb,$acc_lb)=map "v$_.16b", 9..11;
my ($acc_h,$acc_m,$acc_l)=map "v$_", 9..11;
my ($acc_hd,$acc_md,$acc_ld)=map "d$_", 9..11;

# Hash-key powers h1..h4 and the combined h12k/h34k values in v12-v17.
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map "v$_", 12..17;
my ($h1q,$h2q,$h3q,$h4q)=map "q$_", 12..15;
my ($h1b,$h2b,$h3b,$h4b)=map "v$_.16b", 12..15;

# Temporaries t0..t9 are folded onto only four registers (v4, v5, v6, v8),
# each given as a (vector name, d name) pair.
my ($t0,$t0d)=("v8","d8");
my ($t1,$t1d)=("v4","d4");
my ($t2,$t2d)=("v8","d8");
my ($t3,$t3d)=("v4","d4");
my ($t4,$t4d)=("v4","d4");
my ($t5,$t5d)=("v5","d5");
my ($t6,$t6d)=("v8","d8");
my ($t7,$t7d)=("v5","d5");
my ($t8,$t8d)=("v6","d6");
my ($t9,$t9d)=("v4","d4");

# ctr_t0..ctr_t3 are further names for v4-v7 (the same registers as res0..res3).
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map "v$_", 4..7;
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map "d$_", 4..7;
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map "v$_.16b", 4..7;

# Reduction constant shares v8 with t0/t2/t6; mod_t shares v7 with res3.
my ($mod_constantd,$mod_constant)=("d8","v8");
my $mod_t="v7";

# Round keys rk0..rk13 occupy v18-v31.
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map "v$_.16b", 18..31;
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map "q$_", 18..31;
# Extra spellings of v20/v21/v22 (the rk2/rk3/rk4 registers).
my ($rk2q1,$rk3q1)=("v20.1q","v21.1q");
my ($rk4v,$rk4d)=("v22","d22");
|
|
|
|
#########################################################################################
|
|
# size_t aes_gcm_enc_256_kernel(const unsigned char *in,
|
|
# size_t len,
|
|
# unsigned char *out,
|
|
# const void *key,
|
|
# unsigned char ivec[16],
|
|
# u64 *Xi);
|
|
#
|
|
$code.=<<___;
|
|
.global aes_gcm_enc_256_kernel
|
|
.type aes_gcm_enc_256_kernel,%function
|
|
.align 4
|
|
aes_gcm_enc_256_kernel:
|
|
cbz x1, .L256_enc_ret
|
|
stp x19, x20, [sp, #-112]!
|
|
mov x16, x4
|
|
mov x8, x5
|
|
stp x21, x22, [sp, #16]
|
|
stp x23, x24, [sp, #32]
|
|
stp d8, d9, [sp, #48]
|
|
stp d10, d11, [sp, #64]
|
|
stp d12, d13, [sp, #80]
|
|
stp d14, d15, [sp, #96]
|
|
|
|
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
mov $len, $main_end_input_ptr
|
|
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
|
|
|
|
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
|
|
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
|
|
ldr $rk0q, [$cc, #0] @ load rk0
|
|
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
|
|
ldr $rk7q, [$cc, #112] @ load rk7
|
|
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
|
|
lsr $rctr32x, $ctr96_t32x, #32
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 2
|
|
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
|
|
|
|
rev $rctr32w, $rctr32w @ rev_ctr32
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 1
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
|
|
|
|
rev $ctr32w, $rctr32w @ CTR block 1
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 3
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 1
|
|
ldr $rk1q, [$cc, #16] @ load rk1
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 1
|
|
rev $ctr32w, $rctr32w @ CTR block 2
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 2
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
|
|
ldr $rk2q, [$cc, #32] @ load rk2
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 2
|
|
rev $ctr32w, $rctr32w @ CTR block 3
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
|
|
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 3
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
ldr $rk3q, [$cc, #48] @ load rk3
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
ldr $rk6q, [$cc, #96] @ load rk6
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
ldr $rk5q, [$cc, #80] @ load rk5
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
ext $h3b, $h3b, $h3b, #8
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
ldr $rk13q, [$cc, #208] @ load rk13
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
ldr $rk4q, [$cc, #64] @ load rk4
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
ext $h2b, $h2b, $h2b, #8
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
ldr $rk12q, [$cc, #192] @ load rk12
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
ext $h4b, $h4b, $h4b, #8
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
ldr $rk11q, [$cc, #176] @ load rk11
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
ldr $rk8q, [$cc, #128] @ load rk8
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 3
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
ld1 { $acc_lb}, [$current_tag]
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
ldr $rk9q, [$cc, #144] @ load rk9
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
ext $h1b, $h1b, $h1b, #8
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
ldr $rk10q, [$cc, #160] @ load rk10
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
|
|
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
|
|
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
|
|
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
|
|
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
|
|
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
|
|
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
|
|
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
|
|
|
|
aese $ctr2b, $rk13 @ AES block 2 - round 13
|
|
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
|
|
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
|
|
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
|
|
|
|
aese $ctr1b, $rk13 @ AES block 1 - round 13
|
|
|
|
aese $ctr0b, $rk13 @ AES block 0 - round 13
|
|
|
|
aese $ctr3b, $rk13 @ AES block 3 - round 13
|
|
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
|
|
b.ge .L256_enc_tail @ handle tail
|
|
|
|
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
|
|
|
|
rev $ctr32w, $rctr32w @ CTR block 4
|
|
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
|
|
|
|
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
|
|
|
|
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
|
|
eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
|
|
eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
|
|
|
|
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
|
|
eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
|
|
|
|
eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
|
|
eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
|
|
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
|
|
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
|
|
eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
|
|
|
|
eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
|
|
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
|
|
|
|
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
|
|
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
|
|
eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
|
|
|
|
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
|
|
|
|
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4
|
|
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4
|
|
rev $ctr32w, $rctr32w @ CTR block 5
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 5
|
|
|
|
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 5
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 5
|
|
rev $ctr32w, $rctr32w @ CTR block 6
|
|
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
|
|
|
|
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
|
|
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
|
|
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 6
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 6
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 6
|
|
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
|
|
rev $ctr32w, $rctr32w @ CTR block 7
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
|
|
|
|
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
|
|
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
|
|
b.ge .L256_enc_prepretail @ do prepretail
|
|
|
|
.L256_enc_main_loop: @ main loop start
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
|
|
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+8
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
|
|
eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
|
|
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
|
|
eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
|
|
|
|
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
|
|
eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
|
|
eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
|
|
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
|
|
eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
|
|
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
|
|
|
|
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
|
|
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
|
|
fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
|
|
|
|
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
|
|
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
|
|
|
|
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
|
|
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
|
|
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
|
|
|
|
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
|
|
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+9
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
|
|
|
|
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
|
|
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
|
|
|
|
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+10
|
|
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
|
|
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
|
|
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
|
|
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
|
|
|
|
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
|
|
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
|
|
|
|
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+11
|
|
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
|
|
|
|
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
|
|
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
|
|
b.lt L256_enc_main_loop
|
|
|
|
.L256_enc_prepretail: @ PREPRETAIL
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
|
|
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
|
|
|
|
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
|
|
|
|
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
eor $acc_mb, $acc_mb, $acc_lb
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
|
|
eor $acc_mb, $acc_mb, $t1.16b
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
|
|
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
|
|
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
|
|
eor $acc_mb, $acc_mb, $acc_hb
|
|
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
|
|
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
|
|
|
|
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
|
|
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8
|
|
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
|
|
|
|
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
|
|
eor $acc_lb, $acc_lb, $t1.16b
|
|
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
|
|
|
|
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
|
|
|
|
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
|
|
|
|
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
|
|
eor $acc_lb, $acc_lb, $acc_mb
|
|
.L256_enc_tail: @ TAIL
|
|
|
|
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
|
|
|
|
eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
|
|
eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
|
|
|
|
cmp $main_end_input_ptr, #48
|
|
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
|
|
|
|
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
|
|
|
|
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
|
|
b.gt .L256_enc_blocks_more_than_3
|
|
|
|
cmp $main_end_input_ptr, #32
|
|
mov $ctr3b, $ctr2b
|
|
movi $acc_l.8b, #0
|
|
|
|
movi $acc_h.8b, #0
|
|
sub $rctr32w, $rctr32w, #1
|
|
|
|
mov $ctr2b, $ctr1b
|
|
movi $acc_m.8b, #0
|
|
b.gt .L256_enc_blocks_more_than_2
|
|
|
|
mov $ctr3b, $ctr1b
|
|
sub $rctr32w, $rctr32w, #1
|
|
cmp $main_end_input_ptr, #16
|
|
|
|
b.gt .L256_enc_blocks_more_than_1
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
b .L256_enc_blocks_less_than_1
|
|
.L256_enc_blocks_more_than_3: @ blocks left > 3
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
|
|
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-3 block
|
|
|
|
eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
|
|
fmov $res1d, $input_l0 @ AES final-2 block - mov low
|
|
|
|
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
|
|
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
|
|
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
|
|
.L256_enc_blocks_more_than_2: @ blocks left > 2
|
|
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
|
|
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-2 block
|
|
|
|
eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
fmov $res1d, $input_l0 @ AES final-1 block - mov low
|
|
eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
|
|
|
|
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
|
|
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
|
|
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
.L256_enc_blocks_more_than_1: @ blocks left > 1
|
|
|
|
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-1 block
|
|
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
|
|
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
|
|
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
fmov $res1d, $input_l0 @ AES final block - mov low
|
|
|
|
fmov $res1.d[1], $input_h0 @ AES final block - mov high
|
|
|
|
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
|
|
eor $res1b, $res1b, $ctr3b @ AES final block - result
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
.L256_enc_blocks_less_than_1: @ blocks left <= 1
|
|
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
|
|
mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
|
|
sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
|
|
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
|
|
|
|
mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
|
|
lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
|
|
cmp $bit_length, #64
|
|
|
|
csel $input_l0, $rk14_l, $rk14_h, lt
|
|
csel $input_h0, $rk14_h, xzr, lt
|
|
|
|
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
|
|
|
|
fmov $ctr0.d[1], $input_h0
|
|
|
|
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
|
|
rev64 $res0b, $res1b @ GHASH final block
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
mov $t0d, $res0.d[1] @ GHASH final block - mid
|
|
rev $ctr32w, $rctr32w
|
|
|
|
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
|
|
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
|
|
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
str $ctr32w, [$counter, #12] @ store the updated counter
|
|
|
|
st1 { $res1b}, [$output_ptr] @ store all 16B
|
|
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
|
|
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
mov x0, $len
|
|
st1 { $acc_l.16b }, [$current_tag]
|
|
|
|
ldp x21, x22, [sp, #16]
|
|
ldp x23, x24, [sp, #32]
|
|
ldp d8, d9, [sp, #48]
|
|
ldp d10, d11, [sp, #64]
|
|
ldp d12, d13, [sp, #80]
|
|
ldp d14, d15, [sp, #96]
|
|
ldp x19, x20, [sp], #112
|
|
ret
|
|
|
|
.L256_enc_ret:
|
|
mov w0, #0x0
|
|
ret
|
|
.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
|
|
___
|
|
|
|
{
|
|
# Lexical register aliases, scoped to the enclosing `{` block opened just
# above, so they only affect assembly text generated inside that block.
# $t8/$t8d name register 4 (v4 as a vector, d4 as its low 64-bit view) and
# $t9/$t9d name register 6 (v6/d6).
# NOTE(review): these presumably shadow earlier declarations of the same
# names so the decrypt kernel uses a different temporary allocation than the
# encrypt kernel -- confirm against the declarations earlier in the file.
my $t8="v4";
|
|
my $t8d="d4";
|
|
my $t9="v6";
|
|
my $t9d="d6";
|
|
#########################################################################################
|
|
# size_t aes_gcm_dec_256_kernel(const unsigned char *in,
|
|
# size_t len,
|
|
# unsigned char *out,
|
|
# const void *key,
|
|
# unsigned char ivec[16],
|
|
# u64 *Xi);
|
|
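#
# NOTE(review): the code below treats the length argument as a bit count
# (byte_len = $bit_length >> 3) and keeps that byte count in $len.  The
# prologue also copies x4 and x5 into x16 and x8 before loading the counter
# block via $counter and the round keys via $cc, which suggests the 5th and
# 6th arguments are ivec and key respectively -- confirm the parameter order
# shown above against the C declaration used by the callers.
#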
|
|
$code.=<<___;
|
|
.global aes_gcm_dec_256_kernel
|
|
.type aes_gcm_dec_256_kernel,%function
|
|
.align 4
|
|
aes_gcm_dec_256_kernel:
|
|
cbz x1, .L256_dec_ret
|
|
stp x19, x20, [sp, #-112]!
|
|
mov x16, x4
|
|
mov x8, x5
|
|
stp x21, x22, [sp, #16]
|
|
stp x23, x24, [sp, #32]
|
|
stp d8, d9, [sp, #48]
|
|
stp d10, d11, [sp, #64]
|
|
stp d12, d13, [sp, #80]
|
|
stp d14, d15, [sp, #96]
|
|
|
|
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
mov $len, $main_end_input_ptr
|
|
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
|
|
|
|
ldr $rk8q, [$cc, #128] @ load rk8
|
|
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
|
|
ldr $rk7q, [$cc, #112] @ load rk7
|
|
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
|
|
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
ldr $rk6q, [$cc, #96] @ load rk6
|
|
|
|
lsr $rctr32x, $ctr96_t32x, #32
|
|
ldr $rk5q, [$cc, #80] @ load rk5
|
|
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
|
|
|
|
ldr $rk3q, [$cc, #48] @ load rk3
|
|
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
rev $rctr32w, $rctr32w @ rev_ctr32
|
|
|
|
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 3
|
|
|
|
rev $ctr32w, $rctr32w @ CTR block 1
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 1
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 1
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
|
|
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 1
|
|
rev $ctr32w, $rctr32w @ CTR block 2
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 2
|
|
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 2
|
|
rev $ctr32w, $rctr32w @ CTR block 3
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
|
|
ldr $rk0q, [$cc, #0] @ load rk0
|
|
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 3
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 3
|
|
|
|
ldr $rk4q, [$cc, #64] @ load rk4
|
|
|
|
ldr $rk13q, [$cc, #208] @ load rk13
|
|
|
|
ldr $rk1q, [$cc, #16] @ load rk1
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
ext $h3b, $h3b, $h3b, #8
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
ext $h4b, $h4b, $h4b, #8
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
ext $h2b, $h2b, $h2b, #8
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
ldr $rk2q, [$cc, #32] @ load rk2
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
ld1 { $acc_lb}, [$current_tag]
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
ldr $rk9q, [$cc, #144] @ load rk9
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
ldr $rk12q, [$cc, #192] @ load rk12
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
ext $h1b, $h1b, $h1b, #8
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
ldr $rk10q, [$cc, #160] @ load rk10
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
ldr $rk11q, [$cc, #176] @ load rk11
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
|
|
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
|
|
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
|
|
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
|
|
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
|
|
|
|
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
|
|
|
|
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
|
|
|
|
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
|
|
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
|
|
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
|
|
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
|
|
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
|
|
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
|
|
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
|
|
|
|
aese $ctr1b, $rk13 @ AES block 1 - round 13
|
|
|
|
aese $ctr2b, $rk13 @ AES block 2 - round 13
|
|
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
|
|
|
|
aese $ctr3b, $rk13 @ AES block 3 - round 13
|
|
|
|
aese $ctr0b, $rk13 @ AES block 0 - round 13
|
|
b.ge .L256_dec_tail @ handle tail
|
|
|
|
ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
|
|
|
|
ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
|
|
|
|
rev $ctr32w, $rctr32w @ CTR block 4
|
|
|
|
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
|
|
|
|
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
|
|
rev64 $res1b, $res1b @ GHASH block 1
|
|
ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
|
|
|
|
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
|
|
rev64 $res0b, $res0b @ GHASH block 0
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4
|
|
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
|
|
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4
|
|
rev $ctr32w, $rctr32w @ CTR block 5
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 5
|
|
|
|
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
|
|
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
|
|
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
|
|
eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
|
|
|
|
eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 5
|
|
|
|
ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 5
|
|
rev $ctr32w, $rctr32w @ CTR block 6
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 6
|
|
|
|
eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
|
|
|
|
eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
|
|
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
|
|
|
|
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
|
|
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
b.ge .L256_dec_prepretail @ do prepretail
|
|
|
|
.L256_dec_main_loop: @ main loop start
|
|
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+7
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+8
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
|
|
ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
|
|
|
|
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
|
|
ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
|
|
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
|
|
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
|
|
ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
|
|
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
|
|
ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
|
|
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
|
|
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
|
|
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
|
|
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
|
|
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
|
|
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
|
|
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
|
|
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
|
|
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+9
|
|
|
|
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
|
|
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
|
|
|
|
eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
|
|
eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
|
|
|
|
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
|
|
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
|
|
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
|
|
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
|
|
|
|
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+10
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
|
|
|
|
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
|
|
|
|
rev64 $res1b, $res1b @ GHASH block 4k+5
|
|
eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
|
|
|
|
eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
|
|
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
|
|
|
|
rev64 $res0b, $res0b @ GHASH block 4k+4
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
b.lt .L256_dec_main_loop
|
|
|
|
|
|
.L256_dec_prepretail: @ PREPRETAIL
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
|
|
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
|
|
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
|
|
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
|
|
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
|
|
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
|
|
|
|
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
|
|
rev $ctr32w, $rctr32w @ CTR block 4k+7
|
|
eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
|
|
rev64 $res2b, $res2b @ GHASH block 4k+2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
|
|
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
|
|
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
|
|
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
|
|
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
|
|
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
|
|
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
|
|
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
|
|
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
|
|
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
|
|
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
|
|
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
|
|
rev64 $res3b, $res3b @ GHASH block 4k+3
|
|
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
|
|
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
|
|
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
|
|
|
|
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
|
|
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
|
|
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
|
|
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
|
|
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
|
|
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
|
|
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
|
|
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
|
|
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
|
|
|
|
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
|
|
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
|
|
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
|
|
|
|
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
|
|
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
|
|
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
|
|
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
|
|
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
|
|
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
|
|
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
|
|
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
|
|
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
|
|
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
|
|
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
|
|
|
|
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
|
|
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
|
|
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
|
|
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
|
|
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
|
|
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
|
|
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
|
|
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
|
|
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
|
|
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
|
|
|
|
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
|
|
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
|
|
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
|
|
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
|
|
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
|
|
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
|
|
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
|
|
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
|
|
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
|
|
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
|
|
eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
|
|
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
|
|
eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
|
|
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
|
|
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
|
|
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
|
|
eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
|
|
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
|
|
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
|
|
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
|
|
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
|
|
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
|
|
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
|
|
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
|
|
|
|
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
|
|
|
|
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
|
|
|
|
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
|
|
|
|
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
.L256_dec_tail: @ TAIL
|
|
|
|
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
|
|
|
|
eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
|
|
|
|
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
|
|
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
|
|
cmp $main_end_input_ptr, #48
|
|
|
|
eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
|
|
|
|
eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
|
|
b.gt .L256_dec_blocks_more_than_3
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
mov $ctr3b, $ctr2b
|
|
movi $acc_m.8b, #0
|
|
|
|
movi $acc_l.8b, #0
|
|
cmp $main_end_input_ptr, #32
|
|
|
|
movi $acc_h.8b, #0
|
|
mov $ctr2b, $ctr1b
|
|
b.gt .L256_dec_blocks_more_than_2
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
|
|
mov $ctr3b, $ctr1b
|
|
cmp $main_end_input_ptr, #16
|
|
b.gt .L256_dec_blocks_more_than_1
|
|
|
|
sub $rctr32w, $rctr32w, #1
|
|
b .L256_dec_blocks_less_than_1
|
|
.L256_dec_blocks_more_than_3: @ blocks left > 3
|
|
rev64 $res0b, $res1b @ GHASH final-3 block
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
|
|
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
|
|
|
|
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
|
|
|
|
mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
|
|
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
|
|
eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
|
|
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
|
|
.L256_dec_blocks_more_than_2: @ blocks left > 2
|
|
|
|
rev64 $res0b, $res1b @ GHASH final-2 block
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
|
|
|
|
eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
|
|
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
|
|
.L256_dec_blocks_more_than_1: @ blocks left > 1
|
|
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
|
|
rev64 $res0b, $res1b @ GHASH final-1 block
|
|
|
|
ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
|
|
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
|
|
|
|
eor $ctr0b, $res1b, $ctr3b @ AES final block - result
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
|
|
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
mov $output_l0, $ctr0.d[0] @ AES final block - mov low
|
|
|
|
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
|
|
mov $output_h0, $ctr0.d[1] @ AES final block - mov high
|
|
|
|
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
|
|
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
|
|
.L256_dec_blocks_less_than_1: @ blocks left <= 1
|
|
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
|
|
|
|
sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
|
|
|
|
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
|
|
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
|
|
and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
|
|
lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
|
|
cmp $bit_length, #64
|
|
|
|
csel $ctr32x, $rk14_l, $rk14_h, lt
|
|
csel $ctr96_b64x, $rk14_h, xzr, lt
|
|
|
|
fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
|
|
and $output_l0, $output_l0, $ctr32x
|
|
|
|
mov $ctr0.d[1], $ctr96_b64x
|
|
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
|
|
|
|
rev $ctr32w, $rctr32w
|
|
|
|
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
|
|
|
|
orr $output_l0, $output_l0, $end_input_ptr
|
|
|
|
and $output_h0, $output_h0, $ctr96_b64x
|
|
|
|
orr $output_h0, $output_h0, $main_end_input_ptr
|
|
|
|
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
|
|
rev64 $res0b, $res1b @ GHASH final block
|
|
|
|
eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
|
|
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
|
|
mov $t0d, $res0.d[1] @ GHASH final block - mid
|
|
|
|
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
|
|
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
|
|
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
|
|
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
|
|
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
|
|
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
movi $mod_constant.8b, #0xc2
|
|
|
|
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
|
|
|
|
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
|
|
|
|
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
|
|
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
|
|
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
|
|
|
|
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
|
|
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
|
|
|
|
stp $output_l0, $output_h0, [$output_ptr]
|
|
|
|
str $ctr32w, [$counter, #12] @ store the updated counter
|
|
|
|
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
mov x0, $len
|
|
st1 { $acc_l.16b }, [$current_tag]
|
|
|
|
ldp x21, x22, [sp, #16]
|
|
ldp x23, x24, [sp, #32]
|
|
ldp d8, d9, [sp, #48]
|
|
ldp d10, d11, [sp, #64]
|
|
ldp d12, d13, [sp, #80]
|
|
ldp d14, d15, [sp, #96]
|
|
ldp x19, x20, [sp], #112
|
|
ret
|
|
|
|
.L256_dec_ret:
|
|
mov w0, #0x0
|
|
ret
|
|
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
|
|
___
|
|
}
|
|
}
|
|
|
|
# Append the fixed trailer to the generated assembly text: an identification
# string, an alignment directive and a closing "#endif" (presumably matching a
# conditional emitted earlier in $code -- the opening is not in this part of
# the file).  Nothing is interpolated, so a non-interpolating heredoc is used.
$code .= <<'___';
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2
#endif
___
|
|
|
|
if ($flavour =~ /64/) { ######## 64-bit code
|
|
# Turn an operand pair of the form "qD#lo|hi, qS#lo|hi" into the text of an
# AArch64 "ins" instruction that copies one 64-bit lane into another.
#   - q registers 0..7 keep their number; 8 and above are emitted as v(N+8).
#   - "lo" selects lane d[0], "hi" selects lane d[1].
# Returns the instruction text, or a false value when the operands do not
# have the expected shape.
sub unvmov {
    my ($operands) = @_;

    my $matched = $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o;
    return $matched unless $matched;

    my ($dst_q, $dst_half, $src_q, $src_half) = ($1, $2, $3, $4);

    my $dst_v    = $dst_q < 8 ? $dst_q : $dst_q + 8;
    my $src_v    = $src_q < 8 ? $src_q : $src_q + 8;
    my $dst_lane = $dst_half eq "lo" ? 0 : 1;
    my $src_lane = $src_half eq "lo" ? 0 : 1;

    return sprintf "ins v%d.d[%d],v%d.d[%d]",
        $dst_v, $dst_lane, $src_v, $src_lane;
}
|
|
# 64-bit flavour: emit the accumulated assembly one line at a time.
for my $line (split /\n/, $code) {
    # old->new style commentary: the first "@" that is followed by a
    # whitespace character is replaced (together with that character) by "//"
    $line =~ s{@\s}{//}o;

    print $line, "\n";
}
|
|
} else { ######## 32-bit code
|
|
# Rewrite the operands of a "vdup.32 qD,qS[lane]" (lane 0..3) so that the
# source is expressed as a d register plus a lane of 0 or 1:
#   d register = 2*S + (lane >> 1),  d lane = lane & 1.
# Returns the complete "vdup.32 ..." text, or a false value when the operands
# do not have the expected shape.
sub unvdup32 {
    my ($operands) = @_;

    my $matched = $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o;
    return $matched unless $matched;

    my ($dst_q, $src_q, $lane) = ($1, $2, $3);

    my $src_d  = 2 * $src_q + ($lane >> 1);
    my $d_lane = $lane & 1;

    return sprintf "vdup.32 q%d,d%d[%d]", $dst_q, $src_d, $d_lane;
}
|
|
# Hand-encode a pmull/pmull2 ".p64" instruction on three q registers as a
# raw 32-bit instruction word and return it as an "INST(b0,b1,b2,b3)" line
# followed by the original mnemonic and operands as an "@" comment.
#
#   $mnemonic  "pmull" or "pmull2"; any mnemonic containing "2" sets the
#              extra 0x00010001 bits in the word.
#   $operands  text of the form "qD,qN,qM".
#
# For each register the low three bits of its number and bit 3 are placed in
# separate positions of the word.  Returns a false value when the operands do
# not have the expected shape.
sub unvpmullp64 {
    my ($mnemonic, $operands) = @_;

    my $matched = $operands =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o;
    return $matched unless $matched;

    my ($qd, $qn, $qm) = ($1, $2, $3);

    my $word = 0xf2a00e00;
    $word |= (($qd & 7) << 13) | (($qd & 8) << 19);
    $word |= (($qn & 7) << 17) | (($qn & 8) << 4);
    $word |= (($qm & 7) << 1)  | (($qm & 8) << 2);
    $word |= 0x00010001 if $mnemonic =~ /2/;

    # The word is emitted least-significant byte first, since ARMv7
    # instructions are always encoded little-endian.  The correct solution
    # would be the .inst directive, but older assemblers don't implement it.
    my @bytes = map { ($word >> $_) & 0xff } (0, 8, 16, 24);

    return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
        @bytes, $mnemonic, $operands;
}
|
|
|
|
# 32-bit flavour: rewrite each line of the accumulated assembly from the
# new-style (AArch64) spelling to the old-style one, then print it.
for my $line (split /\n/, $code) {
    $line =~ s{\b[wx]([0-9]+)\b}{r$1}go;               # new->old registers
    $line =~ s{\bv([0-9])\.[12468]+[bsd]\b}{q$1}go;    # new->old registers
    $line =~ s{//\s?}{@ }o;                            # new->old style commentary

    # fix up remaining new-style suffixes
    $line =~ s{\],#[0-9]+}{]!}o;

    # Instruction rewrites: the chain stops at the first substitution that
    # applies, so the order below matters.
    $line =~ s{cclr\s+([^,]+),\s*([a-z]+)}{mov.$2 $1,#0}o
        or $line =~ s{vdup\.32\s+(.*)}{unvdup32($1)}geo
        or $line =~ s{v?(pmull2?)\.p64\s+(.*)}{unvpmullp64($1,$2)}geo
        or $line =~ s{\bq([0-9]+)#(lo|hi)}{sprintf "d%d",2*$1+($2 eq "hi")}geo
        or $line =~ s{^(\s+)b\.}{$1b}o
        or $line =~ s{^(\s+)ret}{$1bx\tlr}o;

    # A "mov.<cond>" left by the cclr rewrite becomes "mov<cond>", and an
    # "it <cond>" line is printed ahead of it.
    if ($line =~ s{^(\s+)mov\.([a-z]+)}{$1mov$2}) {
        print " it $2\n";
    }

    print $line, "\n";
}
|
|
}
|
|
|
|
# Closing STDOUT flushes any buffered output; a failure here means the
# generated assembly may be truncated, so abort.  Report both the OS error
# and the wait status: when the handle is a pipe (the accompanying commit note
# says STDOUT is a pipe to the xlate driver) and the only problem is a child
# that exited non-zero, $! is 0 and the reason is in $?.
close STDOUT or die "error closing STDOUT: $! (status $?)"; # enforce flush
|