openssl/crypto/modes/asm/aes-gcm-armv8_64.pl
David Benjamin 32be631ca1 Do not silently truncate files on perlasm errors
If one of the perlasm xlate drivers crashes, OpenSSL's build will
currently swallow the error and silently truncate the output to however
far the driver got. This will hopefully fail to build, but better to
check such things.

Handle this by checking for errors when closing STDOUT (which is a pipe
to the xlate driver).

Reviewed-by: Richard Levitte <levitte@openssl.org>
Reviewed-by: Tim Hudson <tjh@openssl.org>
Reviewed-by: Tomas Mraz <tmraz@fedoraproject.org>
(Merged from https://github.com/openssl/openssl/pull/10883)
2020-01-22 18:11:30 +01:00

5723 lines
272 KiB
Perl
Executable File

#! /usr/bin/env perl
# Copyright 2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#========================================================================
# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
# derived from https://github.com/ARM-software/AArch64cryptolib, original
# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
#========================================================================
#
# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
#
# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
#
# ____________________________________________________
# | |
# | PRE |
# |____________________________________________________|
# | | | |
# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
# |________________|________________|__________________|
# | | | |
# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
# |________________|________________|__________________|
# | | | |
# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
# |________________|________________|__________________|
# | | | |
# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
# |________________|____(mostly)____|__________________|
# | |
# | MODULO |
# |____________________________________________________|
#
# PRE:
# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0
# EXT low_acc, low_acc, low_acc, #8
# EOR res_curr (4k+0), res_curr (4k+0), low_acc
#
# CTR block:
# Increment and byte reverse counter in scalar registers and transfer to SIMD registers
# REV ctr32, rev_ctr32
# ORR ctr64, constctr96_top32, ctr32, LSL #32
# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
# INS ctr_next.d[1], ctr64X
# ADD rev_ctr32, #1
#
# AES block:
# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below for example.
# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
# Given we are very constrained in our ASIMD registers this is quite important
#
# Encrypt:
# LDR input_low, [ input_ptr ], #8
# LDR input_high, [ input_ptr ], #8
# EOR input_low, k14_low
# EOR input_high, k14_high
# INS res_curr.d[0], input_low
# INS res_curr.d[1], input_high
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k13
# EOR res_curr, res_curr, ctr_curr
# ST1 { res_curr.16b }, [ output_ptr ], #16
#
# Decrypt:
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k13
# LDR res_curr, [ input_ptr ], #16
# EOR res_curr, res_curr, ctr_curr
# MOV output_low, res_curr.d[0]
# MOV output_high, res_curr.d[1]
# EOR output_low, k14_low
# EOR output_high, k14_high
# STP output_low, output_high, [ output_ptr ], #16
#
# GHASH block X:
# do 128b karatsuba polynomial multiplication on block
# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
#
# multiplication:
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
#
# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
#
# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
# multiplying with "twisted" powers of H
#
# Note: We can PMULL directly into the acc_x in first GHASH of the loop
# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
# path latency dominates the performance
#
# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
# than indicated here
# REV64 res_curr, res_curr
# INS t_m.d[0], res_curr.d[1]
# EOR t_m.8B, t_m.8B, res_curr.8B
# PMULL2 t_h, res_curr, HX
# PMULL t_l, res_curr, HX
# PMULL t_m, t_m, HX_k
# EOR acc_h, acc_h, t_h
# EOR acc_l, acc_l, t_l
# EOR acc_m, acc_m, t_m
#
# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
# with a reversed constant
# EOR acc_m, acc_m, acc_h
# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
# PMULL t_mod, acc_h, mod_constant
# EXT acc_h, acc_h, acc_h, #8
# EOR acc_m, acc_m, acc_h
# EOR acc_m, acc_m, t_mod
# PMULL acc_h, acc_m, mod_constant
# EXT acc_m, acc_m, acc_m, #8
# EOR acc_l, acc_l, acc_h
# EOR acc_l, acc_l, acc_m
# Command-line handling:
#   $output  - last argument, taken only if it looks like a file name with an
#              extension (".\w+" at the end); otherwise undef.
#   $flavour - first argument, taken only if it contains no dot; otherwise undef.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate arm-xlate.pl: first next to this script, then in ../../perlasm/
# relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the xlate driver,
# run with the same perl interpreter ($^X). Fail loudly if the pipe cannot be
# set up instead of silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Register names for the kernel's argument block.
# x0..x3 are referenced directly under these names by the generated code.
($input_ptr, $bit_length, $output_ptr, $current_tag) = map { "x$_" } 0 .. 3;
# The kernel prologue copies x4 into x16 and x5 into x8 before using them
# under these names.
($counter, $cc) = ("x16", "x8");
{
# ---- General-purpose register aliases -------------------------------------
# Input bookkeeping plus the scalar halves of block 0.
my ($end_input_ptr, $main_end_input_ptr, $input_l0, $input_h0) = map { "x$_" } 4 .. 7;
# Scalar halves of blocks 1..3. The $output_* names below map onto the very
# same registers (x19..x24) as the $input_* names.
my ($input_l1, $input_h1, $input_l2, $input_h2, $input_l3, $input_h3) = map { "x$_" } 19 .. 24;
my ($output_l1, $output_h1, $output_l2, $output_h2, $output_l3, $output_h3) = map { "x$_" } 19 .. 24;
# Same registers as $input_l0/$input_h0.
my ($output_l0, $output_h0) = map { "x$_" } 6 .. 7;

# Counter handling and last round key; the "w" names are the 32-bit views of
# x9, x11 and x12.
my $ctr32w = "w9";
my ($ctr32x, $ctr96_b64x, $ctr96_t32x, $rctr32x, $rk10_l, $rk10_h, $len) = map { "x$_" } 9 .. 15;
my ($ctr96_t32w, $rctr32w) = map { "w$_" } 11 .. 12;

# ---- SIMD register aliases --------------------------------------------------
# v0..v3 hold the counter blocks, v4..v7 the result blocks; each is named in
# its .16b, bare, d and (results only) q form.
my ($ctr0b, $ctr1b, $ctr2b, $ctr3b, $res0b, $res1b, $res2b, $res3b) = map { "v$_.16b" } 0 .. 7;
my ($ctr0, $ctr1, $ctr2, $ctr3, $res0, $res1, $res2, $res3) = map { "v$_" } 0 .. 7;
my ($ctr0d, $ctr1d, $ctr2d, $ctr3d, $res0d, $res1d, $res2d, $res3d) = map { "d$_" } 0 .. 7;
my ($res0q, $res1q, $res2q, $res3q) = map { "q$_" } 4 .. 7;

# GHASH accumulators (high / mid / low) in v9..v11.
my ($acc_hb, $acc_mb, $acc_lb) = map { "v$_.16b" } 9 .. 11;
my ($acc_h, $acc_m, $acc_l) = map { "v$_" } 9 .. 11;
my ($acc_hd, $acc_md, $acc_ld) = map { "d$_" } 9 .. 11;

# Hash key powers in v12..v15 and the h12k/h34k values in v16..v17.
my ($h1, $h2, $h3, $h4, $h12k, $h34k) = map { "v$_" } 12 .. 17;
my ($h1q, $h2q, $h3q, $h4q) = map { "q$_" } 12 .. 15;
my ($h1b, $h2b, $h3b, $h4b) = map { "v$_.16b" } 12 .. 15;

# Temporaries. Several names deliberately map to the same physical register:
#   v8  : $t0, $t4, $mod_constant      v28 : $t1, $t5
#   v29 : $t2, $t8                     v30 : $t3, $t9
#   v31 : $t6, $mod_t                  v4  : $t7 (also $res0 / $ctr_t0)
my ($t0, $t0d) = ("v8", "d8");
my ($t1, $t2, $t3) = map { "v$_" } 28 .. 30;
my ($t1d, $t2d, $t3d) = map { "d$_" } 28 .. 30;
my ($t4, $t4d) = ("v8", "d8");
my ($t5, $t5d) = ("v28", "d28");
my ($t6, $t6d) = ("v31", "d31");
my ($t7, $t7d) = ("v4", "d4");
my ($t8, $t8d) = ("v29", "d29");
my ($t9, $t9d) = ("v30", "d30");

# Staging names for v4..v7 (the same registers as $res0..$res3).
my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3) = map { "v$_" } 4 .. 7;
my ($ctr_t0d, $ctr_t1d, $ctr_t2d, $ctr_t3d) = map { "d$_" } 4 .. 7;
my ($ctr_t0b, $ctr_t1b, $ctr_t2b, $ctr_t3b) = map { "v$_.16b" } 4 .. 7;

# Reduction constant and its scratch register.
my ($mod_constantd, $mod_constant) = ("d8", "v8");
my $mod_t = "v31";

# Round keys rk0..rk9 in v18..v27 (rk10 is kept in scalar $rk10_l/$rk10_h).
my ($rk0, $rk1, $rk2, $rk3, $rk4, $rk5, $rk6, $rk7, $rk8, $rk9) = map { "v$_.16b" } 18 .. 27;
my ($rk0q, $rk1q, $rk2q, $rk3q, $rk4q, $rk5q, $rk6q, $rk7q, $rk8q, $rk9q) = map { "q$_" } 18 .. 27;
# Alternative views of v20..v22; the tail code uses these names as GHASH
# temporaries.
my ($rk2q1, $rk3q1) = ("v20.1q", "v21.1q");
my ($rk4v, $rk4d) = ("v22", "d22");
# Start the generated assembly: include arm_arch.h and open the
# __ARM_MAX_ARCH__>=8 conditional.
$code = <<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=8
___
if ($flavour =~ /64/) {
    # Flavours containing "64": enable the crypto extension and switch to .text.
    $code .= ".arch armv8-a+crypto\n.text\n";
} else {
    # Every other flavour: NEON setup plus an INST() macro whose byte order
    # depends on whether __thumb2__ is defined.
    $code .= <<___;
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,0xf2
#endif
.text
___
}
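#########################################################################################
# size_t aes_gcm_enc_128_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# u64 *Xi,
# unsigned char ivec[16],
# const void *key);
#
# Argument order follows the registers the code reads: x3 is the tag/hash-key
# table pointer (Xi), x4 the counter block (ivec) and x5 the round keys (key).
# len is a bit count; the code shifts it right by 3 to get the byte length.
#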
$code.=<<___;
.global aes_gcm_enc_128_kernel
.type aes_gcm_enc_128_kernel,%function
.align 4
aes_gcm_enc_128_kernel:
cbz x1, .L128_enc_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
ld1 {$acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldr $rk9q, [$cc, #144] @ load rk9
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
lsr $rctr32x, $ctr96_t32x, #32
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
fmov $ctr1d, $ctr96_b64x @ CTR block 1
rev $rctr32w, $rctr32w @ rev_ctr32
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
ldr $rk0q, [$cc, #0] @ load rk0
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr3d, $ctr96_b64x @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
ldr $rk1q, [$cc, #16] @ load rk1
add $rctr32w, $rctr32w, #1 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $rk2q, [$cc, #32] @ load rk2
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
ldr $rk8q, [$cc, #128] @ load rk8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $rk3q, [$cc, #48] @ load rk3
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
ldr $rk6q, [$cc, #96] @ load rk6
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $rk7q, [$cc, #112] @ load rk7
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ldr $rk5q, [$cc, #80] @ load rk5
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ldr $rk4q, [$cc, #64] @ load rk4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr2b, $rk9 @ AES block 2 - round 9
aese $ctr0b, $rk9 @ AES block 0 - round 9
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr1b, $rk9 @ AES block 1 - round 9
aese $ctr3b, $rk9 @ AES block 3 - round 9
b.ge .L128_enc_tail @ handle tail
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
rev $ctr32w, $rctr32w @ CTR block 4
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4
add $rctr32w, $rctr32w, #1 @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr1.d[1], $ctr32x @ CTR block 5
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
rev $ctr32w, $rctr32w @ CTR block 6
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
fmov $ctr2d, $ctr96_b64x @ CTR block 6
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
fmov $ctr2.d[1], $ctr32x @ CTR block 6
rev $ctr32w, $rctr32w @ CTR block 7
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
b.ge .L128_enc_prepretail @ do prepretail
.L128_enc_main_loop: @ main loop start
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
rev $ctr32w, $rctr32w @ CTR block 4k+8
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
movi $mod_constant.8b, #0xc2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
rev $ctr32w, $rctr32w @ CTR block 4k+11
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
b.lt .L128_enc_main_loop
.L128_enc_prepretail: @ PREPRETAIL
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
eor $res0b, $res0b, $acc_lb @ PRE 1
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
movi $mod_constant.8b, #0xc2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
ext $acc_hb, $acc_hb, $acc_hb, #8
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $acc_lb
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_mb, $acc_mb, $t1.16b
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $acc_hb
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
ext $acc_mb, $acc_mb, $acc_mb, #8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $t1.16b
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
eor $acc_lb, $acc_lb, $acc_mb
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
.L128_enc_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
cmp $main_end_input_ptr, #48
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
b.gt .L128_enc_blocks_more_than_3
sub $rctr32w, $rctr32w, #1
movi $acc_l.8b, #0
mov $ctr3b, $ctr2b
cmp $main_end_input_ptr, #32
mov $ctr2b, $ctr1b
movi $acc_h.8b, #0
movi $acc_m.8b, #0
b.gt .L128_enc_blocks_more_than_2
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
sub $rctr32w, $rctr32w, #1
b.gt .L128_enc_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
rev64 $res0b, $res1b @ GHASH final-3 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
fmov $res1d, $input_l0 @ AES final-2 block - mov low
movi $t0.8b, #0 @ suppress further partial tag feed in
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
.L128_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
rev64 $res0b, $res1b @ GHASH final-2 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
fmov $res1d, $input_l0 @ AES final-1 block - mov low
eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L128_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
fmov $res1d, $input_l0 @ AES final block - mov low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
fmov $res1.d[1], $input_h0 @ AES final block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $res1b, $res1b, $ctr3b @ AES final block - result
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
.L128_enc_blocks_less_than_1: @ blocks left <= 1
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rk10_l, $rk10_h, lt
csel $input_h0, $rk10_h, xzr, lt
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
fmov $ctr0.d[1], $input_h0
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $t0d, $res0.d[1] @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
rev $ctr32w, $rctr32w
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
st1 { $res1b}, [$output_ptr] @ store all 16B
str $ctr32w, [$counter, #12] @ store the updated counter
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L128_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
___
#########################################################################################
# size_t aes_gcm_dec_128_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# const void *key,
# unsigned char ivec[16],
# u64 *Xi);
#
$code.=<<___;
.global aes_gcm_dec_128_kernel
.type aes_gcm_dec_128_kernel,%function
.align 4
aes_gcm_dec_128_kernel:
cbz x1, .L128_dec_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
ldr $rk0q, [$cc, #0] @ load rk0
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
lsr $rctr32x, $ctr96_t32x, #32
fmov $ctr2d, $ctr96_b64x @ CTR block 2
ldr $rk1q, [$cc, #16] @ load rk1
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
rev $rctr32w, $rctr32w @ rev_ctr32
fmov $ctr1d, $ctr96_b64x @ CTR block 1
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
rev $ctr32w, $rctr32w @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ldr $rk2q, [$cc, #32] @ load rk2
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
fmov $ctr3d, $ctr96_b64x @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $rk3q, [$cc, #48] @ load rk3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ldr $rk6q, [$cc, #96] @ load rk6
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $rk7q, [$cc, #112] @ load rk7
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $rk4q, [$cc, #64] @ load rk4
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
ldr $rk5q, [$cc, #80] @ load rk5
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ldr $rk9q, [$cc, #144] @ load rk9
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
ldr $rk8q, [$cc, #128] @ load rk8
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr2b, $rk9 @ AES block 2 - round 9
aese $ctr3b, $rk9 @ AES block 3 - round 9
aese $ctr0b, $rk9 @ AES block 0 - round 9
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr1b, $rk9 @ AES block 1 - round 9
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
b.ge .L128_dec_tail @ handle tail
ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
rev64 $res0b, $res0b @ GHASH block 0
rev $ctr32w, $rctr32w @ CTR block 4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
add $rctr32w, $rctr32w, #1 @ CTR block 4
ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
rev64 $res1b, $res1b @ GHASH block 1
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
fmov $ctr0d, $ctr96_b64x @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
fmov $ctr1d, $ctr96_b64x @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
b.ge .L128_dec_prepretail @ do prepretail
.L128_dec_main_loop: @ main loop start - 4 blocks (64B) of ciphertext per iteration
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
rev64 $res2b, $res2b @ GHASH block 4k+2
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
eor $res0b, $res0b, $acc_lb @ PRE 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
rev64 $res3b, $res3b @ GHASH block 4k+3
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
movi $mod_constant.8b, #0xc2
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
rev $ctr32w, $rctr32w @ CTR block 4k+8
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
rev64 $res1b, $res1b @ GHASH block 4k+5
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
rev64 $res0b, $res0b @ GHASH block 4k+4
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
b.lt .L128_dec_main_loop @ loop while input_ptr is below main_end_input_ptr (flags from the LOOP CONTROL cmp; nothing in between sets flags)
.L128_dec_prepretail: @ PREPRETAIL
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
eor $res0b, $res0b, $acc_lb @ PRE 1
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
rev64 $res2b, $res2b @ GHASH block 4k+2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
rev64 $res3b, $res3b @ GHASH block 4k+3
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
movi $mod_constant.8b, #0xc2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
.L128_dec_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
cmp $main_end_input_ptr, #48
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
b.gt .L128_dec_blocks_more_than_3
mov $ctr3b, $ctr2b
sub $rctr32w, $rctr32w, #1
movi $acc_l.8b, #0
movi $acc_h.8b, #0
mov $ctr2b, $ctr1b
movi $acc_m.8b, #0
cmp $main_end_input_ptr, #32
b.gt .L128_dec_blocks_more_than_2
cmp $main_end_input_ptr, #16
mov $ctr3b, $ctr1b
sub $rctr32w, $rctr32w, #1
b.gt .L128_dec_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L128_dec_blocks_less_than_1
.L128_dec_blocks_more_than_3: @ blocks left > 3
rev64 $res0b, $res1b @ GHASH final-3 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
.L128_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
.L128_dec_blocks_more_than_1: @ blocks left > 1
rev64 $res0b, $res1b @ GHASH final-1 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
eor $ctr0b, $res1b, $ctr3b @ AES final block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
mov $output_l0, $ctr0.d[0] @ AES final block - mov low
mov $output_h0, $ctr0.d[1] @ AES final block - mov high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L128_dec_blocks_less_than_1: @ blocks left <= 1
mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
cmp $bit_length, #64
csel $ctr96_b64x, $rk10_h, xzr, lt
csel $ctr32x, $rk10_l, $rk10_h, lt
fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
mov $ctr0.d[1], $ctr96_b64x
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
and $output_h0, $output_h0, $ctr96_b64x
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
mov $t0d, $res0.d[1] @ GHASH final block - mid
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
and $output_l0, $output_l0, $ctr32x
rev $ctr32w, $rctr32w
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
orr $output_l0, $output_l0, $end_input_ptr
str $ctr32w, [$counter, #12] @ store the updated counter
orr $output_h0, $output_h0, $main_end_input_ptr
stp $output_l0, $output_h0, [$output_ptr]
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L128_dec_ret:
mov w0, #0x0
ret
.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
___
}
{
# Register aliases for the kernel emitted below.  Many names intentionally
# map to the same physical register; the groups are laid out so that the
# sharing is visible at a glance.

# ---- general-purpose registers -------------------------------------------
my ($end_input_ptr, $main_end_input_ptr, $input_l0, $input_h0) = map { "x$_" } 4 .. 7;
# output_l0/output_h0 are the same registers as input_l0/input_h0 (x6/x7).
my ($output_l0, $output_h0) = map { "x$_" } 6 .. 7;
# input_lN/input_hN and output_lN/output_hN (N = 1..3) both name x19..x24.
my ($input_l1, $input_h1, $input_l2, $input_h2, $input_l3, $input_h3) = map { "x$_" } 19 .. 24;
my ($output_l1, $output_h1, $output_l2, $output_h2, $output_l3, $output_h3) = map { "x$_" } 19 .. 24;
# x9..x15; w9, w11 and w12 are the 32-bit views of ctr32x, ctr96_t32x and
# rctr32x respectively.
my ($ctr32x, $ctr96_b64x, $ctr96_t32x, $rctr32x, $rk12_l, $rk12_h, $len) = map { "x$_" } 9 .. 15;
my $ctr32w = "w9";
my ($ctr96_t32w, $rctr32w) = map { "w$_" } 11 .. 12;

# ---- v0..v7: counter blocks and result blocks ----------------------------
my ($ctr0, $ctr1, $ctr2, $ctr3, $res0, $res1, $res2, $res3) = map { "v$_" } 0 .. 7;
my ($ctr0b, $ctr1b, $ctr2b, $ctr3b, $res0b, $res1b, $res2b, $res3b) = map { "v$_.16b" } 0 .. 7;
my ($ctr0d, $ctr1d, $ctr2d, $ctr3d, $res0d, $res1d, $res2d, $res3d) = map { "d$_" } 0 .. 7;
my ($res0q, $res1q, $res2q, $res3q) = map { "q$_" } 4 .. 7;
# ctr_t0..ctr_t3 are further names for v4..v7 (the res0..res3 registers).
my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3) = map { "v$_" } 4 .. 7;
my ($ctr_t0b, $ctr_t1b, $ctr_t2b, $ctr_t3b) = map { "v$_.16b" } 4 .. 7;
my ($ctr_t0d, $ctr_t1d, $ctr_t2d, $ctr_t3d) = map { "d$_" } 4 .. 7;

# ---- v9..v11: GHASH accumulators (high, mid, low) -------------------------
my ($acc_h, $acc_m, $acc_l) = map { "v$_" } 9 .. 11;
my ($acc_hb, $acc_mb, $acc_lb) = map { "v$_.16b" } 9 .. 11;
my ($acc_hd, $acc_md, $acc_ld) = map { "d$_" } 9 .. 11;

# ---- v12..v17: hash key powers h1..h4 and the combined h12k/h34k values ---
my ($h1, $h2, $h3, $h4, $h12k, $h34k) = map { "v$_" } 12 .. 17;
my ($h1b, $h2b, $h3b, $h4b) = map { "v$_.16b" } 12 .. 15;
my ($h1q, $h2q, $h3q, $h4q) = map { "q$_" } 12 .. 15;

# ---- temporaries, grouped by the physical register they occupy -----------
# v8: t0, t5 and the reduction constant (filled with 0xc2 bytes and shifted
# left by 56 before the MODULO pmull steps).
my ($t0, $t0d) = ("v8", "d8");
my ($t5, $t5d) = ("v8", "d8");
my ($mod_constant, $mod_constantd) = ("v8", "d8");
# v30: t1, t4 and t9.  v31: t2, t6 and mod_t.
my ($t1, $t2) = map { "v$_" } 30 .. 31;
my ($t1d, $t2d) = map { "d$_" } 30 .. 31;
my ($t4, $t4d) = ("v30", "d30");
my ($t9, $t9d) = ("v30", "d30");
my ($t6, $t6d) = ("v31", "d31");
my $mod_t = "v31";
# v4, v5, v6: t3, t7, t8 (these overlap res0/res1/res2 and ctr_t0..ctr_t2).
my ($t3, $t3d) = ("v4", "d4");
my ($t7, $t7d) = ("v5", "d5");
my ($t8, $t8d) = ("v6", "d6");

# ---- v18..v29: AES round keys rk0..rk11 ----------------------------------
my ($rk0, $rk1, $rk2, $rk3, $rk4, $rk5, $rk6, $rk7, $rk8, $rk9, $rk10, $rk11) = map { "v$_.16b" } 18 .. 29;
my ($rk0q, $rk1q, $rk2q, $rk3q, $rk4q, $rk5q, $rk6q, $rk7q, $rk8q, $rk9q, $rk10q, $rk11q) = map { "q$_" } 18 .. 29;
# Alternative views of the rk2/rk3/rk4 registers (v20/v21/v22); the tail code
# uses them as destinations for GHASH pmull/pmull2 results.
my ($rk2q1, $rk3q1) = ("v20.1q", "v21.1q");
my ($rk4v, $rk4d) = ("v22", "d22");
#########################################################################################
# size_t aes_gcm_enc_192_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# const void *key,
# unsigned char ivec[16],
# u64 *Xi);
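#
# NOTE: the length consumed by the kernel ($bit_length) is a bit count; it is
# shifted right by 3 to obtain the byte length.
# NOTE(review): the prologue copies x4 to x16 and x5 to x8 before the counter
# is read through $counter and the round keys through $cc.  If those aliases
# are x16/x8, the 4th-6th arguments are really (Xi, ivec, key) rather than the
# order shown above - confirm against the C declaration.
#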
$code.=<<___;
.global aes_gcm_enc_192_kernel
.type aes_gcm_enc_192_kernel,%function
.align 4
aes_gcm_enc_192_kernel:
cbz x1, .L192_enc_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ldr $rk5q, [$cc, #80] @ load rk5
ldr $rk4q, [$cc, #64] @ load rk4
ldr $rk8q, [$cc, #128] @ load rk8
lsr $rctr32x, $ctr96_t32x, #32
ldr $rk6q, [$cc, #96] @ load rk6
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
ldr $rk7q, [$cc, #112] @ load rk7
rev $rctr32w, $rctr32w @ rev_ctr32
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
fmov $ctr3d, $ctr96_b64x @ CTR block 3
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr1d, $ctr96_b64x @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
ldr $rk0q, [$cc, #0] @ load rk0
fmov $ctr3.d[1], $ctr32x @ CTR block 3
ldr $rk3q, [$cc, #48] @ load rk3
ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
ldr $rk1q, [$cc, #16] @ load rk1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $rk11q, [$cc, #176] @ load rk11
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $rk2q, [$cc, #32] @ load rk2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ldr $rk9q, [$cc, #144] @ load rk9
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr2b, $rk11 @ AES block 2 - round 11
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr1b, $rk11 @ AES block 1 - round 11
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr0b, $rk11 @ AES block 0 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 3
aese $ctr3b, $rk11 @ AES block 3 - round 11
b.ge .L192_enc_tail @ handle tail
rev $ctr32w, $rctr32w @ CTR block 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
fmov $ctr2d, $ctr96_b64x @ CTR block 6
fmov $ctr2.d[1], $ctr32x @ CTR block 6
rev $ctr32w, $rctr32w @ CTR block 7
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
b.ge .L192_enc_prepretail @ do prepretail
.L192_enc_main_loop: @ main loop start
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
rev $ctr32w, $rctr32w @ CTR block 4k+8
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
movi $mod_constant.8b, #0xc2
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
rev $ctr32w, $rctr32w @ CTR block 4k+11
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
b.lt .L192_enc_main_loop
.L192_enc_prepretail: @ PREPRETAIL
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
eor $res0b, $res0b, $acc_lb @ PRE 1
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
movi $mod_constant.8b, #0xc2
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_mb, $acc_mb, $acc_lb
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
ext $acc_hb, $acc_hb, $acc_hb, #8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $t1.16b
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $acc_mb, $acc_mb, $acc_hb
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
ext $acc_mb, $acc_mb, $acc_mb, #8
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_lb, $acc_lb, $t1.16b
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
eor $acc_lb, $acc_lb, $acc_mb
.L192_enc_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
cmp $main_end_input_ptr, #48
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
b.gt .L192_enc_blocks_more_than_3
sub $rctr32w, $rctr32w, #1
movi $acc_m.8b, #0
mov $ctr3b, $ctr2b
movi $acc_h.8b, #0
cmp $main_end_input_ptr, #32
mov $ctr2b, $ctr1b
movi $acc_l.8b, #0
b.gt .L192_enc_blocks_more_than_2
sub $rctr32w, $rctr32w, #1
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
b.gt .L192_enc_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L192_enc_blocks_less_than_1
.L192_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
rev64 $res0b, $res1b @ GHASH final-3 block
eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
fmov $res1d, $input_l0 @ AES final-2 block - mov low
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
.L192_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
rev64 $res0b, $res1b @ GHASH final-2 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
fmov $res1d, $input_l0 @ AES final-1 block - mov low
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L192_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
rev64 $res0b, $res1b @ GHASH final-1 block
eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
fmov $res1d, $input_l0 @ AES final block - mov low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
fmov $res1.d[1], $input_h0 @ AES final block - mov high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $res1b, $res1b, $ctr3b @ AES final block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L192_enc_blocks_less_than_1: @ blocks left <= 1
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
rev $ctr32w, $rctr32w
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rk12_l, $rk12_h, lt
csel $input_h0, $rk12_h, xzr, lt
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
fmov $ctr0.d[1], $input_h0
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $t0d, $res0.d[1] @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
str $ctr32w, [$counter, #12] @ store the updated counter
st1 { $res1b}, [$output_ptr] @ store all 16B
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L192_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
___
#########################################################################################
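# size_t aes_gcm_dec_192_kernel(const unsigned char *in,
#                               size_t len,
#                               unsigned char *out,
#                               const void *key,
#                               unsigned char ivec[16],
#                               u64 *Xi);
#
# Notes:
#  - len (x1) is consumed as a length in BITS, not bytes: the kernel derives
#    byte_len as len >> 3, and that byte count is the value returned in x0.
#    A zero len branches straight to the exit path without touching the stack.
#  - The 32-bit word at offset 12 of the counter block is rewritten with the
#    updated counter, and the 16-byte hash value at the start of the tag
#    buffer is read on entry and overwritten with the new value on exit.
#    The hash-key powers are loaded from offsets 32, 64, 80 and 112 of that
#    same buffer.
#  - A trailing partial block is handled with a full 16-byte load/merge/store
#    on the output buffer; bytes beyond the message length are written back
#    unchanged.
#  - NOTE(review): the prologue copies x4 and x5 (5th and 6th arguments) into
#    x16 and x8 before reusing them as scratch; which of those is the counter
#    block and which is the key schedule is fixed by the register variables
#    defined near the top of this file. Confirm that the argument order shown
#    above matches those assignments and the C declaration used by callers -
#    the positions of key and Xi look swapped.
#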
$code.=<<___;
.global aes_gcm_dec_192_kernel
.type aes_gcm_dec_192_kernel,%function
.align 4
aes_gcm_dec_192_kernel:
cbz x1, .L192_dec_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
ldr $rk0q, [$cc, #0] @ load rk0
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldr $rk2q, [$cc, #32] @ load rk2
lsr $rctr32x, $ctr96_t32x, #32
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
fmov $ctr3d, $ctr96_b64x @ CTR block 3
rev $rctr32w, $rctr32w @ rev_ctr32
fmov $ctr1d, $ctr96_b64x @ CTR block 1
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
ldr $rk1q, [$cc, #16] @ load rk1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ldr $rk3q, [$cc, #48] @ load rk3
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
ldr $rk8q, [$cc, #128] @ load rk8
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $rk11q, [$cc, #176] @ load rk11
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
ldr $rk9q, [$cc, #144] @ load rk9
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldr $rk7q, [$cc, #112] @ load rk7
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
ldr $rk4q, [$cc, #64] @ load rk4
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
ldr $rk5q, [$cc, #80] @ load rk5
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
ldr $rk6q, [$cc, #96] @ load rk6
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr3b, $rk11 @ AES block 3 - round 11
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr2b, $rk11 @ AES block 2 - round 11
aese $ctr1b, $rk11 @ AES block 1 - round 11
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr0b, $rk11 @ AES block 0 - round 11
b.ge .L192_dec_tail @ handle tail
ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
rev $ctr32w, $rctr32w @ CTR block 4
ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
add $rctr32w, $rctr32w, #1 @ CTR block 4
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
rev64 $res0b, $res0b @ GHASH block 0
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr0d, $ctr96_b64x @ CTR block 4
rev64 $res1b, $res1b @ GHASH block 1
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
fmov $ctr1d, $ctr96_b64x @ CTR block 5
eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
add $rctr32w, $rctr32w, #1 @ CTR block 5
fmov $ctr1.d[1], $ctr32x @ CTR block 5
eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
rev $ctr32w, $rctr32w @ CTR block 6
eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
b.ge .L192_dec_prepretail @ do prepretail
.L192_dec_main_loop: @ main loop start
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
rev64 $res3b, $res3b @ GHASH block 4k+3
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
eor $res0b, $res0b, $acc_lb @ PRE 1
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
rev $ctr32w, $rctr32w @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
rev64 $res2b, $res2b @ GHASH block 4k+2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
movi $mod_constant.8b, #0xc2
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
rev $ctr32w, $rctr32w @ CTR block 4k+8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
rev64 $res1b, $res1b @ GHASH block 4k+5
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
rev64 $res0b, $res0b @ GHASH block 4k+4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
b.lt .L192_dec_main_loop
.L192_dec_prepretail: @ PREPRETAIL
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
eor $res0b, $res0b, $acc_lb @ PRE 1
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
rev64 $res2b, $res2b @ GHASH block 4k+2
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
rev64 $res3b, $res3b @ GHASH block 4k+3
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
movi $mod_constant.8b, #0xc2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
aese $ctr0b, $rk11
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
aese $ctr2b, $rk11
aese $ctr1b, $rk11
aese $ctr3b, $rk11
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
.L192_dec_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
cmp $main_end_input_ptr, #48
eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
b.gt .L192_dec_blocks_more_than_3
movi $acc_l.8b, #0
movi $acc_h.8b, #0
mov $ctr3b, $ctr2b
mov $ctr2b, $ctr1b
sub $rctr32w, $rctr32w, #1
movi $acc_m.8b, #0
cmp $main_end_input_ptr, #32
b.gt .L192_dec_blocks_more_than_2
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
sub $rctr32w, $rctr32w, #1
b.gt .L192_dec_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L192_dec_blocks_less_than_1
.L192_dec_blocks_more_than_3: @ blocks left > 3
rev64 $res0b, $res1b @ GHASH final-3 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
.L192_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L192_dec_blocks_more_than_1: @ blocks left > 1
rev64 $res0b, $res1b @ GHASH final-1 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor $ctr0b, $res1b, $ctr3b @ AES final block - result
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
mov $output_h0, $ctr0.d[1] @ AES final block - mov high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
mov $output_l0, $ctr0.d[0] @ AES final block - mov low
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L192_dec_blocks_less_than_1: @ blocks left <= 1
mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
cmp $bit_length, #64
csel $ctr32x, $rk12_l, $rk12_h, lt
csel $ctr96_b64x, $rk12_h, xzr, lt
fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
and $output_l0, $output_l0, $ctr32x
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
orr $output_l0, $output_l0, $end_input_ptr
mov $ctr0.d[1], $ctr96_b64x
rev $ctr32w, $rctr32w
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
str $ctr32w, [$counter, #12] @ store the updated counter
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
and $output_h0, $output_h0, $ctr96_b64x
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
mov $t0d, $res0.d[1] @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
orr $output_h0, $output_h0, $main_end_input_ptr
stp $output_l0, $output_h0, [$output_ptr]
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L192_dec_ret:
mov w0, #0x0
ret
.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
___
}
{
# Register aliases for the 256-bit encrypt kernel that follows. Each Perl
# scalar holds the textual name of an AArch64 register and is interpolated
# into the assembly heredoc. Several names intentionally map to the SAME
# physical register; names that share a register must never be live at the
# same time.

# General-purpose registers x4-x7: end/main-loop-end input pointers and the
# low/high 64-bit halves of block 0 when it is handled in integer registers.
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# x19-x24: low/high halves of blocks 1-3. The input_* and output_* names below
# are two sets of names for the same six registers.
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
# output_l0/output_h0 alias input_l0/input_h0 (x6/x7).
my ($output_l0,$output_h0)=map("x$_",(6..7));
# Counter bookkeeping. w9 is the 32-bit view of x9 ($ctr32x); w11/w12 are the
# 32-bit views of x11/x12 ($ctr96_t32x/$rctr32x).
my $ctr32w="w9";
# x9=ctr32x, x10=ctr96_b64x, x11=ctr96_t32x, x12=rctr32x,
# x13/x14=rk14_l/rk14_h (last round key, loaded into integer registers and
# XORed into the data there), x15=len.
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15));
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
# Vector registers v0-v3: the four counter/keystream blocks; v4-v7: the four
# result blocks. The three lists below are the .16b, bare and d-register
# spellings of the same v0-v7 registers.
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
# v9-v11: GHASH accumulator split into high / mid / low parts.
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
# v12-v15: hash key values h1..h4; v16/v17: combined h12k/h34k values used
# for the "mid" multiplications.
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
# GHASH temporaries. These reuse a small pool of registers:
#   v8: t0, t2, t6 (and mod_constant below)
#   v4: t1, t3, t4, t9 (same register as res0 / ctr_t0)
#   v5: t5, t7         (same register as res1 / ctr_t1)
#   v6: t8             (same register as res2 / ctr_t2)
my $t0="v8";
my $t0d="d8";
my $t1="v4";
my $t1d="d4";
my $t2="v8";
my $t2d="d8";
my $t3="v4";
my $t3d="d4";
my $t4="v4";
my $t4d="d4";
my $t5="v5";
my $t5d="d5";
my $t6="v8";
my $t6d="d8";
my $t7="v5";
my $t7d="d5";
my $t8="v6";
my $t8d="d6";
my $t9="v4";
my $t9d="d4";
# ctr_t0..ctr_t3 are further names for v4-v7 (the res0..res3 registers); the
# three lists are the bare, d-register and .16b spellings.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
# Reduction constant lives in v8 (shared with t0/t2/t6); the reduction
# temporary mod_t lives in v7 (shared with res3 / ctr_t3).
my $mod_constantd="d8";
my $mod_constant="v8";
my $mod_t="v7";
# v18-v31: AES round keys rk0..rk13, as .16b operands and as q registers for
# loads.
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
# Extra spellings of v20/v21/v22 (the rk2/rk3/rk4 registers). The tail code
# uses these as GHASH scratch registers, overwriting those round keys.
my $rk2q1="v20.1q";
my $rk3q1="v21.1q";
my $rk4v="v22";
my $rk4d="d22";
#########################################################################################
# size_t aes_gcm_enc_256_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# const void *key,
# unsigned char ivec[16],
# u64 *Xi);
#
$code.=<<___;
.global aes_gcm_enc_256_kernel
.type aes_gcm_enc_256_kernel,%function
.align 4
aes_gcm_enc_256_kernel:
cbz x1, .L256_enc_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
ldr $rk0q, [$cc, #0] @ load rk0
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ldr $rk7q, [$cc, #112] @ load rk7
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
lsr $rctr32x, $ctr96_t32x, #32
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
rev $rctr32w, $rctr32w @ rev_ctr32
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
fmov $ctr1d, $ctr96_b64x @ CTR block 1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
rev $ctr32w, $rctr32w @ CTR block 1
fmov $ctr3d, $ctr96_b64x @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
ldr $rk1q, [$cc, #16] @ load rk1
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
ldr $rk2q, [$cc, #32] @ load rk2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $rk3q, [$cc, #48] @ load rk3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ldr $rk6q, [$cc, #96] @ load rk6
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $rk5q, [$cc, #80] @ load rk5
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $rk13q, [$cc, #208] @ load rk13
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ldr $rk4q, [$cc, #64] @ load rk4
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $rk12q, [$cc, #192] @ load rk12
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
ldr $rk11q, [$cc, #176] @ load rk11
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
ldr $rk8q, [$cc, #128] @ load rk8
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
ldr $rk9q, [$cc, #144] @ load rk9
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
aese $ctr2b, $rk13 @ AES block 2 - round 13
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
aese $ctr1b, $rk13 @ AES block 1 - round 13
aese $ctr0b, $rk13 @ AES block 0 - round 13
aese $ctr3b, $rk13 @ AES block 3 - round 13
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
b.ge .L256_enc_tail @ handle tail
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
rev $ctr32w, $rctr32w @ CTR block 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
add $rctr32w, $rctr32w, #1 @ CTR block 6
fmov $ctr2d, $ctr96_b64x @ CTR block 6
fmov $ctr2.d[1], $ctr32x @ CTR block 6
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
rev $ctr32w, $rctr32w @ CTR block 7
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
b.ge .L256_enc_prepretail @ do prepretail - input_ptr >= main_end_input_ptr, no full main loop iteration left (target must be the local .L label)
.L256_enc_main_loop: @ main loop start
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
movi $mod_constant.8b, #0xc2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
rev $ctr32w, $rctr32w @ CTR block 4k+8
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
rev $ctr32w, $rctr32w @ CTR block 4k+10
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
rev $ctr32w, $rctr32w @ CTR block 4k+11
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
b.lt .L256_enc_main_loop @ LOOP CONTROL - repeat while input_ptr < main_end_input_ptr (target must be the local .L label)
.L256_enc_prepretail: @ PREPRETAIL
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $res0b, $res0b, $acc_lb @ PRE 1
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
movi $mod_constant.8b, #0xc2
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
ext $acc_hb, $acc_hb, $acc_hb, #8
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $acc_mb, $acc_mb, $acc_lb
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
eor $acc_mb, $acc_mb, $t1.16b
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
eor $acc_mb, $acc_mb, $acc_hb
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
ext $acc_mb, $acc_mb, $acc_mb, #8
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
eor $acc_lb, $acc_lb, $t1.16b
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
eor $acc_lb, $acc_lb, $acc_mb
.L256_enc_tail: @ TAIL
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
cmp $main_end_input_ptr, #48
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
b.gt .L256_enc_blocks_more_than_3
cmp $main_end_input_ptr, #32
mov $ctr3b, $ctr2b
movi $acc_l.8b, #0
movi $acc_h.8b, #0
sub $rctr32w, $rctr32w, #1
mov $ctr2b, $ctr1b
movi $acc_m.8b, #0
b.gt .L256_enc_blocks_more_than_2
mov $ctr3b, $ctr1b
sub $rctr32w, $rctr32w, #1
cmp $main_end_input_ptr, #16
b.gt .L256_enc_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
rev64 $res0b, $res1b @ GHASH final-3 block
eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
fmov $res1d, $input_l0 @ AES final-2 block - mov low
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
.L256_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
rev64 $res0b, $res1b @ GHASH final-2 block
eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
fmov $res1d, $input_l0 @ AES final-1 block - mov low
eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L256_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
fmov $res1d, $input_l0 @ AES final block - mov low
fmov $res1.d[1], $input_h0 @ AES final block - mov high
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
eor $res1b, $res1b, $ctr3b @ AES final block - result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
.L256_enc_blocks_less_than_1: @ blocks left <= 1
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rk14_l, $rk14_h, lt
csel $input_h0, $rk14_h, xzr, lt
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
fmov $ctr0.d[1], $input_h0
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
mov $t0d, $res0.d[1] @ GHASH final block - mid
rev $ctr32w, $rctr32w
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
str $ctr32w, [$counter, #12] @ store the updated counter
st1 { $res1b}, [$output_ptr] @ store all 16B
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L256_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
___
{
my $t8="v4";
my $t8d="d4";
my $t9="v6";
my $t9d="d6";
#########################################################################################
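# size_t aes_gcm_dec_256_kernel(const unsigned char *in,
#                               size_t len,              /* length in BITS; the body derives byte_len = len >> 3 */
#                               unsigned char *out,
#                               u64 *Xi,                 /* GHASH state; hash key powers are loaded relative to it */
#                               unsigned char ivec[16],  /* x4, copied to x16: counter block */
#                               const void *key);        /* x5, copied to x8: AES-256 round keys rk0..rk14 */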
#
$code.=<<___;
.global aes_gcm_dec_256_kernel
.type aes_gcm_dec_256_kernel,%function
.align 4
aes_gcm_dec_256_kernel:
cbz x1, .L256_dec_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ldr $rk8q, [$cc, #128] @ load rk8
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
ldr $rk7q, [$cc, #112] @ load rk7
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
ldr $rk6q, [$cc, #96] @ load rk6
lsr $rctr32x, $ctr96_t32x, #32
ldr $rk5q, [$cc, #80] @ load rk5
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
ldr $rk3q, [$cc, #48] @ load rk3
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
rev $rctr32w, $rctr32w @ rev_ctr32
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
fmov $ctr3d, $ctr96_b64x @ CTR block 3
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr1d, $ctr96_b64x @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
ldr $rk0q, [$cc, #0] @ load rk0
fmov $ctr3.d[1], $ctr32x @ CTR block 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
ldr $rk4q, [$cc, #64] @ load rk4
ldr $rk13q, [$cc, #208] @ load rk13
ldr $rk1q, [$cc, #16] @ load rk1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $rk2q, [$cc, #32] @ load rk2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ldr $rk9q, [$cc, #144] @ load rk9
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $rk12q, [$cc, #192] @ load rk12
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
ldr $rk11q, [$cc, #176] @ load rk11
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr1b, $rk13 @ AES block 1 - round 13
aese $ctr2b, $rk13 @ AES block 2 - round 13
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr3b, $rk13 @ AES block 3 - round 13
aese $ctr0b, $rk13 @ AES block 0 - round 13
b.ge .L256_dec_tail @ handle tail
ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
rev $ctr32w, $rctr32w @ CTR block 4
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
rev64 $res1b, $res1b @ GHASH block 1
ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
rev64 $res0b, $res0b @ GHASH block 0
add $rctr32w, $rctr32w, #1 @ CTR block 4
fmov $ctr0d, $ctr96_b64x @ CTR block 4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
b.ge .L256_dec_prepretail @ do prepretail
.L256_dec_main_loop: @ main loop start
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
eor $res0b, $res0b, $acc_lb @ PRE 1
rev $ctr32w, $rctr32w @ CTR block 4k+7
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
rev64 $res2b, $res2b @ GHASH block 4k+2
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
rev64 $res3b, $res3b @ GHASH block 4k+3
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
rev $ctr32w, $rctr32w @ CTR block 4k+8
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
movi $mod_constant.8b, #0xc2
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
rev $ctr32w, $rctr32w @ CTR block 4k+9
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
rev64 $res1b, $res1b @ GHASH block 4k+5
eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
rev64 $res0b, $res0b @ GHASH block 4k+4
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
b.lt .L256_dec_main_loop
.L256_dec_prepretail: @ PREPRETAIL
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
eor $res0b, $res0b, $acc_lb @ PRE 1
rev64 $res2b, $res2b @ GHASH block 4k+2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
rev64 $res3b, $res3b @ GHASH block 4k+3
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
movi $mod_constant.8b, #0xc2
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
.L256_dec_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
cmp $main_end_input_ptr, #48
eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
b.gt .L256_dec_blocks_more_than_3
sub $rctr32w, $rctr32w, #1
mov $ctr3b, $ctr2b
movi $acc_m.8b, #0
movi $acc_l.8b, #0
cmp $main_end_input_ptr, #32
movi $acc_h.8b, #0
mov $ctr2b, $ctr1b
b.gt .L256_dec_blocks_more_than_2
sub $rctr32w, $rctr32w, #1
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
b.gt .L256_dec_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L256_dec_blocks_less_than_1
.L256_dec_blocks_more_than_3: @ blocks left > 3
rev64 $res0b, $res1b @ GHASH final-3 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
.L256_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
.L256_dec_blocks_more_than_1: @ blocks left > 1
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
eor $ctr0b, $res1b, $ctr3b @ AES final block - result
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
mov $output_l0, $ctr0.d[0] @ AES final block - mov low
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
mov $output_h0, $ctr0.d[1] @ AES final block - mov high
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
.L256_dec_blocks_less_than_1: @ blocks left <= 1
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
cmp $bit_length, #64
csel $ctr32x, $rk14_l, $rk14_h, lt
csel $ctr96_b64x, $rk14_h, xzr, lt
fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
and $output_l0, $output_l0, $ctr32x
mov $ctr0.d[1], $ctr96_b64x
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
rev $ctr32w, $rctr32w
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
orr $output_l0, $output_l0, $end_input_ptr
and $output_h0, $output_h0, $ctr96_b64x
orr $output_h0, $output_h0, $main_end_input_ptr
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
mov $t0d, $res0.d[1] @ GHASH final block - mid
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
stp $output_l0, $output_h0, [$output_ptr]
str $ctr32w, [$counter, #12] @ store the updated counter
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L256_dec_ret:
mov w0, #0x0
ret
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
___
}
}
# Append the trailer to the generated assembly text: an identification
# string, an alignment directive, and an #endif.
# NOTE(review): the #endif presumably closes a preprocessor guard emitted
# earlier into $code; the matching #if is outside this part of the file.
# The "\@" keeps Perl from interpolating an array inside the heredoc.
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___
if ($flavour =~ /64/) { ######## 64-bit code
# Turn an operand pair written as "qN#lo|hi, qM#lo|hi" into the text of
# an "ins vX.d[i],vY.d[j]" instruction.
#   - register numbers below 8 are kept, numbers 8 and up are shifted by 8
#     (q8 -> v16, q9 -> v17, ...);
#   - "lo" selects lane 0, "hi" selects lane 1.
# Returns the instruction string, or the (false) result of the failed
# pattern match when $arg does not have the expected shape.
sub unvmov {
    my ($operands) = @_;

    my $matched = $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o;
    return $matched unless $matched;

    # Copy the captures before doing anything else that could reset them.
    my ($dst, $dst_half, $src, $src_half) = ($1, $2, $3, $4);

    my $dst_reg  = $dst < 8 ? $dst : $dst + 8;
    my $src_reg  = $src < 8 ? $src : $src + 8;
    my $dst_lane = ($dst_half eq "lo") ? 0 : 1;
    my $src_lane = ($src_half eq "lo") ? 0 : 1;

    return sprintf "ins v%d.d[%d],v%d.d[%d]",
                   $dst_reg, $dst_lane, $src_reg, $src_lane;
}
# Emit the accumulated code line by line. On each line the first "@"
# that is followed by a whitespace character is rewritten (together with
# that whitespace character) to "//".
for my $line (split(/\n/, $code)) {
    $line =~ s/@\s/\/\//o;      # old->new style commentary
    print $line, "\n";
}
} else { ######## 32-bit code
# Rewrite the operands of a "vdup.32 qD,qN[i]" (i in 0..3) so that the
# source is addressed as a d register lane instead: element i of qN
# becomes lane (i & 1) of d(2*N + (i >> 1)).
# Returns the full "vdup.32 ..." instruction text, or the (false) result
# of the failed pattern match when $arg does not have the expected shape.
sub unvdup32 {
    my ($operands) = @_;

    my $matched = $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o;
    return $matched unless $matched;

    my ($dst_q, $src_q, $elem) = ($1, $2, $3);

    my $src_d    = 2 * $src_q + ($elem >> 1);   # which half of the q register
    my $src_lane = $elem & 1;                   # lane inside that d register

    return sprintf "vdup.32 q%d,d%d[%d]", $dst_q, $src_d, $src_lane;
}
# Hand-assemble a pmull/pmull2 on three q registers ("qD,qN,qM") into an
# INST(b0,b1,b2,b3) invocation followed by the original mnemonic and
# operands as a trailing "@" comment.
#   $mnemonic - instruction name; a "2" anywhere in it selects the
#               pmull2 form (extra 0x00010001 bits in the opcode word)
#   $arg      - operand text
# Returns the INST(...) text, or the (false) result of the failed pattern
# match when $arg is not three q registers.
# NOTE(review): INST is presumably a macro supplied elsewhere in the
# generated output; it is not defined in this part of the file.
sub unvpmullp64 {
    my ($mnemonic, $arg) = @_;

    my $matched = $arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o;
    return $matched unless $matched;

    # Save the captures; the mnemonic test below runs another match.
    my ($rd, $rn, $rm) = ($1, $2, $3);

    # Low three bits and bit 3 of each register number go to different
    # positions in the opcode word.
    my $word = 0xf2a00e00
             | (($rd & 7) << 13) | (($rd & 8) << 19)
             | (($rn & 7) << 17) | (($rn & 8) << 4)
             | (($rm & 7) << 1)  | (($rm & 8) << 2);
    if ($mnemonic =~ "2") {
        $word |= 0x00010001;
    }

    # ARMv7 instructions are always encoded little-endian, so the bytes
    # are listed least-significant first.  The correct solution would be
    # the .inst directive, but older assemblers don't implement it.
    my @bytes = map { ($word >> $_) & 0xff } (0, 8, 16, 24);

    return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                   @bytes, $mnemonic, $arg;
}
# Emit the accumulated code line by line, rewriting each line from the
# new-style syntax to the old-style one before printing it.
for my $line (split(/\n/, $code)) {
    # Unconditional rewrites, applied to every line.
    $line =~ s/\b[wx]([0-9]+)\b/r$1/go;             # new->old registers
    $line =~ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;  # new->old registers
    $line =~ s/\/\/\s?/@ /o;                        # new->old style commentary

    # fix up remaining new-style suffixes
    $line =~ s/\],#[0-9]+/]!/o;

    # The rewrites below are tried in order and the chain stops at the
    # first substitution that succeeds, so at most one applies per line.
    $line =~ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o
        or $line =~ s/vdup\.32\s+(.*)/unvdup32($1)/geo
        or $line =~ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo
        or $line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo
        or $line =~ s/^(\s+)b\./$1b/o
        or $line =~ s/^(\s+)ret/$1bx\tlr/o;

    # "mov.<cond>" becomes "mov<cond>", preceded by an "it <cond>" line.
    if ($line =~ s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
        print " it $2\n";
    }

    print $line, "\n";
}
}
# Enforce flush and fail loudly if the output could not be written.
# Per this file's change history STDOUT may be a pipe to the perlasm xlate
# driver; a failed close on a pipe means either a syscall error ($!) or a
# non-zero exit status of the child ($?, in which case $! is 0), so both
# are reported to make the cause of a truncated output visible.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";