openssl/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl

#! /usr/bin/env perl
# Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#========================================================================
# Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
# derived from https://github.com/ARM-software/AArch64cryptolib, original
# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
# licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
# obtain it.
#========================================================================
#
# Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
# Unroll x8 in main loop; the main loop acts on 8 16B blocks per iteration, and then does modulo reduction of the
# accumulated intermediate hashes from the 8 blocks.
#
# ____________________________________________________
# | |
# | PRE |
# |____________________________________________________|
# | | | |
# | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
# |________________|________________|__________________|
# | | | |
# | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
# |________________|________________|__________________|
# | | | |
# | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
# |________________|________________|__________________|
# | | | |
# | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
# |________________|________________|__________________|
# | | | |
# | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
# |________________|________________|__________________|
# | | | |
# | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
# |________________|________________|__________________|
# | | | |
# | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
# |________________|________________|__________________|
# | | | |
# | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
# |________________|____(mostly)____|__________________|
# | |
# | MODULO |
# |____________________________________________________|
#
# PRE:
# Ensure the previously generated intermediate hash is aligned and merged with the result for GHASH 8k+0
# EXT low_acc, low_acc, low_acc, #8
# EOR res_curr (8k+0), res_curr (8k+0), low_acc
#
# CTR block:
# Increment and byte reverse counter in scalar registers and transfer to SIMD registers
# REV ctr32, rev_ctr32
# ORR ctr64, constctr96_top32, ctr32, LSL #32
# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
# INS ctr_next.d[1], ctr64
# ADD rev_ctr32, #1
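#
# As an illustrative C sketch only (not part of this module; names such as
# next_ctr_block are hypothetical): the 16-byte CTR block keeps the 96-bit IV
# portion fixed and only its last 32-bit word, held big-endian in bytes 12..15,
# changes from one block to the next.
#
#     #include <stdint.h>
#
#     static inline void next_ctr_block(unsigned char block[16], uint32_t *ctr32)
#     {
#         uint32_t c = ++*ctr32;                  /* counter kept in host order */
#         block[12] = (unsigned char)(c >> 24);   /* written back big-endian    */
#         block[13] = (unsigned char)(c >> 16);
#         block[14] = (unsigned char)(c >> 8);
#         block[15] = (unsigned char)c;
#     }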
#
# AES block:
# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below as an example.
# We do a small trick here of loading the input in scalar registers, EORing with the last round key and then transferring;
# given we are very constrained in our ASIMD registers this is quite important.
#
# Encrypt:
# LDR input_low, [ input_ptr ], #8
# LDR input_high, [ input_ptr ], #8
# EOR input_low, k14_low
# EOR input_high, k14_high
# INS res_curr.d[0], input_low
# INS res_curr.d[1], input_high
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k13
# EOR res_curr, res_curr, ctr_curr
# ST1 { res_curr.16b }, [ output_ptr ], #16
#
# Decrypt:
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k13
# LDR res_curr, [ input_ptr ], #16
# EOR res_curr, res_curr, ctr_curr
# MOV output_low, res_curr.d[0]
# MOV output_high, res_curr.d[1]
# EOR output_low, k14_low
# EOR output_high, k14_high
# STP output_low, output_high, [ output_ptr ], #16
#
# GHASH block X:
# Do 128b karatsuba polynomial multiplication on block
# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
# multiplication:
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
#
# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
#
# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
# multiplying with "twisted" powers of H
#
# Note: We can PMULL directly into the acc_x in first GHASH of the loop
# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
# path latency dominates the performance
#
# This has a knock-on effect on register pressure, so we have to be a bit more clever with our temporary registers
# than indicated here
# REV64 res_curr, res_curr
# INS t_m.d[0], res_curr.d[1]
# EOR t_m.8B, t_m.8B, res_curr.8B
# PMULL2 t_h, res_curr, HX
# PMULL t_l, res_curr, HX
# PMULL t_m, t_m, HX_k
# EOR acc_h, acc_h, t_h
# EOR acc_l, acc_l, t_l
# EOR acc_m, acc_m, t_m
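#
# The same per-block accumulation as a C sketch (illustration only; the REV64
# bit-reversal "twist" is omitted): clmul_64x64 is a hypothetical 64b->128b
# carry-less multiply standing in for PMULL/PMULL2, u128 a plain pair of
# uint64_t halves, and hk the precomputed Hh^Hl value (HX_k above). The
# Pmull(Ah,Bh) ^ Pmull(Al,Bl) correction of the middle term is deferred to the
# "karatsuba tidy up" EOR3 in MODULO below.
#
#     typedef struct { uint64_t lo, hi; } u128;
#     u128 clmul_64x64(uint64_t a, uint64_t b);        /* hypothetical helper */
#
#     static void ghash_block(u128 *acc_h, u128 *acc_m, u128 *acc_l,
#                             u128 x, u128 h, uint64_t hk)
#     {
#         u128 t_h = clmul_64x64(x.hi, h.hi);          /* PMULL2 - high       */
#         u128 t_l = clmul_64x64(x.lo, h.lo);          /* PMULL  - low        */
#         u128 t_m = clmul_64x64(x.hi ^ x.lo, hk);     /* PMULL  - folded mid */
#         acc_h->lo ^= t_h.lo; acc_h->hi ^= t_h.hi;
#         acc_m->lo ^= t_m.lo; acc_m->hi ^= t_m.hi;
#         acc_l->lo ^= t_l.lo; acc_l->hi ^= t_l.hi;
#     }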
#
# MODULO: take the partial accumulators (~representing the sum of 256b multiplication results) from GHASH and do modulo reduction on them
# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
# with a reversed constant
# EOR3 acc_m, acc_m, acc_l, acc_h // Finish off karatsuba processing
# PMULL t_mod, acc_h, mod_constant
# EXT acc_h, acc_h, acc_h, #8
# EOR3 acc_m, acc_m, t_mod, acc_h
# PMULL acc_h, acc_m, mod_constant
# EXT acc_m, acc_m, acc_m, #8
# EOR3 acc_l, acc_l, acc_m, acc_h
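#
# The same reduction as a C sketch (illustration only), reusing the hypothetical
# clmul_64x64/u128 helpers from the GHASH sketch above; mod_c is the reversed
# reduction constant 0xc200000000000000 that the kernels below keep on the stack,
# and the EXT #8 steps are expressed by crossing the .lo/.hi halves directly.
#
#     static u128 ghash_reduce(u128 acc_h, u128 acc_m, u128 acc_l, uint64_t mod_c)
#     {
#         u128 t;
#         acc_m.lo ^= acc_l.lo ^ acc_h.lo;         /* EOR3 - finish off karatsuba */
#         acc_m.hi ^= acc_l.hi ^ acc_h.hi;
#         t = clmul_64x64(acc_h.lo, mod_c);        /* PMULL - top aligns with mid */
#         acc_m.lo ^= t.lo ^ acc_h.hi;             /* EOR3 with the EXT'd acc_h   */
#         acc_m.hi ^= t.hi ^ acc_h.lo;
#         t = clmul_64x64(acc_m.lo, mod_c);        /* PMULL - mid aligns with low */
#         acc_l.lo ^= t.lo ^ acc_m.hi;             /* EOR3 with the EXT'd acc_m   */
#         acc_l.hi ^= t.hi ^ acc_m.lo;
#         return acc_l;                            /* reduced 128b GHASH value    */
#     }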
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";
die "only for 64 bit" if $flavour !~ /64/;
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=8
___
$code.=".arch armv8.2-a+crypto\n.text\n";
$input_ptr="x0"; #argument block
$bit_length="x1";
$output_ptr="x2";
$current_tag="x3";
$counter="x16";
$constant_temp="x15";
$modulo_constant="x10";
$cc="x8";
{
my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
my ($temp2_x,$temp3_x)=map("x$_",(13..14));
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
my $t0="v16";
my $t0d="d16";
my $t1="v29";
my $t2=$res1;
my $t3=$t1;
my $t4=$res0;
my $t5=$res2;
my $t6=$t0;
my $t7=$res3;
my $t8=$res4;
my $t9=$res5;
my $t10=$res6;
my $t11="v21";
my $t12=$t1;
my $rtmp_ctr="v30";
my $rtmp_ctrq="q30";
my $rctr_inc="v31";
my $rctr_incd="d31";
my $mod_constantd=$t0d;
my $mod_constant=$t0;
my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
my $rk2q1="v28.1q";
my $rk3q1="v26.1q";
my $rk4v="v27";
#########################################################################################
# size_t unroll8_eor3_aes_gcm_enc_128_kernel(const unsigned char *in,
#                                            size_t len,
#                                            unsigned char *out,
#                                            u64 *Xi,
#                                            unsigned char ivec[16],
#                                            const void *key);
#
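# A calling sketch only (buffer and schedule names are hypothetical, not from this
# file): judging by the code below, the length argument is taken as a bit count
# (x1 is shifted right by 3 to get bytes, and the number of bytes processed is
# returned), Xi points at the current tag with the precomputed powers of H laid
# out behind it, ivec holds the counter block and key the expanded AES-128 round
# keys.
#
#     size_t done = unroll8_eor3_aes_gcm_enc_128_kernel(plaintext, n_bytes * 8,
#                                                       ciphertext, gcm_Xi,
#                                                       counter_block,
#                                                       aes128_round_keys);
#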
$code.=<<___;
.global unroll8_eor3_aes_gcm_enc_128_kernel
.type unroll8_eor3_aes_gcm_enc_128_kernel,%function
.align 4
unroll8_eor3_aes_gcm_enc_128_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L128_enc_ret
stp d8, d9, [sp, #-80]!
mov $counter, x4
mov $cc, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64]
add $modulo_constant, sp, #64
mov $constant_temp, #0x100000000 @ set up counter increment
movi $rctr_inc.16b, #0x0
mov $rctr_inc.d[1], $constant_temp
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
ld1 { $ctr0b}, [$counter] @ CTR block 0
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
b.ge .L128_enc_tail @ handle tail
ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 0 - result
rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 1 - result
stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 2 - result
eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result
eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 3 - result
eor3 $res7b, $ctr_t7b, $ctr7b,$rk10 @ AES block 7 - result
stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
b.ge .L128_enc_prepretail @ do prepretail
.L128_enc_main_loop: @ main loop start
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
rev64 $res1b, $res1b @ GHASH block 8k+1
rev64 $res0b, $res0b @ GHASH block 8k
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
rev64 $res3b, $res3b @ GHASH block 8k+3
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
eor $res0b, $res0b, $acc_lb @ PRE 1
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
rev64 $res2b, $res2b @ GHASH block 8k+2
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
eor3 $acc_hb, $acc_hb, $t1.16b,$t2.16b @ GHASH block 8k+2, 8k+3 - high
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
ldr $rk10q, [$cc, #160] @ load rk10
ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 8k+12 - result
aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 8k+10 - result
mov $ctr2.16b, $h3.16b @ CTR block 8k+18
aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
eor3 $res7b, $ctr_t7b, $ctr7b, $rk10 @ AES block 8k+15 - result
aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 8k+9 - result
eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 8k+11 - result
mov $ctr3.16b, $h4.16b @ CTR block 8k+19
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 8k+13 - result
mov $ctr1.16b, $h2.16b @ CTR block 8k+17
eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 8k+8 - result
mov $ctr0.16b, $h1.16b @ CTR block 8k+16
stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 8k+14 - result
stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
b.lt .L128_enc_main_loop
.L128_enc_prepretail: @ PREPRETAIL
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
rev64 $res0b, $res0b @ GHASH block 8k
rev64 $res1b, $res1b @ GHASH block 8k+1
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
rev64 $res3b, $res3b @ GHASH block 8k+3
rev64 $res2b, $res2b @ GHASH block 8k+2
eor $res0b, $res0b, $acc_lb @ PRE 1
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
eor3 $acc_lb, $acc_lb, $acc_hb, $acc_mb @ MODULO - fold into low
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
.L128_enc_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
mov $t1.16b, $rk10
ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
ext $h5.16b, $h5.16b, $h5.16b, #8
eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
ext $h6.16b, $h6.16b, $h6.16b, #8
ext $h7.16b, $h7.16b, $h7.16b, #8
ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
cmp $main_end_input_ptr, #112
b.gt .L128_enc_blocks_more_than_7
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
movi $acc_h.8b, #0
cmp $main_end_input_ptr, #96
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
mov $ctr3b, $ctr2b
mov $ctr2b, $ctr1b
movi $acc_l.8b, #0
movi $acc_m.8b, #0
b.gt .L128_enc_blocks_more_than_6
mov $ctr7b, $ctr6b
cmp $main_end_input_ptr, #80
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
mov $ctr3b, $ctr1b
b.gt .L128_enc_blocks_more_than_5
cmp $main_end_input_ptr, #64
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr1b
b.gt .L128_enc_blocks_more_than_4
mov $ctr7b, $ctr6b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr1b
cmp $main_end_input_ptr, #48
b.gt .L128_enc_blocks_more_than_3
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr1b
cmp $main_end_input_ptr, #32
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
b.gt .L128_enc_blocks_more_than_2
cmp $main_end_input_ptr, #16
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr1b
b.gt .L128_enc_blocks_more_than_1
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
b .L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_7: @ blocks left > 7
st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
rev64 $res0b, $res1b @ GHASH final-7 block
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
.L128_enc_blocks_more_than_6: @ blocks left > 6
st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
rev64 $res0b, $res1b @ GHASH final-6 block
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
.L128_enc_blocks_more_than_5: @ blocks left > 5
st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
rev64 $res0b, $res1b @ GHASH final-5 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
.L128_enc_blocks_more_than_4: @ blocks left > 4
st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
rev64 $res0b, $res1b @ GHASH final-4 block
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
.L128_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
rev64 $res0b, $res1b @ GHASH final-3 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
.L128_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
rev64 $res0b, $res1b @ GHASH final-2 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
.L128_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
rev64 $res0b, $res1b @ GHASH final-1 block
ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
.L128_enc_blocks_less_than_1: @ blocks left <= 1
rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
str $rtmp_ctrq, [$counter] @ store the updated counter
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
cmp $bit_length, #64
csel $temp2_x, $temp1_x, $temp0_x, lt
csel $temp3_x, $temp0_x, xzr, lt
mov $ctr0.d[1], $temp3_x
mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
st1 { $res1b}, [$output_ptr] @ store all 16B
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
st1 { $acc_l.16b }, [$current_tag]
lsr x0, $bit_length, #3 @ return sizes
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
.L128_enc_ret:
mov w0, #0x0
ret
.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
___
#########################################################################################
# size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in,
#                                            size_t len,
#                                            unsigned char *out,
#                                            u64 *Xi,
#                                            unsigned char ivec[16],
#                                            const void *key);
#
$code.=<<___;
.global unroll8_eor3_aes_gcm_dec_128_kernel
.type unroll8_eor3_aes_gcm_dec_128_kernel,%function
.align 4
unroll8_eor3_aes_gcm_dec_128_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L128_dec_ret
stp d8, d9, [sp, #-80]!
mov $counter, x4
mov $cc, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64]
add $modulo_constant, sp, #64
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
ld1 { $ctr0b}, [$counter] @ CTR block 0
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
mov $constant_temp, #0x100000000 @ set up counter increment
movi $rctr_inc.16b, #0x0
mov $rctr_inc.d[1], $constant_temp
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
aese $ctr0b, $rk9 @ AES block 0 - round 9
aese $ctr1b, $rk9 @ AES block 1 - round 9
aese $ctr6b, $rk9 @ AES block 6 - round 9
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr4b, $rk9 @ AES block 4 - round 9
aese $ctr3b, $rk9 @ AES block 3 - round 9
aese $ctr2b, $rk9 @ AES block 2 - round 9
aese $ctr5b, $rk9 @ AES block 5 - round 9
aese $ctr7b, $rk9 @ AES block 7 - round 9
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
b.ge .L128_dec_tail @ handle tail
ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 0 - result
eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 1 - result
stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 3 - result
eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 2 - result
stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 6 - result
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 4 - result
eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 5 - result
stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 7 - result
stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
b.ge .L128_dec_prepretail @ do prepretail
.L128_dec_main_loop: @ main loop start
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
rev64 $res1b, $res1b @ GHASH block 8k+1
rev64 $res0b, $res0b @ GHASH block 8k
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev64 $res6b, $res6b @ GHASH block 8k+6
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
eor $res0b, $res0b, $acc_lb @ PRE 1
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
rev64 $res2b, $res2b @ GHASH block 8k+2
rev64 $res4b, $res4b @ GHASH block 8k+4
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
rev64 $res3b, $res3b @ GHASH block 8k+3
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
rev64 $res5b, $res5b @ GHASH block 8k+5
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
rev64 $res7b, $res7b @ GHASH block 8k+7
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 8k+9 - result
eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 8k+8 - result
eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 8k+15 - result
eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 8k+14 - result
eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 8k+10 - result
stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
mov $ctr1.16b, $h2.16b @ CTR block 8k+17
eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 8k+12 - result
eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
mov $ctr0.16b, $h1.16b @ CTR block 8k+16
eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 8k+11 - result
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 8k+13 - result
mov $ctr2.16b, $h3.16b @ CTR block 8k+18
stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
mov $ctr3.16b, $h4.16b @ CTR block 8k+19
b.lt .L128_dec_main_loop
.L128_dec_prepretail: @ PREPRETAIL
rev64 $res3b, $res3b @ GHASH block 8k+3
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev64 $res0b, $res0b @ GHASH block 8k
rev64 $res2b, $res2b @ GHASH block 8k+2
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
eor $res0b, $res0b, $acc_lb @ PRE 1
rev64 $res1b, $res1b @ GHASH block 8k+1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
rev64 $res5b, $res5b @ GHASH block 8k+5
rev64 $res4b, $res4b @ GHASH block 8k+4
rev64 $res6b, $res6b @ GHASH block 8k+6
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
rev64 $res7b, $res7b @ GHASH block 8k+7
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
.L128_dec_tail: @ TAIL
mov $t1.16b, $rk10
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
cmp $main_end_input_ptr, #112
ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k
ext $h8.16b, $h8.16b, $h8.16b, #8
ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
ext $h7.16b, $h7.16b, $h7.16b, #8
eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
b.gt .L128_dec_blocks_more_than_7
cmp $main_end_input_ptr, #96
mov $ctr7b, $ctr6b
movi $acc_l.8b, #0
movi $acc_h.8b, #0
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
mov $ctr3b, $ctr2b
mov $ctr2b, $ctr1b
movi $acc_m.8b, #0
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
b.gt .L128_dec_blocks_more_than_6
cmp $main_end_input_ptr, #80
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
mov $ctr3b, $ctr1b
b.gt .L128_dec_blocks_more_than_5
cmp $main_end_input_ptr, #64
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr1b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
b.gt .L128_dec_blocks_more_than_4
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr1b
cmp $main_end_input_ptr, #48
b.gt .L128_dec_blocks_more_than_3
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr6b
cmp $main_end_input_ptr, #32
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
mov $ctr6b, $ctr1b
b.gt .L128_dec_blocks_more_than_2
cmp $main_end_input_ptr, #16
mov $ctr7b, $ctr1b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
b.gt .L128_dec_blocks_more_than_1
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
b .L128_dec_blocks_less_than_1
.L128_dec_blocks_more_than_7: @ blocks left > 7
rev64 $res0b, $res1b @ GHASH final-7 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
.L128_dec_blocks_more_than_6: @ blocks left > 6
rev64 $res0b, $res1b @ GHASH final-6 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
.L128_dec_blocks_more_than_5: @ blocks left > 5
rev64 $res0b, $res1b @ GHASH final-5 block
ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
.L128_dec_blocks_more_than_4: @ blocks left > 4
rev64 $res0b, $res1b @ GHASH final-4 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
.L128_dec_blocks_more_than_3: @ blocks left > 3
st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
rev64 $res0b, $res1b @ GHASH final-3 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
movi $t0.8b, #0 @ suppress further partial tag feed in
eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
.L128_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
movi $t0.8b, #0 @ suppress further partial tag feed in
ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
.L128_dec_blocks_more_than_1: @ blocks left > 1
st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L128_dec_blocks_less_than_1: @ blocks left <= 1
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
cmp $bit_length, #64
mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
csel $temp2_x, $temp1_x, $temp0_x, lt
csel $temp3_x, $temp0_x, xzr, lt
mov $ctr0.d[1], $temp3_x
mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
st1 { $res4b}, [$output_ptr] @ store all 16B
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
st1 { $acc_l.16b }, [$current_tag]
rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
str $rtmp_ctrq, [$counter] @ store the updated counter
lsr x0, $bit_length, #3
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
.L128_dec_ret:
mov w0, #0x0
ret
.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
___
}
{
my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
my ($temp2_x,$temp3_x)=map("x$_",(13..14));
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
my $t0="v16";
my $t0d="d16";
my $t1="v29";
my $t2=$res1;
my $t3=$t1;
my $t4=$res0;
my $t5=$res2;
my $t6=$t0;
my $t7=$res3;
my $t8=$res4;
my $t9=$res5;
my $t10=$res6;
my $t11="v21";
my $t12=$t1;
my $rtmp_ctr="v30";
my $rtmp_ctrq="q30";
my $rctr_inc="v31";
my $rctr_incd="d31";
my $mod_constantd=$t0d;
my $mod_constant=$t0;
my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
my $rk2q1="v28.1q";
my $rk3q1="v26.1q";
my $rk4v="v27";
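# Note: every round-key alias above (rk0..rk14 and their q-register forms)
# maps onto the same three registers v26-v28/q26-q28; the kernels below stream
# the key schedule from [$cc] in pairs with ldp/ldr as the rounds are issued,
# so key material only ever occupies three SIMD registers at a time.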
#########################################################################################
# size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# u64 *Xi,
# unsigned char ivec[16],
# const void *key);
#
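# AES-192 uses 12 rounds, so this kernel works through thirteen round keys
# (rk0..rk12); rk12 is the final round key and is folded into the output with
# EOR3 rather than applied via aese.
#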
$code.=<<___;
.global unroll8_eor3_aes_gcm_enc_192_kernel
.type unroll8_eor3_aes_gcm_enc_192_kernel,%function
.align 4
unroll8_eor3_aes_gcm_enc_192_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L192_enc_ret
stp d8, d9, [sp, #-80]!
mov $counter, x4
mov $cc, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64]
add $modulo_constant, sp, #64
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
ld1 { $ctr0b}, [$counter] @ CTR block 0
mov $constant_temp, #0x100000000 @ set up counter increment
movi $rctr_inc.16b, #0x0
mov $rctr_inc.d[1], $constant_temp
rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
aese $ctr6b, $rk11 @ AES block 6 - round 11
aese $ctr3b, $rk11 @ AES block 3 - round 11
aese $ctr4b, $rk11 @ AES block 4 - round 11
aese $ctr7b, $rk11 @ AES block 7 - round 11
ldr $rk12q, [$cc, #192] @ load rk12
aese $ctr1b, $rk11 @ AES block 1 - round 11
aese $ctr5b, $rk11 @ AES block 5 - round 11
aese $ctr2b, $rk11 @ AES block 2 - round 11
aese $ctr0b, $rk11 @ AES block 0 - round 11
b.ge .L192_enc_tail @ handle tail
ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 0 - result
rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 3 - result
eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 1 - result
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result
eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result
eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result
stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 2 - result
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result
stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
b.ge .L192_enc_prepretail @ do prepretail
.L192_enc_main_loop: @ main loop start
rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
rev64 $res2b, $res2b @ GHASH block 8k+2
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev64 $res0b, $res0b @ GHASH block 8k
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
rev64 $res1b, $res1b @ GHASH block 8k+1
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
eor $res0b, $res0b, $acc_lb @ PRE 1
rev64 $res3b, $res3b @ GHASH block 8k+3
rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
ldr $rk12q, [$cc, #192] @ load rk12
ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
eor3 $res4b, $ctr_t4b, $ctr4b, $rk12                     @ AES block 8k+12 - result
aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
eor3 $res7b, $ctr_t7b, $ctr7b, $rk12                     @ AES block 8k+15 - result
eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 8k+10 - result
eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 8k+8 - result
mov $ctr2.16b, $h3.16b @ CTR block 8k+18
eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 8k+9 - result
mov $ctr1.16b, $h2.16b @ CTR block 8k+17
stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor3 $res6b, $ctr_t6b, $ctr6b, $rk12                     @ AES block 8k+14 - result
mov $ctr0.16b, $h1.16b @ CTR block 8k+16
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
eor3 $res5b, $ctr_t5b, $ctr5b, $rk12                     @ AES block 8k+13 - result
eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 8k+11 - result
mov $ctr3.16b, $h4.16b @ CTR block 8k+19
stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
b.lt .L192_enc_main_loop
.L192_enc_prepretail: @ PREPRETAIL
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
rev64 $res0b, $res0b @ GHASH block 8k
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
rev64 $res3b, $res3b @ GHASH block 8k+3
rev64 $res2b, $res2b @ GHASH block 8k+2
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
eor $res0b, $res0b, $acc_lb @ PRE 1
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
rev64 $res1b, $res1b @ GHASH block 8k+1
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ldr $rk12q, [$cc, #192] @ load rk12
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
.L192_enc_tail: @ TAIL
ldp $h5q, $h56kq, [$current_tag, #128]                   @ load h5l | h5h and h6k | h5k
ext $h5.16b, $h5.16b, $h5.16b, #8
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldr $ctr_t0q, [$input_ptr], #16                          @ AES block 8k+8 - load plaintext
ldp $h78kq, $h8q, [$current_tag, #192]                   @ load h8k | h7k and h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
mov $t1.16b, $rk12
ldp $h6q, $h7q, [$current_tag, #160]                     @ load h6l | h6h and h7l | h7h
ext $h6.16b, $h6.16b, $h6.16b, #8
ext $h7.16b, $h7.16b, $h7.16b, #8
cmp $main_end_input_ptr, #112
eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
b.gt .L192_enc_blocks_more_than_7
cmp $main_end_input_ptr, #96
mov $ctr7b, $ctr6b
movi $acc_h.8b, #0
mov $ctr6b, $ctr5b
movi $acc_l.8b, #0
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
mov $ctr3b, $ctr2b
mov $ctr2b, $ctr1b
movi $acc_m.8b, #0
b.gt .L192_enc_blocks_more_than_6
mov $ctr7b, $ctr6b
cmp $main_end_input_ptr, #80
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
mov $ctr3b, $ctr1b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
b.gt .L192_enc_blocks_more_than_5
cmp $main_end_input_ptr, #64
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr1b
b.gt .L192_enc_blocks_more_than_4
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr1b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
cmp $main_end_input_ptr, #48
b.gt .L192_enc_blocks_more_than_3
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr1b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
cmp $main_end_input_ptr, #32
b.gt .L192_enc_blocks_more_than_2
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
cmp $main_end_input_ptr, #16
mov $ctr7b, $ctr1b
b.gt .L192_enc_blocks_more_than_1
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
b .L192_enc_blocks_less_than_1
.L192_enc_blocks_more_than_7: @ blocks left > 7
st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
rev64 $res0b, $res1b @ GHASH final-7 block
ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
movi $t0.8b, #0                                          @ suppress further partial tag feed in
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
.L192_enc_blocks_more_than_6: @ blocks left > 6
st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
rev64 $res0b, $res1b @ GHASH final-6 block
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
movi $t0.8b, #0                                          @ suppress further partial tag feed in
pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
.L192_enc_blocks_more_than_5: @ blocks left > 5
st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
rev64 $res0b, $res1b @ GHASH final-5 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
movi $t0.8b, #0                                          @ suppress further partial tag feed in
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
.L192_enc_blocks_more_than_4: @ blocks left > 4
st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
rev64 $res0b, $res1b @ GHASH final-4 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
movi $t0.8b, #0                                          @ suppress further partial tag feed in
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
.L192_enc_blocks_more_than_3: @ blocks left > 3
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
rev64 $res0b, $res1b @ GHASH final-3 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0                                          @ suppress further partial tag feed in
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
.L192_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
rev64 $res0b, $res1b @ GHASH final-2 block
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
movi $t0.8b, #0                                          @ suppress further partial tag feed in
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
.L192_enc_blocks_more_than_1: @ blocks left > 1
ldr $h2q, [$current_tag, #64]                            @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
movi $t0.8b, #0                                          @ suppress further partial tag feed in
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
.L192_enc_blocks_less_than_1: @ blocks left <= 1
mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
cmp $bit_length, #64
mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
csel $temp2_x, $temp1_x, $temp0_x, lt
csel $temp3_x, $temp0_x, xzr, lt
mov $ctr0.d[1], $temp3_x
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
st1 { $res1b}, [$output_ptr] @ store all 16B
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
str $rtmp_ctrq, [$counter] @ store the updated counter
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
st1 { $acc_l.16b }, [$current_tag]
lsr x0, $bit_length, #3 @ return sizes
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
.L192_enc_ret:
mov w0, #0x0
ret
.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
___
#########################################################################################
# size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# const void *key,
# unsigned char ivec[16],
# u64 *Xi);
#
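# A minimal, hypothetical call sketch (an assumption for readability only;
# these kernels are presumably reached through OpenSSL's higher-level GCM
# code rather than called directly, and the exact layouts behind key, ivec
# and Xi are established elsewhere in this module):
#
#     /* C-like pseudocode, not a declaration taken from this file */
#     size_t done = unroll8_eor3_aes_gcm_dec_192_kernel(in, len, out,
#                                                       key, ivec, Xi);
#
# As the assembly below shows, a zero len branches straight to .L192_dec_ret
# (cbz x1) and len is interpreted as a bit count (it is shifted right by 3 to
# obtain byte_len); the matching encrypt kernel above returns the processed
# byte count (len >> 3) in x0.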
$code.=<<___;
.global unroll8_eor3_aes_gcm_dec_192_kernel
.type unroll8_eor3_aes_gcm_dec_192_kernel,%function
.align 4
unroll8_eor3_aes_gcm_dec_192_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L192_dec_ret
stp d8, d9, [sp, #-80]!
mov $counter, x4
mov $cc, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64]
add $modulo_constant, sp, #64
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
ld1 { $ctr0b}, [$counter] @ CTR block 0
ld1 { $acc_lb}, [$current_tag]
mov $constant_temp, #0x100000000 @ set up counter increment
movi $rctr_inc.16b, #0x0
mov $rctr_inc.d[1], $constant_temp
rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
ldr $rk12q, [$cc, #192] @ load rk12
aese $ctr0b, $rk11 @ AES block 0 - round 11
aese $ctr1b, $rk11 @ AES block 1 - round 11
aese $ctr4b, $rk11 @ AES block 4 - round 11
aese $ctr6b, $rk11 @ AES block 6 - round 11
aese $ctr5b, $rk11 @ AES block 5 - round 11
aese $ctr7b, $rk11 @ AES block 7 - round 11
aese $ctr2b, $rk11 @ AES block 2 - round 11
aese $ctr3b, $rk11 @ AES block 3 - round 11
b.ge .L192_dec_tail @ handle tail
ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 1 - result
eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 0 - result
stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 3 - result
eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 2 - result
stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 4 - result
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 5 - result
stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 6 - result
eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 7 - result
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
b.ge .L192_dec_prepretail @ do prepretail
.L192_dec_main_loop: @ main loop start
rev64 $res1b, $res1b @ GHASH block 8k+1
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev64 $res0b, $res0b @ GHASH block 8k
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
rev64 $res4b, $res4b @ GHASH block 8k+4
rev64 $res3b, $res3b @ GHASH block 8k+3
eor $res0b, $res0b, $acc_lb @ PRE 1
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
rev64 $res5b, $res5b @ GHASH block 8k+5
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
rev64 $res2b, $res2b @ GHASH block 8k+2
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
rev64 $res7b, $res7b @ GHASH block 8k+7
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
rev64 $res6b, $res6b @ GHASH block 8k+6
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
ldr $rk12q, [$cc, #192] @ load rk12
ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 8k+8 - result
rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 8k+9 - result
stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 8k+11 - result
eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 8k+10 - result
eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 8k+15 - result
stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 8k+13 - result
eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
mov $ctr3.16b, $h4.16b @ CTR block 8k+19
eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 8k+12 - result
stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 8k+14 - result
stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
mov $ctr0.16b, $h1.16b @ CTR block 8k+16
mov $ctr1.16b, $h2.16b @ CTR block 8k+17
mov $ctr2.16b, $h3.16b @ CTR block 8k+18
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
b.lt .L192_dec_main_loop
.L192_dec_prepretail: @ PREPRETAIL
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
rev64 $res0b, $res0b @ GHASH block 8k
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev64 $res3b, $res3b @ GHASH block 8k+3
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
eor $res0b, $res0b, $acc_lb @ PRE 1
rev64 $res2b, $res2b @ GHASH block 8k+2
rev64 $res1b, $res1b @ GHASH block 8k+1
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
rev64 $res5b, $res5b @ GHASH block 8k+5
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
rev64 $res7b, $res7b @ GHASH block 8k+7
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
rev64 $res4b, $res4b @ GHASH block 8k+4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
rev64 $res6b, $res6b @ GHASH block 8k+6
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ldr $rk12q, [$cc, #192] @ load rk12
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
.L192_dec_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
mov $t1.16b, $rk12
ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
ext $h6.16b, $h6.16b, $h6.16b, #8
ext $h7.16b, $h7.16b, $h7.16b, #8
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
cmp $main_end_input_ptr, #112
b.gt .L192_dec_blocks_more_than_7
mov $ctr7b, $ctr6b
movi $acc_h.8b, #0
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
cmp $main_end_input_ptr, #96
movi $acc_l.8b, #0
mov $ctr3b, $ctr2b
mov $ctr2b, $ctr1b
movi $acc_m.8b, #0
b.gt .L192_dec_blocks_more_than_6
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
mov $ctr3b, $ctr1b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
cmp $main_end_input_ptr, #80
b.gt .L192_dec_blocks_more_than_5
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr1b
cmp $main_end_input_ptr, #64
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
b.gt .L192_dec_blocks_more_than_4
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr1b
cmp $main_end_input_ptr, #48
b.gt .L192_dec_blocks_more_than_3
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr6b
cmp $main_end_input_ptr, #32
mov $ctr6b, $ctr1b
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
b.gt .L192_dec_blocks_more_than_2
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr1b
cmp $main_end_input_ptr, #16
b.gt .L192_dec_blocks_more_than_1
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
b .L192_dec_blocks_less_than_1
.L192_dec_blocks_more_than_7: @ blocks left > 7
rev64 $res0b, $res1b @ GHASH final-7 block
ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
eor $res0b, $res0b, $t0.16b @ feed in partial tag
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
.L192_dec_blocks_more_than_6: @ blocks left > 6
rev64 $res0b, $res1b @ GHASH final-6 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
.L192_dec_blocks_more_than_5: @ blocks left > 5
rev64 $res0b, $res1b @ GHASH final-5 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
movi $t0.8b, #0 @ suppress further partial tag feed in
st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
.L192_dec_blocks_more_than_4: @ blocks left > 4
rev64 $res0b, $res1b @ GHASH final-4 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
.L192_dec_blocks_more_than_3: @ blocks left > 3
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
rev64 $res0b, $res1b @ GHASH final-3 block
ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
.L192_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
.L192_dec_blocks_more_than_1: @ blocks left > 1
rev64 $res0b, $res1b @ GHASH final-1 block
ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
.L192_dec_blocks_less_than_1: @ blocks left <= 1
rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
str $rtmp_ctrq, [$counter] @ store the updated counter
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
cmp $bit_length, #64
csel $temp2_x, $temp1_x, $temp0_x, lt
csel $temp3_x, $temp0_x, xzr, lt
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
mov $ctr0.d[1], $temp3_x
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
rev64 $res0b, $res1b @ GHASH final block
st1 { $res4b}, [$output_ptr] @ store all 16B
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
st1 { $acc_l.16b }, [$current_tag]
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
.L192_dec_ret:
mov w0, #0x0
ret
.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
___
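# Worked example for the tail masking in .L192_dec_blocks_less_than_1 above:
# for a 12-byte (96-bit) final block, bit_length % 128 = 96, so the mask shift
# becomes 128 - 96 = 32; since 32 < 64 the low 64-bit mask is all-ones and the
# high 64-bit mask is 0xffffffffffffffff >> 32 = 0x00000000ffffffff, keeping
# bytes 0-11 of the block and zeroing bytes 12-15, after which the bif merges
# the untouched output bytes back in for the full 16-byte store.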
}
{
my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
my ($temp2_x,$temp3_x)=map("x$_",(13..14));
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
my $t0="v16";
my $t0d="d16";
my $t1="v29";
my $t2=$res1;
my $t3=$t1;
my $t4=$res0;
my $t5=$res2;
my $t6=$t0;
my $t7=$res3;
my $t8=$res4;
my $t9=$res5;
my $t10=$res6;
my $t11="v21";
my $t12=$t1;
my $rtmp_ctr="v30";
my $rtmp_ctrq="q30";
my $rctr_inc="v31";
my $rctr_incd="d31";
my $mod_constantd=$t0d;
my $mod_constant=$t0;
my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
my $rk2q1="v28.1q";
my $rk3q1="v26.1q";
my $rk4v="v27";
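# Note: rk0..rk14 (and their q-register forms) are aliases cycling through
# v26-v28, and the hash powers h1..h8 / h12k..h78k all share v20-v25, so
# neither the round keys nor the GHASH constants stay resident - they are
# reloaded from the key schedule ($cc) and the Xi/Htable area ($current_tag)
# as each group of AES rounds and GHASH blocks needs them.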
#########################################################################################
# size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in,
#                                            size_t len,
#                                            unsigned char *out,
#                                            const void *key,
#                                            unsigned char ivec[16],
#                                            u64 *Xi);
#
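# A minimal calling sketch (illustrative only: argument order follows the
# prototype above, the variable names are hypothetical, and the key schedule,
# counter block and Xi/Htable area must already be laid out as the OpenSSL
# AArch64 GCM glue code prepares them):
#
#     /* len is a bit count: the kernel returns immediately when it is zero
#      * and otherwise processes len >> 3 bytes, eight blocks per iteration */
#     unroll8_eor3_aes_gcm_enc_256_kernel(plaintext, (size_t)num_bytes * 8,
#                                         ciphertext, key_schedule, ivec, Xi);
#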
$code.=<<___;
.global unroll8_eor3_aes_gcm_enc_256_kernel
.type unroll8_eor3_aes_gcm_enc_256_kernel,%function
.align 4
unroll8_eor3_aes_gcm_enc_256_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L256_enc_ret
stp d8, d9, [sp, #-80]!
mov $counter, x4
mov $cc, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64]
add $modulo_constant, sp, #64
ld1 { $ctr0b}, [$counter] @ CTR block 0
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $constant_temp, #0x100000000 @ set up counter increment
movi $rctr_inc.16b, #0x0
mov $rctr_inc.d[1], $constant_temp
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
ldr $rk14q, [$cc, #224] @ load rk14
aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
aese $ctr2b, $rk13 @ AES block 2 - round 13
aese $ctr1b, $rk13 @ AES block 1 - round 13
aese $ctr4b, $rk13 @ AES block 4 - round 13
aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
aese $ctr0b, $rk13 @ AES block 0 - round 13
aese $ctr5b, $rk13 @ AES block 5 - round 13
aese $ctr6b, $rk13 @ AES block 6 - round 13
aese $ctr7b, $rk13 @ AES block 7 - round 13
aese $ctr3b, $rk13 @ AES block 3 - round 13
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
b.ge .L256_enc_tail @ handle tail
ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 0 - result
rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 1 - result
eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 3 - result
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 2 - result
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result
eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result
eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result
eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result
stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
b.ge .L256_enc_prepretail @ do prepretail
.L256_enc_main_loop: @ main loop start
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
rev64 $res3b, $res3b @ GHASH block 8k+3
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
rev64 $res1b, $res1b @ GHASH block 8k+1
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
rev64 $res0b, $res0b @ GHASH block 8k
rev64 $res4b, $res4b @ GHASH block 8k+4
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
rev64 $res6b, $res6b @ GHASH block 8k+6
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
rev64 $res2b, $res2b @ GHASH block 8k+2
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
rev64 $res5b, $res5b @ GHASH block 8k+5
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
rev64 $res7b, $res7b @ GHASH block 8k+7
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
ldr $rk14q, [$cc, #224] @ load rk14
aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 8k+10 - result
rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 8k+13 - result
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 8k+12 - result
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 8k+11 - result
mov $ctr3.16b, $h4.16b @ CTR block 8k+19
eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 8k+9 - result
eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 8k+8 - result
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
mov $ctr2.16b, $h3.16b @ CTR block 8k+18
eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 8k+15 - result
eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 8k+14 - result
mov $ctr1.16b, $h2.16b @ CTR block 8k+17
stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
mov $ctr0.16b, $h1.16b @ CTR block 8k+16
b.lt .L256_enc_main_loop
.L256_enc_prepretail: @ PREPRETAIL
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
rev64 $res2b, $res2b @ GHASH block 8k+2
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
rev64 $res5b, $res5b @ GHASH block 8k+5
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev64 $res0b, $res0b @ GHASH block 8k
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
rev64 $res1b, $res1b @ GHASH block 8k+1
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
eor $res0b, $res0b, $acc_lb @ PRE 1
rev64 $res3b, $res3b @ GHASH block 8k+3
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
rev64 $res6b, $res6b @ GHASH block 8k+6
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
rev64 $res4b, $res4b @ GHASH block 8k+4
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
rev64 $res7b, $res7b @ GHASH block 8k+7
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
ldr $rk14q, [$cc, #224] @ load rk14
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
.L256_enc_tail: @ TAIL
ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
ext $h5.16b, $h5.16b, $h5.16b, #8
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
ext $h6.16b, $h6.16b, $h6.16b, #8
ext $h7.16b, $h7.16b, $h7.16b, #8
mov $t1.16b, $rk14
cmp $main_end_input_ptr, #112
eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
b.gt .L256_enc_blocks_more_than_7
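@ Fewer than eight full blocks remain: shuffle the pre-computed keystream
@ blocks down towards ctr7 and wind the saved counter back once for every
@ block that will not be consumed, so the counter stored at the end reflects
@ only the blocks actually used.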
movi $acc_l.8b, #0
mov $ctr7b, $ctr6b
movi $acc_h.8b, #0
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
mov $ctr3b, $ctr2b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr2b, $ctr1b
movi $acc_m.8b, #0
cmp $main_end_input_ptr, #96
b.gt .L256_enc_blocks_more_than_6
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
cmp $main_end_input_ptr, #80
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
mov $ctr3b, $ctr1b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
b.gt .L256_enc_blocks_more_than_5
mov $ctr7b, $ctr6b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
cmp $main_end_input_ptr, #64
mov $ctr4b, $ctr1b
b.gt .L256_enc_blocks_more_than_4
cmp $main_end_input_ptr, #48
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr1b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
b.gt .L256_enc_blocks_more_than_3
cmp $main_end_input_ptr, #32
mov $ctr7b, $ctr6b
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
mov $ctr6b, $ctr1b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
b.gt .L256_enc_blocks_more_than_2
mov $ctr7b, $ctr1b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
cmp $main_end_input_ptr, #16
b.gt .L256_enc_blocks_more_than_1
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
b .L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_7: @ blocks left > 7
st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
rev64 $res0b, $res1b @ GHASH final-7 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
.L256_enc_blocks_more_than_6: @ blocks left > 6
st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
rev64 $res0b, $res1b @ GHASH final-6 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
.L256_enc_blocks_more_than_5: @ blocks left > 5
st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
rev64 $res0b, $res1b @ GHASH final-5 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
.L256_enc_blocks_more_than_4: @ blocks left > 4
st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
rev64 $res0b, $res1b @ GHASH final-4 block
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
.L256_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
rev64 $res0b, $res1b @ GHASH final-3 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
.L256_enc_blocks_more_than_2: @ blocks left > 2
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
rev64 $res0b, $res1b @ GHASH final-2 block
ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
.L256_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
rev64 $res0b, $res1b @ GHASH final-1 block
ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L256_enc_blocks_less_than_1: @ blocks left <= 1
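@ The final block may be partial: build a byte mask from the bit length,
@ zero the invalid tail of the result and merge it with the bytes already
@ at the destination so only the valid bytes are overwritten.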
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
cmp $bit_length, #64
mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
csel $temp3_x, $temp0_x, xzr, lt @ top 64b mask: partial mask if fewer than 64 bits are masked off, else zero
csel $temp2_x, $temp1_x, $temp0_x, lt @ bottom 64b mask: all ones if fewer than 64 bits are masked off, else the partial mask
mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
mov $ctr0.d[1], $temp3_x
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
str $rtmp_ctrq, [$counter] @ store the updated counter
eor $res0b, $res0b, $t0.16b @ feed in partial tag
st1 { $res1b}, [$output_ptr] @ store all 16B
ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
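@ MODULO reduction: the double-width GHASH product is held Karatsuba-style in
@ acc_h/acc_m/acc_l; the 0xc2..00 constant encodes the GHASH reduction
@ polynomial, and the two pmulls against it below fold the high half back
@ into 128 bits.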
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8 @ swap the halves of the final tag
rev64 $acc_lb, $acc_lb @ restore the byte order expected by the caller
st1 { $acc_l.16b }, [$current_tag] @ store the updated tag (Xi)
lsr x0, $bit_length, #3 @ return size (number of bytes processed)
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
.L256_enc_ret:
mov w0, #0x0
ret
.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
___
{
#########################################################################################
# size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# const void *key,
# unsigned char ivec[16],
# u64 *Xi);
#
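# A hedged usage sketch (illustrative only, not generated code): the length
# argument is a bit count, as the prologue's byte_len = len >> 3 computation
# suggests, e.g.
#
#   size_t done = unroll8_eor3_aes_gcm_dec_256_kernel(in, nbytes * 8, out,
#                                                     key_schedule, ivec, Xi);
#
# where in/out/key_schedule/ivec/Xi are placeholder names for the ciphertext,
# the plaintext, the expanded AES-256 round keys, the 16-byte counter block
# and the running GHASH state; the exact calling convention is defined by the
# C glue code elsewhere in OpenSSL.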
$code.=<<___;
.global unroll8_eor3_aes_gcm_dec_256_kernel
.type unroll8_eor3_aes_gcm_dec_256_kernel,%function
.align 4
unroll8_eor3_aes_gcm_dec_256_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L256_dec_ret
stp d8, d9, [sp, #-80]!
mov $counter, x4
mov $cc, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000 @ GHASH reduction (modulo) constant
stp x5, xzr, [sp, #64] @ stash the modulo constant on the stack
add $modulo_constant, sp, #64 @ pointer to the stashed modulo constant
ld1 { $ctr0b}, [$counter] @ CTR block 0
mov $constant_temp, #0x100000000 @ set up counter increment
movi $rctr_inc.16b, #0x0 @ zero the counter increment vector
mov $rctr_inc.d[1], $constant_temp @ increment goes in the top 32-bit lane of the byte-reversed counter
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
ld1 { $acc_lb}, [$current_tag] @ load the current tag (Xi)
ext $acc_lb, $acc_lb, $acc_lb, #8 @ swap the tag halves
rev64 $acc_lb, $acc_lb @ byte-reverse the tag for GHASH
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
ldr $rk14q, [$cc, #224] @ load rk14
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
aese $ctr5b, $rk13 @ AES block 5 - round 13
aese $ctr1b, $rk13 @ AES block 1 - round 13
aese $ctr2b, $rk13 @ AES block 2 - round 13
aese $ctr0b, $rk13 @ AES block 0 - round 13
aese $ctr4b, $rk13 @ AES block 4 - round 13
aese $ctr6b, $rk13 @ AES block 6 - round 13
aese $ctr3b, $rk13 @ AES block 3 - round 13
aese $ctr7b, $rk13 @ AES block 7 - round 13
b.ge .L256_dec_tail @ handle tail
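@ Decrypt the first eight blocks: the ciphertext just loaded is XORed with
@ the encrypted counter blocks to give plaintext, and is also kept around
@ (byte-reversed later) as the GHASH input for the next iteration.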
ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 1 - result
eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 0 - result
stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 3 - result
eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 5 - result
eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 4 - result
rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 2 - result
stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 6 - result
rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 7 - result
stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
b.ge .L256_dec_prepretail @ do prepretail
.L256_dec_main_loop: @ main loop start
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
rev64 $res1b, $res1b @ GHASH block 8k+1
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
rev64 $res0b, $res0b @ GHASH block 8k
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev64 $res4b, $res4b @ GHASH block 8k+4
rev64 $res3b, $res3b @ GHASH block 8k+3
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
rev64 $res7b, $res7b @ GHASH block 8k+7
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
eor $res0b, $res0b, $acc_lb @ PRE 1
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
rev64 $res2b, $res2b @ GHASH block 8k+2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
rev64 $res5b, $res5b @ GHASH block 8k+5
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
rev64 $res6b, $res6b @ GHASH block 8k+6
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
ldr $rk14q, [$cc, #224] @ load rk14
aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 8k+10 - result
eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 8k+9 - result
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 8k+13 - result
eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 8k+8 - result
aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
mov $ctr0.16b, $h1.16b @ CTR block 8k+16
eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 8k+12 - result
eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 8k+11 - result
stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
mov $ctr3.16b, $h4.16b @ CTR block 8k+19
mov $ctr2.16b, $h3.16b @ CTR block 8k+18
aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
mov $ctr1.16b, $h2.16b @ CTR block 8k+17
stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 8k+15 - result
eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 8k+14 - result
rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
b.lt .L256_dec_main_loop
.L256_dec_prepretail: @ PREPRETAIL
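@ Fold the eight ciphertext blocks already loaded into GHASH and run the AES
@ rounds for the next eight counter blocks, without loading any more
@ ciphertext or storing output; the tail below decides how many of those
@ keystream blocks are actually needed.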
ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
rev64 $res4b, $res4b @ GHASH block 8k+4
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
rev64 $res0b, $res0b @ GHASH block 8k
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
ldr $h7q, [$current_tag, #176] @ load h7l | h7h
ext $h7.16b, $h7.16b, $h7.16b, #8
ldr $h8q, [$current_tag, #208] @ load h8l | h8h
ext $h8.16b, $h8.16b, $h8.16b, #8
rev64 $res1b, $res1b @ GHASH block 8k+1
rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
rev64 $res2b, $res2b @ GHASH block 8k+2
ldr $h5q, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
ldr $h6q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
rev64 $res3b, $res3b @ GHASH block 8k+3
pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
rev64 $res6b, $res6b @ GHASH block 8k+6
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
rev64 $res7b, $res7b @ GHASH block 8k+7
rev64 $res5b, $res5b @ GHASH block 8k+5
eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
ldr $rk14q, [$cc, #224] @ load rk14
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
.L256_dec_tail: @ TAIL
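@ TAIL: between one and eight blocks remain, the last of them possibly partial.
@ Work out how many bytes are left, then shuffle the pre-computed keystream
@ blocks down (ctr7 <- ctr6 <- ...) so that ctr7 ends up paired with the final
@ block, rolling the saved counter back once for every keystream block that
@ goes unused.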
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
cmp $main_end_input_ptr, #112
ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k
ext $h8.16b, $h8.16b, $h8.16b, #8
mov $t1.16b, $rk14
ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
ext $h5.16b, $h5.16b, $h5.16b, #8
eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
ext $h6.16b, $h6.16b, $h6.16b, #8
ext $h7.16b, $h7.16b, $h7.16b, #8
b.gt .L256_dec_blocks_more_than_7
mov $ctr7b, $ctr6b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr3b
movi $acc_l.8b, #0
movi $acc_h.8b, #0
movi $acc_m.8b, #0
mov $ctr3b, $ctr2b
cmp $main_end_input_ptr, #96
mov $ctr2b, $ctr1b
b.gt .L256_dec_blocks_more_than_6
mov $ctr7b, $ctr6b
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
cmp $main_end_input_ptr, #80
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr4b, $ctr3b
mov $ctr3b, $ctr1b
b.gt .L256_dec_blocks_more_than_5
cmp $main_end_input_ptr, #64
mov $ctr7b, $ctr6b
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr4b
mov $ctr4b, $ctr1b
b.gt .L256_dec_blocks_more_than_4
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr6b
cmp $main_end_input_ptr, #48
mov $ctr6b, $ctr5b
mov $ctr5b, $ctr1b
b.gt .L256_dec_blocks_more_than_3
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr6b
cmp $main_end_input_ptr, #32
mov $ctr6b, $ctr1b
b.gt .L256_dec_blocks_more_than_2
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
mov $ctr7b, $ctr1b
cmp $main_end_input_ptr, #16
b.gt .L256_dec_blocks_more_than_1
sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
b .L256_dec_blocks_less_than_1
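@ Each label below handles one more remaining block: store the block decrypted
@ at the previous step, feed that block's ciphertext into GHASH, load and
@ decrypt the next ciphertext block, then fall through, until only the
@ (possibly partial) final block is left for .L256_dec_blocks_less_than_1.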
.L256_dec_blocks_more_than_7: @ blocks left > 7
rev64 $res0b, $res1b @ GHASH final-7 block
ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
.L256_dec_blocks_more_than_6: @ blocks left > 6
rev64 $res0b, $res1b @ GHASH final-6 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
movi $t0.8b, #0 @ suppress further partial tag feed in
ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
.L256_dec_blocks_more_than_5: @ blocks left > 5
rev64 $res0b, $res1b @ GHASH final-5 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
.L256_dec_blocks_more_than_4: @ blocks left > 4
rev64 $res0b, $res1b @ GHASH final-4 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
.L256_dec_blocks_more_than_3: @ blocks left > 3
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4.16b, $h4.16b, $h4.16b, #8
rev64 $res0b, $res1b @ GHASH final-3 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
.L256_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3.16b, $h3.16b, $h3.16b, #8
ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
.L256_dec_blocks_more_than_1: @ blocks left > 1
rev64 $res0b, $res1b @ GHASH final-1 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2.16b, $h2.16b, $h2.16b, #8
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L256_dec_blocks_less_than_1: @ blocks left <= 1
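@ Final (possibly partial) block: build a 128-bit mask covering only the valid
@ bits of the last block, zero the invalid tail of the ciphertext before it is
@ fed into GHASH, and use bif to keep the bytes already present at the output
@ beyond the end of the message, so the full 16-byte store below only changes
@ valid bytes. The updated counter is byte-reversed and written back for the
@ caller.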
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
str $rtmp_ctrq, [$counter] @ store the updated counter
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
cmp $bit_length, #64
mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
csel $temp3_x, $temp0_x, xzr, lt
csel $temp2_x, $temp1_x, $temp0_x, lt
mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
mov $ctr0.d[1], $temp3_x
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1.16b, $h1.16b, $h1.16b, #8
bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
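@ MODULO: the GHASH accumulator is currently spread across three 128-bit
@ vectors (high, mid and low Karatsuba terms). Fold it back to one 128-bit
@ value: tidy the mid term, multiply the upper halves by the reduction
@ constant to align them with the lower terms, and fold twice into acc_l,
@ which (after the final ext/rev64) becomes the stored tag.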
ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
st1 { $res4b}, [$output_ptr] @ store all 16B
eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
eor $t11.16b, $acc_hb, $t11.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $t11.16b @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
st1 { $acc_l.16b }, [$current_tag]
lsr x0, $bit_length, #3 @ return sizes
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
.L256_dec_ret:
mov w0, #0x0
ret
.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
___
}
}
$code.=<<___;
.asciz "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian\@arm.com>"
.align 2
#endif
___
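# The block below post-processes the generated source: "@ " comments become
# "//", backquoted expressions are evaluated, and the SHA3 mnemonics in the
# table (eor3 in particular) are emitted as raw .inst encodings via unsha3()
# so that assemblers without SHA3 support can still build the module.
# unvmov() is defined alongside it but is not called by the substitution loop
# here.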
{
my %opcode = (
"rax1" => 0xce608c00, "eor3" => 0xce000000,
"bcax" => 0xce200000, "xar" => 0xce800000 );
sub unsha3 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
$mnemonic,$arg;
}
sub unvmov {
my $arg=shift;
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
$3<8?$3:$3+8,($4 eq "lo")?0:1;
}
foreach(split("\n",$code)) {
s/@\s/\/\//o; # old->new style commentary
s/\`([^\`]*)\`/eval($1)/ge;
m/\bld1r\b/ and s/\.16b/.2d/g or
s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush