mirror of
https://github.com/openssl/openssl.git
synced 2025-01-06 13:26:43 +08:00
1aa89a7a3a
They now generally conform to the following argument sequence: script.pl "$(PERLASM_SCHEME)" [ C preprocessor arguments ... ] \ $(PROCESSOR) <output file> However, in the spirit of being able to use these scripts manually, they also allow for no argument, or for only the flavour, or for only the output file. This is done by only using the last argument as output file if it's a file (it has an extension), and only using the first argument as flavour if it isn't a file (it doesn't have an extension). While we're at it, we make all $xlate calls the same, i.e. the $output argument is always quoted, and we always die on error when trying to start $xlate. There's a perl lesson in this, regarding operator priority... This will always succeed, even when it fails: open FOO, "something" || die "ERR: $!"; The reason is that '||' has higher priority than list operators (a function is essentially a list operator and gobbles up everything following it that isn't lower priority), and since a non-empty string is always true, so that ends up being exactly the same as: open FOO, "something"; This, however, will fail if "something" can't be opened: open FOO, "something" or die "ERR: $!"; The reason is that 'or' has lower priority that list operators, i.e. it's performed after the 'open' call. Reviewed-by: Matt Caswell <matt@openssl.org> (Merged from https://github.com/openssl/openssl/pull/9884)
944 lines
21 KiB
Raku
Executable File
944 lines
21 KiB
Raku
Executable File
#! /usr/bin/env perl
|
|
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
#
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
# ====================================================================
|
|
#
|
|
# This module implements Poly1305 hash for ARMv8.
|
|
#
|
|
# June 2015
|
|
#
|
|
# Numbers are cycles per processed byte with poly1305_blocks alone.
|
|
#
|
|
# IALU/gcc-4.9 NEON
|
|
#
|
|
# Apple A7 1.86/+5% 0.72
|
|
# Cortex-A53 2.69/+58% 1.47
|
|
# Cortex-A57 2.70/+7% 1.14
|
|
# Denver 1.64/+50% 1.18(*)
|
|
# X-Gene 2.13/+68% 2.27
|
|
# Mongoose 1.77/+75% 1.12
|
|
# Kryo 2.70/+55% 1.13
|
|
# ThunderX2 1.17/+95% 1.36
|
|
#
|
|
# (*) estimate based on resources availability is less than 1.0,
|
|
# i.e. measured result is worse than expected, presumably binary
|
|
# translator is not almighty;
|
|
|
|
# $output is the last argument if it looks like a file (it has an extension)
|
|
# $flavour is the first argument if it doesn't look like a file
|
|
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
|
|
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
|
|
|
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
|
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
|
die "can't locate arm-xlate.pl";
|
|
|
|
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
|
|
or die "can't call $xlate: $!";
|
|
*STDOUT=*OUT;
|
|
|
|
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
|
|
my ($mac,$nonce)=($inp,$len);
|
|
|
|
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
|
|
|
|
$code.=<<___;
|
|
#include "arm_arch.h"
|
|
|
|
.text
|
|
|
|
// forward "declarations" are required for Apple
|
|
.extern OPENSSL_armcap_P
|
|
.globl poly1305_blocks
|
|
.globl poly1305_emit
|
|
|
|
.globl poly1305_init
|
|
.type poly1305_init,%function
|
|
.align 5
|
|
poly1305_init:
|
|
cmp $inp,xzr
|
|
stp xzr,xzr,[$ctx] // zero hash value
|
|
stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
|
|
|
|
csel x0,xzr,x0,eq
|
|
b.eq .Lno_key
|
|
|
|
adrp x17,OPENSSL_armcap_P
|
|
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
|
|
|
|
ldp $r0,$r1,[$inp] // load key
|
|
mov $s1,#0xfffffffc0fffffff
|
|
movk $s1,#0x0fff,lsl#48
|
|
#ifdef __ARMEB__
|
|
rev $r0,$r0 // flip bytes
|
|
rev $r1,$r1
|
|
#endif
|
|
and $r0,$r0,$s1 // &=0ffffffc0fffffff
|
|
and $s1,$s1,#-4
|
|
and $r1,$r1,$s1 // &=0ffffffc0ffffffc
|
|
stp $r0,$r1,[$ctx,#32] // save key value
|
|
|
|
tst w17,#ARMV7_NEON
|
|
|
|
adr $d0,.Lpoly1305_blocks
|
|
adr $r0,.Lpoly1305_blocks_neon
|
|
adr $d1,.Lpoly1305_emit
|
|
adr $r1,.Lpoly1305_emit_neon
|
|
|
|
csel $d0,$d0,$r0,eq
|
|
csel $d1,$d1,$r1,eq
|
|
|
|
#ifdef __ILP32__
|
|
stp w12,w13,[$len]
|
|
#else
|
|
stp $d0,$d1,[$len]
|
|
#endif
|
|
|
|
mov x0,#1
|
|
.Lno_key:
|
|
ret
|
|
.size poly1305_init,.-poly1305_init
|
|
|
|
.type poly1305_blocks,%function
|
|
.align 5
|
|
poly1305_blocks:
|
|
.Lpoly1305_blocks:
|
|
ands $len,$len,#-16
|
|
b.eq .Lno_data
|
|
|
|
ldp $h0,$h1,[$ctx] // load hash value
|
|
ldp $r0,$r1,[$ctx,#32] // load key value
|
|
ldr $h2,[$ctx,#16]
|
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
|
b .Loop
|
|
|
|
.align 5
|
|
.Loop:
|
|
ldp $t0,$t1,[$inp],#16 // load input
|
|
sub $len,$len,#16
|
|
#ifdef __ARMEB__
|
|
rev $t0,$t0
|
|
rev $t1,$t1
|
|
#endif
|
|
adds $h0,$h0,$t0 // accumulate input
|
|
adcs $h1,$h1,$t1
|
|
|
|
mul $d0,$h0,$r0 // h0*r0
|
|
adc $h2,$h2,$padbit
|
|
umulh $d1,$h0,$r0
|
|
|
|
mul $t0,$h1,$s1 // h1*5*r1
|
|
umulh $t1,$h1,$s1
|
|
|
|
adds $d0,$d0,$t0
|
|
mul $t0,$h0,$r1 // h0*r1
|
|
adc $d1,$d1,$t1
|
|
umulh $d2,$h0,$r1
|
|
|
|
adds $d1,$d1,$t0
|
|
mul $t0,$h1,$r0 // h1*r0
|
|
adc $d2,$d2,xzr
|
|
umulh $t1,$h1,$r0
|
|
|
|
adds $d1,$d1,$t0
|
|
mul $t0,$h2,$s1 // h2*5*r1
|
|
adc $d2,$d2,$t1
|
|
mul $t1,$h2,$r0 // h2*r0
|
|
|
|
adds $d1,$d1,$t0
|
|
adc $d2,$d2,$t1
|
|
|
|
and $t0,$d2,#-4 // final reduction
|
|
and $h2,$d2,#3
|
|
add $t0,$t0,$d2,lsr#2
|
|
adds $h0,$d0,$t0
|
|
adcs $h1,$d1,xzr
|
|
adc $h2,$h2,xzr
|
|
|
|
cbnz $len,.Loop
|
|
|
|
stp $h0,$h1,[$ctx] // store hash value
|
|
str $h2,[$ctx,#16]
|
|
|
|
.Lno_data:
|
|
ret
|
|
.size poly1305_blocks,.-poly1305_blocks
|
|
|
|
.type poly1305_emit,%function
|
|
.align 5
|
|
poly1305_emit:
|
|
.Lpoly1305_emit:
|
|
ldp $h0,$h1,[$ctx] // load hash base 2^64
|
|
ldr $h2,[$ctx,#16]
|
|
ldp $t0,$t1,[$nonce] // load nonce
|
|
|
|
adds $d0,$h0,#5 // compare to modulus
|
|
adcs $d1,$h1,xzr
|
|
adc $d2,$h2,xzr
|
|
|
|
tst $d2,#-4 // see if it's carried/borrowed
|
|
|
|
csel $h0,$h0,$d0,eq
|
|
csel $h1,$h1,$d1,eq
|
|
|
|
#ifdef __ARMEB__
|
|
ror $t0,$t0,#32 // flip nonce words
|
|
ror $t1,$t1,#32
|
|
#endif
|
|
adds $h0,$h0,$t0 // accumulate nonce
|
|
adc $h1,$h1,$t1
|
|
#ifdef __ARMEB__
|
|
rev $h0,$h0 // flip output bytes
|
|
rev $h1,$h1
|
|
#endif
|
|
stp $h0,$h1,[$mac] // write result
|
|
|
|
ret
|
|
.size poly1305_emit,.-poly1305_emit
|
|
___
|
|
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
|
|
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
|
|
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
|
|
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
|
|
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
|
|
my ($T0,$T1,$MASK) = map("v$_",(29..31));
|
|
|
|
my ($in2,$zeros)=("x16","x17");
|
|
my $is_base2_26 = $zeros; # borrow
|
|
|
|
$code.=<<___;
|
|
.type poly1305_mult,%function
|
|
.align 5
|
|
poly1305_mult:
|
|
mul $d0,$h0,$r0 // h0*r0
|
|
umulh $d1,$h0,$r0
|
|
|
|
mul $t0,$h1,$s1 // h1*5*r1
|
|
umulh $t1,$h1,$s1
|
|
|
|
adds $d0,$d0,$t0
|
|
mul $t0,$h0,$r1 // h0*r1
|
|
adc $d1,$d1,$t1
|
|
umulh $d2,$h0,$r1
|
|
|
|
adds $d1,$d1,$t0
|
|
mul $t0,$h1,$r0 // h1*r0
|
|
adc $d2,$d2,xzr
|
|
umulh $t1,$h1,$r0
|
|
|
|
adds $d1,$d1,$t0
|
|
mul $t0,$h2,$s1 // h2*5*r1
|
|
adc $d2,$d2,$t1
|
|
mul $t1,$h2,$r0 // h2*r0
|
|
|
|
adds $d1,$d1,$t0
|
|
adc $d2,$d2,$t1
|
|
|
|
and $t0,$d2,#-4 // final reduction
|
|
and $h2,$d2,#3
|
|
add $t0,$t0,$d2,lsr#2
|
|
adds $h0,$d0,$t0
|
|
adcs $h1,$d1,xzr
|
|
adc $h2,$h2,xzr
|
|
|
|
ret
|
|
.size poly1305_mult,.-poly1305_mult
|
|
|
|
.type poly1305_splat,%function
|
|
.align 5
|
|
poly1305_splat:
|
|
and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
|
ubfx x13,$h0,#26,#26
|
|
extr x14,$h1,$h0,#52
|
|
and x14,x14,#0x03ffffff
|
|
ubfx x15,$h1,#14,#26
|
|
extr x16,$h2,$h1,#40
|
|
|
|
str w12,[$ctx,#16*0] // r0
|
|
add w12,w13,w13,lsl#2 // r1*5
|
|
str w13,[$ctx,#16*1] // r1
|
|
add w13,w14,w14,lsl#2 // r2*5
|
|
str w12,[$ctx,#16*2] // s1
|
|
str w14,[$ctx,#16*3] // r2
|
|
add w14,w15,w15,lsl#2 // r3*5
|
|
str w13,[$ctx,#16*4] // s2
|
|
str w15,[$ctx,#16*5] // r3
|
|
add w15,w16,w16,lsl#2 // r4*5
|
|
str w14,[$ctx,#16*6] // s3
|
|
str w16,[$ctx,#16*7] // r4
|
|
str w15,[$ctx,#16*8] // s4
|
|
|
|
ret
|
|
.size poly1305_splat,.-poly1305_splat
|
|
|
|
.type poly1305_blocks_neon,%function
|
|
.align 5
|
|
poly1305_blocks_neon:
|
|
.Lpoly1305_blocks_neon:
|
|
ldr $is_base2_26,[$ctx,#24]
|
|
cmp $len,#128
|
|
b.hs .Lblocks_neon
|
|
cbz $is_base2_26,.Lpoly1305_blocks
|
|
|
|
.Lblocks_neon:
|
|
.inst 0xd503233f // paciasp
|
|
stp x29,x30,[sp,#-80]!
|
|
add x29,sp,#0
|
|
|
|
ands $len,$len,#-16
|
|
b.eq .Lno_data_neon
|
|
|
|
cbz $is_base2_26,.Lbase2_64_neon
|
|
|
|
ldp w10,w11,[$ctx] // load hash value base 2^26
|
|
ldp w12,w13,[$ctx,#8]
|
|
ldr w14,[$ctx,#16]
|
|
|
|
tst $len,#31
|
|
b.eq .Leven_neon
|
|
|
|
ldp $r0,$r1,[$ctx,#32] // load key value
|
|
|
|
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
|
lsr $h1,x12,#12
|
|
adds $h0,$h0,x12,lsl#52
|
|
add $h1,$h1,x13,lsl#14
|
|
adc $h1,$h1,xzr
|
|
lsr $h2,x14,#24
|
|
adds $h1,$h1,x14,lsl#40
|
|
adc $d2,$h2,xzr // can be partially reduced...
|
|
|
|
ldp $d0,$d1,[$inp],#16 // load input
|
|
sub $len,$len,#16
|
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
|
|
|
and $t0,$d2,#-4 // ... so reduce
|
|
and $h2,$d2,#3
|
|
add $t0,$t0,$d2,lsr#2
|
|
adds $h0,$h0,$t0
|
|
adcs $h1,$h1,xzr
|
|
adc $h2,$h2,xzr
|
|
|
|
#ifdef __ARMEB__
|
|
rev $d0,$d0
|
|
rev $d1,$d1
|
|
#endif
|
|
adds $h0,$h0,$d0 // accumulate input
|
|
adcs $h1,$h1,$d1
|
|
adc $h2,$h2,$padbit
|
|
|
|
bl poly1305_mult
|
|
ldr x30,[sp,#8]
|
|
|
|
cbz $padbit,.Lstore_base2_64_neon
|
|
|
|
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
|
ubfx x11,$h0,#26,#26
|
|
extr x12,$h1,$h0,#52
|
|
and x12,x12,#0x03ffffff
|
|
ubfx x13,$h1,#14,#26
|
|
extr x14,$h2,$h1,#40
|
|
|
|
cbnz $len,.Leven_neon
|
|
|
|
stp w10,w11,[$ctx] // store hash value base 2^26
|
|
stp w12,w13,[$ctx,#8]
|
|
str w14,[$ctx,#16]
|
|
b .Lno_data_neon
|
|
|
|
.align 4
|
|
.Lstore_base2_64_neon:
|
|
stp $h0,$h1,[$ctx] // store hash value base 2^64
|
|
stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
|
|
b .Lno_data_neon
|
|
|
|
.align 4
|
|
.Lbase2_64_neon:
|
|
ldp $r0,$r1,[$ctx,#32] // load key value
|
|
|
|
ldp $h0,$h1,[$ctx] // load hash value base 2^64
|
|
ldr $h2,[$ctx,#16]
|
|
|
|
tst $len,#31
|
|
b.eq .Linit_neon
|
|
|
|
ldp $d0,$d1,[$inp],#16 // load input
|
|
sub $len,$len,#16
|
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
|
#ifdef __ARMEB__
|
|
rev $d0,$d0
|
|
rev $d1,$d1
|
|
#endif
|
|
adds $h0,$h0,$d0 // accumulate input
|
|
adcs $h1,$h1,$d1
|
|
adc $h2,$h2,$padbit
|
|
|
|
bl poly1305_mult
|
|
|
|
.Linit_neon:
|
|
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
|
ubfx x11,$h0,#26,#26
|
|
extr x12,$h1,$h0,#52
|
|
and x12,x12,#0x03ffffff
|
|
ubfx x13,$h1,#14,#26
|
|
extr x14,$h2,$h1,#40
|
|
|
|
stp d8,d9,[sp,#16] // meet ABI requirements
|
|
stp d10,d11,[sp,#32]
|
|
stp d12,d13,[sp,#48]
|
|
stp d14,d15,[sp,#64]
|
|
|
|
fmov ${H0},x10
|
|
fmov ${H1},x11
|
|
fmov ${H2},x12
|
|
fmov ${H3},x13
|
|
fmov ${H4},x14
|
|
|
|
////////////////////////////////// initialize r^n table
|
|
mov $h0,$r0 // r^1
|
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
|
mov $h1,$r1
|
|
mov $h2,xzr
|
|
add $ctx,$ctx,#48+12
|
|
bl poly1305_splat
|
|
|
|
bl poly1305_mult // r^2
|
|
sub $ctx,$ctx,#4
|
|
bl poly1305_splat
|
|
|
|
bl poly1305_mult // r^3
|
|
sub $ctx,$ctx,#4
|
|
bl poly1305_splat
|
|
|
|
bl poly1305_mult // r^4
|
|
sub $ctx,$ctx,#4
|
|
bl poly1305_splat
|
|
ldr x30,[sp,#8]
|
|
|
|
add $in2,$inp,#32
|
|
adr $zeros,.Lzeros
|
|
subs $len,$len,#64
|
|
csel $in2,$zeros,$in2,lo
|
|
|
|
mov x4,#1
|
|
stur x4,[$ctx,#-24] // set is_base2_26
|
|
sub $ctx,$ctx,#48 // restore original $ctx
|
|
b .Ldo_neon
|
|
|
|
.align 4
|
|
.Leven_neon:
|
|
add $in2,$inp,#32
|
|
adr $zeros,.Lzeros
|
|
subs $len,$len,#64
|
|
csel $in2,$zeros,$in2,lo
|
|
|
|
stp d8,d9,[sp,#16] // meet ABI requirements
|
|
stp d10,d11,[sp,#32]
|
|
stp d12,d13,[sp,#48]
|
|
stp d14,d15,[sp,#64]
|
|
|
|
fmov ${H0},x10
|
|
fmov ${H1},x11
|
|
fmov ${H2},x12
|
|
fmov ${H3},x13
|
|
fmov ${H4},x14
|
|
|
|
.Ldo_neon:
|
|
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
|
|
ldp x9,x13,[$in2],#48
|
|
|
|
lsl $padbit,$padbit,#24
|
|
add x15,$ctx,#48
|
|
|
|
#ifdef __ARMEB__
|
|
rev x8,x8
|
|
rev x12,x12
|
|
rev x9,x9
|
|
rev x13,x13
|
|
#endif
|
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
|
and x5,x9,#0x03ffffff
|
|
ubfx x6,x8,#26,#26
|
|
ubfx x7,x9,#26,#26
|
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
|
extr x8,x12,x8,#52
|
|
extr x9,x13,x9,#52
|
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
|
fmov $IN23_0,x4
|
|
and x8,x8,#0x03ffffff
|
|
and x9,x9,#0x03ffffff
|
|
ubfx x10,x12,#14,#26
|
|
ubfx x11,x13,#14,#26
|
|
add x12,$padbit,x12,lsr#40
|
|
add x13,$padbit,x13,lsr#40
|
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
|
fmov $IN23_1,x6
|
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
|
fmov $IN23_2,x8
|
|
fmov $IN23_3,x10
|
|
fmov $IN23_4,x12
|
|
|
|
ldp x8,x12,[$inp],#16 // inp[0:1]
|
|
ldp x9,x13,[$inp],#48
|
|
|
|
ld1 {$R0,$R1,$S1,$R2},[x15],#64
|
|
ld1 {$S2,$R3,$S3,$R4},[x15],#64
|
|
ld1 {$S4},[x15]
|
|
|
|
#ifdef __ARMEB__
|
|
rev x8,x8
|
|
rev x12,x12
|
|
rev x9,x9
|
|
rev x13,x13
|
|
#endif
|
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
|
and x5,x9,#0x03ffffff
|
|
ubfx x6,x8,#26,#26
|
|
ubfx x7,x9,#26,#26
|
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
|
extr x8,x12,x8,#52
|
|
extr x9,x13,x9,#52
|
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
|
fmov $IN01_0,x4
|
|
and x8,x8,#0x03ffffff
|
|
and x9,x9,#0x03ffffff
|
|
ubfx x10,x12,#14,#26
|
|
ubfx x11,x13,#14,#26
|
|
add x12,$padbit,x12,lsr#40
|
|
add x13,$padbit,x13,lsr#40
|
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
|
fmov $IN01_1,x6
|
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
|
movi $MASK.2d,#-1
|
|
fmov $IN01_2,x8
|
|
fmov $IN01_3,x10
|
|
fmov $IN01_4,x12
|
|
ushr $MASK.2d,$MASK.2d,#38
|
|
|
|
b.ls .Lskip_loop
|
|
|
|
.align 4
|
|
.Loop_neon:
|
|
////////////////////////////////////////////////////////////////
|
|
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
|
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
|
// \___________________/
|
|
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
|
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
|
// \___________________/ \____________________/
|
|
//
|
|
// Note that we start with inp[2:3]*r^2. This is because it
|
|
// doesn't depend on reduction in previous iteration.
|
|
////////////////////////////////////////////////////////////////
|
|
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
|
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
|
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
|
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
|
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
|
|
|
subs $len,$len,#64
|
|
umull $ACC4,$IN23_0,${R4}[2]
|
|
csel $in2,$zeros,$in2,lo
|
|
umull $ACC3,$IN23_0,${R3}[2]
|
|
umull $ACC2,$IN23_0,${R2}[2]
|
|
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
|
|
umull $ACC1,$IN23_0,${R1}[2]
|
|
ldp x9,x13,[$in2],#48
|
|
umull $ACC0,$IN23_0,${R0}[2]
|
|
#ifdef __ARMEB__
|
|
rev x8,x8
|
|
rev x12,x12
|
|
rev x9,x9
|
|
rev x13,x13
|
|
#endif
|
|
|
|
umlal $ACC4,$IN23_1,${R3}[2]
|
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
|
umlal $ACC3,$IN23_1,${R2}[2]
|
|
and x5,x9,#0x03ffffff
|
|
umlal $ACC2,$IN23_1,${R1}[2]
|
|
ubfx x6,x8,#26,#26
|
|
umlal $ACC1,$IN23_1,${R0}[2]
|
|
ubfx x7,x9,#26,#26
|
|
umlal $ACC0,$IN23_1,${S4}[2]
|
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
|
|
|
umlal $ACC4,$IN23_2,${R2}[2]
|
|
extr x8,x12,x8,#52
|
|
umlal $ACC3,$IN23_2,${R1}[2]
|
|
extr x9,x13,x9,#52
|
|
umlal $ACC2,$IN23_2,${R0}[2]
|
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
|
umlal $ACC1,$IN23_2,${S4}[2]
|
|
fmov $IN23_0,x4
|
|
umlal $ACC0,$IN23_2,${S3}[2]
|
|
and x8,x8,#0x03ffffff
|
|
|
|
umlal $ACC4,$IN23_3,${R1}[2]
|
|
and x9,x9,#0x03ffffff
|
|
umlal $ACC3,$IN23_3,${R0}[2]
|
|
ubfx x10,x12,#14,#26
|
|
umlal $ACC2,$IN23_3,${S4}[2]
|
|
ubfx x11,x13,#14,#26
|
|
umlal $ACC1,$IN23_3,${S3}[2]
|
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
|
umlal $ACC0,$IN23_3,${S2}[2]
|
|
fmov $IN23_1,x6
|
|
|
|
add $IN01_2,$IN01_2,$H2
|
|
add x12,$padbit,x12,lsr#40
|
|
umlal $ACC4,$IN23_4,${R0}[2]
|
|
add x13,$padbit,x13,lsr#40
|
|
umlal $ACC3,$IN23_4,${S4}[2]
|
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
|
umlal $ACC2,$IN23_4,${S3}[2]
|
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
|
umlal $ACC1,$IN23_4,${S2}[2]
|
|
fmov $IN23_2,x8
|
|
umlal $ACC0,$IN23_4,${S1}[2]
|
|
fmov $IN23_3,x10
|
|
|
|
////////////////////////////////////////////////////////////////
|
|
// (hash+inp[0:1])*r^4 and accumulate
|
|
|
|
add $IN01_0,$IN01_0,$H0
|
|
fmov $IN23_4,x12
|
|
umlal $ACC3,$IN01_2,${R1}[0]
|
|
ldp x8,x12,[$inp],#16 // inp[0:1]
|
|
umlal $ACC0,$IN01_2,${S3}[0]
|
|
ldp x9,x13,[$inp],#48
|
|
umlal $ACC4,$IN01_2,${R2}[0]
|
|
umlal $ACC1,$IN01_2,${S4}[0]
|
|
umlal $ACC2,$IN01_2,${R0}[0]
|
|
#ifdef __ARMEB__
|
|
rev x8,x8
|
|
rev x12,x12
|
|
rev x9,x9
|
|
rev x13,x13
|
|
#endif
|
|
|
|
add $IN01_1,$IN01_1,$H1
|
|
umlal $ACC3,$IN01_0,${R3}[0]
|
|
umlal $ACC4,$IN01_0,${R4}[0]
|
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
|
umlal $ACC2,$IN01_0,${R2}[0]
|
|
and x5,x9,#0x03ffffff
|
|
umlal $ACC0,$IN01_0,${R0}[0]
|
|
ubfx x6,x8,#26,#26
|
|
umlal $ACC1,$IN01_0,${R1}[0]
|
|
ubfx x7,x9,#26,#26
|
|
|
|
add $IN01_3,$IN01_3,$H3
|
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
|
umlal $ACC3,$IN01_1,${R2}[0]
|
|
extr x8,x12,x8,#52
|
|
umlal $ACC4,$IN01_1,${R3}[0]
|
|
extr x9,x13,x9,#52
|
|
umlal $ACC0,$IN01_1,${S4}[0]
|
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
|
umlal $ACC2,$IN01_1,${R1}[0]
|
|
fmov $IN01_0,x4
|
|
umlal $ACC1,$IN01_1,${R0}[0]
|
|
and x8,x8,#0x03ffffff
|
|
|
|
add $IN01_4,$IN01_4,$H4
|
|
and x9,x9,#0x03ffffff
|
|
umlal $ACC3,$IN01_3,${R0}[0]
|
|
ubfx x10,x12,#14,#26
|
|
umlal $ACC0,$IN01_3,${S2}[0]
|
|
ubfx x11,x13,#14,#26
|
|
umlal $ACC4,$IN01_3,${R1}[0]
|
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
|
umlal $ACC1,$IN01_3,${S3}[0]
|
|
fmov $IN01_1,x6
|
|
umlal $ACC2,$IN01_3,${S4}[0]
|
|
add x12,$padbit,x12,lsr#40
|
|
|
|
umlal $ACC3,$IN01_4,${S4}[0]
|
|
add x13,$padbit,x13,lsr#40
|
|
umlal $ACC0,$IN01_4,${S1}[0]
|
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
|
umlal $ACC4,$IN01_4,${R0}[0]
|
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
|
umlal $ACC1,$IN01_4,${S2}[0]
|
|
fmov $IN01_2,x8
|
|
umlal $ACC2,$IN01_4,${S3}[0]
|
|
fmov $IN01_3,x10
|
|
fmov $IN01_4,x12
|
|
|
|
/////////////////////////////////////////////////////////////////
|
|
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
|
// and P. Schwabe
|
|
//
|
|
// [see discussion in poly1305-armv4 module]
|
|
|
|
ushr $T0.2d,$ACC3,#26
|
|
xtn $H3,$ACC3
|
|
ushr $T1.2d,$ACC0,#26
|
|
and $ACC0,$ACC0,$MASK.2d
|
|
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
|
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
|
|
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
|
|
|
ushr $T0.2d,$ACC4,#26
|
|
xtn $H4,$ACC4
|
|
ushr $T1.2d,$ACC1,#26
|
|
xtn $H1,$ACC1
|
|
bic $H4,#0xfc,lsl#24
|
|
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
|
|
|
add $ACC0,$ACC0,$T0.2d
|
|
shl $T0.2d,$T0.2d,#2
|
|
shrn $T1.2s,$ACC2,#26
|
|
xtn $H2,$ACC2
|
|
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
|
bic $H1,#0xfc,lsl#24
|
|
add $H3,$H3,$T1.2s // h2 -> h3
|
|
bic $H2,#0xfc,lsl#24
|
|
|
|
shrn $T0.2s,$ACC0,#26
|
|
xtn $H0,$ACC0
|
|
ushr $T1.2s,$H3,#26
|
|
bic $H3,#0xfc,lsl#24
|
|
bic $H0,#0xfc,lsl#24
|
|
add $H1,$H1,$T0.2s // h0 -> h1
|
|
add $H4,$H4,$T1.2s // h3 -> h4
|
|
|
|
b.hi .Loop_neon
|
|
|
|
.Lskip_loop:
|
|
dup $IN23_2,${IN23_2}[0]
|
|
add $IN01_2,$IN01_2,$H2
|
|
|
|
////////////////////////////////////////////////////////////////
|
|
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
|
|
|
adds $len,$len,#32
|
|
b.ne .Long_tail
|
|
|
|
dup $IN23_2,${IN01_2}[0]
|
|
add $IN23_0,$IN01_0,$H0
|
|
add $IN23_3,$IN01_3,$H3
|
|
add $IN23_1,$IN01_1,$H1
|
|
add $IN23_4,$IN01_4,$H4
|
|
|
|
.Long_tail:
|
|
dup $IN23_0,${IN23_0}[0]
|
|
umull2 $ACC0,$IN23_2,${S3}
|
|
umull2 $ACC3,$IN23_2,${R1}
|
|
umull2 $ACC4,$IN23_2,${R2}
|
|
umull2 $ACC2,$IN23_2,${R0}
|
|
umull2 $ACC1,$IN23_2,${S4}
|
|
|
|
dup $IN23_1,${IN23_1}[0]
|
|
umlal2 $ACC0,$IN23_0,${R0}
|
|
umlal2 $ACC2,$IN23_0,${R2}
|
|
umlal2 $ACC3,$IN23_0,${R3}
|
|
umlal2 $ACC4,$IN23_0,${R4}
|
|
umlal2 $ACC1,$IN23_0,${R1}
|
|
|
|
dup $IN23_3,${IN23_3}[0]
|
|
umlal2 $ACC0,$IN23_1,${S4}
|
|
umlal2 $ACC3,$IN23_1,${R2}
|
|
umlal2 $ACC2,$IN23_1,${R1}
|
|
umlal2 $ACC4,$IN23_1,${R3}
|
|
umlal2 $ACC1,$IN23_1,${R0}
|
|
|
|
dup $IN23_4,${IN23_4}[0]
|
|
umlal2 $ACC3,$IN23_3,${R0}
|
|
umlal2 $ACC4,$IN23_3,${R1}
|
|
umlal2 $ACC0,$IN23_3,${S2}
|
|
umlal2 $ACC1,$IN23_3,${S3}
|
|
umlal2 $ACC2,$IN23_3,${S4}
|
|
|
|
umlal2 $ACC3,$IN23_4,${S4}
|
|
umlal2 $ACC0,$IN23_4,${S1}
|
|
umlal2 $ACC4,$IN23_4,${R0}
|
|
umlal2 $ACC1,$IN23_4,${S2}
|
|
umlal2 $ACC2,$IN23_4,${S3}
|
|
|
|
b.eq .Lshort_tail
|
|
|
|
////////////////////////////////////////////////////////////////
|
|
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
|
|
|
add $IN01_0,$IN01_0,$H0
|
|
umlal $ACC3,$IN01_2,${R1}
|
|
umlal $ACC0,$IN01_2,${S3}
|
|
umlal $ACC4,$IN01_2,${R2}
|
|
umlal $ACC1,$IN01_2,${S4}
|
|
umlal $ACC2,$IN01_2,${R0}
|
|
|
|
add $IN01_1,$IN01_1,$H1
|
|
umlal $ACC3,$IN01_0,${R3}
|
|
umlal $ACC0,$IN01_0,${R0}
|
|
umlal $ACC4,$IN01_0,${R4}
|
|
umlal $ACC1,$IN01_0,${R1}
|
|
umlal $ACC2,$IN01_0,${R2}
|
|
|
|
add $IN01_3,$IN01_3,$H3
|
|
umlal $ACC3,$IN01_1,${R2}
|
|
umlal $ACC0,$IN01_1,${S4}
|
|
umlal $ACC4,$IN01_1,${R3}
|
|
umlal $ACC1,$IN01_1,${R0}
|
|
umlal $ACC2,$IN01_1,${R1}
|
|
|
|
add $IN01_4,$IN01_4,$H4
|
|
umlal $ACC3,$IN01_3,${R0}
|
|
umlal $ACC0,$IN01_3,${S2}
|
|
umlal $ACC4,$IN01_3,${R1}
|
|
umlal $ACC1,$IN01_3,${S3}
|
|
umlal $ACC2,$IN01_3,${S4}
|
|
|
|
umlal $ACC3,$IN01_4,${S4}
|
|
umlal $ACC0,$IN01_4,${S1}
|
|
umlal $ACC4,$IN01_4,${R0}
|
|
umlal $ACC1,$IN01_4,${S2}
|
|
umlal $ACC2,$IN01_4,${S3}
|
|
|
|
.Lshort_tail:
|
|
////////////////////////////////////////////////////////////////
|
|
// horizontal add
|
|
|
|
addp $ACC3,$ACC3,$ACC3
|
|
ldp d8,d9,[sp,#16] // meet ABI requirements
|
|
addp $ACC0,$ACC0,$ACC0
|
|
ldp d10,d11,[sp,#32]
|
|
addp $ACC4,$ACC4,$ACC4
|
|
ldp d12,d13,[sp,#48]
|
|
addp $ACC1,$ACC1,$ACC1
|
|
ldp d14,d15,[sp,#64]
|
|
addp $ACC2,$ACC2,$ACC2
|
|
|
|
////////////////////////////////////////////////////////////////
|
|
// lazy reduction, but without narrowing
|
|
|
|
ushr $T0.2d,$ACC3,#26
|
|
and $ACC3,$ACC3,$MASK.2d
|
|
ushr $T1.2d,$ACC0,#26
|
|
and $ACC0,$ACC0,$MASK.2d
|
|
|
|
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
|
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
|
|
|
ushr $T0.2d,$ACC4,#26
|
|
and $ACC4,$ACC4,$MASK.2d
|
|
ushr $T1.2d,$ACC1,#26
|
|
and $ACC1,$ACC1,$MASK.2d
|
|
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
|
|
|
add $ACC0,$ACC0,$T0.2d
|
|
shl $T0.2d,$T0.2d,#2
|
|
ushr $T1.2d,$ACC2,#26
|
|
and $ACC2,$ACC2,$MASK.2d
|
|
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
|
add $ACC3,$ACC3,$T1.2d // h2 -> h3
|
|
|
|
ushr $T0.2d,$ACC0,#26
|
|
and $ACC0,$ACC0,$MASK.2d
|
|
ushr $T1.2d,$ACC3,#26
|
|
and $ACC3,$ACC3,$MASK.2d
|
|
add $ACC1,$ACC1,$T0.2d // h0 -> h1
|
|
add $ACC4,$ACC4,$T1.2d // h3 -> h4
|
|
|
|
////////////////////////////////////////////////////////////////
|
|
// write the result, can be partially reduced
|
|
|
|
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
|
|
st1 {$ACC4}[0],[$ctx]
|
|
|
|
.Lno_data_neon:
|
|
.inst 0xd50323bf // autiasp
|
|
ldr x29,[sp],#80
|
|
ret
|
|
.size poly1305_blocks_neon,.-poly1305_blocks_neon
|
|
|
|
.type poly1305_emit_neon,%function
|
|
.align 5
|
|
poly1305_emit_neon:
|
|
.Lpoly1305_emit_neon:
|
|
ldr $is_base2_26,[$ctx,#24]
|
|
cbz $is_base2_26,poly1305_emit
|
|
|
|
ldp w10,w11,[$ctx] // load hash value base 2^26
|
|
ldp w12,w13,[$ctx,#8]
|
|
ldr w14,[$ctx,#16]
|
|
|
|
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
|
lsr $h1,x12,#12
|
|
adds $h0,$h0,x12,lsl#52
|
|
add $h1,$h1,x13,lsl#14
|
|
adc $h1,$h1,xzr
|
|
lsr $h2,x14,#24
|
|
adds $h1,$h1,x14,lsl#40
|
|
adc $h2,$h2,xzr // can be partially reduced...
|
|
|
|
ldp $t0,$t1,[$nonce] // load nonce
|
|
|
|
and $d0,$h2,#-4 // ... so reduce
|
|
add $d0,$d0,$h2,lsr#2
|
|
and $h2,$h2,#3
|
|
adds $h0,$h0,$d0
|
|
adcs $h1,$h1,xzr
|
|
adc $h2,$h2,xzr
|
|
|
|
adds $d0,$h0,#5 // compare to modulus
|
|
adcs $d1,$h1,xzr
|
|
adc $d2,$h2,xzr
|
|
|
|
tst $d2,#-4 // see if it's carried/borrowed
|
|
|
|
csel $h0,$h0,$d0,eq
|
|
csel $h1,$h1,$d1,eq
|
|
|
|
#ifdef __ARMEB__
|
|
ror $t0,$t0,#32 // flip nonce words
|
|
ror $t1,$t1,#32
|
|
#endif
|
|
adds $h0,$h0,$t0 // accumulate nonce
|
|
adc $h1,$h1,$t1
|
|
#ifdef __ARMEB__
|
|
rev $h0,$h0 // flip output bytes
|
|
rev $h1,$h1
|
|
#endif
|
|
stp $h0,$h1,[$mac] // write result
|
|
|
|
ret
|
|
.size poly1305_emit_neon,.-poly1305_emit_neon
|
|
|
|
.align 5
|
|
.Lzeros:
|
|
.long 0,0,0,0,0,0,0,0
|
|
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
|
.align 2
|
|
___
|
|
|
|
foreach (split("\n",$code)) {
|
|
s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
|
|
s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
|
|
(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
|
|
(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
|
|
(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
|
|
(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
|
|
(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
|
|
|
|
s/\.[124]([sd])\[/.$1\[/;
|
|
|
|
print $_,"\n";
|
|
}
|
|
close STDOUT;
|