mirror of
https://github.com/openssl/openssl.git
synced 2024-12-03 05:41:46 +08:00
32be631ca1
If one of the perlasm xlate drivers crashes, OpenSSL's build will currently swallow the error and silently truncate the output to however far the driver got. This will hopefully fail to build, but better to check such things. Handle this by checking for errors when closing STDOUT (which is a pipe to the xlate driver). Reviewed-by: Richard Levitte <levitte@openssl.org> Reviewed-by: Tim Hudson <tjh@openssl.org> Reviewed-by: Tomas Mraz <tmraz@fedoraproject.org> (Merged from https://github.com/openssl/openssl/pull/10883)
767 lines
20 KiB
Prolog
767 lines
20 KiB
Prolog
#! /usr/bin/env perl
|
||
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
|
||
#
|
||
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
||
# this file except in compliance with the License. You can obtain a copy
|
||
# in the file LICENSE in the source distribution or at
|
||
# https://www.openssl.org/source/license.html
|
||
|
||
|
||
# ====================================================================
|
||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
# project. The module is, however, dual licensed under OpenSSL and
|
||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||
# ====================================================================
|
||
|
||
# January 2007.
|
||
|
||
# Montgomery multiplication for ARMv4.
|
||
#
|
||
# Performance improvement naturally varies among CPU implementations
|
||
# and compilers. The code was observed to provide +65-35% improvement
|
||
# [depending on key length, less for longer keys] on ARM920T, and
|
||
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
|
||
# base and compiler generated code with in-lined umull and even umlal
|
||
# instructions. The latter means that this code didn't really have an
|
||
# "advantage" of utilizing some "secret" instruction.
|
||
#
|
||
# The code is interoperable with Thumb ISA and is rather compact, less
|
||
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
|
||
# about decorations, ABI and instruction syntax are identical.
|
||
|
||
# November 2013
|
||
#
|
||
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
|
||
# performance improvement on Cortex-A8 is ~45-100% depending on key
|
||
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
|
||
# On Snapdragon S4 improvement was measured to vary from ~70% to
|
||
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
|
||
# rather because original integer-only code seems to perform
|
||
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
|
||
# different. It's being looked into, but the trouble is that
|
||
# performance for vectors longer than 256 bits is actually a couple
|
||
# of percent worse than for integer-only code. The code is chosen
|
||
# for execution on all NEON-capable processors, because gain on
|
||
# others outweighs the marginal loss on Cortex-A9.
|
||
|
||
# September 2015
|
||
#
|
||
# Align Cortex-A9 performance with November 2013 improvements, i.e.
|
||
# NEON code is now ~20-105% faster than integer-only one on this
|
||
# processor. But this optimization further improved performance even
|
||
# on other processors: NEON code path is ~45-180% faster than original
|
||
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
|
||
# Snapdragon S4.
|
||
|
||
# $output is the last argument if it looks like a file (it has an extension)
|
||
# $flavour is the first argument if it doesn't look like a file
|
||
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
|
||
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
|
||
|
||
# Decide where the generated assembly is written.
#
# With a real flavour (anything other than "void") STDOUT is re-opened as a
# pipe into arm-xlate.pl, which is handed the flavour and the output name on
# its command line.  Otherwise STDOUT is redirected straight to $output when
# one was given, and left untouched when it was not.
if ($flavour && $flavour ne "void") {
    # Directory part of this script's path, trailing separator included;
    # empty when $0 has no directory component.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

    # Look for the translator next to this script first, then under
    # ../../perlasm/ relative to it.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Report the OS error if the pipe to the translator cannot be set up.
    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    # No translator: write directly to the requested file, and fail loudly
    # rather than silently emitting to the inherited STDOUT.
    if ($output) {
        open STDOUT, '>', $output
            or die "can't open $output: $!";
    }
}
|
||
|
||
# Register allocation for the integer-only bn_mul_mont template.
# r9, r13 and r15 are deliberately left out of the map:
#   r9  - reserved by ELF as platform specific, e.g. TLS pointer
#   r13 - stack pointer
#   r15 - program counter
($num, $ap, $np) = ("r0", "r1", "r3");	# $num starts as num argument, but holds &tp[num-1]
$bp = $bi = $rp = "r2";			# three names, one register
($tp, $aj, $nj, $tj, $n0) = map("r$_", 4 .. 8);
($alo, $ahi, $nlo) = map("r$_", 10 .. 12);	# sl (gcc uses it to keep @GOT), fp, ip
$nhi = "r14";				# lr

# Argument block layout relative to &tp[num-1], a.k.a. $num.
# ap and np have no slot here: they permanently reside in r1 and r3.
($_rp, $_bp, $_n0, $_num) = map("$num,#$_*4", 12 .. 15);
$_bpend = $_num;			# same slot as $_num
|
||
|
||
$code=<<___;
|
||
#include "arm_arch.h"
|
||
|
||
#if defined(__thumb2__)
|
||
.syntax unified
|
||
.thumb
|
||
#else
|
||
.code 32
|
||
#endif
|
||
|
||
.text
|
||
|
||
#if __ARM_MAX_ARCH__>=7
|
||
.align 5
|
||
.LOPENSSL_armcap:
|
||
# ifdef _WIN32
|
||
.word OPENSSL_armcap_P
|
||
# else
|
||
.word OPENSSL_armcap_P-.Lbn_mul_mont
|
||
# endif
|
||
#endif
|
||
|
||
.global bn_mul_mont
|
||
.type bn_mul_mont,%function
|
||
|
||
.align 5
|
||
bn_mul_mont:
|
||
.Lbn_mul_mont:
|
||
ldr ip,[sp,#4] @ load num
|
||
stmdb sp!,{r0,r2} @ sp points at argument block
|
||
#if __ARM_MAX_ARCH__>=7
|
||
tst ip,#7
|
||
bne .Lialu
|
||
ldr r0,.LOPENSSL_armcap
|
||
#if !defined(_WIN32)
|
||
adr r2,.Lbn_mul_mont
|
||
ldr r0,[r0,r2]
|
||
# endif
|
||
# if defined(__APPLE__) || defined(_WIN32)
|
||
ldr r0,[r0]
|
||
# endif
|
||
tst r0,#ARMV7_NEON @ NEON available?
|
||
ldmia sp, {r0,r2}
|
||
beq .Lialu
|
||
add sp,sp,#8
|
||
b bn_mul8x_mont_neon
|
||
.align 4
|
||
.Lialu:
|
||
#endif
|
||
cmp ip,#2
|
||
mov $num,ip @ load num
|
||
#ifdef __thumb2__
|
||
ittt lt
|
||
#endif
|
||
movlt r0,#0
|
||
addlt sp,sp,#2*4
|
||
blt .Labrt
|
||
|
||
stmdb sp!,{r4-r12,lr} @ save 10 registers
|
||
|
||
mov $num,$num,lsl#2 @ rescale $num for byte count
|
||
sub sp,sp,$num @ alloca(4*num)
|
||
sub sp,sp,#4 @ +extra dword
|
||
sub $num,$num,#4 @ "num=num-1"
|
||
add $tp,$bp,$num @ &bp[num-1]
|
||
|
||
add $num,sp,$num @ $num to point at &tp[num-1]
|
||
ldr $n0,[$_n0] @ &n0
|
||
ldr $bi,[$bp] @ bp[0]
|
||
ldr $aj,[$ap],#4 @ ap[0],ap++
|
||
ldr $nj,[$np],#4 @ np[0],np++
|
||
ldr $n0,[$n0] @ *n0
|
||
str $tp,[$_bpend] @ save &bp[num]
|
||
|
||
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
|
||
str $n0,[$_n0] @ save n0 value
|
||
mul $n0,$alo,$n0 @ "tp[0]"*n0
|
||
mov $nlo,#0
|
||
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
|
||
mov $tp,sp
|
||
|
||
.L1st:
|
||
ldr $aj,[$ap],#4 @ ap[j],ap++
|
||
mov $alo,$ahi
|
||
ldr $nj,[$np],#4 @ np[j],np++
|
||
mov $ahi,#0
|
||
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
|
||
mov $nhi,#0
|
||
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
|
||
adds $nlo,$nlo,$alo
|
||
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
|
||
adc $nlo,$nhi,#0
|
||
cmp $tp,$num
|
||
bne .L1st
|
||
|
||
adds $nlo,$nlo,$ahi
|
||
ldr $tp,[$_bp] @ restore bp
|
||
mov $nhi,#0
|
||
ldr $n0,[$_n0] @ restore n0
|
||
adc $nhi,$nhi,#0
|
||
str $nlo,[$num] @ tp[num-1]=
|
||
mov $tj,sp
|
||
str $nhi,[$num,#4] @ tp[num]=
|
||
|
||
.Louter:
|
||
sub $tj,$num,$tj @ "original" $num-1 value
|
||
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
|
||
ldr $bi,[$tp,#4]! @ *(++bp)
|
||
sub $np,$np,$tj @ "rewind" np to &np[1]
|
||
ldr $aj,[$ap,#-4] @ ap[0]
|
||
ldr $alo,[sp] @ tp[0]
|
||
ldr $nj,[$np,#-4] @ np[0]
|
||
ldr $tj,[sp,#4] @ tp[1]
|
||
|
||
mov $ahi,#0
|
||
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
|
||
str $tp,[$_bp] @ save bp
|
||
mul $n0,$alo,$n0
|
||
mov $nlo,#0
|
||
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
|
||
mov $tp,sp
|
||
|
||
.Linner:
|
||
ldr $aj,[$ap],#4 @ ap[j],ap++
|
||
adds $alo,$ahi,$tj @ +=tp[j]
|
||
ldr $nj,[$np],#4 @ np[j],np++
|
||
mov $ahi,#0
|
||
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
|
||
mov $nhi,#0
|
||
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
|
||
adc $ahi,$ahi,#0
|
||
ldr $tj,[$tp,#8] @ tp[j+1]
|
||
adds $nlo,$nlo,$alo
|
||
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
|
||
adc $nlo,$nhi,#0
|
||
cmp $tp,$num
|
||
bne .Linner
|
||
|
||
adds $nlo,$nlo,$ahi
|
||
mov $nhi,#0
|
||
ldr $tp,[$_bp] @ restore bp
|
||
adc $nhi,$nhi,#0
|
||
ldr $n0,[$_n0] @ restore n0
|
||
adds $nlo,$nlo,$tj
|
||
ldr $tj,[$_bpend] @ restore &bp[num]
|
||
adc $nhi,$nhi,#0
|
||
str $nlo,[$num] @ tp[num-1]=
|
||
str $nhi,[$num,#4] @ tp[num]=
|
||
|
||
cmp $tp,$tj
|
||
#ifdef __thumb2__
|
||
itt ne
|
||
#endif
|
||
movne $tj,sp
|
||
bne .Louter
|
||
|
||
ldr $rp,[$_rp] @ pull rp
|
||
mov $aj,sp
|
||
add $num,$num,#4 @ $num to point at &tp[num]
|
||
sub $aj,$num,$aj @ "original" num value
|
||
mov $tp,sp @ "rewind" $tp
|
||
mov $ap,$tp @ "borrow" $ap
|
||
sub $np,$np,$aj @ "rewind" $np to &np[0]
|
||
|
||
subs $tj,$tj,$tj @ "clear" carry flag
|
||
.Lsub: ldr $tj,[$tp],#4
|
||
ldr $nj,[$np],#4
|
||
sbcs $tj,$tj,$nj @ tp[j]-np[j]
|
||
str $tj,[$rp],#4 @ rp[j]=
|
||
teq $tp,$num @ preserve carry
|
||
bne .Lsub
|
||
sbcs $nhi,$nhi,#0 @ upmost carry
|
||
mov $tp,sp @ "rewind" $tp
|
||
sub $rp,$rp,$aj @ "rewind" $rp
|
||
|
||
.Lcopy: ldr $tj,[$tp] @ conditional copy
|
||
ldr $aj,[$rp]
|
||
str sp,[$tp],#4 @ zap tp
|
||
#ifdef __thumb2__
|
||
it cc
|
||
#endif
|
||
movcc $aj,$tj
|
||
str $aj,[$rp],#4
|
||
teq $tp,$num @ preserve carry
|
||
bne .Lcopy
|
||
|
||
mov sp,$num
|
||
add sp,sp,#4 @ skip over tp[num+1]
|
||
ldmia sp!,{r4-r12,lr} @ restore registers
|
||
add sp,sp,#2*4 @ skip over {r0,r2}
|
||
mov r0,#1
|
||
.Labrt:
|
||
#if __ARM_ARCH__>=5
|
||
ret @ bx lr
|
||
#else
|
||
tst lr,#1
|
||
moveq pc,lr @ be binary compatible with V4, yet
|
||
bx lr @ interoperable with Thumb ISA:-)
|
||
#endif
|
||
.size bn_mul_mont,.-bn_mul_mont
|
||
___
|
||
{
|
||
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
|
||
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
|
||
my ($Z,$Temp)=("q4","q5");
|
||
my @ACC=map("q$_",(6..13));
|
||
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
|
||
my $zero="$Z#lo";
|
||
my $temp="$Temp#lo";
|
||
|
||
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
|
||
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
|
||
|
||
$code.=<<___;
|
||
#if __ARM_MAX_ARCH__>=7
|
||
.arch armv7-a
|
||
.fpu neon
|
||
|
||
.type bn_mul8x_mont_neon,%function
|
||
.align 5
|
||
bn_mul8x_mont_neon:
|
||
mov ip,sp
|
||
stmdb sp!,{r4-r11}
|
||
vstmdb sp!,{d8-d15} @ ABI specification says so
|
||
ldmia ip,{r4-r5} @ load rest of parameter block
|
||
mov ip,sp
|
||
|
||
cmp $num,#8
|
||
bhi .LNEON_8n
|
||
|
||
@ special case for $num==8, everything is in register bank...
|
||
|
||
vld1.32 {${Bi}[0]}, [$bptr,:32]!
|
||
veor $zero,$zero,$zero
|
||
sub $toutptr,sp,$num,lsl#4
|
||
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
|
||
and $toutptr,$toutptr,#-64
|
||
vld1.32 {${M0}[0]}, [$n0,:32]
|
||
mov sp,$toutptr @ alloca
|
||
vzip.16 $Bi,$zero
|
||
|
||
vmull.u32 @ACC[0],$Bi,${A0}[0]
|
||
vmull.u32 @ACC[1],$Bi,${A0}[1]
|
||
vmull.u32 @ACC[2],$Bi,${A1}[0]
|
||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||
vmull.u32 @ACC[3],$Bi,${A1}[1]
|
||
|
||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||
veor $zero,$zero,$zero
|
||
vmul.u32 $Ni,$Ni,$M0
|
||
|
||
vmull.u32 @ACC[4],$Bi,${A2}[0]
|
||
vld1.32 {$N0-$N3}, [$nptr]!
|
||
vmull.u32 @ACC[5],$Bi,${A2}[1]
|
||
vmull.u32 @ACC[6],$Bi,${A3}[0]
|
||
vzip.16 $Ni,$zero
|
||
vmull.u32 @ACC[7],$Bi,${A3}[1]
|
||
|
||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||
sub $outer,$num,#1
|
||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||
|
||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||
vmov $Temp,@ACC[0]
|
||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||
vmov @ACC[0],@ACC[1]
|
||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||
vmov @ACC[1],@ACC[2]
|
||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||
vmov @ACC[2],@ACC[3]
|
||
vmov @ACC[3],@ACC[4]
|
||
vshr.u64 $temp,$temp,#16
|
||
vmov @ACC[4],@ACC[5]
|
||
vmov @ACC[5],@ACC[6]
|
||
vadd.u64 $temp,$temp,$Temp#hi
|
||
vmov @ACC[6],@ACC[7]
|
||
veor @ACC[7],@ACC[7]
|
||
vshr.u64 $temp,$temp,#16
|
||
|
||
b .LNEON_outer8
|
||
|
||
.align 4
|
||
.LNEON_outer8:
|
||
vld1.32 {${Bi}[0]}, [$bptr,:32]!
|
||
veor $zero,$zero,$zero
|
||
vzip.16 $Bi,$zero
|
||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
|
||
|
||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||
|
||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||
veor $zero,$zero,$zero
|
||
subs $outer,$outer,#1
|
||
vmul.u32 $Ni,$Ni,$M0
|
||
|
||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||
vzip.16 $Ni,$zero
|
||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||
|
||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||
|
||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||
vmov $Temp,@ACC[0]
|
||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||
vmov @ACC[0],@ACC[1]
|
||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||
vmov @ACC[1],@ACC[2]
|
||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||
vmov @ACC[2],@ACC[3]
|
||
vmov @ACC[3],@ACC[4]
|
||
vshr.u64 $temp,$temp,#16
|
||
vmov @ACC[4],@ACC[5]
|
||
vmov @ACC[5],@ACC[6]
|
||
vadd.u64 $temp,$temp,$Temp#hi
|
||
vmov @ACC[6],@ACC[7]
|
||
veor @ACC[7],@ACC[7]
|
||
vshr.u64 $temp,$temp,#16
|
||
|
||
bne .LNEON_outer8
|
||
|
||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
|
||
mov $toutptr,sp
|
||
vshr.u64 $temp,@ACC[0]#lo,#16
|
||
mov $inner,$num
|
||
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
|
||
add $tinptr,sp,#96
|
||
vshr.u64 $temp,@ACC[0]#hi,#16
|
||
vzip.16 @ACC[0]#lo,@ACC[0]#hi
|
||
|
||
b .LNEON_tail_entry
|
||
|
||
.align 4
|
||
.LNEON_8n:
|
||
veor @ACC[0],@ACC[0],@ACC[0]
|
||
sub $toutptr,sp,#128
|
||
veor @ACC[1],@ACC[1],@ACC[1]
|
||
sub $toutptr,$toutptr,$num,lsl#4
|
||
veor @ACC[2],@ACC[2],@ACC[2]
|
||
and $toutptr,$toutptr,#-64
|
||
veor @ACC[3],@ACC[3],@ACC[3]
|
||
mov sp,$toutptr @ alloca
|
||
veor @ACC[4],@ACC[4],@ACC[4]
|
||
add $toutptr,$toutptr,#256
|
||
veor @ACC[5],@ACC[5],@ACC[5]
|
||
sub $inner,$num,#8
|
||
veor @ACC[6],@ACC[6],@ACC[6]
|
||
veor @ACC[7],@ACC[7],@ACC[7]
|
||
|
||
.LNEON_8n_init:
|
||
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
|
||
subs $inner,$inner,#8
|
||
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
|
||
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
|
||
vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
|
||
bne .LNEON_8n_init
|
||
|
||
add $tinptr,sp,#256
|
||
vld1.32 {$A0-$A3},[$aptr]!
|
||
add $bnptr,sp,#8
|
||
vld1.32 {${M0}[0]},[$n0,:32]
|
||
mov $outer,$num
|
||
b .LNEON_8n_outer
|
||
|
||
.align 4
|
||
.LNEON_8n_outer:
|
||
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
|
||
veor $zero,$zero,$zero
|
||
vzip.16 $Bi,$zero
|
||
add $toutptr,sp,#128
|
||
vld1.32 {$N0-$N3},[$nptr]!
|
||
|
||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||
veor $zero,$zero,$zero
|
||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||
vmul.u32 $Ni,$Ni,$M0
|
||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||
vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
|
||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||
vzip.16 $Ni,$zero
|
||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||
___
|
||
for ($i=0; $i<7;) {
|
||
$code.=<<___;
|
||
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
|
||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||
veor $temp,$temp,$temp
|
||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||
vzip.16 $Bi,$temp
|
||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
|
||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
|
||
vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
|
||
___
|
||
push(@ACC,shift(@ACC)); $i++;
|
||
$code.=<<___;
|
||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||
vld1.64 {@ACC[7]},[$tinptr,:128]!
|
||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||
veor $zero,$zero,$zero
|
||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||
vmul.u32 $Ni,$Ni,$M0
|
||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||
vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
|
||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||
vzip.16 $Ni,$zero
|
||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||
___
|
||
}
|
||
$code.=<<___;
|
||
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
|
||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||
vld1.32 {$A0-$A3},[$aptr]!
|
||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
|
||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
|
||
vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
|
||
add $bnptr,sp,#8 @ rewind
|
||
___
|
||
push(@ACC,shift(@ACC));
|
||
$code.=<<___;
|
||
sub $inner,$num,#8
|
||
b .LNEON_8n_inner
|
||
|
||
.align 4
|
||
.LNEON_8n_inner:
|
||
subs $inner,$inner,#8
|
||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||
vld1.64 {@ACC[7]},[$tinptr,:128]
|
||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
|
||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||
vld1.32 {$N0-$N3},[$nptr]!
|
||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||
it ne
|
||
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
|
||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||
___
|
||
for ($i=1; $i<8; $i++) {
|
||
$code.=<<___;
|
||
vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
|
||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||
vst1.64 {@ACC[0]},[$toutptr,:128]!
|
||
___
|
||
push(@ACC,shift(@ACC));
|
||
$code.=<<___;
|
||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||
vld1.64 {@ACC[7]},[$tinptr,:128]
|
||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
|
||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||
it ne
|
||
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
|
||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||
___
|
||
}
|
||
$code.=<<___;
|
||
it eq
|
||
subeq $aptr,$aptr,$num,lsl#2 @ rewind
|
||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
|
||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||
vld1.32 {$A0-$A3},[$aptr]!
|
||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||
add $bnptr,sp,#8 @ rewind
|
||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||
vst1.64 {@ACC[0]},[$toutptr,:128]!
|
||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||
|
||
bne .LNEON_8n_inner
|
||
___
|
||
push(@ACC,shift(@ACC));
|
||
$code.=<<___;
|
||
add $tinptr,sp,#128
|
||
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
|
||
veor q2,q2,q2 @ $N0-$N1
|
||
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
|
||
veor q3,q3,q3 @ $N2-$N3
|
||
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
|
||
vst1.64 {@ACC[6]},[$toutptr,:128]
|
||
|
||
subs $outer,$outer,#8
|
||
vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
|
||
vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
|
||
vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
|
||
vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
|
||
|
||
itt ne
|
||
subne $nptr,$nptr,$num,lsl#2 @ rewind
|
||
bne .LNEON_8n_outer
|
||
|
||
add $toutptr,sp,#128
|
||
vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
|
||
vshr.u64 $temp,@ACC[0]#lo,#16
|
||
vst1.64 {q2-q3},[sp,:256]!
|
||
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
|
||
vst1.64 {q2-q3}, [sp,:256]!
|
||
vshr.u64 $temp,@ACC[0]#hi,#16
|
||
vst1.64 {q2-q3}, [sp,:256]!
|
||
vzip.16 @ACC[0]#lo,@ACC[0]#hi
|
||
|
||
mov $inner,$num
|
||
b .LNEON_tail_entry
|
||
|
||
.align 4
|
||
.LNEON_tail:
|
||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
|
||
vshr.u64 $temp,@ACC[0]#lo,#16
|
||
vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
|
||
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
|
||
vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
|
||
vshr.u64 $temp,@ACC[0]#hi,#16
|
||
vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
|
||
vzip.16 @ACC[0]#lo,@ACC[0]#hi
|
||
|
||
.LNEON_tail_entry:
|
||
___
|
||
for ($i=1; $i<8; $i++) {
|
||
$code.=<<___;
|
||
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
|
||
vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
|
||
vshr.u64 $temp,@ACC[1]#lo,#16
|
||
vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
|
||
vshr.u64 $temp,@ACC[1]#hi,#16
|
||
vzip.16 @ACC[1]#lo,@ACC[1]#hi
|
||
___
|
||
push(@ACC,shift(@ACC));
|
||
}
|
||
push(@ACC,shift(@ACC));
|
||
$code.=<<___;
|
||
vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
|
||
subs $inner,$inner,#8
|
||
vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
|
||
bne .LNEON_tail
|
||
|
||
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
|
||
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
|
||
subs $aptr,sp,#0 @ clear carry flag
|
||
add $bptr,sp,$num,lsl#2
|
||
|
||
.LNEON_sub:
|
||
ldmia $aptr!, {r4-r7}
|
||
ldmia $nptr!, {r8-r11}
|
||
sbcs r8, r4,r8
|
||
sbcs r9, r5,r9
|
||
sbcs r10,r6,r10
|
||
sbcs r11,r7,r11
|
||
teq $aptr,$bptr @ preserves carry
|
||
stmia $rptr!, {r8-r11}
|
||
bne .LNEON_sub
|
||
|
||
ldr r10, [$aptr] @ load top-most bit
|
||
mov r11,sp
|
||
veor q0,q0,q0
|
||
sub r11,$bptr,r11 @ this is num*4
|
||
veor q1,q1,q1
|
||
mov $aptr,sp
|
||
sub $rptr,$rptr,r11 @ rewind $rptr
|
||
mov $nptr,$bptr @ second 3/4th of frame
|
||
sbcs r10,r10,#0 @ result is carry flag
|
||
|
||
.LNEON_copy_n_zap:
|
||
ldmia $aptr!, {r4-r7}
|
||
ldmia $rptr, {r8-r11}
|
||
it cc
|
||
movcc r8, r4
|
||
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
|
||
itt cc
|
||
movcc r9, r5
|
||
movcc r10,r6
|
||
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
|
||
it cc
|
||
movcc r11,r7
|
||
ldmia $aptr, {r4-r7}
|
||
stmia $rptr!, {r8-r11}
|
||
sub $aptr,$aptr,#16
|
||
ldmia $rptr, {r8-r11}
|
||
it cc
|
||
movcc r8, r4
|
||
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
|
||
itt cc
|
||
movcc r9, r5
|
||
movcc r10,r6
|
||
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
|
||
it cc
|
||
movcc r11,r7
|
||
teq $aptr,$bptr @ preserves carry
|
||
stmia $rptr!, {r8-r11}
|
||
bne .LNEON_copy_n_zap
|
||
|
||
mov sp,ip
|
||
vldmia sp!,{d8-d15}
|
||
ldmia sp!,{r4-r11}
|
||
ret @ bx lr
|
||
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
|
||
#endif
|
||
___
|
||
}
|
||
$code.=<<___;
|
||
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
|
||
.align 2
|
||
#if __ARM_MAX_ARCH__>=7
|
||
.comm OPENSSL_armcap_P,4,4
|
||
#endif
|
||
___
|
||
|
||
# Post-process the accumulated template one line at a time and print it.
foreach (split("\n",$code)) {
    # Evaluate any `...` span as Perl and splice the result into the line.
    s/\`([^\`]*)\`/eval $1/ge;

    # The rewrites below are chained with "or", so only the first one that
    # matches is applied to a given line:
    #   qN#lo / qN#hi  ->  d(2N) / d(2N+1)
    #   ret            ->  bx lr
    #   bx lr          ->  literal .word encoding
    s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
    s/\bret\b/bx lr/g or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4

    print $_,"\n";
}

# STDOUT may be a pipe to the xlate driver.  close() then fails either on an
# OS error ($! set) or because the child exited non-zero ($! is 0 and the
# wait status is in $?); report whichever applies so a truncated output file
# is never mistaken for success.
close STDOUT
    or die "error closing STDOUT: " . ($! ? $! : "child exited with wait status $?");
|