2016-05-21 20:23:39 +08:00
|
|
|
|
#! /usr/bin/env perl
|
2021-07-29 22:41:35 +08:00
|
|
|
|
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
|
2016-05-21 20:23:39 +08:00
|
|
|
|
#
|
2018-12-06 20:22:12 +08:00
|
|
|
|
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
2016-05-21 20:23:39 +08:00
|
|
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
|
|
|
# in the file LICENSE in the source distribution or at
|
|
|
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
|
|
2005-12-16 06:43:33 +08:00
|
|
|
|
|
|
|
|
|
# ====================================================================
|
2017-10-11 05:55:09 +08:00
|
|
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
2007-06-18 01:10:03 +08:00
|
|
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
|
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
|
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
2005-12-16 06:43:33 +08:00
|
|
|
|
# ====================================================================
|
|
|
|
|
|
|
|
|
|
# December 2005
|
|
|
|
|
#
|
|
|
|
|
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
|
|
|
|
|
# for undertaken effort are multiple. First of all, UltraSPARC is not
|
|
|
|
|
# the whole SPARCv9 universe and other VIS-free implementations deserve
|
|
|
|
|
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
|
2016-08-06 01:56:58 +08:00
|
|
|
|
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
|
2005-12-16 06:43:33 +08:00
|
|
|
|
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
|
|
|
|
|
# several integrated RSA/DSA accelerator circuits accessible through
|
|
|
|
|
# kernel driver [only(*)], but having decent user-land software
|
|
|
|
|
# implementation is important too. Finally, there was the desire to
|
|
|
|
|
# experiment with dedicated squaring procedure. Yes, this module
|
|
|
|
|
# implements one, because it was easiest to draft it in SPARCv9
|
|
|
|
|
# instructions...
|
|
|
|
|
|
|
|
|
|
# (*) Engine accessing the driver in question is on my TODO list.
|
2017-03-29 05:57:28 +08:00
|
|
|
|
# For reference, accelerator is estimated to give 6 to 10 times
|
2005-12-16 06:43:33 +08:00
|
|
|
|
# improvement on single-threaded RSA sign. It should be noted
|
|
|
|
|
# that 6-10x improvement coefficient does not actually mean
|
|
|
|
|
# something extraordinary in terms of absolute [single-threaded]
|
|
|
|
|
# performance, as SPARCv9 instruction set is by all means least
|
|
|
|
|
# suitable for high performance crypto among other 64 bit
|
|
|
|
|
# platforms. 6-10x factor simply places T1 in same performance
|
|
|
|
|
# domain as, say, AMD64 and IA-64. Improvement of RSA verify doesn't
|
|
|
|
|
# appear impressive at all, but it's the sign operation which is
|
|
|
|
|
# far more critical/interesting.
|
|
|
|
|
|
|
|
|
|
# You might notice that inner loops are modulo-scheduled:-) This has
|
|
|
|
|
# essentially negligible impact on UltraSPARC performance, it's
|
|
|
|
|
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
|
|
|
|
|
# the advantage... Currently this module surpasses sparcv9a-mont.pl
|
|
|
|
|
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
|
|
|
|
|
# module still has hidden potential [see TODO list there], which is
|
|
|
|
|
# estimated to be larger than 20%...
|
|
|
|
|
|
Unify all assembler file generators
They now generally conform to the following argument sequence:
script.pl "$(PERLASM_SCHEME)" [ C preprocessor arguments ... ] \
$(PROCESSOR) <output file>
However, in the spirit of being able to use these scripts manually,
they also allow for no argument, or for only the flavour, or for only
the output file. This is done by only using the last argument as
output file if it's a file (it has an extension), and only using the
first argument as flavour if it isn't a file (it doesn't have an
extension).
While we're at it, we make all $xlate calls the same, i.e. the $output
argument is always quoted, and we always die on error when trying to
start $xlate.
There's a perl lesson in this, regarding operator priority...
This will always succeed, even when it fails:
open FOO, "something" || die "ERR: $!";
The reason is that '||' has higher priority than list operators (a
function is essentially a list operator and gobbles up everything
following it that isn't lower priority), and since a non-empty string
is always true, so that ends up being exactly the same as:
open FOO, "something";
This, however, will fail if "something" can't be opened:
open FOO, "something" or die "ERR: $!";
The reason is that 'or' has lower priority than list operators,
i.e. it's performed after the 'open' call.
Reviewed-by: Matt Caswell <matt@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/9884)
2019-09-13 06:06:46 +08:00
|
|
|
|
# The last command-line argument, if any, names the output file; when no
# (true) argument is given the generated assembly goes to the inherited
# STDOUT. The 3-argument form of open keeps the file name from being parsed
# as part of the mode, and a failed redirect is now fatal instead of
# silently writing to the old STDOUT. The parentheses matter: without them
# 'or die' would also fire when there is simply no argument to pop.
$output = pop and (open STDOUT, ">", $output or die "can't open $output: $!");
|
2016-03-07 22:41:33 +08:00
|
|
|
|
|
2005-12-16 06:43:33 +08:00
|
|
|
|
# Register aliases for the C-level arguments. The %i names are the view
# after the `save` at .Lenter; the size check emitted before that `save`
# reads the sixth argument as %o5 instead.
# int bn_mul_mont(
|
|
|
|
|
$rp="%i0"; # BN_ULONG *rp,
|
|
|
|
|
$ap="%i1"; # const BN_ULONG *ap,
|
|
|
|
|
$bp="%i2"; # const BN_ULONG *bp,
|
|
|
|
|
$np="%i3"; # const BN_ULONG *np,
|
|
|
|
|
$n0="%i4"; # const BN_ULONG *n0,
|
|
|
|
|
$num="%i5"; # int num);
|
|
|
|
|
|
2016-03-08 16:46:19 +08:00
|
|
|
|
# Symbolic frame size and stack bias. These names are interpolated verbatim
# into the assembly text, so they have to be resolved when that text is
# preprocessed (presumably by crypto/sparc_arch.h, which the generated
# output #includes -- TODO confirm).
$frame="STACK_FRAME";
|
|
|
|
|
$bias="STACK_BIAS";
|
2005-12-16 06:43:33 +08:00
|
|
|
|
|
|
|
|
|
# Scratch registers. The role notes describe the generic multiplication
# loops; the squaring path reuses some of them differently.
$car0="%o0"; # running sum/carry of the ap[j]*mul0 products
|
|
|
|
|
$car1="%o1"; # running sum/carry of the np[j]*mul1 products
|
|
|
|
|
$car2="%o2"; # 1 bit (topmost overflow, carried into the next outer pass)
|
|
|
|
|
$acc0="%o3"; # current ap[] product, then its low 32 bits
|
|
|
|
|
$acc1="%o4"; # current np[] product
|
|
|
|
|
$mask="%g1"; # 32 bits, what a waste... (loaded with 0xffffffff)
|
|
|
|
|
$tmp0="%g4"; # next ap[] product, computed one iteration ahead
|
|
|
|
|
$tmp1="%g5"; # next np[] product, computed one iteration ahead
|
|
|
|
|
|
|
|
|
|
# Window-local registers used as loop state by the generated code.
$i="%l0"; # outer-loop byte offset, steps by 4 (reused as an address in .Lsub)
|
|
|
|
|
$j="%l1"; # inner-loop byte offset, steps by 4
|
|
|
|
|
$mul0="%l2"; # current multiplier word: bp[i], or an ap[] word when squaring
|
|
|
|
|
$mul1="%l3"; # per-row factor: low word of the row times n0, masked to 32 bits
|
|
|
|
|
$tp="%l4"; # cursor into the temporary vector allocated on the stack
|
|
|
|
|
$apj="%l5"; # ap[j], loaded ahead of use
|
|
|
|
|
$npj="%l6"; # np[j], loaded ahead of use
|
|
|
|
|
$tpj="%l7"; # tp[j], loaded ahead of use
|
|
|
|
|
|
2005-12-17 01:39:57 +08:00
|
|
|
|
# Symbol name of the generated routine; interpolated into the .global
# directive, the entry label, and the closing .type/.size directives.
$fname="bn_mul_mont_int";
|
2005-12-16 06:43:33 +08:00
|
|
|
|
|
|
|
|
|
# First part of the generated assembly: the entry stub, the generic
# Montgomery multiplication loops (.L1st, .Louter/.Linner) and the shared
# final subtract/conditional-copy tail (.Ltail, .Lsub, .Lcopy).
# The emitted routine returns 0 without touching its arguments when num < 4
# and returns 1 after the copy-out otherwise.
# Register aliases defined above are interpolated here; STACK_FRAME,
# STACK_BIAS and SIZE_T_CC pass through as plain text and are presumably
# resolved by the C preprocessor via crypto/sparc_arch.h (TODO confirm).
$code=<<___;
|
2021-07-07 23:47:06 +08:00
|
|
|
|
#ifndef __ASSEMBLER__
|
|
|
|
|
# define __ASSEMBLER__ 1
|
|
|
|
|
#endif
|
|
|
|
|
#include "crypto/sparc_arch.h"
|
2016-03-08 16:46:19 +08:00
|
|
|
|
|
2005-12-16 06:43:33 +08:00
|
|
|
|
.section ".text",#alloc,#execinstr
|
|
|
|
|
|
|
|
|
|
.global $fname
|
|
|
|
|
.align 32
|
|
|
|
|
$fname:
|
|
|
|
|
cmp %o5,4 ! 128 bits minimum
|
|
|
|
|
bge,pt %icc,.Lenter
|
|
|
|
|
sethi %hi(0xffffffff),$mask
|
|
|
|
|
retl
|
|
|
|
|
clr %o0
|
|
|
|
|
.align 32
|
|
|
|
|
.Lenter:
|
|
|
|
|
save %sp,-$frame,%sp
|
|
|
|
|
sll $num,2,$num ! num*=4
|
|
|
|
|
or $mask,%lo(0xffffffff),$mask
|
|
|
|
|
ld [$n0],$n0
|
|
|
|
|
cmp $ap,$bp
|
|
|
|
|
and $num,$mask,$num
|
|
|
|
|
ld [$bp],$mul0 ! bp[0]
|
|
|
|
|
nop
|
|
|
|
|
|
|
|
|
|
add %sp,$bias,%o7 ! real top of stack
|
2005-12-28 05:27:39 +08:00
|
|
|
|
ld [$ap],$car0 ! ap[0] ! redundant in squaring context
|
2005-12-16 06:43:33 +08:00
|
|
|
|
sub %o7,$num,%o7
|
|
|
|
|
ld [$ap+4],$apj ! ap[1]
|
|
|
|
|
and %o7,-1024,%o7
|
|
|
|
|
ld [$np],$car1 ! np[0]
|
|
|
|
|
sub %o7,$bias,%sp ! alloca
|
|
|
|
|
ld [$np+4],$npj ! np[1]
|
2016-03-08 16:46:19 +08:00
|
|
|
|
be,pt SIZE_T_CC,.Lbn_sqr_mont
|
2005-12-16 06:43:33 +08:00
|
|
|
|
mov 12,$j
|
|
|
|
|
|
|
|
|
|
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
|
|
|
|
|
mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
add %sp,$bias+$frame,$tp
|
|
|
|
|
ld [$ap+8],$apj !prologue!
|
|
|
|
|
|
|
|
|
|
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
|
|
|
|
|
and $mul1,$mask,$mul1
|
|
|
|
|
|
|
|
|
|
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
|
|
|
|
|
mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
ld [$np+8],$npj !prologue!
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
mov $tmp0,$acc0 !prologue!
|
|
|
|
|
|
|
|
|
|
.L1st:
|
|
|
|
|
mulx $apj,$mul0,$tmp0
|
|
|
|
|
mulx $npj,$mul1,$tmp1
|
|
|
|
|
add $acc0,$car0,$car0
|
|
|
|
|
ld [$ap+$j],$apj ! ap[j]
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
ld [$np+$j],$npj ! np[j]
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
add $j,4,$j ! j++
|
|
|
|
|
mov $tmp0,$acc0
|
|
|
|
|
st $car1,[$tp]
|
|
|
|
|
cmp $j,$num
|
|
|
|
|
mov $tmp1,$acc1
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
bl %icc,.L1st
|
|
|
|
|
add $tp,4,$tp ! tp++
|
|
|
|
|
!.L1st
|
|
|
|
|
|
|
|
|
|
mulx $apj,$mul0,$tmp0 !epilogue!
|
|
|
|
|
mulx $npj,$mul1,$tmp1
|
|
|
|
|
add $acc0,$car0,$car0
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
st $car1,[$tp]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
|
|
|
|
|
add $tmp0,$car0,$car0
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
add $tmp1,$car1,$car1
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
st $car1,[$tp+4]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
|
|
|
|
|
add $car0,$car1,$car1
|
|
|
|
|
st $car1,[$tp+8]
|
|
|
|
|
srlx $car1,32,$car2
|
|
|
|
|
|
|
|
|
|
mov 4,$i ! i++
|
|
|
|
|
ld [$bp+4],$mul0 ! bp[1]
|
|
|
|
|
.Louter:
|
|
|
|
|
add %sp,$bias+$frame,$tp
|
|
|
|
|
ld [$ap],$car0 ! ap[0]
|
|
|
|
|
ld [$ap+4],$apj ! ap[1]
|
|
|
|
|
ld [$np],$car1 ! np[0]
|
|
|
|
|
ld [$np+4],$npj ! np[1]
|
|
|
|
|
ld [$tp],$tmp1 ! tp[0]
|
|
|
|
|
ld [$tp+4],$tpj ! tp[1]
|
|
|
|
|
mov 12,$j
|
|
|
|
|
|
|
|
|
|
mulx $car0,$mul0,$car0
|
|
|
|
|
mulx $apj,$mul0,$tmp0 !prologue!
|
|
|
|
|
add $tmp1,$car0,$car0
|
|
|
|
|
ld [$ap+8],$apj !prologue!
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
|
|
|
|
|
mulx $n0,$acc0,$mul1
|
|
|
|
|
and $mul1,$mask,$mul1
|
|
|
|
|
|
|
|
|
|
mulx $car1,$mul1,$car1
|
|
|
|
|
mulx $npj,$mul1,$acc1 !prologue!
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
ld [$np+8],$npj !prologue!
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
mov $tmp0,$acc0 !prologue!
|
|
|
|
|
|
|
|
|
|
.Linner:
|
|
|
|
|
mulx $apj,$mul0,$tmp0
|
|
|
|
|
mulx $npj,$mul1,$tmp1
|
|
|
|
|
add $tpj,$car0,$car0
|
|
|
|
|
ld [$ap+$j],$apj ! ap[j]
|
|
|
|
|
add $acc0,$car0,$car0
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
ld [$np+$j],$npj ! np[j]
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
ld [$tp+8],$tpj ! tp[j]
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
add $j,4,$j ! j++
|
|
|
|
|
mov $tmp0,$acc0
|
|
|
|
|
st $car1,[$tp] ! tp[j-1]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
mov $tmp1,$acc1
|
|
|
|
|
cmp $j,$num
|
|
|
|
|
bl %icc,.Linner
|
|
|
|
|
add $tp,4,$tp ! tp++
|
|
|
|
|
!.Linner
|
|
|
|
|
|
|
|
|
|
mulx $apj,$mul0,$tmp0 !epilogue!
|
|
|
|
|
mulx $npj,$mul1,$tmp1
|
|
|
|
|
add $tpj,$car0,$car0
|
|
|
|
|
add $acc0,$car0,$car0
|
|
|
|
|
ld [$tp+8],$tpj ! tp[j]
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
st $car1,[$tp] ! tp[j-1]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
|
|
|
|
|
add $tpj,$car0,$car0
|
|
|
|
|
add $tmp0,$car0,$car0
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
add $tmp1,$car1,$car1
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
st $car1,[$tp+4] ! tp[j-1]
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $i,4,$i ! i++
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
|
|
|
|
|
add $car0,$car1,$car1
|
|
|
|
|
cmp $i,$num
|
|
|
|
|
add $car2,$car1,$car1
|
|
|
|
|
st $car1,[$tp+8]
|
|
|
|
|
|
|
|
|
|
srlx $car1,32,$car2
|
|
|
|
|
bl,a %icc,.Louter
|
|
|
|
|
ld [$bp+$i],$mul0 ! bp[i]
|
|
|
|
|
!.Louter
|
|
|
|
|
|
|
|
|
|
add $tp,12,$tp
|
|
|
|
|
|
|
|
|
|
.Ltail:
|
|
|
|
|
add $np,$num,$np
|
|
|
|
|
add $rp,$num,$rp
|
|
|
|
|
sub %g0,$num,%o7 ! k=-num
|
2007-06-20 20:24:22 +08:00
|
|
|
|
ba .Lsub
|
|
|
|
|
subcc %g0,%g0,%g0 ! clear %icc.c
|
|
|
|
|
.align 16
|
2005-12-16 06:43:33 +08:00
|
|
|
|
.Lsub:
|
|
|
|
|
ld [$tp+%o7],%o0
|
|
|
|
|
ld [$np+%o7],%o1
|
2007-06-18 01:10:03 +08:00
|
|
|
|
subccc %o0,%o1,%o1 ! tp[j]-np[j]
|
2007-06-20 20:24:22 +08:00
|
|
|
|
add $rp,%o7,$i
|
2005-12-16 06:43:33 +08:00
|
|
|
|
add %o7,4,%o7
|
|
|
|
|
brnz %o7,.Lsub
|
2007-06-20 20:24:22 +08:00
|
|
|
|
st %o1,[$i]
|
2018-05-01 04:59:51 +08:00
|
|
|
|
subccc $car2,0,$car2 ! handle upmost overflow bit
|
2005-12-16 06:43:33 +08:00
|
|
|
|
sub %g0,$num,%o7
|
|
|
|
|
|
|
|
|
|
.Lcopy:
|
2018-05-01 04:59:51 +08:00
|
|
|
|
ld [$tp+%o7],%o1 ! conditional copy
|
|
|
|
|
ld [$rp+%o7],%o0
|
2007-06-18 01:10:03 +08:00
|
|
|
|
st %g0,[$tp+%o7] ! zap tp
|
2018-05-01 04:59:51 +08:00
|
|
|
|
movcs %icc,%o1,%o0
|
2005-12-16 06:43:33 +08:00
|
|
|
|
st %o0,[$rp+%o7]
|
|
|
|
|
add %o7,4,%o7
|
|
|
|
|
brnz %o7,.Lcopy
|
|
|
|
|
nop
|
|
|
|
|
mov 1,%i0
|
|
|
|
|
ret
|
|
|
|
|
restore
|
|
|
|
|
___
|
|
|
|
|
|
|
|
|
|
########
|
2005-12-17 01:39:57 +08:00
|
|
|
|
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
|
|
|
|
|
######## code without following dedicated squaring procedure.
|
2005-12-16 06:43:33 +08:00
|
|
|
|
########
|
2017-03-22 17:51:25 +08:00
|
|
|
|
# Squaring path only: carries what is shifted across 32-bit word boundaries
# while the cross products are doubled (seeded with `and $car0,1`, refreshed
# with `srlx $acc0,32`); some loops also fold tp[j] into it before use.
$sbit="%o5";
|
2005-12-16 06:43:33 +08:00
|
|
|
|
|
|
|
|
|
# Second part of the generated assembly: the dedicated squaring path.
# It is entered from the main code when $ap and $bp compare equal (the
# `cmp $ap,$bp` / `be ... .Lbn_sqr_mont` pair) and finishes by branching
# back to .Ltail for the shared final subtraction and copy-out.
$code.=<<___;
|
|
|
|
|
.align 32
|
|
|
|
|
.Lbn_sqr_mont:
|
|
|
|
|
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
|
|
|
|
|
mulx $apj,$mul0,$tmp0 !prologue!
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
add %sp,$bias+$frame,$tp
|
|
|
|
|
ld [$ap+8],$apj !prologue!
|
|
|
|
|
|
|
|
|
|
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
and $mul1,$mask,$mul1
|
|
|
|
|
|
|
|
|
|
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
|
|
|
|
|
mulx $npj,$mul1,$acc1 !prologue!
|
|
|
|
|
and $car0,1,$sbit
|
|
|
|
|
ld [$np+8],$npj !prologue!
|
|
|
|
|
srlx $car0,1,$car0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
mov $tmp0,$acc0 !prologue!
|
|
|
|
|
|
|
|
|
|
.Lsqr_1st:
|
|
|
|
|
mulx $apj,$mul0,$tmp0
|
|
|
|
|
mulx $npj,$mul1,$tmp1
|
|
|
|
|
add $acc0,$car0,$car0 ! ap[j]*a0+c0
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
ld [$ap+$j],$apj ! ap[j]
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
ld [$np+$j],$npj ! np[j]
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$acc0,$acc0
|
|
|
|
|
or $sbit,$acc0,$acc0
|
|
|
|
|
mov $tmp1,$acc1
|
|
|
|
|
srlx $acc0,32,$sbit
|
|
|
|
|
add $j,4,$j ! j++
|
|
|
|
|
and $acc0,$mask,$acc0
|
|
|
|
|
cmp $j,$num
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
st $car1,[$tp]
|
|
|
|
|
mov $tmp0,$acc0
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
bl %icc,.Lsqr_1st
|
|
|
|
|
add $tp,4,$tp ! tp++
|
|
|
|
|
!.Lsqr_1st
|
|
|
|
|
|
|
|
|
|
mulx $apj,$mul0,$tmp0 ! epilogue
|
|
|
|
|
mulx $npj,$mul1,$tmp1
|
|
|
|
|
add $acc0,$car0,$car0 ! ap[j]*a0+c0
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$acc0,$acc0
|
|
|
|
|
or $sbit,$acc0,$acc0
|
|
|
|
|
srlx $acc0,32,$sbit
|
|
|
|
|
and $acc0,$mask,$acc0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
st $car1,[$tp]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
|
|
|
|
|
add $tmp0,$car0,$car0 ! ap[j]*a0+c0
|
|
|
|
|
add $tmp1,$car1,$car1
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$acc0,$acc0
|
|
|
|
|
or $sbit,$acc0,$acc0
|
|
|
|
|
srlx $acc0,32,$sbit
|
|
|
|
|
and $acc0,$mask,$acc0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
st $car1,[$tp+4]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
|
|
|
|
|
add $car0,$car0,$car0
|
|
|
|
|
or $sbit,$car0,$car0
|
|
|
|
|
add $car0,$car1,$car1
|
|
|
|
|
st $car1,[$tp+8]
|
|
|
|
|
srlx $car1,32,$car2
|
|
|
|
|
|
|
|
|
|
ld [%sp+$bias+$frame],$tmp0 ! tp[0]
|
|
|
|
|
ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
|
|
|
|
|
ld [%sp+$bias+$frame+8],$tpj ! tp[2]
|
|
|
|
|
ld [$ap+4],$mul0 ! ap[1]
|
|
|
|
|
ld [$ap+8],$apj ! ap[2]
|
|
|
|
|
ld [$np],$car1 ! np[0]
|
|
|
|
|
ld [$np+4],$npj ! np[1]
|
|
|
|
|
mulx $n0,$tmp0,$mul1
|
|
|
|
|
|
|
|
|
|
mulx $mul0,$mul0,$car0
|
|
|
|
|
and $mul1,$mask,$mul1
|
|
|
|
|
|
|
|
|
|
mulx $car1,$mul1,$car1
|
|
|
|
|
mulx $npj,$mul1,$acc1
|
|
|
|
|
add $tmp0,$car1,$car1
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
ld [$np+8],$npj ! np[2]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
add $tmp1,$car1,$car1
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
and $car0,1,$sbit
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
srlx $car0,1,$car0
|
|
|
|
|
mov 12,$j
|
|
|
|
|
st $car1,[%sp+$bias+$frame] ! tp[0]=
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
add %sp,$bias+$frame+4,$tp
|
|
|
|
|
|
|
|
|
|
.Lsqr_2nd:
|
|
|
|
|
mulx $apj,$mul0,$acc0
|
|
|
|
|
mulx $npj,$mul1,$acc1
|
|
|
|
|
add $acc0,$car0,$car0
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $tpj,$sbit,$sbit
|
2005-12-16 06:43:33 +08:00
|
|
|
|
ld [$ap+$j],$apj ! ap[j]
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
ld [$np+$j],$npj ! np[j]
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
ld [$tp+8],$tpj ! tp[j]
|
|
|
|
|
add $acc0,$acc0,$acc0
|
|
|
|
|
add $j,4,$j ! j++
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $sbit,$acc0,$acc0
|
2005-12-16 06:43:33 +08:00
|
|
|
|
srlx $acc0,32,$sbit
|
|
|
|
|
and $acc0,$mask,$acc0
|
|
|
|
|
cmp $j,$num
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
st $car1,[$tp] ! tp[j-1]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
bl %icc,.Lsqr_2nd
|
|
|
|
|
add $tp,4,$tp ! tp++
|
|
|
|
|
!.Lsqr_2nd
|
|
|
|
|
|
|
|
|
|
mulx $apj,$mul0,$acc0
|
|
|
|
|
mulx $npj,$mul1,$acc1
|
|
|
|
|
add $acc0,$car0,$car0
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $tpj,$sbit,$sbit
|
2005-12-16 06:43:33 +08:00
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
add $acc0,$acc0,$acc0
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $sbit,$acc0,$acc0
|
2005-12-16 06:43:33 +08:00
|
|
|
|
srlx $acc0,32,$sbit
|
|
|
|
|
and $acc0,$mask,$acc0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
st $car1,[$tp] ! tp[j-1]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
|
|
|
|
|
add $car0,$car0,$car0
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $sbit,$car0,$car0
|
2005-12-16 06:43:33 +08:00
|
|
|
|
add $car0,$car1,$car1
|
|
|
|
|
add $car2,$car1,$car1
|
|
|
|
|
st $car1,[$tp+4]
|
|
|
|
|
srlx $car1,32,$car2
|
|
|
|
|
|
|
|
|
|
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
|
|
|
|
|
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
|
|
|
|
|
ld [$ap+8],$mul0 ! ap[2]
|
|
|
|
|
ld [$np],$car1 ! np[0]
|
|
|
|
|
ld [$np+4],$npj ! np[1]
|
|
|
|
|
mulx $n0,$tmp1,$mul1
|
|
|
|
|
and $mul1,$mask,$mul1
|
|
|
|
|
mov 8,$i
|
|
|
|
|
|
|
|
|
|
mulx $mul0,$mul0,$car0
|
|
|
|
|
mulx $car1,$mul1,$car1
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
add $tmp1,$car1,$car1
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add %sp,$bias+$frame,$tp
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
and $car0,1,$sbit
|
|
|
|
|
srlx $car0,1,$car0
|
|
|
|
|
mov 4,$j
|
|
|
|
|
|
|
|
|
|
.Lsqr_outer:
|
|
|
|
|
.Lsqr_inner1:
|
|
|
|
|
mulx $npj,$mul1,$acc1
|
|
|
|
|
add $tpj,$car1,$car1
|
|
|
|
|
add $j,4,$j
|
|
|
|
|
ld [$tp+8],$tpj
|
|
|
|
|
cmp $j,$i
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
ld [$np+$j],$npj
|
|
|
|
|
st $car1,[$tp]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
bl %icc,.Lsqr_inner1
|
|
|
|
|
add $tp,4,$tp
|
|
|
|
|
!.Lsqr_inner1
|
|
|
|
|
|
|
|
|
|
add $j,4,$j
|
|
|
|
|
ld [$ap+$j],$apj ! ap[j]
|
|
|
|
|
mulx $npj,$mul1,$acc1
|
|
|
|
|
add $tpj,$car1,$car1
|
|
|
|
|
ld [$np+$j],$npj ! np[j]
|
2018-06-08 21:02:39 +08:00
|
|
|
|
srlx $car1,32,$tmp0
|
|
|
|
|
and $car1,$mask,$car1
|
|
|
|
|
add $tmp0,$sbit,$sbit
|
2005-12-16 06:43:33 +08:00
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
ld [$tp+8],$tpj ! tp[j]
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
st $car1,[$tp]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
|
|
|
|
|
add $j,4,$j
|
|
|
|
|
cmp $j,$num
|
|
|
|
|
be,pn %icc,.Lsqr_no_inner2
|
|
|
|
|
add $tp,4,$tp
|
|
|
|
|
|
|
|
|
|
.Lsqr_inner2:
|
|
|
|
|
mulx $apj,$mul0,$acc0
|
|
|
|
|
mulx $npj,$mul1,$acc1
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $tpj,$sbit,$sbit
|
2005-12-16 06:43:33 +08:00
|
|
|
|
add $acc0,$car0,$car0
|
|
|
|
|
ld [$ap+$j],$apj ! ap[j]
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
ld [$np+$j],$npj ! np[j]
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$acc0,$acc0
|
|
|
|
|
ld [$tp+8],$tpj ! tp[j]
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $sbit,$acc0,$acc0
|
2005-12-16 06:43:33 +08:00
|
|
|
|
add $j,4,$j ! j++
|
|
|
|
|
srlx $acc0,32,$sbit
|
|
|
|
|
and $acc0,$mask,$acc0
|
|
|
|
|
cmp $j,$num
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
st $car1,[$tp] ! tp[j-1]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
bl %icc,.Lsqr_inner2
|
|
|
|
|
add $tp,4,$tp ! tp++
|
|
|
|
|
|
|
|
|
|
.Lsqr_no_inner2:
|
|
|
|
|
mulx $apj,$mul0,$acc0
|
|
|
|
|
mulx $npj,$mul1,$acc1
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $tpj,$sbit,$sbit
|
2005-12-16 06:43:33 +08:00
|
|
|
|
add $acc0,$car0,$car0
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add $acc0,$acc0,$acc0
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $sbit,$acc0,$acc0
|
2005-12-16 06:43:33 +08:00
|
|
|
|
srlx $acc0,32,$sbit
|
|
|
|
|
and $acc0,$mask,$acc0
|
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
st $car1,[$tp] ! tp[j-1]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
|
|
|
|
|
add $car0,$car0,$car0
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $sbit,$car0,$car0
|
2005-12-16 06:43:33 +08:00
|
|
|
|
add $car0,$car1,$car1
|
|
|
|
|
add $car2,$car1,$car1
|
|
|
|
|
st $car1,[$tp+4]
|
|
|
|
|
srlx $car1,32,$car2
|
|
|
|
|
|
|
|
|
|
add $i,4,$i ! i++
|
|
|
|
|
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
|
|
|
|
|
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
|
|
|
|
|
ld [$ap+$i],$mul0 ! ap[j]
|
|
|
|
|
ld [$np],$car1 ! np[0]
|
|
|
|
|
ld [$np+4],$npj ! np[1]
|
|
|
|
|
mulx $n0,$tmp1,$mul1
|
|
|
|
|
and $mul1,$mask,$mul1
|
|
|
|
|
add $i,4,$tmp0
|
|
|
|
|
|
|
|
|
|
mulx $mul0,$mul0,$car0
|
|
|
|
|
mulx $car1,$mul1,$car1
|
|
|
|
|
and $car0,$mask,$acc0
|
|
|
|
|
add $tmp1,$car1,$car1
|
|
|
|
|
srlx $car0,32,$car0
|
|
|
|
|
add %sp,$bias+$frame,$tp
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
and $car0,1,$sbit
|
|
|
|
|
srlx $car0,1,$car0
|
|
|
|
|
|
|
|
|
|
cmp $tmp0,$num ! i<num-1
|
|
|
|
|
bl %icc,.Lsqr_outer
|
|
|
|
|
mov 4,$j
|
|
|
|
|
|
|
|
|
|
.Lsqr_last:
|
|
|
|
|
mulx $npj,$mul1,$acc1
|
|
|
|
|
add $tpj,$car1,$car1
|
|
|
|
|
add $j,4,$j
|
|
|
|
|
ld [$tp+8],$tpj
|
|
|
|
|
cmp $j,$i
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
ld [$np+$j],$npj
|
|
|
|
|
st $car1,[$tp]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
bl %icc,.Lsqr_last
|
|
|
|
|
add $tp,4,$tp
|
|
|
|
|
!.Lsqr_last
|
|
|
|
|
|
|
|
|
|
mulx $npj,$mul1,$acc1
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $tpj,$acc0,$acc0
|
|
|
|
|
srlx $acc0,32,$tmp0
|
|
|
|
|
and $acc0,$mask,$acc0
|
|
|
|
|
add $tmp0,$sbit,$sbit
|
2005-12-16 06:43:33 +08:00
|
|
|
|
add $acc0,$car1,$car1
|
|
|
|
|
add $acc1,$car1,$car1
|
|
|
|
|
st $car1,[$tp]
|
|
|
|
|
srlx $car1,32,$car1
|
|
|
|
|
|
|
|
|
|
add $car0,$car0,$car0 ! recover $car0
|
2017-03-22 17:51:25 +08:00
|
|
|
|
add $sbit,$car0,$car0
|
2005-12-16 06:43:33 +08:00
|
|
|
|
add $car0,$car1,$car1
|
|
|
|
|
add $car2,$car1,$car1
|
|
|
|
|
st $car1,[$tp+4]
|
|
|
|
|
srlx $car1,32,$car2
|
|
|
|
|
|
|
|
|
|
ba .Ltail
|
|
|
|
|
add $tp,8,$tp
|
|
|
|
|
.type $fname,#function
|
|
|
|
|
.size $fname,(.-$fname)
|
2017-11-12 08:03:10 +08:00
|
|
|
|
.asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
|
2007-06-20 20:24:22 +08:00
|
|
|
|
.align 32
|
2005-12-16 06:43:33 +08:00
|
|
|
|
___
|
|
|
|
|
# Replace every `...` span in the generated text with the result of
# evaluating its contents as a Perl expression, then write the text to
# STDOUT (which is the output file if one was named on the command line).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
|
|
|
|
print $code;
|
2020-02-17 10:17:53 +08:00
|
|
|
|
# Check close explicitly: a failure to flush buffered output is only
# reported here, and must not leave a truncated assembly file unnoticed.
close STDOUT or die "error closing STDOUT: $!";
|