mirror of
https://github.com/openssl/openssl.git
synced 2025-01-24 13:55:42 +08:00
1aa89a7a3a
They now generally conform to the following argument sequence: script.pl "$(PERLASM_SCHEME)" [ C preprocessor arguments ... ] \ $(PROCESSOR) <output file> However, in the spirit of being able to use these scripts manually, they also allow for no argument, or for only the flavour, or for only the output file. This is done by only using the last argument as output file if it's a file (it has an extension), and only using the first argument as flavour if it isn't a file (it doesn't have an extension). While we're at it, we make all $xlate calls the same, i.e. the $output argument is always quoted, and we always die on error when trying to start $xlate. There's a perl lesson in this, regarding operator priority... This will always succeed, even when it fails: open FOO, "something" || die "ERR: $!"; The reason is that '||' has higher priority than list operators (a function is essentially a list operator and gobbles up everything following it that isn't lower priority), and since a non-empty string is always true, so that ends up being exactly the same as: open FOO, "something"; This, however, will fail if "something" can't be opened: open FOO, "something" or die "ERR: $!"; The reason is that 'or' has lower priority than list operators, i.e. it's performed after the 'open' call. Reviewed-by: Matt Caswell <matt@openssl.org> (Merged from https://github.com/openssl/openssl/pull/9884)
2386 lines
52 KiB
Raku
Executable File
2386 lines
52 KiB
Raku
Executable File
#! /usr/bin/env perl
|
|
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
#
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
# ====================================================================
|
|
#
|
|
# ECP_NISTZ256 module for PPC64.
|
|
#
|
|
# August 2016.
|
|
#
|
|
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
|
|
# http://eprint.iacr.org/2013/816.
|
|
#
|
|
# with/without -DECP_NISTZ256_ASM
|
|
# POWER7 +260-530%
|
|
# POWER8 +220-340%
|
|
|
|
# $output is the last argument, but only if it looks like a file (it ends
# in an extension); $flavour is the first argument, but only if it does
# not look like a file (it contains no dot).  Either one is left undefined
# when the corresponding argument is absent.  Any arguments in between are
# not touched here.
$output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

# Locate ppc-xlate.pl: first next to this script, then in ../../perlasm/
# relative to the directory this script lives in.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# From here on everything printed to STDOUT goes through the translator.
# $xlate is quoted in the same way as $^X and $output, so that a source
# tree whose path contains spaces does not split the command line.
# Note the low-priority 'or': with '||' the die could never trigger.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Stack pointer register, referenced by every assembly template below.
my $sp="r1";
|
|
|
|
{
|
|
# General-purpose register assignment shared by the assembly templates.
#
# r3..r12:  result/input pointers ($rp,$ap,$bp), the current word of the
#           second operand ($bi), the first four accumulator words and
#           the two modulus words that are kept in registers
#           ($poly1 = 0x00000000ffffffff, $poly3 = 0xffffffff00000001).
# r22..r31: the remaining accumulator words, the pre-loaded a[0-3]
#           operand and four temporaries.
my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3) =
    map("r$_", 3..12);
my ($acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
    map("r$_", 22..31);

# __ecp_nistz256_sqr_mont needs two more accumulator words; they share
# the registers of $bp and $bi.
my ($acc6,$acc7) = ($bp,$bi);
|
|
|
|
$code.=<<___;
|
|
.machine "any"
|
|
.text
|
|
___
|
|
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table source is looked up in the current directory first and then
# one level above the directory this script lives in.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
my $table_fh;
open($table_fh, "<", "ecp_nistz256_table.c") or
open($table_fh, "<", "${dir}../ecp_nistz256_table.c") or
die "failed to open ecp_nistz256_table.c:",$!;

# Integer arithmetic stays in effect for the rest of the enclosing scope;
# the table conversion that follows was written with it switched on.
use integer;

# Collect every TOBN(x, y) pair as two consecutive elements of @arr, the
# second macro argument first.  Only lower-case hex digits are recognised,
# exactly as before; anything else is caught by the element count below.
while (my $line = <$table_fh>) {
    while ($line =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
        push @arr, hex($2), hex($1);
    }
}
close $table_fh;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);
|
|
|
|
$code.=<<___;
|
|
.type ecp_nistz256_precomputed,\@object
|
|
.globl ecp_nistz256_precomputed
|
|
.align 12
|
|
ecp_nistz256_precomputed:
|
|
___
|
|
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234

# Turn one sub-table of 64 entries, 16 words each (passed as a flat list
# of 64*16 numbers), into 64 lines of ".byte" data.  Output line N holds
# byte N of every entry in turn: byte N lives in word int(N/4) of the
# entry, at bit offset (N%4)*8.  Missing words are emitted as 0x00.
# Returns the generated text.
sub _p256_scatter_subtable {
    my @words = @_;
    my $text  = "";

    for my $byte (0..63) {
        # int() keeps the word index integral whether or not the
        # surrounding code runs under "use integer".
        my $word  = int($byte/4);
        my $shift = ($byte%4)*8;
        my @line  = map { ($words[$_*16+$word] >> $shift) & 0xff } 0..63;

        $text .= ".byte\t";
        $text .= join(',', map { sprintf "0x%02x", $_ } @line);
        $text .= "\n";
    }
    return $text;
}

# @arr holds 37 such sub-tables back to back; consume them one at a time.
for (1..37) {
    $code .= _p256_scatter_subtable(splice(@arr, 0, 64*16));
}
|
|
|
|
$code.=<<___;
|
|
.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
|
|
.asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
|
|
|
|
# void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
|
|
# const BN_ULONG x2[4]);
|
|
.globl ecp_nistz256_mul_mont
|
|
.align 5
|
|
ecp_nistz256_mul_mont:
|
|
stdu $sp,-128($sp)
|
|
mflr r0
|
|
std r22,48($sp)
|
|
std r23,56($sp)
|
|
std r24,64($sp)
|
|
std r25,72($sp)
|
|
std r26,80($sp)
|
|
std r27,88($sp)
|
|
std r28,96($sp)
|
|
std r29,104($sp)
|
|
std r30,112($sp)
|
|
std r31,120($sp)
|
|
|
|
ld $a0,0($ap)
|
|
ld $bi,0($bp)
|
|
ld $a1,8($ap)
|
|
ld $a2,16($ap)
|
|
ld $a3,24($ap)
|
|
|
|
li $poly1,-1
|
|
srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
|
li $poly3,1
|
|
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
|
|
|
bl __ecp_nistz256_mul_mont
|
|
|
|
mtlr r0
|
|
ld r22,48($sp)
|
|
ld r23,56($sp)
|
|
ld r24,64($sp)
|
|
ld r25,72($sp)
|
|
ld r26,80($sp)
|
|
ld r27,88($sp)
|
|
ld r28,96($sp)
|
|
ld r29,104($sp)
|
|
ld r30,112($sp)
|
|
ld r31,120($sp)
|
|
addi $sp,$sp,128
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,10,3,0
|
|
.long 0
|
|
.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
|
|
|
|
# void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
|
.globl ecp_nistz256_sqr_mont
|
|
.align 4
|
|
ecp_nistz256_sqr_mont:
|
|
stdu $sp,-128($sp)
|
|
mflr r0
|
|
std r22,48($sp)
|
|
std r23,56($sp)
|
|
std r24,64($sp)
|
|
std r25,72($sp)
|
|
std r26,80($sp)
|
|
std r27,88($sp)
|
|
std r28,96($sp)
|
|
std r29,104($sp)
|
|
std r30,112($sp)
|
|
std r31,120($sp)
|
|
|
|
ld $a0,0($ap)
|
|
ld $a1,8($ap)
|
|
ld $a2,16($ap)
|
|
ld $a3,24($ap)
|
|
|
|
li $poly1,-1
|
|
srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
|
li $poly3,1
|
|
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
|
|
|
bl __ecp_nistz256_sqr_mont
|
|
|
|
mtlr r0
|
|
ld r22,48($sp)
|
|
ld r23,56($sp)
|
|
ld r24,64($sp)
|
|
ld r25,72($sp)
|
|
ld r26,80($sp)
|
|
ld r27,88($sp)
|
|
ld r28,96($sp)
|
|
ld r29,104($sp)
|
|
ld r30,112($sp)
|
|
ld r31,120($sp)
|
|
addi $sp,$sp,128
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,10,2,0
|
|
.long 0
|
|
.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
|
|
|
|
# void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
|
|
# const BN_ULONG x2[4]);
|
|
.globl ecp_nistz256_add
|
|
.align 4
|
|
ecp_nistz256_add:
|
|
stdu $sp,-128($sp)
|
|
mflr r0
|
|
std r28,96($sp)
|
|
std r29,104($sp)
|
|
std r30,112($sp)
|
|
std r31,120($sp)
|
|
|
|
ld $acc0,0($ap)
|
|
ld $t0, 0($bp)
|
|
ld $acc1,8($ap)
|
|
ld $t1, 8($bp)
|
|
ld $acc2,16($ap)
|
|
ld $t2, 16($bp)
|
|
ld $acc3,24($ap)
|
|
ld $t3, 24($bp)
|
|
|
|
li $poly1,-1
|
|
srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
|
li $poly3,1
|
|
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
|
|
|
bl __ecp_nistz256_add
|
|
|
|
mtlr r0
|
|
ld r28,96($sp)
|
|
ld r29,104($sp)
|
|
ld r30,112($sp)
|
|
ld r31,120($sp)
|
|
addi $sp,$sp,128
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,4,3,0
|
|
.long 0
|
|
.size ecp_nistz256_add,.-ecp_nistz256_add
|
|
|
|
# void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
|
.globl ecp_nistz256_div_by_2
|
|
.align 4
|
|
ecp_nistz256_div_by_2:
|
|
stdu $sp,-128($sp)
|
|
mflr r0
|
|
std r28,96($sp)
|
|
std r29,104($sp)
|
|
std r30,112($sp)
|
|
std r31,120($sp)
|
|
|
|
ld $acc0,0($ap)
|
|
ld $acc1,8($ap)
|
|
ld $acc2,16($ap)
|
|
ld $acc3,24($ap)
|
|
|
|
li $poly1,-1
|
|
srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
|
li $poly3,1
|
|
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
|
|
|
bl __ecp_nistz256_div_by_2
|
|
|
|
mtlr r0
|
|
ld r28,96($sp)
|
|
ld r29,104($sp)
|
|
ld r30,112($sp)
|
|
ld r31,120($sp)
|
|
addi $sp,$sp,128
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,4,2,0
|
|
.long 0
|
|
.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
|
|
|
|
# void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
|
.globl ecp_nistz256_mul_by_2
|
|
.align 4
|
|
ecp_nistz256_mul_by_2:
|
|
stdu $sp,-128($sp)
|
|
mflr r0
|
|
std r28,96($sp)
|
|
std r29,104($sp)
|
|
std r30,112($sp)
|
|
std r31,120($sp)
|
|
|
|
ld $acc0,0($ap)
|
|
ld $acc1,8($ap)
|
|
ld $acc2,16($ap)
|
|
ld $acc3,24($ap)
|
|
|
|
mr $t0,$acc0
|
|
mr $t1,$acc1
|
|
mr $t2,$acc2
|
|
mr $t3,$acc3
|
|
|
|
li $poly1,-1
|
|
srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
|
li $poly3,1
|
|
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
|
|
|
bl __ecp_nistz256_add # ret = a+a // 2*a
|
|
|
|
mtlr r0
|
|
ld r28,96($sp)
|
|
ld r29,104($sp)
|
|
ld r30,112($sp)
|
|
ld r31,120($sp)
|
|
addi $sp,$sp,128
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,4,3,0
|
|
.long 0
|
|
.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
|
|
|
|
# void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
|
.globl ecp_nistz256_mul_by_3
|
|
.align 4
|
|
ecp_nistz256_mul_by_3:
|
|
stdu $sp,-128($sp)
|
|
mflr r0
|
|
std r28,96($sp)
|
|
std r29,104($sp)
|
|
std r30,112($sp)
|
|
std r31,120($sp)
|
|
|
|
ld $acc0,0($ap)
|
|
ld $acc1,8($ap)
|
|
ld $acc2,16($ap)
|
|
ld $acc3,24($ap)
|
|
|
|
mr $t0,$acc0
|
|
std $acc0,64($sp)
|
|
mr $t1,$acc1
|
|
std $acc1,72($sp)
|
|
mr $t2,$acc2
|
|
std $acc2,80($sp)
|
|
mr $t3,$acc3
|
|
std $acc3,88($sp)
|
|
|
|
li $poly1,-1
|
|
srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
|
li $poly3,1
|
|
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
|
|
|
bl __ecp_nistz256_add # ret = a+a // 2*a
|
|
|
|
ld $t0,64($sp)
|
|
ld $t1,72($sp)
|
|
ld $t2,80($sp)
|
|
ld $t3,88($sp)
|
|
|
|
bl __ecp_nistz256_add # ret += a // 2*a+a=3*a
|
|
|
|
mtlr r0
|
|
ld r28,96($sp)
|
|
ld r29,104($sp)
|
|
ld r30,112($sp)
|
|
ld r31,120($sp)
|
|
addi $sp,$sp,128
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,4,2,0
|
|
.long 0
|
|
.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
|
|
|
|
# void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
|
|
# const BN_ULONG x2[4]);
|
|
.globl ecp_nistz256_sub
|
|
.align 4
|
|
ecp_nistz256_sub:
|
|
stdu $sp,-128($sp)
|
|
mflr r0
|
|
std r28,96($sp)
|
|
std r29,104($sp)
|
|
std r30,112($sp)
|
|
std r31,120($sp)
|
|
|
|
ld $acc0,0($ap)
|
|
ld $acc1,8($ap)
|
|
ld $acc2,16($ap)
|
|
ld $acc3,24($ap)
|
|
|
|
li $poly1,-1
|
|
srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
|
li $poly3,1
|
|
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
|
|
|
bl __ecp_nistz256_sub_from
|
|
|
|
mtlr r0
|
|
ld r28,96($sp)
|
|
ld r29,104($sp)
|
|
ld r30,112($sp)
|
|
ld r31,120($sp)
|
|
addi $sp,$sp,128
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,4,3,0
|
|
.long 0
|
|
.size ecp_nistz256_sub,.-ecp_nistz256_sub
|
|
|
|
# void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
|
.globl ecp_nistz256_neg
|
|
.align 4
|
|
ecp_nistz256_neg:
|
|
stdu $sp,-128($sp)
|
|
mflr r0
|
|
std r28,96($sp)
|
|
std r29,104($sp)
|
|
std r30,112($sp)
|
|
std r31,120($sp)
|
|
|
|
mr $bp,$ap
|
|
li $acc0,0
|
|
li $acc1,0
|
|
li $acc2,0
|
|
li $acc3,0
|
|
|
|
li $poly1,-1
|
|
srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
|
li $poly3,1
|
|
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
|
|
|
bl __ecp_nistz256_sub_from
|
|
|
|
mtlr r0
|
|
ld r28,96($sp)
|
|
ld r29,104($sp)
|
|
ld r30,112($sp)
|
|
ld r31,120($sp)
|
|
addi $sp,$sp,128
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,4,2,0
|
|
.long 0
|
|
.size ecp_nistz256_neg,.-ecp_nistz256_neg
|
|
|
|
# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
|
|
# to $a0-$a3 and b[0] - to $bi
|
|
.type __ecp_nistz256_mul_mont,\@function
|
|
.align 4
|
|
__ecp_nistz256_mul_mont:
|
|
mulld $acc0,$a0,$bi # a[0]*b[0]
|
|
mulhdu $t0,$a0,$bi
|
|
|
|
mulld $acc1,$a1,$bi # a[1]*b[0]
|
|
mulhdu $t1,$a1,$bi
|
|
|
|
mulld $acc2,$a2,$bi # a[2]*b[0]
|
|
mulhdu $t2,$a2,$bi
|
|
|
|
mulld $acc3,$a3,$bi # a[3]*b[0]
|
|
mulhdu $t3,$a3,$bi
|
|
ld $bi,8($bp) # b[1]
|
|
|
|
addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
|
|
sldi $t0,$acc0,32
|
|
adde $acc2,$acc2,$t1
|
|
srdi $t1,$acc0,32
|
|
adde $acc3,$acc3,$t2
|
|
addze $acc4,$t3
|
|
li $acc5,0
|
|
___
|
|
for($i=1;$i<4;$i++) {
|
|
################################################################
|
|
# Reduction iteration is normally performed by accumulating
|
|
# result of multiplication of modulus by "magic" digit [and
|
|
# omitting least significant word, which is guaranteed to
|
|
# be 0], but thanks to special form of modulus and "magic"
|
|
# digit being equal to least significant word, it can be
|
|
# performed with additions and subtractions alone. Indeed:
|
|
#
|
|
# ffff0001.00000000.0000ffff.ffffffff
|
|
# * abcdefgh
|
|
# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
|
|
#
|
|
# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
|
|
# rewrite above as:
|
|
#
|
|
# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
|
|
# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
|
|
# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
|
|
#
|
|
# or marking redundant operations:
|
|
#
|
|
# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
|
|
# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
|
|
# - 0000abcd.efgh0000.--------.--------.--------
|
|
|
|
$code.=<<___;
|
|
subfc $t2,$t0,$acc0 # "*0xffff0001"
|
|
subfe $t3,$t1,$acc0
|
|
addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
|
|
adde $acc1,$acc2,$t1
|
|
adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
|
|
adde $acc3,$acc4,$t3
|
|
addze $acc4,$acc5
|
|
|
|
mulld $t0,$a0,$bi # lo(a[0]*b[i])
|
|
mulld $t1,$a1,$bi # lo(a[1]*b[i])
|
|
mulld $t2,$a2,$bi # lo(a[2]*b[i])
|
|
mulld $t3,$a3,$bi # lo(a[3]*b[i])
|
|
addc $acc0,$acc0,$t0 # accumulate low parts of multiplication
|
|
mulhdu $t0,$a0,$bi # hi(a[0]*b[i])
|
|
adde $acc1,$acc1,$t1
|
|
mulhdu $t1,$a1,$bi # hi(a[1]*b[i])
|
|
adde $acc2,$acc2,$t2
|
|
mulhdu $t2,$a2,$bi # hi(a[2]*b[i])
|
|
adde $acc3,$acc3,$t3
|
|
mulhdu $t3,$a3,$bi # hi(a[3]*b[i])
|
|
addze $acc4,$acc4
|
|
___
|
|
$code.=<<___ if ($i<3);
|
|
ld $bi,8*($i+1)($bp) # b[$i+1]
|
|
___
|
|
$code.=<<___;
|
|
addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
|
|
sldi $t0,$acc0,32
|
|
adde $acc2,$acc2,$t1
|
|
srdi $t1,$acc0,32
|
|
adde $acc3,$acc3,$t2
|
|
adde $acc4,$acc4,$t3
|
|
li $acc5,0
|
|
addze $acc5,$acc5
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
# last reduction
|
|
subfc $t2,$t0,$acc0 # "*0xffff0001"
|
|
subfe $t3,$t1,$acc0
|
|
addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
|
|
adde $acc1,$acc2,$t1
|
|
adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
|
|
adde $acc3,$acc4,$t3
|
|
addze $acc4,$acc5
|
|
|
|
li $t2,0
|
|
addic $acc0,$acc0,1 # ret -= modulus
|
|
subfe $acc1,$poly1,$acc1
|
|
subfe $acc2,$t2,$acc2
|
|
subfe $acc3,$poly3,$acc3
|
|
subfe $acc4,$t2,$acc4
|
|
|
|
addc $acc0,$acc0,$acc4 # ret += modulus if borrow
|
|
and $t1,$poly1,$acc4
|
|
and $t3,$poly3,$acc4
|
|
adde $acc1,$acc1,$t1
|
|
addze $acc2,$acc2
|
|
adde $acc3,$acc3,$t3
|
|
|
|
std $acc0,0($rp)
|
|
std $acc1,8($rp)
|
|
std $acc2,16($rp)
|
|
std $acc3,24($rp)
|
|
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,1,0
|
|
.long 0
|
|
.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
|
|
|
|
# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
|
|
# to $a0-$a3
|
|
.type __ecp_nistz256_sqr_mont,\@function
|
|
.align 4
|
|
__ecp_nistz256_sqr_mont:
|
|
################################################################
|
|
# | | | | | |a1*a0| |
|
|
# | | | | |a2*a0| | |
|
|
# | |a3*a2|a3*a0| | | |
|
|
# | | | |a2*a1| | | |
|
|
# | | |a3*a1| | | | |
|
|
# *| | | | | | | | 2|
|
|
# +|a3*a3|a2*a2|a1*a1|a0*a0|
|
|
# |--+--+--+--+--+--+--+--|
|
|
# |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
|
|
#
|
|
# "can't overflow" below mark carrying into high part of
|
|
# multiplication result, which can't overflow, because it
|
|
# can never be all ones.
|
|
|
|
mulld $acc1,$a1,$a0 # a[1]*a[0]
|
|
mulhdu $t1,$a1,$a0
|
|
mulld $acc2,$a2,$a0 # a[2]*a[0]
|
|
mulhdu $t2,$a2,$a0
|
|
mulld $acc3,$a3,$a0 # a[3]*a[0]
|
|
mulhdu $acc4,$a3,$a0
|
|
|
|
addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
|
|
mulld $t0,$a2,$a1 # a[2]*a[1]
|
|
mulhdu $t1,$a2,$a1
|
|
adde $acc3,$acc3,$t2
|
|
mulld $t2,$a3,$a1 # a[3]*a[1]
|
|
mulhdu $t3,$a3,$a1
|
|
addze $acc4,$acc4 # can't overflow
|
|
|
|
mulld $acc5,$a3,$a2 # a[3]*a[2]
|
|
mulhdu $acc6,$a3,$a2
|
|
|
|
addc $t1,$t1,$t2 # accumulate high parts of multiplication
|
|
addze $t2,$t3 # can't overflow
|
|
|
|
addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
|
|
adde $acc4,$acc4,$t1
|
|
adde $acc5,$acc5,$t2
|
|
addze $acc6,$acc6 # can't overflow
|
|
|
|
addc $acc1,$acc1,$acc1 # acc[1-6]*=2
|
|
adde $acc2,$acc2,$acc2
|
|
adde $acc3,$acc3,$acc3
|
|
adde $acc4,$acc4,$acc4
|
|
adde $acc5,$acc5,$acc5
|
|
adde $acc6,$acc6,$acc6
|
|
li $acc7,0
|
|
addze $acc7,$acc7
|
|
|
|
mulld $acc0,$a0,$a0 # a[0]*a[0]
|
|
mulhdu $a0,$a0,$a0
|
|
mulld $t1,$a1,$a1 # a[1]*a[1]
|
|
mulhdu $a1,$a1,$a1
|
|
mulld $t2,$a2,$a2 # a[2]*a[2]
|
|
mulhdu $a2,$a2,$a2
|
|
mulld $t3,$a3,$a3 # a[3]*a[3]
|
|
mulhdu $a3,$a3,$a3
|
|
addc $acc1,$acc1,$a0 # +a[i]*a[i]
|
|
sldi $t0,$acc0,32
|
|
adde $acc2,$acc2,$t1
|
|
srdi $t1,$acc0,32
|
|
adde $acc3,$acc3,$a1
|
|
adde $acc4,$acc4,$t2
|
|
adde $acc5,$acc5,$a2
|
|
adde $acc6,$acc6,$t3
|
|
adde $acc7,$acc7,$a3
|
|
___
|
|
for($i=0;$i<3;$i++) { # reductions, see commentary in
|
|
# multiplication for details
|
|
$code.=<<___;
|
|
subfc $t2,$t0,$acc0 # "*0xffff0001"
|
|
subfe $t3,$t1,$acc0
|
|
addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
|
|
sldi $t0,$acc0,32
|
|
adde $acc1,$acc2,$t1
|
|
srdi $t1,$acc0,32
|
|
adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
|
|
addze $acc3,$t3 # can't overflow
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
subfc $t2,$t0,$acc0 # "*0xffff0001"
|
|
subfe $t3,$t1,$acc0
|
|
addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
|
|
adde $acc1,$acc2,$t1
|
|
adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
|
|
addze $acc3,$t3 # can't overflow
|
|
|
|
addc $acc0,$acc0,$acc4 # accumulate upper half
|
|
adde $acc1,$acc1,$acc5
|
|
adde $acc2,$acc2,$acc6
|
|
adde $acc3,$acc3,$acc7
|
|
li $t2,0
|
|
addze $acc4,$t2
|
|
|
|
addic $acc0,$acc0,1 # ret -= modulus
|
|
subfe $acc1,$poly1,$acc1
|
|
subfe $acc2,$t2,$acc2
|
|
subfe $acc3,$poly3,$acc3
|
|
subfe $acc4,$t2,$acc4
|
|
|
|
addc $acc0,$acc0,$acc4 # ret += modulus if borrow
|
|
and $t1,$poly1,$acc4
|
|
and $t3,$poly3,$acc4
|
|
adde $acc1,$acc1,$t1
|
|
addze $acc2,$acc2
|
|
adde $acc3,$acc3,$t3
|
|
|
|
std $acc0,0($rp)
|
|
std $acc1,8($rp)
|
|
std $acc2,16($rp)
|
|
std $acc3,24($rp)
|
|
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,1,0
|
|
.long 0
|
|
.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
|
|
|
|
# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
|
|
# $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
|
|
# contexts, e.g. in multiplication by 2 and 3...
|
|
.type __ecp_nistz256_add,\@function
|
|
.align 4
|
|
__ecp_nistz256_add:
|
|
addc $acc0,$acc0,$t0 # ret = a+b
|
|
adde $acc1,$acc1,$t1
|
|
adde $acc2,$acc2,$t2
|
|
li $t2,0
|
|
adde $acc3,$acc3,$t3
|
|
addze $t0,$t2
|
|
|
|
# if a+b >= modulus, subtract modulus
|
|
#
|
|
# But since comparison implies subtraction, we subtract
|
|
# modulus and then add it back if subtraction borrowed.
|
|
|
|
subic $acc0,$acc0,-1
|
|
subfe $acc1,$poly1,$acc1
|
|
subfe $acc2,$t2,$acc2
|
|
subfe $acc3,$poly3,$acc3
|
|
subfe $t0,$t2,$t0
|
|
|
|
addc $acc0,$acc0,$t0
|
|
and $t1,$poly1,$t0
|
|
and $t3,$poly3,$t0
|
|
adde $acc1,$acc1,$t1
|
|
addze $acc2,$acc2
|
|
adde $acc3,$acc3,$t3
|
|
|
|
std $acc0,0($rp)
|
|
std $acc1,8($rp)
|
|
std $acc2,16($rp)
|
|
std $acc3,24($rp)
|
|
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,3,0
|
|
.long 0
|
|
.size __ecp_nistz256_add,.-__ecp_nistz256_add
|
|
|
|
.type __ecp_nistz256_sub_from,\@function
|
|
.align 4
|
|
__ecp_nistz256_sub_from:
|
|
ld $t0,0($bp)
|
|
ld $t1,8($bp)
|
|
ld $t2,16($bp)
|
|
ld $t3,24($bp)
|
|
subfc $acc0,$t0,$acc0 # ret = a-b
|
|
subfe $acc1,$t1,$acc1
|
|
subfe $acc2,$t2,$acc2
|
|
subfe $acc3,$t3,$acc3
|
|
subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
|
|
|
|
# if a-b borrowed, add modulus
|
|
|
|
addc $acc0,$acc0,$t0 # ret += modulus & t0
|
|
and $t1,$poly1,$t0
|
|
and $t3,$poly3,$t0
|
|
adde $acc1,$acc1,$t1
|
|
addze $acc2,$acc2
|
|
adde $acc3,$acc3,$t3
|
|
|
|
std $acc0,0($rp)
|
|
std $acc1,8($rp)
|
|
std $acc2,16($rp)
|
|
std $acc3,24($rp)
|
|
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,3,0
|
|
.long 0
|
|
.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
|
|
|
|
.type __ecp_nistz256_sub_morf,\@function
|
|
.align 4
|
|
__ecp_nistz256_sub_morf:
|
|
ld $t0,0($bp)
|
|
ld $t1,8($bp)
|
|
ld $t2,16($bp)
|
|
ld $t3,24($bp)
|
|
subfc $acc0,$acc0,$t0 # ret = b-a
|
|
subfe $acc1,$acc1,$t1
|
|
subfe $acc2,$acc2,$t2
|
|
subfe $acc3,$acc3,$t3
|
|
subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
|
|
|
|
# if b-a borrowed, add modulus
|
|
|
|
addc $acc0,$acc0,$t0 # ret += modulus & t0
|
|
and $t1,$poly1,$t0
|
|
and $t3,$poly3,$t0
|
|
adde $acc1,$acc1,$t1
|
|
addze $acc2,$acc2
|
|
adde $acc3,$acc3,$t3
|
|
|
|
std $acc0,0($rp)
|
|
std $acc1,8($rp)
|
|
std $acc2,16($rp)
|
|
std $acc3,24($rp)
|
|
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,3,0
|
|
.long 0
|
|
.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
|
|
|
|
.type __ecp_nistz256_div_by_2,\@function
|
|
.align 4
|
|
__ecp_nistz256_div_by_2:
|
|
andi. $t0,$acc0,1
|
|
addic $acc0,$acc0,-1 # a += modulus
|
|
neg $t0,$t0
|
|
adde $acc1,$acc1,$poly1
|
|
not $t0,$t0
|
|
addze $acc2,$acc2
|
|
li $t2,0
|
|
adde $acc3,$acc3,$poly3
|
|
and $t1,$poly1,$t0
|
|
addze $ap,$t2 # ap = carry
|
|
and $t3,$poly3,$t0
|
|
|
|
subfc $acc0,$t0,$acc0 # a -= modulus if a was even
|
|
subfe $acc1,$t1,$acc1
|
|
subfe $acc2,$t2,$acc2
|
|
subfe $acc3,$t3,$acc3
|
|
subfe $ap, $t2,$ap
|
|
|
|
srdi $acc0,$acc0,1
|
|
sldi $t0,$acc1,63
|
|
srdi $acc1,$acc1,1
|
|
sldi $t1,$acc2,63
|
|
srdi $acc2,$acc2,1
|
|
sldi $t2,$acc3,63
|
|
srdi $acc3,$acc3,1
|
|
sldi $t3,$ap,63
|
|
or $acc0,$acc0,$t0
|
|
or $acc1,$acc1,$t1
|
|
or $acc2,$acc2,$t2
|
|
or $acc3,$acc3,$t3
|
|
|
|
std $acc0,0($rp)
|
|
std $acc1,8($rp)
|
|
std $acc2,16($rp)
|
|
std $acc3,24($rp)
|
|
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,1,0
|
|
.long 0
|
|
.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
|
|
___
|
|
########################################################################
|
|
# following subroutines are "literal" implementation of those found in
|
|
# ecp_nistz256.c
|
|
#
|
|
########################################################################
|
|
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
|
|
#
|
|
if (1) {
|
|
my $FRAME=64+32*4+12*8;
|
|
my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
|
|
# above map() describes stack layout with 4 temporary
|
|
# 256-bit vectors on top.
|
|
my ($rp_real,$ap_real) = map("r$_",(20,21));
|
|
|
|
$code.=<<___;
|
|
.globl ecp_nistz256_point_double
|
|
.align 5
|
|
ecp_nistz256_point_double:
|
|
stdu $sp,-$FRAME($sp)
|
|
mflr r0
|
|
std r20,$FRAME-8*12($sp)
|
|
std r21,$FRAME-8*11($sp)
|
|
std r22,$FRAME-8*10($sp)
|
|
std r23,$FRAME-8*9($sp)
|
|
std r24,$FRAME-8*8($sp)
|
|
std r25,$FRAME-8*7($sp)
|
|
std r26,$FRAME-8*6($sp)
|
|
std r27,$FRAME-8*5($sp)
|
|
std r28,$FRAME-8*4($sp)
|
|
std r29,$FRAME-8*3($sp)
|
|
std r30,$FRAME-8*2($sp)
|
|
std r31,$FRAME-8*1($sp)
|
|
|
|
li $poly1,-1
|
|
srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
|
li $poly3,1
|
|
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
|
.Ldouble_shortcut:
|
|
ld $acc0,32($ap)
|
|
ld $acc1,40($ap)
|
|
ld $acc2,48($ap)
|
|
ld $acc3,56($ap)
|
|
mr $t0,$acc0
|
|
mr $t1,$acc1
|
|
mr $t2,$acc2
|
|
mr $t3,$acc3
|
|
ld $a0,64($ap) # forward load for p256_sqr_mont
|
|
ld $a1,72($ap)
|
|
ld $a2,80($ap)
|
|
ld $a3,88($ap)
|
|
mr $rp_real,$rp
|
|
mr $ap_real,$ap
|
|
addi $rp,$sp,$S
|
|
bl __ecp_nistz256_add # p256_mul_by_2(S, in_y);
|
|
|
|
addi $rp,$sp,$Zsqr
|
|
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z);
|
|
|
|
ld $t0,0($ap_real)
|
|
ld $t1,8($ap_real)
|
|
ld $t2,16($ap_real)
|
|
ld $t3,24($ap_real)
|
|
mr $a0,$acc0 # put Zsqr aside for p256_sub
|
|
mr $a1,$acc1
|
|
mr $a2,$acc2
|
|
mr $a3,$acc3
|
|
addi $rp,$sp,$M
|
|
bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x);
|
|
|
|
addi $bp,$ap_real,0
|
|
mr $acc0,$a0 # restore Zsqr
|
|
mr $acc1,$a1
|
|
mr $acc2,$a2
|
|
mr $acc3,$a3
|
|
ld $a0,$S+0($sp) # forward load for p256_sqr_mont
|
|
ld $a1,$S+8($sp)
|
|
ld $a2,$S+16($sp)
|
|
ld $a3,$S+24($sp)
|
|
addi $rp,$sp,$Zsqr
|
|
bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr);
|
|
|
|
addi $rp,$sp,$S
|
|
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S);
|
|
|
|
ld $bi,32($ap_real)
|
|
ld $a0,64($ap_real)
|
|
ld $a1,72($ap_real)
|
|
ld $a2,80($ap_real)
|
|
ld $a3,88($ap_real)
|
|
addi $bp,$ap_real,32
|
|
addi $rp,$sp,$tmp0
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y);
|
|
|
|
mr $t0,$acc0
|
|
mr $t1,$acc1
|
|
mr $t2,$acc2
|
|
mr $t3,$acc3
|
|
ld $a0,$S+0($sp) # forward load for p256_sqr_mont
|
|
ld $a1,$S+8($sp)
|
|
ld $a2,$S+16($sp)
|
|
ld $a3,$S+24($sp)
|
|
addi $rp,$rp_real,64
|
|
bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0);
|
|
|
|
addi $rp,$sp,$tmp0
|
|
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S);
|
|
|
|
ld $bi,$Zsqr($sp) # forward load for p256_mul_mont
|
|
ld $a0,$M+0($sp)
|
|
ld $a1,$M+8($sp)
|
|
ld $a2,$M+16($sp)
|
|
ld $a3,$M+24($sp)
|
|
addi $rp,$rp_real,32
|
|
bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0);
|
|
|
|
addi $bp,$sp,$Zsqr
|
|
addi $rp,$sp,$M
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr);
|
|
|
|
mr $t0,$acc0 # duplicate M
|
|
mr $t1,$acc1
|
|
mr $t2,$acc2
|
|
mr $t3,$acc3
|
|
mr $a0,$acc0 # put M aside
|
|
mr $a1,$acc1
|
|
mr $a2,$acc2
|
|
mr $a3,$acc3
|
|
addi $rp,$sp,$M
|
|
bl __ecp_nistz256_add
|
|
mr $t0,$a0 # restore M
|
|
mr $t1,$a1
|
|
mr $t2,$a2
|
|
mr $t3,$a3
|
|
ld $bi,0($ap_real) # forward load for p256_mul_mont
|
|
ld $a0,$S+0($sp)
|
|
ld $a1,$S+8($sp)
|
|
ld $a2,$S+16($sp)
|
|
ld $a3,$S+24($sp)
|
|
bl __ecp_nistz256_add # p256_mul_by_3(M, M);
|
|
|
|
addi $bp,$ap_real,0
|
|
addi $rp,$sp,$S
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x);
|
|
|
|
mr $t0,$acc0
|
|
mr $t1,$acc1
|
|
mr $t2,$acc2
|
|
mr $t3,$acc3
|
|
ld $a0,$M+0($sp) # forward load for p256_sqr_mont
|
|
ld $a1,$M+8($sp)
|
|
ld $a2,$M+16($sp)
|
|
ld $a3,$M+24($sp)
|
|
addi $rp,$sp,$tmp0
|
|
bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S);
|
|
|
|
addi $rp,$rp_real,0
|
|
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M);
|
|
|
|
addi $bp,$sp,$tmp0
|
|
bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0);
|
|
|
|
addi $bp,$sp,$S
|
|
addi $rp,$sp,$S
|
|
bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x);
|
|
|
|
ld $bi,$M($sp)
|
|
mr $a0,$acc0 # copy S
|
|
mr $a1,$acc1
|
|
mr $a2,$acc2
|
|
mr $a3,$acc3
|
|
addi $bp,$sp,$M
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M);
|
|
|
|
addi $bp,$rp_real,32
|
|
addi $rp,$rp_real,32
|
|
bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y);
|
|
|
|
mtlr r0
|
|
ld r20,$FRAME-8*12($sp)
|
|
ld r21,$FRAME-8*11($sp)
|
|
ld r22,$FRAME-8*10($sp)
|
|
ld r23,$FRAME-8*9($sp)
|
|
ld r24,$FRAME-8*8($sp)
|
|
ld r25,$FRAME-8*7($sp)
|
|
ld r26,$FRAME-8*6($sp)
|
|
ld r27,$FRAME-8*5($sp)
|
|
ld r28,$FRAME-8*4($sp)
|
|
ld r29,$FRAME-8*3($sp)
|
|
ld r30,$FRAME-8*2($sp)
|
|
ld r31,$FRAME-8*1($sp)
|
|
addi $sp,$sp,$FRAME
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,12,2,0
|
|
.long 0
|
|
.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
|
|
___
|
|
}
|
|
|
|
########################################################################
|
|
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
|
|
# const P256_POINT *in2);
|
|
if (1) {
|
|
my $FRAME = 64 + 32*12 + 16*8;
|
|
my ($res_x,$res_y,$res_z,
|
|
$H,$Hsqr,$R,$Rsqr,$Hcub,
|
|
$U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
|
|
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
|
|
# above map() describes stack layout with 12 temporary
|
|
# 256-bit vectors on top.
|
|
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
|
|
|
|
$code.=<<___;
|
|
.globl ecp_nistz256_point_add
|
|
.align 5
|
|
ecp_nistz256_point_add:
|
|
stdu $sp,-$FRAME($sp)
|
|
mflr r0
|
|
std r16,$FRAME-8*16($sp)
|
|
std r17,$FRAME-8*15($sp)
|
|
std r18,$FRAME-8*14($sp)
|
|
std r19,$FRAME-8*13($sp)
|
|
std r20,$FRAME-8*12($sp)
|
|
std r21,$FRAME-8*11($sp)
|
|
std r22,$FRAME-8*10($sp)
|
|
std r23,$FRAME-8*9($sp)
|
|
std r24,$FRAME-8*8($sp)
|
|
std r25,$FRAME-8*7($sp)
|
|
std r26,$FRAME-8*6($sp)
|
|
std r27,$FRAME-8*5($sp)
|
|
std r28,$FRAME-8*4($sp)
|
|
std r29,$FRAME-8*3($sp)
|
|
std r30,$FRAME-8*2($sp)
|
|
std r31,$FRAME-8*1($sp)
|
|
|
|
li $poly1,-1
|
|
srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
|
li $poly3,1
|
|
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
|
|
|
ld $a0,64($bp) # in2_z
|
|
ld $a1,72($bp)
|
|
ld $a2,80($bp)
|
|
ld $a3,88($bp)
|
|
mr $rp_real,$rp
|
|
mr $ap_real,$ap
|
|
mr $bp_real,$bp
|
|
or $t0,$a0,$a1
|
|
or $t2,$a2,$a3
|
|
or $in2infty,$t0,$t2
|
|
neg $t0,$in2infty
|
|
or $in2infty,$in2infty,$t0
|
|
sradi $in2infty,$in2infty,63 # !in2infty
|
|
addi $rp,$sp,$Z2sqr
|
|
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z);
|
|
|
|
ld $a0,64($ap_real) # in1_z
|
|
ld $a1,72($ap_real)
|
|
ld $a2,80($ap_real)
|
|
ld $a3,88($ap_real)
|
|
or $t0,$a0,$a1
|
|
or $t2,$a2,$a3
|
|
or $in1infty,$t0,$t2
|
|
neg $t0,$in1infty
|
|
or $in1infty,$in1infty,$t0
|
|
sradi $in1infty,$in1infty,63 # !in1infty
|
|
addi $rp,$sp,$Z1sqr
|
|
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
|
|
|
|
ld $bi,64($bp_real)
|
|
ld $a0,$Z2sqr+0($sp)
|
|
ld $a1,$Z2sqr+8($sp)
|
|
ld $a2,$Z2sqr+16($sp)
|
|
ld $a3,$Z2sqr+24($sp)
|
|
addi $bp,$bp_real,64
|
|
addi $rp,$sp,$S1
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z);
|
|
|
|
ld $bi,64($ap_real)
|
|
ld $a0,$Z1sqr+0($sp)
|
|
ld $a1,$Z1sqr+8($sp)
|
|
ld $a2,$Z1sqr+16($sp)
|
|
ld $a3,$Z1sqr+24($sp)
|
|
addi $bp,$ap_real,64
|
|
addi $rp,$sp,$S2
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
|
|
|
|
ld $bi,32($ap_real)
|
|
ld $a0,$S1+0($sp)
|
|
ld $a1,$S1+8($sp)
|
|
ld $a2,$S1+16($sp)
|
|
ld $a3,$S1+24($sp)
|
|
addi $bp,$ap_real,32
|
|
addi $rp,$sp,$S1
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y);
|
|
|
|
ld $bi,32($bp_real)
|
|
ld $a0,$S2+0($sp)
|
|
ld $a1,$S2+8($sp)
|
|
ld $a2,$S2+16($sp)
|
|
ld $a3,$S2+24($sp)
|
|
addi $bp,$bp_real,32
|
|
addi $rp,$sp,$S2
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
|
|
|
|
addi $bp,$sp,$S1
|
|
ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont
|
|
ld $a0,0($ap_real)
|
|
ld $a1,8($ap_real)
|
|
ld $a2,16($ap_real)
|
|
ld $a3,24($ap_real)
|
|
addi $rp,$sp,$R
|
|
bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1);
|
|
|
|
or $acc0,$acc0,$acc1 # see if result is zero
|
|
or $acc2,$acc2,$acc3
|
|
or $temp,$acc0,$acc2
|
|
|
|
addi $bp,$sp,$Z2sqr
|
|
addi $rp,$sp,$U1
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr);
|
|
|
|
ld $bi,$Z1sqr($sp)
|
|
ld $a0,0($bp_real)
|
|
ld $a1,8($bp_real)
|
|
ld $a2,16($bp_real)
|
|
ld $a3,24($bp_real)
|
|
addi $bp,$sp,$Z1sqr
|
|
addi $rp,$sp,$U2
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr);
|
|
|
|
addi $bp,$sp,$U1
|
|
ld $a0,$R+0($sp) # forward load for p256_sqr_mont
|
|
ld $a1,$R+8($sp)
|
|
ld $a2,$R+16($sp)
|
|
ld $a3,$R+24($sp)
|
|
addi $rp,$sp,$H
|
|
bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1);
|
|
|
|
or $acc0,$acc0,$acc1 # see if result is zero
|
|
or $acc2,$acc2,$acc3
|
|
or. $acc0,$acc0,$acc2
|
|
bne .Ladd_proceed # is_equal(U1,U2)?
|
|
|
|
and. $t0,$in1infty,$in2infty
|
|
beq .Ladd_proceed # (in1infty || in2infty)?
|
|
|
|
cmpldi $temp,0
|
|
beq .Ladd_double # is_equal(S1,S2)?
|
|
|
|
xor $a0,$a0,$a0
|
|
std $a0,0($rp_real)
|
|
std $a0,8($rp_real)
|
|
std $a0,16($rp_real)
|
|
std $a0,24($rp_real)
|
|
std $a0,32($rp_real)
|
|
std $a0,40($rp_real)
|
|
std $a0,48($rp_real)
|
|
std $a0,56($rp_real)
|
|
std $a0,64($rp_real)
|
|
std $a0,72($rp_real)
|
|
std $a0,80($rp_real)
|
|
std $a0,88($rp_real)
|
|
b .Ladd_done
|
|
|
|
.align 4
|
|
.Ladd_double:
|
|
ld $bp,0($sp) # back-link
|
|
mr $ap,$ap_real
|
|
mr $rp,$rp_real
|
|
ld r16,$FRAME-8*16($sp)
|
|
ld r17,$FRAME-8*15($sp)
|
|
ld r18,$FRAME-8*14($sp)
|
|
ld r19,$FRAME-8*13($sp)
|
|
stdu $bp,$FRAME-288($sp) # difference in stack frame sizes
|
|
b .Ldouble_shortcut
|
|
|
|
.align 4
|
|
.Ladd_proceed:
|
|
addi $rp,$sp,$Rsqr
|
|
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
|
|
|
|
ld $bi,64($ap_real)
|
|
ld $a0,$H+0($sp)
|
|
ld $a1,$H+8($sp)
|
|
ld $a2,$H+16($sp)
|
|
ld $a3,$H+24($sp)
|
|
addi $bp,$ap_real,64
|
|
addi $rp,$sp,$res_z
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
|
|
|
|
ld $a0,$H+0($sp)
|
|
ld $a1,$H+8($sp)
|
|
ld $a2,$H+16($sp)
|
|
ld $a3,$H+24($sp)
|
|
addi $rp,$sp,$Hsqr
|
|
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
|
|
|
|
ld $bi,64($bp_real)
|
|
ld $a0,$res_z+0($sp)
|
|
ld $a1,$res_z+8($sp)
|
|
ld $a2,$res_z+16($sp)
|
|
ld $a3,$res_z+24($sp)
|
|
addi $bp,$bp_real,64
|
|
addi $rp,$sp,$res_z
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z);
|
|
|
|
ld $bi,$H($sp)
|
|
ld $a0,$Hsqr+0($sp)
|
|
ld $a1,$Hsqr+8($sp)
|
|
ld $a2,$Hsqr+16($sp)
|
|
ld $a3,$Hsqr+24($sp)
|
|
addi $bp,$sp,$H
|
|
addi $rp,$sp,$Hcub
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
|
|
|
|
ld $bi,$Hsqr($sp)
|
|
ld $a0,$U1+0($sp)
|
|
ld $a1,$U1+8($sp)
|
|
ld $a2,$U1+16($sp)
|
|
ld $a3,$U1+24($sp)
|
|
addi $bp,$sp,$Hsqr
|
|
addi $rp,$sp,$U2
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr);
|
|
|
|
mr $t0,$acc0
|
|
mr $t1,$acc1
|
|
mr $t2,$acc2
|
|
mr $t3,$acc3
|
|
addi $rp,$sp,$Hsqr
|
|
bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
|
|
|
|
addi $bp,$sp,$Rsqr
|
|
addi $rp,$sp,$res_x
|
|
bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
|
|
|
|
addi $bp,$sp,$Hcub
|
|
bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
|
|
|
|
addi $bp,$sp,$U2
|
|
ld $bi,$Hcub($sp) # forward load for p256_mul_mont
|
|
ld $a0,$S1+0($sp)
|
|
ld $a1,$S1+8($sp)
|
|
ld $a2,$S1+16($sp)
|
|
ld $a3,$S1+24($sp)
|
|
addi $rp,$sp,$res_y
|
|
bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
|
|
|
|
addi $bp,$sp,$Hcub
|
|
addi $rp,$sp,$S2
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub);
|
|
|
|
ld $bi,$R($sp)
|
|
ld $a0,$res_y+0($sp)
|
|
ld $a1,$res_y+8($sp)
|
|
ld $a2,$res_y+16($sp)
|
|
ld $a3,$res_y+24($sp)
|
|
addi $bp,$sp,$R
|
|
addi $rp,$sp,$res_y
|
|
bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
|
|
|
|
addi $bp,$sp,$S2
|
|
bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
|
|
|
|
ld $t0,0($bp_real) # in2
|
|
ld $t1,8($bp_real)
|
|
ld $t2,16($bp_real)
|
|
ld $t3,24($bp_real)
|
|
ld $a0,$res_x+0($sp) # res
|
|
ld $a1,$res_x+8($sp)
|
|
ld $a2,$res_x+16($sp)
|
|
ld $a3,$res_x+24($sp)
|
|
___
|
|
for($i=0;$i<64;$i+=32) { # conditional moves
|
|
$code.=<<___;
|
|
ld $acc0,$i+0($ap_real) # in1
|
|
ld $acc1,$i+8($ap_real)
|
|
ld $acc2,$i+16($ap_real)
|
|
ld $acc3,$i+24($ap_real)
|
|
andc $t0,$t0,$in1infty
|
|
andc $t1,$t1,$in1infty
|
|
andc $t2,$t2,$in1infty
|
|
andc $t3,$t3,$in1infty
|
|
and $a0,$a0,$in1infty
|
|
and $a1,$a1,$in1infty
|
|
and $a2,$a2,$in1infty
|
|
and $a3,$a3,$in1infty
|
|
or $t0,$t0,$a0
|
|
or $t1,$t1,$a1
|
|
or $t2,$t2,$a2
|
|
or $t3,$t3,$a3
|
|
andc $acc0,$acc0,$in2infty
|
|
andc $acc1,$acc1,$in2infty
|
|
andc $acc2,$acc2,$in2infty
|
|
andc $acc3,$acc3,$in2infty
|
|
and $t0,$t0,$in2infty
|
|
and $t1,$t1,$in2infty
|
|
and $t2,$t2,$in2infty
|
|
and $t3,$t3,$in2infty
|
|
or $acc0,$acc0,$t0
|
|
or $acc1,$acc1,$t1
|
|
or $acc2,$acc2,$t2
|
|
or $acc3,$acc3,$t3
|
|
|
|
ld $t0,$i+32($bp_real) # in2
|
|
ld $t1,$i+40($bp_real)
|
|
ld $t2,$i+48($bp_real)
|
|
ld $t3,$i+56($bp_real)
|
|
ld $a0,$res_x+$i+32($sp)
|
|
ld $a1,$res_x+$i+40($sp)
|
|
ld $a2,$res_x+$i+48($sp)
|
|
ld $a3,$res_x+$i+56($sp)
|
|
std $acc0,$i+0($rp_real)
|
|
std $acc1,$i+8($rp_real)
|
|
std $acc2,$i+16($rp_real)
|
|
std $acc3,$i+24($rp_real)
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
ld $acc0,$i+0($ap_real) # in1
|
|
ld $acc1,$i+8($ap_real)
|
|
ld $acc2,$i+16($ap_real)
|
|
ld $acc3,$i+24($ap_real)
|
|
andc $t0,$t0,$in1infty
|
|
andc $t1,$t1,$in1infty
|
|
andc $t2,$t2,$in1infty
|
|
andc $t3,$t3,$in1infty
|
|
and $a0,$a0,$in1infty
|
|
and $a1,$a1,$in1infty
|
|
and $a2,$a2,$in1infty
|
|
and $a3,$a3,$in1infty
|
|
or $t0,$t0,$a0
|
|
or $t1,$t1,$a1
|
|
or $t2,$t2,$a2
|
|
or $t3,$t3,$a3
|
|
andc $acc0,$acc0,$in2infty
|
|
andc $acc1,$acc1,$in2infty
|
|
andc $acc2,$acc2,$in2infty
|
|
andc $acc3,$acc3,$in2infty
|
|
and $t0,$t0,$in2infty
|
|
and $t1,$t1,$in2infty
|
|
and $t2,$t2,$in2infty
|
|
and $t3,$t3,$in2infty
|
|
or $acc0,$acc0,$t0
|
|
or $acc1,$acc1,$t1
|
|
or $acc2,$acc2,$t2
|
|
or $acc3,$acc3,$t3
|
|
std $acc0,$i+0($rp_real)
|
|
std $acc1,$i+8($rp_real)
|
|
std $acc2,$i+16($rp_real)
|
|
std $acc3,$i+24($rp_real)
|
|
|
|
.Ladd_done:
|
|
mtlr r0
|
|
ld r16,$FRAME-8*16($sp)
|
|
ld r17,$FRAME-8*15($sp)
|
|
ld r18,$FRAME-8*14($sp)
|
|
ld r19,$FRAME-8*13($sp)
|
|
ld r20,$FRAME-8*12($sp)
|
|
ld r21,$FRAME-8*11($sp)
|
|
ld r22,$FRAME-8*10($sp)
|
|
ld r23,$FRAME-8*9($sp)
|
|
ld r24,$FRAME-8*8($sp)
|
|
ld r25,$FRAME-8*7($sp)
|
|
ld r26,$FRAME-8*6($sp)
|
|
ld r27,$FRAME-8*5($sp)
|
|
ld r28,$FRAME-8*4($sp)
|
|
ld r29,$FRAME-8*3($sp)
|
|
ld r30,$FRAME-8*2($sp)
|
|
ld r31,$FRAME-8*1($sp)
|
|
addi $sp,$sp,$FRAME
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,16,3,0
|
|
.long 0
|
|
.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
|
|
___
|
|
}
|
|
|
|
########################################################################
|
|
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
|
|
# const P256_POINT_AFFINE *in2);
|
|
if (1) {
# Generator for ecp_nistz256_point_add_affine (prototype in the comment
# above this block).
#
# Frame layout, in bytes from the new stack pointer: the first 64 bytes
# are reserved (the back chain written by stdu lives at offset 0), ten
# 32-byte temporaries follow, and the top 16*8 bytes hold the saved
# r16-r31.
my $FRAME = 64 + 32*10 + 16*8;
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
my $Z1sqr = $S2;			# Z1sqr shares the S2 slot
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
#
# Copies of the three arguments plus the two "input is not the point at
# infinity" masks; they sit in r16-r20 so that they survive the calls to
# the internal field routines (NOTE(review): this relies on those
# routines, defined earlier in the file, not touching r16-r20 or r0).
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty)=map("r$_",(16..20));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.align	5
ecp_nistz256_point_add_affine:
	stdu	$sp,-$FRAME($sp)
	mflr	r0			# return address stays in r0 until mtlr
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp

	# mask is all-ones when any limb of in1_z is non-zero, else zero
	ld	$a0,64($ap)		# in1_z
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty

	# mask is all-ones when any limb of in2_x or in2_y is non-zero
	ld	$acc0,0($bp)		# in2_x
	ld	$acc1,8($bp)
	ld	$acc2,16($bp)
	ld	$acc3,24($bp)
	ld	$t0,32($bp)		# in2_y
	ld	$t1,40($bp)
	ld	$t2,48($bp)
	ld	$t3,56($bp)
	or	$acc0,$acc0,$acc1
	or	$acc2,$acc2,$acc3
	or	$acc0,$acc0,$acc2
	or	$t0,$t0,$t1
	or	$t2,$t2,$t3
	or	$t0,$t0,$t2
	or	$in2infty,$acc0,$t0
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty

	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	mr	$a0,$acc0
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	ld	$bi,0($bp_real)
	addi	$bp,$bp_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, Z1sqr, in2_x);

	addi	$bp,$ap_real,0
	ld	$bi,64($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, in1_x);

	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$ap_real,32
	ld	$a0,$H+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, in1_y);

	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$a0,$R+0($sp)
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,0($ap_real)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$ap_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in1_x, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	#  p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,32($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Hcub+0($sp)
	ld	$a1,$Hcub+8($sp)
	ld	$a2,$Hcub+16($sp)
	ld	$a3,$Hcub+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, in1_y, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
# Branch-free selection of the output, 32 bytes (one coordinate) per
# step: pick res when in1 is not at infinity, otherwise in2; then keep
# that choice when in2 is not at infinity, otherwise take in1.  The
# loads for the following coordinate are issued before the stores of the
# current one.  $i is deliberately not a lexical: its final value (64)
# is interpolated by the tail after the loop, which handles Z.
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
___
# Next "in2" operand: the real Y coordinate after the X step ...
$code.=<<___	if ($i==0);
	ld	$t0,32($bp_real)	# in2
	ld	$t1,40($bp_real)
	ld	$t2,48($bp_real)
	ld	$t3,56($bp_real)
___
# ... but a constant after the Y step, because the affine in2 carries no
# Z coordinate.
$code.=<<___	if ($i==32);
	li	$t0,1			# Lone_mont
	not	$t1,$poly1
	li	$t2,-1
	not	$t3,$poly3
___
$code.=<<___;
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
# Final selection step; here $i == 64, i.e. the Z coordinate.
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
|
|
if (1) {
|
|
# Generator for ecp_nistz256_ord_mul_mont and ecp_nistz256_ord_sqr_mont.
# Registers private to these two routines: the per-step reduction
# constant (ordk), the two low limbs of the modulus (ord0, ord1) and one
# extra temporary (t4).
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
|
|
# The two high limbs of the modulus reuse the registers named by $poly1
# and $poly3, and r0 is loaded with zero for use in the carry chains.
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");
|
|
|
|
$code.=<<___;
|
|
########################################################################
|
|
# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
|
|
# uint64_t b[4]);
|
|
.globl ecp_nistz256_ord_mul_mont
|
|
.align 5
|
|
ecp_nistz256_ord_mul_mont:
|
|
stdu $sp,-160($sp)
|
|
std r18,48($sp)
|
|
std r19,56($sp)
|
|
std r20,64($sp)
|
|
std r21,72($sp)
|
|
std r22,80($sp)
|
|
std r23,88($sp)
|
|
std r24,96($sp)
|
|
std r25,104($sp)
|
|
std r26,112($sp)
|
|
std r27,120($sp)
|
|
std r28,128($sp)
|
|
std r29,136($sp)
|
|
std r30,144($sp)
|
|
std r31,152($sp)
|
|
|
|
|
|
ld $a0,0($ap)
|
|
ld $bi,0($bp)
|
|
ld $a1,8($ap)
|
|
ld $a2,16($ap)
|
|
ld $a3,24($ap)
|
|
|
|
|
|
lis $ordk,0xccd1
|
|
lis $ord0,0xf3b9
|
|
lis $ord1,0xbce6
|
|
ori $ordk,$ordk,0xc8aa
|
|
ori $ord0,$ord0,0xcac2
|
|
ori $ord1,$ord1,0xfaad
|
|
sldi $ordk,$ordk,32
|
|
sldi $ord0,$ord0,32
|
|
sldi $ord1,$ord1,32
|
|
oris $ordk,$ordk,0xee00
|
|
oris $ord0,$ord0,0xfc63
|
|
oris $ord1,$ord1,0xa717
|
|
ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
|
|
ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
|
|
ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
|
|
li $ord2,-1 # 0xffffffffffffffff
|
|
sldi $ord3,$ord2,32 # 0xffffffff00000000
|
|
li $zr,0
|
|
|
|
|
|
mulld $acc0,$a0,$bi # a[0]*b[0]
|
|
mulhdu $t0,$a0,$bi
|
|
|
|
|
|
mulld $acc1,$a1,$bi # a[1]*b[0]
|
|
mulhdu $t1,$a1,$bi
|
|
|
|
|
|
mulld $acc2,$a2,$bi # a[2]*b[0]
|
|
mulhdu $t2,$a2,$bi
|
|
|
|
|
|
mulld $acc3,$a3,$bi # a[3]*b[0]
|
|
mulhdu $acc4,$a3,$bi
|
|
|
|
|
|
mulld $t4,$acc0,$ordk
|
|
|
|
|
|
addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
|
|
adde $acc2,$acc2,$t1
|
|
adde $acc3,$acc3,$t2
|
|
addze $acc4,$acc4
|
|
li $acc5,0
|
|
___
|
|
for ($i=1;$i<4;$i++) {
|
|
# Emitted three times (i = 1..3): each pass loads b[i], applies one
# reduction step using the multiplier held in $t4, accumulates
# a[0..3]*b[i] and computes the multiplier for the following step.
# The diagram below explains how the product of the multiplier with
# the two all-ones-heavy high limbs of the modulus is obtained with
# shifts and subtractions instead of multiplications.
################################################################
|
|
# ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
|
|
# * abcdefgh
|
|
# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
|
|
#
|
|
# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
|
|
# rewrite above as:
|
|
#
|
|
# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
|
|
# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
|
|
# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
|
|
$code.=<<___;
|
|
ld $bi,8*$i($bp) # b[i]
|
|
|
|
|
|
sldi $t0,$t4,32
|
|
subfc $acc2,$t4,$acc2
|
|
srdi $t1,$t4,32
|
|
subfe $acc3,$t0,$acc3
|
|
subfe $acc4,$t1,$acc4
|
|
subfe $acc5,$zr,$acc5
|
|
|
|
|
|
addic $t0,$acc0,-1 # discarded
|
|
mulhdu $t1,$ord0,$t4
|
|
mulld $t2,$ord1,$t4
|
|
mulhdu $t3,$ord1,$t4
|
|
|
|
|
|
adde $t2,$t2,$t1
|
|
mulld $t0,$a0,$bi
|
|
addze $t3,$t3
|
|
mulld $t1,$a1,$bi
|
|
|
|
|
|
addc $acc0,$acc1,$t2
|
|
mulld $t2,$a2,$bi
|
|
adde $acc1,$acc2,$t3
|
|
mulld $t3,$a3,$bi
|
|
adde $acc2,$acc3,$t4
|
|
adde $acc3,$acc4,$t4
|
|
addze $acc4,$acc5
|
|
|
|
|
|
addc $acc0,$acc0,$t0 # accumulate low parts
|
|
mulhdu $t0,$a0,$bi
|
|
adde $acc1,$acc1,$t1
|
|
mulhdu $t1,$a1,$bi
|
|
adde $acc2,$acc2,$t2
|
|
mulhdu $t2,$a2,$bi
|
|
adde $acc3,$acc3,$t3
|
|
mulhdu $t3,$a3,$bi
|
|
addze $acc4,$acc4
|
|
mulld $t4,$acc0,$ordk
|
|
addc $acc1,$acc1,$t0 # accumulate high parts
|
|
adde $acc2,$acc2,$t1
|
|
adde $acc3,$acc3,$t2
|
|
adde $acc4,$acc4,$t3
|
|
addze $acc5,$zr
|
|
___
|
|
}
|
|
# Tail of ecp_nistz256_ord_mul_mont (last reduction step, conditional
# correction by the modulus, store, epilogue), followed by the setup
# and the first half of the loop body of ecp_nistz256_ord_sqr_mont.
$code.=<<___;
|
|
sldi $t0,$t4,32 # last reduction
|
|
subfc $acc2,$t4,$acc2
|
|
srdi $t1,$t4,32
|
|
subfe $acc3,$t0,$acc3
|
|
subfe $acc4,$t1,$acc4
|
|
subfe $acc5,$zr,$acc5
|
|
|
|
|
|
addic $t0,$acc0,-1 # discarded
|
|
mulhdu $t1,$ord0,$t4
|
|
mulld $t2,$ord1,$t4
|
|
mulhdu $t3,$ord1,$t4
|
|
|
|
|
|
adde $t2,$t2,$t1
|
|
addze $t3,$t3
|
|
|
|
|
|
addc $acc0,$acc1,$t2
|
|
adde $acc1,$acc2,$t3
|
|
adde $acc2,$acc3,$t4
|
|
adde $acc3,$acc4,$t4
|
|
addze $acc4,$acc5
|
|
|
|
|
|
subfc $acc0,$ord0,$acc0 # ret -= modulus
|
|
subfe $acc1,$ord1,$acc1
|
|
subfe $acc2,$ord2,$acc2
|
|
subfe $acc3,$ord3,$acc3
|
|
subfe $acc4,$zr,$acc4
|
|
|
|
|
|
and $t0,$ord0,$acc4
|
|
and $t1,$ord1,$acc4
|
|
addc $acc0,$acc0,$t0 # ret += modulus if borrow
|
|
and $t3,$ord3,$acc4
|
|
adde $acc1,$acc1,$t1
|
|
adde $acc2,$acc2,$acc4
|
|
adde $acc3,$acc3,$t3
|
|
|
|
|
|
std $acc0,0($rp)
|
|
std $acc1,8($rp)
|
|
std $acc2,16($rp)
|
|
std $acc3,24($rp)
|
|
|
|
|
|
ld r18,48($sp)
|
|
ld r19,56($sp)
|
|
ld r20,64($sp)
|
|
ld r21,72($sp)
|
|
ld r22,80($sp)
|
|
ld r23,88($sp)
|
|
ld r24,96($sp)
|
|
ld r25,104($sp)
|
|
ld r26,112($sp)
|
|
ld r27,120($sp)
|
|
ld r28,128($sp)
|
|
ld r29,136($sp)
|
|
ld r30,144($sp)
|
|
ld r31,152($sp)
|
|
addi $sp,$sp,160
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,14,3,0
|
|
.long 0
|
|
.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
|
|
|
|
|
|
################################################################################
|
|
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
|
|
# uint64_t rep);
|
|
.globl ecp_nistz256_ord_sqr_mont
|
|
.align 5
|
|
ecp_nistz256_ord_sqr_mont:
|
|
stdu $sp,-160($sp)
|
|
std r18,48($sp)
|
|
std r19,56($sp)
|
|
std r20,64($sp)
|
|
std r21,72($sp)
|
|
std r22,80($sp)
|
|
std r23,88($sp)
|
|
std r24,96($sp)
|
|
std r25,104($sp)
|
|
std r26,112($sp)
|
|
std r27,120($sp)
|
|
std r28,128($sp)
|
|
std r29,136($sp)
|
|
std r30,144($sp)
|
|
std r31,152($sp)
|
|
|
|
|
|
mtctr $bp
|
|
|
|
|
|
ld $a0,0($ap)
|
|
ld $a1,8($ap)
|
|
ld $a2,16($ap)
|
|
ld $a3,24($ap)
|
|
|
|
|
|
lis $ordk,0xccd1
|
|
lis $ord0,0xf3b9
|
|
lis $ord1,0xbce6
|
|
ori $ordk,$ordk,0xc8aa
|
|
ori $ord0,$ord0,0xcac2
|
|
ori $ord1,$ord1,0xfaad
|
|
sldi $ordk,$ordk,32
|
|
sldi $ord0,$ord0,32
|
|
sldi $ord1,$ord1,32
|
|
oris $ordk,$ordk,0xee00
|
|
oris $ord0,$ord0,0xfc63
|
|
oris $ord1,$ord1,0xa717
|
|
ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
|
|
ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
|
|
ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
|
|
li $ord2,-1 # 0xffffffffffffffff
|
|
sldi $ord3,$ord2,32 # 0xffffffff00000000
|
|
li $zr,0
|
|
b .Loop_ord_sqr
|
|
|
|
|
|
.align 5
|
|
.Loop_ord_sqr:
|
|
################################################################
|
|
# | | | | | |a1*a0| |
|
|
# | | | | |a2*a0| | |
|
|
# | |a3*a2|a3*a0| | | |
|
|
# | | | |a2*a1| | | |
|
|
# | | |a3*a1| | | | |
|
|
# *| | | | | | | | 2|
|
|
# +|a3*a3|a2*a2|a1*a1|a0*a0|
|
|
# |--+--+--+--+--+--+--+--|
|
|
# |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
|
|
#
|
|
# "can't overflow" below mark carrying into high part of
|
|
# multiplication result, which can't overflow, because it
|
|
# can never be all ones.
|
|
|
|
|
|
mulld $acc1,$a1,$a0 # a[1]*a[0]
|
|
mulhdu $t1,$a1,$a0
|
|
mulld $acc2,$a2,$a0 # a[2]*a[0]
|
|
mulhdu $t2,$a2,$a0
|
|
mulld $acc3,$a3,$a0 # a[3]*a[0]
|
|
mulhdu $acc4,$a3,$a0
|
|
|
|
|
|
addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
|
|
mulld $t0,$a2,$a1 # a[2]*a[1]
|
|
mulhdu $t1,$a2,$a1
|
|
adde $acc3,$acc3,$t2
|
|
mulld $t2,$a3,$a1 # a[3]*a[1]
|
|
mulhdu $t3,$a3,$a1
|
|
addze $acc4,$acc4 # can't overflow
|
|
|
|
|
|
mulld $acc5,$a3,$a2 # a[3]*a[2]
|
|
mulhdu $acc6,$a3,$a2
|
|
|
|
|
|
addc $t1,$t1,$t2 # accumulate high parts of multiplication
|
|
mulld $acc0,$a0,$a0 # a[0]*a[0]
|
|
addze $t2,$t3 # can't overflow
|
|
|
|
|
|
addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
|
|
mulhdu $a0,$a0,$a0
|
|
adde $acc4,$acc4,$t1
|
|
mulld $t1,$a1,$a1 # a[1]*a[1]
|
|
adde $acc5,$acc5,$t2
|
|
mulhdu $a1,$a1,$a1
|
|
addze $acc6,$acc6 # can't overflow
|
|
|
|
|
|
addc $acc1,$acc1,$acc1 # acc[1-6]*=2
|
|
mulld $t2,$a2,$a2 # a[2]*a[2]
|
|
adde $acc2,$acc2,$acc2
|
|
mulhdu $a2,$a2,$a2
|
|
adde $acc3,$acc3,$acc3
|
|
mulld $t3,$a3,$a3 # a[3]*a[3]
|
|
adde $acc4,$acc4,$acc4
|
|
mulhdu $a3,$a3,$a3
|
|
adde $acc5,$acc5,$acc5
|
|
adde $acc6,$acc6,$acc6
|
|
addze $acc7,$zr
|
|
|
|
|
|
addc $acc1,$acc1,$a0 # +a[i]*a[i]
|
|
mulld $t4,$acc0,$ordk
|
|
adde $acc2,$acc2,$t1
|
|
adde $acc3,$acc3,$a1
|
|
adde $acc4,$acc4,$t2
|
|
adde $acc5,$acc5,$a2
|
|
adde $acc6,$acc6,$t3
|
|
adde $acc7,$acc7,$a3
|
|
___
|
|
for($i=0; $i<4; $i++) { # reductions
|
|
# Four reduction steps over the low half of the square, each driven by
# the multiplier currently named $t4.
$code.=<<___;
|
|
addic $t0,$acc0,-1 # discarded
|
|
mulhdu $t1,$ord0,$t4
|
|
mulld $t2,$ord1,$t4
|
|
mulhdu $t3,$ord1,$t4
|
|
|
|
|
|
adde $t2,$t2,$t1
|
|
addze $t3,$t3
|
|
|
|
|
|
addc $acc0,$acc1,$t2
|
|
adde $acc1,$acc2,$t3
|
|
adde $acc2,$acc3,$t4
|
|
adde $acc3,$zr,$t4 # can't overflow
|
|
___
|
|
# All steps but the last also compute the multiplier for the next step,
# placing it in the register currently named $t3.
$code.=<<___ if ($i<3);
|
|
mulld $t3,$acc0,$ordk
|
|
___
|
|
$code.=<<___;
|
|
sldi $t0,$t4,32
|
|
subfc $acc1,$t4,$acc1
|
|
srdi $t1,$t4,32
|
|
subfe $acc2,$t0,$acc2
|
|
subfe $acc3,$t1,$acc3 # can't borrow
|
|
___
|
|
# Swap the two names so that the next pass refers to the freshly
# computed multiplier as $t4 and is free to reuse the old one as $t3.
($t3,$t4) = ($t4,$t3);
|
|
}
|
|
# Add the upper half of the square, apply the conditional correction by
# the modulus, feed the result back into a0-a3 for the next squaring
# (the count register was loaded from the third argument), then store
# the result and emit the epilogue.
$code.=<<___;
|
|
addc $acc0,$acc0,$acc4 # accumulate upper half
|
|
adde $acc1,$acc1,$acc5
|
|
adde $acc2,$acc2,$acc6
|
|
adde $acc3,$acc3,$acc7
|
|
addze $acc4,$zr
|
|
|
|
|
|
subfc $acc0,$ord0,$acc0 # ret -= modulus
|
|
subfe $acc1,$ord1,$acc1
|
|
subfe $acc2,$ord2,$acc2
|
|
subfe $acc3,$ord3,$acc3
|
|
subfe $acc4,$zr,$acc4
|
|
|
|
|
|
and $t0,$ord0,$acc4
|
|
and $t1,$ord1,$acc4
|
|
addc $a0,$acc0,$t0 # ret += modulus if borrow
|
|
and $t3,$ord3,$acc4
|
|
adde $a1,$acc1,$t1
|
|
adde $a2,$acc2,$acc4
|
|
adde $a3,$acc3,$t3
|
|
|
|
|
|
bdnz .Loop_ord_sqr
|
|
|
|
|
|
std $a0,0($rp)
|
|
std $a1,8($rp)
|
|
std $a2,16($rp)
|
|
std $a3,24($rp)
|
|
|
|
|
|
ld r18,48($sp)
|
|
ld r19,56($sp)
|
|
ld r20,64($sp)
|
|
ld r21,72($sp)
|
|
ld r22,80($sp)
|
|
ld r23,88($sp)
|
|
ld r24,96($sp)
|
|
ld r25,104($sp)
|
|
ld r26,112($sp)
|
|
ld r27,120($sp)
|
|
ld r28,128($sp)
|
|
ld r29,136($sp)
|
|
ld r30,144($sp)
|
|
ld r31,152($sp)
|
|
addi $sp,$sp,160
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,0,0x80,14,3,0
|
|
.long 0
|
|
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
|
|
___
|
|
} }
|
|
|
|
########################################################################
|
|
# scatter-gather subroutines
|
|
{
# Generator for the four table scatter/gather routines.  The three
# names below are the registers holding the C arguments (out, inp,
# index) of every routine; r0 and r5-r12 serve as scratch.
#
# Table layout used by all four routines: consecutive pieces of one
# point are stored 64 bytes apart, and the index selects the position
# inside each 64-byte row.  The w5 routines move 4-byte pieces and the
# w7 routines 1-byte pieces.
my ($out,$inp,$index)=map("r$_",(3..5));
$code.=<<___;
########################################################################
# void	ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#					  int index);
#
# Each 64-bit limb is split into two 32-bit words: the low words of the
# four limbs of a coordinate go to rows 0-3 and the high words to rows
# 4-7.  The index is scaled by 4 and every store is biased by -4, so
# index 1 lands at byte offset 0 of a row.
.globl	ecp_nistz256_scatter_w5
.align	4
ecp_nistz256_scatter_w5:
	slwi	$index,$index,2
	add	$out,$out,$index

	ld	r8, 0($inp)		# X
	ld	r9, 8($inp)
	ld	r10,16($inp)
	ld	r11,24($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 32($inp)		# Y
	ld	r9, 40($inp)
	ld	r10,48($inp)
	ld	r11,56($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 64($inp)		# Z
	ld	r9, 72($inp)
	ld	r10,80($inp)
	ld	r11,88($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

########################################################################
# void	ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#				       int index);
#
# r0 becomes an all-ones mask for a positive index and zero for index 0.
# Adding the mask turns a positive index into index-1 (matching the -4
# bias of the scatter routine); the loads are always performed and the
# result is ANDed with the mask, so index 0 yields an all-zero point
# without any branch.
.globl	ecp_nistz256_gather_w5
.align	4
ecp_nistz256_gather_w5:
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	slwi	$index,$index,2
	add	$inp,$inp,$index

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,0($out)		# X
	std	r6,8($out)
	std	r7,16($out)
	std	r8,24($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,32($out)		# Y
	std	r6,40($out)
	std	r7,48($out)
	std	r8,56($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,64($out)		# Z
	std	r6,72($out)
	std	r7,80($out)
	std	r8,88($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void	ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#					  int index);
#
# Eight passes, one per 64-bit limb of the affine point.  Each limb is
# stored one byte per row, least significant byte first.  The index is
# added unscaled and without bias, so index 0 is the first byte of a
# row.
.globl	ecp_nistz256_scatter_w7
.align	4
ecp_nistz256_scatter_w7:
	li	r0,8
	mtctr	r0
	add	$out,$out,$index
	subi	$inp,$inp,8		# compensate for ldu pre-increment

.Loop_scatter_w7:
	ldu	r0,8($inp)
	stb	r0,64*0($out)
	srdi	r0,r0,8
	stb	r0,64*1($out)
	srdi	r0,r0,8
	stb	r0,64*2($out)
	srdi	r0,r0,8
	stb	r0,64*3($out)
	srdi	r0,r0,8
	stb	r0,64*4($out)
	srdi	r0,r0,8
	stb	r0,64*5($out)
	srdi	r0,r0,8
	stb	r0,64*6($out)
	srdi	r0,r0,8
	stb	r0,64*7($out)
	addi	$out,$out,64*8
	bdnz	.Loop_scatter_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#				       int index);
#
# Same masking scheme as gather_w5: a positive index is reduced by one
# before use and index 0 yields all-zero output.  Each of the eight
# passes reassembles one 64-bit limb from eight bytes taken 64 bytes
# apart.
.globl	ecp_nistz256_gather_w7
.align	4
ecp_nistz256_gather_w7:
	li	r0,8
	mtctr	r0
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	add	$inp,$inp,$index
	subi	$out,$out,8		# compensate for stdu pre-increment

.Loop_gather_w7:
	lbz	r5, 64*0($inp)
	lbz	r6, 64*1($inp)
	lbz	r7, 64*2($inp)
	lbz	r8, 64*3($inp)
	lbz	r9, 64*4($inp)
	lbz	r10,64*5($inp)
	lbz	r11,64*6($inp)
	lbz	r12,64*7($inp)
	addi	$inp,$inp,64*8

	sldi	r6, r6, 8
	sldi	r7, r7, 16
	sldi	r8, r8, 24
	sldi	r9, r9, 32
	sldi	r10,r10,40
	sldi	r11,r11,48
	sldi	r12,r12,56

	or	r5,r5,r6
	or	r7,r7,r8
	or	r9,r9,r10
	or	r11,r11,r12
	or	r5,r5,r7
	or	r9,r9,r11
	or	r5,r5,r9
	and	r5,r5,r0
	stdu	r5,8($out)
	bdnz	.Loop_gather_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
|
|
|
|
# Emit the generated assembly one line at a time.  Any span enclosed in
# back-ticks is replaced by the result of evaluating its contents as
# Perl, which lets the templates above carry generation-time arithmetic.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
# Buffered write errors (full disk, broken pipe to the translator) only
# surface when the handle is closed, so the result of close is checked
# to avoid silently leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";	# enforce flush
|