mirror of
https://github.com/openssl/openssl.git
synced 2024-12-27 06:21:43 +08:00
30691da1ba
Only field multiplication and squaring (but not reduction) show a significant improvement. This is enabled on Power ISA >= 3.0. On a Power 9 CPU an average 10% performance improvement is seen (ECDHE: 14%, ECDSA sign: 6%, ECDSA verify 10%), compared to existing code. On an upcoming Power 10 CPU we see an average performance improvement of 26% (ECDHE: 38%, ECDSA sign: 16%, ECDSA verify 25%), compared to existing code. Signed-off-by: Amitay Isaacs <amitay@ozlabs.org> Signed-off-by: Martin Schwenke <martin@meltin.net> Reviewed-by: Tomas Mraz <tomas@openssl.org> Reviewed-by: Paul Dale <pauli@openssl.org> (Merged from https://github.com/openssl/openssl/pull/15401)
437 lines
8.6 KiB
Perl
Executable File
437 lines
8.6 KiB
Perl
Executable File
#! /usr/bin/env perl
|
|
# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the OpenSSL license (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
#
|
|
# ====================================================================
|
|
# Written by Amitay Isaacs <amitay@ozlabs.org> and Martin Schwenke
|
|
# <martin@meltin.net> for the OpenSSL project.
|
|
# ====================================================================
|
|
#
|
|
# p521 lower-level primitives for PPC64 using vector instructions.
|
|
#
|
|
|
|
use strict;
use warnings;

# The first command-line argument is the assembler flavour; it is passed
# through unchanged to ppc-xlate.pl.
my $flavour = shift;

# Consume the remaining arguments until one looks like a file name
# ("name.ext"); that one becomes the output file.  When none matches,
# "-" is handed to ppc-xlate.pl instead.
my $output = "";
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
if (!$output) {
    $output = "-";
}

# Locate ppc-xlate.pl, first next to this script and then in
# ../../perlasm/ relative to it.  When $0 carries no directory part the
# match fails, so fall back to an empty prefix instead of interpolating
# an undefined value.
my ($xlate, $dir);
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
    die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through
# ppc-xlate.pl.  Fail right away if the translator cannot be started
# rather than carrying on with a dead handle.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Accumulates the generated assembly text; printed once at the end.
my $code = "";

# General-purpose register names used by the generated code.
my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12");

# Vector register that the generated functions clear with vspltisw and
# use as the zero addend of vmsumudm.
my $vzero = "v32";
|
|
|
|
sub startproc($)
{
    # Append the function prologue for the given symbol to $code:
    # the .globl directive, the label and the CFI start marker,
    # followed by one empty line.
    my $symbol = shift;

    $code .= ".globl ${symbol}\n"
           . "${symbol}:\n"
           . ".cfi_startproc\n"
           . "\n";
}
|
|
|
|
sub endproc($)
{
    # Append the function epilogue for the given symbol to $code:
    # the return instruction, the CFI end marker and the .size
    # directive, followed by one empty line.
    my $symbol = shift;

    $code .= "blr\n"
           . ".cfi_endproc\n"
           . ".size ${symbol},.-${symbol}\n"
           . "\n";
}
|
|
|
|
|
|
sub push_vrs($$)
{
    # Emit code that saves registers $first..$last (by number, using
    # stxv) below the current stack pointer.  The old stack pointer is
    # kept in $savesp and a frame of 16 bytes per register plus 16 more
    # is allocated with stdu.  The `...` expression in the stdu operand
    # is evaluated when the finished text is post-processed.
    my ($first, $last) = @_;
    my $nregs = $last - $first + 1;

    $code .= "mr $savesp,$sp\n"
           . "stdu $sp,-16*`$nregs+1`($sp)\n"
           . "\n";

    # The highest-numbered register sits closest to the old stack
    # pointer; each lower one is stored 16 bytes further down.
    foreach my $reg ($first .. $last) {
        my $slot = $last - $reg + 1;
        $code .= "stxv $reg,-16*$slot($savesp)\n";
    }

    $code .= "\n";
}
|
|
|
|
sub pop_vrs($$)
{
    # Emit code that undoes push_vrs for registers $first..$last:
    # reload the saved stack pointer from 0($sp) into $savesp, restore
    # each register with lxv from the slot push_vrs stored it in, then
    # move $savesp back into the stack pointer.
    my ($first, $last) = @_;

    my @lines = ("ld $savesp,0($sp)");
    foreach my $reg ($first .. $last) {
        my $slot = $last - $reg + 1;
        push @lines, "lxv $reg,-16*$slot($savesp)";
    }
    # The trailing empty element yields the blank line that ends the
    # emitted sequence.
    push @lines, "mr $sp,$savesp", "";

    $code .= join("\n", @lines) . "\n";
}
|
|
|
|
sub load_vrs($$)
{
    # Emit nine lxsd loads: entry k of the register list (an array
    # reference) is loaded from byte offset 8*k off the pointer
    # register.  One empty line follows the loads.
    my ($base, $vregs) = @_;

    foreach my $limb (0 .. 8) {
        my $disp = 8 * $limb;
        $code .= "lxsd $vregs->[$limb],$disp($base)\n";
    }

    $code .= "\n";
}
|
|
|
|
sub store_vrs($$)
{
    # Emit nine stxv stores: entry k of the register list (an array
    # reference) is written to byte offset 16*k off the pointer
    # register.  One empty line follows the stores.
    my ($base, $vregs) = @_;

    foreach my $limb (0 .. 8) {
        my $disp = 16 * $limb;
        $code .= "stxv $vregs->[$limb],$disp($base)\n";
    }

    $code .= "\n";
}
|
|
|
|
# Start the generated file with the .text section directive, followed
# by an empty line.
$code .= ".text\n\n";
|
|
|
|
{
|
|
# Register names shared by the multiplication and squaring generators.

# Scratch vector registers; the generators use them for xxpermdi
# results and for the vsld shift-count vector.
my ($t1, $t2, $t3, $t4) = map { "v$_" } (33, 34, 44, 54);

# General-purpose registers loaded with the constants 0 and 1.
my ($zero, $one) = map { "r$_" } (8, 9);

# The nine accumulator registers, written out with store_vrs at the
# end of each generated function.
my @out = map { "v$_" } (55 .. 63);
|
|
|
|
{
#
# p521_felem_mul
#
# Generates a function that loads nine doublewords from each of the two
# input pointers, accumulates the products in1[i]*in2[j] into nine
# vector registers and stores those registers through the output
# pointer.  Products with i+j <= 8 go to out[i+j]; products with
# i+j >= 9 go to out[i+j-9] and use in2[j] shifted left by one bit.
#

my ($in1p, $in2p) = ("r4", "r5");
my @in1 = map { "v$_" } (45 .. 53);
my @in2 = map { "v$_" } (35 .. 43);

startproc("p521_felem_mul");

push_vrs(52, 63);

# Clear the register used as the zero addend.
$code .= "vspltisw $vzero,0\n\n";

load_vrs($in1p, \@in1);
load_vrs($in2p, \@in2);

# Products with i+j <= 8 that involve in1[0]..in1[3], plus the single
# products in1[2]*in2[0] and in1[4]*in2[0].  $t1 pairs in1[0] with
# in1[1] and $t3 pairs in1[2] with in1[3]; $t2/$t4 hold the matching
# pairs of in2 so one vmsumudm adds two products.
$code.=<<___;
vmsumudm $out[0],$in1[0],$in2[0],$vzero

xxpermdi $t1,$in1[0],$in1[1],0b00
xxpermdi $t2,$in2[1],$in2[0],0b00
vmsumudm $out[1],$t1,$t2,$vzero

xxpermdi $t2,$in2[2],$in2[1],0b00
vmsumudm $out[2],$t1,$t2,$vzero
vmsumudm $out[2],$in1[2],$in2[0],$out[2]

xxpermdi $t2,$in2[3],$in2[2],0b00
vmsumudm $out[3],$t1,$t2,$vzero
xxpermdi $t3,$in1[2],$in1[3],0b00
xxpermdi $t4,$in2[1],$in2[0],0b00
vmsumudm $out[3],$t3,$t4,$out[3]

xxpermdi $t2,$in2[4],$in2[3],0b00
vmsumudm $out[4],$t1,$t2,$vzero
xxpermdi $t4,$in2[2],$in2[1],0b00
vmsumudm $out[4],$t3,$t4,$out[4]
vmsumudm $out[4],$in1[4],$in2[0],$out[4]

xxpermdi $t2,$in2[5],$in2[4],0b00
vmsumudm $out[5],$t1,$t2,$vzero
xxpermdi $t4,$in2[3],$in2[2],0b00
vmsumudm $out[5],$t3,$t4,$out[5]

xxpermdi $t2,$in2[6],$in2[5],0b00
vmsumudm $out[6],$t1,$t2,$vzero
xxpermdi $t4,$in2[4],$in2[3],0b00
vmsumudm $out[6],$t3,$t4,$out[6]

xxpermdi $t2,$in2[7],$in2[6],0b00
vmsumudm $out[7],$t1,$t2,$vzero
xxpermdi $t4,$in2[5],$in2[4],0b00
vmsumudm $out[7],$t3,$t4,$out[7]

xxpermdi $t2,$in2[8],$in2[7],0b00
vmsumudm $out[8],$t1,$t2,$vzero
xxpermdi $t4,$in2[6],$in2[5],0b00
vmsumudm $out[8],$t3,$t4,$out[8]

___

# Remaining products with i+j <= 8, i.e. those involving
# in1[4]..in1[8].  $t1 now pairs in1[4] with in1[5] and $t3 pairs
# in1[6] with in1[7].
$code.=<<___;
xxpermdi $t1,$in1[4],$in1[5],0b00
xxpermdi $t2,$in2[1],$in2[0],0b00
vmsumudm $out[5],$t1,$t2,$out[5]

xxpermdi $t2,$in2[2],$in2[1],0b00
vmsumudm $out[6],$t1,$t2,$out[6]
vmsumudm $out[6],$in1[6],$in2[0],$out[6]

xxpermdi $t2,$in2[3],$in2[2],0b00
vmsumudm $out[7],$t1,$t2,$out[7]
xxpermdi $t3,$in1[6],$in1[7],0b00
xxpermdi $t4,$in2[1],$in2[0],0b00
vmsumudm $out[7],$t3,$t4,$out[7]

xxpermdi $t2,$in2[4],$in2[3],0b00
vmsumudm $out[8],$t1,$t2,$out[8]
xxpermdi $t4,$in2[2],$in2[1],0b00
vmsumudm $out[8],$t3,$t4,$out[8]
vmsumudm $out[8],$in1[8],$in2[0],$out[8]

___

# Build the vsld shift-count vector in $t1 from the constants 1 and 0.
$code.=<<___;
li $zero,0
li $one,1
mtvsrdd $t1,$one,$zero
___

# Shift every in2 register left by the counts in $t1, in place.
foreach my $limb (0 .. 8) {
    $code .= "vsld $in2[$limb],$in2[$limb],$t1\n";
}

# Products with i+j >= 9, accumulated into out[i+j-9] using the shifted
# in2 values.  $t2 keeps the pair (in2[8], in2[7]) and $t4 the pair
# (in2[6], in2[5]) while $t1/$t3 step through pairs of in1.
$code.=<<___;

vmsumudm $out[7],$in1[8],$in2[8],$out[7]

xxpermdi $t2,$in2[8],$in2[7],0b00
xxpermdi $t1,$in1[7],$in1[8],0b00
vmsumudm $out[6],$t1,$t2,$out[6]

xxpermdi $t1,$in1[6],$in1[7],0b00
vmsumudm $out[5],$t1,$t2,$out[5]
vmsumudm $out[5],$in1[8],$in2[6],$out[5]

xxpermdi $t1,$in1[5],$in1[6],0b00
vmsumudm $out[4],$t1,$t2,$out[4]
xxpermdi $t4,$in2[6],$in2[5],0b00
xxpermdi $t3,$in1[7],$in1[8],0b00
vmsumudm $out[4],$t3,$t4,$out[4]

xxpermdi $t1,$in1[4],$in1[5],0b00
vmsumudm $out[3],$t1,$t2,$out[3]
xxpermdi $t3,$in1[6],$in1[7],0b00
vmsumudm $out[3],$t3,$t4,$out[3]
vmsumudm $out[3],$in1[8],$in2[4],$out[3]

xxpermdi $t1,$in1[3],$in1[4],0b00
vmsumudm $out[2],$t1,$t2,$out[2]
xxpermdi $t3,$in1[5],$in1[6],0b00
vmsumudm $out[2],$t3,$t4,$out[2]

xxpermdi $t1,$in1[2],$in1[3],0b00
vmsumudm $out[1],$t1,$t2,$out[1]
xxpermdi $t3,$in1[4],$in1[5],0b00
vmsumudm $out[1],$t3,$t4,$out[1]

xxpermdi $t1,$in1[1],$in1[2],0b00
vmsumudm $out[0],$t1,$t2,$out[0]
xxpermdi $t3,$in1[3],$in1[4],0b00
vmsumudm $out[0],$t3,$t4,$out[0]

___

# Last group of i+j >= 9 products: those pairing in1[5]..in1[8] with
# the shifted in2[1]..in2[4].
$code.=<<___;
xxpermdi $t2,$in2[4],$in2[3],0b00
xxpermdi $t1,$in1[7],$in1[8],0b00
vmsumudm $out[2],$t1,$t2,$out[2]

xxpermdi $t1,$in1[6],$in1[7],0b00
vmsumudm $out[1],$t1,$t2,$out[1]
vmsumudm $out[1],$in1[8],$in2[2],$out[1]

xxpermdi $t1,$in1[5],$in1[6],0b00
vmsumudm $out[0],$t1,$t2,$out[0]
xxpermdi $t4,$in2[2],$in2[1],0b00
xxpermdi $t3,$in1[7],$in1[8],0b00
vmsumudm $out[0],$t3,$t4,$out[0]

___

store_vrs($outp, \@out);

pop_vrs(52, 63);

endproc("p521_felem_mul");
}
|
|
|
|
{
#
# p521_felem_square
#
# Generates a function that loads nine doublewords from the input
# pointer, accumulates the products in[i]*in[j] into nine vector
# registers and stores those registers through the output pointer.
# Each cross product is computed once, using a copy of the input
# shifted left by one bit (@inx2); products with i+j >= 9 go to
# out[i+j-9] and use an operand shifted by one further bit.
#

my ($inp) = ("r4");
my @in = map { "v$_" } (45 .. 53);
my @inx2 = map { "v$_" } (35 .. 43);

startproc("p521_felem_square");

push_vrs(52, 63);

# Clear the register used as the zero addend.
$code .= "vspltisw $vzero,0\n\n";

load_vrs($inp, \@in);

# Build the vsld shift-count vector in $t1 from the constants 1 and 0.
$code.=<<___;
li $zero,0
li $one,1
mtvsrdd $t1,$one,$zero
___

# inx2[k] = in[k] shifted left by the counts in $t1.
foreach my $limb (0 .. 8) {
    $code .= "vsld $inx2[$limb],$in[$limb],$t1\n";
}

# Products with i+j <= 8.  $t1 pairs in[0] with in[1] and $t3 pairs
# in[2] with in[3]; squares use the unshifted input on both sides.
$code.=<<___;
vmsumudm $out[0],$in[0],$in[0],$vzero

vmsumudm $out[1],$in[0],$inx2[1],$vzero

xxpermdi $t1,$in[0],$in[1],0b00
xxpermdi $t2,$inx2[2],$in[1],0b00
vmsumudm $out[2],$t1,$t2,$vzero

xxpermdi $t2,$inx2[3],$inx2[2],0b00
vmsumudm $out[3],$t1,$t2,$vzero

xxpermdi $t2,$inx2[4],$inx2[3],0b00
vmsumudm $out[4],$t1,$t2,$vzero
vmsumudm $out[4],$in[2],$in[2],$out[4]

xxpermdi $t2,$inx2[5],$inx2[4],0b00
vmsumudm $out[5],$t1,$t2,$vzero
vmsumudm $out[5],$in[2],$inx2[3],$out[5]

xxpermdi $t2,$inx2[6],$inx2[5],0b00
vmsumudm $out[6],$t1,$t2,$vzero
xxpermdi $t3,$in[2],$in[3],0b00
xxpermdi $t4,$inx2[4],$in[3],0b00
vmsumudm $out[6],$t3,$t4,$out[6]

xxpermdi $t2,$inx2[7],$inx2[6],0b00
vmsumudm $out[7],$t1,$t2,$vzero
xxpermdi $t4,$inx2[5],$inx2[4],0b00
vmsumudm $out[7],$t3,$t4,$out[7]

xxpermdi $t2,$inx2[8],$inx2[7],0b00
vmsumudm $out[8],$t1,$t2,$vzero
xxpermdi $t4,$inx2[6],$inx2[5],0b00
vmsumudm $out[8],$t3,$t4,$out[8]
vmsumudm $out[8],$in[4],$in[4],$out[8]

___

# Squares of in[5]..in[8]: these have i+j >= 10 and are accumulated
# into out[2*i-9] using the once-shifted copy.
$code.=<<___;
vmsumudm $out[1],$in[5],$inx2[5],$out[1]

vmsumudm $out[3],$in[6],$inx2[6],$out[3]

vmsumudm $out[5],$in[7],$inx2[7],$out[5]

vmsumudm $out[7],$in[8],$inx2[8],$out[7]

___

# $t1 was overwritten by xxpermdi above, so rebuild the shift-count
# vector before shifting again.
$code .= "mtvsrdd $t1,$one,$zero\n";

# Shift inx2[5]..inx2[8] left once more, in place.
foreach my $limb (5 .. 8) {
    $code .= "vsld $inx2[$limb],$inx2[$limb],$t1\n";
}

# Cross products with i+j >= 9, accumulated into out[i+j-9] using the
# twice-shifted inx2[5]..inx2[8].
$code.=<<___;

vmsumudm $out[6],$in[7],$inx2[8],$out[6]

vmsumudm $out[5],$in[6],$inx2[8],$out[5]

xxpermdi $t2,$inx2[8],$inx2[7],0b00
xxpermdi $t1,$in[5],$in[6],0b00
vmsumudm $out[4],$t1,$t2,$out[4]

xxpermdi $t1,$in[4],$in[5],0b00
vmsumudm $out[3],$t1,$t2,$out[3]

xxpermdi $t1,$in[3],$in[4],0b00
vmsumudm $out[2],$t1,$t2,$out[2]
vmsumudm $out[2],$in[5],$inx2[6],$out[2]

xxpermdi $t1,$in[2],$in[3],0b00
vmsumudm $out[1],$t1,$t2,$out[1]
vmsumudm $out[1],$in[4],$inx2[6],$out[1]

xxpermdi $t1,$in[1],$in[2],0b00
vmsumudm $out[0],$t1,$t2,$out[0]
xxpermdi $t2,$inx2[6],$inx2[5],0b00
xxpermdi $t1,$in[3],$in[4],0b00
vmsumudm $out[0],$t1,$t2,$out[0]

___

store_vrs($outp, \@out);

pop_vrs(52, 63);

endproc("p521_felem_square");
}
|
|
}
|
|
|
|
# Replace every `expr` in the generated text with the result of
# evaluating expr as Perl (push_vrs emits one such expression for the
# stack frame size).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

# STDOUT was aliased to the pipe into ppc-xlate.pl earlier, so this
# hands the text to the translator.  Write errors on the pipe are
# reported when it is closed.
print STDOUT $code;
close(STDOUT) or die "error closing STDOUT: $!";
|