bn: Add fixed length (n=6), unrolled PPC Montgomery Multiplication

Overall improvement for p384 of ~18% on Power 9, compared to existing
Power assembling code.  See comment in code for more details.

Multiple unrolled versions could be generated for values other than
6.  However, for TLS 1.3 the only other ECC algorithms that might use
Montgomery Multiplication are p256 and p521, but these have custom
algorithms that don't use Montgomery Multiplication.  Non-ECC
algorithms are likely to use larger key lengths that won't fit into
the n <= 10 length limitation of this code.

Signed-off-by: Amitay Isaacs <amitay@ozlabs.org>
Signed-off-by: Alastair D'Silva <alastair@d-silva.org>
Signed-off-by: Martin Schwenke <martin@meltin.net>

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/15175)
This commit is contained in:
Martin Schwenke 2021-04-14 14:31:58 +10:00 committed by Pauli
parent 531df8185f
commit 0d40ca47bd
6 changed files with 602 additions and 2 deletions

585
crypto/bn/asm/ppc64-mont-fixed.pl Executable file
View File

@ -0,0 +1,585 @@
#! /usr/bin/env perl
# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
# the OpenSSL project.
# ====================================================================
#
# Fixed length (n=6), unrolled PPC Montgomery Multiplication
#
# 2021
#
# Although this is a generic implementation for unrolling Montgomery
# Multiplication for arbitrary values of n, this is currently only
# used for n = 6 to improve the performance of ECC p384.
#
# Unrolling allows intermediate results to be stored in registers,
# rather than on the stack, improving performance by ~7% compared to
# the existing PPC assembly code.
#
# The ISA 3.0 implementation uses combination multiply/add
# instructions (maddld, maddhdu) to improve performance by an
# additional ~10% on Power 9.
#
# Finally, saving non-volatile registers into volatile vector
# registers instead of onto the stack saves a little more.
#
# On a Power 9 machine we see an overall improvement of ~18%.
#
use strict;
use warnings;
my ($flavour, $output, $dir, $xlate);
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
if ($flavour !~ /64/) {
die "bad flavour ($flavour) - only ppc64 permitted";
}
my $SIZE_T= 8;
# Registers are global so the code is remotely readable
# Parameters for Montgomery multiplication
my $sp = "r1";
my $toc = "r2";
my $rp = "r3";
my $ap = "r4";
my $bp = "r5";
my $np = "r6";
my $n0 = "r7";
my $num = "r8";
$rp = "r9"; # $rp is reassigned
my $c0 = "r10";
my $bp0 = "r11";
my $bpi = "r11";
my $bpj = "r11";
my $tj = "r12";
my $apj = "r12";
my $npj = "r12";
my $lo = "r14";
my $c1 = "r14";
my $i = "r15";
# Non-volatile registers used for tp[i]
#
# 12 registers are available but the limit on unrolling is 10,
# since registers from $tp[0] to $tp[$n+1] are used.
my @tp = ("r20" .. "r31");
# volatile VSRs for saving non-volatile GPRs - faster than stack
my @vsrs = ("v32" .. "v46");
package Mont;
sub new($$)
{
my ($class, $n) = @_;
if ($n > 10) {
die "Can't unroll for BN length ${n} (maximum 10)"
}
my $self = {
code => "",
n => $n,
};
bless $self, $class;
return $self;
}
sub add_code($$)
{
my ($self, $c) = @_;
$self->{code} .= $c;
}
sub get_code($)
{
my ($self) = @_;
return $self->{code};
}
sub get_function_name($)
{
my ($self) = @_;
return "bn_mul_mont_fixed_n" . $self->{n};
}
sub get_label($$)
{
my ($self, $l) = @_;
return "L" . $l . "_" . $self->{n};
}
sub get_labels($@)
{
my ($self, @labels) = @_;
my %out = ();
foreach my $l (@labels) {
$out{"$l"} = $self->get_label("$l");
}
return \%out;
}
sub nl($)
{
my ($self) = @_;
$self->add_code("\n");
}
sub copy_result($)
{
my ($self) = @_;
my ($n) = $self->{n};
for (my $j = 0; $j < $n; $j++) {
$self->add_code(<<___);
std $tp[$j],`$j*$SIZE_T`($rp)
___
}
}
sub mul_mont_fixed($)
{
my ($self) = @_;
my ($n) = $self->{n};
my $fname = $self->get_function_name();
my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
$self->add_code(<<___);
.globl .${fname}
.${fname}:
mr $rp,r3
___
$self->save_registers();
$self->add_code(<<___);
ld $n0,0($n0)
ld $bp0,0($bp)
ld $apj,0($ap)
___
$self->mul_c_0($tp[0], $apj, $bp0, $c0);
for (my $j = 1; $j < $n - 1; $j++) {
$self->add_code(<<___);
ld $apj,`$j*$SIZE_T`($ap)
___
$self->mul($tp[$j], $apj, $bp0, $c0);
}
$self->add_code(<<___);
ld $apj,`($n-1)*$SIZE_T`($ap)
___
$self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
$self->add_code(<<___);
li $tp[$n+1],0
___
$self->add_code(<<___);
li $i,0
mtctr $num
b $label->{"enter"}
$label->{"outer"}:
ldx $bpi,$bp,$i
ld $apj,0($ap)
___
$self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
for (my $j = 1; $j < $n; $j++) {
$self->add_code(<<___);
ld $apj,`$j*$SIZE_T`($ap)
___
$self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
}
$self->add_code(<<___);
addc $tp[$n],$tp[$n],$c0
addze $tp[$n+1],$tp[$n+1]
___
$self->add_code(<<___);
$label->{"enter"}:
mulld $bpi,$tp[0],$n0
ld $npj,0($np)
___
$self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
for (my $j = 1; $j < $n; $j++) {
$self->add_code(<<___);
ld $npj,`$j*$SIZE_T`($np)
___
$self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
}
$self->add_code(<<___);
addc $tp[$n-1],$tp[$n],$c0
addze $tp[$n],$tp[$n+1]
addi $i,$i,$SIZE_T
bc 25,0,$label->{"outer"}
and. $tp[$n],$tp[$n],$tp[$n]
bne $label->{"sub"}
cmpld $tp[$n-1],$npj
blt $label->{"copy"}
$label->{"sub"}:
___
#
# Reduction
#
$self->add_code(<<___);
ld $bpj,`0*$SIZE_T`($np)
subfc $c1,$bpj,$tp[0]
std $c1,`0*$SIZE_T`($rp)
___
for (my $j = 1; $j < $n - 1; $j++) {
$self->add_code(<<___);
ld $bpj,`$j*$SIZE_T`($np)
subfe $c1,$bpj,$tp[$j]
std $c1,`$j*$SIZE_T`($rp)
___
}
$self->add_code(<<___);
subfe $c1,$npj,$tp[$n-1]
std $c1,`($n-1)*$SIZE_T`($rp)
___
$self->add_code(<<___);
addme. $tp[$n],$tp[$n]
beq $label->{"end"}
$label->{"copy"}:
___
$self->copy_result();
$self->add_code(<<___);
$label->{"end"}:
___
$self->restore_registers();
$self->add_code(<<___);
li r3,1
blr
.size ${fname},.-${fname}
___
}
package Mont::GPR;
our @ISA = ('Mont');
sub new($$)
{
my ($class, $n) = @_;
return $class->SUPER::new($n);
}
sub save_registers($)
{
my ($self) = @_;
my $n = $self->{n};
$self->add_code(<<___);
mtvsrd $vsrs[0],$lo
mtvsrd $vsrs[1],$i
___
for (my $j = 0; $j <= $n+1; $j++) {
$self->{code}.=<<___;
mtvsrd $vsrs[$j+2],$tp[$j]
___
}
$self->add_code(<<___);
___
}
sub restore_registers($)
{
my ($self) = @_;
my $n = $self->{n};
$self->add_code(<<___);
mfvsrd $lo,$vsrs[0]
mfvsrd $i,$vsrs[1]
___
for (my $j = 0; $j <= $n+1; $j++) {
$self->{code}.=<<___;
mfvsrd $tp[$j],$vsrs[$j+2]
___
}
$self->{code} .=<<___;
___
}
# Direct translation of C mul()
sub mul($$$$$)
{
my ($self, $r, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $lo,$a,$w
addc $r,$lo,$c
mulhdu $c,$a,$w
addze $c,$c
___
}
# Like mul() but $c is ignored as an input - an optimisation to save a
# preliminary instruction that would set input $c to 0
sub mul_c_0($$$$$)
{
my ($self, $r, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $r,$a,$w
mulhdu $c,$a,$w
___
}
# Like mul() but does not to the final addition of CA into $c - an
# optimisation to save an instruction
sub mul_last($$$$$$)
{
my ($self, $r1, $r2, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $lo,$a,$w
addc $r1,$lo,$c
mulhdu $c,$a,$w
addze $r2,$c
___
}
# Like C mul_add() but allow $r_out and $r_in to be different
sub mul_add($$$$$$)
{
my ($self, $r_out, $r_in, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $lo,$a,$w
addc $lo,$lo,$c
mulhdu $c,$a,$w
addze $c,$c
addc $r_out,$r_in,$lo
addze $c,$c
___
}
# Like mul_add() but $c is ignored as an input - an optimisation to save a
# preliminary instruction that would set input $c to 0
sub mul_add_c_0($$$$$$)
{
my ($self, $r_out, $r_in, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $lo,$a,$w
addc $r_out,$r_in,$lo
mulhdu $c,$a,$w
addze $c,$c
___
}
package Mont::GPR_300;
our @ISA = ('Mont::GPR');
sub new($$)
{
my ($class, $n) = @_;
my $mont = $class->SUPER::new($n);
return $mont;
}
sub get_function_name($)
{
my ($self) = @_;
return "bn_mul_mont_300_fixed_n" . $self->{n};
}
sub get_label($$)
{
my ($self, $l) = @_;
return "L" . $l . "_300_" . $self->{n};
}
# Direct translation of C mul()
sub mul($$$$$)
{
my ($self, $r, $a, $w, $c, $last) = @_;
$self->add_code(<<___);
maddld $r,$a,$w,$c
maddhdu $c,$a,$w,$c
___
}
# Save the last carry as the final entry
sub mul_last($$$$$)
{
my ($self, $r1, $r2, $a, $w, $c) = @_;
$self->add_code(<<___);
maddld $r1,$a,$w,$c
maddhdu $r2,$a,$w,$c
___
}
# Like mul() but $c is ignored as an input - an optimisation to save a
# preliminary instruction that would set input $c to 0
sub mul_c_0($$$$$)
{
my ($self, $r, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $r,$a,$w
mulhdu $c,$a,$w
___
}
# Like C mul_add() but allow $r_out and $r_in to be different
sub mul_add($$$$$$)
{
my ($self, $r_out, $r_in, $a, $w, $c) = @_;
$self->add_code(<<___);
maddld $lo,$a,$w,$c
maddhdu $c,$a,$w,$c
addc $r_out,$r_in,$lo
addze $c,$c
___
}
# Like mul_add() but $c is ignored as an input - an optimisation to save a
# preliminary instruction that would set input $c to 0
sub mul_add_c_0($$$$$$)
{
my ($self, $r_out, $r_in, $a, $w, $c) = @_;
$self->add_code(<<___);
maddld $lo,$a,$w,$r_in
maddhdu $c,$a,$w,$r_in
___
if ($r_out ne $lo) {
$self->add_code(<<___);
mr $r_out,$lo
___
}
$self->nl();
}
package main;
my $code;
$code.=<<___;
.machine "any"
.text
.align 5
.p2align 5,,31
___
my $mont;
$mont = new Mont::GPR(6);
$mont->mul_mont_fixed();
$code .= $mont->get_code();
$mont = new Mont::GPR_300(6);
$mont->mul_mont_fixed();
$code .= $mont->get_code();
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code.=<<___;
.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
___
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}]
$BNASM_ppc32=bn-ppc.s ppc-mont.s
$BNDEF_ppc32=OPENSSL_BN_ASM_MONT
$BNASM_ppc64=$BNASM_ppc32
$BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s
$BNDEF_ppc64=$BNDEF_ppc32
$BNASM_c64xplus=asm/bn-c64xplus.asm
@ -168,6 +168,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl
GENERATE[bn-ppc.s]=asm/ppc.pl
GENERATE[ppc-mont.s]=asm/ppc-mont.pl
GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl
GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl
GENERATE[alpha-mont.S]=asm/alpha-mont.pl

View File

@ -47,6 +47,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num);
int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num);
int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
const BN_ULONG *bp, const BN_ULONG *np,
const BN_ULONG *n0, int num);
int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
const BN_ULONG *bp, const BN_ULONG *np,
const BN_ULONG *n0, int num);
if (num < 4)
return 0;
@ -62,6 +68,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
* no opportunity to figure it out...
*/
if (num == 6)
if (OPENSSL_ppccap_P & PPC_MADD300)
return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);
else
return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);
return bn_mul_mont_int(rp, ap, bp, np, n0, num);
}
#endif

View File

@ -42,6 +42,7 @@ eb240c1f72063048abe026ab7fab340361a329d5cd355276a25950be446cc091 crypto/bn/asm/
b27ec5181e387e812925bb26823b830f49d7a6e4971b6d11ea583f5632a1504b crypto/bn/asm/parisc-mont.pl
9973523b361db963eea4938a7a8a3adc692e1a4e1aec4fa1f1e57dc93da37921 crypto/bn/asm/ppc-mont.pl
59cd27e1e10c4984b7fb684b27f491e7634473b1bcff197a07e0ca653124aa9a crypto/bn/asm/ppc.pl
13ba6625cc6c673dc6f7ef69a7bbe40487c5553b3873a996af4904de5b1cd82b crypto/bn/asm/ppc64-mont-fixed.pl
a25be64867ab837d93855af232e2bfa71b85b2c6f00e35e620fdc5618187fb6f crypto/bn/asm/ppc64-mont.pl
231579e532443665020d4d522d9f11713d9c5d5c814b95b434b0f65452e16de4 crypto/bn/asm/rsaz-avx2.pl
c9bd8679a5104affd9f3f0bcda726f823a1a53cac872e4a21a6f2370489dae08 crypto/bn/asm/rsaz-avx512.pl

View File

@ -1 +1 @@
2e67c3ed3222fedf2d26e91f47b2b7708a95f39a74bd1489412f324f84daa57d providers/fips-sources.checksums
4fcfc6375eef7bed6219191cce24513be04a6ebb8b2d5da8e404150a2ecc0eba providers/fips-sources.checksums

View File

@ -42,6 +42,7 @@ crypto/bn/asm/mips.pl
crypto/bn/asm/parisc-mont.pl
crypto/bn/asm/ppc-mont.pl
crypto/bn/asm/ppc.pl
crypto/bn/asm/ppc64-mont-fixed.pl
crypto/bn/asm/ppc64-mont.pl
crypto/bn/asm/rsaz-avx2.pl
crypto/bn/asm/rsaz-avx512.pl