openssl/crypto/bn/asm/ppc64-mont-fixed.pl
Martin Schwenke 0d40ca47bd bn: Add fixed length (n=6), unrolled PPC Montgomery Multiplication
Overall improvement for p384 of ~18% on Power 9, compared to existing
Power assembling code.  See comment in code for more details.

Multiple unrolled versions could be generated for values other than
6.  However, for TLS 1.3 the only other ECC algorithms that might use
Montgomery Multiplication are p256 and p521, but these have custom
algorithms that don't use Montgomery Multiplication.  Non-ECC
algorithms are likely to use larger key lengths that won't fit into
the n <= 10 length limitation of this code.

Signed-off-by: Amitay Isaacs <amitay@ozlabs.org>
Signed-off-by: Alastair D'Silva <alastair@d-silva.org>
Signed-off-by: Martin Schwenke <martin@meltin.net>

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/15175)
2021-05-08 20:39:29 +10:00

586 lines
9.8 KiB
Perl
Executable File

#! /usr/bin/env perl
# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
# the OpenSSL project.
# ====================================================================
#
# Fixed length (n=6), unrolled PPC Montgomery Multiplication
#
# 2021
#
# Although this is a generic implementation for unrolling Montgomery
# Multiplication for arbitrary values of n, this is currently only
# used for n = 6 to improve the performance of ECC p384.
#
# Unrolling allows intermediate results to be stored in registers,
# rather than on the stack, improving performance by ~7% compared to
# the existing PPC assembly code.
#
# The ISA 3.0 implementation uses combination multiply/add
# instructions (maddld, maddhdu) to improve performance by an
# additional ~10% on Power 9.
#
# Finally, saving non-volatile registers into volatile vector
# registers instead of onto the stack saves a little more.
#
# On a Power 9 machine we see an overall improvement of ~18%.
#
use strict;
use warnings;
my ($flavour, $output, $dir, $xlate);
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
if ($flavour !~ /64/) {
die "bad flavour ($flavour) - only ppc64 permitted";
}
my $SIZE_T= 8;
# Registers are global so the code is remotely readable
# Parameters for Montgomery multiplication
my $sp = "r1";
my $toc = "r2";
my $rp = "r3";
my $ap = "r4";
my $bp = "r5";
my $np = "r6";
my $n0 = "r7";
my $num = "r8";
$rp = "r9"; # $rp is reassigned
my $c0 = "r10";
my $bp0 = "r11";
my $bpi = "r11";
my $bpj = "r11";
my $tj = "r12";
my $apj = "r12";
my $npj = "r12";
my $lo = "r14";
my $c1 = "r14";
my $i = "r15";
# Non-volatile registers used for tp[i]
#
# 12 registers are available but the limit on unrolling is 10,
# since registers from $tp[0] to $tp[$n+1] are used.
my @tp = ("r20" .. "r31");
# volatile VSRs for saving non-volatile GPRs - faster than stack
my @vsrs = ("v32" .. "v46");
package Mont;
sub new($$)
{
my ($class, $n) = @_;
if ($n > 10) {
die "Can't unroll for BN length ${n} (maximum 10)"
}
my $self = {
code => "",
n => $n,
};
bless $self, $class;
return $self;
}
sub add_code($$)
{
my ($self, $c) = @_;
$self->{code} .= $c;
}
sub get_code($)
{
my ($self) = @_;
return $self->{code};
}
sub get_function_name($)
{
my ($self) = @_;
return "bn_mul_mont_fixed_n" . $self->{n};
}
sub get_label($$)
{
my ($self, $l) = @_;
return "L" . $l . "_" . $self->{n};
}
sub get_labels($@)
{
my ($self, @labels) = @_;
my %out = ();
foreach my $l (@labels) {
$out{"$l"} = $self->get_label("$l");
}
return \%out;
}
sub nl($)
{
my ($self) = @_;
$self->add_code("\n");
}
sub copy_result($)
{
my ($self) = @_;
my ($n) = $self->{n};
for (my $j = 0; $j < $n; $j++) {
$self->add_code(<<___);
std $tp[$j],`$j*$SIZE_T`($rp)
___
}
}
sub mul_mont_fixed($)
{
my ($self) = @_;
my ($n) = $self->{n};
my $fname = $self->get_function_name();
my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
$self->add_code(<<___);
.globl .${fname}
.${fname}:
mr $rp,r3
___
$self->save_registers();
$self->add_code(<<___);
ld $n0,0($n0)
ld $bp0,0($bp)
ld $apj,0($ap)
___
$self->mul_c_0($tp[0], $apj, $bp0, $c0);
for (my $j = 1; $j < $n - 1; $j++) {
$self->add_code(<<___);
ld $apj,`$j*$SIZE_T`($ap)
___
$self->mul($tp[$j], $apj, $bp0, $c0);
}
$self->add_code(<<___);
ld $apj,`($n-1)*$SIZE_T`($ap)
___
$self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
$self->add_code(<<___);
li $tp[$n+1],0
___
$self->add_code(<<___);
li $i,0
mtctr $num
b $label->{"enter"}
$label->{"outer"}:
ldx $bpi,$bp,$i
ld $apj,0($ap)
___
$self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
for (my $j = 1; $j < $n; $j++) {
$self->add_code(<<___);
ld $apj,`$j*$SIZE_T`($ap)
___
$self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
}
$self->add_code(<<___);
addc $tp[$n],$tp[$n],$c0
addze $tp[$n+1],$tp[$n+1]
___
$self->add_code(<<___);
$label->{"enter"}:
mulld $bpi,$tp[0],$n0
ld $npj,0($np)
___
$self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
for (my $j = 1; $j < $n; $j++) {
$self->add_code(<<___);
ld $npj,`$j*$SIZE_T`($np)
___
$self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
}
$self->add_code(<<___);
addc $tp[$n-1],$tp[$n],$c0
addze $tp[$n],$tp[$n+1]
addi $i,$i,$SIZE_T
bc 25,0,$label->{"outer"}
and. $tp[$n],$tp[$n],$tp[$n]
bne $label->{"sub"}
cmpld $tp[$n-1],$npj
blt $label->{"copy"}
$label->{"sub"}:
___
#
# Reduction
#
$self->add_code(<<___);
ld $bpj,`0*$SIZE_T`($np)
subfc $c1,$bpj,$tp[0]
std $c1,`0*$SIZE_T`($rp)
___
for (my $j = 1; $j < $n - 1; $j++) {
$self->add_code(<<___);
ld $bpj,`$j*$SIZE_T`($np)
subfe $c1,$bpj,$tp[$j]
std $c1,`$j*$SIZE_T`($rp)
___
}
$self->add_code(<<___);
subfe $c1,$npj,$tp[$n-1]
std $c1,`($n-1)*$SIZE_T`($rp)
___
$self->add_code(<<___);
addme. $tp[$n],$tp[$n]
beq $label->{"end"}
$label->{"copy"}:
___
$self->copy_result();
$self->add_code(<<___);
$label->{"end"}:
___
$self->restore_registers();
$self->add_code(<<___);
li r3,1
blr
.size ${fname},.-${fname}
___
}
package Mont::GPR;
our @ISA = ('Mont');
sub new($$)
{
my ($class, $n) = @_;
return $class->SUPER::new($n);
}
sub save_registers($)
{
my ($self) = @_;
my $n = $self->{n};
$self->add_code(<<___);
mtvsrd $vsrs[0],$lo
mtvsrd $vsrs[1],$i
___
for (my $j = 0; $j <= $n+1; $j++) {
$self->{code}.=<<___;
mtvsrd $vsrs[$j+2],$tp[$j]
___
}
$self->add_code(<<___);
___
}
sub restore_registers($)
{
my ($self) = @_;
my $n = $self->{n};
$self->add_code(<<___);
mfvsrd $lo,$vsrs[0]
mfvsrd $i,$vsrs[1]
___
for (my $j = 0; $j <= $n+1; $j++) {
$self->{code}.=<<___;
mfvsrd $tp[$j],$vsrs[$j+2]
___
}
$self->{code} .=<<___;
___
}
# Direct translation of C mul()
sub mul($$$$$)
{
my ($self, $r, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $lo,$a,$w
addc $r,$lo,$c
mulhdu $c,$a,$w
addze $c,$c
___
}
# Like mul() but $c is ignored as an input - an optimisation to save a
# preliminary instruction that would set input $c to 0
sub mul_c_0($$$$$)
{
my ($self, $r, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $r,$a,$w
mulhdu $c,$a,$w
___
}
# Like mul() but does not to the final addition of CA into $c - an
# optimisation to save an instruction
sub mul_last($$$$$$)
{
my ($self, $r1, $r2, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $lo,$a,$w
addc $r1,$lo,$c
mulhdu $c,$a,$w
addze $r2,$c
___
}
# Like C mul_add() but allow $r_out and $r_in to be different
sub mul_add($$$$$$)
{
my ($self, $r_out, $r_in, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $lo,$a,$w
addc $lo,$lo,$c
mulhdu $c,$a,$w
addze $c,$c
addc $r_out,$r_in,$lo
addze $c,$c
___
}
# Like mul_add() but $c is ignored as an input - an optimisation to save a
# preliminary instruction that would set input $c to 0
sub mul_add_c_0($$$$$$)
{
my ($self, $r_out, $r_in, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $lo,$a,$w
addc $r_out,$r_in,$lo
mulhdu $c,$a,$w
addze $c,$c
___
}
package Mont::GPR_300;
our @ISA = ('Mont::GPR');
sub new($$)
{
my ($class, $n) = @_;
my $mont = $class->SUPER::new($n);
return $mont;
}
sub get_function_name($)
{
my ($self) = @_;
return "bn_mul_mont_300_fixed_n" . $self->{n};
}
sub get_label($$)
{
my ($self, $l) = @_;
return "L" . $l . "_300_" . $self->{n};
}
# Direct translation of C mul()
sub mul($$$$$)
{
my ($self, $r, $a, $w, $c, $last) = @_;
$self->add_code(<<___);
maddld $r,$a,$w,$c
maddhdu $c,$a,$w,$c
___
}
# Save the last carry as the final entry
sub mul_last($$$$$)
{
my ($self, $r1, $r2, $a, $w, $c) = @_;
$self->add_code(<<___);
maddld $r1,$a,$w,$c
maddhdu $r2,$a,$w,$c
___
}
# Like mul() but $c is ignored as an input - an optimisation to save a
# preliminary instruction that would set input $c to 0
sub mul_c_0($$$$$)
{
my ($self, $r, $a, $w, $c) = @_;
$self->add_code(<<___);
mulld $r,$a,$w
mulhdu $c,$a,$w
___
}
# Like C mul_add() but allow $r_out and $r_in to be different
sub mul_add($$$$$$)
{
my ($self, $r_out, $r_in, $a, $w, $c) = @_;
$self->add_code(<<___);
maddld $lo,$a,$w,$c
maddhdu $c,$a,$w,$c
addc $r_out,$r_in,$lo
addze $c,$c
___
}
# Like mul_add() but $c is ignored as an input - an optimisation to save a
# preliminary instruction that would set input $c to 0
sub mul_add_c_0($$$$$$)
{
my ($self, $r_out, $r_in, $a, $w, $c) = @_;
$self->add_code(<<___);
maddld $lo,$a,$w,$r_in
maddhdu $c,$a,$w,$r_in
___
if ($r_out ne $lo) {
$self->add_code(<<___);
mr $r_out,$lo
___
}
$self->nl();
}
package main;
my $code;
$code.=<<___;
.machine "any"
.text
.align 5
.p2align 5,,31
___
my $mont;
$mont = new Mont::GPR(6);
$mont->mul_mont_fixed();
$code .= $mont->get_code();
$mont = new Mont::GPR_300(6);
$mont->mul_mont_fixed();
$code .= $mont->get_code();
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code.=<<___;
.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
___
print $code;
close STDOUT or die "error closing STDOUT: $!";