#! /usr/bin/env perl # Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Amitay Isaacs , Martin Schwenke # & Alastair D'Silva for # the OpenSSL project. # ==================================================================== # # Fixed length (n=6), unrolled PPC Montgomery Multiplication # # 2021 # # Although this is a generic implementation for unrolling Montgomery # Multiplication for arbitrary values of n, this is currently only # used for n = 6 to improve the performance of ECC p384. # # Unrolling allows intermediate results to be stored in registers, # rather than on the stack, improving performance by ~7% compared to # the existing PPC assembly code. # # The ISA 3.0 implementation uses combination multiply/add # instructions (maddld, maddhdu) to improve performance by an # additional ~10% on Power 9. # # Finally, saving non-volatile registers into volatile vector # registers instead of onto the stack saves a little more. # # On a Power 9 machine we see an overall improvement of ~18%. # use strict; use warnings; my ($flavour, $output, $dir, $xlate); # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or die "can't locate ppc-xlate.pl"; open STDOUT,"| $^X $xlate $flavour \"$output\"" or die "can't call $xlate: $!"; if ($flavour !~ /64/) { die "bad flavour ($flavour) - only ppc64 permitted"; } my $SIZE_T= 8; # Registers are global so the code is remotely readable # Parameters for Montgomery multiplication my $sp = "r1"; my $toc = "r2"; my $rp = "r3"; my $ap = "r4"; my $bp = "r5"; my $np = "r6"; my $n0 = "r7"; my $num = "r8"; my $i = "r9"; my $c0 = "r10"; my $bp0 = "r11"; my $bpi = "r11"; my $bpj = "r11"; my $tj = "r12"; my $apj = "r12"; my $npj = "r12"; my $lo = "r14"; my $c1 = "r14"; # Non-volatile registers used for tp[i] # # 12 registers are available but the limit on unrolling is 10, # since registers from $tp[0] to $tp[$n+1] are used. my @tp = ("r20" .. "r31"); # volatile VSRs for saving non-volatile GPRs - faster than stack my @vsrs = ("v32" .. "v46"); package Mont; sub new($$) { my ($class, $n) = @_; if ($n > 10) { die "Can't unroll for BN length ${n} (maximum 10)" } my $self = { code => "", n => $n, }; bless $self, $class; return $self; } sub add_code($$) { my ($self, $c) = @_; $self->{code} .= $c; } sub get_code($) { my ($self) = @_; return $self->{code}; } sub get_function_name($) { my ($self) = @_; return "bn_mul_mont_fixed_n" . $self->{n}; } sub get_label($$) { my ($self, $l) = @_; return "L" . $l . "_" . $self->{n}; } sub get_labels($@) { my ($self, @labels) = @_; my %out = (); foreach my $l (@labels) { $out{"$l"} = $self->get_label("$l"); } return \%out; } sub nl($) { my ($self) = @_; $self->add_code("\n"); } sub copy_result($) { my ($self) = @_; my ($n) = $self->{n}; for (my $j = 0; $j < $n; $j++) { $self->add_code(<<___); std $tp[$j],`$j*$SIZE_T`($rp) ___ } } sub mul_mont_fixed($) { my ($self) = @_; my ($n) = $self->{n}; my $fname = $self->get_function_name(); my $label = $self->get_labels("outer", "enter", "sub", "copy", "end"); $self->add_code(<<___); .globl .${fname} .align 5 .${fname}: ___ $self->save_registers(); $self->add_code(<<___); ld $n0,0($n0) ld $bp0,0($bp) ld $apj,0($ap) ___ $self->mul_c_0($tp[0], $apj, $bp0, $c0); for (my $j = 1; $j < $n - 1; $j++) { $self->add_code(<<___); ld $apj,`$j*$SIZE_T`($ap) ___ $self->mul($tp[$j], $apj, $bp0, $c0); } $self->add_code(<<___); ld $apj,`($n-1)*$SIZE_T`($ap) ___ $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0); $self->add_code(<<___); li $tp[$n+1],0 ___ $self->add_code(<<___); li $i,0 mtctr $num b $label->{"enter"} .align 4 $label->{"outer"}: ldx $bpi,$bp,$i ld $apj,0($ap) ___ $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0); for (my $j = 1; $j < $n; $j++) { $self->add_code(<<___); ld $apj,`$j*$SIZE_T`($ap) ___ $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0); } $self->add_code(<<___); addc $tp[$n],$tp[$n],$c0 addze $tp[$n+1],$tp[$n+1] ___ $self->add_code(<<___); .align 4 $label->{"enter"}: mulld $bpi,$tp[0],$n0 ld $npj,0($np) ___ $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0); for (my $j = 1; $j < $n; $j++) { $self->add_code(<<___); ld $npj,`$j*$SIZE_T`($np) ___ $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0); } $self->add_code(<<___); addc $tp[$n-1],$tp[$n],$c0 addze $tp[$n],$tp[$n+1] addi $i,$i,$SIZE_T bdnz $label->{"outer"} and. $tp[$n],$tp[$n],$tp[$n] bne $label->{"sub"} cmpld $tp[$n-1],$npj blt $label->{"copy"} $label->{"sub"}: ___ # # Reduction # $self->add_code(<<___); ld $bpj,`0*$SIZE_T`($np) subfc $c1,$bpj,$tp[0] std $c1,`0*$SIZE_T`($rp) ___ for (my $j = 1; $j < $n - 1; $j++) { $self->add_code(<<___); ld $bpj,`$j*$SIZE_T`($np) subfe $c1,$bpj,$tp[$j] std $c1,`$j*$SIZE_T`($rp) ___ } $self->add_code(<<___); subfe $c1,$npj,$tp[$n-1] std $c1,`($n-1)*$SIZE_T`($rp) ___ $self->add_code(<<___); addme. $tp[$n],$tp[$n] beq $label->{"end"} $label->{"copy"}: ___ $self->copy_result(); $self->add_code(<<___); $label->{"end"}: ___ $self->restore_registers(); $self->add_code(<<___); li r3,1 blr .size .${fname},.-.${fname} ___ } package Mont::GPR; our @ISA = ('Mont'); sub new($$) { my ($class, $n) = @_; return $class->SUPER::new($n); } sub save_registers($) { my ($self) = @_; my $n = $self->{n}; $self->add_code(<<___); std $lo,-8($sp) ___ for (my $j = 0; $j <= $n+1; $j++) { $self->{code}.=<<___; std $tp[$j],-`($j+2)*8`($sp) ___ } $self->add_code(<<___); ___ } sub restore_registers($) { my ($self) = @_; my $n = $self->{n}; $self->add_code(<<___); ld $lo,-8($sp) ___ for (my $j = 0; $j <= $n+1; $j++) { $self->{code}.=<<___; ld $tp[$j],-`($j+2)*8`($sp) ___ } $self->{code} .=<<___; ___ } # Direct translation of C mul() sub mul($$$$$) { my ($self, $r, $a, $w, $c) = @_; $self->add_code(<<___); mulld $lo,$a,$w addc $r,$lo,$c mulhdu $c,$a,$w addze $c,$c ___ } # Like mul() but $c is ignored as an input - an optimisation to save a # preliminary instruction that would set input $c to 0 sub mul_c_0($$$$$) { my ($self, $r, $a, $w, $c) = @_; $self->add_code(<<___); mulld $r,$a,$w mulhdu $c,$a,$w ___ } # Like mul() but does not to the final addition of CA into $c - an # optimisation to save an instruction sub mul_last($$$$$$) { my ($self, $r1, $r2, $a, $w, $c) = @_; $self->add_code(<<___); mulld $lo,$a,$w addc $r1,$lo,$c mulhdu $c,$a,$w addze $r2,$c ___ } # Like C mul_add() but allow $r_out and $r_in to be different sub mul_add($$$$$$) { my ($self, $r_out, $r_in, $a, $w, $c) = @_; $self->add_code(<<___); mulld $lo,$a,$w addc $lo,$lo,$c mulhdu $c,$a,$w addze $c,$c addc $r_out,$r_in,$lo addze $c,$c ___ } # Like mul_add() but $c is ignored as an input - an optimisation to save a # preliminary instruction that would set input $c to 0 sub mul_add_c_0($$$$$$) { my ($self, $r_out, $r_in, $a, $w, $c) = @_; $self->add_code(<<___); mulld $lo,$a,$w addc $r_out,$r_in,$lo mulhdu $c,$a,$w addze $c,$c ___ } package Mont::GPR_300; our @ISA = ('Mont::GPR'); sub new($$) { my ($class, $n) = @_; my $mont = $class->SUPER::new($n); return $mont; } sub get_function_name($) { my ($self) = @_; return "bn_mul_mont_300_fixed_n" . $self->{n}; } sub get_label($$) { my ($self, $l) = @_; return "L" . $l . "_300_" . $self->{n}; } # Direct translation of C mul() sub mul($$$$$) { my ($self, $r, $a, $w, $c, $last) = @_; $self->add_code(<<___); maddld $r,$a,$w,$c maddhdu $c,$a,$w,$c ___ } # Save the last carry as the final entry sub mul_last($$$$$) { my ($self, $r1, $r2, $a, $w, $c) = @_; $self->add_code(<<___); maddld $r1,$a,$w,$c maddhdu $r2,$a,$w,$c ___ } # Like mul() but $c is ignored as an input - an optimisation to save a # preliminary instruction that would set input $c to 0 sub mul_c_0($$$$$) { my ($self, $r, $a, $w, $c) = @_; $self->add_code(<<___); mulld $r,$a,$w mulhdu $c,$a,$w ___ } # Like C mul_add() but allow $r_out and $r_in to be different sub mul_add($$$$$$) { my ($self, $r_out, $r_in, $a, $w, $c) = @_; $self->add_code(<<___); maddld $lo,$a,$w,$c maddhdu $c,$a,$w,$c addc $r_out,$r_in,$lo addze $c,$c ___ } # Like mul_add() but $c is ignored as an input - an optimisation to save a # preliminary instruction that would set input $c to 0 sub mul_add_c_0($$$$$$) { my ($self, $r_out, $r_in, $a, $w, $c) = @_; $self->add_code(<<___); maddld $lo,$a,$w,$r_in maddhdu $c,$a,$w,$r_in ___ if ($r_out ne $lo) { $self->add_code(<<___); mr $r_out,$lo ___ } $self->nl(); } package main; my $code; $code.=<<___; .machine "any" .text ___ my $mont; $mont = new Mont::GPR(6); $mont->mul_mont_fixed(); $code .= $mont->get_code(); $mont = new Mont::GPR_300(6); $mont->mul_mont_fixed(); $code .= $mont->get_code(); $code =~ s/\`([^\`]*)\`/eval $1/gem; $code.=<<___; .asciz "Montgomery Multiplication for PPC by , " ___ print $code; close STDOUT or die "error closing STDOUT: $!";