mirror of
https://github.com/openssl/openssl.git
synced 2025-01-18 13:44:20 +08:00
336923c0c8
bn_sqr_comba8 does for instance compute a wrong result for the value: a=0x4aaac919 62056c84 fba7334e 1a6be678 022181ba fd3aa878 899b2346 ee210f45 The correct result is: r=0x15c72e32 605a3061 d11b1012 3c187483 6df96999 bd0c22ba d3e7d437 4724a82f 912c5e61 6a187efe 8f7c47fc f6945fe5 75be8e3d 97ed17d4 7950b465 3cb32899 but the actual result was: r=0x15c72e32 605a3061 d11b1012 3c187483 6df96999 bd0c22ba d3e7d437 4724a82f 912c5e61 6a187efe 8f7c47fc f6945fe5 75be8e3c 97ed17d4 7950b465 3cb32899 so the forth word of the result was 0x75be8e3c but should have been 0x75be8e3d instead. Likewise bn_sqr_comba4 has an identical bug for the same value as well: a=0x022181ba fd3aa878 899b2346 ee210f45 correct result: r=0x00048a69 9fe82f8b 62bd2ed1 88781335 75be8e3d 97ed17d4 7950b465 3cb32899 wrong result: r=0x00048a69 9fe82f8b 62bd2ed1 88781335 75be8e3c 97ed17d4 7950b465 3cb32899 Fortunately the bn_mul_comba4/8 code paths are not affected. Also the mips64 target does in fact not handle the carry propagation correctly. Example: a=0x4aaac91900000000 62056c8400000000 fba7334e00000000 1a6be67800000000 022181ba00000000 fd3aa87800000000 899b234635dad283 ee210f4500000001 correct result: r=0x15c72e32272c4471 392debf018c679c8 b85496496bf8254c d0204f36611e2be1 0cdb3db8f3c081d8 c94ba0e1bacc5061 191b83d47ff929f6 5be0aebfc13ae68d 3eea7a7fdf2f5758 42f7ec656cab3cb5 6a28095be34756f2 64f24687bf37de06 2822309cd1d292f9 6fa698c972372f09 771e97d3a868cda0 dc421e8a00000001 wrong result: r=0x15c72e32272c4471 392debf018c679c8 b85496496bf8254c d0204f36611e2be1 0cdb3db8f3c081d8 c94ba0e1bacc5061 191b83d47ff929f6 5be0aebfc13ae68d 3eea7a7fdf2f5758 42f7ec656cab3cb5 6a28095be34756f2 64f24687bf37de06 2822309cd1d292f8 6fa698c972372f09 771e97d3a868cda0 dc421e8a00000001 Reviewed-by: Paul Dale <pauli@openssl.org> (Merged from https://github.com/openssl/openssl/pull/17258)
2270 lines
48 KiB
Raku
2270 lines
48 KiB
Raku
#! /usr/bin/env perl
|
|
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
#
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project.
|
|
#
|
|
# Rights for redistribution and usage in source and binary forms are
|
|
# granted according to the License. Warranty of any kind is disclaimed.
|
|
# ====================================================================
|
|
|
|
|
|
# July 1999
|
|
#
|
|
# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
|
|
#
|
|
# The module is designed to work with either of the "new" MIPS ABI(5),
|
|
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
|
|
# IRIX 5.x not only because it doesn't support new ABIs but also
|
|
# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
|
|
# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
|
|
# cause illegal instruction exception:-(
|
|
#
|
|
# In addition the code depends on preprocessor flags set up by MIPSpro
|
|
# compiler driver (either as or cc) and therefore (probably?) can't be
|
|
# compiled by the GNU assembler. GNU C driver manages fine though...
|
|
# I mean as long as -mmips-as is specified or is the default option,
|
|
# because then it simply invokes /usr/bin/as which in turn takes
|
|
# perfect care of the preprocessor definitions. Another neat feature
|
|
# offered by the MIPSpro assembler is an optimization pass. This gave
|
|
# me the opportunity to have the code looking more regular as all those
|
|
# architecture dependent instruction rescheduling details were left to
|
|
# the assembler. Cool, huh?
|
|
#
|
|
# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
|
|
# goes way over 3 times faster!
|
|
#
|
|
# <appro@openssl.org>
|
|
|
|
# October 2010
|
|
#
|
|
# Adapt the module even for 32-bit ABIs and other OSes. The former was
|
|
# achieved by mechanical replacement of 64-bit arithmetic instructions
|
|
# such as dmultu, daddu, etc. with their 32-bit counterparts and
|
|
# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
|
|
# >3x performance improvement naturally does not apply to 32-bit code
|
|
# [because there is no instruction 32-bit compiler can't use], one
|
|
# has to content with 40-85% improvement depending on benchmark and
|
|
# key length, more for longer keys.
|
|
|
|
# $output is the last argument if it looks like a file (it has an extension)
|
|
# $flavour is the first argument if it doesn't look like a file
|
|
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
|
|
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";
|
|
|
|
if ($flavour =~ /64|n32/i) {
|
|
$LD="ld";
|
|
$ST="sd";
|
|
$MULTU="dmultu";
|
|
$DIVU="ddivu";
|
|
$ADDU="daddu";
|
|
$SUBU="dsubu";
|
|
$SRL="dsrl";
|
|
$SLL="dsll";
|
|
$BNSZ=8;
|
|
$PTR_ADD="daddu";
|
|
$PTR_SUB="dsubu";
|
|
$SZREG=8;
|
|
$REG_S="sd";
|
|
$REG_L="ld";
|
|
} else {
|
|
$LD="lw";
|
|
$ST="sw";
|
|
$MULTU="multu";
|
|
$DIVU="divu";
|
|
$ADDU="addu";
|
|
$SUBU="subu";
|
|
$SRL="srl";
|
|
$SLL="sll";
|
|
$BNSZ=4;
|
|
$PTR_ADD="addu";
|
|
$PTR_SUB="subu";
|
|
$SZREG=4;
|
|
$REG_S="sw";
|
|
$REG_L="lw";
|
|
$code="#if !(defined (__mips_isa_rev) && (__mips_isa_rev >= 6))\n.set mips2\n#endif\n";
|
|
}
|
|
|
|
$output and open STDOUT,">$output";
|
|
|
|
# Below is N32/64 register layout used in the original module.
|
|
#
|
|
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
|
|
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
|
|
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
|
|
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
|
|
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
|
|
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
|
|
#
|
|
# No special adaptation is required for O32. NUBI on the other hand
|
|
# is treated by saving/restoring ($v1,$t0..$t3).
|
|
|
|
$gp=$v1 if ($flavour =~ /nubi/i);
|
|
|
|
$minus4=$v1;
|
|
|
|
$code.=<<___;
|
|
#include "mips_arch.h"
|
|
|
|
#if defined(_MIPS_ARCH_MIPS64R6)
|
|
# define ddivu(rs,rt)
|
|
# define mfqt(rd,rs,rt) ddivu rd,rs,rt
|
|
# define mfrm(rd,rs,rt) dmodu rd,rs,rt
|
|
#elif defined(_MIPS_ARCH_MIPS32R6)
|
|
# define divu(rs,rt)
|
|
# define mfqt(rd,rs,rt) divu rd,rs,rt
|
|
# define mfrm(rd,rs,rt) modu rd,rs,rt
|
|
#else
|
|
# define $DIVU(rs,rt) $DIVU $zero,rs,rt
|
|
# define mfqt(rd,rs,rt) mflo rd
|
|
# define mfrm(rd,rs,rt) mfhi rd
|
|
#endif
|
|
|
|
.rdata
|
|
.asciiz "mips3.s, Version 1.2"
|
|
.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
|
|
|
|
.text
|
|
.set noat
|
|
|
|
.align 5
|
|
.globl bn_mul_add_words
|
|
.ent bn_mul_add_words
|
|
bn_mul_add_words:
|
|
.set noreorder
|
|
bgtz $a2,bn_mul_add_words_internal
|
|
move $v0,$zero
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_mul_add_words
|
|
|
|
.align 5
|
|
.ent bn_mul_add_words_internal
|
|
bn_mul_add_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
li $minus4,-4
|
|
and $ta0,$a2,$minus4
|
|
beqz $ta0,.L_bn_mul_add_words_tail
|
|
|
|
.L_bn_mul_add_words_loop:
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$a3)
|
|
$LD $t1,0($a0)
|
|
$LD $t2,$BNSZ($a1)
|
|
$LD $t3,$BNSZ($a0)
|
|
$LD $ta0,2*$BNSZ($a1)
|
|
$LD $ta1,2*$BNSZ($a0)
|
|
$ADDU $t1,$v0
|
|
sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
|
|
# values", but it seems to work fine
|
|
# even on 64-bit registers.
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $t1,$at
|
|
$ADDU $v0,$t0
|
|
$MULTU ($t2,$a3)
|
|
sltu $at,$t1,$at
|
|
$ST $t1,0($a0)
|
|
$ADDU $v0,$at
|
|
|
|
$LD $ta2,3*$BNSZ($a1)
|
|
$LD $ta3,3*$BNSZ($a0)
|
|
$ADDU $t3,$v0
|
|
sltu $v0,$t3,$v0
|
|
mflo ($at,$t2,$a3)
|
|
mfhi ($t2,$t2,$a3)
|
|
$ADDU $t3,$at
|
|
$ADDU $v0,$t2
|
|
$MULTU ($ta0,$a3)
|
|
sltu $at,$t3,$at
|
|
$ST $t3,$BNSZ($a0)
|
|
$ADDU $v0,$at
|
|
|
|
subu $a2,4
|
|
$PTR_ADD $a0,4*$BNSZ
|
|
$PTR_ADD $a1,4*$BNSZ
|
|
$ADDU $ta1,$v0
|
|
sltu $v0,$ta1,$v0
|
|
mflo ($at,$ta0,$a3)
|
|
mfhi ($ta0,$ta0,$a3)
|
|
$ADDU $ta1,$at
|
|
$ADDU $v0,$ta0
|
|
$MULTU ($ta2,$a3)
|
|
sltu $at,$ta1,$at
|
|
$ST $ta1,-2*$BNSZ($a0)
|
|
$ADDU $v0,$at
|
|
|
|
|
|
and $ta0,$a2,$minus4
|
|
$ADDU $ta3,$v0
|
|
sltu $v0,$ta3,$v0
|
|
mflo ($at,$ta2,$a3)
|
|
mfhi ($ta2,$ta2,$a3)
|
|
$ADDU $ta3,$at
|
|
$ADDU $v0,$ta2
|
|
sltu $at,$ta3,$at
|
|
$ST $ta3,-$BNSZ($a0)
|
|
.set noreorder
|
|
bgtz $ta0,.L_bn_mul_add_words_loop
|
|
$ADDU $v0,$at
|
|
|
|
beqz $a2,.L_bn_mul_add_words_return
|
|
nop
|
|
|
|
.L_bn_mul_add_words_tail:
|
|
.set reorder
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$a3)
|
|
$LD $t1,0($a0)
|
|
subu $a2,1
|
|
$ADDU $t1,$v0
|
|
sltu $v0,$t1,$v0
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $t1,$at
|
|
$ADDU $v0,$t0
|
|
sltu $at,$t1,$at
|
|
$ST $t1,0($a0)
|
|
$ADDU $v0,$at
|
|
beqz $a2,.L_bn_mul_add_words_return
|
|
|
|
$LD $t0,$BNSZ($a1)
|
|
$MULTU ($t0,$a3)
|
|
$LD $t1,$BNSZ($a0)
|
|
subu $a2,1
|
|
$ADDU $t1,$v0
|
|
sltu $v0,$t1,$v0
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $t1,$at
|
|
$ADDU $v0,$t0
|
|
sltu $at,$t1,$at
|
|
$ST $t1,$BNSZ($a0)
|
|
$ADDU $v0,$at
|
|
beqz $a2,.L_bn_mul_add_words_return
|
|
|
|
$LD $t0,2*$BNSZ($a1)
|
|
$MULTU ($t0,$a3)
|
|
$LD $t1,2*$BNSZ($a0)
|
|
$ADDU $t1,$v0
|
|
sltu $v0,$t1,$v0
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $t1,$at
|
|
$ADDU $v0,$t0
|
|
sltu $at,$t1,$at
|
|
$ST $t1,2*$BNSZ($a0)
|
|
$ADDU $v0,$at
|
|
|
|
.L_bn_mul_add_words_return:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_mul_add_words_internal
|
|
|
|
.align 5
|
|
.globl bn_mul_words
|
|
.ent bn_mul_words
|
|
bn_mul_words:
|
|
.set noreorder
|
|
bgtz $a2,bn_mul_words_internal
|
|
move $v0,$zero
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_mul_words
|
|
|
|
.align 5
|
|
.ent bn_mul_words_internal
|
|
bn_mul_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
li $minus4,-4
|
|
and $ta0,$a2,$minus4
|
|
beqz $ta0,.L_bn_mul_words_tail
|
|
|
|
.L_bn_mul_words_loop:
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$a3)
|
|
$LD $t2,$BNSZ($a1)
|
|
$LD $ta0,2*$BNSZ($a1)
|
|
$LD $ta2,3*$BNSZ($a1)
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $t1,$v0,$at
|
|
$MULTU ($t2,$a3)
|
|
$ST $v0,0($a0)
|
|
$ADDU $v0,$t1,$t0
|
|
|
|
subu $a2,4
|
|
$PTR_ADD $a0,4*$BNSZ
|
|
$PTR_ADD $a1,4*$BNSZ
|
|
mflo ($at,$t2,$a3)
|
|
mfhi ($t2,$t2,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $t3,$v0,$at
|
|
$MULTU ($ta0,$a3)
|
|
$ST $v0,-3*$BNSZ($a0)
|
|
$ADDU $v0,$t3,$t2
|
|
|
|
mflo ($at,$ta0,$a3)
|
|
mfhi ($ta0,$ta0,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $ta1,$v0,$at
|
|
$MULTU ($ta2,$a3)
|
|
$ST $v0,-2*$BNSZ($a0)
|
|
$ADDU $v0,$ta1,$ta0
|
|
|
|
and $ta0,$a2,$minus4
|
|
mflo ($at,$ta2,$a3)
|
|
mfhi ($ta2,$ta2,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $ta3,$v0,$at
|
|
$ST $v0,-$BNSZ($a0)
|
|
.set noreorder
|
|
bgtz $ta0,.L_bn_mul_words_loop
|
|
$ADDU $v0,$ta3,$ta2
|
|
|
|
beqz $a2,.L_bn_mul_words_return
|
|
nop
|
|
|
|
.L_bn_mul_words_tail:
|
|
.set reorder
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$a3)
|
|
subu $a2,1
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $t1,$v0,$at
|
|
$ST $v0,0($a0)
|
|
$ADDU $v0,$t1,$t0
|
|
beqz $a2,.L_bn_mul_words_return
|
|
|
|
$LD $t0,$BNSZ($a1)
|
|
$MULTU ($t0,$a3)
|
|
subu $a2,1
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $t1,$v0,$at
|
|
$ST $v0,$BNSZ($a0)
|
|
$ADDU $v0,$t1,$t0
|
|
beqz $a2,.L_bn_mul_words_return
|
|
|
|
$LD $t0,2*$BNSZ($a1)
|
|
$MULTU ($t0,$a3)
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $t1,$v0,$at
|
|
$ST $v0,2*$BNSZ($a0)
|
|
$ADDU $v0,$t1,$t0
|
|
|
|
.L_bn_mul_words_return:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_mul_words_internal
|
|
|
|
.align 5
|
|
.globl bn_sqr_words
|
|
.ent bn_sqr_words
|
|
bn_sqr_words:
|
|
.set noreorder
|
|
bgtz $a2,bn_sqr_words_internal
|
|
move $v0,$zero
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_sqr_words
|
|
|
|
.align 5
|
|
.ent bn_sqr_words_internal
|
|
bn_sqr_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
li $minus4,-4
|
|
and $ta0,$a2,$minus4
|
|
beqz $ta0,.L_bn_sqr_words_tail
|
|
|
|
.L_bn_sqr_words_loop:
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$t0)
|
|
$LD $t2,$BNSZ($a1)
|
|
$LD $ta0,2*$BNSZ($a1)
|
|
$LD $ta2,3*$BNSZ($a1)
|
|
mflo ($t1,$t0,$t0)
|
|
mfhi ($t0,$t0,$t0)
|
|
$ST $t1,0($a0)
|
|
$ST $t0,$BNSZ($a0)
|
|
|
|
$MULTU ($t2,$t2)
|
|
subu $a2,4
|
|
$PTR_ADD $a0,8*$BNSZ
|
|
$PTR_ADD $a1,4*$BNSZ
|
|
mflo ($t3,$t2,$t2)
|
|
mfhi ($t2,$t2,$t2)
|
|
$ST $t3,-6*$BNSZ($a0)
|
|
$ST $t2,-5*$BNSZ($a0)
|
|
|
|
$MULTU ($ta0,$ta0)
|
|
mflo ($ta1,$ta0,$ta0)
|
|
mfhi ($ta0,$ta0,$ta0)
|
|
$ST $ta1,-4*$BNSZ($a0)
|
|
$ST $ta0,-3*$BNSZ($a0)
|
|
|
|
|
|
$MULTU ($ta2,$ta2)
|
|
and $ta0,$a2,$minus4
|
|
mflo ($ta3,$ta2,$ta2)
|
|
mfhi ($ta2,$ta2,$ta2)
|
|
$ST $ta3,-2*$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
bgtz $ta0,.L_bn_sqr_words_loop
|
|
$ST $ta2,-$BNSZ($a0)
|
|
|
|
beqz $a2,.L_bn_sqr_words_return
|
|
nop
|
|
|
|
.L_bn_sqr_words_tail:
|
|
.set reorder
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$t0)
|
|
subu $a2,1
|
|
mflo ($t1,$t0,$t0)
|
|
mfhi ($t0,$t0,$t0)
|
|
$ST $t1,0($a0)
|
|
$ST $t0,$BNSZ($a0)
|
|
beqz $a2,.L_bn_sqr_words_return
|
|
|
|
$LD $t0,$BNSZ($a1)
|
|
$MULTU ($t0,$t0)
|
|
subu $a2,1
|
|
mflo ($t1,$t0,$t0)
|
|
mfhi ($t0,$t0,$t0)
|
|
$ST $t1,2*$BNSZ($a0)
|
|
$ST $t0,3*$BNSZ($a0)
|
|
beqz $a2,.L_bn_sqr_words_return
|
|
|
|
$LD $t0,2*$BNSZ($a1)
|
|
$MULTU ($t0,$t0)
|
|
mflo ($t1,$t0,$t0)
|
|
mfhi ($t0,$t0,$t0)
|
|
$ST $t1,4*$BNSZ($a0)
|
|
$ST $t0,5*$BNSZ($a0)
|
|
|
|
.L_bn_sqr_words_return:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
|
|
.end bn_sqr_words_internal
|
|
|
|
.align 5
|
|
.globl bn_add_words
|
|
.ent bn_add_words
|
|
bn_add_words:
|
|
.set noreorder
|
|
bgtz $a3,bn_add_words_internal
|
|
move $v0,$zero
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_add_words
|
|
|
|
.align 5
|
|
.ent bn_add_words_internal
|
|
bn_add_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
li $minus4,-4
|
|
and $at,$a3,$minus4
|
|
beqz $at,.L_bn_add_words_tail
|
|
|
|
.L_bn_add_words_loop:
|
|
$LD $t0,0($a1)
|
|
$LD $ta0,0($a2)
|
|
subu $a3,4
|
|
$LD $t1,$BNSZ($a1)
|
|
and $at,$a3,$minus4
|
|
$LD $t2,2*$BNSZ($a1)
|
|
$PTR_ADD $a2,4*$BNSZ
|
|
$LD $t3,3*$BNSZ($a1)
|
|
$PTR_ADD $a0,4*$BNSZ
|
|
$LD $ta1,-3*$BNSZ($a2)
|
|
$PTR_ADD $a1,4*$BNSZ
|
|
$LD $ta2,-2*$BNSZ($a2)
|
|
$LD $ta3,-$BNSZ($a2)
|
|
$ADDU $ta0,$t0
|
|
sltu $t8,$ta0,$t0
|
|
$ADDU $t0,$ta0,$v0
|
|
sltu $v0,$t0,$ta0
|
|
$ST $t0,-4*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
$ADDU $ta1,$t1
|
|
sltu $t9,$ta1,$t1
|
|
$ADDU $t1,$ta1,$v0
|
|
sltu $v0,$t1,$ta1
|
|
$ST $t1,-3*$BNSZ($a0)
|
|
$ADDU $v0,$t9
|
|
|
|
$ADDU $ta2,$t2
|
|
sltu $t8,$ta2,$t2
|
|
$ADDU $t2,$ta2,$v0
|
|
sltu $v0,$t2,$ta2
|
|
$ST $t2,-2*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
$ADDU $ta3,$t3
|
|
sltu $t9,$ta3,$t3
|
|
$ADDU $t3,$ta3,$v0
|
|
sltu $v0,$t3,$ta3
|
|
$ST $t3,-$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
bgtz $at,.L_bn_add_words_loop
|
|
$ADDU $v0,$t9
|
|
|
|
beqz $a3,.L_bn_add_words_return
|
|
nop
|
|
|
|
.L_bn_add_words_tail:
|
|
.set reorder
|
|
$LD $t0,0($a1)
|
|
$LD $ta0,0($a2)
|
|
$ADDU $ta0,$t0
|
|
subu $a3,1
|
|
sltu $t8,$ta0,$t0
|
|
$ADDU $t0,$ta0,$v0
|
|
sltu $v0,$t0,$ta0
|
|
$ST $t0,0($a0)
|
|
$ADDU $v0,$t8
|
|
beqz $a3,.L_bn_add_words_return
|
|
|
|
$LD $t1,$BNSZ($a1)
|
|
$LD $ta1,$BNSZ($a2)
|
|
$ADDU $ta1,$t1
|
|
subu $a3,1
|
|
sltu $t9,$ta1,$t1
|
|
$ADDU $t1,$ta1,$v0
|
|
sltu $v0,$t1,$ta1
|
|
$ST $t1,$BNSZ($a0)
|
|
$ADDU $v0,$t9
|
|
beqz $a3,.L_bn_add_words_return
|
|
|
|
$LD $t2,2*$BNSZ($a1)
|
|
$LD $ta2,2*$BNSZ($a2)
|
|
$ADDU $ta2,$t2
|
|
sltu $t8,$ta2,$t2
|
|
$ADDU $t2,$ta2,$v0
|
|
sltu $v0,$t2,$ta2
|
|
$ST $t2,2*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
.L_bn_add_words_return:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
|
|
.end bn_add_words_internal
|
|
|
|
.align 5
|
|
.globl bn_sub_words
|
|
.ent bn_sub_words
|
|
bn_sub_words:
|
|
.set noreorder
|
|
bgtz $a3,bn_sub_words_internal
|
|
move $v0,$zero
|
|
jr $ra
|
|
move $a0,$zero
|
|
.end bn_sub_words
|
|
|
|
.align 5
|
|
.ent bn_sub_words_internal
|
|
bn_sub_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
li $minus4,-4
|
|
and $at,$a3,$minus4
|
|
beqz $at,.L_bn_sub_words_tail
|
|
|
|
.L_bn_sub_words_loop:
|
|
$LD $t0,0($a1)
|
|
$LD $ta0,0($a2)
|
|
subu $a3,4
|
|
$LD $t1,$BNSZ($a1)
|
|
and $at,$a3,$minus4
|
|
$LD $t2,2*$BNSZ($a1)
|
|
$PTR_ADD $a2,4*$BNSZ
|
|
$LD $t3,3*$BNSZ($a1)
|
|
$PTR_ADD $a0,4*$BNSZ
|
|
$LD $ta1,-3*$BNSZ($a2)
|
|
$PTR_ADD $a1,4*$BNSZ
|
|
$LD $ta2,-2*$BNSZ($a2)
|
|
$LD $ta3,-$BNSZ($a2)
|
|
sltu $t8,$t0,$ta0
|
|
$SUBU $ta0,$t0,$ta0
|
|
$SUBU $t0,$ta0,$v0
|
|
sgtu $v0,$t0,$ta0
|
|
$ST $t0,-4*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
sltu $t9,$t1,$ta1
|
|
$SUBU $ta1,$t1,$ta1
|
|
$SUBU $t1,$ta1,$v0
|
|
sgtu $v0,$t1,$ta1
|
|
$ST $t1,-3*$BNSZ($a0)
|
|
$ADDU $v0,$t9
|
|
|
|
|
|
sltu $t8,$t2,$ta2
|
|
$SUBU $ta2,$t2,$ta2
|
|
$SUBU $t2,$ta2,$v0
|
|
sgtu $v0,$t2,$ta2
|
|
$ST $t2,-2*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
sltu $t9,$t3,$ta3
|
|
$SUBU $ta3,$t3,$ta3
|
|
$SUBU $t3,$ta3,$v0
|
|
sgtu $v0,$t3,$ta3
|
|
$ST $t3,-$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
bgtz $at,.L_bn_sub_words_loop
|
|
$ADDU $v0,$t9
|
|
|
|
beqz $a3,.L_bn_sub_words_return
|
|
nop
|
|
|
|
.L_bn_sub_words_tail:
|
|
.set reorder
|
|
$LD $t0,0($a1)
|
|
$LD $ta0,0($a2)
|
|
subu $a3,1
|
|
sltu $t8,$t0,$ta0
|
|
$SUBU $ta0,$t0,$ta0
|
|
$SUBU $t0,$ta0,$v0
|
|
sgtu $v0,$t0,$ta0
|
|
$ST $t0,0($a0)
|
|
$ADDU $v0,$t8
|
|
beqz $a3,.L_bn_sub_words_return
|
|
|
|
$LD $t1,$BNSZ($a1)
|
|
subu $a3,1
|
|
$LD $ta1,$BNSZ($a2)
|
|
sltu $t9,$t1,$ta1
|
|
$SUBU $ta1,$t1,$ta1
|
|
$SUBU $t1,$ta1,$v0
|
|
sgtu $v0,$t1,$ta1
|
|
$ST $t1,$BNSZ($a0)
|
|
$ADDU $v0,$t9
|
|
beqz $a3,.L_bn_sub_words_return
|
|
|
|
$LD $t2,2*$BNSZ($a1)
|
|
$LD $ta2,2*$BNSZ($a2)
|
|
sltu $t8,$t2,$ta2
|
|
$SUBU $ta2,$t2,$ta2
|
|
$SUBU $t2,$ta2,$v0
|
|
sgtu $v0,$t2,$ta2
|
|
$ST $t2,2*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
.L_bn_sub_words_return:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_sub_words_internal
|
|
|
|
#if 0
|
|
/*
|
|
* The bn_div_3_words entry point is re-used for constant-time interface.
|
|
* Implementation is retained as historical reference.
|
|
*/
|
|
.align 5
|
|
.globl bn_div_3_words
|
|
.ent bn_div_3_words
|
|
bn_div_3_words:
|
|
.set noreorder
|
|
move $a3,$a0 # we know that bn_div_words does not
|
|
# touch $a3, $ta2, $ta3 and preserves $a2
|
|
# so that we can save two arguments
|
|
# and return address in registers
|
|
# instead of stack:-)
|
|
|
|
$LD $a0,($a3)
|
|
move $ta2,$a1
|
|
bne $a0,$a2,bn_div_3_words_internal
|
|
$LD $a1,-$BNSZ($a3)
|
|
li $v0,-1
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_div_3_words
|
|
|
|
.align 5
|
|
.ent bn_div_3_words_internal
|
|
bn_div_3_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
move $ta3,$ra
|
|
bal bn_div_words_internal
|
|
move $ra,$ta3
|
|
$MULTU ($ta2,$v0)
|
|
$LD $t2,-2*$BNSZ($a3)
|
|
move $ta0,$zero
|
|
mfhi ($t1,$ta2,$v0)
|
|
mflo ($t0,$ta2,$v0)
|
|
sltu $t8,$t1,$a1
|
|
.L_bn_div_3_words_inner_loop:
|
|
bnez $t8,.L_bn_div_3_words_inner_loop_done
|
|
sgeu $at,$t2,$t0
|
|
seq $t9,$t1,$a1
|
|
and $at,$t9
|
|
sltu $t3,$t0,$ta2
|
|
$ADDU $a1,$a2
|
|
$SUBU $t1,$t3
|
|
$SUBU $t0,$ta2
|
|
sltu $t8,$t1,$a1
|
|
sltu $ta0,$a1,$a2
|
|
or $t8,$ta0
|
|
.set noreorder
|
|
beqz $at,.L_bn_div_3_words_inner_loop
|
|
$SUBU $v0,1
|
|
$ADDU $v0,1
|
|
.set reorder
|
|
.L_bn_div_3_words_inner_loop_done:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_div_3_words_internal
|
|
#endif
|
|
|
|
.align 5
|
|
.globl bn_div_words
|
|
.ent bn_div_words
|
|
bn_div_words:
|
|
.set noreorder
|
|
bnez $a2,bn_div_words_internal
|
|
li $v0,-1 # I would rather signal div-by-zero
|
|
# which can be done with 'break 7'
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_div_words
|
|
|
|
.align 5
|
|
.ent bn_div_words_internal
|
|
bn_div_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
move $v1,$zero
|
|
bltz $a2,.L_bn_div_words_body
|
|
move $t9,$v1
|
|
$SLL $a2,1
|
|
bgtz $a2,.-4
|
|
addu $t9,1
|
|
|
|
.set reorder
|
|
negu $t1,$t9
|
|
li $t2,-1
|
|
$SLL $t2,$t1
|
|
and $t2,$a0
|
|
$SRL $at,$a1,$t1
|
|
.set noreorder
|
|
beqz $t2,.+12
|
|
nop
|
|
break 6 # signal overflow
|
|
.set reorder
|
|
$SLL $a0,$t9
|
|
$SLL $a1,$t9
|
|
or $a0,$at
|
|
___
|
|
$QT=$ta0;
|
|
$HH=$ta1;
|
|
$DH=$v1;
|
|
$code.=<<___;
|
|
.L_bn_div_words_body:
|
|
$SRL $DH,$a2,4*$BNSZ # bits
|
|
sgeu $at,$a0,$a2
|
|
.set noreorder
|
|
beqz $at,.+12
|
|
nop
|
|
$SUBU $a0,$a2
|
|
.set reorder
|
|
|
|
li $QT,-1
|
|
$SRL $HH,$a0,4*$BNSZ # bits
|
|
$SRL $QT,4*$BNSZ # q=0xffffffff
|
|
beq $DH,$HH,.L_bn_div_words_skip_div1
|
|
$DIVU ($a0,$DH)
|
|
mfqt ($QT,$a0,$DH)
|
|
.L_bn_div_words_skip_div1:
|
|
$MULTU ($a2,$QT)
|
|
$SLL $t3,$a0,4*$BNSZ # bits
|
|
$SRL $at,$a1,4*$BNSZ # bits
|
|
or $t3,$at
|
|
mflo ($t0,$a2,$QT)
|
|
mfhi ($t1,$a2,$QT)
|
|
.L_bn_div_words_inner_loop1:
|
|
sltu $t2,$t3,$t0
|
|
seq $t8,$HH,$t1
|
|
sltu $at,$HH,$t1
|
|
and $t2,$t8
|
|
sltu $v0,$t0,$a2
|
|
or $at,$t2
|
|
.set noreorder
|
|
beqz $at,.L_bn_div_words_inner_loop1_done
|
|
$SUBU $t1,$v0
|
|
$SUBU $t0,$a2
|
|
b .L_bn_div_words_inner_loop1
|
|
$SUBU $QT,1
|
|
.set reorder
|
|
.L_bn_div_words_inner_loop1_done:
|
|
|
|
$SLL $a1,4*$BNSZ # bits
|
|
$SUBU $a0,$t3,$t0
|
|
$SLL $v0,$QT,4*$BNSZ # bits
|
|
|
|
li $QT,-1
|
|
$SRL $HH,$a0,4*$BNSZ # bits
|
|
$SRL $QT,4*$BNSZ # q=0xffffffff
|
|
beq $DH,$HH,.L_bn_div_words_skip_div2
|
|
$DIVU ($a0,$DH)
|
|
mfqt ($QT,$a0,$DH)
|
|
.L_bn_div_words_skip_div2:
|
|
$MULTU ($a2,$QT)
|
|
$SLL $t3,$a0,4*$BNSZ # bits
|
|
$SRL $at,$a1,4*$BNSZ # bits
|
|
or $t3,$at
|
|
mflo ($t0,$a2,$QT)
|
|
mfhi ($t1,$a2,$QT)
|
|
.L_bn_div_words_inner_loop2:
|
|
sltu $t2,$t3,$t0
|
|
seq $t8,$HH,$t1
|
|
sltu $at,$HH,$t1
|
|
and $t2,$t8
|
|
sltu $v1,$t0,$a2
|
|
or $at,$t2
|
|
.set noreorder
|
|
beqz $at,.L_bn_div_words_inner_loop2_done
|
|
$SUBU $t1,$v1
|
|
$SUBU $t0,$a2
|
|
b .L_bn_div_words_inner_loop2
|
|
$SUBU $QT,1
|
|
.set reorder
|
|
.L_bn_div_words_inner_loop2_done:
|
|
|
|
$SUBU $a0,$t3,$t0
|
|
or $v0,$QT
|
|
$SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
|
|
$SRL $a2,$t9 # restore $a2
|
|
|
|
.set noreorder
|
|
move $a1,$v1
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_div_words_internal
|
|
___
|
|
undef $HH; undef $QT; undef $DH;
|
|
|
|
($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
|
|
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
|
|
|
|
($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
|
|
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
|
|
|
|
($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
|
|
|
|
$code.=<<___;
|
|
|
|
.align 5
|
|
.globl bn_mul_comba8
|
|
.ent bn_mul_comba8
|
|
bn_mul_comba8:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,12*$SZREG,$ra
|
|
.mask 0x803ff008,-$SZREG
|
|
$PTR_SUB $sp,12*$SZREG
|
|
$REG_S $ra,11*$SZREG($sp)
|
|
$REG_S $s5,10*$SZREG($sp)
|
|
$REG_S $s4,9*$SZREG($sp)
|
|
$REG_S $s3,8*$SZREG($sp)
|
|
$REG_S $s2,7*$SZREG($sp)
|
|
$REG_S $s1,6*$SZREG($sp)
|
|
$REG_S $s0,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___ if ($flavour !~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x003f0000,-$SZREG
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $s5,5*$SZREG($sp)
|
|
$REG_S $s4,4*$SZREG($sp)
|
|
$REG_S $s3,3*$SZREG($sp)
|
|
$REG_S $s2,2*$SZREG($sp)
|
|
$REG_S $s1,1*$SZREG($sp)
|
|
$REG_S $s0,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
|
|
.set reorder
|
|
$LD $a_0,0($a1) # If compiled with -mips3 option on
|
|
# R5000 box assembler barks on this
|
|
# 1ine with "should not have mult/div
|
|
# as last instruction in bb (R10K
|
|
# bug)" warning. If anybody out there
|
|
# has a clue about how to circumvent
|
|
# this do send me a note.
|
|
# <appro\@fy.chalmers.se>
|
|
|
|
$LD $b_0,0($a2)
|
|
$LD $a_1,$BNSZ($a1)
|
|
$LD $a_2,2*$BNSZ($a1)
|
|
$MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
|
|
$LD $a_3,3*$BNSZ($a1)
|
|
$LD $b_1,$BNSZ($a2)
|
|
$LD $b_2,2*$BNSZ($a2)
|
|
$LD $b_3,3*$BNSZ($a2)
|
|
mflo ($c_1,$a_0,$b_0)
|
|
mfhi ($c_2,$a_0,$b_0)
|
|
|
|
$LD $a_4,4*$BNSZ($a1)
|
|
$LD $a_5,5*$BNSZ($a1)
|
|
$MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
|
|
$LD $a_6,6*$BNSZ($a1)
|
|
$LD $a_7,7*$BNSZ($a1)
|
|
$LD $b_4,4*$BNSZ($a2)
|
|
$LD $b_5,5*$BNSZ($a2)
|
|
mflo ($t_1,$a_0,$b_1)
|
|
mfhi ($t_2,$a_0,$b_1)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
|
|
$ADDU $c_3,$t_2,$at
|
|
$LD $b_6,6*$BNSZ($a2)
|
|
$LD $b_7,7*$BNSZ($a2)
|
|
$ST $c_1,0($a0) # r[0]=c1;
|
|
mflo ($t_1,$a_1,$b_0)
|
|
mfhi ($t_2,$a_1,$b_0)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
$ST $c_2,$BNSZ($a0) # r[1]=c2;
|
|
|
|
mflo ($t_1,$a_2,$b_0)
|
|
mfhi ($t_2,$a_2,$b_0)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
mflo ($t_1,$a_1,$b_1)
|
|
mfhi ($t_2,$a_1,$b_1)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_0,$b_2)
|
|
mfhi ($t_2,$a_0,$b_2)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,2*$BNSZ($a0) # r[2]=c3;
|
|
|
|
mflo ($t_1,$a_0,$b_3)
|
|
mfhi ($t_2,$a_0,$b_3)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $c_3,$c_2,$t_2
|
|
mflo ($t_1,$a_1,$b_2)
|
|
mfhi ($t_2,$a_1,$b_2)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_2,$b_1)
|
|
mfhi ($t_2,$a_2,$b_1)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_3,$b_0)
|
|
mfhi ($t_2,$a_3,$b_0)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,3*$BNSZ($a0) # r[3]=c1;
|
|
|
|
mflo ($t_1,$a_4,$b_0)
|
|
mfhi ($t_2,$a_4,$b_0)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
mflo ($t_1,$a_3,$b_1)
|
|
mfhi ($t_2,$a_3,$b_1)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_2,$b_2)
|
|
mfhi ($t_2,$a_2,$b_2)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_1,$b_3)
|
|
mfhi ($t_2,$a_1,$b_3)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_0,$b_4)
|
|
mfhi ($t_2,$a_0,$b_4)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,4*$BNSZ($a0) # r[4]=c2;
|
|
|
|
mflo ($t_1,$a_0,$b_5)
|
|
mfhi ($t_2,$a_0,$b_5)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_1,$b_4)
|
|
mfhi ($t_2,$a_1,$b_4)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_2,$b_3)
|
|
mfhi ($t_2,$a_2,$b_3)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_3,$b_2)
|
|
mfhi ($t_2,$a_3,$b_2)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_4,$b_1)
|
|
mfhi ($t_2,$a_4,$b_1)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_5,$b_0)
|
|
mfhi ($t_2,$a_5,$b_0)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,5*$BNSZ($a0) # r[5]=c3;
|
|
|
|
mflo ($t_1,$a_6,$b_0)
|
|
mfhi ($t_2,$a_6,$b_0)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $c_3,$c_2,$t_2
|
|
mflo ($t_1,$a_5,$b_1)
|
|
mfhi ($t_2,$a_5,$b_1)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_4,$b_2)
|
|
mfhi ($t_2,$a_4,$b_2)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_3,$b_3)
|
|
mfhi ($t_2,$a_3,$b_3)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_2,$b_4)
|
|
mfhi ($t_2,$a_2,$b_4)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_1,$b_5)
|
|
mfhi ($t_2,$a_1,$b_5)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_0,$b_6)
|
|
mfhi ($t_2,$a_0,$b_6)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,6*$BNSZ($a0) # r[6]=c1;
|
|
|
|
mflo ($t_1,$a_0,$b_7)
|
|
mfhi ($t_2,$a_0,$b_7)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
mflo ($t_1,$a_1,$b_6)
|
|
mfhi ($t_2,$a_1,$b_6)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_2,$b_5)
|
|
mfhi ($t_2,$a_2,$b_5)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_3,$b_4)
|
|
mfhi ($t_2,$a_3,$b_4)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_4,$b_3)
|
|
mfhi ($t_2,$a_4,$b_3)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_5,$b_2)
|
|
mfhi ($t_2,$a_5,$b_2)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_6,$b_1)
|
|
mfhi ($t_2,$a_6,$b_1)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_7,$b_0)
|
|
mfhi ($t_2,$a_7,$b_0)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,7*$BNSZ($a0) # r[7]=c2;
|
|
|
|
mflo ($t_1,$a_7,$b_1)
|
|
mfhi ($t_2,$a_7,$b_1)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_6,$b_2)
|
|
mfhi ($t_2,$a_6,$b_2)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_5,$b_3)
|
|
mfhi ($t_2,$a_5,$b_3)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_4,$b_4)
|
|
mfhi ($t_2,$a_4,$b_4)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_3,$b_5)
|
|
mfhi ($t_2,$a_3,$b_5)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_2,$b_6)
|
|
mfhi ($t_2,$a_2,$b_6)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_1,$b_7)
|
|
mfhi ($t_2,$a_1,$b_7)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,8*$BNSZ($a0) # r[8]=c3;
|
|
|
|
mflo ($t_1,$a_2,$b_7)
|
|
mfhi ($t_2,$a_2,$b_7)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $c_3,$c_2,$t_2
|
|
mflo ($t_1,$a_3,$b_6)
|
|
mfhi ($t_2,$a_3,$b_6)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_4,$b_5)
|
|
mfhi ($t_2,$a_4,$b_5)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_5,$b_4)
|
|
mfhi ($t_2,$a_5,$b_4)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_6,$b_3)
|
|
mfhi ($t_2,$a_6,$b_3)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_7,$b_2)
|
|
mfhi ($t_2,$a_7,$b_2)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,9*$BNSZ($a0) # r[9]=c1;
|
|
|
|
mflo ($t_1,$a_7,$b_3)
|
|
mfhi ($t_2,$a_7,$b_3)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
mflo ($t_1,$a_6,$b_4)
|
|
mfhi ($t_2,$a_6,$b_4)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_5,$b_5)
|
|
mfhi ($t_2,$a_5,$b_5)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_4,$b_6)
|
|
mfhi ($t_2,$a_4,$b_6)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_3,$b_7)
|
|
mfhi ($t_2,$a_3,$b_7)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,10*$BNSZ($a0) # r[10]=c2;
|
|
|
|
mflo ($t_1,$a_4,$b_7)
|
|
mfhi ($t_2,$a_4,$b_7)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_5,$b_6)
|
|
mfhi ($t_2,$a_5,$b_6)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_6,$b_5)
|
|
mfhi ($t_2,$a_6,$b_5)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_7,$b_4)
|
|
mfhi ($t_2,$a_7,$b_4)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,11*$BNSZ($a0) # r[11]=c3;
|
|
|
|
mflo ($t_1,$a_7,$b_5)
|
|
mfhi ($t_2,$a_7,$b_5)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $c_3,$c_2,$t_2
|
|
mflo ($t_1,$a_6,$b_6)
|
|
mfhi ($t_2,$a_6,$b_6)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_5,$b_7)
|
|
mfhi ($t_2,$a_5,$b_7)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,12*$BNSZ($a0) # r[12]=c1;
|
|
|
|
mflo ($t_1,$a_6,$b_7)
|
|
mfhi ($t_2,$a_6,$b_7)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
mflo ($t_1,$a_7,$b_6)
|
|
mfhi ($t_2,$a_7,$b_6)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,13*$BNSZ($a0) # r[13]=c2;
|
|
|
|
mflo ($t_1,$a_7,$b_7)
|
|
mfhi ($t_2,$a_7,$b_7)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
$ST $c_3,14*$BNSZ($a0) # r[14]=c3;
|
|
$ST $c_1,15*$BNSZ($a0) # r[15]=c1;
|
|
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $s5,10*$SZREG($sp)
|
|
$REG_L $s4,9*$SZREG($sp)
|
|
$REG_L $s3,8*$SZREG($sp)
|
|
$REG_L $s2,7*$SZREG($sp)
|
|
$REG_L $s1,6*$SZREG($sp)
|
|
$REG_L $s0,5*$SZREG($sp)
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
jr $ra
|
|
$PTR_ADD $sp,12*$SZREG
|
|
___
|
|
$code.=<<___ if ($flavour !~ /nubi/i);
|
|
$REG_L $s5,5*$SZREG($sp)
|
|
$REG_L $s4,4*$SZREG($sp)
|
|
$REG_L $s3,3*$SZREG($sp)
|
|
$REG_L $s2,2*$SZREG($sp)
|
|
$REG_L $s1,1*$SZREG($sp)
|
|
$REG_L $s0,0*$SZREG($sp)
|
|
jr $ra
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
.end bn_mul_comba8
|
|
|
|
.align 5
|
|
.globl bn_mul_comba4
|
|
.ent bn_mul_comba4
|
|
bn_mul_comba4:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
$LD $a_0,0($a1)
|
|
$LD $b_0,0($a2)
|
|
$LD $a_1,$BNSZ($a1)
|
|
$LD $a_2,2*$BNSZ($a1)
|
|
$MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
|
|
$LD $a_3,3*$BNSZ($a1)
|
|
$LD $b_1,$BNSZ($a2)
|
|
$LD $b_2,2*$BNSZ($a2)
|
|
$LD $b_3,3*$BNSZ($a2)
|
|
mflo ($c_1,$a_0,$b_0)
|
|
mfhi ($c_2,$a_0,$b_0)
|
|
$ST $c_1,0($a0)
|
|
|
|
$MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
|
|
mflo ($t_1,$a_0,$b_1)
|
|
mfhi ($t_2,$a_0,$b_1)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
|
|
$ADDU $c_3,$t_2,$at
|
|
mflo ($t_1,$a_1,$b_0)
|
|
mfhi ($t_2,$a_1,$b_0)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
$ST $c_2,$BNSZ($a0)
|
|
|
|
mflo ($t_1,$a_2,$b_0)
|
|
mfhi ($t_2,$a_2,$b_0)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
mflo ($t_1,$a_1,$b_1)
|
|
mfhi ($t_2,$a_1,$b_1)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_0,$b_2)
|
|
mfhi ($t_2,$a_0,$b_2)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,2*$BNSZ($a0)
|
|
|
|
mflo ($t_1,$a_0,$b_3)
|
|
mfhi ($t_2,$a_0,$b_3)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $c_3,$c_2,$t_2
|
|
mflo ($t_1,$a_1,$b_2)
|
|
mfhi ($t_2,$a_1,$b_2)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_2,$b_1)
|
|
mfhi ($t_2,$a_2,$b_1)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_3,$b_0)
|
|
mfhi ($t_2,$a_3,$b_0)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,3*$BNSZ($a0)
|
|
|
|
mflo ($t_1,$a_3,$b_1)
|
|
mfhi ($t_2,$a_3,$b_1)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
mflo ($t_1,$a_2,$b_2)
|
|
mfhi ($t_2,$a_2,$b_2)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_1,$b_3)
|
|
mfhi ($t_2,$a_1,$b_3)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,4*$BNSZ($a0)
|
|
|
|
mflo ($t_1,$a_2,$b_3)
|
|
mfhi ($t_2,$a_2,$b_3)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_3,$b_2)
|
|
mfhi ($t_2,$a_3,$b_2)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,5*$BNSZ($a0)
|
|
|
|
mflo ($t_1,$a_3,$b_3)
|
|
mfhi ($t_2,$a_3,$b_3)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
$ST $c_1,6*$BNSZ($a0)
|
|
$ST $c_2,7*$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
nop
|
|
.end bn_mul_comba4
|
|
___
|
|
|
|
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
|
|
|
|
sub add_c2 () {
|
|
my ($hi,$lo,$c0,$c1,$c2,
|
|
$warm, # !$warm denotes first call with specific sequence of
|
|
# $c_[XYZ] when there is no Z-carry to accumulate yet;
|
|
$an,$bn # these two are arguments for multiplication which
|
|
# result is used in *next* step [which is why it's
|
|
# commented as "forward multiplication" below];
|
|
)=@_;
|
|
$code.=<<___;
|
|
$ADDU $c0,$lo
|
|
sltu $at,$c0,$lo
|
|
$MULTU ($an,$bn) # forward multiplication
|
|
$ADDU $c0,$lo
|
|
$ADDU $at,$hi
|
|
sltu $lo,$c0,$lo
|
|
$ADDU $c1,$at
|
|
$ADDU $hi,$lo
|
|
___
|
|
$code.=<<___ if (!$warm);
|
|
sltu $c2,$c1,$at
|
|
$ADDU $c1,$hi
|
|
___
|
|
$code.=<<___ if ($warm);
|
|
sltu $at,$c1,$at
|
|
$ADDU $c1,$hi
|
|
$ADDU $c2,$at
|
|
___
|
|
$code.=<<___;
|
|
sltu $hi,$c1,$hi
|
|
$ADDU $c2,$hi
|
|
mflo ($lo,$an,$bn)
|
|
mfhi ($hi,$an,$bn)
|
|
___
|
|
}
|
|
|
|
$code.=<<___;
|
|
|
|
.align 5
|
|
.globl bn_sqr_comba8
|
|
.ent bn_sqr_comba8
|
|
bn_sqr_comba8:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
$LD $a_0,0($a1)
|
|
$LD $a_1,$BNSZ($a1)
|
|
$LD $a_2,2*$BNSZ($a1)
|
|
$LD $a_3,3*$BNSZ($a1)
|
|
|
|
$MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
|
|
$LD $a_4,4*$BNSZ($a1)
|
|
$LD $a_5,5*$BNSZ($a1)
|
|
$LD $a_6,6*$BNSZ($a1)
|
|
$LD $a_7,7*$BNSZ($a1)
|
|
mflo ($c_1,$a_0,$a_0)
|
|
mfhi ($c_2,$a_0,$a_0)
|
|
$ST $c_1,0($a0)
|
|
|
|
$MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
|
|
mflo ($t_1,$a_0,$a_1)
|
|
mfhi ($t_2,$a_0,$a_1)
|
|
slt $c_1,$t_2,$zero
|
|
$SLL $t_2,1
|
|
$MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
|
|
slt $a2,$t_1,$zero
|
|
$ADDU $t_2,$a2
|
|
$SLL $t_1,1
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$ADDU $c_3,$t_2,$at
|
|
$ST $c_2,$BNSZ($a0)
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_2,$a_0)
|
|
mfhi ($t_2,$a_2,$a_0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
|
|
$code.=<<___;
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,2*$BNSZ($a0)
|
|
mflo ($t_1,$a_0,$a_3)
|
|
mfhi ($t_2,$a_0,$a_3)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
|
|
$a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
|
|
$code.=<<___;
|
|
$ST $c_1,3*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
|
|
$a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
|
|
$a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
|
|
$code.=<<___;
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,4*$BNSZ($a0)
|
|
mflo ($t_1,$a_0,$a_5)
|
|
mfhi ($t_2,$a_0,$a_5)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
|
|
$a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
|
|
$a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
|
|
$code.=<<___;
|
|
$ST $c_3,5*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
|
|
$a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
|
|
$code.=<<___;
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,6*$BNSZ($a0)
|
|
mflo ($t_1,$a_0,$a_7)
|
|
mfhi ($t_2,$a_0,$a_7)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
|
|
$a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
|
|
$a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
|
|
$a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
|
|
$a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
|
|
$code.=<<___;
|
|
$ST $c_2,7*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
|
|
$a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
|
|
$a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
|
|
$code.=<<___;
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,8*$BNSZ($a0)
|
|
mflo ($t_1,$a_2,$a_7)
|
|
mfhi ($t_2,$a_2,$a_7)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
|
|
$a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
|
|
$code.=<<___;
|
|
$ST $c_1,9*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
|
|
$a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
|
|
$a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
|
|
$code.=<<___;
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,10*$BNSZ($a0)
|
|
mflo ($t_1,$a_4,$a_7)
|
|
mfhi ($t_2,$a_4,$a_7)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
|
|
$a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
|
|
$code.=<<___;
|
|
$ST $c_3,11*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
|
|
$a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
|
|
$code.=<<___;
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,12*$BNSZ($a0)
|
|
mflo ($t_1,$a_6,$a_7)
|
|
mfhi ($t_2,$a_6,$a_7)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
|
|
$a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
|
|
$code.=<<___;
|
|
$ST $c_2,13*$BNSZ($a0)
|
|
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
$ST $c_3,14*$BNSZ($a0)
|
|
$ST $c_1,15*$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
nop
|
|
.end bn_sqr_comba8
|
|
|
|
.align 5
|
|
.globl bn_sqr_comba4
|
|
.ent bn_sqr_comba4
|
|
bn_sqr_comba4:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
$LD $a_0,0($a1)
|
|
$LD $a_1,$BNSZ($a1)
|
|
$MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
|
|
$LD $a_2,2*$BNSZ($a1)
|
|
$LD $a_3,3*$BNSZ($a1)
|
|
mflo ($c_1,$a_0,$a_0)
|
|
mfhi ($c_2,$a_0,$a_0)
|
|
$ST $c_1,0($a0)
|
|
|
|
$MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
|
|
mflo ($t_1,$a_0,$a_1)
|
|
mfhi ($t_2,$a_0,$a_1)
|
|
slt $c_1,$t_2,$zero
|
|
$SLL $t_2,1
|
|
$MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
|
|
slt $a2,$t_1,$zero
|
|
$ADDU $t_2,$a2
|
|
$SLL $t_1,1
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$ADDU $c_3,$t_2,$at
|
|
$ST $c_2,$BNSZ($a0)
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_2,$a_0)
|
|
mfhi ($t_2,$a_2,$a_0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
|
|
$code.=<<___;
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,2*$BNSZ($a0)
|
|
mflo ($t_1,$a_0,$a_3)
|
|
mfhi ($t_2,$a_0,$a_3)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
|
|
$a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
|
|
$code.=<<___;
|
|
$ST $c_1,3*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
|
|
$a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
|
|
$code.=<<___;
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,4*$BNSZ($a0)
|
|
mflo ($t_1,$a_2,$a_3)
|
|
mfhi ($t_2,$a_2,$a_3)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
|
|
$code.=<<___;
|
|
$ST $c_3,5*$BNSZ($a0)
|
|
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
$ST $c_1,6*$BNSZ($a0)
|
|
$ST $c_2,7*$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
nop
|
|
.end bn_sqr_comba4
|
|
___
|
|
print $code;
|
|
close STDOUT or die "error closing STDOUT: $!";
|